ref: 47ec5de29e56c4cf223e0582b930b88ca2a589ee
dir: /sys/src/cmd/aux/antiword/word2text.c/
/* * word2text.c * Copyright (C) 1998-2005 A.J. van Os; Released under GNU GPL * * Description: * MS Word to "text" functions */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #if defined(__riscos) #include "DeskLib:Hourglass.h" #include "drawfile.h" #endif /* __riscos */ #include "antiword.h" #define INITIAL_SIZE 40 #define EXTENTION_SIZE 20 /* Macros to make sure all such statements will be identical */ #define OUTPUT_LINE() \ do {\ vAlign2Window(pDiag, pAnchor, lWidthMax, ucAlignment);\ TRACE_MSG("after vAlign2Window");\ pAnchor = pStartNewOutput(pAnchor, NULL);\ pOutput = pAnchor;\ } while(0) #define RESET_LINE() \ do {\ pAnchor = pStartNewOutput(pAnchor, NULL);\ pOutput = pAnchor;\ } while(0) #if defined(__riscos) /* Length of the document in characters */ static ULONG ulDocumentLength; /* Number of characters processed so far */ static ULONG ulCharCounter; static int iCurrPct, iPrevPct; #endif /* __riscos */ /* The document is in the format belonging to this version of Word */ static int iWordVersion = -1; /* Special treatment for files from Word 4/5/6 on an Apple Macintosh */ static BOOL bOldMacFile = FALSE; /* Section Information */ static const section_block_type *pSection = NULL; static const section_block_type *pSectionNext = NULL; /* All the (command line) options */ static options_type tOptions; /* Needed for reading a complete table row */ static const row_block_type *pRowInfo = NULL; static BOOL bStartRow = FALSE; static BOOL bEndRowNorm = FALSE; static BOOL bEndRowFast = FALSE; static BOOL bIsTableRow = FALSE; /* Index of the next style and font information */ static USHORT usIstdNext = ISTD_NORMAL; /* Needed for finding the start of a style */ static const style_block_type *pStyleInfo = NULL; static style_block_type tStyleNext; static BOOL bStartStyle = FALSE; static BOOL bStartStyleNext = FALSE; /* Needed for finding the start of a font */ static const font_block_type *pFontInfo = NULL; static font_block_type 
tFontNext; static BOOL bStartFont = FALSE; static BOOL bStartFontNext = FALSE; /* Needed for finding an image */ static ULONG ulFileOffsetImage = FC_INVALID; /* * vUpdateCounters - Update the counters for the hourglass */ static void vUpdateCounters(void) { #if defined(__riscos) ulCharCounter++; iCurrPct = (int)((ulCharCounter * 100) / ulDocumentLength); if (iCurrPct != iPrevPct) { Hourglass_Percentage(iCurrPct); iPrevPct = iCurrPct; } #endif /* __riscos */ } /* end of vUpdateCounters */ /* * bOutputContainsText - see if the output contains more than white space */ BOOL bOutputContainsText(const output_type *pAnchor) { const output_type *pCurr; size_t tIndex; fail(pAnchor == NULL); for (pCurr = pAnchor; pCurr != NULL; pCurr = pCurr->pNext) { fail(pCurr->lStringWidth < 0); for (tIndex = 0; tIndex < pCurr->tNextFree; tIndex++) { if (isspace((int)(UCHAR)pCurr->szStorage[tIndex])) { continue; } #if defined(DEBUG) if (pCurr->szStorage[tIndex] == FILLER_CHAR) { continue; } #endif /* DEBUG */ return TRUE; } } return FALSE; } /* end of bOutputContainsText */ /* * lTotalStringWidth - compute the total width of the output string */ static long lTotalStringWidth(const output_type *pAnchor) { const output_type *pCurr; long lTotal; lTotal = 0; for (pCurr = pAnchor; pCurr != NULL; pCurr = pCurr->pNext) { DBG_DEC_C(pCurr->lStringWidth < 0, pCurr->lStringWidth); fail(pCurr->lStringWidth < 0); lTotal += pCurr->lStringWidth; } return lTotal; } /* end of lTotalStringWidth */ /* * vStoreByte - store one byte */ static void vStoreByte(UCHAR ucChar, output_type *pOutput) { fail(pOutput == NULL); if (ucChar == 0) { pOutput->szStorage[pOutput->tNextFree] = '\0'; return; } while (pOutput->tNextFree + 2 > pOutput->tStorageSize) { pOutput->tStorageSize += EXTENTION_SIZE; pOutput->szStorage = xrealloc(pOutput->szStorage, pOutput->tStorageSize); } pOutput->szStorage[pOutput->tNextFree] = (char)ucChar; pOutput->szStorage[pOutput->tNextFree + 1] = '\0'; pOutput->tNextFree++; } /* end of 
vStoreByte */ /* * vStoreChar - store a character as one or more bytes */ static void vStoreChar(ULONG ulChar, BOOL bChangeAllowed, output_type *pOutput) { char szResult[4]; size_t tIndex, tLen; fail(pOutput == NULL); if (tOptions.eEncoding == encoding_utf_8 && bChangeAllowed) { DBG_HEX_C(ulChar > 0xffff, ulChar); fail(ulChar > 0xffff); tLen = tUcs2Utf8(ulChar, szResult, sizeof(szResult)); for (tIndex = 0; tIndex < tLen; tIndex++) { vStoreByte((UCHAR)szResult[tIndex], pOutput); } } else { DBG_HEX_C(ulChar > 0xff, ulChar); fail(ulChar > 0xff); vStoreByte((UCHAR)ulChar, pOutput); tLen = 1; } pOutput->lStringWidth += lComputeStringWidth( pOutput->szStorage + pOutput->tNextFree - tLen, tLen, pOutput->tFontRef, pOutput->usFontSize); } /* end of vStoreChar */ /* * vStoreCharacter - store one character */ static void vStoreCharacter(ULONG ulChar, output_type *pOutput) { vStoreChar(ulChar, TRUE, pOutput); } /* end of vStoreCharacter */ /* * vStoreString - store a string */ static void vStoreString(const char *szString, size_t tStringLength, output_type *pOutput) { size_t tIndex; fail(szString == NULL || pOutput == NULL); for (tIndex = 0; tIndex < tStringLength; tIndex++) { vStoreCharacter((ULONG)(UCHAR)szString[tIndex], pOutput); } } /* end of vStoreString */ /* * vStoreNumberAsDecimal - store a number as a decimal number */ static void vStoreNumberAsDecimal(UINT uiNumber, output_type *pOutput) { size_t tLen; char szString[3 * sizeof(UINT) + 1]; fail(uiNumber == 0); fail(pOutput == NULL); tLen = (size_t)sprintf(szString, "%u", uiNumber); vStoreString(szString, tLen, pOutput); } /* end of vStoreNumberAsDecimal */ /* * vStoreNumberAsRoman - store a number as a roman numerical */ static void vStoreNumberAsRoman(UINT uiNumber, output_type *pOutput) { size_t tLen; char szString[15]; fail(uiNumber == 0); fail(pOutput == NULL); tLen = tNumber2Roman(uiNumber, FALSE, szString); vStoreString(szString, tLen, pOutput); } /* end of vStoreNumberAsRoman */ /* * vStoreStyle - store a 
style */ static void vStoreStyle(diagram_type *pDiag, output_type *pOutput, const style_block_type *pStyle) { size_t tLen; char szString[120]; fail(pDiag == NULL); fail(pOutput == NULL); fail(pStyle == NULL); if (tOptions.eConversionType == conversion_xml) { vSetHeaders(pDiag, pStyle->usIstd); } else { tLen = tStyle2Window(szString, sizeof(szString), pStyle, pSection); vStoreString(szString, tLen, pOutput); } } /* end of vStoreStyle */ /* * vPutIndentation - output the specified amount of indentation */ static void vPutIndentation(diagram_type *pDiag, output_type *pOutput, BOOL bNoMarks, BOOL bFirstLine, UINT uiListNumber, UCHAR ucNFC, const char *szListChar, long lLeftIndentation, long lLeftIndentation1) { long lWidth; size_t tIndex, tNextFree; char szLine[30]; fail(pDiag == NULL); fail(pOutput == NULL); fail(szListChar == NULL); fail(lLeftIndentation < 0); if (tOptions.eConversionType == conversion_xml) { /* XML does its own indentation at rendering time */ return; } if (bNoMarks) { if (bFirstLine) { lLeftIndentation += lLeftIndentation1; } if (lLeftIndentation < 0) { lLeftIndentation = 0; } vSetLeftIndentation(pDiag, lLeftIndentation); return; } if (lLeftIndentation <= 0) { DBG_HEX_C(ucNFC != 0x00, ucNFC); vSetLeftIndentation(pDiag, 0); return; } #if defined(DEBUG) if (tOptions.eEncoding == encoding_utf_8) { fail(strlen(szListChar) > 3); } else { DBG_HEX_C(iscntrl((int)szListChar[0]), szListChar[0]); fail(iscntrl((int)szListChar[0])); fail(szListChar[1] != '\0'); } #endif /* DEBUG */ switch (ucNFC) { case LIST_ARABIC_NUM: case LIST_NUMBER_TXT: tNextFree = (size_t)sprintf(szLine, "%u", uiListNumber); break; case LIST_UPPER_ROMAN: case LIST_LOWER_ROMAN: tNextFree = tNumber2Roman(uiListNumber, ucNFC == LIST_UPPER_ROMAN, szLine); break; case LIST_UPPER_ALPHA: case LIST_LOWER_ALPHA: tNextFree = tNumber2Alpha(uiListNumber, ucNFC == LIST_UPPER_ALPHA, szLine); break; case LIST_ORDINAL_NUM: case LIST_ORDINAL_TXT: if (uiListNumber % 10 == 1 && uiListNumber != 11) { 
tNextFree = (size_t)sprintf(szLine, "%ust", uiListNumber); } else if (uiListNumber % 10 == 2 && uiListNumber != 12) { tNextFree = (size_t)sprintf(szLine, "%und", uiListNumber); } else if (uiListNumber % 10 == 3 && uiListNumber != 13) { tNextFree = (size_t)sprintf(szLine, "%urd", uiListNumber); } else { tNextFree = (size_t)sprintf(szLine, "%uth", uiListNumber); } break; case LIST_OUTLINE_NUM: tNextFree = (size_t)sprintf(szLine, "%02u", uiListNumber); break; case LIST_SPECIAL: case LIST_SPECIAL2: case LIST_BULLETS: tNextFree = 0; break; default: DBG_HEX(ucNFC); DBG_FIXME(); tNextFree = (size_t)sprintf(szLine, "%u", uiListNumber); break; } tNextFree += (size_t)sprintf(szLine + tNextFree, "%.3s", szListChar); szLine[tNextFree++] = ' '; szLine[tNextFree] = '\0'; lWidth = lComputeStringWidth(szLine, tNextFree, pOutput->tFontRef, pOutput->usFontSize); lLeftIndentation -= lWidth; if (lLeftIndentation < 0) { lLeftIndentation = 0; } vSetLeftIndentation(pDiag, lLeftIndentation); for (tIndex = 0; tIndex < tNextFree; tIndex++) { vStoreChar((ULONG)(UCHAR)szLine[tIndex], FALSE, pOutput); } } /* end of vPutIndentation */ /* * vPutSeparatorLine - output a separator line * * A separator line is a horizontal line two inches long. * Two inches equals 144000 millipoints. 
*/ static void vPutSeparatorLine(output_type *pOutput) { long lCharWidth; int iCounter, iChars; char szOne[2]; fail(pOutput == NULL); szOne[0] = OUR_EM_DASH; szOne[1] = '\0'; lCharWidth = lComputeStringWidth(szOne, 1, pOutput->tFontRef, pOutput->usFontSize); NO_DBG_DEC(lCharWidth); iChars = (int)((144000 + lCharWidth / 2) / lCharWidth); NO_DBG_DEC(iChars); for (iCounter = 0; iCounter < iChars; iCounter++) { vStoreCharacter((ULONG)(UCHAR)OUR_EM_DASH, pOutput); } } /* end of vPutSeparatorLine */ /* * pStartNextOutput - start the next output record * * returns a pointer to the next record */ static output_type * pStartNextOutput(output_type *pCurrent) { output_type *pNew; TRACE_MSG("pStartNextOutput"); if (pCurrent->tNextFree == 0) { /* The current record is empty, re-use */ fail(pCurrent->szStorage[0] != '\0'); fail(pCurrent->lStringWidth != 0); return pCurrent; } /* The current record is in use, make a new one */ pNew = xmalloc(sizeof(*pNew)); pCurrent->pNext = pNew; pNew->tStorageSize = INITIAL_SIZE; pNew->szStorage = xmalloc(pNew->tStorageSize); pNew->szStorage[0] = '\0'; pNew->tNextFree = 0; pNew->lStringWidth = 0; pNew->ucFontColor = FONT_COLOR_DEFAULT; pNew->usFontStyle = FONT_REGULAR; pNew->tFontRef = (drawfile_fontref)0; pNew->usFontSize = DEFAULT_FONT_SIZE; pNew->pPrev = pCurrent; pNew->pNext = NULL; return pNew; } /* end of pStartNextOutput */ /* * pStartNewOutput */ static output_type * pStartNewOutput(output_type *pAnchor, output_type *pLeftOver) { output_type *pCurr, *pNext; USHORT usFontStyle, usFontSize; drawfile_fontref tFontRef; UCHAR ucFontColor; TRACE_MSG("pStartNewOutput"); ucFontColor = FONT_COLOR_DEFAULT; usFontStyle = FONT_REGULAR; tFontRef = (drawfile_fontref)0; usFontSize = DEFAULT_FONT_SIZE; /* Free the old output space */ pCurr = pAnchor; while (pCurr != NULL) { TRACE_MSG("Free the old output space"); pNext = pCurr->pNext; pCurr->szStorage = xfree(pCurr->szStorage); if (pCurr->pNext == NULL) { ucFontColor = pCurr->ucFontColor; usFontStyle = 
pCurr->usFontStyle; tFontRef = pCurr->tFontRef; usFontSize = pCurr->usFontSize; } pCurr = xfree(pCurr); pCurr = pNext; } if (pLeftOver == NULL) { /* Create new output space */ TRACE_MSG("Create new output space"); pLeftOver = xmalloc(sizeof(*pLeftOver)); pLeftOver->tStorageSize = INITIAL_SIZE; NO_DBG_DEC(pLeftOver->tStorageSize); TRACE_MSG("before 2nd xmalloc"); pLeftOver->szStorage = xmalloc(pLeftOver->tStorageSize); TRACE_MSG("after 2nd xmalloc"); pLeftOver->szStorage[0] = '\0'; pLeftOver->tNextFree = 0; pLeftOver->lStringWidth = 0; pLeftOver->ucFontColor = ucFontColor; pLeftOver->usFontStyle = usFontStyle; pLeftOver->tFontRef = tFontRef; pLeftOver->usFontSize = usFontSize; pLeftOver->pPrev = NULL; pLeftOver->pNext = NULL; } fail(!bCheckDoubleLinkedList(pLeftOver)); return pLeftOver; } /* end of pStartNewOutput */ /* * ulGetChar - get the next character from the specified list * * returns the next character of EOF */ static ULONG ulGetChar(FILE *pFile, list_id_enum eListID) { const font_block_type *pCurr; ULONG ulChar, ulFileOffset, ulCharPos; row_info_enum eRowInfo; USHORT usChar, usPropMod; BOOL bSkip; fail(pFile == NULL); pCurr = pFontInfo; bSkip = FALSE; for (;;) { usChar = usNextChar(pFile, eListID, &ulFileOffset, &ulCharPos, &usPropMod); if (usChar == (USHORT)EOF) { return (ULONG)EOF; } vUpdateCounters(); eRowInfo = ePropMod2RowInfo(usPropMod, iWordVersion); if (!bStartRow) { #if 0 bStartRow = eRowInfo == found_a_cell || (pRowInfo != NULL && ulFileOffset == pRowInfo->ulFileOffsetStart && eRowInfo != found_not_a_cell); #else bStartRow = pRowInfo != NULL && ulFileOffset == pRowInfo->ulFileOffsetStart; #endif NO_DBG_HEX_C(bStartRow, pRowInfo->ulFileOffsetStart); } if (!bEndRowNorm) { #if 0 bEndRow = eRowInfo == found_end_of_row || (pRowInfo != NULL && ulFileOffset == pRowInfo->ulFileOffsetEnd && eRowInfo != found_not_end_of_row); #else bEndRowNorm = pRowInfo != NULL && ulFileOffset == pRowInfo->ulFileOffsetEnd; #endif NO_DBG_HEX_C(bEndRowNorm, 
pRowInfo->ulFileOffsetEnd); } if (!bEndRowFast) { bEndRowFast = eRowInfo == found_end_of_row; NO_DBG_HEX_C(bEndRowFast, pRowInfo->ulFileOffsetEnd); } if (!bStartStyle) { bStartStyle = pStyleInfo != NULL && ulFileOffset == pStyleInfo->ulFileOffset; NO_DBG_HEX_C(bStartStyle, ulFileOffset); } if (pCurr != NULL && ulFileOffset == pCurr->ulFileOffset) { bStartFont = TRUE; NO_DBG_HEX(ulFileOffset); pFontInfo = pCurr; pCurr = pGetNextFontInfoListItem(pCurr); } /* Skip embedded characters */ if (usChar == START_EMBEDDED) { bSkip = TRUE; continue; } if (usChar == END_IGNORE || usChar == END_EMBEDDED) { bSkip = FALSE; continue; } if (bSkip) { continue; } ulChar = ulTranslateCharacters(usChar, ulFileOffset, iWordVersion, tOptions.eConversionType, tOptions.eEncoding, bOldMacFile); if (ulChar == IGNORE_CHARACTER) { continue; } if (ulChar == PICTURE) { ulFileOffsetImage = ulGetPictInfoListItem(ulFileOffset); } else { ulFileOffsetImage = FC_INVALID; } if (ulChar == PAR_END) { /* End of paragraph seen, prepare for the next */ vFillStyleFromStylesheet(usIstdNext, &tStyleNext); vCorrectStyleValues(&tStyleNext); bStartStyleNext = TRUE; vFillFontFromStylesheet(usIstdNext, &tFontNext); vCorrectFontValues(&tFontNext); bStartFontNext = TRUE; } if (ulChar == PAGE_BREAK) { /* Might be the start of a new section */ pSectionNext = pGetSectionInfo(pSection, ulCharPos); } return ulChar; } } /* end of ulGetChar */ /* * lGetWidthMax - get the maximum line width from the paragraph break value * * Returns the maximum line width in millipoints */ static long lGetWidthMax(int iParagraphBreak) { fail(iParagraphBreak < 0); if (iParagraphBreak == 0) { return LONG_MAX; } if (iParagraphBreak < MIN_SCREEN_WIDTH) { return lChar2MilliPoints(MIN_SCREEN_WIDTH); } if (iParagraphBreak > MAX_SCREEN_WIDTH) { return lChar2MilliPoints(MAX_SCREEN_WIDTH); } return lChar2MilliPoints(iParagraphBreak); } /* end of lGetWidthMax */ /* * bWordDecryptor - turn Word to something more useful * * returns TRUE when succesful, 
otherwise FALSE */
/*
 * Overview of bWordDecryptor:
 * - initialise the document (iInitDocument), the options and all the
 *   file-scope state that ulGetChar updates;
 * - read characters with ulGetChar, list after list (text, footnotes,
 *   endnotes, text boxes, header text boxes), until end_of_lists;
 * - per character: react to the start of a table row, font or style,
 *   put the indentation at the start of a line, store the character in
 *   the output list, and write the line out when a paragraph ends, a
 *   table row ends or the line has reached the maximum width.
 *
 * pFile	the open Word file
 * lFilesize	the size of the file (must be positive)
 * pDiag	the diagram that receives the output
 */
BOOL
bWordDecryptor(FILE *pFile, long lFilesize, diagram_type *pDiag)
{
	imagedata_type	tImage;
	const style_block_type	*pStyleTmp;
	const font_block_type	*pFontTmp;
	const char	*szListChar;
	output_type	*pAnchor, *pOutput, *pLeftOver;
	ULONG	ulChar;
	long	lBeforeIndentation, lAfterIndentation;
	long	lLeftIndentation, lLeftIndentation1, lRightIndentation;
	long	lWidthCurr, lWidthMax, lDefaultTabWidth, lHalfSpaceWidth, lTmp;
	list_id_enum 	eListID;
	image_info_enum	eRes;
	UINT	uiFootnoteNumber, uiEndnoteNumber, uiTmp;
	int	iListSeqNumber;
	BOOL	bWasTableRow, bTableFontClosed, bWasEndOfParagraph;
	BOOL	bInList, bWasInList, bNoMarks, bFirstLine;
	BOOL	bAllCapitals, bHiddenText, bMarkDelText, bSuccess;
	USHORT	usListNumber;
	USHORT	usFontStyle, usFontStyleMinimal, usFontSize, usTmp;
	UCHAR	ucFontNumber, ucFontColor;
	UCHAR	ucNFC, ucAlignment;

	fail(pFile == NULL || lFilesize <= 0 || pDiag == NULL);

	TRACE_MSG("bWordDecryptor");

	iWordVersion = iInitDocument(pFile, lFilesize);
	if (iWordVersion < 0) {
		/* The document could not be initialised */
		DBG_DEC(iWordVersion);
		return FALSE;
	}

	vGetOptions(&tOptions);
	bOldMacFile = bIsOldMacFile();
	vPrepareHdrFtrText(pFile);
	vPrepareFootnoteText(pFile);

	vPrologue2(pDiag, iWordVersion);

	/* Initialisation */
#if defined(__riscos)
	ulCharCounter = 0;
	iCurrPct = 0;
	iPrevPct = -1;
	ulDocumentLength = ulGetDocumentLength();
#endif /* __riscos */
	pSection = pGetSectionInfo(NULL, 0);
	pSectionNext = pSection;
	lDefaultTabWidth = lGetDefaultTabWidth();
	DBG_DEC_C(lDefaultTabWidth != 36000, lDefaultTabWidth);
	/* Table row state */
	pRowInfo = pGetNextRowInfoListItem();
	DBG_HEX_C(pRowInfo != NULL, pRowInfo->ulFileOffsetStart);
	DBG_HEX_C(pRowInfo != NULL, pRowInfo->ulFileOffsetEnd);
	DBG_MSG_C(pRowInfo == NULL, "No rows at all");
	bStartRow = FALSE;
	bEndRowNorm = FALSE;
	bEndRowFast = FALSE;
	bIsTableRow = FALSE;
	bWasTableRow = FALSE;
	/* Style and list state */
	vResetStyles();
	pStyleInfo = pGetNextTextStyle(NULL);
	bStartStyle = FALSE;
	bInList = FALSE;
	bWasInList = FALSE;
	iListSeqNumber = 0;
	usIstdNext = ISTD_NORMAL;
	pAnchor = NULL;
	/* Font state */
	pFontInfo = pGetNextFontInfoListItem(NULL);
	DBG_HEX_C(pFontInfo != NULL, pFontInfo->ulFileOffset);
	DBG_MSG_C(pFontInfo == NULL, "No fonts at all");
	bStartFont = FALSE;
	ucFontNumber = 0;
	usFontStyleMinimal = FONT_REGULAR;
	usFontStyle = FONT_REGULAR;
	usFontSize = DEFAULT_FONT_SIZE;
	ucFontColor = FONT_COLOR_DEFAULT;
	/* The first (empty) output record, using the initial font */
	pAnchor = pStartNewOutput(pAnchor, NULL);
	pOutput = pAnchor;
	pOutput->ucFontColor = ucFontColor;
	pOutput->usFontStyle = usFontStyle;
	pOutput->tFontRef = tOpenFont(ucFontNumber, usFontStyle, usFontSize);
	pOutput->usFontSize = usFontSize;
	bTableFontClosed = TRUE;
	lBeforeIndentation = 0;
	lAfterIndentation = 0;
	lLeftIndentation = 0;
	lLeftIndentation1 = 0;
	lRightIndentation = 0;
	bWasEndOfParagraph = TRUE;
	bNoMarks = TRUE;
	bFirstLine = TRUE;
	ucNFC = LIST_BULLETS;
	if (pStyleInfo != NULL) {
		szListChar = pStyleInfo->szListChar;
		pStyleTmp = pStyleInfo;
	} else {
		/* No text styles: fall back on tStyleNext with a bullet */
		if (tStyleNext.szListChar[0] == '\0') {
			vGetBulletValue(tOptions.eConversionType,
				tOptions.eEncoding, tStyleNext.szListChar, 4);
		}
		szListChar = tStyleNext.szListChar;
		pStyleTmp = &tStyleNext;
	}
	usListNumber = 0;
	ucAlignment = ALIGNMENT_LEFT;
	bAllCapitals = FALSE;
	bHiddenText = FALSE;
	bMarkDelText = FALSE;
	lWidthMax = lGetWidthMax(tOptions.iParagraphBreak);
	NO_DBG_DEC(lWidthMax);

	Hourglass_On();

	uiFootnoteNumber = 0;
	uiEndnoteNumber = 0;
	eListID = text_list;
	for(;;) {
		ulChar = ulGetChar(pFile, eListID);
		if (ulChar == (ULONG)EOF) {
			/* End of this list: flush and go to the next list */
			if (bOutputContainsText(pAnchor)) {
				OUTPUT_LINE();
			} else {
				RESET_LINE();
			}
			switch (eListID) {
			case text_list:
				if (tOptions.eConversionType !=
							conversion_xml) {
					eListID = footnote_list;
					if (uiFootnoteNumber != 0) {
						vPutSeparatorLine(pAnchor);
						OUTPUT_LINE();
						uiFootnoteNumber = 0;
					}
					break;
				}
				/*
				 * No break or return: for XML output the
				 * footnote list is not read here, continue
				 * with the endnote list
				 */
			case footnote_list:
				eListID = endnote_list;
				if (uiEndnoteNumber != 0) {
					vPutSeparatorLine(pAnchor);
					OUTPUT_LINE();
					uiEndnoteNumber = 0;
				}
				break;
			case endnote_list:
				eListID = textbox_list;
				if (bExistsTextBox()) {
					vPutSeparatorLine(pAnchor);
					OUTPUT_LINE();
				}
				break;
			case textbox_list:
				eListID = hdrtextbox_list;
				if (bExistsHdrTextBox()) {
					vPutSeparatorLine(pAnchor);
					OUTPUT_LINE();
				}
				break;
			case hdrtextbox_list:
			default:
				eListID = end_of_lists;
				break;
			}
			if (eListID == end_of_lists) {
				/* All lists are done, leave the main loop */
				break;
			}
			continue;
		}

		if (ulChar == UNKNOWN_NOTE_CHAR) {
			/* The list being read tells what kind of note it is */
			switch (eListID) {
			case footnote_list:
				ulChar = FOOTNOTE_CHAR;
				break;
			case endnote_list:
				ulChar = ENDNOTE_CHAR;
				break;
			default:
				break;
			}
		}

		if (bStartRow) {
			/* Begin of a tablerow found */
			if (bOutputContainsText(pAnchor)) {
				OUTPUT_LINE();
			} else {
				RESET_LINE();
			}
			fail(pAnchor != pOutput);
			if (bTableFontClosed) {
				/* Start special table font */
				vCloseFont();
				/*
				 * Compensate for the fact that Word uses
				 * proportional fonts for its tables and we
				 * only one fixed-width font
				 * (5/6 of the font size, rounded, and limited
				 * to MIN/MAX_TABLEFONT_SIZE)
				 */
				uiTmp = ((UINT)usFontSize * 5 + 3) / 6;
				if (uiTmp < MIN_TABLEFONT_SIZE) {
					uiTmp = MIN_TABLEFONT_SIZE;
				} else if (uiTmp > MAX_TABLEFONT_SIZE) {
					uiTmp = MAX_TABLEFONT_SIZE;
				}
				pOutput->usFontSize = (USHORT)uiTmp;
				pOutput->tFontRef =
					tOpenTableFont(pOutput->usFontSize);
				pOutput->usFontStyle = FONT_REGULAR;
				pOutput->ucFontColor = FONT_COLOR_BLACK;
				bTableFontClosed = FALSE;
			}
			bIsTableRow = TRUE;
			bStartRow = FALSE;
		}

		if (bWasTableRow &&
		    !bIsTableRow &&
		    ulChar != PAR_END &&
		    ulChar != HARD_RETURN &&
		    ulChar != PAGE_BREAK &&
		    ulChar != COLUMN_FEED) {
			/*
			 * The end of a table should be followed by an
			 * empty line, like the end of a paragraph
			 */
			OUTPUT_LINE();
			vEndOfParagraph(pDiag,
					pOutput->tFontRef,
					pOutput->usFontSize,
					(long)pOutput->usFontSize * 600);
		}

		/* Page and column breaks end the current line first */
		switch (ulChar) {
		case PAGE_BREAK:
		case COLUMN_FEED:
			if (bIsTableRow) {
				/* Ignore when in a table */
				break;
			}
			if (bOutputContainsText(pAnchor)) {
				OUTPUT_LINE();
			} else {
				RESET_LINE();
			}
			if (ulChar == PAGE_BREAK) {
				vEndOfPage(pDiag, lAfterIndentation,
						pSection != pSectionNext);
			} else {
				vEndOfParagraph(pDiag,
					pOutput->tFontRef,
					pOutput->usFontSize,
					lAfterIndentation);
			}
			break;
		default:
			break;
		}

		if (bStartFont || (bStartFontNext && ulChar != PAR_END)) {
			/* Begin of a font found */
			if (bStartFont) {
				/* bStartFont takes priority */
				fail(pFontInfo == NULL);
				pFontTmp = pFontInfo;
			} else {
				pFontTmp = &tFontNext;
			}
			bAllCapitals = bIsCapitals(pFontTmp->usFontStyle);
			bHiddenText = bIsHidden(pFontTmp->usFontStyle);
			bMarkDelText = bIsMarkDel(pFontTmp->usFontStyle);
			/* Only these style bits require a new output record */
			usTmp = pFontTmp->usFontStyle &
				(FONT_BOLD|FONT_ITALIC|FONT_UNDERLINE|
				 FONT_STRIKE|FONT_MARKDEL|
				 FONT_SUPERSCRIPT|FONT_SUBSCRIPT);
			if (!bIsTableRow &&
			    (usFontSize != pFontTmp->usFontSize ||
			     ucFontNumber != pFontTmp->ucFontNumber ||
			     usFontStyleMinimal != usTmp ||
			     ucFontColor != pFontTmp->ucFontColor)) {
				pOutput = pStartNextOutput(pOutput);
				vCloseFont();
				pOutput->ucFontColor = pFontTmp->ucFontColor;
				pOutput->usFontStyle = pFontTmp->usFontStyle;
				pOutput->usFontSize = pFontTmp->usFontSize;
				pOutput->tFontRef = tOpenFont(
						pFontTmp->ucFontNumber,
						pFontTmp->usFontStyle,
						pFontTmp->usFontSize);
				fail(!bCheckDoubleLinkedList(pAnchor));
			}
			ucFontNumber = pFontTmp->ucFontNumber;
			usFontSize = pFontTmp->usFontSize;
			ucFontColor = pFontTmp->ucFontColor;
			usFontStyle = pFontTmp->usFontStyle;
			usFontStyleMinimal = usTmp;
			if (bStartFont) {
				/* Get the next font info */
				pFontInfo = pGetNextFontInfoListItem(pFontInfo);
				NO_DBG_HEX_C(pFontInfo != NULL,
						pFontInfo->ulFileOffset);
				DBG_MSG_C(pFontInfo == NULL, "No more fonts");
			}
			bStartFont = FALSE;
			bStartFontNext = FALSE;
		}

		if (bStartStyle || (bStartStyleNext && ulChar != PAR_END)) {
			bFirstLine = TRUE;
			/* Begin of a style found */
			if (bStartStyle) {
				/* bStartStyle takes priority */
				fail(pStyleInfo == NULL);
				pStyleTmp = pStyleInfo;
			} else {
				pStyleTmp = &tStyleNext;
			}
			if (!bIsTableRow) {
				vStoreStyle(pDiag, pOutput, pStyleTmp);
			}
			usIstdNext = pStyleTmp->usIstdNext;
			lBeforeIndentation =
				lTwips2MilliPoints(pStyleTmp->usBeforeIndent);
			lAfterIndentation =
				lTwips2MilliPoints(pStyleTmp->usAfterIndent);
			lLeftIndentation =
				lTwips2MilliPoints(pStyleTmp->sLeftIndent);
			lLeftIndentation1 =
				lTwips2MilliPoints(pStyleTmp->sLeftIndent1);
			lRightIndentation =
				lTwips2MilliPoints(pStyleTmp->sRightIndent);
			bInList = bStyleImpliesList(pStyleTmp, iWordVersion);
			bNoMarks = !bInList || pStyleTmp->bNumPause;
			ucNFC = pStyleTmp->ucNFC;
			szListChar = pStyleTmp->szListChar;
			ucAlignment = pStyleTmp->ucAlignment;
			if (bInList && !bWasInList) {
				/* Start of a list */
				iListSeqNumber++;
				vStartOfList(pDiag, ucNFC,
						bWasTableRow && !bIsTableRow);
			}
			if (!bInList && bWasInList) {
				/* End of a list */
				vEndOfList(pDiag);
			}
			bWasInList = bInList;
			if (bStartStyle) {
				/* Get the next style info */
				pStyleInfo = pGetNextTextStyle(pStyleInfo);
				NO_DBG_HEX_C(pStyleInfo != NULL,
						pStyleInfo->ulFileOffset);
				DBG_MSG_C(pStyleInfo == NULL,
						"No more styles");
			}
			bStartStyle = FALSE;
			bStartStyleNext = FALSE;
		}

		if (bWasEndOfParagraph) {
			vStartOfParagraph1(pDiag, lBeforeIndentation);
		}

		if (!bIsTableRow &&
		    lTotalStringWidth(pAnchor) == 0) {
			/* Nothing on the line yet: put the indentation */
			if (!bNoMarks) {
				usListNumber = usGetListValue(iListSeqNumber,
							iWordVersion,
							pStyleTmp);
			}
			if (bInList && bFirstLine) {
				vStartOfListItem(pDiag, bNoMarks);
			}
			vPutIndentation(pDiag, pAnchor, bNoMarks, bFirstLine,
					usListNumber, ucNFC, szListChar,
					lLeftIndentation, lLeftIndentation1);
			bFirstLine = FALSE;
			/* One number or mark per paragraph will do */
			bNoMarks = TRUE;
		}

		if (bWasEndOfParagraph) {
			vStartOfParagraph2(pDiag);
			bWasEndOfParagraph = FALSE;
		}

		/* Store the character, or do what it stands for */
		switch (ulChar) {
		case PICTURE:
			(void)memset(&tImage, 0, sizeof(tImage));
			eRes = eExamineImage(pFile, ulFileOffsetImage, &tImage);
			switch (eRes) {
			case image_no_information:
				bSuccess = FALSE;
				break;
			case image_minimal_information:
			case image_full_information:
#if 0
				if (bOutputContainsText(pAnchor)) {
					OUTPUT_LINE();
				} else {
					RESET_LINE();
				}
#endif
				bSuccess = bTranslateImage(pDiag, pFile,
					eRes == image_minimal_information,
					ulFileOffsetImage, &tImage);
				break;
			default:
				DBG_DEC(eRes);
				bSuccess = FALSE;
				break;
			}
			if (!bSuccess) {
				/* Show a placeholder instead of the image */
				vStoreString("[pic]", 5, pOutput);
			}
			break;
		case FOOTNOTE_CHAR:
			uiFootnoteNumber++;
			if (tOptions.eConversionType == conversion_xml) {
				vStoreCharacter((ULONG)FOOTNOTE_OR_ENDNOTE,
								pOutput);
				break;
			}
			/* Footnotes are shown as [decimal number] */
			vStoreCharacter((ULONG)'[', pOutput);
			vStoreNumberAsDecimal(uiFootnoteNumber, pOutput);
			vStoreCharacter((ULONG)']', pOutput);
			break;
		case ENDNOTE_CHAR:
			uiEndnoteNumber++;
			/* Endnotes are shown as [roman number] */
			vStoreCharacter((ULONG)'[', pOutput);
			vStoreNumberAsRoman(uiEndnoteNumber, pOutput);
			vStoreCharacter((ULONG)']', pOutput);
			break;
		case UNKNOWN_NOTE_CHAR:
			vStoreString("[?]", 3, pOutput);
			break;
		case PAR_END:
			if (bIsTableRow) {
				/* In a table row the line end is just stored */
				vStoreCharacter((ULONG)'\n', pOutput);
				break;
			}
			if (bOutputContainsText(pAnchor)) {
				OUTPUT_LINE();
			} else {
				vMove2NextLine(pDiag,
					pOutput->tFontRef, pOutput->usFontSize);
				RESET_LINE();
			}
			vEndOfParagraph(pDiag,
					pOutput->tFontRef,
					pOutput->usFontSize,
					lAfterIndentation);
			bWasEndOfParagraph = TRUE;
			break;
		case HARD_RETURN:
			if (bIsTableRow) {
				/* In a table row the line end is just stored */
				vStoreCharacter((ULONG)'\n', pOutput);
				break;
			}
			if (bOutputContainsText(pAnchor)) {
				OUTPUT_LINE();
			} else {
				vMove2NextLine(pDiag,
					pOutput->tFontRef, pOutput->usFontSize);
				RESET_LINE();
			}
			break;
		case PAGE_BREAK:
		case COLUMN_FEED:
			/* The break itself was handled above */
			pSection = pSectionNext;
			break;
		case TABLE_SEPARATOR:
			if (bIsTableRow) {
				vStoreCharacter(ulChar, pOutput);
				break;
			}
			vStoreCharacter((ULONG)' ', pOutput);
			vStoreCharacter((ULONG)TABLE_SEPARATOR_CHAR, pOutput);
			break;
		case TAB:
			if (bIsTableRow ||
			    tOptions.eConversionType == conversion_xml) {
				vStoreCharacter((ULONG)' ', pOutput);
				break;
			}
			if (tOptions.iParagraphBreak == 0 &&
			    (tOptions.eConversionType == conversion_text ||
			     tOptions.eConversionType == conversion_fmt_text)) {
				/* No logical lines, so no tab expansion */
				vStoreCharacter(TAB, pOutput);
				break;
			}
			/* NOTE(review): lHalfSpaceWidth is set but not used */
			lHalfSpaceWidth = (lComputeSpaceWidth(
					pOutput->tFontRef,
					pOutput->usFontSize) + 1) / 2;
			/* lTmp is the number of the current tab column */
			lTmp = lTotalStringWidth(pAnchor);
			lTmp += lDrawUnits2MilliPoints(pDiag->lXleft);
			lTmp /= lDefaultTabWidth;
			/* Fill up until the next tab column or the line end */
			do {
				vStoreCharacter((ULONG)FILLER_CHAR, pOutput);
				lWidthCurr = lTotalStringWidth(pAnchor);
				lWidthCurr +=
					lDrawUnits2MilliPoints(pDiag->lXleft);
			} while (lTmp == lWidthCurr / lDefaultTabWidth &&
				 lWidthCurr < lWidthMax + lRightIndentation);
			break;
		default:
			/* Note: these two skip the rest of the loop body */
			if (bHiddenText && tOptions.bHideHiddenText) {
				continue;
			}
			if (bMarkDelText && tOptions.bRemoveRemovedText) {
				continue;
			}
			if (ulChar == UNICODE_ELLIPSIS &&
			    tOptions.eEncoding != encoding_utf_8) {
				vStoreString("...", 3, pOutput);
			} else {
				if (bAllCapitals) {
					ulChar = ulToUpper(ulChar);
				}
				vStoreCharacter(ulChar, pOutput);
			}
			break;
		}

		if (bWasTableRow && !bIsTableRow) {
			/* End of a table */
			vEndOfTable(pDiag);
			/* Resume normal font */
			NO_DBG_MSG("End of table font");
			vCloseFont();
			bTableFontClosed = TRUE;
			pOutput->ucFontColor = ucFontColor;
			pOutput->usFontStyle = usFontStyle;
			pOutput->usFontSize = usFontSize;
			pOutput->tFontRef = tOpenFont(
					ucFontNumber, usFontStyle, usFontSize);
		}
		bWasTableRow = bIsTableRow;

		if (bIsTableRow) {
			fail(pAnchor != pOutput);
			if (!bEndRowNorm && !bEndRowFast) {
				/* The row is not complete yet */
				continue;
			}
			/* End of a table row */
			if (bEndRowNorm) {
				fail(pRowInfo == NULL);
				vTableRow2Window(pDiag, pAnchor, pRowInfo,
						tOptions.eConversionType,
						tOptions.iParagraphBreak);
			} else {
				fail(!bEndRowFast);
			}
			/* Reset */
			pAnchor = pStartNewOutput(pAnchor, NULL);
			pOutput = pAnchor;
			if (bEndRowNorm) {
				pRowInfo = pGetNextRowInfoListItem();
			}
			bIsTableRow = FALSE;
			bEndRowNorm = FALSE;
			bEndRowFast = FALSE;
			NO_DBG_HEX_C(pRowInfo != NULL,
						pRowInfo->ulFileOffsetStart);
			NO_DBG_HEX_C(pRowInfo != NULL,
						pRowInfo->ulFileOffsetEnd);
			continue;
		}
		/*
		 * NOTE(review): lWidthMax is LONG_MAX when there is no
		 * paragraph break (see lGetWidthMax), so the sum below is
		 * only safe when lRightIndentation is never positive -
		 * confirm against the code that fills sRightIndent.
		 */
		lWidthCurr = lTotalStringWidth(pAnchor);
		lWidthCurr += lDrawUnits2MilliPoints(pDiag->lXleft);
		if (lWidthCurr < lWidthMax + lRightIndentation) {
			/* The line is not full yet */
			continue;
		}
		/* The line is full: write it and keep what is left over */
		pLeftOver = pSplitList(pAnchor);
		vJustify2Window(pDiag, pAnchor,
				lWidthMax, lRightIndentation, ucAlignment);
		pAnchor = pStartNewOutput(pAnchor, pLeftOver);
		/* Continue at the last record of the new list */
		for (pOutput = pAnchor;
		     pOutput->pNext != NULL;
		     pOutput = pOutput->pNext)
			;	/* EMPTY */
		fail(pOutput == NULL);
		if (lTotalStringWidth(pAnchor) > 0) {
			vSetLeftIndentation(pDiag, lLeftIndentation);
		}
	}

	/* Free the output list; the fresh record it returns is freed too */
	pAnchor = pStartNewOutput(pAnchor, NULL);
	pAnchor->szStorage = xfree(pAnchor->szStorage);
	pAnchor = xfree(pAnchor);
	vCloseFont();
	vFreeDocument();
	Hourglass_Off();
	return TRUE;
} /* end of bWordDecryptor */

/*
 * 
lLastStringWidth - compute the width of the last part of the output string
 *
 * The last part starts after the last separator: a record that holds
 * exactly one byte, being PAR_END or HARD_RETURN. Without a separator (or
 * when nothing follows the last one) the whole list is measured.
 */
static long
lLastStringWidth(const output_type *pAnchor)
{
	const output_type	*pCurr, *pStart;

	pStart = NULL;
	for (pCurr = pAnchor; pCurr != NULL; pCurr = pCurr->pNext) {
		if (pCurr->tNextFree == 1 &&
		    (pCurr->szStorage[0] == PAR_END ||
		     pCurr->szStorage[0] == HARD_RETURN)) {
			/* Found a separator. Start after the separator */
			pStart = pCurr->pNext;
		}
	}
	if (pStart == NULL) {
		/* No separators. Use the whole output string */
		pStart = pAnchor;
	}
	return lTotalStringWidth(pStart);
} /* end of lLastStringWidth */

/*
 * pHdrFtrDecryptor - turn a header/footer list element to something useful
 *
 * pFile		the open Word file
 * ulCharPosStart	the character position of the first character
 * ulCharPosNext	the character position where the next element starts
 *
 * The text is stored as a list of substrings. Line ends are stored as
 * separate substrings of exactly one character (PAR_END or HARD_RETURN),
 * and a HARD_RETURN substring is inserted where a line gets too wide.
 *
 * returns the start of the list, or NULL when there is no text; the
 * caller becomes the owner of the returned list
 */
output_type *
pHdrFtrDecryptor(FILE *pFile, ULONG ulCharPosStart, ULONG ulCharPosNext)
{
	output_type	*pAnchor, *pOutput, *pLeftOver;
	ULONG	ulChar, ulFileOffset, ulCharPos;
	long	lWidthCurr, lWidthMax;
	long	lRightIndentation;
	USHORT	usChar;
	UCHAR	ucAlignment;
	BOOL	bSkip;

	fail(iWordVersion < 0);
	fail(tOptions.eConversionType == conversion_unknown);
	fail(tOptions.eEncoding == 0);

	if (ulCharPosStart == ulCharPosNext) {
		/* There are no bytes to decrypt */
		return NULL;
	}

	lRightIndentation = 0;
	ucAlignment = ALIGNMENT_LEFT;
	bSkip = FALSE;
	lWidthMax = lGetWidthMax(tOptions.iParagraphBreak);
	pAnchor = pStartNewOutput(NULL, NULL);
	pOutput = pAnchor;
	pOutput->tFontRef = tOpenFont(0, FONT_REGULAR, DEFAULT_FONT_SIZE);
	usChar = usToHdrFtrPosition(pFile, ulCharPosStart);
	ulCharPos = ulCharPosStart;
	ulFileOffset = ulCharPos2FileOffset(ulCharPos);
	while (usChar != (USHORT)EOF && ulCharPos != ulCharPosNext) {
		/* Skip embedded characters */
		if (usChar == START_EMBEDDED) {
			bSkip = TRUE;
		} else if (usChar == END_IGNORE || usChar == END_EMBEDDED) {
			bSkip = FALSE;
		}
		/* Translate character */
		if (bSkip || usChar == END_IGNORE || usChar == END_EMBEDDED) {
			ulChar = IGNORE_CHARACTER;
		} else {
			ulChar = ulTranslateCharacters(usChar,
					ulFileOffset,
					iWordVersion,
					tOptions.eConversionType,
					tOptions.eEncoding,
					bOldMacFile);
		}
		/* Process character */
		if (ulChar != IGNORE_CHARACTER) {
			switch (ulChar) {
			case PICTURE:
				vStoreString("[pic]", 5, pOutput);
				break;
			case PAR_END:
			case HARD_RETURN:
			case PAGE_BREAK:
			case COLUMN_FEED:
				/* To the next substring */
				pOutput = pStartNextOutput(pOutput);
				vCloseFont();
				pOutput->tFontRef = tOpenFont(0,
					FONT_REGULAR, DEFAULT_FONT_SIZE);
				/* A substring with just one character */
				if (ulChar == HARD_RETURN) {
					vStoreCharacter(HARD_RETURN, pOutput);
				} else {
					vStoreCharacter(PAR_END, pOutput);
				}
				/* To the next substring */
				pOutput = pStartNextOutput(pOutput);
				vCloseFont();
				pOutput->tFontRef = tOpenFont(0,
					FONT_REGULAR, DEFAULT_FONT_SIZE);
				fail(!bCheckDoubleLinkedList(pAnchor));
				break;
			case TABLE_SEPARATOR:
				vStoreCharacter((ULONG)' ', pOutput);
				vStoreCharacter((ULONG)TABLE_SEPARATOR_CHAR,
							pOutput);
				break;
			case TAB:
				vStoreCharacter((ULONG)FILLER_CHAR, pOutput);
				break;
			default:
				vStoreCharacter(ulChar, pOutput);
				break;
			}
		}
		lWidthCurr = lLastStringWidth(pAnchor);
		if (lWidthCurr >= lWidthMax + lRightIndentation) {
			/* The line is too wide, insert a line end */
			pLeftOver = pSplitList(pAnchor);
			for (pOutput = pAnchor;
			     pOutput->pNext != NULL;
			     pOutput = pOutput->pNext)
				;	/* EMPTY */
			fail(pOutput == NULL);
			/* To the next substring */
			pOutput = pStartNextOutput(pOutput);
			/* A substring with just one HARD_RETURN */
			vStoreCharacter(HARD_RETURN, pOutput);
			/* Put the leftover piece(s) at the end */
			pOutput->pNext = pLeftOver;
			if (pLeftOver != NULL) {
				pLeftOver->pPrev = pOutput;
			}
			fail(!bCheckDoubleLinkedList(pAnchor));
			for (pOutput = pAnchor;
			     pOutput->pNext != NULL;
			     pOutput = pOutput->pNext)
				;	/* EMPTY */
			fail(pOutput == NULL);
			if (pLeftOver == NULL) {
				/*
				 * Nothing was left over, so the last record
				 * is the HARD_RETURN itself. Start a new
				 * substring: appending the next character
				 * to the separator would make it longer
				 * than one byte, it would no longer count
				 * as a line end in lLastStringWidth and
				 * every following character would cause
				 * another line end.
				 */
				pOutput = pStartNextOutput(pOutput);
				vCloseFont();
				pOutput->tFontRef = tOpenFont(0,
					FONT_REGULAR, DEFAULT_FONT_SIZE);
				fail(!bCheckDoubleLinkedList(pAnchor));
			}
		}
		usChar = usNextChar(pFile, hdrftr_list,
					&ulFileOffset, &ulCharPos, NULL);
	}
	vCloseFont();
	if (bOutputContainsText(pAnchor)) {
		return pAnchor;
	}
	/* Only white space: free the list, the fresh record included */
	pAnchor = pStartNewOutput(pAnchor, NULL);
	pAnchor->szStorage = xfree(pAnchor->szStorage);
	pAnchor = xfree(pAnchor);
	return NULL;
} /* end of pHdrFtrDecryptor */

/*
 * szFootnoteDecryptor - turn a footnote text list element into text
 *
 * pFile		the open Word file
 * ulCharPosStart	the character position of the first character
 * ulCharPosNext	the character position where the next element starts
 *
 * Only implemented for XML output. Leading note marks, paragraph ends,
 * tabs and spaces are skipped and trailing spaces are removed. All kinds
 * of line and page ends become PAR_END, a tab becomes a space and a
 * picture becomes "[pic]".
 *
 * returns the text in allocated memory (to be freed by the caller), or
 * NULL when there is no text
 */
char *
szFootnoteDecryptor(FILE *pFile, ULONG ulCharPosStart, ULONG ulCharPosNext)
{
	char	*szText;
	ULONG	ulChar, ulFileOffset, ulCharPos;
	USHORT	usChar;
	size_t	tLen, tIndex, tNextFree, tStorageSize;
	char	szResult[6];
	BOOL	bSkip;

	fail(iWordVersion < 0);
	fail(tOptions.eConversionType == conversion_unknown);
	fail(tOptions.eEncoding == 0);

	if (ulCharPosStart == ulCharPosNext) {
		/* There are no bytes to decrypt */
		return NULL;
	}

	if (tOptions.eConversionType != conversion_xml) {
		/* Only implemented for XML output */
		return NULL;
	}

	bSkip = FALSE;

	/* Initialise the text buffer */
	tStorageSize = INITIAL_SIZE;
	szText = xmalloc(tStorageSize);
	tNextFree = 0;
	szText[tNextFree] = '\0';

	/* Goto the start */
	usChar = usToFootnotePosition(pFile, ulCharPosStart);
	ulCharPos = ulCharPosStart;
	ulFileOffset = ulCharPos2FileOffset(ulCharPos);
	/* Skip the unwanted starting characters */
	while (usChar != (USHORT)EOF && ulCharPos != ulCharPosNext &&
	       (usChar == FOOTNOTE_OR_ENDNOTE ||
		usChar == PAR_END ||
		usChar == TAB ||
		usChar == (USHORT)' ')) {
		usChar = usNextChar(pFile, footnote_list,
					&ulFileOffset, &ulCharPos, NULL);
	}
	/* Process the footnote text */
	while (usChar != (USHORT)EOF && ulCharPos != ulCharPosNext) {
		/* Skip embedded characters */
		if (usChar == START_EMBEDDED) {
			bSkip = TRUE;
		} else if (usChar == END_IGNORE || usChar == END_EMBEDDED) {
			bSkip = FALSE;
		}
		/* Translate character */
		if (bSkip ||
		    usChar == END_IGNORE ||
		    usChar == END_EMBEDDED ||
		    usChar == FOOTNOTE_OR_ENDNOTE) {
			ulChar = IGNORE_CHARACTER;
		} else {
			ulChar = ulTranslateCharacters(usChar,
					ulFileOffset,
					iWordVersion,
					tOptions.eConversionType,
					tOptions.eEncoding,
					bOldMacFile);
		}
		/* Process character */
		if (ulChar == PICTURE) {
			tLen = 5;
			strcpy(szResult, "[pic]");
		} else if (ulChar == IGNORE_CHARACTER) {
			tLen = 0;
			szResult[0] = '\0';
		} else {
			switch (ulChar) {
			case PAR_END:
			case HARD_RETURN:
			case PAGE_BREAK:
			case COLUMN_FEED:
				ulChar = (ULONG)PAR_END;
				break;
			case TAB:
				ulChar = (ULONG)' ';
				break;
			default:
				break;
			}
			tLen = tUcs2Utf8(ulChar, szResult, sizeof(szResult));
		}
		/*
		 * Add the results to the text. Extend until it fits, like
		 * vStoreByte does, instead of relying on one step always
		 * being enough.
		 */
		while (tNextFree + tLen + 1 > tStorageSize) {
			tStorageSize += EXTENTION_SIZE;
			szText = xrealloc(szText, tStorageSize);
		}
		for (tIndex = 0; tIndex < tLen; tIndex++) {
			szText[tNextFree++] = szResult[tIndex];
		}
		szText[tNextFree] = '\0';
		/* Next character */
		usChar = usNextChar(pFile, footnote_list,
					&ulFileOffset, &ulCharPos, NULL);
	}
	/* Remove redundant spaces */
	while (tNextFree != 0 && szText[tNextFree - 1] == ' ') {
		szText[tNextFree - 1] = '\0';
		tNextFree--;
	}
	if (tNextFree == 0) {
		/* No text */
		szText = xfree(szText);
		return NULL;
	}
	return szText;
} /* end of szFootnoteDecryptor */