|
|
/*************************************************************************
* * * BREAKER.C * * * * Copyright (C) Microsoft Corporation 1990-1994 * * All Rights reserved. * * * ************************************************************************** * * * Module Intent * * Word breaker module * * This module provides word-breaking routines applicable to the ANSI * * character-set. This means American English. * * Note that ANSI does not mean ASCII. * * * * WARNING: Tab setting is 4 for this file * * * ************************************************************************** * * * Current Owner: BinhN * * * ************************************************************************** * * * Released by Development: (date) * * * *************************************************************************/ #include <verstamp.h>
SETVERSIONSTAMP(MVBK);
#include <mvopsys.h>
#include <iterror.h>
#include <mvsearch.h>
#include "common.h"
/*
 * Macros to access the members of one character-map entry.
 *
 *  CP_CLASS(p) - low 8 bits of the entry's Class member (the CLASS_xxx
 *                value the breaker switches on)
 *  CP_NORMC(p) - the entry's Norm member (the normalized character)
 *
 * The argument is fully parenthesized before the cast so that an
 * expression argument (e.g. "base + n") is converted as a whole.
 */
#define CP_CLASS(p) (((LPCMAP)(p))->Class & 0xff)
#define CP_NORMC(p) (((LPCMAP)(p))->Norm)
/*************************************************************************
 *
 *                    INTERNAL PRIVATE FUNCTIONS
 *      All of them should be declared near
 *************************************************************************/
PRIVATE ERR NEAR PASCAL WordBreakStem(LPBRK_PARMS lpBrkParms, WORD fStem);
PRIVATE int PASCAL NEAR LigatureMap(BYTE c, LPB lpbNormWord,
    LPCMAP lpCharPropTab, LPB lpbLigatureTab, WORD wcLigature);
/*************************************************************************
 *
 *          SINGLE TO DOUBLE-WIDTH KATAKANA MAPPING ARRAY
 *
 *  Entry i describes input byte (161 + i); the table therefore covers
 *  the input bytes 161..223 (63 entries).  Element [0] of an entry is
 *  stored as the first byte of the double-width character and element
 *  [1] as its second byte.
 *************************************************************************/
static const int mtable[][2] = {
    /* 161..167 */
    {129,  66}, {129, 117}, {129, 118}, {129,  65}, {129,  69}, {131, 146},
    {131,  64},
    /* 168..174 */
    {131,  66}, {131,  68}, {131,  70}, {131,  72}, {131, 131}, {131, 133},
    {131, 135},
    /* 175..181 */
    {131,  98}, {129,  91}, {131,  65}, {131,  67}, {131,  69}, {131,  71},
    {131,  73},
    /* 182..188 */
    {131,  74}, {131,  76}, {131,  78}, {131,  80}, {131,  82}, {131,  84},
    {131,  86},
    /* 189..195 */
    {131,  88}, {131,  90}, {131,  92}, {131,  94}, {131,  96}, {131,  99},
    {131, 101},
    /* 196..201 */
    {131, 103}, {131, 105}, {131, 106}, {131, 107}, {131, 108}, {131, 109},
    /* 202..207 */
    {131, 110}, {131, 113}, {131, 116}, {131, 119}, {131, 122}, {131, 125},
    /* 208..213 */
    {131, 126}, {131, 128}, {131, 129}, {131, 130}, {131, 132}, {131, 134},
    /* 214..219 */
    {131, 136}, {131, 137}, {131, 138}, {131, 139}, {131, 140}, {131, 141},
    /* 220..223 */
    {131, 143}, {131, 147}, {129,  74}, {129,  75}
};
/*************************************************************************
 *  @doc    API INDEX RETRIEVAL
 *
 *  @func   LPIBI FAR PASCAL | BreakerInitiate |
 *      Allocates a breaker parameter block. This parameter block keeps
 *      track of the breaker's "global" variables. The block is allocated
 *      zero-filled (GMEM_ZEROINIT), so every member other than the
 *      handle stored below starts out as zero.
 *
 *  @rdesc  NULL if the call fails (the block could not be allocated or
 *      could not be locked);
 *      a pointer to the block if it succeeds. Release it with
 *      BreakerFree().
 *************************************************************************/
PUBLIC LPIBI EXPORT_API FAR PASCAL BreakerInitiate(void)
{
    _LPIBI lpibi;
    HANDLE hibi;

    hibi = GlobalAlloc(GMEM_MOVEABLE | GMEM_ZEROINIT, sizeof(IBI));
    if (hibi == NULL)
        return NULL;

    //
    //  Locking can fail too; don't write through a NULL pointer, and
    //  don't leak the handle in that case.
    //
    lpibi = (_LPIBI)GlobalLock(hibi);
    if (lpibi == NULL) {
        GlobalFree(hibi);
        return NULL;
    }

    //
    //  Remember the handle so that BreakerFree can unlock and free
    //  the block. All other members rely on the zero-initialization.
    //
    lpibi->hibi = hibi;
    return lpibi;
}
/*************************************************************************
 *  @doc    API INDEX RETRIEVAL
 *
 *  @func   void FAR PASCAL | BreakerFree |
 *      Frees a word-breaker parameter block: unlocks and then frees
 *      the global handle stored in the block.
 *
 *  @parm   LPIBI | lpibi |
 *      Pointer to the InternalBreakInfo structure containing all the
 *      information about the breaker's state. A NULL pointer is
 *      accepted and ignored.
 *************************************************************************/
PUBLIC void EXPORT_API FAR PASCAL BreakerFree(_LPIBI lpibi)
{
    if (lpibi != NULL) {
        /* Fetch the handle first: the block it lives in is about to go */
        HANDLE hBlock = lpibi->hibi;

        GlobalUnlock(hBlock);
        GlobalFree(hBlock);
    }
}
// - - - - - - - - -
// Break words out from a block of standard text characters.
//
// This routine is incredibly important. Any change in the performance
// of this function will have immediate and obvious influence upon the
// performance of the indexing system as a whole. Consequently, the
// function should be very fast.
//
// This function uses a simple state machine to try to achieve the
// necessary speed. It's in a different loop depending upon what kind
// of characters it's trying to find, and it uses "goto" statements to
// shift back and forth between "states".
//
/*************************************************************************
 *  @doc    API RETRIEVAL INDEX
 *
 *  @func   ERR | FBreakWords |
 *      This function breaks a string into a sequence of words.
 *
 *  @parm   LPBRK_PARMS | lpBrkParms |
 *      Pointer to structure containing all the parameters needed for
 *      the breaker. They include:
 *      1/ Pointer to the InternalBreakInfo
 *      2/ Pointer to input buffer containing the word stream
 *      3/ Size of the input buffer
 *      4/ Offset in the source text of the first byte of the input buffer
 *      5/ Pointer to user's parameter block for the user's function
 *      6/ User's function to call with words. The format of the call
 *         should be (*lpfnfOutWord)(BYTE *RawWord, BYTE *NormWord,
 *         LCB lcb, LPV lpvUser)
 *         The function should return S_OK if succeeded
 *         The function can be NULL
 *      7/ Pointer to stop word table. This table contains stop words
 *         specific to this breaker. If this is non-null, then the
 *         function will flag errors for stop word present in the query
 *      8/ Pointer to character table. If NULL, then the default built-in
 *         character table will be used
 *
 *  @rdesc
 *      The function returns S_OK if succeeded. The failure's causes
 *      are:
 *  @flag   E_WORDTOOLONG | Word too long
 *  @flag   errors | returned by the lpfnfOutWord
 *************************************************************************/
PUBLIC ERR EXPORT_API FAR PASCAL FBreakWords(LPBRK_PARMS lpBrkParms)
{
    /* Plain word breaking: the stemming flag is off */
    ERR errBreak = WordBreakStem(lpBrkParms, FALSE);

    return errBreak;
}
#if 0
/*************************************************************************
 *  @doc    API RETRIEVAL INDEX
 *
 *  @func   ERR | FBreakAndStemWords |
 *      This function breaks a string into a sequence of words and
 *      stems each resulting word. (Currently compiled out.)
 *
 *  @parm   LPBRK_PARMS | lpBrkParms |
 *      Pointer to structure containing all the parameters needed for
 *      the breaker. They include:
 *      1/ Pointer to the InternalBreakInfo
 *      2/ Pointer to input buffer containing the word stream
 *      3/ Size of the input buffer
 *      4/ Offset in the source text of the first byte of the input buffer
 *      5/ Pointer to user's parameter block for the user's function
 *      6/ User's function to call with words. The format of the call
 *         should be (*lpfnfOutWord)(BYTE *RawWord, BYTE *NormWord,
 *         LCB lcb, LPV lpvUser)
 *         The function should return S_OK if succeeded
 *         The function can be NULL
 *      7/ Pointer to stop word table. This table contains stop words
 *         specific to this breaker. If this is non-null, then the
 *         function will flag errors for stop word present in the query
 *      8/ Pointer to character table. If NULL, then the default built-in
 *         character table will be used
 *
 *  @rdesc
 *      The function returns S_OK if succeeded. The failure's causes
 *      are:
 *  @flag   E_WORDTOOLONG | Word too long
 *  @flag   Other errors | returned by the lpfnfOutWord
 *************************************************************************/
PUBLIC ERR EXPORT_API FAR PASCAL FBreakAndStemWords(LPBRK_PARMS lpBrkParms)
{
    /* Word breaking with the stemming flag on */
    ERR errBreak = WordBreakStem(lpBrkParms, TRUE);

    return errBreak;
}
#endif
/*
 *  BreakerVersion
 *      Returns CHARTABVER.
 */
PUBLIC ERR EXPORT_API FAR PASCAL BreakerVersion(void)
{
    ERR errVersion = CHARTABVER;

    return errVersion;
}
/*
 *  FBreakStems
 *      Stub that exists only so that MVJK can link statically: the
 *      static build needs the same function names. It does no work and
 *      always returns E_NOTSUPPORTED.
 */
PUBLIC ERR FAR PASCAL FBreakStems(LPBRK_PARMS lpBrkParms)
{
    (void)lpBrkParms;       /* intentionally unused */

    return E_NOTSUPPORTED;
}
/*
 *  FSelectWord
 *      Stub that exists only so that MVJK can link statically: the
 *      static build needs the same function names. None of the
 *      arguments is examined or written; the function always returns
 *      E_NOTSUPPORTED.
 */
PUBLIC ERR FAR PASCAL FSelectWord(LPCSTR pBuffer, DWORD dwCount,
    DWORD dwOffset, LPDWORD pStart, LPDWORD pEnd)
{
    /* intentionally unused */
    (void)pBuffer;
    (void)dwCount;
    (void)dwOffset;
    (void)pStart;
    (void)pEnd;

    return E_NOTSUPPORTED;
}
/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   ERR | WordBreakStem |
 *      This function breaks a buffer into a sequence of words and hands
 *      each word (raw and normalized form, both prefixed by a 2-byte
 *      length) to the user's callback. It is a goto-driven state
 *      machine; the state is saved in the InternalBreakInfo block when
 *      the input runs out, so that a word may span several calls.
 *      Calling it with a NULL input buffer flushes whatever word is
 *      pending.
 *
 *  @parm   LPBRK_PARMS | lpBrkParms |
 *      Breaker parameters (internal break info, input buffer, size,
 *      offset, user data, callback, stop word info, character table)
 *
 *  @parm   WORD | fStem |
 *      If set, stem the word. The stemming code is currently compiled
 *      out (#if 0), so the flag has no effect.
 *
 *  @rdesc
 *      The function returns S_OK if succeeded. The failure's causes
 *      are:
 *  @flag   E_INVALIDARG | NULL parameter block, NULL internal break
 *      info, or an input buffer supplied without a character table
 *  @flag   E_WORDTOOLONG | Word too long
 *  @flag   Other errors | returned by the lpfnfOutWord
 *
 *  @comm   NOTE(review): only the raw word buffer is limit-checked.
 *      A ligature may add two normalized bytes for one raw byte; this
 *      assumes the normalized buffer is large enough for that - confirm
 *      against the IBI declaration.
 *************************************************************************/
PRIVATE ERR NEAR PASCAL WordBreakStem(LPBRK_PARMS lpBrkParms, WORD fStem)
{
    register LPB lpbRawWord;    // Write cursor in the RawWord buffer
    register LPB lpbNormWord;   // Write cursor in the NormWord buffer
    LPCMAP  lpCharPropTab;      // Property entry of the current char
    LPB     lpbInBuffer;        // Buffer to groot through.
    LPB     lpbRawWordLimit;    // Limit of RawWord buffer
#if 0
    LPB     lpbNormWordLimit;   // Limit of NormWord buffer
#endif
    BYTE    bCurChar;           // Current character.
    BYTE    fScan = TRUE;       // FALSE when only flushing
    ERR     fRet;
#if 0
    BYTE    astStemmed[CB_MAX_WORD_LEN + 2]; // Temporary buffer for stemming
#endif
    LPB     lpbLigature = NULL;
    WORD    wcLigature = 0;
    LPCHARTAB lpCharTab;
    LPB     astNormWord;
    LPB     astRawWord;
    BYTE    fAcceptWildCard;

    /* Breakers parameters break out */

    _LPIBI  lpibi;
    LPB     lpbInBuf;
    CB      cbInBufSize;
    LCB     lcbInBufOffset;
    LPV     lpvUser;
    FWORDCB lpfnfOutWord;
    _LPSIPB lpsipb;
    LPCMAP  lpCMap = NULL;

    /*
     *  Initialize variables
     */

    if (lpBrkParms == NULL ||
        (lpibi = lpBrkParms->lpInternalBreakInfo) == NULL)
        return E_INVALIDARG;

    astNormWord = (LPB)lpibi->astNormWord;
    astRawWord = (LPB)lpibi->astRawWord;

    lpbInBuf = lpBrkParms->lpbBuf;
    lpvUser = lpBrkParms->lpvUser;
    lpfnfOutWord = lpBrkParms->lpfnOutWord;
    lpsipb = lpBrkParms->lpStopInfoBlock;
    fAcceptWildCard = (BYTE)(lpBrkParms->fFlags & ACCEPT_WILDCARD);

    /*
     *  Restore to the proper state.  This is in place to handle
     *  words that cross block boundaries, and to deal with explicit
     *  buffer-flush commands.
     */
    if ((lpbInBuffer = lpbInBuf) != NULL) {

        cbInBufSize = lpBrkParms->cbBufCount;
        lcbInBufOffset = lpBrkParms->lcbBufOffset;

        if ((lpCharTab = lpBrkParms->lpCharTab) != NULL) {
            lpCMap = (LPCMAP)(lpCharTab->lpCMapTab);
            lpbLigature = lpCharTab->lpLigature;
            wcLigature = lpCharTab->wcLigature;
        }
        else {
            return (E_INVALIDARG);
        }

        lpbRawWordLimit = (LPB)&astRawWord[CB_MAX_WORD_LEN];

        //
        //  When resuming inside a word the write cursors are rebuilt
        //  from the lengths that were saved in the buffers' prefixes.
        //
        switch (lpibi->state) {
            case SCAN_WHITE_STATE:
                goto ScanWhite;     // Running through white space.

            case SCAN_WORD_STATE:
                lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord) + 2];
                lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord) + 2];
                goto ScanWord;      // Found one 'a'..'z', collecting.

            case SCAN_NUM_STATE:
                lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord) + 2];
                lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord) + 2];
                goto ScanNumber;    // Found one '0'..'9', collecting.

            case SCAN_SEP_STATE:
                //
                //  The previous buffer ended right after a possible
                //  digit separator. Resume the decision with the
                //  first character of this buffer.
                //
                lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord) + 2];
                lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord) + 2];
                goto ScanSeparator;

            case SCAN_LEADBYTE_STATE:
                lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord) + 2];
                lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord) + 2];
                goto ScanLeadByte;

            case SCAN_SBKANA_STATE:
                lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord) + 2];
                lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord) + 2];
                goto ScanSbKana;
        }
    }
    else {
        //
        //  No input: this is a flush request. The saved lengths are
        //  used as they are (fScan == 0 disables recalculation).
        //
        cbInBufSize = fScan = 0;
        switch (lpibi->state) {
            case SCAN_WHITE_STATE:
                return S_OK;        // Still stuck in white space.

            case SCAN_WORD_STATE:
                goto FlushWord;     // Flush a word.

            case SCAN_NUM_STATE:
                goto FlushNumber;   // Flush a number.

            case SCAN_SEP_STATE:
                //
                //  No digit follows the pending separator, so it was
                //  a plain word separator: drop it from the saved raw
                //  word, then flush the number.
                //
                *(LPW)astRawWord = (WORD)(GETWORD(astRawWord) - 1);
                lpibi->cbRawPunctLen--;
                goto FlushNumber;

            case SCAN_LEADBYTE_STATE:
                goto ScanLeadByte;

            case SCAN_SBKANA_STATE:
                goto ScanSbKana;
        }
    }

    //
    //      W H I T E - S P A C E   S T A T E
    //
    //  While in this state the code is hunting through white-space,
    //  searching for an alpha character or a digit character.  If
    //  it finds one, it initializes the word and goes to either the
    //  word-collection state or the number-collection state.
    //
ScanWhite:
    for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
        //
        //  Get the character and its class. The character itself is
        //  not consumed here; the target state processes it.
        //
        switch (CP_CLASS(&lpCMap[*lpbInBuffer])) {
            case CLASS_WILDCARD:
                if (fAcceptWildCard == FALSE)
                    continue;
                /* fall through: an accepted wildcard starts a word */
            case CLASS_TYPE:        // Found the 1st byte of the special string
            case CLASS_CHAR:        // Found a non-normalized char
            case CLASS_NORM:        // Found a normalized character
            case CLASS_LIGATURE:    // Found a ligature
                // jump to the word-collection state.
                lpibi->lcb = (DWORD)(lcbInBufOffset +
                    (lpbInBuffer - lpbInBuf));
                lpbRawWord = (LPB)&astRawWord[2];
                lpbNormWord = (LPB)&astNormWord[2];
                goto ScanWord;

            case CLASS_DIGIT:       // Found a digit.
                lpibi->lcb = (DWORD)(lcbInBufOffset +
                    (lpbInBuffer - lpbInBuf));
                lpibi->cbNormPunctLen = lpibi->cbRawPunctLen = 0;
                lpbRawWord = (LPB)&astRawWord[2];
                lpbNormWord = (LPB)&astNormWord[2];
                goto ScanNumber;

            case CLASS_LEADBYTE:
                lpibi->lcb = (DWORD)(lcbInBufOffset +
                    (lpbInBuffer - lpbInBuf));
                lpbRawWord = (LPB)&astRawWord[2];
                lpbNormWord = (LPB)&astNormWord[2];
                *(LPW)astNormWord = *(LPW)astRawWord = 0;
                goto ScanLeadByte;

            case CLASS_SBKANA:
                lpibi->lcb = (DWORD)(lcbInBufOffset +
                    (lpbInBuffer - lpbInBuf));
                *(LPW)astNormWord = *(LPW)astRawWord = 0;
                lpbRawWord = (LPB)&astRawWord[2];
                lpbNormWord = (LPB)&astNormWord[2];
                goto ScanSbKana;
        }
    }
    //
    //  If I run out of data, set things up so I'll come back
    //  to this state if the user provides more data.
    //
    lpibi->state = SCAN_WHITE_STATE;
    return S_OK;

ScanWord:
    //
    //      W O R D   S T A T E
    //
    //  While in this state the code is attempting to append alpha
    //  and digit characters to the alpha character it's already
    //  found.  Apostrophes are stripped.
    //
    for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
        //
        //  Get the character and its class.
        //
        lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
        switch (CP_CLASS(lpCharPropTab)) {
            case CLASS_NORM:
            case CLASS_DIGIT:
            case CLASS_CHAR:
                //
                //  Found a normalized character or a digit.
                //  Append it to the output buffer.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                *lpbNormWord++ = CP_NORMC(&lpCMap[bCurChar]);
                break;

            case CLASS_LIGATURE:
                //
                //  Found a ligature character.  Normalize
                //  it and append it to the output buffer.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                lpbNormWord += LigatureMap(bCurChar, lpbNormWord, lpCMap,
                    lpbLigature, wcLigature);
                break;

            case CLASS_STRIP:
                //
                //  Found an apostrophe or somesuch.  Ignore
                //  this character, but increment the word length,
                //  since it counts as part of the un-normalized
                //  word's length.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                break;

            case CLASS_TYPE:
                /* Set the flag to remind us to get the
                   second byte. The append is limit-checked like
                   every other append in this state. */
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                lpibi->fGotType = TRUE;
                *lpbRawWord++ = *lpbNormWord++ = bCurChar;
                break;

            case CLASS_WILDCARD:
                //
                //  Found a wildcard character
                //  Append it to the output buffer if we accept wildcard
                //
                if (fAcceptWildCard) {
                    if (lpbRawWord >= lpbRawWordLimit)
                        return (E_WORDTOOLONG);
                    *lpbRawWord++ = bCurChar;
                    *lpbNormWord++ = bCurChar;
                    break;
                }
                /* fall through: an unaccepted wildcard ends the word */

            default:
                if (lpibi->fGotType == TRUE) {
                    lpibi->fGotType = FALSE;

                    /* Found the 2nd byte of a special type
                       Append it to the output buffer. */
                    if (lpbRawWord >= lpbRawWordLimit)
                        return (E_WORDTOOLONG);
                    *lpbRawWord++ = *lpbNormWord++ = bCurChar;
                    break;
                }
                //
                //  Found something weird, or I have been ordered
                //  to flush the output buffer.  Flush the output
                //  buffer and go back to the "grooting through
                //  white space" state (#0).
                //
FlushWord:
                if (fScan) {
                    /* Recalculate the length only if scanning */
                    *(LPW)astRawWord = (WORD)(lpbRawWord -
                        (LPB)&astRawWord[2]);
                    *(LPW)astNormWord = (WORD)(lpbNormWord -
                        (LPB)&astNormWord[2]);
                }

                /* Check for stop word if required */
                if (lpsipb) {
                    if (lpsipb->lpfnStopListLookup(lpsipb,
                        astNormWord) == S_OK) {
                        goto ScanWhite;     // Ignore stop words
                    }
                }
#if 0
                if (fStem) {
                    /* Do stemming if requested */
                    if (FStem(astStemmed, astNormWord) == S_OK) {
                        MEMCPY(astNormWord, astStemmed,
                            GETWORD(astStemmed) + sizeof(WORD));
                    }
                }
#endif

                /* Execute user's function */
                if (*lpfnfOutWord && (fRet = (*lpfnfOutWord)(astRawWord,
                    lpibi->astNormWord, lpibi->lcb, lpvUser)) != S_OK)
                    return fRet;
                goto ScanWhite;
        }
    }
    //
    //  If I run out of data, set things up so I'll come back
    //  to this state if the user provides more data.  If they
    //  just want me to flush, I come back to the "flush a
    //  word" state (#1f), since at this time I already have
    //  a valid word, since I got an alpha-char in state #0,
    //  and may have gotten more since.
    //
    lpibi->state = SCAN_WORD_STATE;
    *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
    *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
    return S_OK;

ScanLeadByte:
    //
    //      L E A D - B Y T E   S T A T E
    //
    //  A double-byte character (lead byte + trail byte) is emitted
    //  as a word of its own. The word length doubles as the progress
    //  marker: 0 = nothing yet, 1 = lead byte stored.
    //
    if (!cbInBufSize) {
        // no character - we may have lost a DBC
        //
        lpibi->state = SCAN_WHITE_STATE;
        *(LPW)astNormWord = *(LPW)astRawWord = 0;
        return S_OK;
    }

    if (!GETWORD(astNormWord)) {
        // process lead byte; the raw word gets the same byte so that
        // its contents agree with the length reported for it
        //
        *(LPW)astNormWord = *(LPW)astRawWord = 1;
        astRawWord[2] = astNormWord[2] = *lpbInBuffer++;
        --cbInBufSize;
    }

    if (!cbInBufSize) {
        // no more characters - set up state so we come back to get
        // trail byte.
        //
        lpibi->state = SCAN_LEADBYTE_STATE;
        return S_OK;
    }

    // process trail byte
    //
    *(LPW)astNormWord = *(LPW)astRawWord = 2;
    astRawWord[3] = astNormWord[3] = *lpbInBuffer++;
    --cbInBufSize;

    // flush the DBC
    //
    if (*lpfnfOutWord && (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
        lpibi->lcb, lpvUser)) != S_OK)
        return fRet;

    if (!cbInBufSize) {
        // no more characters - we have already flushed our DBC so we
        // will just set the state back to scanning for white space.
        //
        lpibi->state = SCAN_WHITE_STATE;
        return S_OK;
    }

    // all done - go back to scanning white space.
    //
    goto ScanWhite;

ScanSbKana:
    //
    //      S I N G L E - B Y T E   K A T A K A N A   S T A T E
    //
    if (!cbInBufSize) {
        // Buffer is empty.  Flush the buffer if we are holding a
        // character.
        //
        if (GETWORD(astNormWord)) {
            if (*lpfnfOutWord && (fRet = (*lpfnfOutWord)(astRawWord,
                astNormWord, lpibi->lcb, lpvUser)) != S_OK)
                return fRet;
        }

        lpibi->state = SCAN_WHITE_STATE;
        *(LPW)astNormWord = *(LPW)astRawWord = 0;
        return S_OK;
    }

    // Note: The basic algorithm (including the mapping table) used here
    //       to convert half-width Katakana characters to full-width
    //       Katakana appears in the book "Understanding Japanese
    //       Information Processing" (O'Reilly & Associates).

    // If the RawWord buffer is empty then we will process this as a first
    // character (we are not looking for a diacritic mark).
    //
    if (!GETWORD(astRawWord)) {
        // Verify that we have a half-width Katakana character.  This
        // check is a good safeguard against erroneous information in a
        // user defined charmap.
        //
        if (*lpbInBuffer >= 161 && *lpbInBuffer <= 223) {
            // We have a half-width Katakana character. Now compute the
            // equivalent full-width character via the mapping table.
            //
            astNormWord[2] = (BYTE)(mtable[*lpbInBuffer - 161][0]);
            astNormWord[3] = (BYTE)(mtable[*lpbInBuffer - 161][1]);
            *(LPW)astNormWord = 2;
        }
        else {
            // This is an error condition.  For some reason the charmap
            // has *lpbInBuffer tagged as CLASS_SBKANA when in fact it's
            // not a single byte Katakana character.  This is probably
            // the result of an improperly formed user defined charmap.
            //
            // Since there's no way to determine the real class of this
            // character we will send it to the bit bucket.
            //
            lpbInBuffer++;
            cbInBufSize--;
            *(LPW)astNormWord = *(LPW)astRawWord = 0;
            lpibi->state = SCAN_WHITE_STATE;
            goto ScanWhite;
        }
        *(LPW)astRawWord = 1;           // we have processed one character so far
        astRawWord[2] = *lpbInBuffer;   // we will need the original character later
        lpbInBuffer++;
        cbInBufSize--;
    }

    // Check if we have more characters in the buffer.
    //
    if (!cbInBufSize) {
        // Return because the buffer is empty.
        //
        lpibi->state = SCAN_SBKANA_STATE;
        return S_OK;
    }

    // check if the second character is nigori mark.
    //
    if (*lpbInBuffer == 222) {
        // see if we have a half-width katakana that can be modified by
        // nigori.  The original character was saved in astRawWord[2]
        // (bytes 0 and 1 hold the length).
        //
        if ((astRawWord[2] >= 182 && astRawWord[2] <= 196) ||
            (astRawWord[2] >= 202 && astRawWord[2] <= 206) ||
            (astRawWord[2] == 179)) {
            // transform kana into kana with nigori.  Only the second
            // byte of the full-width character (astNormWord[3])
            // changes; astNormWord[2] is its first byte.
            //
            if ((astNormWord[3] >= 74 && astNormWord[3] <= 103) ||
                (astNormWord[3] >= 110 && astNormWord[3] <= 122))
                astNormWord[3]++;
            else if (astNormWord[2] == 131 && astNormWord[3] == 69)
                astNormWord[3] = 148;

            // keep the mark in the raw word, set the word lengths and
            // advance the buffer.
            //
            astRawWord[3] = *lpbInBuffer;
            *(LPW)astNormWord = 2;
            *(LPW)astRawWord = 2;
            lpbInBuffer++;
            cbInBufSize--;
        }
    }

    // check if following character is maru mark
    //
    else if (*lpbInBuffer == 223) {
        // see if we have a half-width katakana that can be modified by
        // maru.
        //
        if ((astRawWord[2] >= 202 && astRawWord[2] <= 206)) {
            // transform kana into kana with maru
            //
            if (astNormWord[3] >= 110 && astNormWord[3] <= 122)
                astNormWord[3] += 2;

            // keep the mark in the raw word, set the word lengths and
            // advance the buffer.
            //
            astRawWord[3] = *lpbInBuffer;
            *(LPW)astNormWord = 2;
            *(LPW)astRawWord = 2;
            lpbInBuffer++;
            cbInBufSize--;
        }
    }

    // Note: If the character at *lpbInBuffer wasn't a diacritic mark,
    //       then it will be processed when ScanWhite is re-entered.
    //
    // Another note: The above code only combines diacritic marks with
    //       single-width Katakana characters that can be modified
    //       by these marks (not all can).  If we happen to encounter
    //       a situation where the diacritic can't be combined
    //       into the character, we let the character continue
    //       back to ScanWhite where it will be re-sent to
    //       ScanSbKana, however this time it will be a first
    //       character and be converted into its stand-alone
    //       full-width equivalent (maru and nigori have full-width
    //       character equivalents that contain just the mark).

    // flush the buffer
    //
    if (*lpfnfOutWord && (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
        lpibi->lcb, lpvUser)) != S_OK)
        return fRet;

    // reset word lengths and return to scanning for white space.
    //
    *(LPW)astNormWord = *(LPW)astRawWord = 0;
    lpibi->state = SCAN_WHITE_STATE;

    // Return if buffer is empty
    //
    if (!cbInBufSize)
        return S_OK;

    // all done - go back to scanning white space.
    //
    goto ScanWhite;

ScanNumber:
    //
    //      N U M B E R   S T A T E
    //
    //  While in this state the code is attempting to append alpha
    //  and digit characters to the digit character it's already
    //  found.  This state is more complex than the word grabbing
    //  state, because it deals with slashes and hyphens in a weird
    //  way.  They're allowed in a number unless they appear at the
    //  end.  Extra variables have to account for these conditions.
    //
    for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
        //
        //  Get the character and its class.
        //
        lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
        switch (CP_CLASS(lpCharPropTab)) {
            case CLASS_DIGIT:
            case CLASS_NORM:
            case CLASS_CHAR:
                //
                //  Found a normalized character or a digit.
                //  Append it to the output buffer. Any punctuation
                //  seen so far is no longer trailing.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                *lpbNormWord++ = CP_NORMC(&lpCMap[bCurChar]);
                lpibi->cbRawPunctLen = 0;
                lpibi->cbNormPunctLen = 0;
                break;

            case CLASS_LIGATURE:
                //
                //  Found a ligature character.  Normalize
                //  it and append it to the output buffer.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                lpbNormWord += LigatureMap(bCurChar, lpbNormWord, lpCMap,
                    lpbLigature, wcLigature);
                lpibi->cbRawPunctLen = 0;
                lpibi->cbNormPunctLen = 0;
                break;

            case CLASS_NKEEP:
                //
                //  Found a hyphen or a slash.  These are kept
                //  as part of the number unless they appear at
                //  the end of the number.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                *lpbNormWord++ = bCurChar;
                lpibi->cbRawPunctLen++;
                lpibi->cbNormPunctLen++;
                break;

            case CLASS_NSTRIP:
                //
                //  Found a comma or somesuch.  Ignore this
                //  character, but increment the word length,
                //  since it counts as part of the un-normalized
                //  number's length.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                lpibi->cbRawPunctLen++;
                break;

            case CLASS_CONTEXTNSTRIP:
                //
                //  Found special character used for number separator. This
                //  may be a space in French, ie. 100 000. The problem here
                //  is that we must differentiate it from a regular word
                //  separator. In the meantime, ignore this character, but
                //  increment the word length
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                lpibi->cbRawPunctLen++;
                cbInBufSize--;      // consume it here: we leave the loop
                lpbInBuffer++;
                goto ScanSeparator; // Found a "possible" separator

            case CLASS_WILDCARD:
                //
                //  Found a wildcard character
                //  Append it to the output buffer if we accept wildcard
                //
                if (fAcceptWildCard) {
                    if (lpbRawWord >= lpbRawWordLimit)
                        return (E_WORDTOOLONG);
                    *lpbRawWord++ = bCurChar;
                    *lpbNormWord++ = bCurChar;
                    break;
                }
                /* fall through: an unaccepted wildcard ends the number */

            default:
                //
                //  Found something weird, or I have been ordered
                //  to flush the output buffer.  Flush the output
                //  buffer and go back to the "grooting through
                //  white space" state (#0).
                //
                //  This is a little more complicated than the
                //  analogous routine for dealing with words.
                //  This has to deal with words that have some
                //  number of trailing punctuation characters.
                //  These need to be stripped from the word, and
                //  the un-normalized word length value needs to
                //  be adjusted as well.
                //
FlushNumber:
                if (fScan) {
                    /* Recalculate the length only if scanning */
                    *(LPW)astRawWord = (WORD)(lpbRawWord -
                        (LPB)&astRawWord[2] - lpibi->cbRawPunctLen);
                    *(LPW)astNormWord = (WORD)(lpbNormWord -
                        (LPB)&astNormWord[2] - lpibi->cbNormPunctLen);
                }

                /* Check for stop word if required */
                if (lpsipb) {
                    if (lpsipb->lpfnStopListLookup(lpsipb,
                        astNormWord) == S_OK) {
                        goto ScanWhite;     // Ignore stop words
                    }
                }

                if (*lpfnfOutWord && (fRet = (*lpfnfOutWord)(astRawWord,
                    astNormWord, lpibi->lcb, lpvUser)) != S_OK)
                    return fRet;
                goto ScanWhite;
        }
    }
    //
    //  If I run out of data, set things up so I'll come back
    //  to this state if the user provides more data.  If they
    //  just want me to flush, I come back to the "flush a
    //  number" state (#2f), since at this time I already have
    //  a valid word, since I got a digit-char in state #0,
    //  and may have gotten more since.
    //
    lpibi->state = SCAN_NUM_STATE;
    *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
    *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
    return S_OK;

ScanSeparator:
    //      S E P A R A T O R   S T A T E
    //
    //  This state deals with special character used to separate digits
    //  of numbers. Example:
    //      100 000     ' ' is used to separate the digits in French(??)
    //  In some sense, comma belongs to this class, when we
    //  deal with US numbers. Because of compatibility with Liljoe, they
    //  are set to be CLASS_NSTRIP. The rule to distinguish between
    //  a digit separator and a regular word separator is: If there is a
    //  digit that follows, then this is a digit separator, else it is
    //  a regular word separator
    //
    if (cbInBufSize) {
        //
        //  Get the character and its class.
        //
        lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
        if (CP_CLASS(lpCharPropTab) == CLASS_DIGIT) {

            /* The followed character is a digit, so this must be a digit
             * separator. Continue to get the number
             */
            goto ScanNumber;
        }
        else {
            /* Back out the change since this is a word separator */

            lpbRawWord--;
            *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
            lpibi->cbRawPunctLen--;
            goto FlushNumber;
        }
    }
    //
    //  If I run out of data, set things up so I'll come back
    //  to this state if the user provides more data.
    //
    lpibi->state = SCAN_SEP_STATE;
    *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
    *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
    return S_OK;
}
/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   int | LigatureMap |
 *      Writes the normalized form of a character to lpbNormWord. The
 *      ligature table is searched as a sequence of 3-byte entries:
 *      byte 0 is the character to match, bytes 1 and 2 are the two
 *      characters it stands for. On a match the normalized forms of
 *      both replacement characters are written; otherwise the
 *      normalized form of the character itself is written.
 *
 *  @parm   BYTE | c | Character to map
 *  @parm   LPB | lpbNormWord | Where the normalized bytes are written
 *  @parm   LPCMAP | lpCMap | Character map used for normalization
 *  @parm   LPB | lpbLigatureTab | Ligature table (3 bytes per entry)
 *  @parm   WORD | wcLigature | Number of entries in the ligature table
 *
 *  @rdesc  Number of bytes written: 2 if the character was found in
 *      the ligature table, 1 otherwise
 *************************************************************************/
PRIVATE int PASCAL NEAR LigatureMap(BYTE c, LPB lpbNormWord, LPCMAP lpCMap,
    LPB lpbLigatureTab, WORD wcLigature)
{
    LPB  lpbEntry = lpbLigatureTab;     /* Current 3-byte table entry */
    WORD iEntry;

    for (iEntry = 0; iEntry < wcLigature; iEntry++, lpbEntry += 3) {
        if (lpbEntry[0] != c)
            continue;

        /* Ligature found: emit both component characters */
        lpbNormWord[0] = CP_NORMC(&lpCMap[lpbEntry[1]]);
        lpbNormWord[1] = CP_NORMC(&lpCMap[lpbEntry[2]]);
        return 2;
    }

    /* Not a ligature */
    lpbNormWord[0] = CP_NORMC(&lpCMap[c]);
    return 1;
}
|