|
|
#include <windows.h>
#include "IWBrKr.h"
#include "DefBrKr.h"
#define ZERO_WIDTH_SPACE 0x200B
#define MAX_Def_WordBrKr_Prcess_Len 1000
//+-------------------------------------------------------------------------
//
// Function: IsWinNT
//
// Synopsis: Report whether the current platform id is
//           VER_PLATFORM_WIN32_NT.
//
// Returns:  TRUE when GetVersionExA succeeds and reports
//           VER_PLATFORM_WIN32_NT; FALSE otherwise (including when the
//           version query itself fails).
//
//--------------------------------------------------------------------------
BOOL IsWinNT(void)
{
    OSVERSIONINFOA osVersionInfo = { 0 };
    osVersionInfo.dwOSVersionInfoSize = sizeof(osVersionInfo);

    // A failed query leaves no meaningful platform id behind, so do not
    // inspect the structure in that case.
    if (!GetVersionExA(&osVersionInfo))
    {
        return FALSE;
    }

    return (osVersionInfo.dwPlatformId == VER_PLATFORM_WIN32_NT) ? TRUE : FALSE;
}
//+-------------------------------------------------------------------------
//
// Function: MyGetStringTypeEx
//
// Synopsis: Retrieve character-type information for a Unicode string.
//           When IsWinNT() is TRUE the call is forwarded to
//           GetStringTypeW. Otherwise the string is converted to the
//           ANSI code page and classified with GetStringTypeExA.
//
// Arguments: [LocalID]    -- locale passed to GetStringTypeExA (not used
//                            on the GetStringTypeW path)
//            [dwInfoType] -- CT_CTYPE* selector
//            [lpSrcStr]   -- Unicode source string
//            [cchSrc]     -- number of characters in lpSrcStr
//            [lpCharType] -- receives the type info; written with at
//                            most cchSrc entries on the ANSI path
//
// Returns:  TRUE on success. On the ANSI path a failure with
//           ERROR_INVALID_PARAMETER for CT_CTYPE1 / CT_CTYPE3 is turned
//           into success by marking every character as alphabetic
//           (CT_CTYPE1) or non-spacing alphabetic (CT_CTYPE3).
//
// Notes:    The ANSI result is produced per ANSI byte, so on multi-byte
//           code pages the entries copied back are only approximately
//           aligned with the wide characters.
//
//--------------------------------------------------------------------------
BOOL MyGetStringTypeEx(
    LCID LocalID,
    DWORD dwInfoType,
    const WCHAR *lpSrcStr, // unicode base
    INT cchSrc,
    LPWORD lpCharType)
{
    if (IsWinNT())
    {
        return GetStringTypeW(dwInfoType, lpSrcStr, cchSrc, lpCharType);
    }

    const UINT uCodePage = GetACP();

    // First pass: ask how many ANSI bytes the conversion needs.
    const DWORD dwANSISize = WideCharToMultiByte(uCodePage, WC_COMPOSITECHECK,
                                                 lpSrcStr, cchSrc,
                                                 NULL, 0, NULL, NULL);
    if (0 == dwANSISize)
    {
        return FALSE;
    }

    BOOL fRet = FALSE;
    LPSTR lpAnsiStr = new CHAR[dwANSISize];

    // Classify into a scratch array sized for the ANSI string. The byte
    // count can exceed cchSrc on multi-byte code pages, and the caller's
    // lpCharType is only known to hold cchSrc entries.
    LPWORD lpAnsiType = new WORD[dwANSISize];

    if (lpAnsiStr && lpAnsiType)
    {
        const DWORD dwConverted = WideCharToMultiByte(uCodePage, WC_COMPOSITECHECK,
                                                      lpSrcStr, cchSrc,
                                                      lpAnsiStr, dwANSISize,
                                                      NULL, NULL);

        fRet = GetStringTypeExA(LocalID, dwInfoType, lpAnsiStr, dwConverted, lpAnsiType);

        if (fRet)
        {
            // A negative cchSrc gives no output bound; keep the previous
            // behaviour of handing back one entry per ANSI byte.
            const DWORD cchOut = (cchSrc < 0) ? dwConverted : static_cast<DWORD>(cchSrc);
            for (DWORD i = 0; i < cchOut; ++i)
            {
                lpCharType[i] = (i < dwConverted) ? lpAnsiType[i] : static_cast<WORD>(0);
            }
        }
        else if (ERROR_INVALID_PARAMETER == GetLastError() &&
                 (CT_CTYPE1 == dwInfoType || CT_CTYPE3 == dwInfoType))
        {
            // Only consult the last error after a real failure; a stale
            // error code must not overwrite a successful classification.
            const WORD wFallback = (CT_CTYPE1 == dwInfoType)
                                       ? static_cast<WORD>(C1_ALPHA)
                                       : static_cast<WORD>(C3_NONSPACING | C3_ALPHA);
            for (INT i = 0; i < cchSrc; ++i)
            {
                lpCharType[i] = wFallback;
            }
            fRet = TRUE;
        }
    }

    delete [] lpAnsiType;
    delete [] lpAnsiStr;

    return fRet;
}
//+-------------------------------------------------------------------------
//
// Member: CDefWordBreaker::CDefWordBreaker
//
// Synopsis: Set ccCompare, the per-pass chunk size limit, to
//           MAX_Def_WordBrKr_Prcess_Len.
//
//--------------------------------------------------------------------------
CDefWordBreaker::CDefWordBreaker()
{
    ccCompare = MAX_Def_WordBrKr_Prcess_Len;
}

//+-------------------------------------------------------------------------
//
// Method: CDefWordBreaker::IsWordChar
//
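// Synopsis: Find whether the i'th character in the buffer pwcChunk
// is a word character (rather than word break)
//
// Arguments: [i] -- index into pwcChunk
// [_aCharInfo1] -- CT_CTYPE1 info for the characters of pwcChunk
// [_aCharInfo3] -- CT_CTYPE3 info for the characters of pwcChunk;
// entry i+1 is also read when character i is a non-breaking space
// [pwcChunk] -- the characters being classified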
//
// History: 22-Jul-1994 BartoszM Created
//
//--------------------------------------------------------------------------
inline BOOL CDefWordBreaker::IsWordChar(
    int i,
    PWORD _aCharInfo1,
    PWORD _aCharInfo3,
    const WCHAR* pwcChunk) const
{
    // Letters and digits always belong to a word.
    if (_aCharInfo1[i] & (C1_ALPHA | C1_DIGIT))
    {
        return TRUE;
    }

    // So do non-spacing marks.
    if (_aCharInfo3[i] & C3_NONSPACING)
    {
        return TRUE;
    }

    switch (pwcChunk[i])
    {
    case L'_':
        return TRUE;

    case 0xa0: // non breaking space
        // Counts as a word character only when followed by a
        // non-spacing character (looking ahead is okay).
        return (_aCharInfo3[i + 1] & C3_NONSPACING) ? TRUE : FALSE;

    default:
        return FALSE;
    }
}
//+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::ScanChunk
//
// Synopsis: For each character find its type
//
//
// History: 16-Aug-94 BartoszM Created
//
//----------------------------------------------------------------------------
BOOL CDefWordBreaker::ScanChunk(
    PWORD _aCharInfo1,
    PWORD _aCharInfo3,
    const WCHAR *pwcChunk,
    ULONG ucwc)
{
    // POSIX character typing -> _aCharInfo1
    if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE1, pwcChunk, ucwc, _aCharInfo1))
    {
        return FALSE;
    }

    // Additional POSIX typing -> _aCharInfo3 (only attempted when the
    // first classification succeeded)
    if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE3, pwcChunk, ucwc, _aCharInfo3))
    {
        return FALSE;
    }

    return TRUE;
}
/*
BOOL CDefWordBreaker::ScanChunk( PWORD _aCharInfo1, PWORD _aCharInfo3, const WCHAR *pwcChunk, ULONG ucwc) {
//
// GetStringTypeW is returning error 87 (ERROR_INVALID_PARAMETER) if
// we pass in a null string.
//
// Win4Assert( (0 != _cMapped) && (0 != _pwcChunk) );
if (IsWinNT()) { if (!MyGetStringTypeEx(0, // Dummy
CT_CTYPE1, // POSIX character typing
pwcChunk, // Source
ucwc, // Size of source
_aCharInfo1 ) ) // Character info
{ return FALSE; }
if ( !MyGetStringTypeEx(0, // Dummy
CT_CTYPE3, // Additional POSIX
pwcChunk, // Source
ucwc, // Size of source
_aCharInfo3 ) ) // Character info 3
{ return FALSE; } } else { //
// BUGBUG: This is all wrong -- we don't know if this is the right
// locale to use and there isn't a way to know at this point.
//
if (!MyGetStringTypeEx( GetSystemDefaultLCID(), CT_CTYPE1, // POSIX character typing
pwcChunk, // Source
ucwc, // Size of source
_aCharInfo1 ) ) // Character info
{ // ciDebugOut(( DEB_ERROR, "GetStringTypeW returned %d\n",
// GetLastError() ));
// Win9x just stinks. No 2 ways about it.
if ( ERROR_INVALID_PARAMETER == GetLastError() ) { for ( unsigned i = 0; i < ucwc; i++ ) _aCharInfo1[i] = C1_ALPHA;
return TRUE; }
return FALSE; }
if ( !MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE3, // Additional POSIX
pwcChunk, // Source
ucwc, // Size of source
_aCharInfo3 ) ) // Character info 3
{ // ciDebugOut(( DEB_ERROR, "GetStringTypeW CTYPE3 returned %d\n",
// GetLastError() ));
// Win9x just stinks. No 2 ways about it.
if ( ERROR_INVALID_PARAMETER == GetLastError() ) { for ( unsigned i = 0; i < ucwc; i++ ) _aCharInfo3[i] = ( C3_NONSPACING | C3_ALPHA );
return TRUE; }
return FALSE; } }
return TRUE; } //ScanChunk
*/ //+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::BreakText
//
// Synopsis: Break input stream into words.
//
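// Arguments: [pTextSource] - source of input buffers
// [pWordSink] - sink for words
// [pPhraseSink] - sink for noun phrases (not used by this implementation)
// [dwBase] - offset added to every source position reported to pWordSink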
//
// History: 07-June-91 t-WadeR Created
// 12-Oct-92 AmyA Added Unicode support
// 18-Nov-92 AmyA Overloaded
// 11-Apr-94 KyleP Sync with spec
// 26-Aug-94 BartoszM Fixed Unicode parsing
//
//----------------------------------------------------------------------------
SCODE CDefWordBreaker::BreakText(
    TEXT_SOURCE *pTextSource,
    IWordSink *pWordSink,
    IPhraseSink *pPhraseSink,
    DWORD dwBase)
{
    if ( 0 == pTextSource )
        return E_INVALIDARG;

    if ( 0 == pWordSink || pTextSource->iCur == pTextSource->iEnd )
        return S_OK;

    if ( pTextSource->iCur > pTextSource->iEnd )
    {
        // Win4Assert ( !"BreakText called with bad TEXT_SOURCE" );
        return E_FAIL;
    }

    SCODE sc = S_OK;
    LPWORD _aCharInfo1 = NULL;
    LPWORD _aCharInfo3 = NULL;
    ULONG cwc = 0;
    ULONG cwcProcd = 0;     // # chars actually processed by Tokenize()

    do
    {
        //
        // Flag for first time thru loop below. This is to fix the case
        // where the length of the buffer passed in is less than
        // ccCompare. In this case iEnd-iCur is < ccCompare and we would
        // break out of the inner loop and call pfnFillTextBuffer without
        // having processed any characters, so pfnFillTextBuffer returns
        // success without adding any new characters and this results in
        // an infinite loop.
        //
        BOOL fFirstTime = TRUE;

        while ( pTextSource->iCur < pTextSource->iEnd )
        {
            cwc = pTextSource->iEnd - pTextSource->iCur;

            // Process in buckets of ccCompare only
            if ( cwc >= CDefWordBreaker::ccCompare )
            {
                cwc = CDefWordBreaker::ccCompare;
            }
            else if ( !fFirstTime )
            {
                break;
            }

            delete [] _aCharInfo1;
            delete [] _aCharInfo3;
            _aCharInfo1 = new WORD[cwc + 1];
            _aCharInfo3 = new WORD[cwc + 1];
            if ( !_aCharInfo1 || !_aCharInfo3 )
            {
                sc = E_OUTOFMEMORY;
                break;
            }

            // ScanChunk fills only cwc entries, but IsWordChar looks one
            // entry ahead for a non-breaking space; give that extra slot
            // a defined value.
            _aCharInfo1[cwc] = 0;
            _aCharInfo3[cwc] = 0;

            // Tokenize leaves cwcProcd untouched when it cannot classify
            // the chunk, and otherwise always reports progress for a
            // non-empty chunk, so 0 identifies the failure.
            cwcProcd = 0;
            Tokenize( pTextSource, cwc, pWordSink, cwcProcd,
                      _aCharInfo1, _aCharInfo3, dwBase );
            if ( 0 == cwcProcd )
            {
                sc = E_FAIL;
                break;
            }

            // Win4Assert( cwcProcd <= cwc );
            pTextSource->iCur += cwcProcd;
            fFirstTime = FALSE;
        }
    } while ( SUCCEEDED(sc) &&
              SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) );

    if ( SUCCEEDED(sc) )
    {
        cwc = pTextSource->iEnd - pTextSource->iCur;

        // we know that the remaining text should be less than ccCompare
        // Win4Assert( cwc < CDefWordBreaker::ccCompare );
        if ( 0 != cwc )
        {
            delete [] _aCharInfo1;
            delete [] _aCharInfo3;
            _aCharInfo1 = new WORD[cwc + 1];
            _aCharInfo3 = new WORD[cwc + 1];

            if ( !_aCharInfo1 || !_aCharInfo3 )
            {
                sc = E_OUTOFMEMORY;
            }
            else
            {
                _aCharInfo1[cwc] = 0;   // lookahead slot, see above
                _aCharInfo3[cwc] = 0;

                cwcProcd = 0;
                Tokenize( pTextSource, cwc, pWordSink, cwcProcd,
                          _aCharInfo1, _aCharInfo3, dwBase );
                if ( 0 == cwcProcd )
                {
                    sc = E_FAIL;
                }
            }
        }
    }

    delete [] _aCharInfo1;
    delete [] _aCharInfo3;

    return sc;
} //BreakText
//+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::Tokenize
//
// Synopsis: Tokenize the input buffer into words
//
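// Arguments: [pTextSource] -- input text source
// [cwc] -- # chars to process
// [pWordSink] -- sink for words
// [cwcProcd] -- # chars actually processed returned here
// [_aCharInfo1] -- buffer that receives CT_CTYPE1 info for the chunk
// [_aCharInfo3] -- buffer that receives CT_CTYPE3 info for the chunk
// [dwBase] -- offset added to each word's reported source position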
//
// History: 10-Aug-95 SitaramR Created
//
//----------------------------------------------------------------------------
void CDefWordBreaker::Tokenize(
    TEXT_SOURCE *pTextSource,
    ULONG cwc,
    IWordSink *pWordSink,
    ULONG& cwcProcd,
    PWORD _aCharInfo1,
    PWORD _aCharInfo3,
    DWORD dwBase)
{
    // Scratch space for a word with its zero-width-spaces removed.
    WCHAR _awcBufZWS[MAX_Def_WordBrKr_Prcess_Len];
    const unsigned cwcBufZWS = sizeof(_awcBufZWS) / sizeof(_awcBufZWS[0]);

    const WCHAR* pwcChunk = &pTextSource->awcBuffer[pTextSource->iCur];

    //
    // Classify the chunk. On failure nothing is emitted and cwcProcd is
    // left exactly as the caller passed it in.
    //
    // NOTE(review): IsWordChar reads _aCharInfo3[i+1] for a non-breaking
    // space, so the buffers presumably need room for cwc + 1 entries --
    // confirm against callers.
    //
    if (!ScanChunk(_aCharInfo1, _aCharInfo3, pwcChunk, cwc))
    {
        return;
    }

    BOOL fWordHasZWS = FALSE;   // Does the current word have a zero-width-space ?
    unsigned uLenZWS = 0;       // Length of a word minus embedded zero-width-spaces

    //
    // iBeginWord is the offset into _aCharInfo of the beginning character of
    // a word. iCur is the first *unprocessed* character.
    // They are indexes into the mapped chunk.
    //
    unsigned iBeginWord = 0;
    unsigned iCur = 0;

    //
    // Pump words from mapped chunk to word sink
    //
    while (iCur < cwc)
    {
        //
        // Skip whitespace, punctuation, etc.
        //
        for (; iCur < cwc; iCur++)
        {
            if (IsWordChar(iCur, _aCharInfo1, _aCharInfo3, pwcChunk))
                break;
        }

        // iCur points to a word char or is equal to cwc
        iBeginWord = iCur;
        if (iCur < cwc)
            iCur++;     // we knew it pointed at word character

        //
        // Find word break. Filter may output Unicode zero-width-space, which
        // should be ignored by the wordbreaker.
        //
        fWordHasZWS = FALSE;
        for (; iCur < cwc; iCur++)
        {
            if (!IsWordChar(iCur, _aCharInfo1, _aCharInfo3, pwcChunk))
            {
                if (pwcChunk[iCur] == ZERO_WIDTH_SPACE)
                    fWordHasZWS = TRUE;
                else
                    break;
            }
        }

        if (fWordHasZWS)
        {
            //
            // Copy word into _awcBufZWS after stripping zero-width-spaces.
            // The copy is bounded by the scratch buffer so an oversized
            // chunk cannot write past the end of the stack array.
            //
            uLenZWS = 0;
            for (unsigned i = iBeginWord; i < iCur && uLenZWS < cwcBufZWS; i++)
            {
                if (pwcChunk[i] != ZERO_WIDTH_SPACE)
                    _awcBufZWS[uLenZWS++] = pwcChunk[i];
            }
        }

        // iCur points to a non-word char or is equal to cwc
        if (iCur < cwc)
        {
            // store the word and its source position
            const unsigned cwcSrc = iCur - iBeginWord;
            pWordSink->PutWord( fWordHasZWS ? uLenZWS : cwcSrc,
                                fWordHasZWS ? _awcBufZWS : pwcChunk + iBeginWord,
                                cwcSrc,
                                pTextSource->iCur + iBeginWord + dwBase );

            iCur++;             // we knew it pointed at non-word char
            iBeginWord = iCur;  // in case we exit the loop now
        }
    } // next word

    // Win4Assert( iCur == cwc );
    // End of words in chunk.
    // iCur == cwc
    // iBeginWord points at beginning of word or == cwc

    //
    // Two cases still have to be emitted here:
    //  - a single word fills the chunk from its beginning to the end
    //    (either a very long word or a short word in a leftover buffer);
    //  - this is the last chunk from the text source (cwc < ccCompare)
    //    and a word fragment is still pending.
    // In both the pending word is [iBeginWord, iCur), and fWordHasZWS /
    // _awcBufZWS describe that same word.
    //
    const BOOL fWordFillsChunk = ( 0 == iBeginWord );
    const BOOL fLastFragment   = ( cwc < CDefWordBreaker::ccCompare && iBeginWord != iCur );

    if ( fWordFillsChunk || fLastFragment )
    {
        // store the word and its source position
        const unsigned cwcSrc = iCur - iBeginWord;
        pWordSink->PutWord( fWordHasZWS ? uLenZWS : cwcSrc,
                            fWordHasZWS ? _awcBufZWS : pwcChunk + iBeginWord,
                            cwcSrc,
                            pTextSource->iCur + iBeginWord + dwBase );

        //
        // Position it to not add the word twice.
        //
        iBeginWord = iCur;
    }

    cwcProcd = iBeginWord;
}
|