mirror of https://github.com/lianthony/NT4.0
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
875 lines
26 KiB
875 lines
26 KiB
// ftslex.cpp : Unicode word lexer and sort key provider for WinHelp browser.
|
|
//
|
|
|
|
#include "stdafx.h"
|
|
#include <stdlib.h>
|
|
#include <malloc.h>
|
|
#include "ftslex.h"
|
|
|
|
// Character-class table accessors.
//
// pbCharTypes[] holds one table pointer per high byte of a 16-bit character;
// each table is indexed by the low byte and stores a bit mask of class flags.
// Both macro arguments are fully parenthesized so that compound expressions
// (e.g. "a | b" as a flag mask) expand as the caller intended.
//
// NOTE: `w` is evaluated twice by every macro -- never pass an expression
// with side effects.
#define char_types(w)            (*(pbCharTypes[BYTE((w) >> 8)] + BYTE(w)))
#define set_char_types(w, bType) (*(pbCharTypes[BYTE((w) >> 8)] + BYTE(w)) = (bType))
#define add_char_types(w, bType) (*(pbCharTypes[BYTE((w) >> 8)] + BYTE(w)) |= (bType))
#define sub_char_types(w, bType) (*(pbCharTypes[BYTE((w) >> 8)] + BYTE(w)) &= ~(bType))
|
|
|
|
UINT ftslex_os_version= 0;
|
|
|
|
CP g_lastCP;
|
|
WORD g_wLocales = 0;
|
|
LCID g_lcids[MAX_LOCALES];
|
|
CP g_wCPs [MAX_LOCALES];
|
|
|
|
BYTE bLeadBytes [0x100];
|
|
BYTE *pbCharTypes [0x100];
|
|
BYTE bDefaultTable[0x100];
|
|
|
|
// Enumeration callbacks passed to EnumSystemLocalesA / EnumSystemCodePagesA.
// They are declared with LPSTR rather than LPTSTR so that the prototypes match
// the definitions in this file (and the ANSI enumeration APIs) regardless of
// whether UNICODE is defined.
BOOL CALLBACK LocaleEnumProc(LPSTR lpLocaleString);
BOOL CALLBACK CodePageEnumProc(LPSTR lpCodePageString);
|
|
|
|
// Flat table of (font charset, Windows code page) pairs consumed by
// GetCPFromCharset(): even slots hold the charset id, odd slots the code page.
CP g_cpSet[] =
{
    ANSI_CHARSET,        1252,
    SYMBOL_CHARSET,      1252,   // ?? Should be a different code page, but what??
    SHIFTJIS_CHARSET,     932,
    HANGEUL_CHARSET,      949,
    GB2312_CHARSET,       936,
    CHINESEBIG5_CHARSET,  950,
    THAI_CHARSET,         874,
    HEBREW_CHARSET,      1255,
    ARABIC_CHARSET,      1256,
    GREEK_CHARSET,       1253,
    TURKISH_CHARSET,     1254,
    BALTIC_CHARSET,      1257,
    EASTEUROPE_CHARSET,  1250,
    RUSSIAN_CHARSET,     1251
};
|
|
|
|
extern "C" void InitialFTSLex()
|
|
{
|
|
g_lcids[g_wLocales] = GetUserDefaultLCID();
|
|
g_wCPs [g_wLocales] = GetACP();
|
|
g_wLocales++;
|
|
|
|
ftslex_os_version = (GetVersion() >> 30) & 0x0003;
|
|
|
|
for (int i = 0; i < 256; i++)
|
|
pbCharTypes[i] = bDefaultTable;
|
|
|
|
EnumSystemLocalesA((LOCALE_ENUMPROC)LocaleEnumProc, LCID_SUPPORTED); //INSTALLED);
|
|
|
|
EnumSystemCodePagesA((CODEPAGE_ENUMPROC)CodePageEnumProc, CP_INSTALLED);
|
|
|
|
if (pbCharTypes[0] != bDefaultTable) // special code point type overrides:
|
|
{
|
|
add_char_types(L'_', LETTER_CHAR); // treat underscore as char, for software prefix names.
|
|
sub_char_types(L'"', LETTER_IMBED); // remove double quote as imbed (suffix), no <WORD">.
|
|
sub_char_types(L'/', LETTER_IMBED); // remove right slash as imbed (suffix)
|
|
sub_char_types(L'=', LETTER_IMBED); // remove equal sign as imbed (suffix)
|
|
sub_char_types(L'@', LETTER_IMBED); // remove at sign as imbed (suffix)
|
|
sub_char_types(L'\\', LETTER_IMBED); // remove left slash as imbed (suffix)
|
|
}
|
|
}
|
|
|
|
extern "C" void ShutdownFTSLex()
|
|
{
|
|
for (int i = 0; i < 256; i++)
|
|
{
|
|
if (pbCharTypes[i] != bDefaultTable)
|
|
delete [] pbCharTypes[i];
|
|
}
|
|
}
|
|
|
|
// Returns the platform code that InitialFTSLex() derived from GetVersion()
// (zero until InitialFTSLex() has run).
UINT APIENTRY GetOSVersion()
{
    const UINT uPlatform = ftslex_os_version;

    return uPlatform;
}
|
|
|
|
|
|
// EnumSystemLocalesA callback.
//
// lpLocaleString is parsed as a hexadecimal locale id.  For that locale the
// default ANSI code page and then the default (OEM) code page are queried;
// each successful query appends one (locale, code page) pair to
// g_lcids[] / g_wCPs[] while there is room.  Always returns TRUE so that the
// enumeration continues.
BOOL CALLBACK LocaleEnumProc(LPSTR lpLocaleString)
{
    BYTE       abDigits[6];                      // code page number as decimal text
    const LCID lcidEnum = strtoul(lpLocaleString, NULL, 16);

    for (int nPass = 0; nPass < 2; nPass++)      // pass 0: ANSI code page, pass 1: OEM code page
    {
        if (!GetLocaleInfoA(lcidEnum,
                            nPass ? LOCALE_IDEFAULTCODEPAGE : LOCALE_IDEFAULTANSICODEPAGE,
                            (LPSTR)abDigits, sizeof(abDigits)))
            continue;

        CP cpFound = atoi((PSTR)abDigits);

        if (g_wLocales >= MAX_LOCALES)
            continue;                            // table full: drop the pair

        g_lcids[g_wLocales] = lcidEnum;
        g_wCPs [g_wLocales] = cpFound;
        g_wLocales++;
    }

    return TRUE;
}
|
|
|
|
|
|
// Returns the locale of the first recorded (locale, code page) pair whose code
// page equals wCP, or the user's default locale when no pair matches.
LCID APIENTRY GetLocaleFromCP(CP wCP)
{
    int iSlot = 0;

    while (iSlot < g_wLocales)
    {
        if (g_wCPs[iSlot] == wCP)
            return g_lcids[iSlot];

        ++iSlot;
    }

    return GetUserDefaultLCID();
}
|
|
|
|
|
|
// Returns the code page of the first recorded (locale, code page) pair whose
// locale equals lcid, or the system ANSI code page when no pair matches.
CP APIENTRY GetCPFromLocale(LCID lcid)
{
    int iSlot = 0;

    while (iSlot < g_wLocales)
    {
        if (g_lcids[iSlot] == lcid)
            return g_wCPs[iSlot];

        ++iSlot;
    }

    return GetACP();
}
|
|
|
|
|
|
// Looks charset up in the g_cpSet pair table and returns the code page stored
// next to it; falls back to the system ANSI code page for unknown charsets.
CP APIENTRY GetCPFromCharset(BYTE charset)
{
    const CP *pPair = g_cpSet;
    const CP *pStop = g_cpSet + sizeof(g_cpSet) / sizeof(g_cpSet[0]);

    for ( ; pPair < pStop; pPair += 2)           // [0] = charset id, [1] = code page
    {
        if ((BYTE)pPair[0] == charset)
            return pPair[1];
    }

    return GetACP();
}
|
|
|
|
|
|
// EnumSystemCodePagesA callback: folds one code page into the character class
// tables.
//
// Every single-byte character of the code page -- and, for multi-byte code
// pages, every lead-byte/trail-byte pair -- is converted to a 16-bit
// character.  The first time a given 16-bit character is seen its
// GetStringTypeA() classification (CT_CTYPE1/2/3) is translated into the
// lexer's class bits (CHAR_DEFINED, LETTER_CHAR, SPACE_CHAR, DIGIT_CHAR,
// LETTER_IMBED, DIGIT_IMBED) and stored through the char_types macros.  A
// 256-entry table is allocated on first use of each high byte.
//
// lpCodePageString is the code page number as decimal text.  Always returns
// TRUE so that the enumeration continues; raises STATUS_NO_MEMORY if a table
// allocation yields NULL.
BOOL CALLBACK CodePageEnumProc(LPSTR lpCodePageString)
{
    BYTE   bSection;
    BYTE   szChars[2];
    LCID   lcid;
    int    i, j, nCount, nFinal;
    WCHAR  wChars;
    WORD   wCharType1, wCharType2, wCharType3;
    CP     wCP;
    CPINFO CPInfo;

    wCP = atoi(lpCodePageString);

    if (wCP == 37 || wCP == 500 || wCP == 875 || wCP == 1026)
        return TRUE;                             // do not process EBCDIC code pages

    // The user's locale is used for classification rather than
    // GetLocaleFromCP(wCP): "the linguists argue to use the user's LCID for
    // multilingual contexts".
    lcid = GetUserDefaultLCID();

    if (!GetCPInfo(wCP, &CPInfo))
        return TRUE;

#ifdef TESTMODE
    {
        TRACE("CODEPAGE: %5d, MAXCHARSIZE: %3d, DEFAULTCHAR: %2X", wCP, CPInfo.MaxCharSize, CPInfo.DefaultChar[0]);
        for (i = 0; i < MAX_LEADBYTES; i++)
            TRACE(", %d", CPInfo.LeadByte[i]);
        TRACE("\n");
    }
#endif

    // One pass (i == 0 only) if the code page has no lead bytes, otherwise
    // walk all 256 potential lead bytes.
    nFinal = (CPInfo.MaxCharSize == 1) ? 0 : 255;

    if (nFinal)
    {
        g_lastCP = wCP;                          // bLeadBytes[] now describes this code page

        memset(bLeadBytes, 0, sizeof(bLeadBytes));

        for (i = 0; i < MAX_LEADBYTES; i += 2)
        {
            if (!CPInfo.LeadByte[i] && !CPInfo.LeadByte[i+1])
                break;                           // end of lead byte ranges

            for (j = CPInfo.LeadByte[i]; j <= CPInfo.LeadByte[i+1]; j++)
                bLeadBytes[j] = TRUE;            // mark as valid lead byte
        }
    }

    for (i = 0; i <= nFinal; i++)                // thumb thru all potential lead bytes
    {
        if (i && !bLeadBytes[i])                 // i == 0 is the single-byte pass (chars 0x00 - 0xff)
            continue;

        for (j = 0; j < 256; j++)
        {
            nCount = 0;

            if (i)
                szChars[nCount++] = i;           // create leadbyte/char pairs

            szChars[nCount++] = j;

            if (MultiByteToWideChar(wCP, MB_ERR_INVALID_CHARS, (PSTR)szChars, nCount, (PWSTR)&wChars, 1) != 1)
                continue;                        // not a valid UNICODE character

            bSection = HIBYTE(wChars);

            if (pbCharTypes[bSection] == bDefaultTable)   // UNICODE section not accessed yet
            {
                pbCharTypes[bSection] = New BYTE[256];

                if (!pbCharTypes[bSection])
                    RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);

                memset(pbCharTypes[bSection], 0, 256 * sizeof(BYTE));
            }
            else if (char_types(wChars))
                continue;                        // already processed this UNICODE char

            // GetStringTypeA leaves its output untouched when it fails, so start
            // from "no classification" instead of reading uninitialized words.
            wCharType1 = wCharType2 = wCharType3 = 0;

            GetStringTypeA(lcid, CT_CTYPE1, (PSTR)szChars, nCount, &wCharType1);
            GetStringTypeA(lcid, CT_CTYPE2, (PSTR)szChars, nCount, &wCharType2);
            GetStringTypeA(lcid, CT_CTYPE3, (PSTR)szChars, nCount, &wCharType3);

#ifdef TESTMODE
            if (wCharType1 & 0x0001) TRACE("UPPER ");
            if (wCharType1 & 0x0002) TRACE("LOWER ");
            if (wCharType1 & 0x0004) TRACE("DIGIT ");
            if (wCharType1 & 0x0008) TRACE("SPACE ");
            if (wCharType1 & 0x0010) TRACE("PUNCT ");
            if (wCharType1 & 0x0020) TRACE("CNTRL ");
            if (wCharType1 & 0x0040) TRACE("BLANK ");
            if (wCharType1 & 0x0080) TRACE("XDIGIT ");
            if (wCharType1 & 0x0100) TRACE("ALPHA ");

            if (wCharType2 == 0x0001) TRACE("LEFTTORIGHT ");
            if (wCharType2 == 0x0002) TRACE("RIGHTTOLEFT ");
            if (wCharType2 == 0x0003) TRACE("EUROPENUMBER ");
            if (wCharType2 == 0x0004) TRACE("EUROPESEPARATOR ");
            if (wCharType2 == 0x0005) TRACE("EUROPETERMINATOR ");
            if (wCharType2 == 0x0006) TRACE("ARABICNUMBER ");
            if (wCharType2 == 0x0007) TRACE("COMMONSEPARATOR ");
            if (wCharType2 == 0x0008) TRACE("BLOCKSEPARATOR ");
            if (wCharType2 == 0x0009) TRACE("SEGMENTSEPARATOR ");
            if (wCharType2 == 0x000a) TRACE("WHITESPACE ");
            if (wCharType2 == 0x000b) TRACE("OTHERNEUTRAL ");

            if (wCharType3 & 0x0001) TRACE("NONSPACING ");
            if (wCharType3 & 0x0002) TRACE("DIACRITIC ");
            if (wCharType3 & 0x0004) TRACE("VOWELMARK ");
            if (wCharType3 & 0x0008) TRACE("SYMBOL ");
            if (wCharType3 & 0x0010) TRACE("KATAKANA ");
            if (wCharType3 & 0x0020) TRACE("HIRAGANA ");
            if (wCharType3 & 0x0040) TRACE("HALFWIDTH ");
            if (wCharType3 & 0x0080) TRACE("FULLWIDTH ");
            if (wCharType3 & 0x0100) TRACE("IDEOGRAPH ");
            if (wCharType3 & 0x0200) TRACE("KASHIDA ");
            if (wCharType3 & 0x0400) TRACE("LEXICAL ");
            if (wCharType3 & 0x8000) TRACE("C3ALPHA ");
            TRACE("\n");
#endif

            // A non-zero entry doubles as the "already processed" marker above.
            set_char_types(wChars, CHAR_DEFINED);

            if (wCharType1 & C1_ALPHA)
                add_char_types(wChars, LETTER_CHAR);      // mark letters

            if (wCharType1 & C1_SPACE)
                add_char_types(wChars, SPACE_CHAR);       // mark space characters

            if ((wCharType1 & C1_DIGIT) || (wCharType2 == C2_EUROPENUMBER) || (wCharType2 == C2_ARABICNUMBER))
                add_char_types(wChars, DIGIT_CHAR);       // mark number characters

            if (wCharType3 & C3_LEXICAL)
                add_char_types(wChars, LETTER_IMBED);     // mark letter embedded separators

            if (wCharType2 == C2_COMMONSEPARATOR || wCharType2 == C2_EUROPESEPARATOR)
                add_char_types(wChars, DIGIT_IMBED);      // mark number embedded separators
        }
    }

    return TRUE;
}
|
|
|
|
|
|
// Advances str by n characters of code page wCP and returns the new position.
//
// A byte flagged in bLeadBytes[] is treated as the first half of a two-byte
// character.  The lead-byte table is rebuilt from GetCPInfo() whenever wCP
// differs from the code page it was last built for (g_lastCP).  If GetCPInfo()
// fails the function guesses one byte per character and returns str + n.
// No terminator is looked for; the caller must guarantee that n characters
// exist.
LPSTR APIENTRY CharNextMult(CP wCP, LPCSTR str, int n)
{
    if (wCP != g_lastCP)                         // new code page: refresh the lead byte table
    {
        CPINFO cpi;

        if (!GetCPInfo(wCP, &cpi))
            return (LPSTR)str + n;               // error return, let's make a guess

        g_lastCP = wCP;

        memset(bLeadBytes, 0, sizeof(bLeadBytes));

        for (int iRange = 0; iRange < MAX_LEADBYTES; iRange += 2)
        {
            const int nFirst = cpi.LeadByte[iRange];
            const int nLast  = cpi.LeadByte[iRange + 1];

            if (nFirst == 0 && nLast == 0)
                break;                           // (0, 0) pair ends the range list

            for (int b = nFirst; b <= nLast; b++)
                bLeadBytes[b] = TRUE;
        }
    }

    for (int cLeft = n; cLeft > 0; --cLeft)
    {
        if (bLeadBytes[*PBYTE(str)])
            str++;                               // step over the lead byte ...

        str++;                                   // ... and over the (trail) byte
    }

    return (LPSTR)str;
}
|
|
|
|
|
|
// Multi-byte front end for FTSWordBreakW().
//
// Converts *pcText bytes at *ppText from code page wCP to 16-bit characters,
// tokenizes them with FTSWordBreakW(), and then maps the token start/end
// pointers (and the updated text position/count) back onto the original
// multi-byte text.  Single-byte code pages map by plain offset; other code
// pages are walked with CharNextMult().
//
// Returns the value returned by FTSWordBreakW(), or 0 when there is no text,
// the code page is unknown, or the conversion buffer cannot be allocated.
//
// In count-only mode (cwTokens == 0) FTSWordBreakW() stores nothing in the
// token arrays, so they are left untouched here as well.
int APIENTRY FTSWordBreakA (CP wCP, LPSTR *ppText, LPINT pcText, LPSTR *paToken, LPSTR *paTokenEnd,
                            LPBYTE paType, PUINT paHash, int cwTokens, UINT fTokenizeSpaces)
{
    int    i, cwChar, nRet, diff, cTranslate;
    CPINFO CPInfo;
    LPWSTR pwText, ppwText;

    if (*pcText <= 0)
        return 0;                                // nothing to lex; avoids a zero/negative sized buffer

    if (!GetCPInfo(wCP, &CPInfo))
        return 0;

    cwChar = *pcText << 1;

    if (!(pwText = ppwText = New WCHAR[cwChar]))
        return 0;

    cwChar = MultiByteToWideChar(wCP, 0, *ppText, *pcText, pwText, cwChar);

    nRet = FTSWordBreakW(&ppwText, &cwChar, (LPWSTR *)paToken, (LPWSTR *)paTokenEnd, paType, paHash, cwTokens, fTokenizeSpaces);

    // Only translate array slots that FTSWordBreakW() actually filled in.
    cTranslate = cwTokens ? nRet : 0;

    if (nRet)
    {
        if (CPInfo.MaxCharSize == 1)             // single byte code page
        {
            for (i = 0; i < cTranslate; i++)
            {
                if (paToken)
                    paToken[i] = *ppText + ((LPWSTR)paToken[i] - pwText);

                if (paTokenEnd)
                    paTokenEnd[i] = *ppText + ((LPWSTR)paTokenEnd[i] - pwText);
            }

            *ppText += ppwText - pwText;

            *pcText = cwChar;
        }
        else                                     // DBCS code pages
        {
            LPSTR  cPtr = *ppText;               // position in the multi-byte text
            LPWSTR wPtr = pwText;                // matching position in the converted text

            for (i = 0; i < cTranslate; i++)
            {
                if (paToken)
                {
                    diff = (LPWSTR)paToken[i] - wPtr;        // how many more Unicode chars

                    cPtr = CharNextMult(wCP, cPtr, diff);    // advance that many DBCS chars

                    wPtr += diff;                            // adjust our Unicode pointer

                    paToken[i] = cPtr;                       // return our DBCS pointer
                }

                if (paTokenEnd)
                {
                    diff = (LPWSTR)paTokenEnd[i] - wPtr;     // how many more Unicode chars

                    cPtr = CharNextMult(wCP, cPtr, diff);    // advance that many DBCS chars

                    wPtr += diff;                            // adjust our Unicode pointer

                    paTokenEnd[i] = cPtr;                    // return our DBCS pointer
                }
            }

            diff = ppwText - wPtr;                           // how many more Unicode chars

            cPtr = CharNextMult(wCP, cPtr, diff);            // advance that many DBCS chars

            *pcText -= cPtr - *ppText;                       // return remaining DBCS chars

            *ppText = cPtr;                                  // return our DBCS pointer
        }
    }

    delete [] pwText;

    return nRet;
}
|
|
|
|
|
|
// Splits 16-bit text into alternating "word" tokens (letters/digits plus
// permitted embedded characters) and "punctuation" tokens (everything else).
//
// ppwText / pcwText   in: start and length of the text; when cwTokens is
//                     non-zero they are updated on return to the unconsumed
//                     remainder.
// paToken, paTokenEnd optional arrays receiving each token's start / end.
// paType              optional array: word-class bits for word tokens, 0 for
//                     punctuation tokens.
// paHash              optional array of per-token hash values.
// cwTokens            capacity of the arrays; 0 means "only count the tokens".
// fTokenizeSpaces     option bits tested here: STARTING_IMBEDS,
//                     REMOVE_SPACE_CHARS, TOKENIZE_SPACES.
//
// Returns the number of tokens stored (or counted).  With REMOVE_SPACE_CHARS
// the text buffer itself is modified: space characters inside a punctuation
// token are squeezed out in place.
int APIENTRY FTSWordBreakW (LPWSTR *ppwText, LPINT pcwText, LPWSTR *paToken, LPWSTR *paTokenEnd,
                            LPBYTE paType, PUINT paHash, int cwTokens, UINT fTokenizeSpaces)
{
    BYTE   bCharType, bPrevType, bFirstCharType;
    UINT   wHash;
    WORD   wPunc, cwTokensOut = 0;
    WCHAR  wChar, wImbed = 0;
    LPWSTR pwPos, pwLimit, pwTokenStart, pwStart;

    // Empty text: return before the first character is read.  A negative count
    // would put pwLimit behind pwPos and the "!= pwLimit" loops below would
    // never stop.
    if (*pcwText <= 0)
        return 0;

    pwPos   = pwStart = *ppwText;                // position WCHAR pointer to beginning of text
    wChar   = *pwPos;                            // get first UNICODE character
    pwLimit = pwPos + *pcwText;                  // end of UNICODE text

    FOREVER_
    {
        wHash = 0;                               // token hash value init

        if (pwPos == pwLimit)                    // have reached end of UNICODE text
            break;

        bFirstCharType = (char_types(wChar) & WORD_TYPE);
        bPrevType      = 0;

        // With STARTING_IMBEDS an embedded character may also open a word when
        // it is directly followed by a letter (or digit, respectively).
        if (!bFirstCharType && (fTokenizeSpaces & STARTING_IMBEDS))
        {
            bCharType = char_types(wChar);

            if (bCharType & LETTER_IMBED)
            {
                if (pwPos+1 != pwLimit && (char_types(*(pwPos+1)) & LETTER_CHAR))
                {
                    bFirstCharType = TRUE;
                    bPrevType     |= LETTER_CHAR;
                }
            }

            if (bCharType & DIGIT_IMBED)
            {
                if (pwPos+1 != pwLimit && (char_types(*(pwPos+1)) & DIGIT_CHAR))
                {
                    bFirstCharType = TRUE;
                    bPrevType     |= DIGIT_CHAR;
                }
            }
        }

        if (bFirstCharType)                      // current WCHAR starts a word token
        {
            pwTokenStart = pwPos;                // save pointer to beginning of token
            wHash        = 0;                    // seed hash value

            FOREVER_                             // one word token per pass
            {
                if (pwPos > pwStart && !(fTokenizeSpaces & STARTING_IMBEDS))
                    wImbed = *(pwPos - 1);       // get possible starting imbed char

                do
                {
                    wChar     = *pwPos;          // current UNICODE character
                    bCharType = char_types(wChar);

                    if ((bCharType & WORD_TYPE) ||

                        ((bCharType & LETTER_IMBED) &&           // C3_LEXICAL (letter imbed) may
                         (wChar != wImbed)          &&           // ... also be a suffix
                         (bPrevType & LETTER_CHAR)) ||

                        ((bCharType & DIGIT_IMBED) &&
                         (bPrevType & DIGIT_CHAR)  &&
                         (pwPos+1 == pwLimit || (char_types(*(pwPos+1)) & DIGIT_CHAR) || (fTokenizeSpaces & STARTING_IMBEDS))))
                    {
                        wHash     = _rotl(wHash, 5) - wChar;     // token continues: letter, number, or
                        bPrevType = bCharType;                   // ... surrounded embedded character
                    }
                    else
                        break;                   // else token complete
                }
                while (++pwPos != pwLimit);      // until end of UNICODE text

                if (!cwTokens)
                    cwTokensOut++;               // just count number of tokens needed
                else
                {
                    if (paToken)
                        paToken[cwTokensOut] = pwTokenStart;     // token start pointer

                    if (paTokenEnd)
                        paTokenEnd[cwTokensOut] = pwPos;         // token end pointer

                    if (paHash)
                        paHash[cwTokensOut] = wHash;             // token hash value

                    if (paType)
                        paType[cwTokensOut] = bFirstCharType;    // mark token as word (chars/digits)

                    if (++cwTokensOut >= cwTokens)               // no more token pointer space
                    {
                        *pcwText -= (pwPos - *ppwText);          // update UNICODE character count
                        *ppwText  = pwPos;                       // update WCHAR text starting pointer
                        return(cwTokensOut);                     // return token count
                    }
                }

                // remove all spans of space characters
                if ((fTokenizeSpaces & REMOVE_SPACE_CHARS) && pwPos != pwLimit)
                {
                    while (pwPos != pwLimit && (char_types(*pwPos) & SPACE_CHAR))
                        pwPos++;

                    if (pwPos == pwLimit)
                        break;

                    pwTokenStart = pwPos;
                    wChar        = *pwPos;
                    wHash        = 0;

                    if (!(char_types(wChar) & WORD_TYPE))        // lexing into non-space punctuation
                        break;
                }
                // Without TOKENIZE_SPACES a single blank between two words is not
                // reported as a token: step over it and lex the next word.
                // (The next character is read without assigning it inside the
                // macro argument, which would expand to two unsequenced writes.)
                else if (!(fTokenizeSpaces & TOKENIZE_SPACES) && pwPos != pwLimit &&
                         wChar == L' ' && (pwPos+1) != pwLimit &&
                         (char_types(*(pwPos+1)) & WORD_TYPE))
                {
                    pwTokenStart = ++pwPos;
                    wHash        = 0;
                }
                else
                    break;
            }
        }

        if (pwPos == pwLimit) break;             // ... at end of provided WCHAR text

        pwTokenStart = pwPos;                    // save pointer to beginning of token
        wHash        = 0;                        // seed hash value
        wPunc        = wChar;                    // punctuation type (space vs. non-space)

        do
        {
            wChar = *pwPos;                      // current UNICODE character

            if (fTokenizeSpaces & TOKENIZE_SPACES)               // "fTokenizeSpaces" option for WinHelp
                if ((wPunc == L' ' && wChar != L' ') ||
                    (wPunc != L' ' && wChar == L' '))
                    break;                       // tokenize spans of spaces -OR- non-spaces

            bCharType = char_types(wChar);

            if (!(bCharType & WORD_TYPE) || !wChar)
            {
                // spaces that will be squeezed out below do not contribute to the hash
                if (!(fTokenizeSpaces & REMOVE_SPACE_CHARS) || !(bCharType & SPACE_CHAR))
                    wHash = _rotl(wHash, 5) - wChar;             // punctuation token continues: not letter/number
            }
            else
                break;
        }
        while (++pwPos != pwLimit);              // until end of UNICODE text

        if (pwPos != pwLimit || pwTokenStart != pwLimit)
        {                                        // discard empty final token
            LPWSTR pw, pwNew = pwPos;

            if (fTokenizeSpaces & REMOVE_SPACE_CHARS)            // remove spans of space chars
            {
                // skip leading spaces ...
                for (; pwTokenStart < pwPos; ++pwTokenStart)
                    if (!(char_types(*pwTokenStart) & SPACE_CHAR)) break;

                // ... then compact the remaining non-space characters in place
                for (pw = pwNew = pwTokenStart; pw < pwPos; pw++)
                    if (!(char_types(*pw) & SPACE_CHAR))
                        *pwNew++ = *pw;
            }

            if (pwNew != pwTokenStart)           // something is left after space removal
            {
                if (!cwTokens)
                    cwTokensOut++;               // just count number of tokens needed
                else
                {
                    if (paToken)
                        paToken[cwTokensOut] = pwTokenStart;     // token start pointer

                    if (paTokenEnd)
                        paTokenEnd[cwTokensOut] = pwNew;         // token end pointer

                    if (paHash)
                        paHash[cwTokensOut] = wHash;             // token hash value

                    if (paType)
                        paType[cwTokensOut] = 0;                 // mark token as punctuation

                    if (++cwTokensOut >= cwTokens)
                    {
                        *pcwText -= (pwPos - *ppwText);          // update UNICODE character count
                        *ppwText  = pwPos;                       // update WCHAR text starting pointer
                        return(cwTokensOut);                     // return token count
                    }
                }
            }
        }
    }

    if (cwTokens)
    {
        *pcwText -= (pwPos - *ppwText);          // update UNICODE character count
        *ppwText  = pwPos;                       // update WCHAR text starting pointer
    }

    return cwTokensOut;                          // return token count
}
|
|
|
|
|
|
// Builds a sort key for cwSource characters at pwSource.
//
// Layout of the result in pwDest: word 0 is a class prefix derived from the
// first source character (1 = letter, 2 = underscore, 3 = digit, 4 = anything
// else); the key produced by LCMapString follows from word 1 on.
//
// When the platform is not OS_NT the text is first converted to the ANSI code
// page and mapped with LCMapStringA, and the resulting key words are byte
// swapped afterwards.  On OS_NT LCMapStringW is called directly.
//
// With LCSORT_START in wMapFlags the key is cut at the first word whose high
// byte is SORT_KEY_SEPARATOR and that word's index is returned (0, with an
// empty key, when there is no such word).
//
// Returns the key length in WCHARs including the prefix word, or 0 on failure.
int APIENTRY LCSortKeyW(LCID lcid, WORD wMapFlags, LPCWSTR pwSource, int cwSource, LPWSTR pwDest, int cwDest)
{
    int cb, nRet;
#ifdef _DEBUG
    int err = 0;
#endif

    if (ftslex_os_version != OS_NT)
    {
        PBYTE pbSource = NULL;
        UINT  cbSource = 0;

        cbSource = cwSource << 1;                // 1 WC can generate 2 bytes of MB

        // Both allocation paths must provide cbSource bytes, because that is the
        // buffer size WideCharToMultiByte is told about below.
        pbSource = (cbSource > MAX_STACK_ALLOC)? New BYTE[cbSource] : PBYTE(_alloca(cbSource));

        if (!pbSource)
            return 0;                            // error return

        cb = WideCharToMultiByte(GetACP(), 0, pwSource, cwSource, (PSTR)pbSource, cbSource, NULL, NULL);

        ASSERT(cb || !cbSource);

        // destination size is passed in bytes; the result is converted back to words
        nRet = LCMapStringA(lcid, LCMAP_FLAGS_CHICAGO, (PSTR)pbSource, cb, (PSTR)(pwDest+1), (cwDest-1)<<1) >> 1;
#ifdef _DEBUG
        if (nRet == 0 && cb) {
            err = GetLastError();
            char szBuf[256];
            int cbShouldBe = LCMapStringA(lcid, LCMAP_FLAGS_CHICAGO, (PSTR)pbSource, cb, (PSTR)(pwDest+1), 0);
            wsprintf(szBuf,
                "LCMapStringA error code:%u cwdest == %u, should be = %u", err,
                (cwDest-1) <<1, cbShouldBe);
            MessageBox(NULL, szBuf, "", MB_OK);
        }
#endif

        ASSERT(nRet || !cb);

        LPWSTR pwText = pwDest + 1;
        LPWSTR pwEnd  = pwText + nRet;

        for ( ; pwText < pwEnd; pwText++)
            *pwText = (*pwText >> 8) | (*pwText << 8);   // bring sort key weights in byte reversed order

        if (pbSource && cbSource > MAX_STACK_ALLOC) delete [] pbSource;
    }
    else {
        nRet = LCMapStringW(lcid, LCMAP_FLAGS, pwSource, cwSource, pwDest+1, (cwDest-1) << 1) >> 1;
    }

    ASSERT(nRet || !cwSource);                   // invalid zero length sort key

    if (nRet)
    {
        nRet++;                                  // account for the prefix word

        if (cwDest && pwDest)                    // set a sort key prefix so tokens group by class first
        {
            BYTE bCharType = char_types(*pwSource);

            // Prefix values --
            //
            //    1 - Letters
            //    2 - Underscore(s)
            //    3 - Digits
            //    4 - All other punctuation streams

            if (bCharType & LETTER_CHAR)
                *pwDest = (*pwSource == L'_')? 2 : 1;
            else
                *pwDest = (bCharType & DIGIT_CHAR)? 3 : 4;
        }
    }

    if ((wMapFlags & LCSORT_START) && cwDest && pwDest)  // flag to return char class start sort key
    {
        for (int i = 0; i < nRet; i++)
            if (HIBYTE(pwDest[i]) == SORT_KEY_SEPARATOR) // search for first weight separator
            {
                pwDest[i] = 0;
                return i;                        // return WCHAR character length
            }

        pwDest[0] = 0;                           // empty return
        return 0;
    }

    return nRet;
}
|
|
|
|
|
|
// Converts a start sort key into the first matching sort key: the key is cut
// (zero written) at the first word whose high byte is SORT_KEY_SEPARATOR.
// Returns that word's index, or 0 when none of the cwText words qualifies.
int APIENTRY LCSortKeyFirstW(LPWSTR pwText, int cwText)
{
    int iWord = 0;

    while (iWord < cwText && HIBYTE(pwText[iWord]) != SORT_KEY_SEPARATOR)
        iWord++;

    if (iWord >= cwText)
        return 0;                                // no separator

    pwText[iWord] = 0;

    return iWord;                                // return character length
}
|
|
|
|
|
|
// Converts a start sort key into the last matching sort key: the word just
// before the first separator word (high byte == SORT_KEY_SEPARATOR) is
// incremented and the key is cut (zero written) at the separator.
// Returns the separator's index, or 0 when there is no separator.
//
// A separator in the very first word has no preceding weight to increment;
// the key is then only cut, so nothing is written in front of the buffer.
int APIENTRY LCSortKeyLastW(LPWSTR pwText, int cwText)
{
    for (int i = 0; i < cwText; i++)
        if (HIBYTE(pwText[i]) == SORT_KEY_SEPARATOR)     // search for first weight separator
        {
            if (i > 0)
                pwText[i-1]++;                   // increment last alpha weight

            pwText[i] = 0;

            return i;                            // return character length
        }

    return 0;                                    // no separator
}
|
|
|
|
|
|
// Converts a sort key to base characters by removing the diacritic weights
// (the run of bytes that follows the first separator) in place.
//
// pwText / cwText describe the key as byte-swapped words.  Returns the new
// length in words.  The key is returned unchanged (cwText) when no separator
// word is found within cwText words, or when the separator word also carries
// SORT_KEY_SEPARATOR in its low byte (nothing to remove).
//
// NOTE(review): the byte-level walk below addresses the low byte of a word at
// pCopy and the high byte at pCopy+1, i.e. it assumes little-endian words.
int APIENTRY LCSortKeyBase(LPWSTR pwText, int cwText)
{
    LPSTR  pCopy, pEnd;
    LPWSTR pwStart = pwText;
    LPWSTR pwLimit = pwText + cwText;

    // search for first weight separator, never looking past the key itself
    while (pwText < pwLimit && HIBYTE(*pwText) != SORT_KEY_SEPARATOR)
        pwText++;

    if (pwText >= pwLimit)
        return cwText;                           // no separator: leave the key alone

    if (LOBYTE(*pwText) == SORT_KEY_SEPARATOR)   // two separators back to back
        return cwText;                           // returning original sort key

    pCopy = (LPSTR)pwText;                       // point to next word for search
    pEnd  = (LPSTR)(pwStart + cwText);

    // this word becomes an empty weight section: separator, separator
    *pwText++ = ((SORT_KEY_SEPARATOR << 8) | SORT_KEY_SEPARATOR);

    while ((pCopy += 2) < pEnd)                  // remember, sort key is byte reversed
    {
        if (*(pCopy+1) == SORT_KEY_SEPARATOR)    // found next separator (high byte)
        {
            // The remainder starts on an odd byte: rebuild each output word from
            // the low byte of one input word and the high byte of the next.
            while ((pCopy + 2) < pEnd)
            {
                *pwText++ = ((WCHAR)(BYTE)*pCopy << 8) | (BYTE)(*(pCopy + 3));
                pCopy += 2;
            }

            if (*pwText = (WCHAR)(BYTE)*pCopy << 8)      // keep the last half word unless it is zero
                pwText++;

            break;
        }
        else if (*pCopy == SORT_KEY_SEPARATOR)   // found next separator (low byte)
        {
            pCopy += 2;

            while (pCopy < pEnd)                 // remainder is word aligned: plain copy
            {
                *pwText++ = *((LPWSTR)pCopy);
                pCopy += 2;
            }

            break;
        }
    }

    return pwText - pwStart;
}
|
|
|
|
|
|
// Converts a sort key to "lower case" by deleting one weight section in place.
//
// Steps, as performed on the byte-swapped key words:
//   1. find the first word whose high byte is SORT_KEY_SEPARATOR;
//   2. swap the bytes of every word from there to the end;
//   3. starting one byte in, find the next separator byte; if the byte after it
//      is another separator there is nothing to delete;
//   4. otherwise delete the bytes up to the following separator, zero the
//      freed tail and drop trailing zero words;
//   5. swap the bytes of the surviving words back.
//
// Returns the resulting length in words.  The key is returned unchanged
// (cwText) when step 1 finds no separator within cwText words.
int APIENTRY LCSortKeyLower(LPWSTR pwText, int cwText)
{
    LPSTR  pWork, pAlpha;
    LPWSTR pwWork, pwEnd;

    LPSTR  pEnd    = (LPSTR)(pwText + cwText);
    LPWSTR pwStart = pwText;

    // search for first weight separator, never looking past the key itself
    while (pwText < (LPWSTR)pEnd && HIBYTE(*pwText) != SORT_KEY_SEPARATOR)
        pwText++;

    if (pwText >= (LPWSTR)pEnd)
        return cwText;                           // no separator: leave the key alone

    for (pwWork = pwText; pwWork < (LPWSTR)pEnd; pwWork++)
        *pwWork = (*pwWork >> 8) | (*pwWork << 8);       // bring sort key weights in byte order

    for (pWork = (LPSTR)pwText + 1; pWork < pEnd; pWork++)   // skip the separator found above
        if (*pWork == SORT_KEY_SEPARATOR)                // find the next separator
            break;

    // No byte left after the separator (or no separator at all): nothing can be
    // deleted, and reading *++pWork would run past the key.
    if (pWork + 1 >= pEnd || *++pWork == SORT_KEY_SEPARATOR)
        pwEnd = (LPWSTR)pEnd;                    // empty section: keep everything
    else
    {
        for (pAlpha = pWork + 1; pAlpha < pEnd; pAlpha++)    // skip non-separator character to start
            if (*pAlpha == SORT_KEY_SEPARATOR)
                break;                           // find final sort key separator

        // Source and destination overlap within the same buffer, so memmove is
        // required here (memcpy is undefined for overlapping regions).
        memmove(pWork, pAlpha, pEnd - pAlpha);               // close the gap

        memset(pWork + (pEnd - pAlpha), 0, pAlpha - pWork);  // clear remaining buffer

        pwEnd = (LPWSTR)pEnd;
        while(!(*--pwEnd)) {};                   // find last non-zero word
        pwEnd++;
    }

    for (pwWork = pwText; pwWork < pwEnd; pwWork++)
        *pwWork = (*pwWork >> 8) | (*pwWork << 8);       // byte reverse sort keys weights

    return pwEnd - pwStart;                      // number of words being returned
}
|
|
|
|
////////////////////////////////// global function put in for hiliter /////////////
|
|
|
|
// Squeezes every SPACE_CHAR character out of pwChar[0..cw) in place so that the
// text matches query box entries.
//
// cBase  receives the number of space characters seen before the first
//        non-space character.
// cLimit receives the number of space characters seen after the last
//        non-space character.
// Returns the new length (number of characters kept).
WORD RemoveWhiteSpace(WCHAR* pwChar, int cw, int& cBase, int& cLimit) {
    cBase = cLimit = 0;

    int  cKept     = 0;                          // write index == characters kept so far
    BOOL fSeenText = FALSE;                      // TRUE once a non-space character was copied

    for (int iRead = 0; iRead < cw; ++iRead) {
        const WCHAR wc = pwChar[iRead];

        if (!(char_types(wc) & SPACE_CHAR)) {    // ordinary character: keep it
            pwChar[cKept++] = wc;
            fSeenText = TRUE;
            cLimit    = 0;                       // spaces counted so far were not trailing
            continue;
        }

        if (fSeenText)
            cLimit++;                            // candidate trailing space
        else
            cBase++;                             // leading space
    }

    return cKept;
}
|