Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

875 lines
26 KiB

// ftslex.cpp : Unicode word lexer and sort key provider for WinHelp browser.
//
#include "stdafx.h"
#include <stdlib.h>
#include <malloc.h>
#include <string.h>
#include "ftslex.h"
// Character-class table accessors.
//
// pbCharTypes[] holds one 256-entry page of class bits per high byte of a
// 16-bit character; the low byte selects the entry inside that page.
//
// Every macro argument is parenthesized so that compound expressions
// (e.g. "a | b" passed as bType, or an assignment passed as w) expand with
// the intended precedence.
// NOTE: w is evaluated twice in each macro -- avoid arguments with side effects.
#define char_types(w) (*(pbCharTypes[BYTE((w) >> 8)] + BYTE(w)))
#define set_char_types(w, bType) (*(pbCharTypes[BYTE((w) >> 8)] + BYTE(w)) = (bType))
#define add_char_types(w, bType) (*(pbCharTypes[BYTE((w) >> 8)] + BYTE(w)) |= (bType))
#define sub_char_types(w, bType) (*(pbCharTypes[BYTE((w) >> 8)] + BYTE(w)) &= ~(bType))
// Module-wide lexer state.

UINT ftslex_os_version= 0;      // top two bits of GetVersion(), cached by InitialFTSLex()
CP g_lastCP;                    // code page whose lead bytes are currently held in bLeadBytes
WORD g_wLocales = 0;            // number of used entries in g_lcids / g_wCPs

// Parallel arrays: g_lcids[i] is a locale and g_wCPs[i] a code page reported for it.
LCID g_lcids[MAX_LOCALES];
CP g_wCPs [MAX_LOCALES];

BYTE bLeadBytes [0x100];        // non-zero for each byte value that is a lead byte in g_lastCP
BYTE *pbCharTypes [0x100];      // per-high-byte pages of character class bits (see char_types)
BYTE bDefaultTable[0x100];      // shared page used for every high byte that has no page of its own

// Enumeration callbacks handed to EnumSystemLocalesA / EnumSystemCodePagesA.
// Declared with LPSTR (not LPTSTR) so the prototypes match both the ANSI
// enumerators and the definitions below regardless of the UNICODE setting.
BOOL CALLBACK LocaleEnumProc(LPSTR);
BOOL CALLBACK CodePageEnumProc(LPSTR);
// Font charset -> code page lookup table used by GetCPFromCharset().
// The array is a flat list of pairs: element [2k] is a *_CHARSET constant and
// element [2k+1] is the code page returned for it, so the element count must
// stay even.
CP g_cpSet[] =
{
ANSI_CHARSET, 1252,
SYMBOL_CHARSET, 1252, // ?? Should be a different code page, but what??
SHIFTJIS_CHARSET, 932,
HANGEUL_CHARSET, 949,
GB2312_CHARSET, 936,
CHINESEBIG5_CHARSET, 950,
THAI_CHARSET, 874,
HEBREW_CHARSET, 1255,
ARABIC_CHARSET, 1256,
GREEK_CHARSET, 1253,
TURKISH_CHARSET, 1254,
BALTIC_CHARSET, 1257,
EASTEUROPE_CHARSET, 1250,
RUSSIAN_CHARSET, 1251
};
extern "C" void InitialFTSLex()
{
g_lcids[g_wLocales] = GetUserDefaultLCID();
g_wCPs [g_wLocales] = GetACP();
g_wLocales++;
ftslex_os_version = (GetVersion() >> 30) & 0x0003;
for (int i = 0; i < 256; i++)
pbCharTypes[i] = bDefaultTable;
EnumSystemLocalesA((LOCALE_ENUMPROC)LocaleEnumProc, LCID_SUPPORTED); //INSTALLED);
EnumSystemCodePagesA((CODEPAGE_ENUMPROC)CodePageEnumProc, CP_INSTALLED);
if (pbCharTypes[0] != bDefaultTable) // special code point type overrides:
{
add_char_types(L'_', LETTER_CHAR); // treat underscore as char, for software prefix names.
sub_char_types(L'"', LETTER_IMBED); // remove double quote as imbed (suffix), no <WORD">.
sub_char_types(L'/', LETTER_IMBED); // remove right slash as imbed (suffix)
sub_char_types(L'=', LETTER_IMBED); // remove equal sign as imbed (suffix)
sub_char_types(L'@', LETTER_IMBED); // remove at sign as imbed (suffix)
sub_char_types(L'\\', LETTER_IMBED); // remove left slash as imbed (suffix)
}
}
// ShutdownFTSLex -- releases every character-type page that was allocated
// for a specific section.
//
// Each freed slot is pointed back at the shared default page so that a
// repeated shutdown cannot free the same page twice and a later char_types()
// lookup never dereferences freed memory.
extern "C" void ShutdownFTSLex()
{
    for (int i = 0; i < 256; i++)
    {
        if (pbCharTypes[i] != bDefaultTable)
        {
            delete [] pbCharTypes[i];
            pbCharTypes[i] = bDefaultTable;   // no dangling pointer left behind
        }
    }
}
// GetOSVersion -- returns the platform bits cached in ftslex_os_version
// (written by InitialFTSLex; zero before that).
UINT APIENTRY GetOSVersion()
{
    const UINT uCached = ftslex_os_version;
    return uCached;
}
BOOL CALLBACK LocaleEnumProc(LPSTR lpLocaleString)
{
LCID lcid;
BYTE bCP[6];
CP wCP;
LPSTR lpEndString;
lcid = strtoul(lpLocaleString, &lpEndString, 16);
if (GetLocaleInfoA(lcid, LOCALE_IDEFAULTANSICODEPAGE, (LPSTR)bCP, sizeof(bCP)))
{
wCP = atoi((PSTR)bCP);
if (g_wLocales < MAX_LOCALES)
{
g_lcids[g_wLocales] = lcid;
g_wCPs [g_wLocales] = wCP;
g_wLocales++;
}
}
if (GetLocaleInfoA(lcid, LOCALE_IDEFAULTCODEPAGE, (LPSTR)bCP, sizeof(bCP)))
{
wCP = atoi((PSTR)bCP);
if (g_wLocales < MAX_LOCALES)
{
g_lcids[g_wLocales] = lcid;
g_wCPs [g_wLocales] = wCP;
g_wLocales++;
}
}
return TRUE;
}
// GetLocaleFromCP -- returns the locale of the first table entry whose code
// page equals wCP, or the user's default locale when there is no such entry.
LCID APIENTRY GetLocaleFromCP(CP wCP)
{
    int idx = 0;
    while (idx < g_wLocales)
    {
        if (g_wCPs[idx] == wCP)
            return g_lcids[idx];
        ++idx;
    }
    return GetUserDefaultLCID();
}
// GetCPFromLocale -- returns the code page of the first table entry whose
// locale equals lcid, or the system ANSI code page when there is no such entry.
CP APIENTRY GetCPFromLocale(LCID lcid)
{
    int iHit = -1;      // index of the first matching entry, -1 while none found

    for (int k = 0; iHit < 0 && k < g_wLocales; ++k)
    {
        if (g_lcids[k] == lcid)
            iHit = k;
    }

    if (iHit >= 0)
        return g_wCPs[iHit];
    return GetACP();
}
// GetCPFromCharset -- maps a font charset value to a code page using the
// (charset, code page) pairs stored in g_cpSet; falls back to the system
// ANSI code page when the charset is not listed.
CP APIENTRY GetCPFromCharset(BYTE charset)
{
    const CP *pPair = g_cpSet;
    const CP *const pStop = g_cpSet + sizeof(g_cpSet)/sizeof(g_cpSet[0]);

    for ( ; pPair < pStop; pPair += 2)      // step over one pair at a time
    {
        if ((BYTE)pPair[0] == charset)
            return pPair[1];
    }
    return GetACP();
}
// CodePageEnumProc -- EnumSystemCodePagesA callback that fills the
// character-type pages (pbCharTypes) from one installed code page.
//
// lpCodePageString holds the decimal code page number.  Every single byte,
// and for multi-byte code pages every lead-byte/trail-byte pair, is converted
// to a wide character; each wide character seen for the first time is
// classified with GetStringTypeA and its class bits are stored.  A page for a
// wide-character section (high byte) is allocated on first use.
//
// Side effect: for multi-byte code pages g_lastCP and bLeadBytes are rebuilt
// for this code page.
//
// Returns TRUE in every case so that the enumeration continues; raises
// STATUS_NO_MEMORY when a page cannot be allocated.
BOOL CALLBACK CodePageEnumProc(LPSTR lpCodePageString)
{
    BYTE bSection;
    BYTE szChars[2];
    LCID lcid;
    int i, j, nCount, nFinal;
    WCHAR wChars;
    WORD wCharType1, wCharType2, wCharType3;
    CP wCP;
    CPINFO CPInfo;

    wCP = atoi(lpCodePageString);
    if (wCP == 37 || wCP == 500 || wCP == 875 || wCP == 1026)
        return TRUE;                        // do not process EBCDIC code pages

    // The user's locale is used for classification in every code page
    // ("the linguists argue to use the user's LCID for multilingual
    // contexts") instead of GetLocaleFromCP(wCP).
    lcid = GetUserDefaultLCID();

    if (!GetCPInfo(wCP, &CPInfo))
        return TRUE;
#ifdef TESTMODE
    {
        TRACE("CODEPAGE: %5d, MAXCHARSIZE: %3d, DEFAULTCHAR: %2X", wCP, CPInfo.MaxCharSize, CPInfo.DefaultChar[0]);
        for (i = 0; i < MAX_LEADBYTES; i++)
            TRACE(", %d", CPInfo.LeadByte[i]);
        TRACE("\n");
    }
#endif

    // One pass (i == 0 only) when the code page has no lead bytes; otherwise
    // every byte value is a candidate lead byte.
    nFinal = (CPInfo.MaxCharSize == 1) ? 0 : 255;
    if (nFinal != 0)
    {
        g_lastCP = wCP;
        memset(bLeadBytes, 0, sizeof(bLeadBytes));
        for (i = 0; i < MAX_LEADBYTES; i += 2)
        {
            if (!CPInfo.LeadByte[i] && !CPInfo.LeadByte[i+1])
                break;                      // end of lead byte ranges
            for (j = CPInfo.LeadByte[i]; j <= CPInfo.LeadByte[i+1]; j++)
                bLeadBytes[j] = TRUE;       // mark as valid lead byte
        }
    }

    for (i = 0; i <= nFinal; i++)           // thumb thru all potential lead bytes
    {
        if (i && !bLeadBytes[i])            // i == 0 is the single-byte pass
            continue;

        for (j = 0; j < 256; j++)
        {
            nCount = 0;
            if (i)
                szChars[nCount++] = i;      // create leadbyte/char pairs
            szChars[nCount++] = j;

            if (MultiByteToWideChar(wCP, MB_ERR_INVALID_CHARS, (PSTR)szChars, nCount, (PWSTR)&wChars, 1) != 1)
                continue;                   // not a valid UNICODE character

            bSection = HIBYTE(wChars);
            if (pbCharTypes[bSection] == bDefaultTable)     // section not accessed yet
            {
                pbCharTypes[bSection] = New BYTE[256];
                if (!pbCharTypes[bSection])
                    RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL);
                memset(pbCharTypes[bSection], 0, 256 * sizeof(BYTE));
            }
            else if (char_types(wChars))    // already processed this UNICODE char
                continue;

            // Start from "no class bits" so that a failing GetStringTypeA
            // call cannot leave stale or uninitialized values behind.
            wCharType1 = wCharType2 = wCharType3 = 0;

            // NOTE(review): GetStringTypeA is documented to interpret the
            // bytes in the ANSI code page of lcid, which need not be wCP --
            // confirm this is the intended classification.
            GetStringTypeA(lcid, CT_CTYPE1, (PSTR)szChars, nCount, &wCharType1);
            GetStringTypeA(lcid, CT_CTYPE2, (PSTR)szChars, nCount, &wCharType2);
            GetStringTypeA(lcid, CT_CTYPE3, (PSTR)szChars, nCount, &wCharType3);
#ifdef TESTMODE
            if (wCharType1 & 0x0001) TRACE("UPPER ");
            if (wCharType1 & 0x0002) TRACE("LOWER ");
            if (wCharType1 & 0x0004) TRACE("DIGIT ");
            if (wCharType1 & 0x0008) TRACE("SPACE ");
            if (wCharType1 & 0x0010) TRACE("PUNCT ");
            if (wCharType1 & 0x0020) TRACE("CNTRL ");
            if (wCharType1 & 0x0040) TRACE("BLANK ");
            if (wCharType1 & 0x0080) TRACE("XDIGIT ");
            if (wCharType1 & 0x0100) TRACE("ALPHA ");
            if (wCharType2 == 0x0001) TRACE("LEFTTORIGHT ");
            if (wCharType2 == 0x0002) TRACE("RIGHTTOLEFT ");
            if (wCharType2 == 0x0003) TRACE("EUROPENUMBER ");
            if (wCharType2 == 0x0004) TRACE("EUROPESEPARATOR ");
            if (wCharType2 == 0x0005) TRACE("EUROPETERMINATOR ");
            if (wCharType2 == 0x0006) TRACE("ARABICNUMBER ");
            if (wCharType2 == 0x0007) TRACE("COMMONSEPARATOR ");
            if (wCharType2 == 0x0008) TRACE("BLOCKSEPARATOR ");
            if (wCharType2 == 0x0009) TRACE("SEGMENTSEPARATOR ");
            if (wCharType2 == 0x000a) TRACE("WHITESPACE ");
            if (wCharType2 == 0x000b) TRACE("OTHERNEUTRAL ");
            if (wCharType3 & 0x0001) TRACE("NONSPACING ");
            if (wCharType3 & 0x0002) TRACE("DIACRITIC ");
            if (wCharType3 & 0x0004) TRACE("VOWELMARK ");
            if (wCharType3 & 0x0008) TRACE("SYMBOL ");
            if (wCharType3 & 0x0010) TRACE("KATAKANA ");
            if (wCharType3 & 0x0020) TRACE("HIRAGANA ");
            if (wCharType3 & 0x0040) TRACE("HALFWIDTH ");
            if (wCharType3 & 0x0080) TRACE("FULLWIDTH ");
            if (wCharType3 & 0x0100) TRACE("IDEOGRAPH ");
            if (wCharType3 & 0x0200) TRACE("KASHIDA ");
            if (wCharType3 & 0x0400) TRACE("LEXICAL ");
            if (wCharType3 & 0x8000) TRACE("C3ALPHA ");
            TRACE("\n");
#endif
            // CHAR_DEFINED makes the entry non-zero, which is what the
            // "already processed" test above relies on.
            set_char_types(wChars, CHAR_DEFINED);
            if (wCharType1 & C1_ALPHA)
                add_char_types(wChars, LETTER_CHAR);        // mark letters
            if (wCharType1 & C1_SPACE)
                add_char_types(wChars, SPACE_CHAR);         // mark space characters
            if ((wCharType1 & C1_DIGIT) || (wCharType2 == C2_EUROPENUMBER) || (wCharType2 == C2_ARABICNUMBER))
                add_char_types(wChars, DIGIT_CHAR);         // mark number characters
            if (wCharType3 & C3_LEXICAL)
                add_char_types(wChars, LETTER_IMBED);       // mark letter embedded separators
            if (wCharType2 == C2_COMMONSEPARATOR || wCharType2 == C2_EUROPESEPARATOR)
                add_char_types(wChars, DIGIT_IMBED);        // mark number embedded separators
        }
    }
    return TRUE;
}
// CharNextMult -- advances str by n characters of code page wCP, treating
// every byte flagged in bLeadBytes as the first byte of a two-byte character.
//
// The lead byte table is cached: it is rebuilt (and g_lastCP updated) only
// when wCP differs from the code page it was last built for.  If the code
// page information cannot be obtained, the function guesses one byte per
// character and returns str + n.
LPSTR APIENTRY CharNextMult(CP wCP, LPCSTR str, int n)
{
    if (g_lastCP != wCP)
    {
        CPINFO cpi;
        if (!GetCPInfo(wCP, &cpi))
            return (LPSTR)str + n;              // error return, best guess

        g_lastCP = wCP;
        memset(bLeadBytes, 0, sizeof(bLeadBytes));

        for (int iRange = 0; iRange < MAX_LEADBYTES; iRange += 2)
        {
            const int nFirst = cpi.LeadByte[iRange];
            const int nLast  = cpi.LeadByte[iRange + 1];
            if (nFirst == 0 && nLast == 0)
                break;                          // a (0, 0) pair ends the range list
            for (int nByte = nFirst; nByte <= nLast; ++nByte)
                bLeadBytes[nByte] = TRUE;
        }
    }

    const BYTE *pbCur = (const BYTE *)str;
    while (n-- > 0)
    {
        if (bLeadBytes[*pbCur])
            ++pbCur;                            // step over the lead byte ...
        ++pbCur;                                // ... and the (trail) byte itself
    }
    return (LPSTR)pbCur;
}
// FTSWordBreakA -- multi-byte front end for FTSWordBreakW.
//
// Converts *pcText bytes at *ppText from code page wCP to wide characters,
// tokenizes them with FTSWordBreakW, and then translates the wide token
// pointers that FTSWordBreakW stored in paToken / paTokenEnd back into
// pointers into the caller's multi-byte text.  *ppText and *pcText are
// advanced past the consumed text.
//
// Parameters other than wCP have the same meaning as for FTSWordBreakW.
// paToken and paTokenEnd are temporarily used to hold wide pointers.
//
// Returns the number of tokens produced (or counted when cwTokens is 0);
// 0 for empty text, an unknown code page, or an allocation failure.
int APIENTRY FTSWordBreakA (CP wCP, LPSTR *ppText, LPINT pcText, LPSTR *paToken, LPSTR *paTokenEnd,
                            LPBYTE paType, PUINT paHash, int cwTokens, UINT fTokenizeSpaces)
{
    int i, cwChar, nRet, diff;
    CPINFO CPInfo;
    LPWSTR pwText, ppwText;

    // Nothing to tokenize; also keeps FTSWordBreakW from reading the first
    // character of a zero-length buffer.
    if (*pcText <= 0)
        return 0;

    if (!GetCPInfo(wCP, &CPInfo))
        return 0;

    cwChar = *pcText << 1;
    if (!(pwText = ppwText = New WCHAR[cwChar]))
        return 0;

    cwChar = MultiByteToWideChar(wCP, 0, *ppText, *pcText, pwText, cwChar);
    nRet = FTSWordBreakW(&ppwText, &cwChar, (LPWSTR *)paToken, (LPWSTR *)paTokenEnd, paType, paHash, cwTokens, fTokenizeSpaces);

    // In count-only mode (cwTokens == 0) FTSWordBreakW writes no token
    // pointers and consumes no text, so there is nothing to translate.
    if (nRet && cwTokens)
    {
        if (CPInfo.MaxCharSize == 1)            // single byte code page
        {
            // One byte per wide character: offsets carry over unchanged.
            for (i = 0; i < nRet; i++)
            {
                if (paToken)
                    paToken[i] = *ppText + ((LPWSTR)paToken[i] - pwText);
                if (paTokenEnd)
                    paTokenEnd[i] = *ppText +((LPWSTR)paTokenEnd[i] - pwText);
            }
            *ppText += ppwText - pwText;
            *pcText = cwChar;
        }
        else                                    // DBCS code pages
        {
            // Walk both texts in step: for every wide-character distance
            // advance the same number of multi-byte characters.
            LPSTR cPtr = *ppText;
            LPWSTR wPtr = pwText;
            for (i = 0; i < nRet; i++)
            {
                if (paToken)
                {
                    diff = (LPWSTR)paToken[i] - wPtr;       // how many more Unicode chars
                    cPtr = CharNextMult(wCP, cPtr, diff);   // advance that many DBCS chars
                    wPtr += diff;                           // adjust our Unicode pointer
                    paToken[i] = cPtr;                      // return our DBCS pointer
                }
                if (paTokenEnd)
                {
                    diff = (LPWSTR)paTokenEnd[i] - wPtr;    // how many more Unicode chars
                    cPtr = CharNextMult(wCP, cPtr, diff);   // advance that many DBCS chars
                    wPtr += diff;                           // adjust our Unicode pointer
                    paTokenEnd[i] = cPtr;                   // return our DBCS pointer
                }
            }
            diff = ppwText - wPtr;                          // how many more Unicode chars
            cPtr = CharNextMult(wCP,cPtr, diff);            // advance that many DBCS chars
            *pcText -= cPtr - *ppText;                      // return remaining DBCS chars
            *ppText = cPtr;                                 // return our DBCS pointer
        }
    }

    delete [] pwText;
    return nRet;
}
// FTSWordBreakW -- splits wide-character text into word and punctuation tokens.
//
// The text is scanned from *ppwText for *pcwText characters.  Runs of
// characters whose class has WORD_TYPE bits (plus permitted embedded
// characters) become "word" tokens; everything between them becomes
// "punctuation" tokens.
//
// ppwText, pcwText  in/out: start and length of the text.  When cwTokens is
//                   non-zero they are updated to describe the unconsumed rest.
// paToken           optional out: start pointer of each token.
// paTokenEnd        optional out: end pointer (one past the last char) of each token.
// paType            optional out: non-zero for word tokens, 0 for punctuation.
// paHash            optional out: hash of each token, built as
//                   _rotl(hash, 5) - character.
// cwTokens          capacity of the output arrays; 0 means "only count the
//                   tokens" -- nothing is written and the text position is
//                   not updated.
// fTokenizeSpaces   bit flags tested here: STARTING_IMBEDS, REMOVE_SPACE_CHARS,
//                   TOKENIZE_SPACES.
//
// With REMOVE_SPACE_CHARS, punctuation tokens are compacted in place, i.e.
// the caller's text buffer is modified.
//
// Returns the number of tokens written (or counted).
int APIENTRY FTSWordBreakW (LPWSTR *ppwText, LPINT pcwText, LPWSTR *paToken, LPWSTR *paTokenEnd,
LPBYTE paType, PUINT paHash, int cwTokens, UINT fTokenizeSpaces)
{
BYTE bCharType, bPrevType, bFirstCharType;
UINT wHash;
WORD wPunc, cwTokensOut = 0;
WCHAR wChar, wChar2, wImbed = 0;
LPWSTR pwPos, pwLimit, pwTokenStart, pwStart;
pwPos = pwStart = *ppwText; // position WCHAR pointer to beginning of text
// NOTE(review): the first character is read before the empty-text test
// below, so one readable WCHAR at *ppwText is assumed even when *pcwText is 0.
wChar = *pwPos; // get first UNICODE character
pwLimit = pwPos + *pcwText; // end of UNICODE text
// Outer loop: each pass emits at most a run of word tokens followed by one
// punctuation token.
FOREVER_
{ // token hash value init
wHash = 0;
if (pwPos == pwLimit) // have reached end of UNICODE text
break;
// Non-zero when the current character is a letter or digit.
bFirstCharType = (char_types(wChar) & WORD_TYPE);
bPrevType = 0;
// STARTING_IMBEDS: an embedded-separator character may also start a word
// when the character right after it is of the matching letter/digit class.
if (!bFirstCharType && (fTokenizeSpaces & STARTING_IMBEDS))
{
bCharType = char_types(wChar);
if (bCharType & LETTER_IMBED)
{
if (pwPos+1 != pwLimit && char_types(*(pwPos+1)) & LETTER_CHAR)
{
// Note: the type recorded for such a token is TRUE, not class bits.
bFirstCharType = TRUE;
bPrevType |= LETTER_CHAR;
}
}
if (bCharType & DIGIT_IMBED)
{
if (pwPos+1 != pwLimit && char_types(*(pwPos+1)) & DIGIT_CHAR)
{
bFirstCharType = TRUE;
bPrevType |= DIGIT_CHAR;
}
}
}
if (bFirstCharType) // current WCHAR is letter or number
{
pwTokenStart = pwPos; // save pointer to beginning of token
wHash = 0; // seed hash value
// Inner loop: one word token per pass; it repeats while words follow each
// other separated only by removable space.
FOREVER_
{
// Remember the character in front of the token: the same character is then
// not accepted as a letter imbed inside it (see wChar != wImbed below).
if (pwPos > pwStart && !(fTokenizeSpaces & STARTING_IMBEDS))
wImbed = *(pwPos - 1); // get possible starting imbed char
do
{
wChar = *pwPos; // current UNICODE character
bCharType = char_types(wChar);
// The token continues on: a letter/digit; a letter imbed that follows a
// letter and differs from wImbed; or a digit imbed that follows a digit and
// is itself followed by a digit, the end of text, or STARTING_IMBEDS is set.
if ((bCharType & WORD_TYPE) ||
((bCharType & LETTER_IMBED) && // changed to allow C3_LEXICAL (letter
(wChar != wImbed) &&
(bPrevType & LETTER_CHAR)) || // ... imbed) to be suffix
// (pwPos+1 == pwLimit || char_types(*(pwPos+1)) & LETTER_CHAR)) ||
((bCharType & DIGIT_IMBED) &&
(bPrevType & DIGIT_CHAR) &&
(pwPos+1 == pwLimit || char_types(*(pwPos+1)) & DIGIT_CHAR || (fTokenizeSpaces & STARTING_IMBEDS))))
{
wHash = _rotl(wHash, 5) - wChar; // token continues: letter, number, or
bPrevType = bCharType; // ... surrounded embedded character
}
else
break; // else token complete
}
while (++pwPos != pwLimit); // until end of UNICODE text
// Emit (or just count) the word token [pwTokenStart, pwPos).
if (!cwTokens)
cwTokensOut++; // just count number of tokens needed
else
{
if (paToken)
paToken[cwTokensOut] = pwTokenStart; // token start pointer
if (paTokenEnd)
paTokenEnd[cwTokensOut] = pwPos; // token end pointer
if (paHash)
paHash[cwTokensOut] = wHash; // token hash value
if (paType)
paType[cwTokensOut] = bFirstCharType; // mark token as word (chars/digits)
if (++cwTokensOut >= cwTokens) // no more token pointer space
{
*pcwText -= (pwPos - *ppwText); // update UNICODE character count
*ppwText = pwPos; // update WCHAR text starting pointer
return(cwTokensOut); // return token count
}
}
// remove all spans of space characters
if ((fTokenizeSpaces & REMOVE_SPACE_CHARS) && pwPos != pwLimit)
{
while (pwPos != pwLimit && (char_types(*pwPos) & SPACE_CHAR))
pwPos++;
if (pwPos == pwLimit)
break;
pwTokenStart = pwPos;
wChar = *pwPos;
wHash = 0;
if (!(char_types(wChar) & WORD_TYPE)) // lexing into non-space punctuation
break;
// otherwise another word follows directly: stay in the inner loop
}
// Without TOKENIZE_SPACES a single blank between two words is dropped
// instead of being returned as a punctuation token.
else if (!(fTokenizeSpaces & TOKENIZE_SPACES) && pwPos != pwLimit &&
wChar == L' ' && (pwPos+1) != pwLimit &&
char_types(wChar2 = *(pwPos+1)) & WORD_TYPE)
{
pwTokenStart = ++pwPos; // if "fTokenizeSpaces" is FALSE, then
wHash = 0; // ... remove single space between words (the inner loop then repeats)
} // ... as a token
else
break;
}
}
if (pwPos == pwLimit) break; // ... at end of provided WCHAR text
// Punctuation token: collect characters up to the next letter/digit.
pwTokenStart = pwPos; // save pointer to beginning of token
wHash = 0; // seed hash value
wPunc = wChar; // punctuation type (space vs. non-space)
do
{
wChar = *pwPos; // current UNICODE character
if (fTokenizeSpaces & TOKENIZE_SPACES) // "fTokenizeSpaces" option for WinHelp
if ((wPunc == L' ' && wChar != L' ') ||
(wPunc != L' ' && wChar == L' '))
break; // tokenize spans of spaces -OR- non-spaces
bCharType = char_types(wChar);
// A NUL character is treated as punctuation whatever its class bits say.
if (!(bCharType & WORD_TYPE) || !wChar)
{
// Space characters dropped by REMOVE_SPACE_CHARS do not contribute to the hash.
if (!(fTokenizeSpaces & REMOVE_SPACE_CHARS) || !(bCharType & SPACE_CHAR))
wHash = _rotl(wHash, 5) - wChar; // punctuation token continues: not letter/number
}
else
break;
}
while (++pwPos != pwLimit); // until end of UNICODE text
if (pwPos != pwLimit || pwTokenStart != pwLimit)
{ // discard empty final token
LPWSTR pw, pwNew = pwPos;
if (fTokenizeSpaces & REMOVE_SPACE_CHARS) // remove spans of space chars
{
// Skip leading space characters, then squeeze the remaining ones out of the
// token in place; pwNew ends up as the compacted token's end.
for (; pwTokenStart < pwPos; ++pwTokenStart)
if (!(char_types(*pwTokenStart) & SPACE_CHAR)) break;
for (pw = pwNew = pwTokenStart; pw < pwPos; pw++)
if (!(char_types(*pw) & SPACE_CHAR))
*pwNew++ = *pw;
}
// A token that consisted only of removed space characters is not emitted.
if (pwNew != pwTokenStart)
{
if (!cwTokens)
cwTokensOut++; // just count number of tokens needed
else
{
if (paToken)
paToken[cwTokensOut] = pwTokenStart; // Token start pointer
if (paTokenEnd)
paTokenEnd[cwTokensOut] = pwNew; // Token end pointer
if (paHash)
paHash[cwTokensOut] = wHash; // Token hash value
if (paType)
paType[cwTokensOut] = 0; // mark token as punctuation
if (++cwTokensOut >= cwTokens)
{
*pcwText -= (pwPos - *ppwText); // update UNICODE character count
*ppwText = pwPos; // update WCHAR text starting pointer
return(cwTokensOut); // return token count
}
}
}
}
}
// End of text reached.  In count-only mode the caller's position is left alone.
if (cwTokens)
{
*pcwText -= (pwPos - *ppwText); // update UNICODE character count
*ppwText = pwPos; // update WCHAR text starting pointer
}
return cwTokensOut; // return token count
}
// LCSortKeyW -- builds a sort key for a wide-character string.
//
// The key is stored in pwDest[1..]; pwDest[0] receives a class prefix so that
// keys group by kind of token first:
//     1 - letters, 2 - underscore(s), 3 - digits, 4 - all other punctuation.
//
// On platforms other than OS_NT the source is first converted to the system
// ANSI code page and mapped with LCMapStringA, and the resulting words are
// byte-swapped; on OS_NT LCMapStringW is used directly.
//
// lcid       locale used for the mapping.
// wMapFlags  LCSORT_START: truncate the key at the first word whose high byte
//            is SORT_KEY_SEPARATOR and return that shorter length.
// pwSource, cwSource   source text and its length in characters.
// pwDest, cwDest       destination buffer and its capacity in WCHARs.
//
// Returns the key length in WCHARs including the prefix word, or 0 when no
// key could be produced (or, with LCSORT_START, when no separator was found).
int APIENTRY LCSortKeyW(LCID lcid, WORD wMapFlags, LPCWSTR pwSource, int cwSource, LPWSTR pwDest, int cwDest)
{
    int cb, nRet;
#ifdef _DEBUG
    int err = 0;
#endif

    if (ftslex_os_version != OS_NT)
    {
        PBYTE pbSource = NULL;
        UINT cbSource = 0;

        cbSource = cwSource << 1;           // 1 WC can generate 2 bytes of MB

        // The heap buffer must be cbSource bytes -- the size that is passed
        // to WideCharToMultiByte below -- not cwSource bytes.
        pbSource = (cbSource > MAX_STACK_ALLOC)? New BYTE[cbSource] : PBYTE(_alloca(cbSource));
        if (!pbSource)
            return 0;                       // error return

        cb = WideCharToMultiByte(GetACP(), 0, pwSource, cwSource, (PSTR)pbSource, cbSource, NULL, NULL);
        ASSERT(cb || !cbSource);

        nRet = LCMapStringA(lcid, LCMAP_FLAGS_CHICAGO, (PSTR)pbSource, cb, (PSTR)(pwDest+1), (cwDest-1)<<1) >> 1;
#ifdef _DEBUG
        if (nRet == 0 && cb) {
            err = GetLastError();
            char szBuf[256];
            int cbShouldBe = LCMapStringA(lcid, LCMAP_FLAGS_CHICAGO, (PSTR)pbSource, cb, (PSTR)(pwDest+1), 0);
            wsprintf(szBuf,
                "LCMapStringA error code:%u cwdest == %u, should be = %u", err,
                (cwDest-1) <<1, cbShouldBe);
            MessageBox(NULL, szBuf, "", MB_OK);
        }
#endif
        ASSERT(nRet || !cb);

        // Swap the bytes of every key word.  Skipped when the destination
        // had no room for key words (cwDest <= 1): nothing was written then,
        // so there is nothing to swap and pwDest[1..] must not be touched.
        if (cwDest > 1)
        {
            LPWSTR pwText = pwDest + 1;
            LPWSTR pwEnd = pwText + nRet;
            for ( ; pwText < pwEnd; pwText++)
                *pwText = (*pwText >> 8) | (*pwText << 8);  // bring sort key weights in byte reversed order
        }

        if (pbSource && cbSource > MAX_STACK_ALLOC) delete [] pbSource;
    }
    else {
        nRet = LCMapStringW(lcid, LCMAP_FLAGS, pwSource, cwSource, pwDest+1, (cwDest-1) << 1) >> 1;
    }
    ASSERT(nRet || !cwSource);              // invalid zero length sort key

    if (nRet)
    {
        nRet++;                             // account for the prefix word
        if (cwDest && pwDest)               // set a sort key prefix so tokens group first by class
        {
            BYTE bCharType = char_types(*pwSource);

            // Prefix values --
            //
            //   1 - Letters
            //   2 - Underscore(s)
            //   3 - Digits
            //   4 - All other punctuation streams
            if (bCharType & LETTER_CHAR)
                *pwDest = (*pwSource == L'_')? 2 : 1;
            else
                *pwDest = (bCharType & DIGIT_CHAR)? 3 : 4;
        }
    }

    if ((wMapFlags & LCSORT_START) && cwDest && pwDest)     // flag to return char class start sort key
    {
        // Never scan beyond the destination's capacity.
        for (int i = 0; i < nRet && i < cwDest; i++)
            if (HIBYTE(pwDest[i]) == SORT_KEY_SEPARATOR)    // search for first weight separator
            {
                pwDest[i] = 0;
                return i;                                   // return WCHAR character length
            }
        pwDest[0] = 0;                                      // empty return
        return 0;
    }
    return nRet;
}
// LCSortKeyFirstW -- converts a start sort key to the first matching sort key.
//
// Scans at most cwText words for the first one whose high byte is
// SORT_KEY_SEPARATOR, overwrites that word with 0 and returns its index
// (the length of the truncated key).  Returns 0 and leaves the key untouched
// when no separator is found.
int APIENTRY LCSortKeyFirstW(LPWSTR pwText, int cwText)
{
    LPWSTR pwScan = pwText;

    for (int cLeft = cwText; cLeft > 0; --cLeft, ++pwScan)
    {
        if (HIBYTE(*pwScan) != SORT_KEY_SEPARATOR)
            continue;                       // still inside the leading weights

        *pwScan = 0;                        // cut the key at the separator
        return pwScan - pwText;
    }
    return 0;                               // no separator
}
// LCSortKeyLastW -- converts a start sort key to the last matching sort key.
//
// Scans at most cwText words for the first one whose high byte is
// SORT_KEY_SEPARATOR, increments the word just before it, overwrites the
// separator word with 0 and returns its index.  When the separator is the
// very first word there is no preceding word to increment, so only the
// truncation is performed.  Returns 0 when no separator is found.
int APIENTRY LCSortKeyLastW(LPWSTR pwText, int cwText)
{
    for (int i = 0; i < cwText; i++)
        if (HIBYTE(pwText[i]) == SORT_KEY_SEPARATOR)    // search for first weight separator
        {
            if (i > 0)
                pwText[i-1]++;                          // increment last alpha weight
            pwText[i] = 0;
            return i;                                   // return character length
        }
    return 0;                                           // no separator
}
// LCSortKeyBase -- rewrites a sort key in place so that the weights between
// its first and second separator are removed (the original comment describes
// these as the diacritic weights).
//
// pwText, cwText   sort key and its length in WCHARs.  The key words are
//                  byte reversed, so when the buffer is viewed as bytes the
//                  low byte of each word comes first -- this assumes a
//                  little-endian machine.
//
// Returns the new key length in WCHARs.  The key is returned unchanged
// (length cwText) when it contains no separator word or when the first
// separator is immediately followed by another one.
int APIENTRY LCSortKeyBase(LPWSTR pwText, int cwText)
{
    LPSTR pCopy, pEnd;
    LPWSTR pwStart = pwText;
    LPWSTR pwLimit = pwText + cwText;

    // Search for the first weight separator, but never beyond the key.
    while (pwText < pwLimit && HIBYTE(*pwText) != SORT_KEY_SEPARATOR)
        pwText++;
    if (pwText == pwLimit)
        return cwText;                              // no separator: nothing to strip

    if (LOBYTE(*pwText) == SORT_KEY_SEPARATOR)      // separator directly followed by a separator
        return cwText;                              // returning original sort key

    pCopy = (LPSTR)pwText;                          // point to next word for search
    pEnd = (LPSTR)(pwStart + cwText);
    *pwText++ = ((SORT_KEY_SEPARATOR << 8) | SORT_KEY_SEPARATOR);   // empty section: two separators

    while ((pCopy += 2) < pEnd)                     // remember, sort key is byte reversed
    {
        if (*(pCopy+1) == SORT_KEY_SEPARATOR)       // found next separator (high byte)
        {
            // The remainder starts in the middle of a word: re-pair each low
            // byte with the high byte of the following word.
            while ((pCopy + 2) < pEnd)
            {                                       // lobyte + next hibyte
                *pwText++ = ((WCHAR)(BYTE)*pCopy << 8) | (BYTE)(*(pCopy + 3));
                pCopy += 2;
            }
            *pwText = (WCHAR)(BYTE)*pCopy << 8;     // last, half-filled word
            if (*pwText)                            // keep it unless it is a terminating wide-null
                pwText++;
            break;
        }
        else if (*pCopy == SORT_KEY_SEPARATOR)      // found next separator (low byte)
        {
            // The remainder is word aligned: copy it down unchanged.
            pCopy += 2;
            while (pCopy < pEnd)
            {
                *pwText++ = *((LPWSTR)pCopy);
                pCopy += 2;
            }
            break;
        }
    }
    return pwText - pwStart;
}
// LCSortKeyLower -- rewrites a sort key in place so that the weights between
// its second and third separator are removed (the function name suggests
// these are the case weights -- TODO confirm against the sort key layout).
//
// pwText, cwText   sort key and its length in WCHARs; key words are byte
//                  reversed on entry and on exit.
//
// Returns the new key length in WCHARs.  The key is returned unchanged
// (length cwText) when it contains no separator word.
int APIENTRY LCSortKeyLower(LPWSTR pwText, int cwText)
{
    LPSTR pWork, pAlpha;
    LPWSTR pwWork, pwEnd;
    LPSTR pEnd = (LPSTR)(pwText + cwText);
    LPWSTR pwStart = pwText;

    // Search for the first weight separator, but never beyond the key.
    while (pwText < (LPWSTR)pEnd && HIBYTE(*pwText) != SORT_KEY_SEPARATOR)
        pwText++;
    if (pwText == (LPWSTR)pEnd)
        return cwText;                              // no separator: nothing to strip

    // From the first separator on, put the bytes into stream order so the
    // rest of the key can be processed byte by byte.
    for (pwWork = pwText; pwWork < (LPWSTR)pEnd; pwWork++)
        *pwWork = (*pwWork >> 8) | (*pwWork << 8);

    // Skip the first separator and find the second one.
    for (pWork = (LPSTR)pwText + 1; pWork < pEnd; pWork++)
        if (*pWork == SORT_KEY_SEPARATOR)
            break;

    // Nothing to remove when the second separator is missing or last, or is
    // immediately followed by the third one.  The length test comes first so
    // that no byte at or beyond pEnd is read.
    if (pEnd - pWork <= 1 || *++pWork == SORT_KEY_SEPARATOR)
        pwEnd = (LPWSTR)pEnd;
    else
    {
        // pWork is the first byte to remove; find the separator that ends the section.
        for (pAlpha = pWork + 1; pAlpha < pEnd; pAlpha++)
            if (*pAlpha == SORT_KEY_SEPARATOR)
                break;

        // Source and destination overlap, so memmove is required here.
        memmove(pWork, pAlpha, pEnd - pAlpha);                  // pull the rest of the key down
        memset(pWork + (pEnd - pAlpha), 0, pAlpha - pWork);     // clear the vacated tail

        pwEnd = (LPWSTR)pEnd;
        while(!(*--pwEnd)) {};                      // find last non-zero word
        pwEnd++;
    }

    // Restore the byte reversed word order for the part that is returned.
    for (pwWork = pwText; pwWork < pwEnd; pwWork++)
        *pwWork = (*pwWork >> 8) | (*pwWork << 8);

    return pwEnd - pwStart;                         // number of words being returned
}
////////////////////////////////// global function put in for hiliter /////////////
// RemoveWhiteSpace -- squeezes all space characters out of a wide string in
// place so that it matches query box entries.
//
// pwChar, cw   text and its length in characters; compacted in place.
// cBase        out: number of space characters in front of the first
//              non-space character.
// cLimit       out: number of space characters after the last non-space
//              character.
//
// Returns the new length in characters.
WORD RemoveWhiteSpace(WCHAR* pwChar, int cw, int& cBase, int& cLimit) {
    cLimit = 0;
    cBase = 0;

    BOOL fSeenText = FALSE;     // becomes TRUE at the first non-space character
    int iOut = 0;               // next write position
    int iIn = 0;                // next read position

    while (iIn < cw) {
        const WCHAR wCur = pwChar[iIn++];

        if (!(char_types(wCur) & SPACE_CHAR)) {
            pwChar[iOut++] = wCur;      // keep it, moved down over removed spaces
            fSeenText = TRUE;
            cLimit = 0;                 // spaces counted so far were not trailing
            continue;
        }

        // A space character: leading until text has been seen, possibly trailing afterwards.
        if (fSeenText)
            cLimit++;
        else
            cBase++;
    }
    return iOut;                // new length
}