// ftslex.cpp : Unicode word lexer and sort key provider for WinHelp browser. // #include "stdafx.h" #include #include #include "ftslex.h" #define char_types(w) (*(pbCharTypes[BYTE(w>>8)] + BYTE(w))) #define set_char_types(w, bType) (*(pbCharTypes[BYTE(w>>8)] + BYTE(w)) = bType) #define add_char_types(w, bType) (*(pbCharTypes[BYTE(w>>8)] + BYTE(w)) |= bType) #define sub_char_types(w, bType) (*(pbCharTypes[BYTE(w>>8)] + BYTE(w)) &= ~bType) UINT ftslex_os_version= 0; CP g_lastCP; WORD g_wLocales = 0; LCID g_lcids[MAX_LOCALES]; CP g_wCPs [MAX_LOCALES]; BYTE bLeadBytes [0x100]; BYTE *pbCharTypes [0x100]; BYTE bDefaultTable[0x100]; BOOL CALLBACK LocaleEnumProc(LPTSTR); BOOL CALLBACK CodePageEnumProc(LPTSTR); CP g_cpSet[] = { ANSI_CHARSET, 1252, SYMBOL_CHARSET, 1252, // ?? Should be a different code page, but what?? SHIFTJIS_CHARSET, 932, HANGEUL_CHARSET, 949, GB2312_CHARSET, 936, CHINESEBIG5_CHARSET, 950, THAI_CHARSET, 874, HEBREW_CHARSET, 1255, ARABIC_CHARSET, 1256, GREEK_CHARSET, 1253, TURKISH_CHARSET, 1254, BALTIC_CHARSET, 1257, EASTEUROPE_CHARSET, 1250, RUSSIAN_CHARSET, 1251 }; extern "C" void InitialFTSLex() { g_lcids[g_wLocales] = GetUserDefaultLCID(); g_wCPs [g_wLocales] = GetACP(); g_wLocales++; ftslex_os_version = (GetVersion() >> 30) & 0x0003; for (int i = 0; i < 256; i++) pbCharTypes[i] = bDefaultTable; EnumSystemLocalesA((LOCALE_ENUMPROC)LocaleEnumProc, LCID_SUPPORTED); //INSTALLED); EnumSystemCodePagesA((CODEPAGE_ENUMPROC)CodePageEnumProc, CP_INSTALLED); if (pbCharTypes[0] != bDefaultTable) // special code point type overrides: { add_char_types(L'_', LETTER_CHAR); // treat underscore as char, for software prefix names. sub_char_types(L'"', LETTER_IMBED); // remove double quote as imbed (suffix), no . sub_char_types(L'/', LETTER_IMBED); // remove right slash as imbed (suffix) sub_char_types(L'=', LETTER_IMBED); // remove equal sign as imbed (suffix) sub_char_types(L'@', LETTER_IMBED); // remove at sign as imbed (suffix) sub_char_types(L'\\', LETTER_IMBED); // remove left slash as imbed (suffix) } } extern "C" void ShutdownFTSLex() { for (int i = 0; i < 256; i++) { if (pbCharTypes[i] != bDefaultTable) delete [] pbCharTypes[i]; } } UINT APIENTRY GetOSVersion() { return ftslex_os_version; } BOOL CALLBACK LocaleEnumProc(LPSTR lpLocaleString) { LCID lcid; BYTE bCP[6]; CP wCP; LPSTR lpEndString; lcid = strtoul(lpLocaleString, &lpEndString, 16); if (GetLocaleInfoA(lcid, LOCALE_IDEFAULTANSICODEPAGE, (LPSTR)bCP, sizeof(bCP))) { wCP = atoi((PSTR)bCP); if (g_wLocales < MAX_LOCALES) { g_lcids[g_wLocales] = lcid; g_wCPs [g_wLocales] = wCP; g_wLocales++; } } if (GetLocaleInfoA(lcid, LOCALE_IDEFAULTCODEPAGE, (LPSTR)bCP, sizeof(bCP))) { wCP = atoi((PSTR)bCP); if (g_wLocales < MAX_LOCALES) { g_lcids[g_wLocales] = lcid; g_wCPs [g_wLocales] = wCP; g_wLocales++; } } return TRUE; } LCID APIENTRY GetLocaleFromCP(CP wCP) { for (int i = 0; i < g_wLocales; i++) if (wCP == g_wCPs[i]) return g_lcids[i]; return GetUserDefaultLCID(); } CP APIENTRY GetCPFromLocale(LCID lcid) { for (int i = 0; i < g_wLocales; i++) if (lcid == g_lcids[i]) return g_wCPs[i]; return GetACP(); } CP APIENTRY GetCPFromCharset(BYTE charset) { for (int i = 0; i < sizeof(g_cpSet)/sizeof(g_cpSet[0]); i += 2) if (charset == (BYTE)g_cpSet[i]) return g_cpSet[i+1]; return GetACP(); } BOOL CALLBACK CodePageEnumProc(LPSTR lpCodePageString) { BYTE bSection; BYTE szChars[2]; LCID lcid; int i, j, nCount, nFinal; WCHAR wChars; WORD wCharType1, wCharType2, wCharType3; CP wCP; CPINFO CPInfo; wCP = atoi(lpCodePageString); if (wCP == 37 || wCP == 500 || wCP == 875 || wCP == 1026) return TRUE; // do not process EBCDIC code pages // if (wCP < 1200 || wCP > 1299) // return TRUE; // only process Windows code pages // lcid = GetLocaleFromCP(wCP); // the linguists argue to use to user's lcid = GetUserDefaultLCID(); // ... LCID for multilingual contexts if (!GetCPInfo(wCP, &CPInfo)) return TRUE; #ifdef TESTMODE else { TRACE("CODEPAGE: %5d, MAXCHARSIZE: %3d, DEFAULTCHAR: %2X", wCP, CPInfo.MaxCharSize, CPInfo.DefaultChar[0]); for (i = 0; i < MAX_LEADBYTES; i++) TRACE(", %d", CPInfo.LeadByte[i]); TRACE("\n"); } #endif if (nFinal = (CPInfo.MaxCharSize == 1) ? 0 : 255) // one pass if no lead bytes (MaxCharSize = 1) { g_lastCP = wCP; memset(bLeadBytes, 0, sizeof(bLeadBytes)); for (i = 0; i < MAX_LEADBYTES; i += 2) { if (!CPInfo.LeadByte[i] && !CPInfo.LeadByte[i+1]) break; // end of lead byte ranges for (j = CPInfo.LeadByte[i]; j <= CPInfo.LeadByte[i+1]; j++) bLeadBytes[j] = TRUE; // mark as valid lead byte } } for (i = 0; i <= nFinal; i++) // thumb thru all potential lead bytes { if (!i || bLeadBytes[i]) // lead bytes OR chars 0x00 - 0xff { for (j = 0; j < 256; j++) { nCount = 0; if (i) szChars[nCount++] = i; // create leadbyte/char pairs szChars[nCount++] = j; if (MultiByteToWideChar(wCP, MB_ERR_INVALID_CHARS, (PSTR)szChars, nCount, (PWSTR)&wChars, 1) != 1) continue; // not valid UNICODE character bSection = HIBYTE(wChars); if (pbCharTypes[bSection] == bDefaultTable) // UNICODE section not accessed yet { pbCharTypes[bSection] = New BYTE[256]; if (!pbCharTypes[bSection]) RaiseException(STATUS_NO_MEMORY, EXCEPTION_NONCONTINUABLE, 0, NULL); memset(pbCharTypes[bSection], 0, 256 * sizeof(BYTE)); } // already processed this UNICODE char else if (char_types(wChars)) continue; GetStringTypeA(lcid, CT_CTYPE1, (PSTR)szChars, i ? 2 : 1, &wCharType1); GetStringTypeA(lcid, CT_CTYPE2, (PSTR)szChars, i ? 2 : 1, &wCharType2); GetStringTypeA(lcid, CT_CTYPE3, (PSTR)szChars, i ? 2 : 1, &wCharType3); #ifdef TESTMODE if (wCharType1 & 0x0001) TRACE("UPPER "); if (wCharType1 & 0x0002) TRACE("LOWER "); if (wCharType1 & 0x0004) TRACE("DIGIT "); if (wCharType1 & 0x0008) TRACE("SPACE "); if (wCharType1 & 0x0010) TRACE("PUNCT "); if (wCharType1 & 0x0020) TRACE("CNTRL "); if (wCharType1 & 0x0040) TRACE("BLANK "); if (wCharType1 & 0x0080) TRACE("XDIGIT "); if (wCharType1 & 0x0100) TRACE("ALPHA "); if (wCharType2 == 0x0001) TRACE("LEFTTORIGHT "); if (wCharType2 == 0x0002) TRACE("RIGHTTOLEFT "); if (wCharType2 == 0x0003) TRACE("EUROPENUMBER "); if (wCharType2 == 0x0004) TRACE("EUROPESEPARATOR "); if (wCharType2 == 0x0005) TRACE("EUROPETERMINATOR "); if (wCharType2 == 0x0006) TRACE("ARABICNUMBER "); if (wCharType2 == 0x0007) TRACE("COMMONSEPARATOR "); if (wCharType2 == 0x0008) TRACE("BLOCKSEPARATOR "); if (wCharType2 == 0x0009) TRACE("SEGMENTSEPARATOR "); if (wCharType2 == 0x000a) TRACE("WHITESPACE "); if (wCharType2 == 0x000b) TRACE("OTHERNEUTRAL "); if (wCharType3 & 0x0001) TRACE("NONSPACING "); if (wCharType3 & 0x0002) TRACE("DIACRITIC "); if (wCharType3 & 0x0004) TRACE("VOWELMARK "); if (wCharType3 & 0x0008) TRACE("SYMBOL "); if (wCharType3 & 0x0010) TRACE("KATAKANA "); if (wCharType3 & 0x0020) TRACE("HIRAGANA "); if (wCharType3 & 0x0040) TRACE("HALFWIDTH "); if (wCharType3 & 0x0080) TRACE("FULLWIDTH "); if (wCharType3 & 0x0100) TRACE("IDEOGRAPH "); if (wCharType3 & 0x0200) TRACE("KASHIDA "); if (wCharType3 & 0x0400) TRACE("LEXICAL "); if (wCharType3 & 0x8000) TRACE("C3ALPHA "); TRACE("\n"); #endif set_char_types(wChars, CHAR_DEFINED); if (wCharType1 & C1_ALPHA) // process characters add_char_types(wChars, LETTER_CHAR); if (wCharType1 & C1_SPACE) add_char_types(wChars, SPACE_CHAR); // mark space characters if ((wCharType1 & C1_DIGIT) || (wCharType2 == C2_EUROPENUMBER) || (wCharType2 == C2_ARABICNUMBER)) add_char_types(wChars, DIGIT_CHAR); // mark number characters if (wCharType3 & C3_LEXICAL) add_char_types(wChars, LETTER_IMBED); // mark letter embedded separators if (wCharType2 == C2_COMMONSEPARATOR || wCharType2 == C2_EUROPESEPARATOR) add_char_types(wChars, DIGIT_IMBED); // mark number embedded separators } } } return TRUE; } LPSTR APIENTRY CharNextMult(CP wCP, LPCSTR str, int n) { int i, j; if (wCP != g_lastCP) // we are processing a new CP, so { // ... set up our lead byte tables CPINFO CPInfo; if (!GetCPInfo(wCP, &CPInfo)) return (LPSTR)str + n; // error return, let's make a guess g_lastCP = wCP; memset(bLeadBytes, 0, sizeof(bLeadBytes)); // establish lead bytes for (i = 0; i < MAX_LEADBYTES; i += 2) { if (!CPInfo.LeadByte[i] && !CPInfo.LeadByte[i+1]) break; // end of lead byte ranges for (j = CPInfo.LeadByte[i]; j <= CPInfo.LeadByte[i+1]; j++) bLeadBytes[j] = TRUE; // mark as valid lead byte } } for (i = 0; i < n; i++, str++) if (bLeadBytes[*PBYTE(str)]) str++; return (LPSTR)str; } int APIENTRY FTSWordBreakA (CP wCP, LPSTR *ppText, LPINT pcText, LPSTR *paToken, LPSTR *paTokenEnd, LPBYTE paType, PUINT paHash, int cwTokens, UINT fTokenizeSpaces) { int i, cwChar, nRet, diff; CPINFO CPInfo; LPWSTR pwText, ppwText; if (!GetCPInfo(wCP, &CPInfo)) return 0; cwChar = *pcText << 1; if (!(pwText = ppwText = New WCHAR[cwChar])) return 0; cwChar = MultiByteToWideChar(wCP, 0, *ppText, *pcText, pwText, cwChar); nRet = FTSWordBreakW(&ppwText, &cwChar, (LPWSTR *)paToken, (LPWSTR *)paTokenEnd, paType, paHash, cwTokens, fTokenizeSpaces); if (nRet) { if (CPInfo.MaxCharSize == 1) // single byte code page { for (i = 0; i < nRet; i++) { if (paToken) paToken[i] = *ppText + ((LPWSTR)paToken[i] - pwText); if (paTokenEnd) paTokenEnd[i] = *ppText +((LPWSTR)paTokenEnd[i] - pwText); } *ppText += ppwText - pwText; *pcText = cwChar; } else // DBCS code pages { LPSTR cPtr = *ppText; LPWSTR wPtr = pwText; for (i = 0; i < nRet; i++) { if (paToken) { diff = (LPWSTR)paToken[i] - wPtr; // how many more Unicode chars cPtr = CharNextMult(wCP, cPtr, diff); // advance that many DBCS chars wPtr += diff; // adjust our Unicode pointer paToken[i] = cPtr; // return our DBCS pointer } if (paTokenEnd) { diff = (LPWSTR)paTokenEnd[i] - wPtr; // how many more Unicode chars cPtr = CharNextMult(wCP, cPtr, diff); // advance that many DBCS chars wPtr += diff; // adjust our Unicode pointer paTokenEnd[i] = cPtr; // return our DBCS pointer } } diff = ppwText - wPtr; // how many more Unicode chars cPtr = CharNextMult(wCP,cPtr, diff); // advance that many DBCS chars *pcText -= cPtr - *ppText; // return remaining DBCS chars *ppText = cPtr; // return our DBCS pointer } } delete [] pwText; return nRet; } int APIENTRY FTSWordBreakW (LPWSTR *ppwText, LPINT pcwText, LPWSTR *paToken, LPWSTR *paTokenEnd, LPBYTE paType, PUINT paHash, int cwTokens, UINT fTokenizeSpaces) { BYTE bCharType, bPrevType, bFirstCharType; UINT wHash; WORD wPunc, cwTokensOut = 0; WCHAR wChar, wChar2, wImbed = 0; LPWSTR pwPos, pwLimit, pwTokenStart, pwStart; pwPos = pwStart = *ppwText; // position WCHAR pointer to beginning of text wChar = *pwPos; // get first UNICODE character pwLimit = pwPos + *pcwText; // end of UNICODE text FOREVER_ { // token hash value init wHash = 0; if (pwPos == pwLimit) // have reached end of UNCODE text break; bFirstCharType = (char_types(wChar) & WORD_TYPE); bPrevType = 0; if (!bFirstCharType && (fTokenizeSpaces & STARTING_IMBEDS)) { bCharType = char_types(wChar); if (bCharType & LETTER_IMBED) { if (pwPos+1 != pwLimit && char_types(*(pwPos+1)) & LETTER_CHAR) { bFirstCharType = TRUE; bPrevType |= LETTER_CHAR; } } if (bCharType & DIGIT_IMBED) { if (pwPos+1 != pwLimit && char_types(*(pwPos+1)) & DIGIT_CHAR) { bFirstCharType = TRUE; bPrevType |= DIGIT_CHAR; } } } if (bFirstCharType) // current WCHAR is letter or number { pwTokenStart = pwPos; // save pointer to beginning of token wHash = 0; // seed hash value FOREVER_ { if (pwPos > pwStart && !(fTokenizeSpaces & STARTING_IMBEDS)) wImbed = *(pwPos - 1); // get possible starting imbed char do { wChar = *pwPos; // current UNICODE character bCharType = char_types(wChar); if ((bCharType & WORD_TYPE) || ((bCharType & LETTER_IMBED) && // changed to allow C3_LEXICAL (letter (wChar != wImbed) && (bPrevType & LETTER_CHAR)) || // ... imbed) to be suffix // (pwPos+1 == pwLimit || char_types(*(pwPos+1)) & LETTER_CHAR)) || ((bCharType & DIGIT_IMBED) && (bPrevType & DIGIT_CHAR) && (pwPos+1 == pwLimit || char_types(*(pwPos+1)) & DIGIT_CHAR || (fTokenizeSpaces & STARTING_IMBEDS)))) { wHash = _rotl(wHash, 5) - wChar; // token continues: letter, number, or bPrevType = bCharType; // ... surrounded embedded character } else break; // else token complete } while (++pwPos != pwLimit); // until end of UNICODE text if (!cwTokens) cwTokensOut++; // just count number of tokens needed else { if (paToken) paToken[cwTokensOut] = pwTokenStart; // token start pointer if (paTokenEnd) paTokenEnd[cwTokensOut] = pwPos; // token end pointer if (paHash) paHash[cwTokensOut] = wHash; // token hash value if (paType) paType[cwTokensOut] = bFirstCharType; // mark token as word (chars/digits) if (++cwTokensOut >= cwTokens) // no more token pointer space { *pcwText -= (pwPos - *ppwText); // update UNICODE character count *ppwText = pwPos; // update WCHAR text starting pointer return(cwTokensOut); // return token count } } // remove all spans of space characters if ((fTokenizeSpaces & REMOVE_SPACE_CHARS) && pwPos != pwLimit) { while (pwPos != pwLimit && (char_types(*pwPos) & SPACE_CHAR)) pwPos++; if (pwPos == pwLimit) break; pwTokenStart = pwPos; wChar = *pwPos; wHash = 0; if (!(char_types(wChar) & WORD_TYPE)) // lexing into non-space punctuation break; } else if (!(fTokenizeSpaces & TOKENIZE_SPACES) && pwPos != pwLimit && wChar == L' ' && (pwPos+1) != pwLimit && char_types(wChar2 = *(pwPos+1)) & WORD_TYPE) { pwTokenStart = ++pwPos; // if "fTokenizeSpaces" is FALSE, then wHash = 0; // ... remove single space between words continue; } // ... as a token else break; } } if (pwPos == pwLimit) break; // ... at end of provided WCHAR text pwTokenStart = pwPos; // save pointer to beginning of token wHash = 0; // seed hash value wPunc = wChar; // punctuation type (space vs. non-space) do { wChar = *pwPos; // current UNICODE character if (fTokenizeSpaces & TOKENIZE_SPACES) // "fTokenizeSpaces" option for WinHelp if ((wPunc == L' ' && wChar != L' ') || (wPunc != L' ' && wChar == L' ')) break; // tokenize spans of spaces -OR- non-spaces bCharType = char_types(wChar); if (!(bCharType & WORD_TYPE) || !wChar) { if (!(fTokenizeSpaces & REMOVE_SPACE_CHARS) || !(bCharType & SPACE_CHAR)) wHash = _rotl(wHash, 5) - wChar; // punctuation token continues: not letter/number } else break; } while (++pwPos != pwLimit); // until end of UNICODE text if (pwPos != pwLimit || pwTokenStart != pwLimit) { // discard empty final token LPWSTR pw, pwNew = pwPos; if (fTokenizeSpaces & REMOVE_SPACE_CHARS) // remove spans of space chars { for (; pwTokenStart < pwPos; ++pwTokenStart) if (!(char_types(*pwTokenStart) & SPACE_CHAR)) break; for (pw = pwNew = pwTokenStart; pw < pwPos; pw++) if (!(char_types(*pw) & SPACE_CHAR)) *pwNew++ = *pw; } if (pwNew != pwTokenStart) { if (!cwTokens) cwTokensOut++; // just count number of tokens needed else { if (paToken) paToken[cwTokensOut] = pwTokenStart; // Token start pointer if (paTokenEnd) paTokenEnd[cwTokensOut] = pwNew; // Token end pointer if (paHash) paHash[cwTokensOut] = wHash; // Token hash value if (paType) paType[cwTokensOut] = 0; // mark token as punctuation if (++cwTokensOut >= cwTokens) { *pcwText -= (pwPos - *ppwText); // update UNICODE character count *ppwText = pwPos; // update WCHAR text starting pointer return(cwTokensOut); // return token count } } } } } if (cwTokens) { *pcwText -= (pwPos - *ppwText); // update UNICODE character count *ppwText = pwPos; // update WCHAR text starting pointer } return cwTokensOut; // return token count } int APIENTRY LCSortKeyW(LCID lcid, WORD wMapFlags, LPCWSTR pwSource, int cwSource, LPWSTR pwDest, int cwDest) { int cb, nRet; #ifdef _DEBUG int err = 0; #endif if (ftslex_os_version != OS_NT) { PBYTE pbSource = NULL; UINT cbSource = 0; cbSource= cwSource << 1; // 1 WC can generate 2 bytes of MB pbSource = (cbSource > MAX_STACK_ALLOC)? New BYTE[cwSource] : PBYTE(_alloca(cbSource)); if (!pbSource) return 0; // error return cb = WideCharToMultiByte(GetACP(), 0, pwSource, cwSource, (PSTR)pbSource, cbSource, NULL, NULL); ASSERT(cb || !cbSource); nRet = LCMapStringA(lcid, LCMAP_FLAGS_CHICAGO, (PSTR)pbSource, cb, (PSTR)(pwDest+1), (cwDest-1)<<1) >> 1; #ifdef _DEBUG if (nRet == 0 && cb) { err = GetLastError(); char szBuf[256]; int cbShouldBe = LCMapStringA(lcid, LCMAP_FLAGS_CHICAGO, (PSTR)pbSource, cb, (PSTR)(pwDest+1), 0); wsprintf(szBuf, "LCMapStringA error code:%u cwdest == %u, should be = %u", err, (cwDest-1) <<1, cbShouldBe); MessageBox(NULL, szBuf, "", MB_OK); } #endif ASSERT(nRet || !cb); LPWSTR pwText = pwDest + 1; LPWSTR pwEnd = pwText + nRet; for ( ; pwText < pwEnd; pwText++) *pwText = (*pwText >> 8) | (*pwText << 8); // bring sort key weights in byte reversed order if (pbSource && cbSource > MAX_STACK_ALLOC) delete [] pbSource; } else { nRet = LCMapStringW(lcid, LCMAP_FLAGS, pwSource, cwSource, pwDest+1, (cwDest-1) << 1) >> 1; } ASSERT(nRet || !cwSource); // invalid zero length sort key if (nRet) { nRet++; if (cwDest && pwDest) // set a sort keys prefix so tokens group first by { BYTE bCharType = char_types(*pwSource); /* BYTE bCharType2; if ((bCharType & (LETTER_IMBED | DIGIT_IMBED)) && nRet > 2) { bCharType2 = char_types(*(pwSource+1)); // handle input matching for imbeds if (((bCharType & LETTER_IMBED) && (bCharType2 & LETTER_CHAR)) || ((bCharType & DIGIT_IMBED) && (bCharType2 & DIGIT_CHAR))) *pwDest = ~(bCharType2 & WORD_TYPE); // ... alphabetics, then numerics, then punctuation } */ // Prefix values -- // // 1 - Letters // 2 - Underscore(s) // 3 - Digits // 4 - All other punctuation streams if (bCharType & LETTER_CHAR) *pwDest = (*pwSource == L'_')? 2 : 1; else *pwDest = (bCharType & DIGIT_CHAR)? 3 : 4; // *pwDest = ~(bCharType & WORD_TYPE); // ... alphabetics, then numerics, then punctuation } } if ((wMapFlags & LCSORT_START) && cwDest && pwDest) // flag to return char class start sort key { for (int i = 0; i < nRet; i++) // skipping characters by two (alpha sort weights) if (HIBYTE(pwDest[i]) == SORT_KEY_SEPARATOR) // search for first weight separator { pwDest[i] = 0; return i; // return WCHAR character length } pwDest[0] = 0; // empty return return 0; } return nRet; } int APIENTRY LCSortKeyFirstW(LPWSTR pwText, int cwText) // convert start sort key to first matching sort key { for (int i = 0; i < cwText; i++) // skipping characters by two (alpha sort weights) if (HIBYTE(pwText[i]) == SORT_KEY_SEPARATOR) // search for first weight separator { pwText[i] = 0; return i; // return character length } return 0; // no separator } int APIENTRY LCSortKeyLastW(LPWSTR pwText, int cwText) // convert start sort key to last matching sort key { for (int i = 0; i < cwText; i++) // skipping characters by two (alpha sort weights) if (HIBYTE(pwText[i]) == SORT_KEY_SEPARATOR) // search for first weight separator { pwText[i-1]++; // increment last alpha weight pwText[i] = 0; return i; // return character length } return 0; // no separator } int APIENTRY LCSortKeyBase(LPWSTR pwText, int cwText) // convert sort key to base characters { // removes diacritic weights from sort key LPSTR pCopy, pEnd; LPWSTR pwStart = pwText; while (HIBYTE(*pwText) != SORT_KEY_SEPARATOR) // search for first weight separator pwText++; if (LOBYTE(*pwText) == SORT_KEY_SEPARATOR) // no case weights at all return cwText; // returning original sort key pCopy = (LPSTR)pwText; // point to next word for search pEnd = (LPSTR)(pwStart + cwText); *pwText++ = ((SORT_KEY_SEPARATOR << 8) | SORT_KEY_SEPARATOR); while ((pCopy += 2) < pEnd) // remember, sort key is byte reversed { if (*(pCopy+1) == SORT_KEY_SEPARATOR) // found diacritic separator (high byte) { while ((pCopy + 2) < pEnd) { // lobyte + next hibyte *pwText++ = ((WCHAR)(BYTE)*pCopy << 8) | (BYTE)(*(pCopy + 3)); pCopy += 2; } if (*pwText = (WCHAR)(BYTE)*pCopy << 8) // check if terminating wide-null pwText++; break; } else if (*pCopy == SORT_KEY_SEPARATOR) // found diacritic separator (low byte) { pCopy += 2; while (pCopy < pEnd) { *pwText++ = *((LPWSTR)pCopy); pCopy += 2; } break; } } return pwText - pwStart; } int APIENTRY LCSortKeyLower(LPWSTR pwText, int cwText) // convert sort key to lower case { LPSTR pWork, pAlpha; LPWSTR pwWork, pwEnd; LPSTR pEnd = (LPSTR)(pwText + cwText); LPWSTR pwStart = pwText; while (HIBYTE(*pwText) != SORT_KEY_SEPARATOR) // search for first weight separator pwText++; for (pwWork = pwText; pwWork < (LPWSTR)pEnd; pwWork++) *pwWork = (*pwWork >> 8) | (*pwWork << 8); // bring sort key weights in byte order for (pWork = (LPSTR)pwText + 1; pWork < pEnd; pWork++) // skip diacritic separator if (*pWork == SORT_KEY_SEPARATOR) // find alpha weights separator break; if (*++pWork == SORT_KEY_SEPARATOR) pwEnd = (LPWSTR)pEnd; // no alpha weights else { for (pAlpha = pWork + 1; pAlpha < pEnd; pAlpha++) // skip non-separator character to start if (*pAlpha == SORT_KEY_SEPARATOR) break; // find final sort key separator memcpy(pWork, pAlpha, pEnd - pAlpha); // copy remaining buffer memset(pWork + (pEnd - pAlpha), 0, pAlpha - pWork); // clear remaining buffer pwEnd = (LPWSTR)pEnd; while(!(*--pwEnd)) {}; // find last non-zero word pwEnd++; } for (pwWork = pwText; pwWork < pwEnd; pwWork++) *pwWork = (*pwWork >> 8) | (*pwWork << 8); // byte reverse sort keys weights return pwEnd - pwStart; // number of words being returned } ////////////////////////////////// global function put in for hiliter ///////////// WORD RemoveWhiteSpace(WCHAR* pwChar, int cw, int& cBase, int& cLimit) { // remove space from Unicode strings so they match query box entries int i, j; cBase = cLimit = 0; // number of leading/trailing blank characters BOOL fNonBlank = FALSE; // set when we reach the first non-blank character for (i=j=0; i