Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

1567 lines
41 KiB

//+---------------------------------------------------------------------------
//
//
// CThaiTrieIter - class CThaiTrieIter, used for traversing the trie.
//
// History:
// created 7/99 aarayas
//
// ©1999 Microsoft Corporation
//----------------------------------------------------------------------------
#include "CThaiTrieIter.hpp"
// Capacity, in WCHARs, of the resultWord and tempWord buffers allocated by
// the CThaiTrieIter constructor and used by Soundex()/Traverse().
#define WORDSIZE 64
// Recursion-depth counter for Traverse(); it is only reset, incremented and
// decremented inside "#if defined(_DEBUG)" sections.
static unsigned int iStackSize = 0;
//+---------------------------------------------------------------------------
//
// Function: IsThaiBeginClusterCharacter
//
// Synopsis: Tests whether a character lies in the inclusive range
//           THAI_Vowel_Sara_E .. THAI_Vowel_Sara_AI_MaiMaLai.
//
// Arguments: wc - character to classify.
//
// Returns: non-zero when wc is inside the range, zero otherwise.
//
// History: created 8/99 aarayas
//
//----------------------------------------------------------------------------
BOOL IsThaiBeginClusterCharacter(WCHAR wc)
{
    const bool fNotBelowFirst = (wc >= THAI_Vowel_Sara_E);
    const bool fNotAboveLast  = (wc <= THAI_Vowel_Sara_AI_MaiMaLai);

    return (fNotBelowFirst && fNotAboveLast);
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiUpperAndLowerClusterCharacter
//
// Synopsis: Tests whether a character is one of:
//             - THAI_Vowel_Sign_Mai_HanAkat,
//             - THAI_Vowel_Sign_Sara_Am .. THAI_Vowel_Sign_Phinthu (inclusive),
//             - THAI_Tone_MaiTaiKhu .. THAI_Nikhahit (inclusive).
//
// Arguments: wc - character to classify.
//
// Returns: non-zero when wc matches any of the three groups, zero otherwise.
//
// History: created 7/99 aarayas
//
//----------------------------------------------------------------------------
BOOL IsThaiUpperAndLowerClusterCharacter(WCHAR wc)
{
    const bool fMaiHanAkat =
        (wc == THAI_Vowel_Sign_Mai_HanAkat);
    const bool fSaraAmThroughPhinthu =
        (wc >= THAI_Vowel_Sign_Sara_Am) && (wc <= THAI_Vowel_Sign_Phinthu);
    const bool fMaiTaiKhuThroughNikhahit =
        (wc >= THAI_Tone_MaiTaiKhu) && (wc <= THAI_Nikhahit);

    return (fMaiHanAkat || fSaraAmThroughPhinthu || fMaiTaiKhuThroughNikhahit);
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiEndingClusterCharacter
//
// Synopsis: Tests whether a character is one of THAI_Vowel_Sara_A,
//           THAI_Vowel_Sara_AA, THAI_Vowel_LakKhangYao or THAI_Vowel_MaiYaMok.
//
// Arguments: wc - character to classify.
//
// Returns: non-zero when wc is one of the four characters, zero otherwise.
//
// History: created 8/99 aarayas
//
// Notes: THAI_Sign_PaiYanNoi is deliberately not tested here; it was taken
//        out to fix the O11.PaiYanNoi issue.
//
//----------------------------------------------------------------------------
BOOL IsThaiEndingClusterCharacter(WCHAR wc)
{
    if (wc == THAI_Vowel_Sara_A)
        return true;
    if (wc == THAI_Vowel_Sara_AA)
        return true;
    if (wc == THAI_Vowel_LakKhangYao)
        return true;

    return (wc == THAI_Vowel_MaiYaMok);
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiMostlyBeginCharacter
//
// Synopsis: Tests whether a character always, or most likely, appears at
//           the front of a word.
//
// Arguments: wc - character to classify.
//
// Returns: true when wc is in THAI_Vowel_Sara_E .. THAI_Vowel_Sara_AI_MaiMaLai
//          or is one of the listed consonants, false otherwise.
//
// History: created 8/99 aarayas
//
//----------------------------------------------------------------------------
bool IsThaiMostlyBeginCharacter(WCHAR wc)
{
    // Characters that are always in front of a word: the vowel range ...
    if (wc >= THAI_Vowel_Sara_E && wc <= THAI_Vowel_Sara_AI_MaiMaLai)
        return true;

    // ... and these consonants.
    if (wc == THAI_Cho_Ching ||
        wc == THAI_Pho_Phung ||
        wc == THAI_Fo_Fa     ||
        wc == THAI_Ho_Nok_Huk)
    {
        return true;
    }

    // Characters that are most likely in front of a word.
    return (wc == THAI_Ho_Hip       ||
            wc == THAI_Pho_Samphao  ||
            wc == THAI_Kho_Rakhang  ||
            wc == THAI_Fo_Fan       ||
            wc == THAI_So_So        ||
            wc == THAI_Tho_NangmonTho);
}
//+---------------------------------------------------------------------------
//
// Function: IsContain
//
// Synopsis: Linear search for a character in a counted buffer.
//
// Arguments: pwcWord  - buffer to search; only the first iWordLen
//                       characters are examined (no terminator is needed).
//            iWordLen - number of characters to examine.
//            wc       - character to look for.
//
// Returns: true if wc occurs in the first iWordLen characters of pwcWord,
//          false otherwise (including when iWordLen is 0).
//
// History: created 7/00 aarayas
//
//----------------------------------------------------------------------------
bool IsContain(const WCHAR* pwcWord, unsigned int iWordLen, WCHAR wc)
{
    for (unsigned int ich = 0; ich < iWordLen; ++ich)
    {
        if (pwcWord[ich] == wc)
            return true;
    }
    return false;
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiMostlyLastCharacter
//
// Synopsis: Tests whether a character always, or most likely, ends a word.
//
// Arguments: wc - character to classify.
//
// Returns: true when wc is one of the five characters tested below,
//          false otherwise.
//
// History: created 8/99 aarayas
//
//----------------------------------------------------------------------------
bool IsThaiMostlyLastCharacter(WCHAR wc)
{
    // Always the end of a word.
    if (wc == THAI_Vowel_Sign_Sara_Am ||
        wc == THAI_Sign_PaiYanNoi     ||
        wc == THAI_Vowel_MaiYaMok)
    {
        return true;
    }

    // Most likely the end of a word.
    return (wc == THAI_Vowel_LakKhangYao || wc == THAI_Thanthakhat);
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiToneMark
//
// Synopsis: Tests whether a character is treated as a tone mark by the
//           soundex matching code.
//
// Arguments: wc - character to classify.
//
// Returns: true when wc is in 0x0e48 .. 0x0e4b (inclusive) or equals 0x0e31,
//          false otherwise.
//
// History: created 8/99 aarayas
//
// Notes: In Unicode, U+0E48..U+0E4B are the four Thai tone marks and U+0E31
//        is Mai Han-Akat, which this function also accepts.
//
//----------------------------------------------------------------------------
bool IsThaiToneMark(WCHAR wc)
{
    if (wc == 0x0e31)
        return true;

    return ((wc >= 0x0e48) && (wc <= 0x0e4b));
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiEndingSign
//
// Synopsis: Tests whether a character is THAI_Vowel_MaiYaMok or
//           THAI_Sign_PaiYanNoi.
//
// Arguments: wc - character to classify.
//
// Returns: true for either of the two signs, false otherwise.
//
// History: created 8/02 aarayas
//
//----------------------------------------------------------------------------
bool IsThaiEndingSign(WCHAR wc)
{
    if (wc == THAI_Vowel_MaiYaMok)
        return true;

    return (wc == THAI_Sign_PaiYanNoi);
}
//+---------------------------------------------------------------------------
//
// Function: GetCluster
//
// Synopsis: The function return the next number of character which represent
// a cluster of Thai text.
//
// ie. Kor Kai, Kor Kai -> 1
// Kor Kai, Sara Um -> 2
//
// * Note: this function will not return more than 3 characters for a
// cluster, as that would represent an invalid sequence of characters.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
/*
unsigned int GetCluster(WCHAR* pszIndex)
{
int iRetValue = 0;
// Take all begin cluster character.
while (IsThaiBeginClusterCharacter(*pszIndex))
{
pszIndex++;
iRetValue++;
}
if (IsThaiConsonant(*pszIndex))
{
pszIndex++;
iRetValue++;
while (IsThaiUpperAndLowerClusterCharacter(*pszIndex))
{
pszIndex++;
iRetValue++;
}
while (IsThaiEndingClusterCharacter(*pszIndex))
{
pszIndex++;
iRetValue++;
}
}
if (iRetValue == 0)
// The character is probably a punctuation.
iRetValue++;
return iRetValue;
}
*/
//+---------------------------------------------------------------------------
//
// Function: IsThaiConsonant
//
// Synopsis: Tests whether a character lies in the inclusive range
//           THAI_Ko_Kai .. THAI_Ho_Nok_Huk.
//
// Arguments: wc - character to classify.
//
// Returns: non-zero when wc is inside the range, zero otherwise.
//
// History: created 7/99 aarayas
//
//----------------------------------------------------------------------------
BOOL IsThaiConsonant(WCHAR wc)
{
    const bool fNotBelowFirst = (wc >= THAI_Ko_Kai);
    const bool fNotAboveLast  = (wc <= THAI_Ho_Nok_Huk);

    return (fNotBelowFirst && fNotAboveLast);
}
//+---------------------------------------------------------------------------
//
// Define the different part of speech for Thai.
//
// wzPOSLookup maps a part-of-speech id (the row index) to its tag string.
// POSCompress() searches this table by string and POSDecompress() indexes
// it by id. Each row is a fixed buffer of 46 WCHARs, i.e. room for up to 45
// characters plus the terminator.
//
// Rows 0 - 47 are single tags with their description on the same line.
// From row 48 onwards each row is a space-separated combination of tags,
// and the "// NN." comment giving its index is on the line FOLLOWING the
// string it numbers.
//
//----------------------------------------------------------------------------
WCHAR wzPOSLookup[POSTYPE][46] =
{ L"NONE", // 0 . No tags.
L"NPRP", // 1 . Proper noun
L"NCNM", // 2 . Cardinal number
L"NONM", // 3 . Ordinal number
L"NLBL", // 4 . Label noun
L"NCMN", // 5 . Common noun
L"NTTL", // 6 . Title noun
L"PPRS", // 7 . Personal pronoun
L"PDMN", // 8 . Demonstrative pronoun
L"PNTR", // 9 . Interrogative pronoun
L"PREL", // 10. Relative pronoun
L"VACT", // 11. Active verb
L"VSTA", // 12. Stative verb
L"VATT", // 13. Attributive verb
L"XVBM", // 14. Pre-verb auxiliary, before negator
L"XVAM", // 15. Pre-verb auxiliary, after negator
L"XVMM", // 16. Pre-verb, before or after negator
L"XVBB", // 17. Pre-verb auxiliary, in imperative mood
L"XVAE", // 18. Post-verb auxiliary
L"DDAN", // 19. Definite determiner, after noun without classifier in between
L"DDAC", // 20. Definite determiner, allowing classifier in between
L"DDBQ", // 21. Definite determiner, between noun and classifier or preceding quantitative expression
L"DDAQ", // 22. Definite determiner, following quantitative expression
L"DIAC", // 23. Indefinite determiner, following noun; allowing classifier in between
L"DIBQ", // 24. Indefinite determiner, between noun and classifier or preceding quantitative expression
L"DIAQ", // 25. Indefinite determiner, following quantitative expression
L"DCNM", // 26. Determiner, cardinal number expression
L"DONM", // 27. Determiner, ordinal number expression
L"ADVN", // 28. Adverb with normal form
L"ADVI", // 29. Adverb with iterative form
L"ADVP", // 30. Adverb with prefixed form
L"ADVS", // 31. Sentential adverb
L"CNIT", // 32. Unit classifier
L"CLTV", // 33. Collective classifier
L"CMTR", // 34. Measurement classifier
L"CFQC", // 35. Frequency classifier
L"CVBL", // 36. Verbal classifier
L"JCRG", // 37. Coordinating conjunction
L"JCMP", // 38. Comparative conjunction
L"JSBR", // 39. Subordinating conjunction
L"RPRE", // 40. Preposition
L"INT", // 41. Interjection
L"FIXN", // 42. Nominal prefix
L"FIXV", // 43. Adverbial prefix
L"EAFF", // 44. Ending for affirmative sentence
L"EITT", // 45. Ending for interrogative sentence
L"NEG", // 46. Negator
L"PUNC", // 47. Punctuation
L"ADVI ADVN",
// 48.
L"ADVI ADVN NCMN",
// 49.
L"ADVI ADVN VSTA",
// 50.
L"ADVI VATT",
// 51.
L"ADVN ADVP",
// 52.
L"ADVN ADVP ADVS",
// 53.
L"ADVN ADVP DIAQ DIBQ JCMP JSBR RPRE",
// 54.
L"ADVN ADVP NCMN VATT",
// 55.
L"ADVN ADVP VSTA",
// 56.
L"ADVN ADVS DDAC DDAN DIAC VATT XVAE",
// 57.
L"ADVN ADVS DDAN NCMN VATT VSTA",
// 58.
L"ADVN ADVS NCMN",
// 59.
L"ADVN ADVS NCMN VATT",
// 60.
L"ADVN ADVS VACT",
// 61.
L"ADVN ADVS VATT",
// 62.
L"ADVN CFQC NCMN RPRE VSTA",
// 63.
L"ADVN CLTV CNIT NCMN RPRE",
// 64.
L"ADVN DCNM",
// 65.
L"ADVN DDAC DDAN",
// 66.
L"ADVN DDAC DDAN NCMN PDMN",
// 67.
L"ADVN DDAC DDAN PDMN",
// 68.
L"ADVN DDAN DDBQ",
// 69.
L"ADVN DDAN DIAC PDMN VSTA",
// 70.
L"ADVN DDAN FIXN PDMN",
// 71.
L"ADVN DDAN NCMN",
// 72.
L"ADVN DDAQ",
// 73.
L"ADVN DDBQ",
// 74.
L"ADVN DDBQ RPRE VATT",
// 75.
L"ADVN DDBQ VATT VSTA XVAE",
// 76.
L"ADVN DIAC",
// 77.
L"ADVN DIAC PDMN",
// 78.
L"ADVN DIBQ",
// 79.
L"ADVN DIBQ NCMN",
// 80.
L"ADVN DIBQ VACT VSTA",
// 81.
L"ADVN DIBQ VATT",
// 82.
L"ADVN DONM JCMP",
// 83.
L"ADVN DONM JSBR NCMN RPRE VATT XVAE",
// 84.
L"ADVN EITT PNTR",
// 85.
L"ADVN FIXN",
// 86.
L"ADVN JCMP",
// 87.
L"ADVN JCRG",
// 88.
L"ADVN JCRG JSBR",
// 89.
L"ADVN JCRG JSBR XVBM XVMM",
// 90.
L"ADVN JCRG RPRE VACT VSTA XVAE",
// 91.
L"ADVN JSBR",
// 92.
L"ADVN JSBR NCMN",
// 93.
L"ADVN JSBR RPRE VATT",
// 94.
L"ADVN JSBR RPRE XVAE",
// 95.
L"ADVN JSBR VSTA",
// 96.
L"ADVN JSBR XVAE XVBM",
// 97.
L"ADVN NCMN",
// 98.
L"ADVN NCMN RPRE VACT VATT VSTA",
// 99.
L"ADVN NCMN RPRE VACT XVAE",
// 100.
L"ADVN NCMN RPRE VATT",
// 101.
L"ADVN NCMN VACT VATT VSTA",
// 102.
L"ADVN NCMN VACT VSTA",
// 103.
L"ADVN NCMN VATT",
// 104.
L"ADVN NCMN VATT VSTA",
// 105.
L"ADVN NEG",
// 106.
L"ADVN NPRP VATT",
// 107.
L"ADVN PDMN VACT",
// 108.
L"ADVN PNTR",
// 109.
L"ADVN RPRE",
// 110.
L"ADVN RPRE VACT VATT XVAE",
// 111.
L"ADVN RPRE VACT XVAM XVBM",
// 112.
L"ADVN RPRE VATT VSTA",
// 113.
L"ADVN RPRE VSTA",
// 114.
L"ADVN VACT",
// 115.
L"ADVN VACT VATT",
// 116.
L"ADVN VACT VATT VSTA",
// 117.
L"ADVN VACT VATT VSTA XVAM XVBM",
// 118.
L"ADVN VACT VSTA",
// 119.
L"ADVN VACT VSTA XVAE",
// 120.
L"ADVN VACT XVAE",
// 121.
L"ADVN VATT",
// 122.
L"ADVN VATT VSTA",
// 123.
L"ADVN VATT VSTA XVAM XVBM XVMM",
// 124.
L"ADVN VATT XVBM",
// 125.
L"ADVN VSTA",
// 126.
L"ADVN VSTA XVAE",
// 127.
L"ADVN VSTA XVBM",
// 128.
L"ADVN XVAE",
// 129.
L"ADVN XVAM",
// 130.
L"ADVN XVBM XVMM",
// 131.
L"ADVP JSBR RPRE VATT",
// 132.
L"ADVP VATT",
// 133.
L"ADVS DDAC JCRG",
// 134.
L"ADVS DDAC JSBR",
// 135.
L"ADVS DDAN VSTA",
// 136.
L"ADVS DIAC",
// 137.
L"ADVS DONM",
// 138.
L"ADVS JCRG JSBR",
// 139.
L"ADVS JCRG JSBR RPRE",
// 140.
L"ADVS JSBR",
// 141.
L"ADVS JSBR RPRE",
// 142.
L"ADVS NCMN",
// 143.
L"ADVS VATT",
// 144.
L"CFQC CLTV CNIT DCNM JCRG JSBR NCMN RPRE XVBM",
// 145.
L"CFQC CNIT PREL",
// 146.
L"CFQC NCMN",
// 147.
L"CLTV CNIT NCMN",
// 148.
L"CLTV CNIT NCMN RPRE",
// 149.
L"CLTV CNIT NCMN VSTA",
// 150.
L"CLTV NCMN",
// 151.
L"CLTV NCMN VACT VATT",
// 152.
L"CLTV NCMN VATT",
// 153.
L"CMTR CNIT NCMN",
// 154.
L"CMTR NCMN",
// 155.
L"CMTR NCMN VATT VSTA",
// 156.
L"CNIT DDAC NCMN VATT",
// 157.
L"CNIT DONM NCMN RPRE VATT",
// 158.
L"CNIT FIXN FIXV JSBR NCMN",
// 159.
L"CNIT JCRG JSBR NCMN PREL RPRE VATT",
// 160.
L"CNIT JSBR RPRE",
// 161.
L"CNIT NCMN",
// 162.
L"CNIT NCMN RPRE",
// 163.
L"CNIT NCMN RPRE VATT",
// 164.
L"CNIT NCMN VACT",
// 165.
L"CNIT NCMN VSTA",
// 166.
L"CNIT NCNM",
// 167.
L"CNIT PPRS",
// 168.
L"DCNM DDAC DIAC DONM VATT VSTA",
// 169.
L"DCNM DDAN DIAC",
// 170.
L"DCNM DIAC NCMN NCNM",
// 171.
L"DCNM DIBQ NCMN",
// 172.
L"DCNM DONM",
// 173.
L"DCNM NCMN",
// 174.
L"DCNM NCNM",
// 175.
L"DCNM NCNM VACT",
// 176.
L"DCNM VATT",
// 177.
L"DDAC DDAN",
// 178.
L"DDAC DDAN DIAC NCMN",
// 179.
L"DDAC DDAN DIAC VATT",
// 180.
L"DDAC DDAN EAFF PDMN",
// 181.
L"DDAC DDAN PDMN",
// 182.
L"DDAC DIAC VSTA",
// 183.
L"DDAC NCMN",
// 184.
L"DDAN DDBQ",
// 185.
L"DDAN DIAC PNTR",
// 186.
L"DDAN NCMN",
// 187.
L"DDAN NCMN RPRE VATT",
// 188.
L"DDAN PDMN",
// 189.
L"DDAN RPRE",
// 190.
L"DDAN VATT",
// 191.
L"DDAQ VATT",
// 192.
L"DDBQ DIBQ",
// 193.
L"DDBQ JCRG JSBR",
// 194.
L"DDBQ JCRG NCMN",
// 195.
L"DIAC PDMN",
// 196.
L"DIBQ JSBR RPRE VSTA",
// 197.
L"DIBQ NCMN",
// 198.
L"DIBQ VATT",
// 199.
L"DIBQ VATT VSTA",
// 200.
L"DIBQ XVBM",
// 201.
L"DONM NCMN RPRE",
// 202.
L"DONM VACT VATT VSTA",
// 203.
L"DONM VATT",
// 204.
L"EAFF XVAE XVAM XVBM",
// 205.
L"EITT JCRG",
// 206.
L"FIXN FIXV NCMN",
// 207.
L"FIXN FIXV RPRE VSTA",
// 208.
L"FIXN JSBR NCMN PREL RPRE VSTA XVBM",
// 209.
L"FIXN NCMN",
// 210.
L"FIXN VACT",
// 211.
L"FIXN VACT VSTA",
// 212.
L"FIXV JSBR RPRE",
// 213.
L"JCMP JSBR",
// 214.
L"JCMP RPRE VSTA",
// 215.
L"JCMP VATT VSTA",
// 216.
L"JCMP VSTA",
// 217.
L"JCRG JSBR",
// 218.
L"JCRG JSBR NCMN RPRE",
// 219.
L"JCRG JSBR RPRE",
// 220.
L"JCRG RPRE",
// 221.
L"JCRG RPRE VATT VSTA",
// 222.
L"JCRG VSTA",
// 223.
L"JSBR NCMN",
// 224.
L"JSBR NCMN XVAE",
// 225.
L"JSBR NCMN XVAM XVBM XVMM",
// 226.
L"JSBR PREL",
// 227.
L"JSBR PREL RPRE",
// 228.
L"JSBR PREL XVBM",
// 229.
L"JSBR RPRE",
// 230.
L"JSBR RPRE VACT",
// 231.
L"JSBR RPRE VACT VSTA",
// 232.
L"JSBR RPRE VACT XVAE XVAM",
// 233.
L"JSBR RPRE VATT",
// 234.
L"JSBR RPRE VSTA",
// 235.
L"JSBR RPRE XVAM",
// 236.
L"JSBR VACT",
// 237.
L"JSBR VACT VSTA",
// 238.
L"JSBR VATT XVBM XVMM",
// 239.
L"JSBR VSTA",
// 240.
L"JSBR XVBM",
// 241.
L"NCMN NCNM",
// 242.
L"NCMN NCNM NPRP",
// 243.
L"NCMN NLBL NPRP",
// 244.
L"NCMN NPRP",
// 245.
L"NCMN NPRP RPRE",
// 246.
L"NCMN NTTL",
// 247.
L"NCMN PDMN PPRS",
// 248.
L"NCMN PDMN VATT",
// 249.
L"NCMN PNTR",
// 250.
L"NCMN PPRS PREL VACT",
// 251.
L"NCMN RPRE",
// 252.
L"NCMN RPRE VACT VATT",
// 253.
L"NCMN RPRE VATT",
// 254.
L"NCMN VACT",
// 255.
L"NCMN VACT VATT",
// 256.
L"NCMN VACT VATT VSTA XVAE",
// 257.
L"NCMN VACT VSTA",
// 258.
L"NCMN VACT VSTA XVAM",
// 259.
L"NCMN VACT VSTA XVBB",
// 260.
L"NCMN VATT",
// 261.
L"NCMN VATT VSTA",
// 262.
L"NCMN VATT XVAM",
// 263.
L"NCMN VSTA",
// 264.
L"NCMN XVBM",
// 265.
L"NPRP RPRE",
// 266.
L"NPRP VATT",
// 267.
L"NTTL PPRS",
// 268.
L"PDMN PPRS",
// 269.
L"PDMN VATT",
// 270.
L"PDMN VATT VSTA",
// 271.
L"PPRS PREL",
// 272.
L"PPRS VATT",
// 273.
L"RPRE VACT",
// 274.
L"RPRE VACT VATT",
// 275.
L"RPRE VACT VSTA",
// 276.
L"RPRE VACT VSTA XVAE",
// 277.
L"RPRE VACT XVAE",
// 278.
L"RPRE VATT",
// 279.
L"RPRE VATT VSTA",
// 280.
L"RPRE VSTA",
// 281.
L"VACT VATT",
// 282.
L"VACT VATT VSTA",
// 283.
L"VACT VATT XVAE XVAM XVBM",
// 284.
L"VACT VSTA",
// 285.
L"VACT VSTA XVAE",
// 286.
L"VACT VSTA XVAE XVAM",
// 287.
L"VACT VSTA XVAE XVAM XVMM",
// 288.
L"VACT VSTA XVAM",
// 289.
L"VACT VSTA XVAM XVMM",
// 290.
L"VACT XVAE",
// 291.
L"VACT XVAM",
// 292.
L"VACT XVAM XVMM",
// 293.
L"VACT XVMM",
// 294.
L"VATT VSTA",
// 295.
L"VSTA XVAE",
// 296.
L"VSTA XVAM",
// 297.
L"VSTA XVAM XVMM",
// 298.
L"VSTA XVBM",
// 299.
L"XVAM XVBM",
// 300.
L"XVAM XVBM XVMM",
// 301.
L"XVAM XVMM",
// 302.
L"UNKN",
// 303. Unknown
L"ABBR"
// 304. Abbreviation
};
//+---------------------------------------------------------------------------
//
// Function: POSCompress
//
// Synopsis: Part Of Speech Compress - translating string to unique id.
//           The id is the index of the first row of wzPOSLookup whose text
//           compares equal (wcscmp) to szTag.
//
// Arguments: szTag - null-terminated tag string to look up.
//
// Returns: the matching row index, or POSTYPE when no row matches.
//
// History: created 7/99 aarayas
//
//----------------------------------------------------------------------------
DWORD POSCompress(const WCHAR* szTag)
{
    int iTag = 0;

    // Advance until a row matches or the table is exhausted.
    while (iTag < POSTYPE && wcscmp(szTag, wzPOSLookup[iTag]) != 0)
        ++iTag;

    // iTag == POSTYPE here means "not found".
    return (DWORD)iTag;
}
//+---------------------------------------------------------------------------
//
// Function: POSDecompress
//
// Synopsis: Part Of Speech Decompress - translating a unique id back to its
//           tag string.
//
// Arguments: dwTag - row index into wzPOSLookup, as produced by POSCompress.
//
// Returns: pointer to the tag string stored in wzPOSLookup. An id that is
//          outside the table (for example the "not found" value returned by
//          POSCompress) yields row 0, "NONE", instead of reading past the
//          end of the table.
//
// History: created 7/99 aarayas
//
//----------------------------------------------------------------------------
inline WCHAR* POSDecompress(DWORD dwTag)
{
    // Number of rows actually present in the lookup table.
    const DWORD cTags = (DWORD)(sizeof(wzPOSLookup) / sizeof(wzPOSLookup[0]));

    if (dwTag >= cTags)
        dwTag = 0;

    return (&wzPOSLookup[dwTag][0]);
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: Constructor. Allocates the two WORDSIZE-character word buffers
//           (resultWord, tempWord) and the 53-entry first-character scan
//           cache (pTrieScanArray) that Init() fills in. soundexWord starts
//           out NULL and m_fThaiNumber starts out false.
//
// Arguments: none.
//
// History: created 8/99 aarayas
//
//----------------------------------------------------------------------------
CThaiTrieIter::CThaiTrieIter()
    : resultWord(new WCHAR[WORDSIZE]),
      soundexWord(NULL),
      tempWord(new WCHAR[WORDSIZE]),
      pTrieScanArray(new TRIESCAN[53]),
      m_fThaiNumber(false)
{
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: Destructor. Releases the buffers allocated by the constructor.
//
// Arguments: none.
//
// History: created 8/99 aarayas
//
// Notes: All three buffers are allocated with new[], so they must be
//        released with delete[]. delete[] on a null pointer is a no-op, so
//        no explicit null test is needed.
//
//----------------------------------------------------------------------------
CThaiTrieIter::~CThaiTrieIter()
{
    delete[] resultWord;
    delete[] tempWord;
    delete[] pTrieScanArray;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: Initialize variables. Initializes the CTrieIter base with the
//           given trie, then pre-computes the first-character scan cache:
//           one pTrieScanArray slot for every character in
//           THAI_Ko_Kai .. THAI_Ho_Nok_Huk and in
//           THAI_Vowel_Sara_E .. THAI_Vowel_Sara_AI_MaiMaLai.
//
// Arguments: ctrie - trie passed on to CTrieIter::Init.
//
// History: created 8/99 aarayas
//
// Notes: Slot index is (wc - THAI_Ko_Kai) for the first range and
//        (wc - THAI_Ko_Kai - 17) for the second; the 3-argument
//        MoveCluster uses the same mapping. The 17 presumably closes the
//        gap between the two ranges - TODO confirm against the THAI_*
//        constant values. The result of GetScanFirstChar is not checked.
//
//----------------------------------------------------------------------------
void CThaiTrieIter::Init(CTrie* ctrie)
{
    // Initialize parent.
    CTrieIter::Init(ctrie);

    // Cache the scan position of every character in the first range.
    WCHAR wch = THAI_Ko_Kai;
    while (wch <= THAI_Ho_Nok_Huk)
    {
        GetScanFirstChar(wch, pTrieScanArray + (wch - THAI_Ko_Kai));
        ++wch;
    }

    // Cache the second range, stored right after the first one.
    wch = THAI_Vowel_Sara_E;
    while (wch <= THAI_Vowel_Sara_AI_MaiMaLai)
    {
        GetScanFirstChar(wch, pTrieScanArray + (wch - THAI_Ko_Kai - 17));
        ++wch;
    }
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: Finds the first-level trie node whose character is wc and
//           copies its scan state to the caller.
//
// Arguments: wc        - character to look for among the first-level nodes.
//            pTrieScan - receives a copy of the matching scan state; it is
//                        zero-filled on every failure path.
//
// Returns: true if a node for wc was found, false otherwise.
//
// Modifies: trieScan1 (used as scratch).
//
// History: created 8/99 aarayas
//
//----------------------------------------------------------------------------
bool CThaiTrieIter::GetScanFirstChar(WCHAR wc, TRIESCAN* pTrieScan)
{
    // Reset the trie scan.
    memset(&trieScan1, 0, sizeof(TRIESCAN));

    if (!TrieGetNextState(pTrieCtrl, &trieScan1))
    {
        // Init() ignores the return value, so never leave the output
        // slot uninitialized.
        memset(pTrieScan, 0, sizeof(TRIESCAN));
        return false;
    }

    while (wc != trieScan1.wch)
    {
        // Keep moving to the right of the trie.
        if (!TrieGetNextNode(pTrieCtrl, &trieScan1))
        {
            memset(pTrieScan, 0, sizeof(TRIESCAN));
            return false;
        }
    }

    memcpy(pTrieScan, &trieScan1, sizeof(TRIESCAN));
    return true;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: The function moves trieScan to the node matching the given
//           cluster of Thai characters, starting below the current
//           trieScan position.
//
// Arguments: szCluster   - contains the Thai character cluster.
//            iNumCluster - number of characters in the cluster.
//
// Returns: TRUE if every character of the cluster was matched; trieScan is
//          then updated and GetNode() is called. FALSE otherwise, including
//          for an empty cluster; trieScan is left unchanged.
//
// Modifies: trieScan1 (scratch); trieScan on success.
//
// History: created 7/99 aarayas
//
//----------------------------------------------------------------------------
BOOL CThaiTrieIter::MoveCluster(const WCHAR* szCluster, unsigned int iNumCluster)
{
    // Declare and initialize local variables.
    unsigned int i = 0;
    // Assert(iNumCluster <= 6, "Invalid cluster");

    // Nothing to match. Without this guard the loop below could never
    // reach i == iNumCluster and would keep reading past the cluster.
    if (iNumCluster == 0)
        return FALSE;

    CopyScan();
    if (!TrieGetNextState(pTrieCtrl, &trieScan1))
        return FALSE;

    while (TRUE)
    {
        if (szCluster[i] == trieScan1.wch)
        {
            i++;
            if (i == iNumCluster)
            {
                // Whole cluster matched: commit the scratch scan.
                memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN));
                GetNode();
                return TRUE;
            }
            // Move down the Trie Branch.
            else if (!TrieGetNextState(pTrieCtrl, &trieScan1)) break;
        }
        // Move the Trie right one node.
        else if (!TrieGetNextNode(pTrieCtrl, &trieScan1)) break;
    }
    return FALSE;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: The function moves trieScan to the node matching the given
//           cluster of Thai characters.
//
// Arguments: szCluster     - contains the Thai character cluster.
//            iNumCluster   - number of characters in the cluster.
//            fBeginNewWord - true when the cluster starts a new word; the
//                            first character is then looked up through the
//                            pTrieScanArray cache (or the trie is Reset()).
//
// Returns: true if the move succeeded (trieScan updated, GetNode() called),
//          false otherwise. An empty cluster returns false.
//
// Modifies: trieScan, trieScan1, m_fThaiNumber, fWordEnd.
//
// History: created 7/99 aarayas
//
// Notes: - While m_fThaiNumber is set, the cluster is accepted when its
//          current character is a Thai numeral, L',' or L'.'.
//        - A match is accepted early when the next input character is
//          THAI_Vowel_MaiYaMok or THAI_Sign_PaiYanNoi.
//        - NOTE(review): the Nikhahit rule reads szCluster[i + 1] and then
//          advances by two, which can step past iNumCluster - confirm that
//          callers always pass a buffer that extends beyond the cluster.
//
//----------------------------------------------------------------------------
bool CThaiTrieIter::MoveCluster(WCHAR* szCluster, unsigned int iNumCluster, bool fBeginNewWord)
{
    // Index of the next cluster character still to be matched.
    unsigned int ich = 0;

    Assert(iNumCluster <= 6, "Invalid cluster");

    // No need to move.
    if (iNumCluster == 0)
        return false;

    // Use the lookup indexes for where the first character is at.
    if (fBeginNewWord)
    {
        const WCHAR wchFirst = szCluster[0];

        m_fThaiNumber = false;

        // Quick look up for proper characters.
        if (wchFirst >= THAI_Ko_Kai && wchFirst <= THAI_Ho_Nok_Huk)
        {
            memcpy(&trieScan, &pTrieScanArray[(wchFirst - THAI_Ko_Kai)], sizeof(TRIESCAN));
        }
        else if (wchFirst >= THAI_Vowel_Sara_E && wchFirst <= THAI_Vowel_Sara_AI_MaiMaLai)
        {
            memcpy(&trieScan, &pTrieScanArray[(wchFirst - THAI_Ko_Kai - 17)], sizeof(TRIESCAN));
        }
        else
        {
            Reset();
            m_fThaiNumber = IsThaiNumeric(wchFirst);
        }

        // The cached scan already consumed the first character.
        if (trieScan.wch == wchFirst)
            ich = 1;

        if (ich == iNumCluster)
        {
            GetNode();
            return true;
        }
    }

    CopyScan();
    if (!TrieGetNextState(pTrieCtrl, &trieScan1))
        return false;

    if (m_fThaiNumber)
    {
        const WCHAR wchCur = szCluster[ich];

        fWordEnd = true;
        return (IsThaiNumeric(wchCur) || wchCur == L',' || wchCur == L'.');
    }

    for (;;)
    {
        bool fConsumed = false;

        if (szCluster[ich] == trieScan1.wch)
        {
            ich++;
            fConsumed = true;
        }
        // Let Nikhahit equal Sara Am.
        // TODO: case Nikhahit Mai To and Sara AA should equal to Mai To Sara Am. TO risk for this version.
        // This bug was found because Thairath newspaper doesn't write this properly on their web page.
        else if (szCluster[ich] == THAI_Nikhahit &&
                 szCluster[ich + 1] == THAI_Vowel_Sara_AA &&
                 trieScan1.wch == THAI_Vowel_Sign_Sara_Am)
        {
            // Nikhahit + Sara AA in the input stand for one Sara Am node.
            ich += 2;
            fConsumed = true;
        }

        if (fConsumed)
        {
            if ((ich == iNumCluster) ||
                (szCluster[ich] == THAI_Vowel_MaiYaMok || szCluster[ich] == THAI_Sign_PaiYanNoi))
            {
                memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN));
                GetNode();
                return true;
            }
            // Move down the Trie Branch.
            if (!TrieGetNextState(pTrieCtrl, &trieScan1))
                break;
        }
        // Move the Trie right one node.
        else if (!TrieGetNextNode(pTrieCtrl, &trieScan1))
        {
            break;
        }
    }

    if (fBeginNewWord)
        Reset();
    return false;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: Moves trieScan by one cluster, first trying an exact match and,
//           if that fails, accepting a tone-mark node of the trie in place
//           of the last character of the cluster.
//
// Arguments: szCluster       - the cluster to match; the next cluster is
//                              expected to start at szCluster + iNumCluster.
//            iNumCluster     - number of characters in this cluster.
//            iNumNextCluster - number of characters in the next cluster,
//                              checked with CheckNextCluster before a
//                              substitution is accepted.
//
// Returns: STOP_MOVE            - the cluster starts with an ending sign.
//          NOSUBSTITUTE         - exact match; trieScan updated.
//          SUBSTITUTE_DIACRITIC - matched through a tone-mark node;
//                                 trieScan updated.
//          UNABLE_TO_MOVE       - no move was possible.
//
// Modifies: trieScan1 (scratch); trieScan on success.
//
// History: created 7/99 aarayas
//
//----------------------------------------------------------------------------
SOUNDEXSTATE CThaiTrieIter::MoveSoundexByCluster(WCHAR* szCluster, unsigned int iNumCluster, unsigned int iNumNextCluster)
{
    unsigned int ich = 0;            // next cluster character to match
    bool fHaveSavedScan = false;     // true once trieScanSaved is valid
    TRIESCAN trieScanSaved;          // scan position below the last matched node

    Assert(iNumCluster <= 6, "Invalid cluster");
    Assert(iNumNextCluster <= 6, "Invalid cluster");

    CopyScan();
    if (!TrieGetNextState(pTrieCtrl, &trieScan1))
        return UNABLE_TO_MOVE;
    if (IsThaiEndingSign(*szCluster))
        return STOP_MOVE;

    // Match as much as possible.
    for (;;)
    {
        if (szCluster[ich] != trieScan1.wch)
        {
            // Move the Trie right one node.
            if (!TrieGetNextNode(pTrieCtrl, &trieScan1))
                break;
            continue;
        }

        ich++;
        if (ich == iNumCluster)
        {
            memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN));
            GetNode();
            return NOSUBSTITUTE;
        }

        // Move down the Trie Branch.
        if (!TrieGetNextState(pTrieCtrl, &trieScan1))
            break;

        // Save our current scan position.
        memcpy(&trieScanSaved, &trieScan1, sizeof(TRIESCAN));
        fHaveSavedScan = true;
    }

    // Without a saved position there is nothing to retry from.
    if (!fHaveSavedScan)
        return UNABLE_TO_MOVE;

    // Tone-mark fallback. The same sibling walk serves both situations:
    //  - the unmatched input character is a tone mark (try substituting a
    //    different tone mark from the dictionary), and
    //  - the unmatched input character is not a tone mark (the case best
    //    seen in how "Click" is spelt in Thai by the different groups at
    //    Microsoft).
    // A tone-mark node is only accepted when the unmatched character is the
    // last one of the cluster and the next cluster can follow that node.
    const bool fOnLastChar = ((ich + 1) == iNumCluster);

    // Restore trieScan1 to last matched.
    memcpy(&trieScan1, &trieScanSaved, sizeof(TRIESCAN));
    do
    {
        if (IsThaiToneMark(trieScan1.wch) &&
            fOnLastChar &&
            CheckNextCluster(szCluster + iNumCluster, iNumNextCluster))
        {
            memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN));
            GetNode();
            return SUBSTITUTE_DIACRITIC;
        }
        // Move the Trie right one node.
    } while (TrieGetNextNode(pTrieCtrl, &trieScan1));

    return UNABLE_TO_MOVE;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: set trieScan1 = trieScan, so that lookups can be tried on the
//           scratch copy without disturbing the committed position.
//
// Arguments: none.
//
// Modifies: trieScan1.
//
// History: created 7/99 aarayas
//
//----------------------------------------------------------------------------
inline void CThaiTrieIter::CopyScan()
{
    // Byte-wise copy of the committed scan into the scratch scan.
    memcpy(&trieScan1, &trieScan, sizeof(trieScan1));
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: The function traverses the whole dictionary to find the best
//           possible matching word for the input. The best word found is
//           left in resultWord.
//
// Arguments: word - null-terminated word to find a match for; the pointer
//                   is stored in soundexWord for the duration of the search.
//
// Returns: the score of the best match, or the initial value 2000 when no
//          dictionary word scored lower than that.
//
// Modifies: trieScan (via Reset/Down/Traverse), resultWord, tempWord,
//           soundexWord, iResultScore.
//
// History: created 8/99 aarayas
//
//----------------------------------------------------------------------------
int CThaiTrieIter::Soundex(WCHAR* word)
{
    // Reset Trie.
    Reset();
    // Move Down.
    Down();

    // Clean the result and working buffers.
    memset(resultWord, 0, sizeof(WCHAR) * WORDSIZE);
    memset(tempWord, 0, sizeof(WCHAR) * WORDSIZE);
    soundexWord = word;

    // Starting "best" score; Traverse only records words scoring lower.
    iResultScore = 2000;
#if defined (_DEBUG)
    iStackSize = 0;
#endif
    // Traverse recomputes its score argument before using it, so the
    // 1000 passed here is only a placeholder.
    Traverse(0,1000);
    return iResultScore;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: Computes a distance-like score between a candidate word and
//           the input word. The score starts at 1000 and penalties are
//           added; a lower score means a closer match.
//
// Arguments: idealWord - null-terminated candidate (dictionary) word.
//            inputWord - null-terminated word being matched.
//
// Returns: 1000 plus the accumulated penalties.
//
// History: created 8/99 aarayas
//
// Notes: Penalties are smaller for characters for which
//        IsThaiUpperAndLowerClusterCharacter is non-zero:
//          - input character not equal to the current candidate character:
//            5 / 10;
//          - input character whose index is greater than the candidate
//            length: 20 / 30;
//          - candidate positions from the last compared index through the
//            candidate length: 5 / 10 each.
//        NOTE(review): the final loop runs through index == length, so the
//        terminator position is always charged as well - confirm whether
//        that is intended before changing it, since it shifts every score.
//
//----------------------------------------------------------------------------
unsigned int CThaiTrieIter::GetScore(WCHAR* idealWord, WCHAR* inputWord)
{
    const unsigned int cchIdeal = wcslen(idealWord);
    const unsigned int cchInput = wcslen(inputWord);
    unsigned int uScore = 1000;
    unsigned int iMatchFrom = 0;    // candidate index the next input char is compared at
    unsigned int x = 0;             // last candidate index examined (+1)

    for (unsigned int i = 0; i < cchInput; i++)
    {
        if (i > cchIdeal)
        {
            // Input runs beyond the candidate.
            uScore += (IsThaiUpperAndLowerClusterCharacter(inputWord[i]) ? 20 : 30);
            continue;
        }

        // Comparison window is capped at the candidate length.
        const unsigned int iWindowEnd =
            ((iMatchFrom + 2) < cchIdeal) ? (iMatchFrom + 2) : cchIdeal;

        // At most one candidate character is compared per input character.
        x = iMatchFrom;
        if (x < iWindowEnd)
        {
            if (idealWord[x] == inputWord[i])
                iMatchFrom = x + 1;
            else
                uScore += (IsThaiUpperAndLowerClusterCharacter(inputWord[i]) ? 5 : 10);
            x++;
        }
    }

    // Charge the remaining candidate positions.
    for (; x <= cchIdeal; x++)
        uScore += (IsThaiUpperAndLowerClusterCharacter(idealWord[x]) ? 5 : 10);

    return uScore;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: Recursive step of the Soundex() search. Appends the current
//           node's character to tempWord, scores tempWord against
//           soundexWord, records it in resultWord when it ends a word and
//           beats iResultScore, then recurses into child and sibling nodes
//           unless the branch is pruned.
//
// Arguments: iCharPos - index in tempWord where this node's character goes.
//            score    - overwritten with the freshly computed score before
//                       it is read; the incoming value is not used.
//
// Returns: always true.
//
// Modifies: trieScan, tempWord, resultWord, iResultScore
//           (and iStackSize in _DEBUG builds).
//
// History: created 8/99 aarayas
//
// Notes: GetNode() presumably refreshes the wc and fWordEnd members for the
//        current trieScan position - confirm against CTrieIter.
//
//----------------------------------------------------------------------------
bool CThaiTrieIter::Traverse(unsigned int iCharPos, unsigned int score)
{
    TRIESCAN trieScanLevel;

    // tempWord holds WORDSIZE characters and this level writes both
    // [iCharPos] and the terminator at [iCharPos + 1]; do not descend
    // deeper than the buffer allows.
    if (iCharPos + 1 >= WORDSIZE)
        return true;

#if defined(_DEBUG)
    iStackSize++;
#endif
    // push current trieScan into local stack trieScanLevel.
    memcpy(&trieScanLevel,&trieScan, sizeof(TRIESCAN));

    // Get Node information
    GetNode();

    // Store the current character to result word.
    tempWord[iCharPos] = wc;
    tempWord[iCharPos + 1] = 0;

    // Determine the distance between two string.
    score = GetScore(tempWord, soundexWord);

    // At the end of a word, keep it if it scores lower than the best so far.
    if (fWordEnd && score < iResultScore)
    {
        Wzncpy(resultWord,tempWord,WORDSIZE);
        iResultScore = score;
    }

    // See if we can prune the result of the words.
    if (score > (iResultScore + APPROXIMATEWEIGHT))
    {
#if defined(_DEBUG)
        iStackSize--;
#endif
        return true;
    }

    // Move down Trie branch.
    if (Down())
    {
        Traverse(iCharPos + 1, score);
        if (Right())
            Traverse(iCharPos + 1, score);
        // restore trieScan
        memcpy(&trieScan,&trieScanLevel, sizeof(TRIESCAN));
        if (Right())
            Traverse(iCharPos, score);
    }
#if defined(_DEBUG)
    iStackSize--;
#endif
    return true;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synoposis: This function will trieScan1 to the next cluster if
// the move is possible.
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
bool CThaiTrieIter::CheckNextCluster(const WCHAR* szCluster, unsigned int iNumCluster)
{
// Declare and initailze local variables.
unsigned int i = 0;
TRIESCAN trieScan2;
Assert(iNumCluster <= 6, "Invalid cluster");
// If there are no cluster to check consider cluster found.
if (0 == iNumCluster)
return true;
memcpy(&trieScan2, &trieScan1, sizeof(TRIESCAN));
// Move down the Trie Branch.
if (!TrieGetNextState(pTrieCtrl, &trieScan2))
return false;
while (true)
{
if (szCluster[i] == trieScan2.wch)
{
i++;
if (i == iNumCluster)
{
return true;
}
// Move down the Trie Branch.
else if (!TrieGetNextState(pTrieCtrl, &trieScan2)) break;
}
// Move the Trie right one node.
else if (!TrieGetNextNode(pTrieCtrl, &trieScan2)) break;
}
return false;
}