|
|
//+---------------------------------------------------------------------------
//
//
// CThaiTrieIter - class CThaiTrieIter is used for traversing the trie.
//
// History:
// created 7/99 aarayas
//
// (c) 1999 Microsoft Corporation
//----------------------------------------------------------------------------
#include "CThaiTrieIter.hpp"
// Capacity, in WCHARs, of the resultWord and tempWord buffers that the
// CThaiTrieIter constructor allocates.
#define WORDSIZE 64
// Recursion-depth counter for CThaiTrieIter::Traverse(); it is reset by
// Soundex() and only touched when _DEBUG is defined.
static unsigned int iStackSize = 0;
//+---------------------------------------------------------------------------
//
// Function: IsThaiBeginClusterCharacter
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns nonzero when wc lies in the inclusive range
// THAI_Vowel_Sara_E .. THAI_Vowel_Sara_AI_MaiMaLai, i.e. the characters this
// file treats as able to begin a cluster.
BOOL IsThaiBeginClusterCharacter(WCHAR wc)
{
    if (wc < THAI_Vowel_Sara_E)
        return FALSE;

    return (wc <= THAI_Vowel_Sara_AI_MaiMaLai);
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiUpperAndLowerClusterCharacter
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns nonzero when wc is one of the characters this file classifies as an
// "upper and lower" cluster character. Three groups qualify:
//   - THAI_Vowel_Sign_Mai_HanAkat
//   - the range THAI_Vowel_Sign_Sara_Am .. THAI_Vowel_Sign_Phinthu
//   - the range THAI_Tone_MaiTaiKhu .. THAI_Nikhahit
BOOL IsThaiUpperAndLowerClusterCharacter(WCHAR wc)
{
    if (wc == THAI_Vowel_Sign_Mai_HanAkat)
        return TRUE;

    if (wc >= THAI_Vowel_Sign_Sara_Am && wc <= THAI_Vowel_Sign_Phinthu)
        return TRUE;

    if (wc >= THAI_Tone_MaiTaiKhu && wc <= THAI_Nikhahit)
        return TRUE;

    return FALSE;
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiEndingClusterCharacter
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns nonzero when wc is one of the characters that may close a cluster:
// Sara A, Sara AA, Lak Khang Yao or Mai Ya Mok.
//
// THAI_Sign_PaiYanNoi is deliberately NOT tested here; that check was taken
// out to fix the O11.PaiYanNoi issue.
BOOL IsThaiEndingClusterCharacter(WCHAR wc)
{
    if (wc == THAI_Vowel_Sara_A || wc == THAI_Vowel_Sara_AA)
        return TRUE;

    if (wc == THAI_Vowel_LakKhangYao || wc == THAI_Vowel_MaiYaMok)
        return TRUE;

    return FALSE;
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiMostlyBeginCharacter
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns true when wc is a character that is always, or most likely, found
// at the front of a word.
bool IsThaiMostlyBeginCharacter(WCHAR wc)
{
    // Characters that are always in front of a word.
    const bool fAlwaysInFront =
        (wc >= THAI_Vowel_Sara_E && wc <= THAI_Vowel_Sara_AI_MaiMaLai) ||
        (wc == THAI_Cho_Ching) ||
        (wc == THAI_Pho_Phung) ||
        (wc == THAI_Fo_Fa) ||
        (wc == THAI_Ho_Nok_Huk);

    if (fAlwaysInFront)
        return true;

    // Characters that are most likely in front of a word.
    return (wc == THAI_Ho_Hip) ||
           (wc == THAI_Pho_Samphao) ||
           (wc == THAI_Kho_Rakhang) ||
           (wc == THAI_Fo_Fan) ||
           (wc == THAI_So_So) ||
           (wc == THAI_Tho_NangmonTho);
}
//+---------------------------------------------------------------------------
//
// Function: IsContain
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/00 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns true when the character wc occurs among the first iWordLen
// characters of pwcWord. The scan is bounded by iWordLen only; it does not
// stop at a NUL character.
bool IsContain(const WCHAR* pwcWord, unsigned int iWordLen, WCHAR wc)
{
    for (unsigned int iPos = 0; iPos < iWordLen; iPos++)
    {
        if (pwcWord[iPos] == wc)
            return true;
    }

    return false;
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiMostlyLastCharacter
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns true when wc is a character that always, or most likely, ends a
// word.
bool IsThaiMostlyLastCharacter(WCHAR wc)
{
    // Always the end of a word.
    if (wc == THAI_Vowel_Sign_Sara_Am ||
        wc == THAI_Sign_PaiYanNoi ||
        wc == THAI_Vowel_MaiYaMok)
    {
        return true;
    }

    // Most likely the end of a word.
    return (wc == THAI_Vowel_LakKhangYao) || (wc == THAI_Thanthakhat);
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiToneMark
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns true when wc is treated as a tone mark by the soundex matching:
// any code point in U+0E48..U+0E4B, plus the single code point U+0E31.
// NOTE(review): U+0E31 is MAI HAN-AKAT in the Unicode Thai block, which is a
// vowel sign rather than a tone mark -- presumably included on purpose so it
// takes part in diacritic substitution; confirm.
bool IsThaiToneMark(WCHAR wc)
{
    if (wc == 0x0e31)
        return true;

    return (wc >= 0x0e48) && (wc <= 0x0e4b);
}
//+---------------------------------------------------------------------------
//
// Function: IsThaiEndingSign
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/02 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns true when wc is one of the two ending signs: Mai Ya Mok or
// Pai Yan Noi.
bool IsThaiEndingSign(WCHAR wc)
{
    if (wc == THAI_Vowel_MaiYaMok)
        return true;

    return (wc == THAI_Sign_PaiYanNoi);
}
//+---------------------------------------------------------------------------
//
// Function: GetCluster
//
// Synopsis: The function returns the number of characters that make up
// the next cluster of Thai text.
//
// ie. Kor Kai, Kor Kai -> 1
// Kor Kai, Sara Um -> 2
//
// * Note: this function will not return more than 3 characters
// for a cluster, as that would represent an invalid sequence of characters.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
/*
unsigned int GetCluster(WCHAR* pszIndex) { int iRetValue = 0;
// Take all begin cluster character.
while (IsThaiBeginClusterCharacter(*pszIndex)) { pszIndex++; iRetValue++; }
if (IsThaiConsonant(*pszIndex)) { pszIndex++; iRetValue++;
while (IsThaiUpperAndLowerClusterCharacter(*pszIndex)) { pszIndex++; iRetValue++; }
while (IsThaiEndingClusterCharacter(*pszIndex)) { pszIndex++; iRetValue++; } }
if (iRetValue == 0) // The character is probably a punctuation.
iRetValue++;
return iRetValue; }
*/ //+---------------------------------------------------------------------------
//
// Function: IsThaiConsonant
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns nonzero when wc lies in the inclusive consonant range
// THAI_Ko_Kai .. THAI_Ho_Nok_Huk.
BOOL IsThaiConsonant(WCHAR wc)
{
    if (wc < THAI_Ko_Kai)
        return FALSE;

    return (wc <= THAI_Ho_Nok_Huk);
}
//+---------------------------------------------------------------------------
//
// Define the different part of speech for Thai.
//
//----------------------------------------------------------------------------
// Part-of-speech tag table. The row index is the id that POSCompress()
// returns for a tag string and that POSDecompress() maps back to the string.
// Rows 0-47 hold single tags, rows 48-302 hold space-separated combinations
// of tags, and rows 303-304 hold UNKN and ABBR. Each row can store at most
// 45 characters plus the terminating NUL.
WCHAR wzPOSLookup[POSTYPE][46] = { L"NONE", // 0 . No tags.
L"NPRP", // 1 . Proper noun
L"NCNM", // 2 . Cardinal number
L"NONM", // 3 . Ordinal number
L"NLBL", // 4 . Label noun
L"NCMN", // 5 . Common noun
L"NTTL", // 6 . Title noun
L"PPRS", // 7 . Personal pronoun
L"PDMN", // 8 . Demonstrative pronoun
L"PNTR", // 9 . Interrogative pronoun
L"PREL", // 10. Relative pronoun
L"VACT", // 11. Active verb
L"VSTA", // 12. Stative verb
L"VATT", // 13. Attributive verb
L"XVBM", // 14. Pre-verb auxiliary, before negator
L"XVAM", // 15. Pre-verb auxiliary, after negator
L"XVMM", // 16. Pre-verb, before or after negator
L"XVBB", // 17. Pre-verb auxiliary, in imperative mood
L"XVAE", // 18. Post-verb auxiliary
L"DDAN", // 19. Definite determiner, after noun without classifier in between
L"DDAC", // 20. Definite determiner, allowing classifier in between
L"DDBQ", // 21. Definite determiner, between noun and classifier or preceding quantitative expression
L"DDAQ", // 22. Definite determiner, following quantitative expression
L"DIAC", // 23. Indefinite determiner, following noun; allowing classifier in between
L"DIBQ", // 24. Indefinite determiner, between noun and classifier or preceding quantitative expression
L"DIAQ", // 25. Indefinite determiner, following quantitative expression
L"DCNM", // 26. Determiner, cardinal number expression
L"DONM", // 27. Determiner, ordinal number expression
L"ADVN", // 28. Adverb with normal form
L"ADVI", // 29. Adverb with iterative form
L"ADVP", // 30. Adverb with prefixed form
L"ADVS", // 31. Sentential adverb
L"CNIT", // 32. Unit classifier
L"CLTV", // 33. Collective classifier
L"CMTR", // 34. Measurement classifier
L"CFQC", // 35. Frequency classifier
L"CVBL", // 36. Verbal classifier
L"JCRG", // 37. Coordinating conjunction
L"JCMP", // 38. Comparative conjunction
L"JSBR", // 39. Subordinating conjunction
L"RPRE", // 40. Preposition
L"INT", // 41. Interjection
L"FIXN", // 42. Nominal prefix
L"FIXV", // 43. Adverbial prefix
L"EAFF", // 44. Ending for affirmative sentence
L"EITT", // 45. Ending for interrogative sentence
L"NEG", // 46. Negator
L"PUNC", // 47. Punctuation
L"ADVI ADVN", // 48.
L"ADVI ADVN NCMN", // 49.
L"ADVI ADVN VSTA", // 50.
L"ADVI VATT", // 51.
L"ADVN ADVP", // 52.
L"ADVN ADVP ADVS", // 53.
L"ADVN ADVP DIAQ DIBQ JCMP JSBR RPRE", // 54.
L"ADVN ADVP NCMN VATT", // 55.
L"ADVN ADVP VSTA", // 56.
L"ADVN ADVS DDAC DDAN DIAC VATT XVAE", // 57.
L"ADVN ADVS DDAN NCMN VATT VSTA", // 58.
L"ADVN ADVS NCMN", // 59.
L"ADVN ADVS NCMN VATT", // 60.
L"ADVN ADVS VACT", // 61.
L"ADVN ADVS VATT", // 62.
L"ADVN CFQC NCMN RPRE VSTA", // 63.
L"ADVN CLTV CNIT NCMN RPRE", // 64.
L"ADVN DCNM", // 65.
L"ADVN DDAC DDAN", // 66.
L"ADVN DDAC DDAN NCMN PDMN", // 67.
L"ADVN DDAC DDAN PDMN", // 68.
L"ADVN DDAN DDBQ", // 69.
L"ADVN DDAN DIAC PDMN VSTA", // 70.
L"ADVN DDAN FIXN PDMN", // 71.
L"ADVN DDAN NCMN", // 72.
L"ADVN DDAQ", // 73.
L"ADVN DDBQ", // 74.
L"ADVN DDBQ RPRE VATT", // 75.
L"ADVN DDBQ VATT VSTA XVAE", // 76.
L"ADVN DIAC", // 77.
L"ADVN DIAC PDMN", // 78.
L"ADVN DIBQ", // 79.
L"ADVN DIBQ NCMN", // 80.
L"ADVN DIBQ VACT VSTA", // 81.
L"ADVN DIBQ VATT", // 82.
L"ADVN DONM JCMP", // 83.
L"ADVN DONM JSBR NCMN RPRE VATT XVAE", // 84.
L"ADVN EITT PNTR", // 85.
L"ADVN FIXN", // 86.
L"ADVN JCMP", // 87.
L"ADVN JCRG", // 88.
L"ADVN JCRG JSBR", // 89.
L"ADVN JCRG JSBR XVBM XVMM", // 90.
L"ADVN JCRG RPRE VACT VSTA XVAE", // 91.
L"ADVN JSBR", // 92.
L"ADVN JSBR NCMN", // 93.
L"ADVN JSBR RPRE VATT", // 94.
L"ADVN JSBR RPRE XVAE", // 95.
L"ADVN JSBR VSTA", // 96.
L"ADVN JSBR XVAE XVBM", // 97.
L"ADVN NCMN", // 98.
L"ADVN NCMN RPRE VACT VATT VSTA", // 99.
L"ADVN NCMN RPRE VACT XVAE", // 100.
L"ADVN NCMN RPRE VATT", // 101.
L"ADVN NCMN VACT VATT VSTA", // 102.
L"ADVN NCMN VACT VSTA", // 103.
L"ADVN NCMN VATT", // 104.
L"ADVN NCMN VATT VSTA", // 105.
L"ADVN NEG", // 106.
L"ADVN NPRP VATT", // 107.
L"ADVN PDMN VACT", // 108.
L"ADVN PNTR", // 109.
L"ADVN RPRE", // 110.
L"ADVN RPRE VACT VATT XVAE", // 111.
L"ADVN RPRE VACT XVAM XVBM", // 112.
L"ADVN RPRE VATT VSTA", // 113.
L"ADVN RPRE VSTA", // 114.
L"ADVN VACT", // 115.
L"ADVN VACT VATT", // 116.
L"ADVN VACT VATT VSTA", // 117.
L"ADVN VACT VATT VSTA XVAM XVBM", // 118.
L"ADVN VACT VSTA", // 119.
L"ADVN VACT VSTA XVAE", // 120.
L"ADVN VACT XVAE", // 121.
L"ADVN VATT", // 122.
L"ADVN VATT VSTA", // 123.
L"ADVN VATT VSTA XVAM XVBM XVMM", // 124.
L"ADVN VATT XVBM", // 125.
L"ADVN VSTA", // 126.
L"ADVN VSTA XVAE", // 127.
L"ADVN VSTA XVBM", // 128.
L"ADVN XVAE", // 129.
L"ADVN XVAM", // 130.
L"ADVN XVBM XVMM", // 131.
L"ADVP JSBR RPRE VATT", // 132.
L"ADVP VATT", // 133.
L"ADVS DDAC JCRG", // 134.
L"ADVS DDAC JSBR", // 135.
L"ADVS DDAN VSTA", // 136.
L"ADVS DIAC", // 137.
L"ADVS DONM", // 138.
L"ADVS JCRG JSBR", // 139.
L"ADVS JCRG JSBR RPRE", // 140.
L"ADVS JSBR", // 141.
L"ADVS JSBR RPRE", // 142.
L"ADVS NCMN", // 143.
L"ADVS VATT", // 144.
L"CFQC CLTV CNIT DCNM JCRG JSBR NCMN RPRE XVBM", // 145.
L"CFQC CNIT PREL", // 146.
L"CFQC NCMN", // 147.
L"CLTV CNIT NCMN", // 148.
L"CLTV CNIT NCMN RPRE", // 149.
L"CLTV CNIT NCMN VSTA", // 150.
L"CLTV NCMN", // 151.
L"CLTV NCMN VACT VATT", // 152.
L"CLTV NCMN VATT", // 153.
L"CMTR CNIT NCMN", // 154.
L"CMTR NCMN", // 155.
L"CMTR NCMN VATT VSTA", // 156.
L"CNIT DDAC NCMN VATT", // 157.
L"CNIT DONM NCMN RPRE VATT", // 158.
L"CNIT FIXN FIXV JSBR NCMN", // 159.
L"CNIT JCRG JSBR NCMN PREL RPRE VATT", // 160.
L"CNIT JSBR RPRE", // 161.
L"CNIT NCMN", // 162.
L"CNIT NCMN RPRE", // 163.
L"CNIT NCMN RPRE VATT", // 164.
L"CNIT NCMN VACT", // 165.
L"CNIT NCMN VSTA", // 166.
L"CNIT NCNM", // 167.
L"CNIT PPRS", // 168.
L"DCNM DDAC DIAC DONM VATT VSTA", // 169.
L"DCNM DDAN DIAC", // 170.
L"DCNM DIAC NCMN NCNM", // 171.
L"DCNM DIBQ NCMN", // 172.
L"DCNM DONM", // 173.
L"DCNM NCMN", // 174.
L"DCNM NCNM", // 175.
L"DCNM NCNM VACT", // 176.
L"DCNM VATT", // 177.
L"DDAC DDAN", // 178.
L"DDAC DDAN DIAC NCMN", // 179.
L"DDAC DDAN DIAC VATT", // 180.
L"DDAC DDAN EAFF PDMN", // 181.
L"DDAC DDAN PDMN", // 182.
L"DDAC DIAC VSTA", // 183.
L"DDAC NCMN", // 184.
L"DDAN DDBQ", // 185.
L"DDAN DIAC PNTR", // 186.
L"DDAN NCMN", // 187.
L"DDAN NCMN RPRE VATT", // 188.
L"DDAN PDMN", // 189.
L"DDAN RPRE", // 190.
L"DDAN VATT", // 191.
L"DDAQ VATT", // 192.
L"DDBQ DIBQ", // 193.
L"DDBQ JCRG JSBR", // 194.
L"DDBQ JCRG NCMN", // 195.
L"DIAC PDMN", // 196.
L"DIBQ JSBR RPRE VSTA", // 197.
L"DIBQ NCMN", // 198.
L"DIBQ VATT", // 199.
L"DIBQ VATT VSTA", // 200.
L"DIBQ XVBM", // 201.
L"DONM NCMN RPRE", // 202.
L"DONM VACT VATT VSTA", // 203.
L"DONM VATT", // 204.
L"EAFF XVAE XVAM XVBM", // 205.
L"EITT JCRG", // 206.
L"FIXN FIXV NCMN", // 207.
L"FIXN FIXV RPRE VSTA", // 208.
L"FIXN JSBR NCMN PREL RPRE VSTA XVBM", // 209.
L"FIXN NCMN", // 210.
L"FIXN VACT", // 211.
L"FIXN VACT VSTA", // 212.
L"FIXV JSBR RPRE", // 213.
L"JCMP JSBR", // 214.
L"JCMP RPRE VSTA", // 215.
L"JCMP VATT VSTA", // 216.
L"JCMP VSTA", // 217.
L"JCRG JSBR", // 218.
L"JCRG JSBR NCMN RPRE", // 219.
L"JCRG JSBR RPRE", // 220.
L"JCRG RPRE", // 221.
L"JCRG RPRE VATT VSTA", // 222.
L"JCRG VSTA", // 223.
L"JSBR NCMN", // 224.
L"JSBR NCMN XVAE", // 225.
L"JSBR NCMN XVAM XVBM XVMM", // 226.
L"JSBR PREL", // 227.
L"JSBR PREL RPRE", // 228.
L"JSBR PREL XVBM", // 229.
L"JSBR RPRE", // 230.
L"JSBR RPRE VACT", // 231.
L"JSBR RPRE VACT VSTA", // 232.
L"JSBR RPRE VACT XVAE XVAM", // 233.
L"JSBR RPRE VATT", // 234.
L"JSBR RPRE VSTA", // 235.
L"JSBR RPRE XVAM", // 236.
L"JSBR VACT", // 237.
L"JSBR VACT VSTA", // 238.
L"JSBR VATT XVBM XVMM", // 239.
L"JSBR VSTA", // 240.
L"JSBR XVBM", // 241.
L"NCMN NCNM", // 242.
L"NCMN NCNM NPRP", // 243.
L"NCMN NLBL NPRP", // 244.
L"NCMN NPRP", // 245.
L"NCMN NPRP RPRE", // 246.
L"NCMN NTTL", // 247.
L"NCMN PDMN PPRS", // 248.
L"NCMN PDMN VATT", // 249.
L"NCMN PNTR", // 250.
L"NCMN PPRS PREL VACT", // 251.
L"NCMN RPRE", // 252.
L"NCMN RPRE VACT VATT", // 253.
L"NCMN RPRE VATT", // 254.
L"NCMN VACT", // 255.
L"NCMN VACT VATT", // 256.
L"NCMN VACT VATT VSTA XVAE", // 257.
L"NCMN VACT VSTA", // 258.
L"NCMN VACT VSTA XVAM", // 259.
L"NCMN VACT VSTA XVBB", // 260.
L"NCMN VATT", // 261.
L"NCMN VATT VSTA", // 262.
L"NCMN VATT XVAM", // 263.
L"NCMN VSTA", // 264.
L"NCMN XVBM", // 265.
L"NPRP RPRE", // 266.
L"NPRP VATT", // 267.
L"NTTL PPRS", // 268.
L"PDMN PPRS", // 269.
L"PDMN VATT", // 270.
L"PDMN VATT VSTA", // 271.
L"PPRS PREL", // 272.
L"PPRS VATT", // 273.
L"RPRE VACT", // 274.
L"RPRE VACT VATT", // 275.
L"RPRE VACT VSTA", // 276.
L"RPRE VACT VSTA XVAE", // 277.
L"RPRE VACT XVAE", // 278.
L"RPRE VATT", // 279.
L"RPRE VATT VSTA", // 280.
L"RPRE VSTA", // 281.
L"VACT VATT", // 282.
L"VACT VATT VSTA", // 283.
L"VACT VATT XVAE XVAM XVBM", // 284.
L"VACT VSTA", // 285.
L"VACT VSTA XVAE", // 286.
L"VACT VSTA XVAE XVAM", // 287.
L"VACT VSTA XVAE XVAM XVMM", // 288.
L"VACT VSTA XVAM", // 289.
L"VACT VSTA XVAM XVMM", // 290.
L"VACT XVAE", // 291.
L"VACT XVAM", // 292.
L"VACT XVAM XVMM", // 293.
L"VACT XVMM", // 294.
L"VATT VSTA", // 295.
L"VSTA XVAE", // 296.
L"VSTA XVAM", // 297.
L"VSTA XVAM XVMM", // 298.
L"VSTA XVBM", // 299.
L"XVAM XVBM", // 300.
L"XVAM XVBM XVMM", // 301.
L"XVAM XVMM", // 302.
L"UNKN", // 303. Unknown
L"ABBR" // 304. Abbreviation
};
//+---------------------------------------------------------------------------
//
// Function: POSCompress
//
// Synopsis: Part Of Speech Compress - translating string to unique id.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Translates a part-of-speech tag string into its unique id, which is the
// index of the exactly matching row (wcscmp) in wzPOSLookup.
//
// Arguments: szTag - NUL-terminated tag string to look up.
//
// Returns:   the row index on a match, or POSTYPE when no row matches.
DWORD POSCompress(const WCHAR* szTag)
{
    int iTag = 0;

    while (iTag < POSTYPE)
    {
        if (wcscmp(szTag, wzPOSLookup[iTag]) == 0)
            break;
        iTag++;
    }

    // iTag equals POSTYPE here when the whole table was scanned without a hit.
    return static_cast<DWORD>(iTag);
}
//+---------------------------------------------------------------------------
//
// Function: POSDecompress
//
// Synopsis: Part Of Speech Decompress - translating a unique id back to its tag string.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Translates a part-of-speech id back into its tag string.
//
// Arguments: dwTag - row index into wzPOSLookup, as returned by POSCompress().
//
// Returns:   pointer to the NUL-terminated tag string stored in the table.
//            An id outside the table (including POSTYPE, which POSCompress()
//            returns for an unknown tag) yields row 0, the "NONE" tag,
//            instead of reading past the end of the table.
inline WCHAR* POSDecompress(DWORD dwTag)
{
    if (dwTag >= static_cast<DWORD>(POSTYPE))
        return &wzPOSLookup[0][0];

    return &wzPOSLookup[dwTag][0];
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synoposis: Constructor:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Constructor. Allocates the two WORDSIZE-character work buffers
// (resultWord, tempWord) and the 53-entry array of cached first-character
// scans that Init() fills in. soundexWord starts out NULL; it is only
// pointed at the caller's word by Soundex().
CThaiTrieIter::CThaiTrieIter()
    : resultWord(new WCHAR[WORDSIZE]),
      soundexWord(NULL),
      tempWord(new WCHAR[WORDSIZE]),
      pTrieScanArray(new TRIESCAN[53]),
      m_fThaiNumber(false)
{
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synoposis: Destructor
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Destructor. Releases the buffers allocated by the constructor.
//
// All three buffers are allocated with new[], so they must be released with
// delete[]; a scalar delete on them is undefined behaviour. delete[] on a
// null pointer is a no-op, so no null checks are needed. soundexWord is not
// freed here: the visible code only ever points it at the caller's buffer.
CThaiTrieIter::~CThaiTrieIter()
{
    delete[] resultWord;
    delete[] tempWord;
    delete[] pTrieScanArray;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: Initialize variables.
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Initializes the iterator on the given trie and pre-computes the scan
// position of every possible first character, so MoveCluster() can jump to
// the first character of a new word without walking the top level.
//
// Arguments: ctrie - trie to iterate over; passed on to CTrieIter::Init().
void CThaiTrieIter::Init(CTrie* ctrie)
{
    WCHAR wcFirst;

    // Initialize parent.
    CTrieIter::Init(ctrie);

    // Slots for the range THAI_Ko_Kai .. THAI_Ho_Nok_Huk, indexed from 0.
    for (wcFirst = THAI_Ko_Kai; wcFirst <= THAI_Ho_Nok_Huk; wcFirst++)
    {
        GetScanFirstChar(wcFirst, &pTrieScanArray[wcFirst - THAI_Ko_Kai]);
    }

    // Slots for the range THAI_Vowel_Sara_E .. THAI_Vowel_Sara_AI_MaiMaLai.
    // NOTE(review): the extra -17 presumably skips the code points between
    // the two ranges so these slots directly follow the first group --
    // verify against the THAI_* values and the array size of 53.
    for (wcFirst = THAI_Vowel_Sara_E; wcFirst <= THAI_Vowel_Sara_AI_MaiMaLai; wcFirst++)
    {
        GetScanFirstChar(wcFirst, &pTrieScanArray[wcFirst - THAI_Ko_Kai - 17]);
    }
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: Find the first-level trie node whose character is wc and copy its scan state to pTrieScan.
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Searches the first level of the trie for the node whose character is wc
// and stores that node's scan state in *pTrieScan.
//
// Arguments: wc        - character to look for.
//            pTrieScan - receives a copy of the matching scan state.
//
// Returns:   true when a matching node was found. On every failure path
//            *pTrieScan is zero-filled, so the caller never ends up with an
//            uninitialized slot (previously the "trie has no first state"
//            path returned without writing to it).
//
// Modifies:  trieScan1 (used as the working scan).
bool CThaiTrieIter::GetScanFirstChar(WCHAR wc, TRIESCAN* pTrieScan)
{
    // Reset the trie scan.
    memset(&trieScan1, 0, sizeof(TRIESCAN));

    if (TrieGetNextState(pTrieCtrl, &trieScan1))
    {
        do
        {
            if (wc == trieScan1.wch)
            {
                memcpy(pTrieScan, &trieScan1, sizeof(TRIESCAN));
                return true;
            }
            // Keep moving to the right of the trie.
        } while (TrieGetNextNode(pTrieCtrl, &trieScan1));
    }

    memset(pTrieScan, 0, sizeof(TRIESCAN));
    return false;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: The function moves trieScan to the node that matches
// the given cluster of Thai characters.
//
// Arguments: szCluster - contain the thai character cluster.
// iNumCluster - contain the size of character.
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Moves trieScan down the trie along the characters of one cluster.
//
// Arguments: szCluster   - the characters of the cluster.
//            iNumCluster - number of characters in szCluster.
//
// Returns:   TRUE when every character of the cluster was matched; trieScan
//            is then updated and GetNode() has been called. FALSE otherwise,
//            in which case trieScan is left unchanged.
//
// Modifies:  trieScan1 (working copy); trieScan on success.
BOOL CThaiTrieIter::MoveCluster(const WCHAR* szCluster, unsigned int iNumCluster)
{
    unsigned int i = 0;

    // Assert(iNumCluster <= 6, "Invalid cluster");

    // An empty cluster cannot be matched. Without this guard the loop below
    // would read szCluster[] past the cluster and could never reach
    // i == iNumCluster. This mirrors the three-argument overload.
    if (iNumCluster == 0)
        return FALSE;

    CopyScan();

    if (!TrieGetNextState(pTrieCtrl, &trieScan1))
        return FALSE;

    for (;;)
    {
        if (szCluster[i] == trieScan1.wch)
        {
            i++;
            if (i == iNumCluster)
            {
                memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN));
                GetNode();
                return TRUE;
            }

            // Move down the Trie Branch.
            if (!TrieGetNextState(pTrieCtrl, &trieScan1))
                break;
        }
        // Move the Trie right one node.
        else if (!TrieGetNextNode(pTrieCtrl, &trieScan1))
        {
            break;
        }
    }

    return FALSE;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: The function moves trieScan to the node that matches
// the given cluster of Thai characters.
//
// Arguments: szCluster - contain the thai character cluster.
// iNumCluster - contain the size of character.
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Moves trieScan along the characters of one cluster, optionally starting a
// new word.
//
// Arguments: szCluster     - the characters of the cluster.
//            iNumCluster   - number of characters in szCluster.
//            fBeginNewWord - true when the cluster starts a new word; the
//                            first character is then looked up through the
//                            cached scans in pTrieScanArray.
//
// Returns:   true when the cluster was matched (or, while m_fThaiNumber is
//            set, when the current character is numeric, ',' or '.');
//            false otherwise.
//
// Modifies:  trieScan, trieScan1, m_fThaiNumber, fWordEnd.
//
// NOTE(review): the Nikhahit test reads szCluster[iPos + 1] without checking
// it against iNumCluster, so it relies on the caller's buffer extending past
// the cluster -- confirm.
bool CThaiTrieIter::MoveCluster(WCHAR* szCluster, unsigned int iNumCluster, bool fBeginNewWord)
{
    unsigned int iPos = 0;

    Assert(iNumCluster <= 6, "Invalid cluster");

    // No need to move.
    if (iNumCluster == 0)
        return false;

    // Use the lookup table to find where the first character is.
    if (fBeginNewWord)
    {
        const WCHAR wcFirst = szCluster[0];

        m_fThaiNumber = false;

        // Quick look up for proper characters.
        if (wcFirst >= THAI_Ko_Kai && wcFirst <= THAI_Ho_Nok_Huk)
        {
            memcpy(&trieScan, &pTrieScanArray[(wcFirst - THAI_Ko_Kai)], sizeof(TRIESCAN));
        }
        else if (wcFirst >= THAI_Vowel_Sara_E && wcFirst <= THAI_Vowel_Sara_AI_MaiMaLai)
        {
            memcpy(&trieScan, &pTrieScanArray[(wcFirst - THAI_Ko_Kai - 17)], sizeof(TRIESCAN));
        }
        else
        {
            Reset();
            m_fThaiNumber = IsThaiNumeric(wcFirst);
        }

        if (trieScan.wch == wcFirst)
            iPos++;

        if (iPos == iNumCluster)
        {
            GetNode();
            return true;
        }
    }

    CopyScan();

    if (!TrieGetNextState(pTrieCtrl, &trieScan1))
        return false;

    if (m_fThaiNumber)
    {
        fWordEnd = true;
        return (IsThaiNumeric(szCluster[iPos]) || szCluster[iPos] == L',' || szCluster[iPos] == L'.');
    }

    for (;;)
    {
        // Number of input characters consumed by the current trie node.
        unsigned int iConsumed = 0;

        if (szCluster[iPos] == trieScan1.wch)
        {
            iConsumed = 1;
        }
        // Let Nikhahit followed by Sara AA equal Sara Am.
        // TODO: case Nikhahit Mai To and Sara AA should equal to Mai To Sara Am. Too risky for this version.
        // This bug was found because Thairath newspaper doesn't write this properly on their web page.
        else if (szCluster[iPos] == THAI_Nikhahit &&
                 szCluster[iPos + 1] == THAI_Vowel_Sara_AA &&
                 trieScan1.wch == THAI_Vowel_Sign_Sara_Am)
        {
            iConsumed = 2;
        }

        if (iConsumed == 0)
        {
            // Move the Trie right one node.
            if (!TrieGetNextNode(pTrieCtrl, &trieScan1))
                break;
            continue;
        }

        iPos += iConsumed;

        // Done when the cluster is used up, or when the next character is an
        // ending sign (Mai Ya Mok / Pai Yan Noi).
        if ((iPos == iNumCluster) ||
            (szCluster[iPos] == THAI_Vowel_MaiYaMok || szCluster[iPos] == THAI_Sign_PaiYanNoi))
        {
            memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN));
            GetNode();
            return true;
        }

        // Move down the Trie Branch.
        if (!TrieGetNextState(pTrieCtrl, &trieScan1))
            break;
    }

    if (fBeginNewWord)
        Reset();

    return false;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Moves trieScan along one cluster, allowing a tone-mark node in the trie to
// stand in for the cluster's last character when an exact match fails.
//
// Arguments: szCluster       - the characters of the cluster; the next
//                              cluster is expected to follow at
//                              szCluster + iNumCluster.
//            iNumCluster     - number of characters in this cluster.
//            iNumNextCluster - number of characters in the next cluster.
//
// Returns:   UNABLE_TO_MOVE       - the trie cannot be moved.
//            STOP_MOVE            - the cluster starts with an ending sign.
//            NOSUBSTITUTE         - exact match; trieScan updated.
//            SUBSTITUTE_DIACRITIC - matched through a tone-mark node that is
//                                   followed by the next cluster; trieScan
//                                   updated.
//
// Modifies:  trieScan1 (working copy); trieScan on success.
SOUNDEXSTATE CThaiTrieIter::MoveSoundexByCluster(WCHAR* szCluster, unsigned int iNumCluster, unsigned int iNumNextCluster)
{
    unsigned int iMatched = 0;
    bool fHaveSavedScan = false;
    TRIESCAN trieScanSaved;

    Assert(iNumCluster <= 6, "Invalid cluster");
    Assert(iNumNextCluster <= 6, "Invalid cluster");

    CopyScan();

    if (!TrieGetNextState(pTrieCtrl, &trieScan1))
        return UNABLE_TO_MOVE;

    if (IsThaiEndingSign(*szCluster))
        return STOP_MOVE;

    // Match as much as possible.
    for (;;)
    {
        if (szCluster[iMatched] == trieScan1.wch)
        {
            iMatched++;
            if (iMatched == iNumCluster)
            {
                memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN));
                GetNode();
                return NOSUBSTITUTE;
            }

            // Move down the Trie Branch.
            if (!TrieGetNextState(pTrieCtrl, &trieScan1))
                break;

            // Save our current scan position.
            memcpy(&trieScanSaved, &trieScan1, sizeof(TRIESCAN));
            fHaveSavedScan = true;
        }
        // Move the Trie right one node.
        else if (!TrieGetNextNode(pTrieCtrl, &trieScan1))
        {
            break;
        }
    }

    // Without a saved position there is nothing to retry from.
    if (!fHaveSavedScan)
        return UNABLE_TO_MOVE;

    // Try tone-mark substitution (the unmatched input character is itself a
    // tone mark) or dropping a tone mark (it is not). An example of the
    // latter is how "Click" is spelt in Thai by different groups at
    // Microsoft. Both cases run exactly the same search, so one pass over
    // the siblings of the last matched position covers them.
    memcpy(&trieScan1, &trieScanSaved, sizeof(TRIESCAN));

    do
    {
        if (IsThaiToneMark(trieScan1.wch) &&
            ((iMatched + 1) == iNumCluster) &&
            CheckNextCluster(szCluster + iNumCluster, iNumNextCluster))
        {
            memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN));
            GetNode();
            return SUBSTITUTE_DIACRITIC;
        }
        // Move the Trie right one node.
    } while (TrieGetNextNode(pTrieCtrl, &trieScan1));

    return UNABLE_TO_MOVE;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: set trieScan1 = trieScan.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Copies the committed scan position (trieScan) into the working scan
// (trieScan1), i.e. trieScan1 = trieScan.
inline void CThaiTrieIter::CopyScan()
{
    memcpy(&trieScan1, &trieScan, sizeof(trieScan1));
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synoposis: the function traverse through the whole dictionary
// to find the best possible match words.
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Traverses the dictionary to find the word that scores closest to the
// given word.
//
// Arguments: word - NUL-terminated word to match; the pointer is stored in
//                   soundexWord for the duration of the traversal.
//
// Returns:   the best (lowest) score found by Traverse(), or the initial
//            value 2000 when no word scored lower. The matching word is
//            left in resultWord.
//
// Modifies:  trie position, resultWord, tempWord, soundexWord, iResultScore.
int CThaiTrieIter::Soundex(WCHAR* word)
{
    // Reset Trie.
    Reset();

    // Move Down.
    Down();

    // Clean the work buffers.
    memset(resultWord, 0, sizeof(WCHAR) * WORDSIZE);
    memset(tempWord, 0, sizeof(WCHAR) * WORDSIZE);

    soundexWord = word;

    // Start from a fixed worst-case score. A GetScore() call against a
    // hard-coded sample word used to precede this assignment, but its result
    // was overwritten immediately, so it has been removed.
    iResultScore = 2000;

#if defined (_DEBUG)
    iStackSize = 0;
#endif

    Traverse(0, 1000);

    return iResultScore;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synoposis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Computes a distance-like score between a candidate word and the input
// word; a lower score means a closer match.
//
// Arguments: idealWord - NUL-terminated candidate word.
//            inputWord - NUL-terminated word being matched.
//
// Returns:   1000 plus penalties. A mismatched input character costs 5 when
//            it is an upper/lower cluster character and 10 otherwise; input
//            characters whose index is beyond the candidate's length cost
//            20 / 30; each candidate position left over at the end costs
//            5 / 10.
unsigned int CThaiTrieIter::GetScore(WCHAR* idealWord, WCHAR* inputWord)
{
    unsigned int iScore = 1000;
    unsigned int idealWordLen = wcslen(idealWord);
    unsigned int iInputWordLen = wcslen(inputWord);
    unsigned int iIndexBegin = 0;   // next candidate position to compare against
    unsigned int x = 0;             // candidate position reached by the last comparison

    for (unsigned int i = 0; i < iInputWordLen; i++)
    {
        const WCHAR wcInput = inputWord[i];

        // Input positions beyond the candidate's length: heavier penalty.
        if (i > idealWordLen)
        {
            iScore += IsThaiUpperAndLowerClusterCharacter(wcInput) ? 20 : 30;
            continue;
        }

        const unsigned int iMaxCompare =
            ((iIndexBegin + 2) < idealWordLen) ? (iIndexBegin + 2) : idealWordLen;

        x = iIndexBegin;
        if (x >= iMaxCompare)
            continue;

        // Exactly one candidate character is compared per input character.
        if (idealWord[x] == wcInput)
        {
            // Match: advance the candidate position.
            x++;
            iIndexBegin = x;
        }
        else
        {
            // Mismatch: penalize, but keep comparing from the same position.
            iScore += IsThaiUpperAndLowerClusterCharacter(wcInput) ? 5 : 10;
            x++;
        }
    }

    // Penalize the candidate positions that were not reached.
    // NOTE(review): the `<=` bound also visits idealWord[idealWordLen], the
    // terminating NUL -- confirm this is intended.
    while (x <= idealWordLen)
    {
        iScore += IsThaiUpperAndLowerClusterCharacter(idealWord[x]) ? 5 : 10;
        x++;
    }

    return iScore;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synoposis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
bool CThaiTrieIter::Traverse(unsigned int iCharPos, unsigned int score) { TRIESCAN trieScanLevel;
#if defined(_DEBUG)
iStackSize++; #endif
// push current trieScan into local stack trieScanLevel.
memcpy(&trieScanLevel,&trieScan, sizeof(TRIESCAN));
// Get Node information
GetNode();
// Store the current character to result word.
tempWord[iCharPos] = wc; tempWord[iCharPos + 1] = 0;
// Determine the distance between two string.
score = GetScore(tempWord, soundexWord); // See if we have reached the end of a word.
if (fWordEnd) { tempWord[iCharPos + 1] = 0; // Is Soundex score lower than we have.
if (score < iResultScore) { // wcscpy(resultWord,tempWord);
Wzncpy(resultWord,tempWord,WORDSIZE); iResultScore = score; } }
// See if we can prune the result of the words.
if (score > (iResultScore + APPROXIMATEWEIGHT)) { #if defined(_DEBUG)
iStackSize--; #endif
return true; }
// Move down Trie branch.
if (Down()) { Traverse(iCharPos + 1, score);
if (Right()) Traverse(iCharPos + 1, score);
// restore trieScan
memcpy(&trieScan,&trieScanLevel, sizeof(TRIESCAN));
if (Right()) Traverse(iCharPos, score); }
#if defined(_DEBUG)
iStackSize--; #endif
return true; }
//+---------------------------------------------------------------------------
//
// Class: CThaiTrieIter
//
// Synopsis: This function checks, on a copy of trieScan1, whether the
// trie can be moved through the next cluster.
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
bool CThaiTrieIter::CheckNextCluster(const WCHAR* szCluster, unsigned int iNumCluster) { // Declare and initailze local variables.
unsigned int i = 0; TRIESCAN trieScan2;
Assert(iNumCluster <= 6, "Invalid cluster");
// If there are no cluster to check consider cluster found.
if (0 == iNumCluster) return true;
memcpy(&trieScan2, &trieScan1, sizeof(TRIESCAN));
// Move down the Trie Branch.
if (!TrieGetNextState(pTrieCtrl, &trieScan2)) return false;
while (true) { if (szCluster[i] == trieScan2.wch) { i++; if (i == iNumCluster) { return true; } // Move down the Trie Branch.
else if (!TrieGetNextState(pTrieCtrl, &trieScan2)) break; } // Move the Trie right one node.
else if (!TrieGetNextNode(pTrieCtrl, &trieScan2)) break; }
return false; }
|