//+--------------------------------------------------------------------------- // // // CThaiTrieIter - class CThaiTrieIter use for traversing trie. // // History: // created 7/99 aarayas // // ©1999 Microsoft Corporation //---------------------------------------------------------------------------- #include "CThaiTrieIter.hpp" #define WORDSIZE 64 static unsigned int iStackSize = 0; //+--------------------------------------------------------------------------- // // Function: IsThaiBeginClusterCharacter // // Synopsis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- BOOL IsThaiBeginClusterCharacter(WCHAR wc) { return ( ( wc >= THAI_Vowel_Sara_E ) && (wc <= THAI_Vowel_Sara_AI_MaiMaLai) ); } //+--------------------------------------------------------------------------- // // Function: IsThaiUpperAndLowerClusterCharacter // // Synopsis: // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- BOOL IsThaiUpperAndLowerClusterCharacter(WCHAR wc) { return ( ( (wc == THAI_Vowel_Sign_Mai_HanAkat) ) || ( (wc >= THAI_Vowel_Sign_Sara_Am) && (wc <= THAI_Vowel_Sign_Phinthu) ) || ( (wc >= THAI_Tone_MaiTaiKhu) && (wc <= THAI_Nikhahit) ) ); } //+--------------------------------------------------------------------------- // // Function: IsThaiEndingClusterCharacter // // Synopsis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- BOOL IsThaiEndingClusterCharacter(WCHAR wc) { return ( // (wc == THAI_Sign_PaiYanNoi) || // take this line out to fix O11.PaiYanNoi issue. (wc == THAI_Vowel_Sara_A) || (wc == THAI_Vowel_Sara_AA) || (wc == THAI_Vowel_LakKhangYao) || (wc == THAI_Vowel_MaiYaMok) ); } //+--------------------------------------------------------------------------- // // Function: IsThaiMostlyBeginCharacter // // Synopsis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- bool IsThaiMostlyBeginCharacter(WCHAR wc) { return ( (wc >= THAI_Vowel_Sara_E && wc <= THAI_Vowel_Sara_AI_MaiMaLai) || // Character always in front of a word. (wc == THAI_Cho_Ching) || // Character always in front of a word. (wc == THAI_Pho_Phung) || // Character always in front of a word. (wc == THAI_Fo_Fa) || // Character always in front of a word. (wc == THAI_Ho_Nok_Huk) || // Character always in front of a word. (wc == THAI_Ho_Hip) || // Character most like in front of a word. (wc == THAI_Pho_Samphao) || // Character most like in front of a word. (wc == THAI_Kho_Rakhang) || // Character most like in front of a word. (wc == THAI_Fo_Fan) || // Character most like in front of a word. (wc == THAI_So_So) || // Character most like in front of a word. (wc == THAI_Tho_NangmonTho) ); // Character most like in front of a word. } //+--------------------------------------------------------------------------- // // Function: IsContain // // Synopsis: // // Arguments: // // Modifies: // // History: created 7/00 aarayas // // Notes: // //---------------------------------------------------------------------------- bool IsContain(const WCHAR* pwcWord, unsigned int iWordLen, WCHAR wc) { const WCHAR* pwc = pwcWord; const WCHAR* pwcEnd = pwcWord + iWordLen; while (pwc < pwcEnd) { if (*pwc == wc) return true; pwc++; } return false; } //+--------------------------------------------------------------------------- // // Function: IsThaiMostlyLastCharacter // // Synopsis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- bool IsThaiMostlyLastCharacter(WCHAR wc) { return ( (wc == THAI_Vowel_Sign_Sara_Am) || // Always the end of word. (wc == THAI_Sign_PaiYanNoi) || // Always the end of word. (wc == THAI_Vowel_MaiYaMok) || // Always the end of word. (wc == THAI_Vowel_LakKhangYao) || // Most likely the end of word. (wc == THAI_Thanthakhat) ); // Most likely the end of word. } //+--------------------------------------------------------------------------- // // Function: IsThaiToneMark // // Synopsis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- bool IsThaiToneMark(WCHAR wc) { return ( (wc >= 0x0e48) && (wc <= 0x0e4b) || (wc == 0x0e31)); } //+--------------------------------------------------------------------------- // // Function: IsThaiEndingSign // // Synopsis: // // Arguments: // // Modifies: // // History: created 8/02 aarayas // // Notes: // //---------------------------------------------------------------------------- bool IsThaiEndingSign(WCHAR wc) { return ((bool) (wc == THAI_Vowel_MaiYaMok || wc == THAI_Sign_PaiYanNoi)); } //+--------------------------------------------------------------------------- // // Function: GetCluster // // Synopsis: The function return the next number of character which represent // a cluster of Thai text. // // ie. Kor Kai, Kor Kai -> 1 // Kor Kai, Sara Um -> 2 // // * Note this function will not return no more than 3 character, // for cluster as this would represent invalid sequence of character. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- /* unsigned int GetCluster(WCHAR* pszIndex) { int iRetValue = 0; // Take all begin cluster character. while (IsThaiBeginClusterCharacter(*pszIndex)) { pszIndex++; iRetValue++; } if (IsThaiConsonant(*pszIndex)) { pszIndex++; iRetValue++; while (IsThaiUpperAndLowerClusterCharacter(*pszIndex)) { pszIndex++; iRetValue++; } while (IsThaiEndingClusterCharacter(*pszIndex)) { pszIndex++; iRetValue++; } } if (iRetValue == 0) // The character is probably a punctuation. iRetValue++; return iRetValue; } */ //+--------------------------------------------------------------------------- // // Function: IsThaiConsonant // // Synopsis: // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- BOOL IsThaiConsonant(WCHAR wc) { return ( (wc >= THAI_Ko_Kai) && (wc <= THAI_Ho_Nok_Huk) ); } //+--------------------------------------------------------------------------- // // Define the different part of speech for Thai. // //---------------------------------------------------------------------------- WCHAR wzPOSLookup[POSTYPE][46] = { L"NONE", // 0 . No tags. L"NPRP", // 1 . Proper noun L"NCNM", // 2 . Cardinal number L"NONM", // 3 . Ordinal number L"NLBL", // 4 . Label noun L"NCMN", // 5 . Common noun L"NTTL", // 6 . Title noun L"PPRS", // 7 . Personal pronoun L"PDMN", // 8 . Demonstrative pronoun L"PNTR", // 9 . Interrogative pronoun L"PREL", // 10. Relative pronoun L"VACT", // 11. Active verb L"VSTA", // 12. Stative verb L"VATT", // 13. Attributive verb L"XVBM", // 14. Pre-verb auxiliary, before negator L"XVAM", // 15. Pre-verb auxiliary, after negator L"XVMM", // 16. Pre-verb, before or after negator L"XVBB", // 17. Pre-verb auxiliary, in imperative mood L"XVAE", // 18. Post-verb auxiliary L"DDAN", // 19. Definite determiner, after noun without classifier in between L"DDAC", // 20. Definite determiner, allowing classifier in between L"DDBQ", // 21. Definite determiner, between noun and classifier or preceding quantitative expression L"DDAQ", // 22. Definite determiner, following quantitative expression L"DIAC", // 23. Indefinite determiner, following noun; allowing classifier in between L"DIBQ", // 24. Indefinite determiner, between noun and classifier or preceding quantitative expression L"DIAQ", // 25. Indefinite determiner, following quantitative expression L"DCNM", // 26. Determiner, cardinal number expression L"DONM", // 27. Determiner, ordinal number expression L"ADVN", // 28. Adverb with normal form L"ADVI", // 29. Adverb with iterative form L"ADVP", // 30. Adverb with prefixed form L"ADVS", // 31. Sentential adverb L"CNIT", // 32. Unit classifier L"CLTV", // 33. Collective classifier L"CMTR", // 34. Measurement classifier L"CFQC", // 35. Frequency classifier L"CVBL", // 36. Verbal classifier L"JCRG", // 37. Coordinating conjunction L"JCMP", // 38. Comparative conjunction L"JSBR", // 39. Subordinating conjunction L"RPRE", // 40. Preposition L"INT", // 41. Interjection L"FIXN", // 42. Nominal prefix L"FIXV", // 43. Adverbial prefix L"EAFF", // 44. Ending for affirmative sentencev L"EITT", // 45. Ending for interrogative sentence L"NEG", // 46. Negator L"PUNC", // 47. Punctuation L"ADVI ADVN", // 48. L"ADVI ADVN NCMN", // 49. L"ADVI ADVN VSTA", // 50. L"ADVI VATT", // 51. L"ADVN ADVP", // 52. L"ADVN ADVP ADVS", // 53. L"ADVN ADVP DIAQ DIBQ JCMP JSBR RPRE", // 54. L"ADVN ADVP NCMN VATT", // 55. L"ADVN ADVP VSTA", // 56. L"ADVN ADVS DDAC DDAN DIAC VATT XVAE", // 57. L"ADVN ADVS DDAN NCMN VATT VSTA", // 58. L"ADVN ADVS NCMN", // 59. L"ADVN ADVS NCMN VATT", // 60. L"ADVN ADVS VACT", // 61. L"ADVN ADVS VATT", // 62. L"ADVN CFQC NCMN RPRE VSTA", // 63. L"ADVN CLTV CNIT NCMN RPRE", // 64. L"ADVN DCNM", // 65. L"ADVN DDAC DDAN", // 66. L"ADVN DDAC DDAN NCMN PDMN", // 67. L"ADVN DDAC DDAN PDMN", // 68. L"ADVN DDAN DDBQ", // 69. L"ADVN DDAN DIAC PDMN VSTA", // 70. L"ADVN DDAN FIXN PDMN", // 71. L"ADVN DDAN NCMN", // 72. L"ADVN DDAQ", // 73. L"ADVN DDBQ", // 74. L"ADVN DDBQ RPRE VATT", // 75. L"ADVN DDBQ VATT VSTA XVAE", // 76. L"ADVN DIAC", // 77. L"ADVN DIAC PDMN", // 78. L"ADVN DIBQ", // 79. L"ADVN DIBQ NCMN", // 80. L"ADVN DIBQ VACT VSTA", // 81. L"ADVN DIBQ VATT", // 82. L"ADVN DONM JCMP", // 83. L"ADVN DONM JSBR NCMN RPRE VATT XVAE", // 84. L"ADVN EITT PNTR", // 85. L"ADVN FIXN", // 86. L"ADVN JCMP", // 87. L"ADVN JCRG", // 88. L"ADVN JCRG JSBR", // 89. L"ADVN JCRG JSBR XVBM XVMM", // 90. L"ADVN JCRG RPRE VACT VSTA XVAE", // 91. L"ADVN JSBR", // 92. L"ADVN JSBR NCMN", // 93. L"ADVN JSBR RPRE VATT", // 94. L"ADVN JSBR RPRE XVAE", // 95. L"ADVN JSBR VSTA", // 96. L"ADVN JSBR XVAE XVBM", // 97. L"ADVN NCMN", // 98. L"ADVN NCMN RPRE VACT VATT VSTA", // 99. L"ADVN NCMN RPRE VACT XVAE", // 100. L"ADVN NCMN RPRE VATT", // 101. L"ADVN NCMN VACT VATT VSTA", // 102. L"ADVN NCMN VACT VSTA", // 103. L"ADVN NCMN VATT", // 104. L"ADVN NCMN VATT VSTA", // 105. L"ADVN NEG", // 106. L"ADVN NPRP VATT", // 107. L"ADVN PDMN VACT", // 108. L"ADVN PNTR", // 109. L"ADVN RPRE", // 110. L"ADVN RPRE VACT VATT XVAE", // 111. L"ADVN RPRE VACT XVAM XVBM", // 112. L"ADVN RPRE VATT VSTA", // 113. L"ADVN RPRE VSTA", // 114. L"ADVN VACT", // 115. L"ADVN VACT VATT", // 116. L"ADVN VACT VATT VSTA", // 117. L"ADVN VACT VATT VSTA XVAM XVBM", // 118. L"ADVN VACT VSTA", // 119. L"ADVN VACT VSTA XVAE", // 120. L"ADVN VACT XVAE", // 121. L"ADVN VATT", // 122. L"ADVN VATT VSTA", // 123. L"ADVN VATT VSTA XVAM XVBM XVMM", // 124. L"ADVN VATT XVBM", // 125. L"ADVN VSTA", // 126. L"ADVN VSTA XVAE", // 127. L"ADVN VSTA XVBM", // 128. L"ADVN XVAE", // 129. L"ADVN XVAM", // 130. L"ADVN XVBM XVMM", // 131. L"ADVP JSBR RPRE VATT", // 132. L"ADVP VATT", // 133. L"ADVS DDAC JCRG", // 134. L"ADVS DDAC JSBR", // 135. L"ADVS DDAN VSTA", // 136. L"ADVS DIAC", // 137. L"ADVS DONM", // 138. L"ADVS JCRG JSBR", // 139. L"ADVS JCRG JSBR RPRE", // 140. L"ADVS JSBR", // 141. L"ADVS JSBR RPRE", // 142. L"ADVS NCMN", // 143. L"ADVS VATT", // 144. L"CFQC CLTV CNIT DCNM JCRG JSBR NCMN RPRE XVBM", // 145. L"CFQC CNIT PREL", // 146. L"CFQC NCMN", // 147. L"CLTV CNIT NCMN", // 148. L"CLTV CNIT NCMN RPRE", // 149. L"CLTV CNIT NCMN VSTA", // 150. L"CLTV NCMN", // 151. L"CLTV NCMN VACT VATT", // 152. L"CLTV NCMN VATT", // 153. L"CMTR CNIT NCMN", // 154. L"CMTR NCMN", // 155. L"CMTR NCMN VATT VSTA", // 156. L"CNIT DDAC NCMN VATT", // 157. L"CNIT DONM NCMN RPRE VATT", // 158. L"CNIT FIXN FIXV JSBR NCMN", // 159. L"CNIT JCRG JSBR NCMN PREL RPRE VATT", // 160. L"CNIT JSBR RPRE", // 161. L"CNIT NCMN", // 162. L"CNIT NCMN RPRE", // 163. L"CNIT NCMN RPRE VATT", // 164. L"CNIT NCMN VACT", // 165. L"CNIT NCMN VSTA", // 166. L"CNIT NCNM", // 167. L"CNIT PPRS", // 168. L"DCNM DDAC DIAC DONM VATT VSTA", // 169. L"DCNM DDAN DIAC", // 170. L"DCNM DIAC NCMN NCNM", // 171. L"DCNM DIBQ NCMN", // 172. L"DCNM DONM", // 173. L"DCNM NCMN", // 174. L"DCNM NCNM", // 175. L"DCNM NCNM VACT", // 176. L"DCNM VATT", // 177. L"DDAC DDAN", // 178. L"DDAC DDAN DIAC NCMN", // 179. L"DDAC DDAN DIAC VATT", // 180. L"DDAC DDAN EAFF PDMN", // 181. L"DDAC DDAN PDMN", // 182. L"DDAC DIAC VSTA", // 183. L"DDAC NCMN", // 184. L"DDAN DDBQ", // 185. L"DDAN DIAC PNTR", // 186. L"DDAN NCMN", // 187. L"DDAN NCMN RPRE VATT", // 188. L"DDAN PDMN", // 189. L"DDAN RPRE", // 190. L"DDAN VATT", // 191. L"DDAQ VATT", // 192. L"DDBQ DIBQ", // 193. L"DDBQ JCRG JSBR", // 194. L"DDBQ JCRG NCMN", // 195. L"DIAC PDMN", // 196. L"DIBQ JSBR RPRE VSTA", // 197. L"DIBQ NCMN", // 198. L"DIBQ VATT", // 199. L"DIBQ VATT VSTA", // 200. L"DIBQ XVBM", // 201. L"DONM NCMN RPRE", // 202. L"DONM VACT VATT VSTA", // 203. L"DONM VATT", // 204. L"EAFF XVAE XVAM XVBM", // 205. L"EITT JCRG", // 206. L"FIXN FIXV NCMN", // 207. L"FIXN FIXV RPRE VSTA", // 208. L"FIXN JSBR NCMN PREL RPRE VSTA XVBM", // 209. L"FIXN NCMN", // 210. L"FIXN VACT", // 211. L"FIXN VACT VSTA", // 212. L"FIXV JSBR RPRE", // 213. L"JCMP JSBR", // 214. L"JCMP RPRE VSTA", // 215. L"JCMP VATT VSTA", // 216. L"JCMP VSTA", // 217. L"JCRG JSBR", // 218. L"JCRG JSBR NCMN RPRE", // 219. L"JCRG JSBR RPRE", // 220. L"JCRG RPRE", // 221. L"JCRG RPRE VATT VSTA", // 222. L"JCRG VSTA", // 223. L"JSBR NCMN", // 224. L"JSBR NCMN XVAE", // 225. L"JSBR NCMN XVAM XVBM XVMM", // 226. L"JSBR PREL", // 227. L"JSBR PREL RPRE", // 228. L"JSBR PREL XVBM", // 229. L"JSBR RPRE", // 230. L"JSBR RPRE VACT", // 231. L"JSBR RPRE VACT VSTA", // 232. L"JSBR RPRE VACT XVAE XVAM", // 233. L"JSBR RPRE VATT", // 234. L"JSBR RPRE VSTA", // 235. L"JSBR RPRE XVAM", // 236. L"JSBR VACT", // 237. L"JSBR VACT VSTA", // 238. L"JSBR VATT XVBM XVMM", // 239. L"JSBR VSTA", // 240. L"JSBR XVBM", // 241. L"NCMN NCNM", // 242. L"NCMN NCNM NPRP", // 243. L"NCMN NLBL NPRP", // 244. L"NCMN NPRP", // 245. L"NCMN NPRP RPRE", // 246. L"NCMN NTTL", // 247. L"NCMN PDMN PPRS", // 248. L"NCMN PDMN VATT", // 249. L"NCMN PNTR", // 250. L"NCMN PPRS PREL VACT", // 251. L"NCMN RPRE", // 252. L"NCMN RPRE VACT VATT", // 253. L"NCMN RPRE VATT", // 254. L"NCMN VACT", // 255. L"NCMN VACT VATT", // 256. L"NCMN VACT VATT VSTA XVAE", // 257. L"NCMN VACT VSTA", // 258. L"NCMN VACT VSTA XVAM", // 259. L"NCMN VACT VSTA XVBB", // 260. L"NCMN VATT", // 261. L"NCMN VATT VSTA", // 262. L"NCMN VATT XVAM", // 263. L"NCMN VSTA", // 264. L"NCMN XVBM", // 265. L"NPRP RPRE", // 266. L"NPRP VATT", // 267. L"NTTL PPRS", // 268. L"PDMN PPRS", // 269. L"PDMN VATT", // 270. L"PDMN VATT VSTA", // 271. L"PPRS PREL", // 272. L"PPRS VATT", // 273. L"RPRE VACT", // 274. L"RPRE VACT VATT", // 275. L"RPRE VACT VSTA", // 276. L"RPRE VACT VSTA XVAE", // 277. L"RPRE VACT XVAE", // 278. L"RPRE VATT", // 279. L"RPRE VATT VSTA", // 280. L"RPRE VSTA", // 281. L"VACT VATT", // 282. L"VACT VATT VSTA", // 283. L"VACT VATT XVAE XVAM XVBM", // 284. L"VACT VSTA", // 285. L"VACT VSTA XVAE", // 286. L"VACT VSTA XVAE XVAM", // 287. L"VACT VSTA XVAE XVAM XVMM", // 288. L"VACT VSTA XVAM", // 289. L"VACT VSTA XVAM XVMM", // 290. L"VACT XVAE", // 291. L"VACT XVAM", // 292. L"VACT XVAM XVMM", // 293. L"VACT XVMM", // 294. L"VATT VSTA", // 295. L"VSTA XVAE", // 296. L"VSTA XVAM", // 297. L"VSTA XVAM XVMM", // 298. L"VSTA XVBM", // 299. L"XVAM XVBM", // 300. L"XVAM XVBM XVMM", // 301. L"XVAM XVMM", // 302. L"UNKN", // 303. Unknown L"ABBR" // 304. Abbrivation }; //+--------------------------------------------------------------------------- // // Function: POSCompress // // Synopsis: Part Of Speech Compress - translating string to unique id. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- DWORD POSCompress(const WCHAR* szTag) { int i; for (i = 0; i < POSTYPE; i++) { if (wcscmp(szTag, &wzPOSLookup[i][0]) == 0) { return (DWORD)i; } } return POSTYPE; } //+--------------------------------------------------------------------------- // // Function: POSDecompress // // Synopsis: Part Of Speech Decompress - Decompress tag get // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline WCHAR* POSDecompress(DWORD dwTag) { return (&wzPOSLookup[dwTag][0]); } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synoposis: Constructor: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- CThaiTrieIter::CThaiTrieIter() : resultWord(NULL), soundexWord(NULL), tempWord(NULL), pTrieScanArray(NULL), m_fThaiNumber(false) { resultWord = new WCHAR[WORDSIZE]; tempWord = new WCHAR[WORDSIZE]; pTrieScanArray = new TRIESCAN[53]; } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synoposis: Destructor // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- CThaiTrieIter::~CThaiTrieIter() { if (resultWord) delete resultWord; if (tempWord) delete tempWord; if (pTrieScanArray) delete pTrieScanArray; } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synopsis: Initialize variables. // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- void CThaiTrieIter::Init(CTrie* ctrie) { // Declare varialbes. WCHAR wc; // Initialize parent. CTrieIter::Init(ctrie); // Initialize Hash table. for (wc = THAI_Ko_Kai; wc <= THAI_Ho_Nok_Huk; wc++) GetScanFirstChar(wc,&pTrieScanArray[wc - THAI_Ko_Kai]); for (wc = THAI_Vowel_Sara_E; wc <= THAI_Vowel_Sara_AI_MaiMaLai; wc++) GetScanFirstChar(wc,&pTrieScanArray[wc - THAI_Ko_Kai - 17]); } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synopsis: Initialize variables. // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- bool CThaiTrieIter::GetScanFirstChar(WCHAR wc, TRIESCAN* pTrieScan) { // Reset the trie scan. memset(&trieScan1, 0, sizeof(TRIESCAN)); if (!TrieGetNextState(pTrieCtrl, &trieScan1)) return false; while (wc != trieScan1.wch) { // Keep moving the the right of the trie. if (!TrieGetNextNode(pTrieCtrl, &trieScan1)) { memset(pTrieScan, 0, sizeof(TRIESCAN)); return false; } } memcpy(pTrieScan, &trieScan1, sizeof(TRIESCAN)); return true; } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synopsis: The function move trieScan to the relevant node matching with // with the cluster of Thai character. // // Arguments: szCluster - contain the thai character cluster. // iNumCluster - contain the size of character. // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- BOOL CThaiTrieIter::MoveCluster(const WCHAR* szCluster, unsigned int iNumCluster) { // Declare and initailze local variables. unsigned int i = 0; // Assert(iNumCluster <= 6, "Invalid cluster"); CopyScan(); if (!TrieGetNextState(pTrieCtrl, &trieScan1)) return FALSE; while (TRUE) { if (szCluster[i] == trieScan1.wch) { i++; if (i == iNumCluster) { memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN)); GetNode(); return TRUE; } // Move down the Trie Branch. else if (!TrieGetNextState(pTrieCtrl, &trieScan1)) break; } // Move the Trie right one node. else if (!TrieGetNextNode(pTrieCtrl, &trieScan1)) break; } return FALSE; } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synopsis: The function move trieScan to the relevant node matching with // with the cluster of Thai character. // // Arguments: szCluster - contain the thai character cluster. // iNumCluster - contain the size of character. // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- bool CThaiTrieIter::MoveCluster(WCHAR* szCluster, unsigned int iNumCluster, bool fBeginNewWord) { // Declare and initailze local variables. unsigned int i = 0; Assert(iNumCluster <= 6, "Invalid cluster"); // No need to move. if (iNumCluster == 0) return false; // Use a look indexes for where the first character is at. if (fBeginNewWord) { m_fThaiNumber = false; // Quick look up for proper characters. if (szCluster[i] >= THAI_Ko_Kai && szCluster[i] <= THAI_Ho_Nok_Huk) memcpy(&trieScan,&pTrieScanArray[(szCluster[i] - THAI_Ko_Kai)], sizeof(TRIESCAN)); else if (szCluster[i] >= THAI_Vowel_Sara_E && szCluster[i] <= THAI_Vowel_Sara_AI_MaiMaLai) memcpy(&trieScan,&pTrieScanArray[(szCluster[i] - THAI_Ko_Kai - 17)], sizeof(TRIESCAN)); else { Reset(); m_fThaiNumber = IsThaiNumeric(szCluster[i]); } if (trieScan.wch == szCluster[i]) i++; if (i == iNumCluster) { GetNode(); return true; } } CopyScan(); if (!TrieGetNextState(pTrieCtrl, &trieScan1)) return false; if (m_fThaiNumber) { fWordEnd = true; if (IsThaiNumeric(szCluster[i]) || szCluster[i] == L',' || szCluster[i] == L'.') return true; else return false; } while (true) { if (szCluster[i] == trieScan1.wch) { i++; if ((i == iNumCluster) || ( (szCluster[i] == THAI_Vowel_MaiYaMok || szCluster[i] == THAI_Sign_PaiYanNoi)/* && (i+1 == iNumCluster )*/) ) { memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN)); GetNode(); return true; } // Move down the Trie Branch. else if (!TrieGetNextState(pTrieCtrl, &trieScan1)) break; } // Let Nikhahit equal Sara Am. // TODO: case Nikhahit Mai To and Sara AA should equal to Mai To Sara Am. TO risk for this version. // This bug was found because Thairath newspaper doesn't write this properly on their web page. else if (szCluster[i] == THAI_Nikhahit && szCluster[i+1] == THAI_Vowel_Sara_AA && trieScan1.wch == THAI_Vowel_Sign_Sara_Am) { if (szCluster[i+1] == THAI_Vowel_Sara_AA) i++; i++; if ((i == iNumCluster) || ( (szCluster[i] == THAI_Vowel_MaiYaMok || szCluster[i] == THAI_Sign_PaiYanNoi)/* && (i+1 == iNumCluster )*/) ) { memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN)); GetNode(); return true; } // Move down the Trie Branch. else if (!TrieGetNextState(pTrieCtrl, &trieScan1)) break; } // Move the Trie right one node. else if (!TrieGetNextNode(pTrieCtrl, &trieScan1)) break; } if (fBeginNewWord) Reset(); return false; } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synopsis: // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- SOUNDEXSTATE CThaiTrieIter::MoveSoundexByCluster(WCHAR* szCluster, unsigned int iNumCluster, unsigned int iNumNextCluster) { // Declare and initailze local variables. unsigned int i = 0 , x = 0; bool fStoreScan = false; TRIESCAN trieScanPush; Assert(iNumCluster <= 6, "Invalid cluster"); Assert(iNumNextCluster <= 6, "Invalid cluster"); CopyScan(); if (!TrieGetNextState(pTrieCtrl, &trieScan1)) return UNABLE_TO_MOVE; if (IsThaiEndingSign(*szCluster)) return STOP_MOVE; // Match as much as possible while (true) { if (szCluster[i] == trieScan1.wch) { i++; if (i == iNumCluster) { memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN)); GetNode(); return NOSUBSTITUTE; } // Move down the Trie Branch. else if (!TrieGetNextState(pTrieCtrl, &trieScan1)) break; // Save our current scan position. memcpy(&trieScanPush, &trieScan1, sizeof(TRIESCAN)); fStoreScan = true; } // Move the Trie right one node. else if (!TrieGetNextNode(pTrieCtrl, &trieScan1)) break; } // Try doing some tonemark substitution. if (fStoreScan && IsThaiToneMark(szCluster[i]) ) { // Restore trieScan1 to last matched. memcpy(&trieScan1, &trieScanPush, sizeof(TRIESCAN)); while (true) { if (IsThaiToneMark(trieScan1.wch)) { if ( (i + 1) == iNumCluster) { if (CheckNextCluster(szCluster+iNumCluster,iNumNextCluster)) { memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN)); GetNode(); return SUBSTITUTE_DIACRITIC; } } } // Move the Trie right one node. // Goes through all the none Tonemark. if (!TrieGetNextNode(pTrieCtrl, &trieScan1)) break; } } // Try doing droping the current tonemark. // Example is case can be best found "Click" is spelt in Thai from the // different group at Microsoft. if (fStoreScan && !IsThaiToneMark(szCluster[i]) ) { // Restore trieScan1 to last matched. memcpy(&trieScan1, &trieScanPush, sizeof(TRIESCAN)); while (true) { if (IsThaiToneMark(trieScan1.wch)) { if ( (i + 1) == iNumCluster) { if (CheckNextCluster(szCluster+iNumCluster,iNumNextCluster)) { memcpy(&trieScan, &trieScan1, sizeof(TRIESCAN)); GetNode(); return SUBSTITUTE_DIACRITIC; } } } // Move the Trie right one node. // Drop all the Tonemark. if (!TrieGetNextNode(pTrieCtrl, &trieScan1)) break; } } return UNABLE_TO_MOVE; } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synopsis: set trieScan1 = trieScan. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline void CThaiTrieIter::CopyScan() { // Let trieScan1 = trieScan memcpy(&trieScan1,&trieScan, sizeof(TRIESCAN)); } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synoposis: the function traverse through the whole dictionary // to find the best possible match words. // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- int CThaiTrieIter::Soundex(WCHAR* word) { // Reset Trie. Reset(); // Move Down. Down(); // Clean soundexWord. memset(resultWord, 0, sizeof(WCHAR) * WORDSIZE); memset(tempWord, 0, sizeof(WCHAR) * WORDSIZE); soundexWord = word; iResultScore = GetScore(L"\x0e04\x0e25\x0e34\x0e01\x0e01\x0e01",soundexWord); iResultScore = 2000; #if defined (_DEBUG) iStackSize = 0; #endif Traverse(0,1000); return iResultScore; } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synoposis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- unsigned int CThaiTrieIter::GetScore(WCHAR* idealWord, WCHAR* inputWord) { unsigned int iScore = 1000; unsigned int idealWordLen = wcslen(idealWord); unsigned int iInputWordLen = wcslen(inputWord); unsigned int iIndexBegin = 0; unsigned int i; unsigned int x = 0; unsigned int iMaxCompare; bool fShouldExit; for (i=0; i < iInputWordLen; i++) { iMaxCompare = ( (iIndexBegin + 2) < idealWordLen ) ? (iIndexBegin + 2) : idealWordLen; if (i <= idealWordLen) { x = iIndexBegin; fShouldExit = false; while (true) { if ((x >= iMaxCompare) || (fShouldExit) ) break; if (idealWord[x] == inputWord[i]) { x++; iIndexBegin = x; break; } if (IsThaiUpperAndLowerClusterCharacter(inputWord[i])) iScore += 5; else iScore += 10; x++; fShouldExit = true; } } else { if (IsThaiUpperAndLowerClusterCharacter(inputWord[i])) iScore += 20; else iScore += 30; } } while (x <= idealWordLen) { if (IsThaiUpperAndLowerClusterCharacter(idealWord[x])) iScore += 5; else iScore += 10; x++; } return iScore; } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synoposis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- bool CThaiTrieIter::Traverse(unsigned int iCharPos, unsigned int score) { TRIESCAN trieScanLevel; #if defined(_DEBUG) iStackSize++; #endif // push current trieScan into local stack trieScanLevel. memcpy(&trieScanLevel,&trieScan, sizeof(TRIESCAN)); // Get Node information GetNode(); // Store the current character to result word. tempWord[iCharPos] = wc; tempWord[iCharPos + 1] = 0; // Determine the distance between two string. score = GetScore(tempWord, soundexWord); // See if we have reached the end of a word. if (fWordEnd) { tempWord[iCharPos + 1] = 0; // Is Soundex score lower than we have. if (score < iResultScore) { // wcscpy(resultWord,tempWord); Wzncpy(resultWord,tempWord,WORDSIZE); iResultScore = score; } } // See if we can prune the result of the words. if (score > (iResultScore + APPROXIMATEWEIGHT)) { #if defined(_DEBUG) iStackSize--; #endif return true; } // Move down Trie branch. if (Down()) { Traverse(iCharPos + 1, score); if (Right()) Traverse(iCharPos + 1, score); // restore trieScan memcpy(&trieScan,&trieScanLevel, sizeof(TRIESCAN)); if (Right()) Traverse(iCharPos, score); } #if defined(_DEBUG) iStackSize--; #endif return true; } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synoposis: This function will trieScan1 to the next cluster if // the move is possible. // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- bool CThaiTrieIter::CheckNextCluster(const WCHAR* szCluster, unsigned int iNumCluster) { // Declare and initailze local variables. unsigned int i = 0; TRIESCAN trieScan2; Assert(iNumCluster <= 6, "Invalid cluster"); // If there are no cluster to check consider cluster found. if (0 == iNumCluster) return true; memcpy(&trieScan2, &trieScan1, sizeof(TRIESCAN)); // Move down the Trie Branch. if (!TrieGetNextState(pTrieCtrl, &trieScan2)) return false; while (true) { if (szCluster[i] == trieScan2.wch) { i++; if (i == iNumCluster) { return true; } // Move down the Trie Branch. else if (!TrieGetNextState(pTrieCtrl, &trieScan2)) break; } // Move the Trie right one node. else if (!TrieGetNextNode(pTrieCtrl, &trieScan2)) break; } return false; }