Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

404 lines
12 KiB

  1. #include "base.h"
  2. #include "SpanishDict.h"
  3. CSpanishDict::CSpanishDict(WCHAR* pwcsInitFilePath) :
  4. m_vaDictItem4(DICT_4_INIT_SIZE),
  5. m_vaDictItem8(DICT_8_INIT_SIZE),
  6. m_vaDictItemStr(DICT_STR_INIT_SIZE),
  7. m_ulDictItem4Count(0),
  8. m_ulDictItem8Count(0),
  9. m_ulDictItemStrCount(0)
  10. {
  11. m_apSpanishSuffix = new CSpanishSuffixDict();
  12. CStandardCFile Words(pwcsInitFilePath, L"r");
  13. WCHAR pwcsBuf[MAX_LINE_LEN];
  14. char pszBuf[MAX_LINE_LEN];
  15. while(fgets(pszBuf, MAX_LINE_LEN, (FILE*) Words))
  16. {
  17. if (pszBuf[0] == '\n')
  18. {
  19. continue;
  20. }
  21. int imbRet = MultiByteToWideChar(
  22. 1252, // English / Spanish code page
  23. MB_PRECOMPOSED | MB_ERR_INVALID_CHARS,
  24. pszBuf, -1,
  25. pwcsBuf,
  26. MAX_LINE_LEN);
  27. Assert(imbRet > 0);
  28. SpanishDictItem pItem(pwcsBuf);
  29. if (pItem.m_ulLen <= COMPRESS_4_SIZE)
  30. {
  31. m_vaDictItem4[m_ulDictItem4Count].ulStr = pItem.m_ulStrCompress;
  32. m_vaDictItem4[m_ulDictItem4Count].ulData = pItem.m_dwCompress;
  33. m_ulDictItem4Count++;
  34. }
  35. else if (pItem.m_ulLen <= COMPRESS_8_SIZE)
  36. {
  37. m_vaDictItem8[m_ulDictItem8Count].ullStr = pItem.m_ullStrCompress;
  38. m_vaDictItem8[m_ulDictItem8Count].ulData = pItem.m_dwCompress;
  39. m_ulDictItem8Count++;
  40. }
  41. else
  42. {
  43. m_vaDictItemStr[m_ulDictItemStrCount].pszStr = new unsigned char[pItem.m_ulLen + 1];
  44. bool bRet;
  45. bRet = g_apSpanishUtil->ConvertToChar(
  46. pItem.m_pwcs,
  47. pItem.m_ulLen,
  48. m_vaDictItemStr[m_ulDictItemStrCount].pszStr,
  49. pItem.m_ulLen + 1);
  50. Assert(bRet);
  51. m_vaDictItemStr[m_ulDictItemStrCount].ulData = pItem.m_dwCompress;
  52. m_ulDictItemStrCount++;
  53. }
  54. }
  55. }
  56. void CSpanishDict::BreakWord(
  57. ULONG ulLen,
  58. WCHAR* pwcsWord,
  59. bool* pfExistAlt,
  60. ULONG* pulAltLen,
  61. WCHAR* pwcsAlt)
  62. {
  63. *pfExistAlt = false;
  64. if (ulLen <= 2)
  65. {
  66. return;
  67. }
  68. if (*pulAltLen < MAX_WORD_LEN)
  69. {
  70. return;
  71. }
  72. //
  73. // very fast heuristic to find non breakable words
  74. //
  75. if (pwcsWord[ulLen - 1] != L'e' &&
  76. pwcsWord[ulLen - 1] != L's' &&
  77. pwcsWord[ulLen - 2] != L'l')
  78. {
  79. return;
  80. }
  81. DictStatus status;
  82. short sResCount;
  83. WCHAR pwcsBuf[MAX_WORD_LEN];
  84. WCHAR* pwcs = pwcsWord;
  85. ULONG ul = ulLen;
  86. pwcsBuf[ul] = L'\0';
  87. while (ul > 0)
  88. {
  89. pwcsBuf[ul - 1] = *pwcs;
  90. ul--;
  91. pwcs++;
  92. }
  93. CSuffixTerm* prTerm[10];
  94. status = m_apSpanishSuffix->m_SuffixTrie.trie_Find(
  95. pwcsBuf,
  96. TRIE_ALL_MATCHES | TRIE_IGNORECASE,
  97. 10,
  98. prTerm,
  99. &sResCount);
  100. WCHAR pwcsTemp[MAX_WORD_LEN];
  101. ULONG ulTempLen;
  102. while (sResCount > 0)
  103. {
  104. CSuffixTerm* pTerm = prTerm[sResCount - 1];
  105. Assert(ulLen < MAX_WORD_LEN);
  106. wcsncpy(pwcsTemp, pwcsWord, ulLen);
  107. pwcsTemp[ulLen] = L'\0';
  108. ulTempLen = ulLen;
  109. bool bRet;
  110. ULONG ulCompressedData;
  111. if (!(pTerm->ulType & (TYPE11 | TYPE12 | TYPE13 |TYPE14)))
  112. {
  113. Assert(ulLen >= pTerm->ulCut);
  114. if (ulLen == pTerm->ulCut)
  115. {
  116. sResCount--;
  117. continue;
  118. }
  119. pwcsTemp[ulLen - pTerm->ulCut] = L'\0';
  120. ulTempLen = ulLen - pTerm->ulCut;
  121. bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
  122. if (pTerm->ulType == TYPE1 && (!bRet))
  123. {
  124. pwcsTemp[ulTempLen] = L's';
  125. pwcsTemp[ulTempLen + 1] = L'\0';
  126. bRet = Find(pwcsTemp, ulTempLen + 1, ulCompressedData);
  127. }
  128. if ( (!bRet) ||
  129. (!(g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType)))
  130. {
  131. sResCount--;
  132. continue;
  133. }
  134. *pfExistAlt = true;
  135. wcscpy(pwcsAlt, pwcsTemp);
  136. *pulAltLen = ulTempLen;
  137. g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
  138. switch (pTerm->ulType)
  139. {
  140. case TYPE1:
  141. return;
  142. case TYPE2:
  143. *pulAltLen += 3;
  144. wcscat(pwcsAlt, L"ndo");
  145. return;
  146. case TYPE3:
  147. *pulAltLen += 1;
  148. wcscat(pwcsAlt, L"n");
  149. return;
  150. case TYPE4:
  151. *pulAltLen += 3;
  152. wcscat(pwcsAlt, L"mos");
  153. return;
  154. case TYPE5:
  155. *pulAltLen += 1;
  156. wcscat(pwcsAlt, L"d");
  157. return;
  158. case TYPE6:
  159. *pulAltLen += 1;
  160. wcscat(pwcsAlt, L"r");
  161. return;
  162. case TYPE7:
  163. case TYPE8:
  164. case TYPE9:
  165. case TYPE10:
  166. case TYPE15:
  167. case TYPE16:
  168. return;
  169. default:
  170. Assert(false);
  171. }
  172. }
  173. else
  174. {
  175. *pfExistAlt = true;
  176. switch (pTerm->ulType)
  177. {
  178. case TYPE11:
  179. {
  180. Assert(ulTempLen >= pTerm->ulLen);
  181. if (ulTempLen == pTerm->ulLen)
  182. {
  183. break;
  184. }
  185. pwcsTemp[ulTempLen - pTerm->ulLen] = L'\0';
  186. ulTempLen -= pTerm->ulLen;
  187. bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
  188. if (bRet &&
  189. (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
  190. {
  191. wcscpy(pwcsAlt, pwcsTemp);
  192. *pulAltLen = ulTempLen;
  193. g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
  194. *pfExistAlt = true;
  195. return;
  196. }
  197. }
  198. break;
  199. case TYPE12:
  200. case TYPE14:
  201. {
  202. pwcsTemp[ulTempLen-3] = L's'; // removing the no form the nos
  203. pwcsTemp[ulTempLen-2] = L'\0';
  204. bRet = Find(pwcsTemp, ulTempLen - 2, ulCompressedData);
  205. if (bRet &&
  206. (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
  207. {
  208. wcscpy(pwcsAlt, pwcsTemp);
  209. *pulAltLen = ulTempLen - 2;
  210. g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
  211. *pfExistAlt = true;
  212. return;
  213. }
  214. Assert(pTerm->ulLen >= 3);
  215. Assert(ulTempLen >= pTerm->ulLen);
  216. if (ulTempLen == pTerm->ulLen)
  217. {
  218. break;
  219. }
  220. ulTempLen -= pTerm->ulLen;
  221. pwcsTemp[ulTempLen] = L'\0';
  222. bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
  223. if (bRet &&
  224. (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
  225. {
  226. wcscpy(pwcsAlt, pwcsTemp);
  227. *pulAltLen = ulTempLen - 2;
  228. *pfExistAlt = true;
  229. g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
  230. return;
  231. }
  232. }
  233. break;
  234. case TYPE13:
  235. {
  236. pwcsTemp[ulTempLen-1] = L'\0';
  237. bRet = Find(pwcsTemp, ulTempLen - 1, ulCompressedData);
  238. if (bRet &&
  239. (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
  240. {
  241. wcscpy(pwcsAlt, pwcsTemp);
  242. *pulAltLen = ulTempLen - 1;
  243. *pfExistAlt = true;
  244. g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
  245. return;
  246. }
  247. Assert(pTerm->ulLen >= 3);
  248. Assert(ulTempLen >= pTerm->ulLen);
  249. Assert(ulTempLen >= pTerm->ulLen);
  250. if (ulTempLen == pTerm->ulLen)
  251. {
  252. break;
  253. }
  254. ulTempLen -= pTerm->ulLen;
  255. pwcsTemp[ulTempLen] = L'\0';
  256. bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
  257. if (bRet &&
  258. (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
  259. {
  260. wcscpy(pwcsAlt, pwcsTemp);
  261. *pulAltLen = ulTempLen - 2;
  262. *pfExistAlt = true;
  263. g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
  264. return;
  265. }
  266. }
  267. break;
  268. }
  269. }
  270. sResCount--;
  271. }
  272. pwcsAlt[0] = L'\0';
  273. *pfExistAlt = false;
  274. }
  275. bool CSpanishDict::Find(WCHAR* pwcs, ULONG ulLen, ULONG& ulData)
  276. {
  277. bool bRet;
  278. if (ulLen <= COMPRESS_4_SIZE)
  279. {
  280. CompressDictItem4 Key;
  281. bRet = g_apSpanishUtil->CompressStr4(pwcs, ulLen, Key.ulStr);
  282. if (!bRet)
  283. {
  284. return false;
  285. }
  286. CompressDictItem4* pItem;
  287. pItem = BinaryFind<CompressDictItem4>(
  288. (CompressDictItem4*)m_vaDictItem4,
  289. m_ulDictItem4Count,
  290. Key);
  291. if (!pItem)
  292. {
  293. return false;
  294. }
  295. ulData = pItem->ulData;
  296. }
  297. else if (ulLen <= COMPRESS_8_SIZE)
  298. {
  299. CompressDictItem8 Key;
  300. bRet = g_apSpanishUtil->CompressStr8(pwcs, ulLen, Key.ullStr);
  301. if (!bRet)
  302. {
  303. return false;
  304. }
  305. CompressDictItem8* pItem;
  306. pItem = BinaryFind<CompressDictItem8>(
  307. (CompressDictItem8*)m_vaDictItem8,
  308. m_ulDictItem8Count,
  309. Key);
  310. if (!pItem)
  311. {
  312. return false;
  313. }
  314. ulData = pItem->ulData;
  315. }
  316. else
  317. {
  318. unsigned char psz[32];
  319. bool bRet;
  320. bRet = g_apSpanishUtil->ConvertToChar(pwcs, ulLen, psz, 32);
  321. if (!bRet)
  322. {
  323. return false;
  324. }
  325. PsudoCompressDictItemStr Key;
  326. Key.pszStr = psz;
  327. CompressDictItemStr* pItem;
  328. pItem = BinaryFind<CompressDictItemStr>(
  329. (CompressDictItemStr*)m_vaDictItemStr,
  330. m_ulDictItemStrCount,
  331. Key);
  332. if (!pItem)
  333. {
  334. return false;
  335. }
  336. ulData = pItem->ulData;
  337. }
  338. return true;
  339. }