Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

393 lines
11 KiB

  1. #include "base.h"
  2. #include "SpanishDict.h"
  3. #define MAX_WORD_LEN 128
  4. CSpanishDict::CSpanishDict(WCHAR* pwcsInitFilePath) :
  5. m_vaDictItem4(DICT_4_INIT_SIZE),
  6. m_vaDictItem8(DICT_8_INIT_SIZE),
  7. m_vaDictItemStr(DICT_STR_INIT_SIZE),
  8. m_ulDictItem4Count(0),
  9. m_ulDictItem8Count(0),
  10. m_ulDictItemStrCount(0)
  11. {
  12. m_apSpanishSuffix = new CSpanishSuffixDict();
  13. CStandardCFile Words(pwcsInitFilePath, L"r");
  14. WCHAR pwcsBuf[MAX_WORD_LEN];
  15. DictStatus status;
  16. while(fgetws(pwcsBuf, MAX_WORD_LEN, (FILE*) Words))
  17. {
  18. if (pwcsBuf[0] == L'\n')
  19. {
  20. continue;
  21. }
  22. SpanishDictItem pItem(pwcsBuf);
  23. if (pItem.m_ulLen <= COMPRESS_4_SIZE)
  24. {
  25. m_vaDictItem4[m_ulDictItem4Count].ulStr = pItem.m_ulStrCompress;
  26. m_vaDictItem4[m_ulDictItem4Count].ulData = pItem.m_dwCompress;
  27. m_ulDictItem4Count++;
  28. }
  29. else if (pItem.m_ulLen <= COMPRESS_8_SIZE)
  30. {
  31. m_vaDictItem8[m_ulDictItem8Count].ullStr = pItem.m_ullStrCompress;
  32. m_vaDictItem8[m_ulDictItem8Count].ulData = pItem.m_dwCompress;
  33. m_ulDictItem8Count++;
  34. }
  35. else
  36. {
  37. m_vaDictItemStr[m_ulDictItemStrCount].pszStr = new unsigned char[pItem.m_ulLen + 1];
  38. bool bRet;
  39. bRet = g_apSpanishUtil->ConvertToChar(
  40. pItem.m_pwcs,
  41. pItem.m_ulLen,
  42. m_vaDictItemStr[m_ulDictItemStrCount].pszStr,
  43. pItem.m_ulLen + 1);
  44. Assert(bRet);
  45. m_vaDictItemStr[m_ulDictItemStrCount].ulData = pItem.m_dwCompress;
  46. m_ulDictItemStrCount++;
  47. }
  48. }
  49. }
  50. void CSpanishDict::BreakWord(
  51. ULONG ulLen,
  52. WCHAR* pwcsWord,
  53. bool* pfExistAlt,
  54. ULONG* pulAltLen,
  55. WCHAR* pwcsAlt)
  56. {
  57. *pfExistAlt = false;
  58. if (ulLen <= 2)
  59. {
  60. return;
  61. }
  62. //
  63. // very fast heuristic to find non breakable words
  64. //
  65. if (pwcsWord[ulLen - 1] != L'e' &&
  66. pwcsWord[ulLen - 1] != L's' &&
  67. pwcsWord[ulLen - 2] != L'l')
  68. {
  69. return;
  70. }
  71. DictStatus status;
  72. short sResCount;
  73. WCHAR pwcsBuf[MAX_WORD_LEN];
  74. WCHAR* pwcs = pwcsWord;
  75. ULONG ul = ulLen;
  76. pwcsBuf[ul] = L'\0';
  77. while (ul > 0)
  78. {
  79. pwcsBuf[ul - 1] = *pwcs;
  80. ul--;
  81. pwcs++;
  82. }
  83. CSuffixTerm* prTerm[10];
  84. status = m_apSpanishSuffix->m_SuffixTrie.trie_Find(
  85. pwcsBuf,
  86. TRIE_ALL_MATCHES | TRIE_IGNORECASE,
  87. 10,
  88. prTerm,
  89. &sResCount);
  90. WCHAR pwcsTemp[MAX_WORD_LEN];
  91. ULONG ulTempLen;
  92. while (sResCount > 0)
  93. {
  94. CSuffixTerm* pTerm = prTerm[sResCount - 1];
  95. Assert(ulLen < MAX_WORD_LEN);
  96. wcsncpy(pwcsTemp, pwcsWord, ulLen);
  97. pwcsTemp[ulLen] = L'\0';
  98. ulTempLen = ulLen;
  99. bool bRet;
  100. ULONG ulCompressedData;
  101. if (!(pTerm->ulType & (TYPE11 | TYPE12 | TYPE13 |TYPE14)))
  102. {
  103. Assert(ulLen >= pTerm->ulCut);
  104. if (ulLen == pTerm->ulCut)
  105. {
  106. sResCount--;
  107. continue;
  108. }
  109. pwcsTemp[ulLen - pTerm->ulCut] = L'\0';
  110. ulTempLen = ulLen - pTerm->ulCut;
  111. bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
  112. if (pTerm->ulType == TYPE1 && (!bRet))
  113. {
  114. pwcsTemp[ulTempLen] = L's';
  115. pwcsTemp[ulTempLen + 1] = L'\0';
  116. bRet = Find(pwcsTemp, ulTempLen + 1, ulCompressedData);
  117. }
  118. if ( (!bRet) ||
  119. (!(g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType)))
  120. {
  121. sResCount--;
  122. continue;
  123. }
  124. *pfExistAlt = true;
  125. wcscpy(pwcsAlt, pwcsTemp);
  126. *pulAltLen = ulTempLen;
  127. g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
  128. switch (pTerm->ulType)
  129. {
  130. case TYPE1:
  131. return;
  132. case TYPE2:
  133. *pulAltLen += 3;
  134. wcscat(pwcsAlt, L"ndo");
  135. return;
  136. case TYPE3:
  137. *pulAltLen += 1;
  138. wcscat(pwcsAlt, L"n");
  139. return;
  140. case TYPE4:
  141. *pulAltLen += 3;
  142. wcscat(pwcsAlt, L"mos");
  143. return;
  144. case TYPE5:
  145. *pulAltLen += 1;
  146. wcscat(pwcsAlt, L"d");
  147. return;
  148. case TYPE6:
  149. *pulAltLen += 1;
  150. wcscat(pwcsAlt, L"r");
  151. return;
  152. case TYPE7:
  153. case TYPE8:
  154. case TYPE9:
  155. case TYPE10:
  156. case TYPE15:
  157. case TYPE16:
  158. return;
  159. default:
  160. Assert(false);
  161. }
  162. }
  163. else
  164. {
  165. *pfExistAlt = true;
  166. switch (pTerm->ulType)
  167. {
  168. case TYPE11:
  169. {
  170. Assert(ulTempLen >= pTerm->ulLen);
  171. if (ulTempLen == pTerm->ulLen)
  172. {
  173. break;
  174. }
  175. pwcsTemp[ulTempLen - pTerm->ulLen] = L'\0';
  176. ulTempLen -= pTerm->ulLen;
  177. bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
  178. if (bRet &&
  179. (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
  180. {
  181. wcscpy(pwcsAlt, pwcsTemp);
  182. *pulAltLen = ulTempLen;
  183. g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
  184. *pfExistAlt = true;
  185. return;
  186. }
  187. }
  188. break;
  189. case TYPE12:
  190. case TYPE14:
  191. {
  192. pwcsTemp[ulTempLen-3] = L's'; // removing the no form the nos
  193. pwcsTemp[ulTempLen-2] = L'\0';
  194. bRet = Find(pwcsTemp, ulTempLen - 2, ulCompressedData);
  195. if (bRet &&
  196. (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
  197. {
  198. wcscpy(pwcsAlt, pwcsTemp);
  199. g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
  200. *pulAltLen = ulTempLen - 2;
  201. *pfExistAlt = true;
  202. return;
  203. }
  204. Assert(pTerm->ulLen >= 3);
  205. Assert(ulTempLen >= pTerm->ulLen);
  206. if (ulTempLen == pTerm->ulLen)
  207. {
  208. break;
  209. }
  210. ulTempLen -= pTerm->ulLen;
  211. pwcsTemp[ulTempLen] = L'\0';
  212. bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
  213. if (bRet &&
  214. (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
  215. {
  216. wcscpy(pwcsAlt, pwcsTemp);
  217. g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
  218. *pulAltLen = ulTempLen - 2;
  219. *pfExistAlt = true;
  220. return;
  221. }
  222. }
  223. break;
  224. case TYPE13:
  225. {
  226. pwcsTemp[ulTempLen-1] = L'\0';
  227. bRet = Find(pwcsTemp, ulTempLen - 1, ulCompressedData);
  228. if (bRet &&
  229. (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
  230. {
  231. wcscpy(pwcsAlt, pwcsTemp);
  232. g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
  233. *pulAltLen = ulTempLen - 1;
  234. *pfExistAlt = true;
  235. return;
  236. }
  237. Assert(pTerm->ulLen >= 3);
  238. Assert(ulTempLen >= pTerm->ulLen);
  239. Assert(ulTempLen >= pTerm->ulLen);
  240. if (ulTempLen == pTerm->ulLen)
  241. {
  242. break;
  243. }
  244. ulTempLen -= pTerm->ulLen;
  245. pwcsTemp[ulTempLen] = L'\0';
  246. bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
  247. if (bRet &&
  248. (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
  249. {
  250. wcscpy(pwcsAlt, pwcsTemp);
  251. g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
  252. *pulAltLen = ulTempLen - 2;
  253. *pfExistAlt = true;
  254. return;
  255. }
  256. }
  257. break;
  258. }
  259. }
  260. sResCount--;
  261. }
  262. pwcsAlt[0] = L'\0';
  263. *pfExistAlt = false;
  264. }
  265. bool CSpanishDict::Find(WCHAR* pwcs, ULONG ulLen, ULONG& ulData)
  266. {
  267. bool bRet;
  268. if (ulLen <= COMPRESS_4_SIZE)
  269. {
  270. CompressDictItem4 Key;
  271. bRet = g_apSpanishUtil->CompressStr4(pwcs, ulLen, Key.ulStr);
  272. if (!bRet)
  273. {
  274. return false;
  275. }
  276. CompressDictItem4* pItem;
  277. pItem = BinaryFind<CompressDictItem4>(
  278. (CompressDictItem4*)m_vaDictItem4,
  279. m_ulDictItem4Count,
  280. Key);
  281. if (!pItem)
  282. {
  283. return false;
  284. }
  285. ulData = pItem->ulData;
  286. }
  287. else if (ulLen <= COMPRESS_8_SIZE)
  288. {
  289. CompressDictItem8 Key;
  290. bRet = g_apSpanishUtil->CompressStr8(pwcs, ulLen, Key.ullStr);
  291. if (!bRet)
  292. {
  293. return false;
  294. }
  295. CompressDictItem8* pItem;
  296. pItem = BinaryFind<CompressDictItem8>(
  297. (CompressDictItem8*)m_vaDictItem8,
  298. m_ulDictItem8Count,
  299. Key);
  300. if (!pItem)
  301. {
  302. return false;
  303. }
  304. ulData = pItem->ulData;
  305. }
  306. else
  307. {
  308. unsigned char psz[32];
  309. bool bRet;
  310. bRet = g_apSpanishUtil->ConvertToChar(pwcs, ulLen, psz, 32);
  311. if (!bRet)
  312. {
  313. return false;
  314. }
  315. PsudoCompressDictItemStr Key;
  316. Key.pszStr = psz;
  317. CompressDictItemStr* pItem;
  318. pItem = BinaryFind<CompressDictItemStr>(
  319. (CompressDictItemStr*)m_vaDictItemStr,
  320. m_ulDictItemStrCount,
  321. Key);
  322. if (!pItem)
  323. {
  324. return false;
  325. }
  326. ulData = pItem->ulData;
  327. }
  328. return true;
  329. }