Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

429 lines
9.4 KiB

////////////////////////////////////////////////////////////////////////////////
//
// Filename : SpanishUtils.h
// Purpose : General utilities for Spanish
//
// Project : WordBreakers
// Component: Spanish word breaker
//
// Author : yairh
//
// Log:
//
// Jun 20 2000 yairh creation
//
////////////////////////////////////////////////////////////////////////////////
  16. #ifndef _SPANISH_UTILS_H_
  17. #define _SPANISH_UTILS_H_
  18. #include "trie.h"
  19. #define TYPE1 1<<0
  20. #define TYPE2 1<<1
  21. #define TYPE3 1<<2
  22. #define TYPE4 1<<3
  23. #define TYPE5 1<<4
  24. #define TYPE6 1<<5
  25. #define TYPE7 1<<6
  26. #define TYPE8 1<<7
  27. #define TYPE9 1<<8
  28. #define TYPE10 1<<9
  29. #define TYPE11 1<<10
  30. #define TYPE12 1<<11
  31. #define TYPE13 1<<12
  32. #define TYPE14 1<<13
  33. #define TYPE15 1<<14
  34. #define TYPE16 1<<15
  35. #define COMPRESS_4_SIZE 6
  36. #define COMPRESS_8_SIZE 12
  37. class CSpanishUtil
  38. {
  39. public:
  40. CSpanishUtil::CSpanishUtil();
  41. int aiWcscmp(const WCHAR* p, const WCHAR* t);
  42. int aiStrcmp(const unsigned char* p, const unsigned char* t);
  43. int aiWcsncmp(const WCHAR* p, const WCHAR* t, const int iLen);
  44. void ReplaceAccent(WCHAR* pwcs, DWORD dwCompressedBuf)
  45. {
  46. WORD w = (WORD) dwCompressedBuf;
  47. BYTE bLoc;
  48. BYTE bc = 0;
  49. bc = (w & 0xF00) >> 8;
  50. if (bc)
  51. {
  52. bLoc = (w & 0xF000) >> 12;
  53. pwcs[bLoc] = (WCHAR)m_rReverseAccentConvert[bc];
  54. }
  55. bc = w & 0xF;
  56. if (bc)
  57. {
  58. bLoc = (w & 0xF0) >> 4;
  59. pwcs[bLoc] = (WCHAR) m_rReverseAccentConvert[bc];
  60. }
  61. }
  62. ULONG GetTypeFromCompressedData(DWORD dw)
  63. {
  64. return dw >> 16;
  65. }
  66. DWORD CompressData(
  67. ULONG ulType,
  68. BYTE bLoc1,
  69. BYTE bChar1,
  70. BYTE bLoc2,
  71. BYTE bChar2)
  72. {
  73. return (ulType << 16) | (bLoc1 << 12) | (bChar1 << 8) | (bLoc2 << 4) | (bChar2);
  74. }
  75. ULONG AddTypeToCompressedData(ULONG ul, ULONG ulType)
  76. {
  77. return (ul | (ulType << 16));
  78. }
  79. bool CompressStr4(WCHAR* pwcsStr, ULONG ulLen, ULONG& ulCompress)
  80. {
  81. //
  82. // each char is 5 bits
  83. //
  84. int iShift = 27;
  85. ulCompress = 0;
  86. ULONG ul = 0;
  87. while(ul < ulLen)
  88. {
  89. Assert(iShift>=0);
  90. if ((*pwcsStr > 0xFF) || (m_rCharCompress[*pwcsStr] == 0) )
  91. {
  92. return false;
  93. }
  94. ulCompress |= m_rCharCompress[*pwcsStr] << iShift;
  95. iShift -= 5;
  96. pwcsStr++;
  97. ul++;
  98. }
  99. return true;
  100. }
  101. bool CompressStr8(WCHAR* pwcsStr, ULONG ulLen, ULONGLONG& ullCompress)
  102. {
  103. //
  104. // each char is 5 bits
  105. //
  106. int iShift = 59;
  107. ullCompress = 0;
  108. ULONG ul = 0;
  109. while(ul < ulLen)
  110. {
  111. Assert(iShift>=0);
  112. if ((*pwcsStr > 0xFF) || m_rCharCompress[*pwcsStr] == 0 )
  113. {
  114. return false;
  115. }
  116. ullCompress |= ((ULONGLONG)m_rCharCompress[*pwcsStr]) << iShift;
  117. iShift -= 5;
  118. pwcsStr++;
  119. ul++;
  120. }
  121. return true;
  122. }
  123. bool ConvertToChar(const WCHAR* pwcs, const ULONG ulLen, unsigned char* pszOut, ULONG ulOutLen)
  124. {
  125. if (ulOutLen < ulLen + 1)
  126. {
  127. return false;
  128. }
  129. ULONG ul = 0;
  130. while (ul < ulLen)
  131. {
  132. if (*pwcs > 0xFF)
  133. {
  134. return false;
  135. }
  136. *pszOut = *((char*)pwcs);
  137. pszOut++;
  138. pwcs++;
  139. ul++;
  140. }
  141. *pszOut = '\0';
  142. return true;
  143. }
  144. public:
  145. //
  146. // members.
  147. //
  148. WCHAR m_rCharConvert[256];
  149. BYTE m_rCharCompress[256];
  150. char m_rAccentConvert[256];
  151. WCHAR m_rReverseAccentConvert[16];
  152. };
// Shared CSpanishUtil instance, defined in another translation unit.
// CToAccUpper and SpanishDictItem in this header reach the utility tables
// and packing helpers through it.
extern CAutoClassPointer<CSpanishUtil> g_apSpanishUtil;
  154. class CToAccUpper
  155. {
  156. public:
  157. static
  158. WCHAR
  159. MapToUpper(
  160. IN WCHAR wc
  161. )
  162. {
  163. if ( (wc & 0xff00) == 0 )
  164. {
  165. return ( g_apSpanishUtil->m_rCharConvert[wc] );
  166. }
  167. else
  168. {
  169. return ( towupper(wc) );
  170. } // if
  171. }
  172. };
  173. class SpanishDictItem
  174. {
  175. public:
  176. SpanishDictItem(ULONG ulW, WCHAR* pwcsW, ULONG ulAL, WCHAR* pwcsA, ULONG ulC, ULONG ulT)
  177. {
  178. m_fOwnMemory = true;
  179. Assert(ulW == ulAL);
  180. m_ulLen = ulW;
  181. m_pwcs = new WCHAR[ulW + 1];
  182. wcsncpy(m_pwcs, pwcsW, ulW);
  183. m_pwcs[ulW] = L'\0';
  184. m_pwcsAlt = new WCHAR[ulAL + 1];
  185. wcsncpy(m_pwcsAlt, pwcsA, ulAL);
  186. m_pwcsAlt[ulAL] = L'\0';
  187. m_ulAltLen = ulAL;
  188. m_ulCounter = ulC;
  189. m_ulType = ulT;
  190. WCHAR* p = pwcsW;
  191. BYTE i = 0;
  192. BYTE k = 0;
  193. BYTE r[4] = {0};
  194. while (*p)
  195. {
  196. if (*p != pwcsA[i])
  197. {
  198. Assert(k < 4);
  199. Assert(i < 16);
  200. Assert(
  201. g_apSpanishUtil->m_rCharConvert[*p] ==
  202. g_apSpanishUtil->m_rCharConvert[pwcsA[i]]);
  203. r[k] = i;
  204. r[k+1] = g_apSpanishUtil->m_rAccentConvert[pwcsA[i]];
  205. k+=2;
  206. }
  207. i++;
  208. p++;
  209. }
  210. m_dwCompress = g_apSpanishUtil->CompressData(m_ulType, r[0], r[1], r[2], r[3]);
  211. if (m_ulLen <= COMPRESS_4_SIZE)
  212. {
  213. bool b = g_apSpanishUtil->CompressStr4(m_pwcs, m_ulLen, m_ulStrCompress);
  214. Assert(b);
  215. }
  216. else if (m_ulLen <= COMPRESS_8_SIZE)
  217. {
  218. bool b = g_apSpanishUtil->CompressStr8(m_pwcs, m_ulLen, m_ullStrCompress);
  219. Assert(b);
  220. }
  221. }
  222. SpanishDictItem(WCHAR* pwcsBuf)
  223. {
  224. m_fOwnMemory = false;
  225. ULONG ul = wcslen(pwcsBuf);
  226. pwcsBuf[ul - 1] = L'\0';
  227. WCHAR* p = pwcsBuf;
  228. WCHAR* ppwcsParams[7];
  229. ppwcsParams[0] = p;
  230. int i = 1;
  231. while(*p)
  232. {
  233. if (*p == L';')
  234. {
  235. *p = L'\0';
  236. ppwcsParams[i] = p+1;
  237. i++;
  238. }
  239. p++;
  240. }
  241. m_pwcs = ppwcsParams[0];
  242. m_ulLen = _wtol(ppwcsParams[1]);
  243. m_pwcsAlt = ppwcsParams[2];
  244. m_ulAltLen = _wtol(ppwcsParams[3]);
  245. m_ulType = _wtol(ppwcsParams[4]);
  246. m_dwCompress = _wtol(ppwcsParams[5]);
  247. if (m_ulLen <= COMPRESS_4_SIZE)
  248. {
  249. m_ulStrCompress = _wtol(ppwcsParams[6]);
  250. }
  251. else if (m_ulLen <= COMPRESS_8_SIZE)
  252. {
  253. m_ullStrCompress = _wtoi64(ppwcsParams[6]);
  254. }
  255. }
  256. ~SpanishDictItem()
  257. {
  258. if (m_fOwnMemory)
  259. {
  260. delete[] m_pwcs;
  261. delete[] m_pwcsAlt;
  262. }
  263. }
  264. void AddType(ULONG ulType)
  265. {
  266. m_ulType |= ulType;
  267. m_dwCompress = g_apSpanishUtil->AddTypeToCompressedData(m_dwCompress, ulType);
  268. }
  269. int Serialize(WCHAR* pwcsBuf)
  270. {
  271. if (m_ulLen <= COMPRESS_4_SIZE)
  272. {
  273. return swprintf(
  274. pwcsBuf,
  275. L"%s;%d;%s;%d;%d;%u;%u\n",
  276. m_pwcs,
  277. m_ulLen,
  278. m_pwcsAlt,
  279. m_ulAltLen,
  280. m_ulType,
  281. m_dwCompress,
  282. m_ulStrCompress);
  283. }
  284. else if (m_ulLen <= COMPRESS_8_SIZE)
  285. {
  286. return swprintf(
  287. pwcsBuf,
  288. L"%s;%d;%s;%d;%d;%u;%I64u\n",
  289. m_pwcs,
  290. m_ulLen,
  291. m_pwcsAlt,
  292. m_ulAltLen,
  293. m_ulType,
  294. m_dwCompress,
  295. m_ullStrCompress);
  296. }
  297. return swprintf(
  298. pwcsBuf,
  299. L"%s;%d;%s;%d;%d;%u;0\n",
  300. m_pwcs,
  301. m_ulLen,
  302. m_pwcsAlt,
  303. m_ulAltLen,
  304. m_ulType,
  305. m_dwCompress);
  306. }
  307. ULONG m_ulLen;
  308. WCHAR* m_pwcs;
  309. ULONG m_ulAltLen;
  310. WCHAR* m_pwcsAlt;
  311. ULONG m_ulCounter;
  312. ULONG m_ulType;
  313. DWORD m_dwCompress;
  314. ULONG m_ulStrCompress;
  315. ULONGLONG m_ullStrCompress;
  316. bool m_fOwnMemory;
  317. };
  318. class CStandardCFile
  319. {
  320. public:
  321. CStandardCFile(WCHAR *pwcsFileName, WCHAR *pwcsMode, bool fThrowExcptionOn = true)
  322. {
  323. char pszBuf[MAX_PATH];
  324. wcstombs(pszBuf, pwcsFileName, MAX_PATH);
  325. char pszMode[10];
  326. wcstombs(pszMode, pwcsMode, 10);
  327. m_pFile = fopen(pszBuf, pszMode);
  328. if (! m_pFile && fThrowExcptionOn)
  329. {
  330. throw CGenericException(L"Could not open file");
  331. }
  332. }
  333. ~CStandardCFile()
  334. {
  335. if (m_pFile)
  336. {
  337. fclose(m_pFile);
  338. }
  339. }
  340. operator FILE*()
  341. {
  342. return m_pFile;
  343. }
  344. protected:
  345. FILE *m_pFile;
  346. };
//
// One suffix table entry; the element type of g_rSpanishSuffix and the item
// type stored in CSpanishSuffixDict::m_SuffixTrie.
//
struct CSuffixTerm
{
    WCHAR* pwcs;    // suffix text
    ULONG ulLen;    // presumably the length of pwcs in characters - TODO confirm
    ULONG ulCut;    // NOTE(review): looks like the number of characters to cut from the word - confirm
    ULONG ulType;   // presumably a combination of the TYPE* flags - TODO confirm
};
// Table of suffix entries, defined in another translation unit. The array
// size is not visible through this declaration.
extern const CSuffixTerm g_rSpanishSuffix[] ;
//
// Holds a trie of CSuffixTerm entries that uses CToAccUpper as its character
// mapping policy.
//
class CSpanishSuffixDict
{
public:
    // Defined out of line; presumably loads g_rSpanishSuffix into the trie -
    // NOTE(review): confirm against the implementation file.
    CSpanishSuffixDict();

    CTrie<CSuffixTerm, CToAccUpper> m_SuffixTrie;
};
  361. #endif // _SPANISH_UTILS_H_