Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

426 lines
9.7 KiB

  1. ////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // Filename : SpanishUtils.h
  4. // Purpose : General utilities for Spanish
  5. //
  6. // Project : WordBreakers
  7. // Component: Spanish word breaker
  8. //
  9. // Author : yairh
  10. //
  11. // Log:
  12. //
  13. // Jun 20 2000 yairh creation
  14. //
  15. ////////////////////////////////////////////////////////////////////////////////
  16. #ifndef _SPANISH_UTILS_H_
  17. #define _SPANISH_UTILS_H_
  18. #include "trie.h"
  //
  // Word type bit flags. Each TYPEn is a distinct single bit so that several
  // types can be OR-ed together into one mask.
  //
  // The expansions are parenthesised so that the macros behave as a single
  // value in any expression (for example ~TYPE2, TYPE1 + TYPE2, TYPE3 * 2);
  // without the parentheses the shift binds more loosely than unary and
  // arithmetic operators and yields a different result.
  //
  #define TYPE1 (1<<0)
  #define TYPE2 (1<<1)
  #define TYPE3 (1<<2)
  #define TYPE4 (1<<3)
  #define TYPE5 (1<<4)
  #define TYPE6 (1<<5)
  #define TYPE7 (1<<6)
  #define TYPE8 (1<<7)
  #define TYPE9 (1<<8)
  #define TYPE10 (1<<9)
  #define TYPE11 (1<<10)
  #define TYPE12 (1<<11)
  #define TYPE13 (1<<12)
  #define TYPE14 (1<<13)
  #define TYPE15 (1<<14)
  #define TYPE16 (1<<15)

  //
  // Longest strings (in characters) that are packed into a 4-byte and an
  // 8-byte value respectively, at 5 bits per character.
  //
  #define COMPRESS_4_SIZE 6
  #define COMPRESS_8_SIZE 12
  37. class CSpanishUtil
  38. {
  39. public:
  40. CSpanishUtil::CSpanishUtil();
  41. int aiWcscmp(const WCHAR* p, const WCHAR* t);
  42. int aiStrcmp(const unsigned char* p, const unsigned char* t);
  43. int aiWcsncmp(const WCHAR* p, const WCHAR* t, const int iLen);
  44. void ReplaceAccent(WCHAR* pwcs, ULONG ulLen, DWORD dwCompressedBuf)
  45. {
  46. WORD w = (WORD) dwCompressedBuf;
  47. BYTE bLoc;
  48. BYTE bc = 0;
  49. bc = (w & 0xF00) >> 8;
  50. if (bc)
  51. {
  52. bLoc = (w & 0xF000) >> 12;
  53. Assert(bLoc < ulLen);
  54. pwcs[bLoc] = (WCHAR)m_rReverseAccentConvert[bc];
  55. }
  56. bc = w & 0xF;
  57. if (bc)
  58. {
  59. bLoc = (w & 0xF0) >> 4;
  60. Assert(bLoc < ulLen);
  61. pwcs[bLoc] = (WCHAR) m_rReverseAccentConvert[bc];
  62. }
  63. }
  64. ULONG GetTypeFromCompressedData(DWORD dw)
  65. {
  66. return dw >> 16;
  67. }
  68. DWORD CompressData(
  69. ULONG ulType,
  70. BYTE bLoc1,
  71. BYTE bChar1,
  72. BYTE bLoc2,
  73. BYTE bChar2)
  74. {
  75. return (ulType << 16) | (bLoc1 << 12) | (bChar1 << 8) | (bLoc2 << 4) | (bChar2);
  76. }
  77. ULONG AddTypeToCompressedData(ULONG ul, ULONG ulType)
  78. {
  79. return (ul | (ulType << 16));
  80. }
  81. bool CompressStr4(WCHAR* pwcsStr, ULONG ulLen, ULONG& ulCompress)
  82. {
  83. //
  84. // each char is 5 bits
  85. //
  86. int iShift = 27;
  87. ulCompress = 0;
  88. ULONG ul = 0;
  89. while(ul < ulLen)
  90. {
  91. Assert(iShift>=0);
  92. if ((*pwcsStr > 0xFF) || (m_rCharCompress[*pwcsStr] == 0) )
  93. {
  94. return false;
  95. }
  96. ulCompress |= m_rCharCompress[*pwcsStr] << iShift;
  97. iShift -= 5;
  98. pwcsStr++;
  99. ul++;
  100. }
  101. return true;
  102. }
  103. bool CompressStr8(WCHAR* pwcsStr, ULONG ulLen, ULONGLONG& ullCompress)
  104. {
  105. //
  106. // each char is 5 bits
  107. //
  108. int iShift = 59;
  109. ullCompress = 0;
  110. ULONG ul = 0;
  111. while(ul < ulLen)
  112. {
  113. Assert(iShift>=0);
  114. if ((*pwcsStr > 0xFF) || m_rCharCompress[*pwcsStr] == 0 )
  115. {
  116. return false;
  117. }
  118. ullCompress |= ((ULONGLONG)m_rCharCompress[*pwcsStr]) << iShift;
  119. iShift -= 5;
  120. pwcsStr++;
  121. ul++;
  122. }
  123. return true;
  124. }
  125. bool ConvertToChar(const WCHAR* pwcs, const ULONG ulLen, unsigned char* pszOut, ULONG ulOutLen)
  126. {
  127. if (ulOutLen < ulLen + 1)
  128. {
  129. return false;
  130. }
  131. ULONG ul = 0;
  132. while (ul < ulLen)
  133. {
  134. if (*pwcs > 0xFF)
  135. {
  136. return false;
  137. }
  138. *pszOut = *((char*)pwcs);
  139. pszOut++;
  140. pwcs++;
  141. ul++;
  142. }
  143. *pszOut = '\0';
  144. return true;
  145. }
  146. public:
  147. //
  148. // members.
  149. //
  150. WCHAR m_rCharConvert[256];
  151. BYTE m_rCharCompress[256];
  152. char m_rAccentConvert[256];
  153. WCHAR m_rReverseAccentConvert[16];
  154. };
  155. extern CAutoClassPointer<CSpanishUtil> g_apSpanishUtil;
  156. class CToAccUpper
  157. {
  158. public:
  159. static
  160. WCHAR
  161. MapToUpper(
  162. IN WCHAR wc
  163. )
  164. {
  165. if ( (wc & 0xff00) == 0 )
  166. {
  167. return ( g_apSpanishUtil->m_rCharConvert[wc] );
  168. }
  169. else
  170. {
  171. return ( towupper(wc) );
  172. } // if
  173. }
  174. };
  175. class SpanishDictItem
  176. {
  177. public:
  178. SpanishDictItem(ULONG ulW, WCHAR* pwcsW, ULONG ulAL, WCHAR* pwcsA, ULONG ulC, ULONG ulT)
  179. {
  180. m_fOwnMemory = true;
  181. Assert(ulW == ulAL);
  182. m_ulLen = ulW;
  183. m_pwcs = new WCHAR[ulW + 1];
  184. wcsncpy(m_pwcs, pwcsW, ulW);
  185. m_pwcs[ulW] = L'\0';
  186. m_pwcsAlt = new WCHAR[ulAL + 1];
  187. wcsncpy(m_pwcsAlt, pwcsA, ulAL);
  188. m_pwcsAlt[ulAL] = L'\0';
  189. m_ulAltLen = ulAL;
  190. m_ulCounter = ulC;
  191. m_ulType = ulT;
  192. WCHAR* p = pwcsW;
  193. BYTE i = 0;
  194. BYTE k = 0;
  195. BYTE r[4] = {0};
  196. while (*p)
  197. {
  198. if (*p != pwcsA[i])
  199. {
  200. Assert(k < 4);
  201. Assert(i < 16);
  202. Assert(
  203. g_apSpanishUtil->m_rCharConvert[*p] ==
  204. g_apSpanishUtil->m_rCharConvert[pwcsA[i]]);
  205. r[k] = i;
  206. r[k+1] = g_apSpanishUtil->m_rAccentConvert[pwcsA[i]];
  207. k+=2;
  208. }
  209. i++;
  210. p++;
  211. }
  212. m_dwCompress = g_apSpanishUtil->CompressData(m_ulType, r[0], r[1], r[2], r[3]);
  213. if (m_ulLen <= COMPRESS_4_SIZE)
  214. {
  215. bool b = g_apSpanishUtil->CompressStr4(m_pwcs, m_ulLen, m_ulStrCompress);
  216. Assert(b);
  217. }
  218. else if (m_ulLen <= COMPRESS_8_SIZE)
  219. {
  220. bool b = g_apSpanishUtil->CompressStr8(m_pwcs, m_ulLen, m_ullStrCompress);
  221. Assert(b);
  222. }
  223. }
  224. SpanishDictItem(WCHAR* pwcsBuf)
  225. {
  226. m_fOwnMemory = false;
  227. ULONG ul = wcslen(pwcsBuf);
  228. pwcsBuf[ul - 1] = L'\0';
  229. WCHAR* p = pwcsBuf;
  230. WCHAR* ppwcsParams[7];
  231. ppwcsParams[0] = p;
  232. int i = 1;
  233. while(*p)
  234. {
  235. if (*p == L';')
  236. {
  237. *p = L'\0';
  238. ppwcsParams[i] = p+1;
  239. i++;
  240. }
  241. p++;
  242. }
  243. m_pwcs = ppwcsParams[0];
  244. m_ulLen = _wtol(ppwcsParams[1]);
  245. m_pwcsAlt = ppwcsParams[2];
  246. m_ulAltLen = _wtol(ppwcsParams[3]);
  247. m_ulType = _wtol(ppwcsParams[4]);
  248. m_dwCompress = _wtol(ppwcsParams[5]);
  249. if (m_ulLen <= COMPRESS_4_SIZE)
  250. {
  251. m_ulStrCompress = _wtol(ppwcsParams[6]);
  252. }
  253. else if (m_ulLen <= COMPRESS_8_SIZE)
  254. {
  255. m_ullStrCompress = _wtoi64(ppwcsParams[6]);
  256. }
  257. }
  258. ~SpanishDictItem()
  259. {
  260. if (m_fOwnMemory)
  261. {
  262. delete[] m_pwcs;
  263. delete[] m_pwcsAlt;
  264. }
  265. }
  266. void AddType(ULONG ulType)
  267. {
  268. m_ulType |= ulType;
  269. m_dwCompress = g_apSpanishUtil->AddTypeToCompressedData(m_dwCompress, ulType);
  270. }
  271. int Serialize(WCHAR* pwcsBuf)
  272. {
  273. if (m_ulLen <= COMPRESS_4_SIZE)
  274. {
  275. return swprintf(
  276. pwcsBuf,
  277. L"%s;%d;%s;%d;%d;%u;%u\n",
  278. m_pwcs,
  279. m_ulLen,
  280. m_pwcsAlt,
  281. m_ulAltLen,
  282. m_ulType,
  283. m_dwCompress,
  284. m_ulStrCompress);
  285. }
  286. else if (m_ulLen <= COMPRESS_8_SIZE)
  287. {
  288. return swprintf(
  289. pwcsBuf,
  290. L"%s;%d;%s;%d;%d;%u;%I64u\n",
  291. m_pwcs,
  292. m_ulLen,
  293. m_pwcsAlt,
  294. m_ulAltLen,
  295. m_ulType,
  296. m_dwCompress,
  297. m_ullStrCompress);
  298. }
  299. return swprintf(
  300. pwcsBuf,
  301. L"%s;%d;%s;%d;%d;%u;0\n",
  302. m_pwcs,
  303. m_ulLen,
  304. m_pwcsAlt,
  305. m_ulAltLen,
  306. m_ulType,
  307. m_dwCompress);
  308. }
  309. ULONG m_ulLen;
  310. WCHAR* m_pwcs;
  311. ULONG m_ulAltLen;
  312. WCHAR* m_pwcsAlt;
  313. ULONG m_ulCounter;
  314. ULONG m_ulType;
  315. DWORD m_dwCompress;
  316. ULONG m_ulStrCompress;
  317. ULONGLONG m_ullStrCompress;
  318. bool m_fOwnMemory;
  319. };
  320. class CStandardCFile
  321. {
  322. public:
  323. CStandardCFile(WCHAR *pwcsFileName, WCHAR *pwcsMode, bool fThrowExcptionOn = true)
  324. {
  325. m_pFile = _wfopen(pwcsFileName, pwcsMode);
  326. if (! m_pFile && fThrowExcptionOn)
  327. {
  328. throw CGenericException(L"Could not open file");
  329. }
  330. }
  331. ~CStandardCFile()
  332. {
  333. if (m_pFile)
  334. {
  335. fclose(m_pFile);
  336. }
  337. }
  338. operator FILE*()
  339. {
  340. return m_pFile;
  341. }
  342. protected:
  343. FILE *m_pFile;
  344. };
  345. struct CSuffixTerm
  346. {
  347. WCHAR* pwcs;
  348. ULONG ulLen;
  349. ULONG ulCut;
  350. ULONG ulType;
  351. };
  352. extern const CSuffixTerm g_rSpanishSuffix[] ;
  353. class CSpanishSuffixDict
  354. {
  355. public:
  356. CSpanishSuffixDict();
  357. CTrie<CSuffixTerm, CToAccUpper> m_SuffixTrie;
  358. };
  359. #endif // _SPANISH_UTILS_H_