Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

385 lines
18 KiB

  1. ////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // Filename : Formats.cpp
  4. // Purpose : Global dictionaries
  5. //
  6. // Project : WordBreakers
  7. // Component: English word breaker
  8. //
  9. // Author : yairh
  10. //
  11. // Log:
  12. //
  13. // May 30 2000 yairh creation
  14. //
  15. ////////////////////////////////////////////////////////////////////////////////
  16. #include "base.h"
  17. #include "tokenizer.h"
  18. const CCliticsTerm g_aClitics[] =
  19. {
  20. { L"l\'", 2 , HEAD_MATCH_TRUNCATE},
  21. { L"l\x0a0\'", 3 , HEAD_MATCH_TRUNCATE},
  22. { L"d\'", 2 , HEAD_MATCH_TRUNCATE},
  23. { L"d\x0a0\'", 3 , HEAD_MATCH_TRUNCATE},
  24. { L"j\'", 2 , HEAD_MATCH_TRUNCATE},
  25. { L"j\x0a0\'", 3 , HEAD_MATCH_TRUNCATE},
  26. { L"m\'", 2 , HEAD_MATCH_TRUNCATE},
  27. { L"m\x0a0\'", 3 , HEAD_MATCH_TRUNCATE},
  28. { L"n\'", 2 , HEAD_MATCH_TRUNCATE},
  29. { L"n\x0a0\'", 3 , HEAD_MATCH_TRUNCATE},
  30. { L"s\'", 2 , HEAD_MATCH_TRUNCATE},
  31. { L"s\x0a0\'", 3 , HEAD_MATCH_TRUNCATE},
  32. { L"q\'", 2 , HEAD_MATCH_TRUNCATE},
  33. { L"q\x0a0\'", 3 , HEAD_MATCH_TRUNCATE},
  34. { L"t\'", 2 , HEAD_MATCH_TRUNCATE},
  35. { L"t\x0a0\'", 3 , HEAD_MATCH_TRUNCATE},
  36. { L"un\'", 3 , HEAD_MATCH_TRUNCATE},
  37. { L"un\x0a0\'", 4 , HEAD_MATCH_TRUNCATE},
  38. { L"nell\'", 5 , HEAD_MATCH_TRUNCATE},
  39. { L"nell\x0a0\'", 6 , HEAD_MATCH_TRUNCATE},
  40. { L"all\'", 4 , HEAD_MATCH_TRUNCATE},
  41. { L"all\x0a0\'", 5 , HEAD_MATCH_TRUNCATE},
  42. { L"dell\'", 5 , HEAD_MATCH_TRUNCATE},
  43. { L"dell\x0a0\'", 6 , HEAD_MATCH_TRUNCATE},
  44. { L"Sull\'", 5 , HEAD_MATCH_TRUNCATE},
  45. { L"Sull\x0a0\'", 6 , HEAD_MATCH_TRUNCATE},
  46. { L"tutt\'", 5 , HEAD_MATCH_TRUNCATE},
  47. { L"tutt\x0a0\'", 6 , HEAD_MATCH_TRUNCATE},
  48. { L"qu\'", 3 , HEAD_MATCH_TRUNCATE},
  49. { L"qu\x0a0\'", 4 , HEAD_MATCH_TRUNCATE},
  50. { L"\'s", 2 , TAIL_MATCH_TRUNCATE},
  51. { L"\'ll", 3 , TAIL_MATCH_TRUNCATE},
  52. { L"\'m", 2 , TAIL_MATCH_TRUNCATE},
  53. { L"\'ve", 3 , TAIL_MATCH_TRUNCATE},
  54. { L"\'re", 3 , TAIL_MATCH_TRUNCATE},
  55. { L"\'d", 2 , TAIL_MATCH_TRUNCATE},
  56. { L"-je", 3 , TAIL_MATCH_TRUNCATE},
  57. { L"-tu", 3 , TAIL_MATCH_TRUNCATE},
  58. { L"-il", 3 , TAIL_MATCH_TRUNCATE},
  59. { L"-elle", 5 , TAIL_MATCH_TRUNCATE},
  60. { L"-on", 3 , TAIL_MATCH_TRUNCATE},
  61. { L"-ils", 4 , TAIL_MATCH_TRUNCATE},
  62. { L"-elles", 6 , TAIL_MATCH_TRUNCATE},
  63. { L"-t-il", 5 , TAIL_MATCH_TRUNCATE},
  64. { L"-t-elle", 7 , TAIL_MATCH_TRUNCATE},
  65. { L"-t-on", 5 , TAIL_MATCH_TRUNCATE},
  66. { L"-t-ils", 6 , TAIL_MATCH_TRUNCATE},
  67. { L"-t-elles", 8 , TAIL_MATCH_TRUNCATE},
  68. { L"-t'", 3 , TAIL_MATCH_TRUNCATE},
  69. { L"-t'y", 4 , TAIL_MATCH_TRUNCATE},
  70. { L"-t'en", 5 , TAIL_MATCH_TRUNCATE},
  71. { L"-m'", 3 , TAIL_MATCH_TRUNCATE},
  72. { L"-m'y", 4 , TAIL_MATCH_TRUNCATE},
  73. { L"-m'en", 5 , TAIL_MATCH_TRUNCATE},
  74. { L"-l'", 3 , TAIL_MATCH_TRUNCATE},
  75. { L"-l'y", 4 , TAIL_MATCH_TRUNCATE},
  76. { L"-l'en", 5 , TAIL_MATCH_TRUNCATE},
  77. { L"-z-y", 4 , TAIL_MATCH_TRUNCATE},
  78. { L"-z-en", 5 , TAIL_MATCH_TRUNCATE},
  79. { L"-y", 2 , TAIL_MATCH_TRUNCATE},
  80. { L"'y", 2 , TAIL_MATCH_TRUNCATE},
  81. { L"-y-en", 5 , TAIL_MATCH_TRUNCATE},
  82. { L"-nous", 5 , TAIL_MATCH_TRUNCATE},
  83. { L"-nous-y", 7 , TAIL_MATCH_TRUNCATE},
  84. { L"-nous-en", 9 , TAIL_MATCH_TRUNCATE},
  85. { L"-vous", 5 , TAIL_MATCH_TRUNCATE},
  86. { L"-vous-y", 7 , TAIL_MATCH_TRUNCATE},
  87. { L"-vous-en", 8 , TAIL_MATCH_TRUNCATE},
  88. { L"-toi", 4 , TAIL_MATCH_TRUNCATE},
  89. { L"-toi-z-y", 8 , TAIL_MATCH_TRUNCATE},
  90. { L"-toi-z-en", 9 , TAIL_MATCH_TRUNCATE},
  91. { L"-moi", 4 , TAIL_MATCH_TRUNCATE},
  92. { L"-moi-z-y", 8 , TAIL_MATCH_TRUNCATE},
  93. { L"-moi-z-en", 9 , TAIL_MATCH_TRUNCATE},
  94. { L"-lui", 4 , TAIL_MATCH_TRUNCATE},
  95. { L"-lui-en", 7 , TAIL_MATCH_TRUNCATE},
  96. { L"-leur", 5 , TAIL_MATCH_TRUNCATE},
  97. { L"-leur-en", 8 , TAIL_MATCH_TRUNCATE},
  98. { L"-eux", 4 , TAIL_MATCH_TRUNCATE},
  99. { L"-en", 3 , TAIL_MATCH_TRUNCATE},
  100. { L"'en", 3 , TAIL_MATCH_TRUNCATE},
  101. { L"-la", 3 , TAIL_MATCH_TRUNCATE},
  102. { L"-la-leur", 8 , TAIL_MATCH_TRUNCATE},
  103. { L"-la-vous", 8 , TAIL_MATCH_TRUNCATE},
  104. { L"-la-nous", 8 , TAIL_MATCH_TRUNCATE},
  105. { L"-la-nous-y", 10 , TAIL_MATCH_TRUNCATE},
  106. { L"-la-lui", 7 , TAIL_MATCH_TRUNCATE},
  107. { L"-la-lui-en", 10 , TAIL_MATCH_TRUNCATE},
  108. { L"-la-toi", 7 , TAIL_MATCH_TRUNCATE},
  109. { L"-la-moi", 7 , TAIL_MATCH_TRUNCATE},
  110. { L"-la-moi-z-y", 11, TAIL_MATCH_TRUNCATE},
  111. { L"-la-moi-z-en", 12 , TAIL_MATCH_TRUNCATE},
  112. { L"-le", 3 , TAIL_MATCH_TRUNCATE},
  113. { L"-le-leur", 8 , TAIL_MATCH_TRUNCATE},
  114. { L"-le-vous", 8 , TAIL_MATCH_TRUNCATE},
  115. { L"-le-nous", 8 , TAIL_MATCH_TRUNCATE},
  116. { L"-le-nous-y", 10 , TAIL_MATCH_TRUNCATE},
  117. { L"-le-lui", 7 , TAIL_MATCH_TRUNCATE},
  118. { L"-le-lui-en", 10 , TAIL_MATCH_TRUNCATE},
  119. { L"-le-toi", 7 , TAIL_MATCH_TRUNCATE},
  120. { L"-le-moi", 7 , TAIL_MATCH_TRUNCATE},
  121. { L"-le-moi-z-y", 11, TAIL_MATCH_TRUNCATE},
  122. { L"-le-moi-z-en", 12 , TAIL_MATCH_TRUNCATE},
  123. { L"-les", 4 , TAIL_MATCH_TRUNCATE},
  124. { L"-les-leur", 9 , TAIL_MATCH_TRUNCATE},
  125. { L"-les-vous", 9 , TAIL_MATCH_TRUNCATE},
  126. { L"-les-nous", 9 , TAIL_MATCH_TRUNCATE},
  127. { L"-les-nous-y", 11 , TAIL_MATCH_TRUNCATE},
  128. { L"-les-lui", 8 , TAIL_MATCH_TRUNCATE},
  129. { L"-les-lui-en", 11 , TAIL_MATCH_TRUNCATE},
  130. { L"-les-toi", 8 , TAIL_MATCH_TRUNCATE},
  131. { L"-les-moi", 8 , TAIL_MATCH_TRUNCATE},
  132. { L"-les-moi-z-y", 12, TAIL_MATCH_TRUNCATE},
  133. { L"-les-moi-z-en", 13 , TAIL_MATCH_TRUNCATE},
  134. { L"-ce", 3 , TAIL_MATCH_TRUNCATE},
  135. { L"-cis", 4 , TAIL_MATCH_TRUNCATE},
  136. { L"-cies-l�", 8 , TAIL_MATCH_TRUNCATE},
  137. { L"-cies", 5 , TAIL_MATCH_TRUNCATE},
  138. { L"-cie", 4 , TAIL_MATCH_TRUNCATE},
  139. { L"-ci", 3 , TAIL_MATCH_TRUNCATE},
  140. { L"-l�", 3 , TAIL_MATCH_TRUNCATE},
  141. { L"-cis-l�", 7 , TAIL_MATCH_TRUNCATE},
  142. { L"-cies-ci", 8 , TAIL_MATCH_TRUNCATE},
  143. { L"-cie-l�", 7 , TAIL_MATCH_TRUNCATE},
  144. { L"\0", 0 , NON_MATCH_TRUNCATE}
  145. };
  146. const CCliticsTerm g_SClitics =
  147. { L"s\'", 1, TAIL_MATCH_TRUNCATE };
  148. const CCliticsTerm g_EmptyClitics =
  149. { L"\0", 0, NON_MATCH_TRUNCATE };
  150. const CDateTerm g_aDateFormatList[] =
  151. {
  152. // format len Type D_M1 D_M1 D_M2 D_M2 Year Year
  153. // offset len len offset len offset
  154. {L"#.#.##", 6, 0, 0, 1, 2, 1, 4, 2},
  155. {L"##.#.##", 7, 0, 0, 2, 3, 1, 5, 2},
  156. {L"#.##.##", 7, 0, 0, 1, 2, 2, 5, 2},
  157. {L"##.##.##", 8, 0, 0, 2, 3, 2, 6, 2},
  158. {L"#.#.###", 7, 0, 0, 1, 2, 1, 4, 3},
  159. {L"##.#.###", 8, 0, 0, 2, 3, 1, 5, 3},
  160. {L"#.##.###", 8, 0, 0, 1, 2, 2, 5, 3},
  161. {L"##.##.###", 9, 0, 0, 2, 3, 2, 6, 3},
  162. {L"#.#.####", 8, 0, 0, 1, 2, 1, 4, 4},
  163. {L"##.#.####", 9, 0, 0, 2, 3, 1, 5, 4},
  164. {L"#.##.####", 9, 0, 0, 1, 2, 2, 5, 4},
  165. {L"##.##.####", 10, 0, 0, 2, 3, 2, 6, 4},
  166. {L"###.#.#", 7, YYMMDD_TYPE, 6, 1, 4, 1, 0, 3},
  167. {L"###.##.#", 8, YYMMDD_TYPE, 7, 1, 4, 2, 0, 3},
  168. {L"###.#.##", 8, YYMMDD_TYPE, 6, 2, 4, 1, 0, 3},
  169. {L"###.##.##", 9, YYMMDD_TYPE, 7, 2, 4, 2, 0, 3},
  170. {L"####.#.#", 8, YYMMDD_TYPE, 7, 1, 5, 1, 0, 4},
  171. {L"####.##.#", 9, YYMMDD_TYPE, 8, 1, 5, 2, 0, 4},
  172. {L"####.#.##", 9, YYMMDD_TYPE, 7, 2, 5, 1, 0, 4},
  173. {L"####.##.##", 10, YYMMDD_TYPE, 8, 2, 5, 2, 0, 4},
  174. {L"\0", 0, 0,}
  175. };
  176. const CTimeTerm g_aTimeFormatList[] =
  177. {
  178. // format len hour hour min min sec sec AM/PM
  179. // offset len offset len offset len
  180. {L"#:#", 3, 0, 1, 2, 1, 0, 0, None },
  181. {L"##:#", 4, 0, 2, 3, 1, 0, 0, None },
  182. {L"#:##", 4, 0, 1, 2, 2, 0, 0, None },
  183. {L"##:##", 5, 0, 2, 3, 2, 0, 0, None },
  184. {L"#:#:#", 5, 0, 1, 2, 1, 4, 1, None },
  185. {L"#:#:##", 6, 0, 1, 2, 1, 4, 2, None },
  186. {L"##:#:#", 6, 0, 2, 3, 1, 5, 1, None },
  187. {L"##:#:##", 7, 0, 2, 3, 1, 5, 2, None },
  188. {L"#:##:#", 6, 0, 1, 2, 2, 5, 1, None },
  189. {L"#:##:##", 7, 0, 1, 2, 2, 5, 2, None },
  190. {L"##:##:#", 7, 0, 2, 3, 2, 6, 1, None },
  191. {L"##:##:##", 8, 0, 2, 3, 2, 6, 2, None },
  192. {L"#AM", 3, 0, 1, 0, 0, 0, 0, Am },
  193. {L"##AM", 4, 0, 2, 0, 0, 0, 0, Am },
  194. {L"#:#AM", 5, 0, 1, 2, 1, 0, 0, Am },
  195. {L"##:#AM", 6, 0, 2, 3, 1, 0, 0, Am },
  196. {L"#:##AM", 6, 0, 1, 2, 2, 0, 0, Am },
  197. {L"##:##AM", 7, 0, 2, 3, 2, 0, 0, Am },
  198. {L"#:#:#AM", 7, 0, 1, 2, 1, 4, 1, Am },
  199. {L"#:#:##AM", 8, 0, 1, 2, 1, 4, 2, Am },
  200. {L"##:#:#AM", 8, 0, 2, 3, 1, 5, 1, Am },
  201. {L"##:#:##AM", 9, 0, 2, 3, 1, 5, 2, Am },
  202. {L"#:##:#AM", 8, 0, 1, 2, 2, 5, 1, Am },
  203. {L"#:##:##AM", 9, 0, 1, 2, 2, 5, 2, Am },
  204. {L"##:##:#AM", 9, 0, 2, 3, 2, 6, 1, Am },
  205. {L"##:##:##AM", 10, 0, 2, 3, 2, 6, 2, Am },
  206. {L"#PM", 3, 0, 1, 0, 0, 0, 0, Pm },
  207. {L"##PM", 4, 0, 2, 0, 0, 0, 0, Pm },
  208. {L"#:#PM", 5, 0, 1, 2, 1, 0, 0, Pm },
  209. {L"##:#PM", 6, 0, 2, 3, 1, 0, 0, Pm },
  210. {L"#:##PM", 6, 0, 1, 2, 2, 0, 0, Pm },
  211. {L"##:##PM", 7, 0, 2, 3, 2, 0, 0, Pm },
  212. {L"#:#:#PM", 7, 0, 1, 2, 1, 4, 1, Pm },
  213. {L"#:#:##PM", 8, 0, 1, 2, 1, 4, 2, Pm },
  214. {L"##:#:#PM", 8, 0, 2, 3, 1, 5, 1, Pm },
  215. {L"##:#:##PM", 9, 0, 2, 3, 1, 5, 2, Pm },
  216. {L"#:##:#PM", 8, 0, 1, 2, 2, 5, 1, Pm },
  217. {L"#:##:##PM", 9, 0, 1, 2, 2, 5, 2, Pm },
  218. {L"##:##:#PM", 9, 0, 2, 3, 2, 6, 1, Pm },
  219. {L"##:##:##PM", 10, 0, 2, 3, 2, 6, 2, Pm },
  220. {L"#a.m", 4, 0, 1, 0, 0, 0, 0, Am },
  221. {L"##a.m", 5, 0, 2, 0, 0, 0, 0, Am },
  222. {L"#:#a.m", 6, 0, 1, 2, 1, 0, 0, Am },
  223. {L"##:#a.m", 7, 0, 2, 3, 1, 0, 0, Am },
  224. {L"#:##a.m", 7, 0, 1, 2, 2, 0, 0, Am },
  225. {L"##:##a.m", 8, 0, 2, 3, 2, 0, 0, Am },
  226. {L"#:#:#a.m", 8, 0, 1, 2, 1, 4, 1, Am },
  227. {L"#:#:##a.m", 9, 0, 1, 2, 1, 4, 2, Am },
  228. {L"##:#:#a.m", 9, 0, 2, 3, 1, 5, 1, Am },
  229. {L"##:#:##a.m", 10, 0, 2, 3, 1, 5, 2, Am },
  230. {L"#:##:#a.m", 9, 0, 1, 2, 2, 5, 1, Am },
  231. {L"#:##:##a.m", 10, 0, 1, 2, 2, 5, 2, Am },
  232. {L"##:##:#a.m", 10, 0, 2, 3, 2, 6, 1, Am },
  233. {L"##:##:##a.m", 11, 0, 2, 3, 2, 6, 2, Am },
  234. {L"#p.m", 4, 0, 1, 0, 0, 0, 0, Pm },
  235. {L"##p.m", 5, 0, 2, 0, 0, 0, 0, Pm },
  236. {L"#:#p.m", 6, 0, 1, 2, 1, 0, 0, Pm },
  237. {L"##:#p.m", 7, 0, 2, 3, 1, 0, 0, Pm },
  238. {L"#:##p.m", 7, 0, 1, 2, 2, 0, 0, Pm },
  239. {L"##:##p.m", 8, 0, 2, 3, 2, 0, 0, Pm },
  240. {L"#:#:#p.m", 8, 0, 1, 2, 1, 4, 1, Pm },
  241. {L"#:#:##p.m", 9, 0, 1, 2, 1, 4, 2, Pm },
  242. {L"##:#:#p.m", 9, 0, 2, 3, 1, 5, 1, Pm },
  243. {L"##:#:##p.m", 10, 0, 2, 3, 1, 5, 2, Pm },
  244. {L"#:##:#p.m", 9, 0, 1, 2, 2, 5, 1, Pm },
  245. {L"#:##:##p.m", 10, 0, 1, 2, 2, 5, 2, Pm },
  246. {L"##:##:#p.m", 10, 0, 2, 3, 2, 6, 1, Pm },
  247. {L"##:##:##p.m", 11, 0, 2, 3, 2, 6, 2, Pm },
  248. {L"#H", 2, 0, 1, 0, 0, 0, 0, None },
  249. {L"##H", 3, 0, 2, 0, 0, 0, 0, None },
  250. {L"#H#", 3, 0, 1, 2, 1, 0, 0, None },
  251. {L"##H#", 4, 0, 2, 3, 1, 0, 0, None },
  252. {L"#H##", 4, 0, 1, 2, 2, 0, 0, None },
  253. {L"##H##", 5, 0, 2, 3, 2, 0, 0, None },
  254. {L"\0", 0, 0, 0, 0, 0, 0, 0, None },
  255. };
  256. CAutoClassPointer<CClitics> g_pClitics;
  257. CAutoClassPointer<CSpecialAbbreviationSet> g_pEngAbbList;
  258. CAutoClassPointer<CSpecialAbbreviationSet> g_pFrnAbbList;
  259. CAutoClassPointer<CSpecialAbbreviationSet> g_pItlAbbList;
  260. CAutoClassPointer<CSpecialAbbreviationSet> g_pSpnAbbList;
  261. CAutoClassPointer<CDateFormat> g_pDateFormat;
  262. CAutoClassPointer<CTimeFormat> g_pTimeFormat;
  263. CClitics::CClitics()
  264. {
  265. DictStatus status;
  266. WCHAR* pTerm;
  267. int i;
  268. for (i = 0, pTerm = g_aClitics[i].pwcs;
  269. *pTerm != L'\0';
  270. i++, pTerm = g_aClitics[i].pwcs)
  271. {
  272. status = m_trieClitics.trie_Insert(
  273. pTerm,
  274. TRIE_IGNORECASE,
  275. const_cast<CCliticsTerm*>(&g_aClitics[i]),
  276. NULL);
  277. Assert (DICT_SUCCESS == status);
  278. }
  279. }
  280. CSpecialAbbreviationSet::CSpecialAbbreviationSet(const CAbbTerm* pAbbTermList)
  281. {
  282. DictStatus status;
  283. WCHAR* pTerm;
  284. int i;
  285. for (i = 0, pTerm = pAbbTermList[i].pwcsAbb;
  286. *pTerm != L'\0';
  287. i++, pTerm = pAbbTermList[i].pwcsAbb)
  288. {
  289. status = m_trieAbb.trie_Insert(
  290. pTerm,
  291. TRIE_IGNORECASE,
  292. const_cast<CAbbTerm*>(&pAbbTermList[i]),
  293. NULL);
  294. Assert (DICT_SUCCESS == status);
  295. }
  296. }
  297. CDateFormat::CDateFormat()
  298. {
  299. DictStatus status;
  300. WCHAR* pTerm;
  301. int i;
  302. for (i = 0, pTerm = g_aDateFormatList[i].pwcsFormat;
  303. *pTerm != L'\0';
  304. i++, pTerm = g_aDateFormatList[i].pwcsFormat)
  305. {
  306. status = m_trieDateFormat.trie_Insert(
  307. pTerm,
  308. TRIE_IGNORECASE,
  309. const_cast<CDateTerm*>(&g_aDateFormatList[i]),
  310. NULL);
  311. Assert (DICT_SUCCESS == status);
  312. }
  313. }
  314. CTimeFormat::CTimeFormat()
  315. {
  316. DictStatus status;
  317. WCHAR* pTerm;
  318. int i;
  319. for (i = 0, pTerm = g_aTimeFormatList[i].pwcsFormat;
  320. *pTerm != L'\0';
  321. i++, pTerm = g_aTimeFormatList[i].pwcsFormat)
  322. {
  323. status = m_trieTimeFormat.trie_Insert(
  324. pTerm,
  325. TRIE_IGNORECASE,
  326. const_cast<CTimeTerm*>(&g_aTimeFormatList[i]),
  327. NULL);
  328. Assert (DICT_SUCCESS == status);
  329. }
  330. }