Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

322 lines
8.2 KiB

  1. ////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // Filename : Tokenizer.cpp
  4. // Purpose : Tokenizer declarations
  5. //
  6. // Project : WordBreakers
  7. // Component: English word breaker
  8. //
  9. // Author : yairh
  10. //
  11. // Log:
  12. //
  13. // Jan 06 2000 yairh creation
  14. // Apr 05 2000 dovh - Fixed two problematic debug / tracer buffer size
  15. // problems. (Fix Bug 15449).
  16. // May 07 2000 dovh - USE_WS_SENTINEL algorithm in BreakText
  17. //
  18. ////////////////////////////////////////////////////////////////////////////////
  19. #include "base.h"
  20. #include "CustomBreaking.h"
  21. #include "proparray.h"
  22. #include "AutoPtr.h"
  23. #include "excption.h"
  24. #include "SpanishUtils.h"
  25. #include "WbUtils.h"
  26. #ifndef WHISTLER_BUILD
  27. #include "LanguageResources_i.c"
  28. #endif // WHISTLER_BUILD
  // File-scope holders for CCustomBreaker instances. Nothing in this file
  // assigns them; judging by the name suffixes they are presumably the
  // English, English (UK), French, Spanish and Italian breakers -- TODO
  // confirm against the code that creates them.
  29. CAutoClassPointer<CCustomBreaker> g_apEngCustomBreaker;
  30. CAutoClassPointer<CCustomBreaker> g_apEngUKCustomBreaker;
  31. CAutoClassPointer<CCustomBreaker> g_apFrnCustomBreaker;
  32. CAutoClassPointer<CCustomBreaker> g_apSpnCustomBreaker;
  33. CAutoClassPointer<CCustomBreaker> g_apItlCustomBreaker;
  34. CCustomWordTerm::CCustomWordTerm(const WCHAR* pwcs) :
  35. m_ulStartTxt(0),
  36. m_ulEndTxt(0),
  37. m_pwcs(NULL)
  38. {
  39. ULONG ulLen = wcslen(pwcs);
  40. CAutoArrayPointer<WCHAR> ap;
  41. ap = new WCHAR[ulLen + 1];
  42. wcscpy(ap.Get(), pwcs);
  43. while ((m_ulStartTxt < ulLen) &&
  44. TEST_PROP(GET_PROP(ap.Get()[m_ulStartTxt]), CUSTOM_PUNCT_HEAD))
  45. {
  46. m_ulStartTxt++;
  47. }
  48. if (m_ulStartTxt == ulLen)
  49. {
  50. THROW_HRESULT_EXCEPTION(E_INVALIDARG);
  51. }
  52. m_ulEndTxt = ulLen;
  53. while(m_ulEndTxt &&
  54. TEST_PROP(GET_PROP(ap.Get()[m_ulEndTxt - 1]), CUSTOM_PUNCT_TAIL))
  55. {
  56. m_ulEndTxt--;
  57. }
  58. if (m_ulEndTxt <= m_ulStartTxt)
  59. {
  60. THROW_HRESULT_EXCEPTION(E_INVALIDARG);
  61. }
  62. m_pwcs = ap.Detach();
  63. m_ulLen = ulLen;
  64. }
  65. bool CCustomWordTerm::CheckWord(
  66. const ULONG ulBufLen,
  67. ULONG ulOffsetToBaseWord,
  68. ULONG ulBaseWordLen,
  69. const WCHAR* pwcsBuf,
  70. ULONG* pulMatchOffset,
  71. ULONG* pulMatchLen)
  72. {
  73. ULONG ulStartTxt = m_ulStartTxt;
  74. while (ulOffsetToBaseWord &&
  75. ulStartTxt &&
  76. m_pwcs[ulStartTxt] == pwcsBuf[ulOffsetToBaseWord])
  77. {
  78. ulOffsetToBaseWord--;
  79. ulStartTxt--;
  80. ulBaseWordLen++;
  81. }
  82. if (ulStartTxt)
  83. {
  84. return false;
  85. }
  86. ULONG ulEndTxt = m_ulEndTxt;
  87. while ((ulEndTxt < m_ulLen) &&
  88. (ulOffsetToBaseWord + ulBaseWordLen < ulBufLen) &&
  89. (m_pwcs[ulEndTxt] == pwcsBuf[ulOffsetToBaseWord + ulBaseWordLen ]))
  90. {
  91. ulEndTxt++;
  92. ulBaseWordLen++;
  93. }
  94. if (ulEndTxt != m_ulLen)
  95. {
  96. return false;
  97. }
  98. *pulMatchOffset = ulOffsetToBaseWord;
  99. *pulMatchLen = ulBaseWordLen;
  100. return true;
  101. }
  102. void CCustomWordCollection::AddWord(const WCHAR* pwcs)
  103. {
  104. CAutoClassPointer<CCustomWordTerm> ap;
  105. ap = new CCustomWordTerm(pwcs);
  106. m_vaWordCollection[m_ulCount] = ap.Get();
  107. m_ulCount++;
  108. ap.Detach();
  109. }
  110. bool CCustomWordCollection::CheckWord(
  111. const ULONG ulLen,
  112. const ULONG ulOffsetToBaseWord,
  113. const ULONG ulBaseWordLen,
  114. const WCHAR* pwcsBuf,
  115. ULONG* pulMatchOffset,
  116. ULONG* pulMatchLen)
  117. {
  118. for (ULONG ul = 0; ul < m_ulCount; ul++)
  119. {
  120. bool fRet = m_vaWordCollection[ul]->CheckWord(
  121. ulLen,
  122. ulOffsetToBaseWord,
  123. ulBaseWordLen,
  124. pwcsBuf,
  125. pulMatchOffset,
  126. pulMatchLen);
  127. if (fRet)
  128. {
  129. return true;
  130. }
  131. }
  132. return false;
  133. }
  134. CCustomBreaker::CCustomBreaker(LCID lcid) :
  135. m_Trie(true),
  136. m_ulWordCount(0)
  137. {
  138. CVarString vsPath;
  139. if (false == GetCustomWBFilePath(lcid, vsPath))
  140. {
  141. return;
  142. }
  143. CStandardCFile Words((LPWSTR)vsPath, L"r", false);
  144. if (!((FILE*)Words))
  145. {
  146. return;
  147. }
  148. WCHAR pwcsBuf[64];
  149. DictStatus status;
  150. while(fgetws(pwcsBuf, 64, (FILE*) Words))
  151. {
  152. m_ulWordCount++;
  153. ULONG ulLen = wcslen(pwcsBuf);
  154. if (ulLen && pwcsBuf[ulLen - 1] == L'\n')
  155. {
  156. pwcsBuf[ulLen - 1] = L'\0';
  157. ulLen--;
  158. }
  159. if (0 == ulLen)
  160. {
  161. continue;
  162. }
  163. try
  164. {
  165. CAutoClassPointer<CCustomWordCollection> apCollection = new CCustomWordCollection;
  166. apCollection->AddWord(pwcsBuf);
  167. WCHAR* pwcsKey = pwcsBuf + apCollection->GetFirstWord()->GetTxtStart();
  168. pwcsBuf[apCollection->GetFirstWord()->GetTxtEnd()] = L'\0';
  169. DictStatus status;
  170. CCustomWordCollection* pExistingCollection;
  171. status = m_Trie.trie_Insert(
  172. pwcsKey,
  173. TRIE_DEFAULT,
  174. apCollection.Get(),
  175. &pExistingCollection);
  176. if (DICT_ITEM_ALREADY_PRESENT == status)
  177. {
  178. pExistingCollection->AddWord(apCollection->GetFirstWord()->GetTxt());
  179. }
  180. else if (DICT_SUCCESS == status)
  181. {
  182. apCollection.Detach();
  183. continue;
  184. }
  185. }
  186. catch (CHresultException& h)
  187. {
  188. if (E_INVALIDARG == (HRESULT)h)
  189. {
  190. continue;
  191. }
  192. else
  193. {
  194. throw h;
  195. }
  196. }
  197. }
  198. }
  199. //
  200. // The idea behind the algorithm is to store a list of special patterns that should not
  201. // be broken. We also want to be able to recognize those patterns when a few punctuation
  202. // marks are attached to them. For example, if .NET is a special pattern then in the
  203. // patterns (.NET) .NET! .NET? we also want to recognize the .NET pattern and emit .NET.
  204. // The next case is more complicated - NET!. The expected behavior is not to break it.
  205. // So the algorithm needs to identify when a punctuation mark is part of the token and
  206. // must not be broken off, and when it is just a breaker.
  207. // The algorithm is:
  208. // 1. Initialization.
  209. // For each token in the file:
  210. // a. Remove punctuation from the beginning and end of the token - we will
  211. // refer to the result as the base form of the token.
  212. // b. Insert the base form into a dictionary. Each base form points to the
  213. // generating token. Several tokens can be mapped to the same base form
  214. // (NET? and NET!), so each base form points to a collection of generating tokens.
  215. // 2. Breaking.
  216. // For each pattern you get from the document:
  217. // a. Perform 1a.
  218. // b. Look for the resulting base form in the dictionary.
  219. // c. For each item in the collection, check whether the generating token exists in
  220. // the pattern we got from the document.
  221. //
  222. bool CCustomBreaker::BreakText(
  223. ULONG ulLen,
  224. WCHAR* pwcsBuf,
  225. ULONG* pulOutLen,
  226. ULONG* pulOffset)
  227. {
  228. DictStatus status;
  229. CCustomWordCollection* pCollection;
  230. short sCount = 0;
  231. ULONG ul = 0;
  232. while ((ul < ulLen) &&
  233. TEST_PROP(GET_PROP(pwcsBuf[ul]), CUSTOM_PUNCT_HEAD))
  234. {
  235. ul++;
  236. }
  237. ULONG ulOffsetToBase = ul;
  238. if (ulOffsetToBase == ulLen)
  239. {
  240. return false;
  241. }
  242. ULONG ulBaseLen = ulLen;
  243. while(ulBaseLen &&
  244. TEST_PROP(GET_PROP(pwcsBuf[ulBaseLen - 1]), CUSTOM_PUNCT_TAIL))
  245. {
  246. ulBaseLen--;
  247. }
  248. if (ulBaseLen <= ulOffsetToBase)
  249. {
  250. return false;
  251. }
  252. ulBaseLen -= ulOffsetToBase;
  253. status = m_Trie.trie_Find(
  254. pwcsBuf + ulOffsetToBase,
  255. TRIE_LONGEST_MATCH,
  256. 1,
  257. &pCollection,
  258. &sCount);
  259. if (sCount)
  260. {
  261. bool bRet;
  262. bRet = pCollection->CheckWord(
  263. ulLen,
  264. ulOffsetToBase,
  265. ulBaseLen,
  266. pwcsBuf,
  267. pulOffset,
  268. pulOutLen);
  269. return bRet;
  270. }
  271. return false;
  272. }