Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

503 lines
13 KiB

  1. // IWBreak.cpp
  2. //
  3. // CWordBreak implementation
  4. //
  5. // Copyright 2000 Microsoft Corp.
  6. //
  7. // Modification History:
  8. // 18 APR 2000 bhshin added WordBreak destructor
  9. // 30 MAR 2000 bhshin created
  10. #include "StdAfx.h"
  11. #include "KorWbrk.h"
  12. #include "IWBreak.h"
  13. #include "Lex.h"
  14. #include "Token.h"
  15. #include "Record.h"
  16. #include "Analyze.h"
  17. #include "IndexRec.h"
  18. #include "unikor.h"
  19. #include "Morpho.h"
  20. extern CRITICAL_SECTION g_CritSect;
  21. extern MAPFILE g_LexMap;
  22. extern BOOL g_fLoaded;
  23. /////////////////////////////////////////////////////////////////////////////
  24. // CWordBreaker member functions
  25. // CWordBreaker::Init
  26. //
  27. // intialize WordBreaker object & lexicon
  28. //
  29. // Parameters:
  30. // fQuery -> (BOOL) query time flag
  31. // ulMaxTokenSize -> (ULONG) maximum input token length
  32. // *pfLicense <- (BOOL*) always return TRUE
  33. //
  34. // Result:
  35. // (HRESULT)
  36. //
  37. // 30MAR00 bhshin began
  38. STDMETHODIMP CWordBreaker::Init(BOOL fQuery, ULONG ulMaxTokenSize, BOOL *pfLicense)
  39. {
  40. if (pfLicense == NULL)
  41. return E_INVALIDARG;
  42. if (IsBadWritePtr(pfLicense, sizeof(DWORD)))
  43. return E_INVALIDARG;
  44. // store intitializing information
  45. m_fQuery = fQuery;
  46. m_ulMaxTokenSize = ulMaxTokenSize;
  47. *pfLicense = TRUE;
  48. if (!g_fLoaded)
  49. {
  50. // load lexicon file
  51. ATLTRACE(L"Load lexicon...\r\n");
  52. if (!InitLexicon(&g_LexMap))
  53. return LANGUAGE_E_DATABASE_NOT_FOUND;
  54. g_fLoaded = TRUE;
  55. }
  56. m_PI.lexicon = g_LexMap;
  57. WB_LOG_PRINT_HEADER(fQuery);
  58. return S_OK;
  59. }
  60. // CWordBreaker::BreakText
  61. //
  62. // main word breaking method
  63. //
  64. // Parameters:
  65. // pTextSource -> (TEXT_SOURCE*) pointer to the structure of source text
  66. // pWordSink -> (IWordSink*) pointer to the word sink
  67. // pPhraseSink -> (IPhraseSink*) pointer to the phrase sink
  68. //
  69. // Result:
  70. // (HRESULT)
  71. //
  72. // 30MAR00 bhshin began
  73. STDMETHODIMP CWordBreaker::BreakText(TEXT_SOURCE *pTextSource, IWordSink *pWordSink, IPhraseSink *pPhraseSink)
  74. {
  75. WT Type;
  76. int cchTextProcessed, cchProcessed, cchHanguel;
  77. WCHAR wchLast = L'\0';
  78. if (pTextSource == NULL)
  79. return E_INVALIDARG;
  80. if (pWordSink == NULL)
  81. return S_OK;
  82. if (pTextSource->iCur == pTextSource->iEnd)
  83. return S_OK;
  84. ATLASSERT(pTextSource->iCur < pTextSource->iEnd);
  85. do
  86. {
  87. while (pTextSource->iCur < pTextSource->iEnd)
  88. {
  89. Tokenize(TRUE, pTextSource, pTextSource->iCur, &Type, &cchTextProcessed, &cchHanguel);
  90. if (Type == WT_REACHEND)
  91. break;
  92. cchProcessed = WordBreak(pTextSource, Type, cchTextProcessed, cchHanguel, pWordSink, pPhraseSink, &wchLast);
  93. if (cchProcessed < 0)
  94. return E_UNEXPECTED;
  95. pTextSource->iCur += cchProcessed;
  96. }
  97. } while (SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));
  98. while ( pTextSource->iCur < pTextSource->iEnd )
  99. {
  100. Tokenize(FALSE, pTextSource, pTextSource->iCur, &Type, &cchTextProcessed, &cchHanguel);
  101. cchProcessed = WordBreak(pTextSource, Type, cchTextProcessed, cchHanguel, pWordSink, pPhraseSink, &wchLast);
  102. if (cchProcessed < 0)
  103. return E_UNEXPECTED;
  104. pTextSource->iCur += cchProcessed;
  105. }
  106. return S_OK;
  107. }
  108. // CWordBreaker::ComposePhrase
  109. //
  110. // convert a noun and modifier back into a source phrase (NOT USED)
  111. //
  112. // Parameters:
  113. // pwcNoun -> (const WCHAR*) input noun
  114. // cwcNoun -> (ULONG) length of input noun
  115. // pwcModifier -> (const WCHAR *) input modifier
  116. // cwcModifier -> (ULONG) length of input modifier
  117. // ulAttachmentType -> (ULONG) value about the method of composition
  118. // pwcPhrase -> (WCHAR *) pointer to the returned buffer
  119. // pcwcPhrase -> (ULONG *) length of returned string
  120. //
  121. // Result:
  122. // (HRESULT)
  123. //
  124. // 30MAR00 bhshin began
  125. STDMETHODIMP CWordBreaker::ComposePhrase(const WCHAR *pwcNoun, ULONG cwcNoun, const WCHAR *pwcModifier, ULONG cwcModifier, ULONG ulAttachmentType, WCHAR *pwcPhrase, ULONG *pcwcPhrase)
  126. {
  127. if (m_fQuery)
  128. return E_NOTIMPL;
  129. return WBREAK_E_QUERY_ONLY;
  130. }
  131. // CWordBreaker::GetLicenseToUse
  132. //
  133. // return license information
  134. //
  135. // Parameters:
  136. // ppwcsLicense -> (const WCHAR **) output pointer to the license information
  137. //
  138. // Result:
  139. // (HRESULT)
  140. //
  141. // 30MAR00 bhshin began
  142. STDMETHODIMP CWordBreaker::GetLicenseToUse(const WCHAR ** ppwcsLicense)
  143. {
  144. static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-2000";
  145. if (ppwcsLicense == NULL)
  146. return E_INVALIDARG;
  147. if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD)))
  148. return E_INVALIDARG;
  149. *ppwcsLicense = wcsCopyright;
  150. return S_OK;
  151. }
  152. // CWordBreaker::WordBreak
  153. //
  154. // main hangul word breaking operator
  155. //
  156. // Parameters:
  157. // pTextSource -> (TEXT_SOURCE*) pointer to the structure of source text
  158. // Type -> (WT) word token type
  159. // cchTextProcessed -> (int) input length to process
  160. // cchHanguel -> (int) hangul token length (hanguel+romaji case only)
  161. // pWordSink -> (IWordSink*) pointer to the word sink
  162. // pPhraseSink -> (IPhraseSink*) pointer to the phrase sink
  163. // pwchLast -> (WCHAR*) input & output last character of previous token
  164. //
  165. // Result:
  166. // (int) -1 if error occurs, text length to process
  167. //
  168. // 30MAR00 bhshin began
  169. int CWordBreaker::WordBreak(TEXT_SOURCE *pTextSource, WT Type,
  170. int cchTextProcessed, int cchHanguel,
  171. IWordSink *pWordSink, IPhraseSink *pPhraseSink,
  172. WCHAR *pwchLast)
  173. {
  174. const WCHAR *pwcStem;
  175. int iCur;
  176. int cchToken, cchProcessed, cchHg;
  177. int cchPrefix;
  178. ATLASSERT(cchTextProcessed > 0);
  179. if (cchTextProcessed <= 0)
  180. return -1;
  181. iCur = pTextSource->iCur;
  182. pwcStem = pTextSource->awcBuffer + iCur;
  183. cchProcessed = cchTextProcessed;
  184. cchToken = cchTextProcessed;
  185. // check too long token
  186. if (cchToken > (int)m_ulMaxTokenSize || cchToken > MAX_INDEX_STRING)
  187. {
  188. cchProcessed = (m_ulMaxTokenSize < MAX_INDEX_STRING) ? m_ulMaxTokenSize : MAX_INDEX_STRING;
  189. pWordSink->PutWord(cchProcessed,
  190. pwcStem,
  191. cchProcessed,
  192. pTextSource->iCur);
  193. return cchProcessed;
  194. }
  195. //=================================================
  196. // query & index time
  197. //=================================================
  198. if (Type == WT_PHRASE_SEP)
  199. {
  200. // phrase separator
  201. *pwchLast = L'\0';
  202. pWordSink->PutBreak(WORDREP_BREAK_EOS);
  203. }
  204. else if (Type == WT_WORD_SEP)
  205. {
  206. if (!fIsWhiteSpace(*pwcStem))
  207. *pwchLast = L'\0';
  208. // Korean WB do not add EOW.
  209. }
  210. else if (Type == WT_ROMAJI)
  211. {
  212. // symbol, alphabet, hanja, romaji + hanguel
  213. // get next token
  214. iCur += cchToken;
  215. Tokenize(FALSE, pTextSource, iCur, &Type, &cchToken, &cchHg);
  216. if (Type == WT_ROMAJI)
  217. {
  218. if (cchHg > 0)
  219. {
  220. // romaji+(hanguel+romaji) case -> put word itself
  221. cchProcessed += cchToken;
  222. iCur += cchToken;
  223. cchProcessed += GetWordPhrase(FALSE, pTextSource, iCur);
  224. WB_LOG_START(pwcStem, cchProcessed);
  225. pWordSink->PutWord(cchProcessed,
  226. pwcStem,
  227. cchProcessed,
  228. pTextSource->iCur);
  229. WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_SYMBOL);
  230. }
  231. else
  232. {
  233. WB_LOG_START(pwcStem, cchProcessed);
  234. // {romaj}{romaj} case : -> breaking first {romaji}
  235. CIndexInfo IndexInfo;
  236. if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink))
  237. goto ErrorReturn;
  238. AnalyzeRomaji(pwcStem, cchProcessed, pTextSource->iCur, cchProcessed,
  239. cchHanguel, &IndexInfo, &cchPrefix);
  240. if (m_fQuery)
  241. {
  242. IndexInfo.AddIndex(pwcStem, cchProcessed+cchToken, WEIGHT_HARD_MATCH, 0, cchProcessed+cchToken-1);
  243. WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY);
  244. if (!IndexInfo.PutQueryIndexList())
  245. goto ErrorReturn;
  246. }
  247. else
  248. {
  249. if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur))
  250. goto ErrorReturn;
  251. }
  252. }
  253. }
  254. else if (Type == WT_HANGUEL)
  255. {
  256. // romaji(hanguel+romaji) + hanguel case
  257. WCHAR wzRomaji[MAX_INDEX_STRING+1];
  258. int cchRomaji;
  259. cchRomaji = (cchProcessed > MAX_INDEX_STRING) ? MAX_INDEX_STRING : cchProcessed;
  260. wcsncpy(wzRomaji, pwcStem, cchRomaji);
  261. wzRomaji[cchRomaji] = L'\0';
  262. WB_LOG_START(pwcStem, cchProcessed+cchToken);
  263. cchProcessed += cchToken;
  264. // start position include romanji
  265. CIndexInfo IndexInfo;
  266. if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink))
  267. goto ErrorReturn;
  268. if (cchHanguel > 0)
  269. {
  270. AnalyzeRomaji(pwcStem, cchRomaji, pTextSource->iCur, cchRomaji,
  271. cchHanguel, &IndexInfo, &cchPrefix);
  272. }
  273. else
  274. {
  275. cchPrefix = CheckURLPrefix(pwcStem, cchProcessed-cchToken);
  276. }
  277. // analyze string starts from last hangul
  278. pwcStem = pTextSource->awcBuffer + iCur;
  279. if (cchRomaji > 0)
  280. IndexInfo.SetRomajiInfo(wzRomaji, cchRomaji, cchPrefix);
  281. // analyze string always with indexing mode on symbol processing
  282. if (!AnalyzeString(&m_PI, m_fQuery, pwcStem, cchToken, iCur, &IndexInfo, *pwchLast))
  283. goto ErrorReturn;
  284. if (m_fQuery)
  285. {
  286. if (cchRomaji > 0)
  287. IndexInfo.SetRomajiInfo(NULL, 0, 0);
  288. IndexInfo.AddIndex(pTextSource->awcBuffer + pTextSource->iCur, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed+cchToken-1);
  289. WB_LOG_ADD_INDEX(pTextSource->awcBuffer + pTextSource->iCur, cchProcessed, INDEX_QUERY);
  290. if (!IndexInfo.PutQueryIndexList())
  291. goto ErrorReturn;
  292. }
  293. else
  294. {
  295. if (!IndexInfo.MakeSingleLengthMergedIndex())
  296. goto ErrorReturn;
  297. if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur))
  298. goto ErrorReturn;
  299. }
  300. *pwchLast = *(pwcStem + cchToken - 1);
  301. }
  302. else // next: WT_START, WT_PHRASE_SEP, WT_WORD_SEP, WT_REACHEND
  303. {
  304. WB_LOG_START(pwcStem, cchProcessed);
  305. CIndexInfo IndexInfo;
  306. if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink))
  307. goto ErrorReturn;
  308. AnalyzeRomaji(pwcStem, cchProcessed, pTextSource->iCur, cchProcessed,
  309. cchHanguel, &IndexInfo, &cchPrefix);
  310. if (m_fQuery)
  311. {
  312. IndexInfo.AddIndex(pwcStem, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed-1);
  313. WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY);
  314. if (!IndexInfo.PutQueryIndexList())
  315. goto ErrorReturn;
  316. }
  317. else
  318. {
  319. if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur))
  320. goto ErrorReturn;
  321. }
  322. }
  323. }
  324. else if (Type == WT_HANGUEL)
  325. {
  326. // hangul input
  327. WB_LOG_START(pwcStem, cchProcessed);
  328. CIndexInfo IndexInfo;
  329. if (!IndexInfo.Initialize(cchProcessed, iCur, pWordSink, pPhraseSink))
  330. goto ErrorReturn;
  331. if (!AnalyzeString(&m_PI, m_fQuery, pwcStem, cchProcessed, iCur, &IndexInfo, *pwchLast))
  332. goto ErrorReturn;
  333. if (m_fQuery)
  334. {
  335. IndexInfo.AddIndex(pwcStem, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed-1);
  336. WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY);
  337. if (!IndexInfo.PutQueryIndexList())
  338. goto ErrorReturn;
  339. }
  340. else
  341. {
  342. if (!IndexInfo.MakeSingleLengthMergedIndex())
  343. goto ErrorReturn;
  344. if (!IndexInfo.PutFinalIndexList(pwcStem))
  345. goto ErrorReturn;
  346. }
  347. *pwchLast = *(pwcStem + cchProcessed - 1);
  348. }
  349. WB_LOG_PRINT_ALL();
  350. WB_LOG_END();
  351. return cchProcessed;
  352. ErrorReturn:
  353. WB_LOG_END();
  354. return -1;
  355. }
  356. // CWordBreaker::AnalyzeRomaji
  357. //
  358. // helper function for romaji token wordbreaking
  359. //
  360. // Parameters:
  361. // pwcStem -> (const WCHAR*) input token string
  362. // cchStem -> (int) length of input romaji token
  363. // iCur -> (int) source string position
  364. // cchProcessed -> (int) input length to process
  365. // cchHanguel -> (int) hangul token length (hanguel+romaji case only)
  366. // pIndexInfo -> (CIndexInfo *) output index list
  367. // pcchPrefix -> (int*) output prefix length
  368. //
  369. // Result:
  370. // (void)
  371. //
  372. // 23NOV00 bhshin began
  373. void CWordBreaker::AnalyzeRomaji(const WCHAR *pwcStem, int cchStem,
  374. int iCur, int cchProcessed, int cchHanguel,
  375. CIndexInfo *pIndexInfo, int *pcchPrefix)
  376. {
  377. int cchPrefix = 0;
  378. // hanguel+romaji case
  379. if (cchHanguel < cchProcessed)
  380. {
  381. // hanguel
  382. if (cchHanguel > 0)
  383. {
  384. pIndexInfo->AddIndex(pwcStem, cchHanguel, WEIGHT_HARD_MATCH, 0, cchHanguel-1);
  385. WB_LOG_ADD_INDEX(pwcStem, cchHanguel, INDEX_SYMBOL);
  386. }
  387. // romaji
  388. if ((cchStem-cchHanguel) > 0)
  389. {
  390. pIndexInfo->AddIndex(pwcStem + cchHanguel, cchStem - cchHanguel, WEIGHT_HARD_MATCH, cchHanguel, cchStem-1);
  391. WB_LOG_ADD_INDEX(pwcStem + cchHanguel, cchStem - cchHanguel, INDEX_SYMBOL);
  392. }
  393. }
  394. if (cchHanguel == 1 || (cchStem-cchHanguel) == 1)
  395. {
  396. // romaji(hangul+romaji)
  397. pIndexInfo->AddIndex(pwcStem, cchStem, WEIGHT_HARD_MATCH, 0, cchStem-1);
  398. WB_LOG_ADD_INDEX(pwcStem, cchStem, INDEX_SYMBOL);
  399. }
  400. // check URL prefix
  401. cchPrefix = CheckURLPrefix(pwcStem, cchProcessed);
  402. if (cchPrefix > 0 && cchPrefix < cchProcessed)
  403. {
  404. pIndexInfo->AddIndex(pwcStem + cchPrefix, cchStem - cchPrefix, WEIGHT_HARD_MATCH, cchPrefix, cchStem-1);
  405. WB_LOG_ADD_INDEX(pwcStem + cchPrefix, cchStem - cchPrefix, INDEX_SYMBOL);
  406. }
  407. *pcchPrefix = cchPrefix; // return it
  408. }