Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

487 lines
11 KiB

  1. //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
  2. //
  3. // Copyright (c) 2001 Microsoft Corporation. All rights reserved.
  4. //
  5. // Module:
  6. // volcano/dll/brknet.c
  7. //
  8. // Description:
  9. // Functions to implement the functionality of managing breakpoint structures.
  10. //
  11. // Author:
  12. // ahmadab 11/05/01
  13. //
  14. //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
  15. #include "common.h"
  16. #include "volcanop.h"
  17. #include "lattice.h"
  18. #include "runnet.h"
  19. #include "brknet.h"
  20. // free an element list
  21. void FreeElemList (ELEMLIST *pList)
  22. {
  23. if (pList->pElem)
  24. {
  25. ExternFree (pList->pElem);
  26. }
  27. }
  28. // reverses the order of entries in an element list
  29. void ReverseElementList (ELEMLIST *pElemList)
  30. {
  31. LATTICE_PATH_ELEMENT Elem;
  32. int iElemFor, iElemBack;
  33. for ( iElemFor = 0, iElemBack = pElemList->cElem -1;
  34. iElemFor < pElemList->cElem && iElemBack >= 0 && iElemFor < iElemBack;
  35. iElemFor++, iElemBack--
  36. )
  37. {
  38. Elem = pElemList->pElem[iElemFor];
  39. pElemList->pElem[iElemFor] = pElemList->pElem[iElemBack];
  40. pElemList->pElem[iElemBack] = Elem;
  41. }
  42. }
  43. // frees a break point array
  44. void FreeBreaks (int cBrk, BRKPT *pBrk)
  45. {
  46. int i;
  47. for (i = 0; i < cBrk; i++)
  48. {
  49. FreeElemList (&pBrk[i].Starting);
  50. FreeElemList (&pBrk[i].Ending);
  51. FreeElemList (&pBrk[i].Crossing);
  52. }
  53. ExternFree (pBrk);
  54. }
  55. // insert an element into an element list
  56. BOOL InsertListElement (ELEMLIST *pList, int iStrk, int iAlt, LATTICE_ELEMENT *pLatElem)
  57. {
  58. LATTICE_PATH_ELEMENT *pElem;
  59. // expand the element list
  60. pList->pElem = (LATTICE_PATH_ELEMENT *) ExternRealloc (pList->pElem,
  61. (pList->cElem + 1) * sizeof (*pList->pElem));
  62. if (!pList->pElem)
  63. {
  64. return FALSE;
  65. }
  66. pElem = pList->pElem + pList->cElem;
  67. pElem->iAlt = iAlt;
  68. pElem->iStroke = iStrk;
  69. pElem->nStrokes = pLatElem->nStrokes;
  70. pElem->wChar = pLatElem->wChar;
  71. pElem->iBoxNum = -1;
  72. pList->cElem++;
  73. return TRUE;
  74. }
  75. // makes the break point list
  76. BRKPT *CreateBrkPtList (LATTICE *pLat)
  77. {
  78. BRKPT *pBrk = NULL;
  79. int cStrk = pLat->nStrokes;
  80. BOOL bRet = FALSE;
  81. int i,
  82. iStrk,
  83. iAlt,
  84. cAlt,
  85. iStrtStrk,
  86. iPrevAlt;
  87. LATTICE_ALT_LIST *pAlt;
  88. // allocate and init buffer
  89. pBrk = (BRKPT *) ExternAlloc (cStrk * sizeof (BRKPT));
  90. if (!pBrk)
  91. {
  92. goto exit;
  93. }
  94. memset (pBrk, 0, cStrk * sizeof (BRKPT));
  95. // all strokes are candidate break points
  96. for (iStrk = 0; iStrk < cStrk; iStrk++)
  97. {
  98. // look at the alt list of words ending at this stroke
  99. pAlt = pLat->pAltList + iStrk;
  100. cAlt = pAlt->nUsed;
  101. // for all alts
  102. for (iAlt = 0; iAlt < cAlt; iAlt++)
  103. {
  104. // did it occur before
  105. for (iPrevAlt = 0; iPrevAlt < iAlt; iPrevAlt++)
  106. {
  107. if (pAlt->alts[iAlt].nStrokes == pAlt->alts[iPrevAlt].nStrokes)
  108. {
  109. break;
  110. }
  111. }
  112. // there was no previous alt with the same # of strokes
  113. if (iPrevAlt == iAlt)
  114. {
  115. // update the starting list
  116. iStrtStrk = iStrk - pAlt->alts[iAlt].nStrokes + 1;
  117. // This shouldn't happen, but there is enough code involved in
  118. // building up the lattice that it is probably a good idea to check.
  119. if (iStrtStrk < 0)
  120. {
  121. ASSERT(("Malformed lattice in CreateBrkPtList()", 0));
  122. goto exit;
  123. }
  124. if (iStrtStrk > 0)
  125. {
  126. if ( !InsertListElement (&pBrk[iStrtStrk - 1].Starting, iStrk,
  127. iAlt, pAlt->alts + iAlt)
  128. )
  129. {
  130. goto exit;
  131. }
  132. }
  133. // update the ending list
  134. if ( !InsertListElement (&pBrk[iStrk].Ending, iStrk,
  135. iAlt, pAlt->alts + iAlt)
  136. )
  137. {
  138. goto exit;
  139. }
  140. // update the crossing list
  141. for (i = iStrtStrk; i < iStrk; i++)
  142. {
  143. if ( !InsertListElement (&pBrk[i].Crossing, iStrk,
  144. iAlt, pAlt->alts + iAlt)
  145. )
  146. {
  147. goto exit;
  148. }
  149. }
  150. }
  151. }
  152. }
  153. bRet = TRUE;
  154. exit:
  155. if (!bRet)
  156. {
  157. if (pBrk)
  158. {
  159. FreeBreaks (cStrk, pBrk);
  160. }
  161. return NULL;
  162. }
  163. else
  164. {
  165. return pBrk;
  166. }
  167. }
  168. // find the element with the best probablity in the element list
  169. LATTICE_ELEMENT *FindBestInList ( LATTICE *pLat,
  170. ELEMLIST *pList,
  171. LATTICE_PATH_ELEMENT **ppBestElem
  172. )
  173. {
  174. int i, iAlt;
  175. LATTICE_ELEMENT *pBest = NULL;
  176. LATTICE_ALT_LIST *pAlt;
  177. // go thru all elements
  178. for (i = 0; i < pList->cElem; i++)
  179. {
  180. pAlt = pLat->pAltList + pList->pElem[i].iStroke;
  181. iAlt = pList->pElem[i].iAlt;
  182. // exclude fake elements
  183. if (pAlt->alts[iAlt].wChar == SYM_UNKNOWN)
  184. continue;
  185. // update the best
  186. if (!pBest || pBest->logProb < pAlt->alts[iAlt].logProb)
  187. {
  188. (*ppBestElem) = pList->pElem + i;
  189. pBest = pAlt->alts + iAlt;
  190. }
  191. }
  192. return pBest;
  193. }
  194. float FuguSegScore(int cStrokes, STROKE *pStrokes, LOCRUN_INFO *pLocRunInfo);
  195. // compute the set of features for a given break point
  196. // returns the number of features computed
  197. int FeaturizeBrkPt (LATTICE *pLat, BRKPT *pBrk, int *pFeat)
  198. {
  199. int xDist,
  200. iHgt;
  201. int cFeat = 0;
  202. LATTICE_ELEMENT *pBestStart = NULL,
  203. *pBestEnd = NULL,
  204. *pBestCross = NULL;
  205. LATTICE_PATH_ELEMENT *pBestStartPathElem = NULL,
  206. *pBestEndPathElem = NULL,
  207. *pBestCrossPathElem = NULL;
  208. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  209. pFeat[cFeat++] = pBrk->Starting.cElem * 1000;
  210. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  211. pFeat[cFeat++] = pBrk->Ending.cElem * 1000;
  212. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  213. pFeat[cFeat++] = pBrk->Crossing.cElem * 500;
  214. // find the best chars
  215. pBestStart = FindBestInList (pLat, &pBrk->Starting, &pBestStartPathElem);
  216. pBestEnd = FindBestInList (pLat, &pBrk->Ending, &pBestEndPathElem);
  217. pBestCross = FindBestInList (pLat, &pBrk->Crossing, &pBestCrossPathElem);
  218. // do we have a starting best
  219. if (pBestStart)
  220. {
  221. // a real char
  222. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  223. pFeat[cFeat++] = 65535;
  224. // log prob
  225. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  226. pFeat[cFeat++] = min (65535, (int) (-1.0 * pBestStart->logProb * 1000));
  227. // space / area
  228. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  229. pFeat[cFeat++] = min (65535, (int) (65535 * (double) pBestStart->space / (pBestStart->area + 1)));
  230. ASSERT (pFeat[cFeat - 1] >= 0);
  231. // # of strokes
  232. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  233. pFeat[cFeat++] = pBestStart->nStrokes * 1000;
  234. // char unigram
  235. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  236. pFeat[cFeat++] =
  237. min (65535, (int) (-255 * UnigramCost (&g_unigramInfo, pBestStart->wChar)));
  238. // Fugu Char Detector score, compute if did not doo it before
  239. if (pBestStart->iCharDetectorScore == -1)
  240. {
  241. pBestStart->iCharDetectorScore =
  242. min (65535, (int) (65535.0 * FuguSegScore (pBestStart->nStrokes,
  243. pLat->pStroke + pBestStartPathElem->iStroke - pBestStart->nStrokes + 1,
  244. &g_locRunInfo)));
  245. }
  246. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  247. pFeat[cFeat++] = max (0, min (65535, pBestStart->iCharDetectorScore));
  248. }
  249. else
  250. {
  251. // not a real char
  252. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  253. pFeat[cFeat++] = 0;
  254. // fake log prob
  255. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  256. pFeat[cFeat++] = 65535;
  257. // fake space / area
  258. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  259. pFeat[cFeat++] = 0;
  260. // fake # of strokes
  261. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  262. pFeat[cFeat++] = 0;
  263. // fake unigram
  264. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  265. pFeat[cFeat++] =
  266. min (65535, (int) (-255 * UnigramCost (&g_unigramInfo, 0xFFFE)));
  267. // fake fugu score
  268. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  269. pFeat[cFeat++] = 0;
  270. }
  271. if (pBestEnd)
  272. {
  273. // a real char
  274. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  275. pFeat[cFeat++] = 65535;
  276. // log prob
  277. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  278. pFeat[cFeat++] = min (65535, (int) (-1.0 * pBestEnd->logProb * 1000));
  279. // space / area
  280. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  281. pFeat[cFeat++] = min (65535, (int) (65535 * (double) pBestEnd->space / (pBestEnd->area + 1)));
  282. ASSERT (pFeat[cFeat - 1] >= 0);
  283. // # of strokes
  284. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  285. pFeat[cFeat++] = pBestEnd->nStrokes * 1000;
  286. // char unigram
  287. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  288. pFeat[cFeat++] =
  289. min (65535, (int) (-255 * UnigramCost (&g_unigramInfo, pBestEnd->wChar)));
  290. // Fugu Char Detector score
  291. if (pBestEnd->iCharDetectorScore == -1)
  292. {
  293. pBestEnd->iCharDetectorScore =
  294. min (65535, (int) (65535.0 * FuguSegScore (pBestEnd->nStrokes,
  295. pLat->pStroke + pBestEndPathElem->iStroke - pBestEnd->nStrokes + 1,
  296. &g_locRunInfo)));
  297. }
  298. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  299. pFeat[cFeat++] = max (0, min (65535, pBestEnd->iCharDetectorScore));
  300. }
  301. else
  302. {
  303. // not a real char
  304. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  305. pFeat[cFeat++] = 0;
  306. // fake log prob
  307. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  308. pFeat[cFeat++] = 65535;
  309. // fake space / area
  310. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  311. pFeat[cFeat++] = 0;
  312. // fake # of strokes
  313. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  314. pFeat[cFeat++] = 0;
  315. // fake unigram
  316. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  317. pFeat[cFeat++] =
  318. min (65535, (int) (-255 * UnigramCost (&g_unigramInfo, 0xFFFE)));
  319. // fake fugu score
  320. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  321. pFeat[cFeat++] = 0;
  322. }
  323. if (pBestCross)
  324. {
  325. // a real char
  326. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  327. pFeat[cFeat++] = 65535;
  328. // log prob
  329. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  330. pFeat[cFeat++] = min (65535, (int) (-1.0 * pBestCross->logProb * 1000));
  331. // space / area
  332. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  333. pFeat[cFeat++] = min (65535, (int) (65535 * (double) pBestCross->space / (pBestCross->area + 1)));
  334. ASSERT (pFeat[cFeat - 1] >= 0);
  335. // # of strokes
  336. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  337. pFeat[cFeat++] = pBestCross->nStrokes * 1000;
  338. // char unigram
  339. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  340. pFeat[cFeat++] =
  341. min (65535, (int) (-255 * UnigramCost (&g_unigramInfo, pBestCross->wChar)));
  342. // Fugu Char Detector score
  343. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  344. if (pBestCross->iCharDetectorScore == -1)
  345. {
  346. pBestCross->iCharDetectorScore =
  347. min (65535, (int) (65535.0 * FuguSegScore (pBestCross->nStrokes,
  348. pLat->pStroke + pBestCrossPathElem->iStroke - pBestCross->nStrokes + 1,
  349. &g_locRunInfo)));
  350. }
  351. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  352. pFeat[cFeat++] = max (0, min (65535, pBestCross->iCharDetectorScore));
  353. }
  354. else
  355. {
  356. // not a real char
  357. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  358. pFeat[cFeat++] = 0;
  359. // fake log prob
  360. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  361. pFeat[cFeat++] = 65535;
  362. // fake space / area
  363. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  364. pFeat[cFeat++] = 0;
  365. // fake # of strokes
  366. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  367. pFeat[cFeat++] = 0;
  368. // fake unigram
  369. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  370. pFeat[cFeat++] =
  371. min (65535, (int) (-255 * UnigramCost (&g_unigramInfo, 0xFFFE)));
  372. // fake fugu score
  373. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  374. pFeat[cFeat++] = 0;
  375. }
  376. if (pBestStart && pBestEnd)
  377. {
  378. xDist = pBestStart->bbox.left - pBestEnd->bbox.right;
  379. iHgt = 1 + ( (pBestStart->bbox.bottom - pBestStart->bbox.top) +
  380. (pBestEnd->bbox.bottom - pBestEnd->bbox.top)
  381. ) / 2;
  382. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  383. pFeat[cFeat++] =
  384. min (65535, 32768 + (int)(32768.0 * xDist / (abs(xDist) + iHgt)));
  385. // bigram
  386. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  387. pFeat[cFeat++] = min (65535,
  388. (int) (-1.0 * BigramTransitionCost (&g_locRunInfo, &g_bigramInfo,
  389. pBestEnd->wChar, pBestStart->wChar) * 65535));
  390. }
  391. else
  392. {
  393. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  394. pFeat[cFeat++] = 0;
  395. ASSERT (cFeat < MAX_BRK_NET_FEAT);
  396. pFeat[cFeat++] = 65535;
  397. }
  398. return cFeat;
  399. }