Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1235 lines
31 KiB

  1. /*******************************************************************************
  2. * LtsCart.cpp *
  3. *----------*
  4. *
  5. * ** WARNING **
  6. * CART code for LTS. This code was created in MS Research and LiJ owns
  7. * the algorithm. YunusM eliminated the private heap used by this code
  8. * and used the new and delete operators instead.
  9. *
  10. * Created By: LIJ (MS Research) Date: 06/18/99
  11. * Current Owner: Fil
  12. *
  13. * Copyright (C) 1999 Microsoft Corporation. All Rights Reserved
  14. *******************************************************************************/
  15. //--- Includes --------------------------------------------------------------
  16. #include "StdAfx.h"
  17. #include "LtsCart.h"
  18. #pragma warning(disable : 4100)
  19. /* the following are for exceptions: single letter and NULL output */
  /*
   * Placeholder pronunciations, one per supported LCID (1033 and 1041).
   * LtscartReadData() stores the matching one in LTS_FOREST::bogus_pron and
   * LtscartGetPron() hands it out when no other pronunciation was produced.
   */
  static const char *bogus_pron_1033 =
      "B OW G AH S P R AH N AH N S IY EY SH AH N";
  static const char *bogus_pron_1041 =
      "N A N I"; // what?
  /*
   * Spelled-out letter pronunciations for LCID 1033.
   * Entries 0..25 are the letters 'a'..'z'; entries 26..51 are the plural
   * ("letter + 's") forms in the same order, so callers index the table with
   * (letter - 'a') plus an offset of 0 or 26.
   */
  static const char *single_letter_pron_1033[52] =
  {
      /* ---- single letters ---- */
      "EY",                   /* a */
      "B IY",                 /* b */
      "S IY",                 /* c */
      "D IY",                 /* d */
      "IY",                   /* e */
      "EH F",                 /* f */
      "JH IY",                /* g */
      "EY CH",                /* h */
      "AY",                   /* i */
      "JH EY",                /* j */
      "K EY",                 /* k */
      "EH L",                 /* l */
      "EH M",                 /* m */
      "EH N",                 /* n */
      "OW",                   /* o */
      "P IY",                 /* p */
      "K Y UW",               /* q */
      "AA R",                 /* r */
      "EH S",                 /* s */
      "T IY",                 /* t */
      "Y UW",                 /* u */
      "V IY",                 /* v */
      "D AH B AX L Y UW",     /* w */
      "EH K S",               /* x */
      "W AY",                 /* y */
      "Z IY",                 /* z */
      /* ---- plural spellings ---- */
      "EY Z",                 /* a's */
      "B IY Z",               /* b's */
      "S IY Z",               /* c's */
      "D IY Z",               /* d's */
      "IY Z",                 /* e's */
      "EH F S",               /* f's */
      "JH IY Z",              /* g's */
      "EY CH AX Z",           /* h's */
      "AY Z",                 /* i's */
      "JH EY Z",              /* j's */
      "K EY Z",               /* k's */
      "EH L Z",               /* l's */
      "EH M Z",               /* m's */
      "EH N Z",               /* n's */
      "OW Z",                 /* o's */
      "P IY Z",               /* p's */
      "K Y UW Z",             /* q's */
      "AA R Z",               /* r's */
      "EH S AX Z",            /* s's */
      "T IY Z",               /* t's */
      "Y UW Z",               /* u's */
      "V IY Z",               /* v's */
      "D AH B AX L Y UW Z",   /* w's */
      "EH K S AX Z",          /* x's */
      "W AY Z",               /* y's */
      "Z IY Z"                /* z's */
  };
  /*
   * Spelled-out letter pronunciations for LCID 1041.
   * Same layout as the 1033 table: entries 0..25 are 'a'..'z', entries
   * 26..51 are the plural forms in the same order.
   */
  static const char *single_letter_pron_1041[52] =
  {
      /* ---- single letters ---- */
      "EE",                       /* a */
      "B II",                     /* b */
      "SH II",                    /* c */
      "D II",                     /* d */
      "II",                       /* e */
      "E H U",                    /* f */
      "J II",                     /* g */
      "EE CH I",                  /* h */
      "A I",                      /* i */
      "J EE",                     /* j */
      "K EE",                     /* k */
      "E R U",                    /* l */
      "E M U",                    /* m */
      "E N U",                    /* n */
      "OO",                       /* o */
      "P II",                     /* p */
      "K Y UU",                   /* q */
      "AA R U",                   /* r */
      "E S U",                    /* s */
      "T II",                     /* t */
      "Y UU",                     /* u */
      "B U I",                    /* v */
      "D A B U R Y UU",           /* w */
      "E STOP K U S U",           /* x */
      "W A I",                    /* y */
      "Z E STOP T O",             /* z */
      /* ---- plural spellings ---- */
      "EE Z U",                   /* a's */
      "B II Z U",                 /* b's */
      "SH II Z U",                /* c's */
      "D II Z U",                 /* d's */
      "II Z U",                   /* e's */
      "E H U Z U",                /* f's */
      "J II Z U",                 /* g's */
      "EE CH I Z U",              /* h's */
      "A I Z U",                  /* i's */
      "J EE Z U",                 /* j's */
      "K EE Z U",                 /* k's */
      "E R U Z U",                /* l's */
      "E M U Z U",                /* m's */
      "E N U Z U",                /* n's */
      "OO Z U",                   /* o's */
      "P II Z U",                 /* p's */
      "K Y UU Z U",               /* q's */
      "AA R U Z U",               /* r's */
      "E S U Z U",                /* s's */
      "T II Z U",                 /* t's */
      "Y UU Z U",                 /* u's */
      "B U I Z U",                /* v's */
      "D A B U R Y UU Z U",       /* w's */
      "E STOP K U S U Z U",       /* x's */
      "W A I Z U",                /* y's */
      "Z E STOP T O Z U"          /* z's */
  };
  138. /*
  139. * not worthwhile to use binary search with only about 30 entries
  140. */
  141. static int symbol_to_id(LTS_SYMTAB *tab, char *sym)
  142. {
  143. USES_CONVERSION;
  144. SPDBG_FUNC("symbol_to_id");
  145. int i;
  146. for (i = 0; i < tab->n_symbols; i++)
  147. {
  148. if (CSTR_EQUAL == CompareString(MAKELCID(MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US), SORT_DEFAULT), NORM_IGNORECASE,
  149. A2T(tab->storage + tab->sym_idx[i]), -1, A2T(sym), -1))
  150. {
  151. return i;
  152. }
  153. }
  154. return NO_SYMBOL;
  155. } // static int symbol_to_id(LTS_SYMTAB *tab, char *sym)
  156. static char *id_to_symbol(LTS_SYMTAB *tab, int id)
  157. {
  158. SPDBG_FUNC("id_to_symbol");
  159. if (id < 0 || id > tab->n_symbols)
  160. {
  161. return NULL;
  162. }
  163. else
  164. {
  165. return tab->storage + tab->sym_idx[id];
  166. }
  167. } // static char *id_to_symbol(LTS_SYMTAB *tab, int id)
  /*
   * ODS
   *
   * printf-style wrapper around OutputDebugStringA. Compiles to an empty
   * function unless _DEBUG is defined.
   *
   * The message is formatted into a 2048-byte stack buffer. _vsnprintf does
   * not write a terminator when the output fills the buffer, so one byte is
   * held back and the terminator is written explicitly; over-long messages
   * are truncated instead of being read past the end of the buffer.
   */
  __inline void ODS (const char *format, ...)
  {
  #ifdef _DEBUG
      SPDBG_FUNC("ODS");
      char buf[2048];
      va_list arglist;

      va_start(arglist, format);
      _vsnprintf(buf, sizeof(buf) - 1, format, arglist);
      va_end(arglist);

      buf[sizeof(buf) - 1] = '\0'; // guarantee termination on truncation
      OutputDebugStringA(buf);
  #endif
  }
  180. __inline int ans_simp_question (LTS_FEATURE *feat, SIMPLE_QUESTION question,
  181. LTS_SAMPLE *sample)
  182. {
  183. SPDBG_FUNC("ans_simp_question");
  184. SYMBOL id;
  185. int *phones = feat[question.questype].feature[question.feature];
  186. SAMPLE_GET_CONTEXT(sample, question.questype, question.context,
  187. question.offset, id);
  188. return (TST_BIT(phones, id) ? TRUE : FALSE);
  189. } // __inline int ans_simp_question (LTS_FEATURE *feat, SIMPLE_QUESTION question,
  190. static int product_eval (LTS_FEATURE *feat, char *term, LTS_SAMPLE *sample)
  191. {
  192. SPDBG_FUNC("product_eval");
  193. int negate, result;
  194. SIMPLE_QUESTION ques;
  195. char *cptr;
  196. cptr = term;
  197. while (TRUE)
  198. {
  199. /* negation sign */
  200. if (*cptr == '~')
  201. {
  202. negate = TRUE;
  203. cptr++;
  204. }
  205. else
  206. {
  207. negate = FALSE;
  208. }
  209. if (!isdigit(*cptr))
  210. {
  211. //quit (-1, "Invalid product in product_eval\n");
  212. // OutputDebugString("Invalid product in product_eval\n");
  213. return FALSE;
  214. }
  215. for (result = *cptr++ - '0'; isdigit (*cptr); cptr++)
  216. {
  217. result = result * 10 + (*cptr - '0');
  218. }
  219. QUES_DECODE(result, ques.questype, ques.context, ques.offset,
  220. ques.feature);
  221. if ((negate ^ ans_simp_question (feat, ques, sample)) == FALSE)
  222. {
  223. return FALSE;
  224. }
  225. if (*cptr == '\0')
  226. {
  227. break;
  228. }
  229. if (*cptr++ != '&')
  230. {
  231. //quit (-1, "product_eval: syntax error in product term %s\n", term);
  232. /*
  233. char szTemp[512];
  234. sprintf(szTemp, "product_eval: syntax error in product term %s\n", term);
  235. OutputDebugString(szTemp);
  236. */
  237. return FALSE;
  238. }
  239. }
  240. return TRUE;
  241. } // static int product_eval (LTS_FEATURE *feat, char *term, LTS_SAMPLE *sample)
  242. static int ans_comp_question(LTS_FEATURE *feat, char *prod,
  243. LTS_SAMPLE *sample)
  244. {
  245. SPDBG_FUNC("ans_comp_question");
  246. int i, num_products, limit;
  247. char *cptr, string[LONGEST_STR], *products[MAX_PRODUCTS];
  248. strcpy(string, prod);
  249. for (cptr = string, num_products = 1; *cptr != '\0'; cptr++)
  250. {
  251. if (*cptr == '|') num_products++;
  252. }
  253. if (num_products > MAX_PRODUCTS)
  254. {
  255. //quit(1, "please increase MAX_PRODUCTS up to %d at least\n", num_products);
  256. /*
  257. char szTemp[256];
  258. sprintf(szTemp, "please increase MAX_PRODUCTS up to %d at least\n", num_products);
  259. OutputDebugString(szTemp);
  260. */
  261. return FALSE;
  262. }
  263. for (i = 0, limit = num_products -1, cptr = string; ; i++)
  264. {
  265. products[i] = cptr++;
  266. if (i == limit)
  267. {
  268. break;
  269. }
  270. for (; *cptr != '|'; cptr++) {};
  271. *cptr++ = '\0';
  272. }
  273. for (i = 0; i < num_products; i++)
  274. {
  275. if (product_eval (feat, products[i], sample) == TRUE)
  276. {
  277. return TRUE;
  278. }
  279. }
  280. return FALSE;
  281. } // static int ans_comp_question(LTS_FEATURE *feat, char *prod,
  282. static T_NODE *find_leaf(LTS_FEATURE *feat, T_NODE *root, LTS_SAMPLE *sample)
  283. {
  284. SPDBG_FUNC("find_leaf");
  285. if (!root->yes_child)
  286. {
  287. return root;
  288. }
  289. else if (ans_comp_question(feat, root->prod, sample))
  290. {
  291. return find_leaf(feat, root->yes_child, sample);
  292. }
  293. else
  294. {
  295. return find_leaf(feat, root->no_child, sample);
  296. }
  297. } // static T_NODE *find_leaf(LTS_FEATURE *feat, T_NODE *root, LTS_SAMPLE *sample)
  298. static int lts_product_eval (LTS_FEATURE *feat, LTS_PROD *term,
  299. LTS_SAMPLE *sample, LTS_PROD **next)
  300. {
  301. SPDBG_FUNC("lts_product_eval");
  302. int negate, result;
  303. SIMPLE_QUESTION ques;
  304. LTS_PROD *cptr = term;
  305. while (TRUE)
  306. {
  307. if ((*cptr) & PROD_NEG)
  308. {
  309. negate = TRUE;
  310. result = (*cptr) ^ PROD_NEG;
  311. }
  312. else
  313. {
  314. negate = FALSE;
  315. result = (*cptr);
  316. }
  317. QUES_DECODE(result, ques.questype, ques.context, ques.offset,
  318. ques.feature);
  319. if ((negate ^ ans_simp_question (feat, ques, sample)) == FALSE)
  320. {
  321. while (*cptr != PROD_TERM && *cptr != QUES_TERM)
  322. {
  323. cptr++;
  324. }
  325. if (*cptr == QUES_TERM)
  326. {
  327. *next = NULL;
  328. }
  329. else
  330. {
  331. *next = cptr + 1;
  332. }
  333. return FALSE;
  334. }
  335. cptr++;
  336. if (*cptr == QUES_TERM)
  337. {
  338. *next = NULL;
  339. break;
  340. }
  341. else if (*cptr == PROD_TERM)
  342. {
  343. *next = cptr + 1;
  344. break;
  345. }
  346. }
  347. return TRUE;
  348. } // static int lts_product_eval (LTS_FEATURE *feat, LTS_PROD *term,
  349. static int lts_ans_comp_question(LTS_TREE UNALIGNED *tree, LTS_FEATURE *feat,
  350. int idx, LTS_SAMPLE *sample)
  351. {
  352. SPDBG_FUNC("lts_ans_comp_question");
  353. LTS_PROD *next, *term = (LTS_PROD *) ((char *) tree->p_prod + idx);
  354. while (TRUE)
  355. {
  356. if (lts_product_eval (feat, term, sample, &next) == TRUE)
  357. {
  358. return TRUE;
  359. }
  360. if (next == NULL)
  361. {
  362. break;
  363. }
  364. term = next;
  365. }
  366. return FALSE;
  367. } // static int lts_ans_comp_question(LTS_TREE *tree, LTS_FEATURE *feat,
  368. static LTS_NODE *lts_find_leaf(LTS_TREE UNALIGNED *tree, LTS_FEATURE *feat,
  369. LTS_NODE *root, LTS_SAMPLE *sample)
  370. {
  371. SPDBG_FUNC("lts_find_leaf");
  372. if (IS_LEAF_NODE(root))
  373. {
  374. return root;
  375. }
  376. else if (lts_ans_comp_question(tree, feat, ((LTS_NODE UNALIGNED *)root)->idx, sample))
  377. {
  378. return lts_find_leaf(tree, feat, root + ((LTS_NODE UNALIGNED *)root)->yes, sample);
  379. }
  380. else
  381. {
  382. return lts_find_leaf(tree, feat, root + ((LTS_NODE UNALIGNED *)root)->yes + 1, sample);
  383. }
  384. } // static LTS_NODE *lts_find_leaf(LTS_TREE *tree, LTS_FEATURE *feat,
  385. static LTS_DIST *lts_find_leaf_count(LTS_FOREST *l_forest, SYMBOL *pIn,
  386. SYMBOL *pOut)
  387. {
  388. SPDBG_FUNC("lts_find_leaf_count");
  389. LTS_TREE UNALIGNED *tree = l_forest->tree[*pIn];
  390. LTS_NODE UNALIGNED *leaf;
  391. LTS_SAMPLE sample;
  392. /*
  393. * construct a sample in order to share all the code with training
  394. */
  395. sample.pIn = pIn;
  396. sample.pOut = pOut;
  397. /* *pOut cannot be NULL_SYMBOL_ID */
  398. *pOut = NULL_SYMBOL_ID + 1;
  399. leaf = lts_find_leaf(tree, l_forest->features, &(tree->nodes[0]), &sample);
  400. return (LTS_DIST *) ((char *)tree->p_dist + leaf->idx);
  401. } // static LTS_DIST *lts_find_leaf_count(LTS_FOREST *l_forest, SYMBOL *pIn,
  402. static LTS_OUT_RESULT *allocate_out_result(LTS_FOREST *l_forest)
  403. {
  404. SPDBG_FUNC("allocate_out_result");
  405. LTS_OUT_RESULT *res = new LTS_OUT_RESULT;
  406. if (res)
  407. {
  408. res->out_strings = new LTS_OUT_STRING *[MAX_ALT_STRINGS];
  409. if (res->out_strings)
  410. {
  411. res->num_allocated_strings = MAX_ALT_STRINGS;
  412. res->num_strings = 0;
  413. }
  414. else
  415. {
  416. delete res;
  417. res = NULL;
  418. }
  419. }
  420. return res;
  421. } // static LTS_OUT_RESULT *allocate_out_result(LTS_FOREST *l_forest)
  422. static void free_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res)
  423. {
  424. SPDBG_FUNC("free_out_result");
  425. int i;
  426. for (i = 0; i < res->num_strings; i++)
  427. {
  428. delete res->out_strings[i];
  429. }
  430. if (res->num_allocated_strings == MAX_ALT_STRINGS)
  431. {
  432. delete res->out_strings;
  433. }
  434. else
  435. {
  436. free(res->out_strings); /* dirty */
  437. }
  438. delete res;
  439. } // static void free_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res)
  440. static bool reallocate_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res,
  441. int min)
  442. {
  443. SPDBG_FUNC("reallocate_out_result");
  444. int s = res->num_allocated_strings, old_size = s;
  445. LTS_OUT_STRING **p;
  446. while (s < min)
  447. s += INC_ALT_STRINGS;
  448. p = res->out_strings;
  449. res->out_strings = (LTS_OUT_STRING **)
  450. calloc(s, sizeof(LTS_OUT_STRING *));
  451. if (!res->out_strings)
  452. {
  453. return false;
  454. }
  455. memcpy(res->out_strings, p, old_size * sizeof(LTS_OUT_STRING *));
  456. if (old_size == MAX_ALT_STRINGS)
  457. {
  458. delete p;
  459. }
  460. else
  461. {
  462. free(p);
  463. }
  464. res->num_allocated_strings = s;
  465. ODS("increased out_strings to %d in order to meet %d\n", s, min);
  466. return true;
  467. } // static void reallocate_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res,
  468. static bool grow_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res,
  469. SYMBOL i, int count, float inv_sum,
  470. LTS_OUT_RESULT *tmpRes)
  471. {
  472. SPDBG_FUNC("grow_out_result");
  473. int j;
  474. if (res->num_strings + tmpRes->num_strings >= res->num_allocated_strings)
  475. {
  476. if (!reallocate_out_result(l_forest, res,
  477. res->num_strings + tmpRes->num_strings))
  478. {
  479. return false;
  480. }
  481. }
  482. for (j = 0; j < tmpRes->num_strings; j++)
  483. {
  484. SYMBOL *psrc = tmpRes->out_strings[j]->psym;
  485. SYMBOL *ptgt;
  486. res->out_strings[res->num_strings + j] = new LTS_OUT_STRING;
  487. if (!res->out_strings)
  488. {
  489. return false;
  490. }
  491. ptgt = res->out_strings[res->num_strings + j]->psym;
  492. *ptgt++ = i;
  493. while (*psrc != NULL_SYMBOL_ID)
  494. {
  495. *ptgt++ = *psrc++;
  496. }
  497. *ptgt++ = NULL_SYMBOL_ID;
  498. res->out_strings[res->num_strings + j]->prob = count * inv_sum *
  499. tmpRes->out_strings[j]->prob;
  500. }
  501. res->num_strings += tmpRes->num_strings;
  502. free_out_result(l_forest, tmpRes);
  503. return true;
  504. } // static void grow_out_result(LTS_FOREST *l_forest, LTS_OUT_RESULT *res,
  505. static LTS_OUT_RESULT *gen_one_output(LTS_FOREST *l_forest, int len,
  506. SYMBOL *input_id, int in_index,
  507. SYMBOL *output_id, float cutoff)
  508. {
  509. SPDBG_FUNC("gen_one_output");
  510. SYMBOL out[SP_MAX_WORD_LENGTH], *pOut;
  511. LTS_OUT_RESULT *res = allocate_out_result(l_forest);
  512. if (!res)
  513. {
  514. return NULL;
  515. }
  516. int sum, i, dim;
  517. LTS_DIST UNALIGNED *pdf;
  518. LTS_PAIR UNALIGNED *l_pair, *lp;
  519. float cut, inv_sum;
  520. /*
  521. * copy output_id to local
  522. */
  523. SYMBOL *psrc = output_id - 1, *ptgt = out;
  524. while (*psrc != NULL_SYMBOL_ID) psrc--;
  525. while (psrc != output_id)
  526. *ptgt++ = *psrc++;
  527. pOut = ptgt;
  528. /* sanity check */
  529. if (pOut - out != in_index + 1)
  530. {
  531. // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  532. int *z=0;
  533. z[0]=z[1];
  534. }
  535. if (in_index == len - 1)
  536. {
  537. pdf = lts_find_leaf_count(l_forest, input_id + in_index, pOut);
  538. l_pair = &(pdf->p_pair);
  539. dim = pdf->c_dists;
  540. for (lp = l_pair, sum = 0, i = 0; i < dim; i++, lp++)
  541. {
  542. sum += lp->cnt;
  543. }
  544. SPDBG_ASSERT(sum > 0);
  545. inv_sum = 1.0f / sum;
  546. cut = cutoff * sum;
  547. for (lp = l_pair, i = 0; i < dim; i++, lp++)
  548. {
  549. if ((float)(lp->cnt) > cut)
  550. {
  551. res->out_strings[res->num_strings] = new LTS_OUT_STRING;
  552. if (NULL == res->out_strings[res->num_strings])
  553. {
  554. return NULL;
  555. }
  556. res->out_strings[res->num_strings]->psym[0] = (SYMBOL) lp->id;
  557. res->out_strings[res->num_strings]->psym[1] = NULL_SYMBOL_ID;
  558. res->out_strings[res->num_strings]->prob = lp->cnt * inv_sum;
  559. res->num_strings++;
  560. } /* cut */
  561. }
  562. }
  563. else
  564. {
  565. LTS_OUT_RESULT *tmpRes;
  566. pdf = lts_find_leaf_count(l_forest, input_id + in_index, pOut);
  567. dim = pdf->c_dists;
  568. l_pair = &(pdf->p_pair);
  569. for (lp = l_pair, sum = 0, i = 0; i < dim; i++, lp++)
  570. {
  571. sum += lp->cnt;
  572. }
  573. SPDBG_ASSERT(sum > 0);
  574. inv_sum = 1.0f / sum;
  575. cut = cutoff * sum;
  576. for (lp = l_pair, i = 0; i < dim; i++, lp++)
  577. {
  578. if ((float)(lp->cnt) > cut)
  579. {
  580. SYMBOL *pTmpOut = pOut + 1;
  581. *pOut = (SYMBOL) lp->id;
  582. tmpRes = gen_one_output(l_forest, len, input_id, in_index + 1, pTmpOut, cutoff);
  583. if (!tmpRes)
  584. {
  585. return NULL;
  586. }
  587. if (!grow_out_result(l_forest, res, (SYMBOL)(lp->id), lp->cnt,
  588. inv_sum, tmpRes))
  589. {
  590. return NULL;
  591. }
  592. }
  593. } /* i */
  594. } /* else */
  595. return res;
  596. } // static LTS_OUT_RESULT *gen_one_output(LTS_FOREST *l_forest, int len,
  597. static int comp_out_result_prob(const void *vp1, const void *vp2)
  598. {
  599. SPDBG_FUNC("comp_out_result_prob");
  600. LTS_OUT_STRING **p1 = (LTS_OUT_STRING **) vp1,
  601. **p2 = (LTS_OUT_STRING **) vp2;
  602. if ((*p1)->prob > (*p2)->prob)
  603. {
  604. return -1;
  605. }
  606. else if ((*p1)->prob < (*p2)->prob)
  607. {
  608. return 1;
  609. }
  610. else
  611. {
  612. return 0;
  613. }
  614. } // static int comp_out_result_prob(const void *vp1, const void *vp2)
  615. static void lts_fill_out_buffer(LTS_FOREST *l_forest, LTS_OUT_RESULT *out,
  616. char *word)
  617. {
  618. SPDBG_FUNC("lts_fill_out_buffer");
  619. int i, j, n;
  620. float inv_sum, sum = 0.0f;
  621. char phnstr[LONGEST_STR];
  622. char *tmp;
  623. LTS_SYMTAB *tab = l_forest->symbols;
  624. if (out == NULL)
  625. {
  626. return;
  627. }
  628. if (word)
  629. {
  630. strcpy(l_forest->out.word, word);
  631. }
  632. else
  633. {
  634. l_forest->out.word[0] = 0;
  635. }
  636. /* normalize probabilities */
  637. for (i = 0; i < out->num_strings; i++)
  638. {
  639. sum += out->out_strings[i]->prob;
  640. }
  641. inv_sum = 1.0f / sum;
  642. for (i = 0; i < out->num_strings; i++)
  643. {
  644. out->out_strings[i]->prob *= inv_sum;
  645. }
  646. /*
  647. * sort them according to the prob field
  648. */
  649. qsort(out->out_strings, out->num_strings, sizeof(LTS_OUT_STRING *),
  650. &comp_out_result_prob);
  651. if (out->num_strings > MAX_OUTPUT_STRINGS - l_forest->out.num_prons)
  652. {
  653. n = MAX_OUTPUT_STRINGS - l_forest->out.num_prons;
  654. for (sum = 0.0f, i = 0; i < n; i++)
  655. {
  656. sum += out->out_strings[i]->prob;
  657. }
  658. inv_sum = 1.0f / sum;
  659. for (i = 0; i < n; i++)
  660. {
  661. out->out_strings[i]->prob *= inv_sum;
  662. }
  663. }
  664. else
  665. {
  666. n = out->num_strings;
  667. }
  668. for (j = l_forest->out.num_prons, i = 0; i < n; i++)
  669. {
  670. SYMBOL *p = out->out_strings[i]->psym;
  671. char *psrc, *ptgt;
  672. if (out->out_strings[i]->prob < MIN_OUT_PROB)
  673. {
  674. continue;
  675. }
  676. phnstr[0] = 0;
  677. l_forest->out.pron[j].prob = out->out_strings[i]->prob;
  678. while (*p != NULL_SYMBOL_ID)
  679. {
  680. tmp = id_to_symbol(&(tab[OUTPUT]), *p++);
  681. SPDBG_ASSERT(tmp);
  682. if (tmp)
  683. {
  684. strcat(phnstr, tmp);
  685. strcat(phnstr, " ");
  686. }
  687. }
  688. psrc = phnstr;
  689. ptgt = l_forest->out.pron[j].pstr;
  690. while (*psrc)
  691. {
  692. if (*psrc != '#' && *psrc != '_')
  693. {
  694. *ptgt++ = *psrc++;
  695. }
  696. else if (*psrc == '_')
  697. {
  698. *ptgt++ = ' ';
  699. psrc++;
  700. }
  701. else
  702. {
  703. psrc += 2; /* skip an extra space */
  704. }
  705. /* extreme case, truncate it */
  706. if (ptgt - l_forest->out.pron[j].pstr >= SP_MAX_PRON_LENGTH)
  707. {
  708. for (ptgt--; !isspace(*ptgt); ptgt--) {}; /* never output partial phone */
  709. ptgt++;
  710. break;
  711. }
  712. }
  713. // output could contain only '# '
  714. if (ptgt > l_forest->out.pron[j].pstr && *(ptgt - 1) == ' ')
  715. {
  716. *(ptgt - 1) = 0; /* remove the last space */
  717. }
  718. else
  719. {
  720. *ptgt = 0; /* shouldn't happen unless ptgt didn't move */
  721. }
  722. if (ptgt > l_forest->out.pron[j].pstr)
  723. {
  724. j++;
  725. }
  726. } /* i */
  727. if (j <= MAX_OUTPUT_STRINGS)
  728. {
  729. l_forest->out.num_prons = j;
  730. }
  731. else
  732. {
  733. l_forest->out.num_prons = MAX_OUTPUT_STRINGS; // should never happen
  734. }
  735. free_out_result(l_forest, out);
  736. } // static void lts_fill_out_buffer(LTS_FOREST *l_forest, LTS_OUT_RESULT *out,
  737. void assign_a_fixed_pron(LTS_OUTPUT *out, const char *pron, char *word)
  738. {
  739. SPDBG_FUNC("assign_a_fixed_pron");
  740. out->num_prons = 1;
  741. strcpy(out->word, word);
  742. out->pron[0].prob = 1.0f;
  743. if (strlen(pron) < SP_MAX_PRON_LENGTH)
  744. {
  745. strcpy(out->pron[0].pstr, pron);
  746. }
  747. else
  748. {
  749. char *p;
  750. strncpy(out->pron[0].pstr, pron, SP_MAX_PRON_LENGTH);
  751. p = &(out->pron[0].pstr[SP_MAX_PRON_LENGTH - 1]);
  752. while (!isspace(*p))
  753. {
  754. p--; /* truncate the last partial phoneme */
  755. }
  756. *p = 0;
  757. }
  758. } // void assign_a_fixed_pron(LTS_OUTPUT *out, char *pron, char *word)
  759. inline BOOL IsCharInRangeA(int ch, int chMin, int chMax)
  760. {
  761. return (unsigned)(ch - chMin) <= (unsigned)(chMax - chMin);
  762. }
  763. void assign_a_spelling_pron(LTS_OUTPUT *out, const char * single_letter_pron[52], char *word)
  764. {
  765. SPDBG_FUNC("assign_a_spelling_pron");
  766. char *p;
  767. int cchPron = 0;
  768. strcpy(out->word, word);
  769. if (ispunct(*word))
  770. {
  771. p = word + 1;
  772. }
  773. else
  774. {
  775. p = word;
  776. }
  777. out->num_prons = 1;
  778. out->pron[0].prob = 1.0f;
  779. out->pron[0].pstr[0] = 0;
  780. char * pchPron = out->pron[0].pstr;
  781. while (*p)
  782. {
  783. int cPOffset = 0; // 0 for single letter, 26 for plurals
  784. int c = *p++;
  785. // Lowercaseify, and skip over non-letters
  786. if (IsCharInRangeA(c, 'A', 'Z'))
  787. {
  788. c += 'a' - 'A';
  789. }
  790. else if (!IsCharInRangeA(c, 'a', 'z'))
  791. {
  792. continue;
  793. }
  794. // Check if the next two characters are 'S (apostrophe S). Include the following cases: words ending in 's 'S s' S'
  795. // If they are we use a the plural pronunciation for the letter and skip over the letter and 'S
  796. if ((p[0] == '\'') && ((0 == p[1] && 's' == c) || 's' == p[1] || 'S' == p[1]))
  797. {
  798. cPOffset = 26;
  799. p += p[1] ? 1 : 0; // skip 'S
  800. }
  801. // Make sure the string isn't too long accounting for the new phone and seperator
  802. const char * const pchPronT = single_letter_pron[cPOffset + c - 'a'];
  803. const int cchPronT = strlen(pchPronT);
  804. if ((cchPron + 1 + cchPronT) < (SP_MAX_PRON_LENGTH - 1)) // +1 for separating space, -1 for terminating NUL
  805. {
  806. strcpy(pchPron + cchPron, pchPronT);
  807. cchPron += cchPronT;
  808. pchPron[cchPron++] = ' ';
  809. }
  810. else
  811. {
  812. break;
  813. }
  814. }
  815. if (cchPron)
  816. {
  817. pchPron[cchPron - 1] = 0; // trim trailing space char
  818. }
  819. }
  820. HRESULT LtscartGetPron(LTS_FOREST *l_forest, char *word, LTS_OUTPUT **ppLtsOutput)
  821. {
  822. SPDBG_FUNC("LtscartGetPron");
  823. HRESULT hr = S_OK;
  824. LTS_OUT_RESULT *pres = NULL;
  825. char *p, *base;
  826. SYMBOL buffer[LONGEST_STR], *pbuf = buffer + 1;
  827. int len, id, hasvowel = 0, allcapital = 1;
  828. l_forest->out.num_prons = 0;
  829. buffer[0] = NULL_SYMBOL_ID;
  830. len = 0;
  831. if (word == NULL || (base = strtok(word, " \t\n")) == NULL)
  832. {
  833. assign_a_fixed_pron(&(l_forest->out), l_forest->bogus_pron, "NUL");
  834. *ppLtsOutput = &(l_forest->out);
  835. return S_FALSE;
  836. }
  837. else
  838. {
  839. base = strtok(word, " \t\n");
  840. if (ispunct(*base))
  841. {
  842. for (p = base; *p && ispunct(*p); p++) {};
  843. }
  844. else
  845. {
  846. p = base;
  847. }
  848. }
  849. char ach[2];
  850. ach[1] = 0;
  851. while (*p)
  852. {
  853. const int d = *p++;
  854. const int c = tolower(d);
  855. if (!hasvowel && (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' || c == 'y'))
  856. {
  857. hasvowel = 1;
  858. }
  859. if (allcapital && d == c)
  860. {
  861. allcapital = 0;
  862. }
  863. ach[0] = (char)c;
  864. if ((id = symbol_to_id (&(l_forest->symbols[INPUT]), ach)) == NO_SYMBOL || id == NULL_SYMBOL_ID)
  865. {
  866. ODS("cannot find the symbol %c, skip!\n", c);
  867. continue;
  868. }
  869. pbuf[len++] = (SYMBOL) id;
  870. }
  871. pbuf[len] = NULL_SYMBOL_ID;
  872. if (len >= SP_MAX_WORD_LENGTH || len <= 0)
  873. {
  874. // fill in bogus pron below
  875. }
  876. else if (len == 1)
  877. {
  878. LTS_SYMTAB *tab = l_forest->symbols;
  879. char *p = id_to_symbol(&(tab[INPUT]), pbuf[0]);
  880. int c = tolower(p[0]);
  881. if (c >= 'a' && c <= 'z')
  882. {
  883. assign_a_fixed_pron(&(l_forest->out), l_forest->single_letter_pron[c - 'a'], word);
  884. }
  885. }
  886. else if (!hasvowel)
  887. {
  888. assign_a_spelling_pron(&(l_forest->out), l_forest->single_letter_pron, word);
  889. }
  890. else
  891. {
  892. if (allcapital)
  893. {
  894. assign_a_spelling_pron(&(l_forest->out), l_forest->single_letter_pron, word);
  895. }
  896. pres = gen_one_output(l_forest, len, pbuf, 0, pbuf, DEFAULT_PRUNE);
  897. if (!pres)
  898. {
  899. return E_OUTOFMEMORY;
  900. }
  901. lts_fill_out_buffer(l_forest, pres, word);
  902. }
  903. if (l_forest->out.num_prons == 0)
  904. {
  905. hr = S_FALSE;
  906. assign_a_fixed_pron(&(l_forest->out), l_forest->bogus_pron, word);
  907. }
  908. *ppLtsOutput = &(l_forest->out);
  909. SPDBG_RETURN(hr);
  910. } /* LtscartGetPron */
  911. LTS_FOREST *LtscartReadData (LCID lcid, PBYTE map_addr)
  912. {
  913. SPDBG_FUNC("LtscartReadData");
  914. int i;
  915. LTS_FOREST *l_forest;
  916. LTS_SYMTAB *tab;
  917. LTS_FEATURE *feat;
  918. int output = 0;
  919. l_forest = (LTS_FOREST *) calloc(1, sizeof(LTS_FOREST));
  920. if (!l_forest)
  921. {
  922. return NULL;
  923. }
  924. if (lcid == 1033)
  925. {
  926. l_forest->bogus_pron = bogus_pron_1033;
  927. l_forest->single_letter_pron = single_letter_pron_1033;
  928. }
  929. else if (lcid == 1041)
  930. {
  931. l_forest->bogus_pron = bogus_pron_1041;
  932. l_forest->single_letter_pron = single_letter_pron_1041;
  933. }
  934. else
  935. {
  936. return NULL;
  937. }
  938. //read in the symbol table
  939. l_forest->symbols = (LTS_SYMTAB *) calloc(2, sizeof(LTS_SYMTAB));
  940. if (!l_forest->symbols)
  941. {
  942. return NULL;
  943. }
  944. tab = &(l_forest->symbols[INPUT]);
  945. CopyMemory(&(tab->n_symbols), map_addr + output, sizeof(int));
  946. output += sizeof(int);
  947. tab->sym_idx = (int *)(map_addr + output);
  948. output += tab->n_symbols * sizeof(int);
  949. CopyMemory(&(tab->n_bytes), map_addr + output, sizeof(int));
  950. output += sizeof(int);
  951. tab->storage = (char*)(map_addr + output);
  952. output += tab->n_bytes * sizeof(char);
  953. tab = &(l_forest->symbols[OUTPUT]);
  954. CopyMemory(&(tab->n_symbols), map_addr + output, sizeof(int));
  955. output += sizeof(int);
  956. tab->sym_idx = (int*)(map_addr + output);
  957. output += tab->n_symbols * sizeof(int);
  958. CopyMemory(&(tab->n_bytes), map_addr + output, sizeof(int));
  959. output += sizeof(int);
  960. tab->storage = (char*)(map_addr + output);
  961. output += tab->n_bytes * sizeof(char);
  962. // read in the feature vector
  963. l_forest->features = (LTS_FEATURE *) calloc(2, sizeof(LTS_FEATURE));
  964. if (!l_forest->features)
  965. {
  966. return NULL;
  967. }
  968. feat = &(l_forest->features[INPUT]);
  969. CopyMemory(&(feat->n_feat), map_addr + output, sizeof(int));
  970. output += sizeof(int);
  971. CopyMemory(&(feat->dim), map_addr + output, sizeof(int));
  972. output += sizeof(int);
  973. feat->feature = (int **) calloc(feat->n_feat, sizeof(int *));
  974. if (!feat->feature)
  975. {
  976. return NULL;
  977. }
  978. for (i = 0; i < feat->n_feat; i++)
  979. {
  980. feat->feature[i] = (int*)(map_addr + output);
  981. output += feat->dim * sizeof(int);
  982. }
  983. feat = &(l_forest->features[OUTPUT]);
  984. CopyMemory(&(feat->n_feat), map_addr + output, sizeof(int));
  985. output += sizeof(int);
  986. CopyMemory(&(feat->dim), map_addr + output, sizeof(int));
  987. output += sizeof(int);
  988. feat->feature = (int **) calloc(feat->n_feat, sizeof(int *));
  989. if (!feat->feature)
  990. {
  991. return NULL;
  992. }
  993. for (i = 0; i < feat->n_feat; i++)
  994. {
  995. feat->feature[i] = (int*)(map_addr + output);
  996. output += feat->dim * sizeof(int);
  997. }
  998. /*
  999. * read in the tree
  1000. */
  1001. l_forest->tree = (LTS_TREE **) calloc(l_forest->symbols[INPUT].n_symbols,
  1002. sizeof(LTS_TREE *));
  1003. if (!l_forest->tree)
  1004. {
  1005. return NULL;
  1006. }
  1007. for (i = 1; i < l_forest->symbols[INPUT].n_symbols; i++)
  1008. {
  1009. LTS_TREE *l_root;
  1010. l_forest->tree[i] = l_root = (LTS_TREE *) calloc(1, sizeof(LTS_TREE));
  1011. if (!l_root)
  1012. {
  1013. return NULL;
  1014. }
  1015. CopyMemory(&(l_root->n_nodes), map_addr + output, sizeof(int));
  1016. output += sizeof(int);
  1017. l_root->nodes = (LTS_NODE*)(map_addr + output);
  1018. output += l_root->n_nodes * sizeof(LTS_NODE);
  1019. CopyMemory(&(l_root->size_dist), map_addr + output, sizeof(int));
  1020. output += sizeof(int);
  1021. l_root->p_dist = (LTS_DIST*)(map_addr + output);
  1022. output += l_root->size_dist * sizeof(char);
  1023. CopyMemory(&(l_root->size_prod), map_addr + output, sizeof(int));
  1024. output += sizeof(int);
  1025. if (l_root->size_prod > 0)
  1026. {
  1027. l_root->p_prod = (LTS_PROD*)(map_addr + output);
  1028. output += l_root->size_prod * sizeof(char);
  1029. }
  1030. }
  1031. return l_forest;
  1032. } // LTS_FOREST *LtscartReadData(char *forest_image, HANDLE *hFile1,
  1033. void LtscartFreeData(LTS_FOREST *l_forest)
  1034. {
  1035. SPDBG_FUNC("LtscartFreeData");
  1036. for (int i = 1; i < l_forest->symbols[INPUT].n_symbols; i++)
  1037. {
  1038. free(l_forest->tree[i]);
  1039. }
  1040. free(l_forest->tree);
  1041. free(l_forest->features[INPUT].feature);
  1042. free(l_forest->features[OUTPUT].feature);
  1043. free(l_forest->features);
  1044. free(l_forest->symbols);
  1045. free(l_forest);
  1046. } // void LtscartFreeData(LTS_FOREST *l_forest, HANDLE m_hFile,