Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1000 lines
31 KiB

  1. /******************************************************************************
  2. * StdSentEnum.h *
  3. *---------------*
  4. * This is the header file for the CStdSentEnum implementation.
  5. *------------------------------------------------------------------------------
  6. * Copyright (C) 1999 Microsoft Corporation Date: 03/01/99
  7. * All Rights Reserved
  8. *
  9. *********************************************************************** EDC ***/
  10. #ifndef StdSentEnum_h
  11. #define StdSentEnum_h
  12. //--- Additional includes
  13. #include "stdafx.h"
  14. #include "ms_entropicengine.h"
  15. #include "resource.h"
  16. #include "SentItemMemory.h"
  17. #include "morph.h"
  18. #include "TTSPropertiesDialog.h"
  19. //=== Constants ====================================================
  20. //--- Vowel WCHAR values - used to disambiguate pronunciations of certain words
  21. const WCHAR g_Vowels[] =
  22. {
  23. 0x0a, // AA
  24. 0x0b, // AE
  25. 0x0c, // AH
  26. 0x0d, // AO
  27. 0x0e, // AW
  28. 0x0f, // AX
  29. 0x10, // AY
  30. 0x15, // EH
  31. 0x16, // ER
  32. 0x17, // EY
  33. 0x1b, // IH
  34. 0x1c, // IY
  35. 0x23, // OW
  36. 0x24, // OY
  37. 0x2a, // UH
  38. 0x2b, // UW
  39. };
  40. //--- Normalization constants - see NormData.cpp
  41. extern const char g_pFlagCharacter;
  42. extern const unsigned char g_AnsiToAscii[256];
  43. extern const SPLSTR g_O;
  44. extern const SPLSTR g_negative;
  45. extern const SPLSTR g_decimalpoint;
  46. extern const SPLSTR g_to;
  47. extern const SPLSTR g_a;
  48. extern const SPLSTR g_of;
  49. extern const SPLSTR g_percent;
  50. extern const SPLSTR g_degree;
  51. extern const SPLSTR g_degrees;
  52. extern const SPLSTR g_squared;
  53. extern const SPLSTR g_cubed;
  54. extern const SPLSTR g_ones[10];
  55. extern const SPLSTR g_tens[10];
  56. extern const SPLSTR g_teens[10];
  57. extern const SPLSTR g_onesOrdinal[10];
  58. extern const SPLSTR g_tensOrdinal[10];
  59. extern const SPLSTR g_teensOrdinal[10];
  60. extern const SPLSTR g_quantifiers[6];
  61. extern const SPLSTR g_quantifiersOrdinal[6];
  62. extern const SPLSTR g_dash;
  63. extern WCHAR g_Euro[2];
  64. struct CurrencySign
  65. {
  66. SPLSTR Sign;
  67. SPLSTR MainUnit;
  68. SPLSTR SecondaryUnit;
  69. };
  70. struct StateStruct
  71. {
  72. SPLSTR Abbreviation;
  73. SPLSTR FullName;
  74. };
  75. extern const StateStruct g_StateAbbreviations[63];
  76. extern const CurrencySign g_CurrencySigns[14];
  77. extern const SPLSTR g_SingularPrimaryCurrencySigns[14];
  78. extern const SPLSTR g_SingularSecondaryCurrencySigns[14];
  79. extern const WCHAR g_DateDelimiters[3];
  80. extern const SPLSTR g_months[12];
  81. extern const SPLSTR g_monthAbbreviations[13];
  82. extern const SPLSTR g_days[7];
  83. extern const SPLSTR g_dayAbbreviations[10];
  84. extern const SPLSTR g_Area;
  85. extern const SPLSTR g_Country;
  86. extern const SPLSTR g_Code;
  87. extern const SPLSTR g_Half;
  88. extern const SPLSTR g_Tenths;
  89. extern const SPLSTR g_Sixteenths;
  90. extern const SPLSTR g_Hundredths;
  91. extern const SPLSTR g_Over;
  92. extern const SPLSTR g_PluralDenominators[10];
  93. extern const SPLSTR g_A;
  94. extern const SPLSTR g_M;
  95. extern const SPLSTR g_P;
  96. extern const SPLSTR g_OClock;
  97. extern const SPLSTR g_hundred;
  98. extern const SPLSTR g_hour;
  99. extern const SPLSTR g_hours;
  100. extern const SPLSTR g_minute;
  101. extern const SPLSTR g_minutes;
  102. extern const SPLSTR g_second;
  103. extern const SPLSTR g_seconds;
  104. extern const SPLSTR g_ANSICharacterProns[256];
  105. extern const SPVSTATE g_DefaultXMLState;
  106. extern const SPLSTR g_And;
  107. extern const SPLSTR g_comma;
  108. extern const SPLSTR g_period;
  109. extern const SPLSTR g_periodString;
  110. extern const SPLSTR g_slash;
  111. extern const SPLSTR g_Decades[];
  112. extern const SPLSTR g_Zeroes;
  113. extern const SPLSTR g_Hundreds;
  114. #define DAYMAX 31
  115. #define DAYMIN 1
  116. #define MONTHMAX 12
  117. #define MONTHMIN 1
  118. #define YEARMAX 9999
  119. #define YEARMIN 0
  120. #define HOURMIN 1
  121. #define HOURMAX 23
  122. #define MINUTEMIN 0
  123. #define MINUTEMAX 59
  124. #define SECONDMIN 0
  125. #define SECONDMAX 59
  126. //--- POS Tagger Constants - see MiscData.cpp
  127. typedef enum TEMPLATETYPE
  128. {
  129. PREV1T,
  130. NEXT1T,
  131. PREV2T,
  132. NEXT2T,
  133. PREV1OR2T,
  134. NEXT1OR2T,
  135. PREV1OR2OR3T,
  136. NEXT1OR2OR3T,
  137. PREV1TNEXT1T,
  138. PREV1TNEXT2T,
  139. PREV2TNEXT1T,
  140. NOTCAP,
  141. CAP,
  142. PREVNOTCAP,
  143. PREVCAP,
  144. PREV1W,
  145. NEXT1W,
  146. PREV2W,
  147. NEXT2W,
  148. PREV1OR2W,
  149. NEXT1OR2W,
  150. CURRWPREV1W,
  151. CURRWNEXT1W,
  152. CURRWPREV1T,
  153. CURRWNEXT1T,
  154. CURRW,
  155. PREV1WT,
  156. NEXT1WT,
  157. CURRWPREV1WT,
  158. CURRWNEXT1WT
  159. } TEMPLATETYPE;
  160. struct BrillPatch
  161. {
  162. ENGPARTOFSPEECH eCurrentPOS;
  163. ENGPARTOFSPEECH eConvertToPOS;
  164. TEMPLATETYPE eTemplateType;
  165. ENGPARTOFSPEECH eTemplatePOS1;
  166. ENGPARTOFSPEECH eTemplatePOS2;
  167. const WCHAR* pTemplateWord1;
  168. const WCHAR* pTemplateWord2;
  169. };
  170. extern const BrillPatch g_POSTaggerPatches [62];
  171. //=== Class, Enum, Struct and Union Declarations ===================
  172. typedef CSPList<TTSWord,TTSWord&> CWordList;
  173. typedef CSPList<TTSSentItem,TTSSentItem&> CItemList;
  174. //--- Structs used for normalization
  175. typedef enum
  176. {
  177. PRECEDING,
  178. FOLLOWING,
  179. UNATTACHED
  180. } NORM_POSITION;
  181. struct NumberGroup
  182. {
  183. BOOL fOnes; // "one" through "nineteen"
  184. BOOL fTens; // "twenty" through "ninety"
  185. BOOL fHundreds; // "one hundred" through "nine hundred"
  186. BOOL fQuantifier; // "thousand" through "quadrillion"
  187. };
  188. struct TTSIntegerItemInfo
  189. {
  190. long lNumGroups;
  191. NumberGroup Groups[6];
  192. BOOL fOrdinal;
  193. BOOL fDigitByDigit;
  194. ULONG ulNumDigits;
  195. //--- Normalization internal only
  196. long lLeftOver;
  197. BOOL fSeparators;
  198. const WCHAR* pStartChar;
  199. const WCHAR* pEndChar;
  200. };
  201. struct TTSDigitsItemInfo : TTSItemInfo
  202. {
  203. const WCHAR* pFirstDigit;
  204. ULONG ulNumDigits;
  205. };
  206. struct TTSNumberItemInfo;
  207. struct TTSFractionItemInfo
  208. {
  209. BOOL fIsStandard;
  210. TTSNumberItemInfo* pNumerator;
  211. TTSNumberItemInfo* pDenominator;
  212. //--- Normalization internal only
  213. const WCHAR* pVulgar;
  214. };
  215. struct TTSNumberItemInfo : TTSItemInfo
  216. {
  217. BOOL fNegative;
  218. TTSIntegerItemInfo* pIntegerPart;
  219. TTSDigitsItemInfo* pDecimalPart;
  220. TTSFractionItemInfo* pFractionalPart;
  221. //--- Normalization internal only
  222. const WCHAR* pStartChar;
  223. const WCHAR* pEndChar;
  224. CWordList* pWordList;
  225. };
  226. struct TTSPhoneNumberItemInfo : TTSItemInfo
  227. {
  228. //--- Country code members
  229. TTSNumberItemInfo* pCountryCode;
  230. //--- Area code members
  231. TTSDigitsItemInfo* pAreaCode;
  232. BOOL fIs800;
  233. BOOL fOne;
  234. //--- Main number members
  235. TTSDigitsItemInfo** ppGroups;
  236. ULONG ulNumGroups;
  237. };
  238. struct TTSZipCodeItemInfo : TTSItemInfo
  239. {
  240. TTSDigitsItemInfo* pFirstFive;
  241. TTSDigitsItemInfo* pLastFour;
  242. };
  243. struct TTSStateAndZipCodeItemInfo : TTSItemInfo
  244. {
  245. TTSZipCodeItemInfo* pZipCode;
  246. };
  247. struct TTSCurrencyItemInfo : TTSItemInfo
  248. {
  249. TTSNumberItemInfo* pPrimaryNumberPart;
  250. TTSNumberItemInfo* pSecondaryNumberPart;
  251. BOOL fQuantifier;
  252. long lNumPostNumberStates;
  253. long lNumPostSymbolStates;
  254. };
  255. struct TTSYearItemInfo : TTSItemInfo
  256. {
  257. const WCHAR* pYear;
  258. ULONG ulNumDigits;
  259. };
  260. struct TTSRomanNumeralItemInfo : TTSItemInfo
  261. {
  262. TTSItemInfo* pNumberInfo;
  263. };
  264. struct TTSDecadeItemInfo : TTSItemInfo
  265. {
  266. const WCHAR* pCentury;
  267. ULONG ulDecade;
  268. };
  269. struct TTSDateItemInfo : TTSItemInfo
  270. {
  271. ULONG ulDayIndex;
  272. ULONG ulMonthIndex;
  273. TTSIntegerItemInfo* pDay;
  274. TTSYearItemInfo* pYear;
  275. };
  276. typedef enum
  277. {
  278. AM,
  279. PM,
  280. UNDEFINED
  281. } TIMEABBREVIATION;
  282. struct TTSTimeOfDayItemInfo : TTSItemInfo
  283. {
  284. BOOL fTimeAbbreviation;
  285. BOOL fTwentyFourHour;
  286. BOOL fMinutes;
  287. };
  288. struct TTSTimeItemInfo : TTSItemInfo
  289. {
  290. TTSNumberItemInfo* pHours;
  291. TTSNumberItemInfo* pMinutes;
  292. const WCHAR* pSeconds;
  293. };
  294. struct TTSHyphenatedStringInfo : TTSItemInfo
  295. {
  296. TTSItemInfo* pFirstChunkInfo;
  297. TTSItemInfo* pSecondChunkInfo;
  298. const WCHAR* pFirstChunk;
  299. const WCHAR* pSecondChunk;
  300. };
  301. struct TTSSuffixItemInfo : TTSItemInfo
  302. {
  303. const WCHAR* pFirstChar;
  304. ULONG ulNumChars;
  305. };
  306. struct TTSNumberRangeItemInfo : TTSItemInfo
  307. {
  308. TTSItemInfo *pFirstNumberInfo;
  309. TTSItemInfo *pSecondNumberInfo;
  310. };
  311. struct TTSTimeRangeItemInfo : TTSItemInfo
  312. {
  313. TTSTimeOfDayItemInfo *pFirstTimeInfo;
  314. TTSTimeOfDayItemInfo *pSecondTimeInfo;
  315. };
  316. struct AbbrevRecord
  317. {
  318. const WCHAR* pOrth;
  319. WCHAR* pPron1;
  320. ENGPARTOFSPEECH POS1;
  321. WCHAR* pPron2;
  322. ENGPARTOFSPEECH POS2;
  323. WCHAR* pPron3;
  324. ENGPARTOFSPEECH POS3;
  325. int iSentBreakDisambig;
  326. int iPronDisambig;
  327. };
  328. struct TTSAbbreviationInfo : TTSItemInfo
  329. {
  330. const AbbrevRecord* pAbbreviation;
  331. };
  332. //--- Structs used for Lex Lookup
  333. typedef enum { PRON_A = 0, PRON_B = 1 };
  334. struct PRONUNIT
  335. {
  336. ULONG phon_Len;
  337. WCHAR phon_Str[SP_MAX_PRON_LENGTH]; // Allo string
  338. ULONG POScount;
  339. ENGPARTOFSPEECH POScode[POS_MAX];
  340. };
  341. struct PRONRECORD
  342. {
  343. WCHAR orthStr[SP_MAX_WORD_LENGTH]; // Orth text
  344. WCHAR lemmaStr[SP_MAX_WORD_LENGTH]; // Root word
  345. ULONG pronType; // Pronunciation is lex or LTS
  346. PRONUNIT pronArray[2];
  347. ENGPARTOFSPEECH POSchoice;
  348. ENGPARTOFSPEECH XMLPartOfSpeech;
  349. bool hasAlt;
  350. ULONG altChoice;
  351. BOOL fUsePron;
  352. WCHAR CustomLtsToken[SP_MAX_WORD_LENGTH];
  353. };
  354. //--- Miscellaneous structs and typedefs
  355. struct SentencePointer
  356. {
  357. const WCHAR *pSentenceStart;
  358. const SPVTEXTFRAG *pSentenceFrag;
  359. };
  360. //=== Function Definitions ===========================================
  361. // Misc Number Normalization functions and helpers
  362. int MatchCurrencySign( const WCHAR*& pStartChar, const WCHAR*& pEndChar, NORM_POSITION& ePosition );
  363. //=== Classes
  364. /*** CSentenceStack *************************************************
  365. * This class is used to maintain a stack of sentences for the Skip
  366. * call to utilize.
  367. */
  368. class CSentenceStack
  369. {
  370. public:
  371. /*--- Methods ---*/
  372. CSentenceStack() { m_StackPtr = -1; }
  373. int GetCount( void ) { return m_StackPtr + 1; }
  374. virtual SentencePointer& Pop( void ) { SPDBG_ASSERT( m_StackPtr > -1 ); return m_Stack[m_StackPtr--]; }
  375. virtual HRESULT Push( const SentencePointer& val ) { ++m_StackPtr; return m_Stack.SetAtGrow( m_StackPtr, val ); }
  376. virtual void Reset( void ) { m_StackPtr = -1; }
  377. protected:
  378. /*--- Member data ---*/
  379. CSPArray<SentencePointer,SentencePointer> m_Stack;
  380. int m_StackPtr;
  381. };
  382. /*** CSentItem
  383. * This object is a helper class
  384. */
  385. class CSentItem : public TTSSentItem
  386. {
  387. public:
  388. CSentItem() { memset( this, 0, sizeof(*this) ); }
  389. CSentItem( TTSSentItem& Other ) { memcpy( this, &Other, sizeof( Other ) ); }
  390. };
  391. /*** CSentItemEnum
  392. * This object is designed to be used by a single thread.
  393. */
  394. class ATL_NO_VTABLE CSentItemEnum :
  395. public CComObjectRootEx<CComMultiThreadModel>,
  396. public IEnumSENTITEM
  397. {
  398. /*=== ATL Setup ===*/
  399. public:
  400. DECLARE_PROTECT_FINAL_CONSTRUCT()
  401. BEGIN_COM_MAP(CSentItemEnum)
  402. COM_INTERFACE_ENTRY(IEnumSENTITEM)
  403. END_COM_MAP()
  404. /*=== Methods =======*/
  405. public:
  406. /*--- Constructors/Destructors ---*/
  407. /*--- Non interface methods ---*/
  408. void _SetOwner( IUnknown* pOwner ) { m_cpOwner = pOwner; }
  409. CItemList& _GetList( void ) { return m_ItemList; }
  410. CSentItemMemory& _GetMemoryManager( void ) { return m_MemoryManager; }
  411. /*=== Interfaces ====*/
  412. public:
  413. //--- IEnumSpSentence ----------------------------------------
  414. STDMETHOD(Next)( TTSSentItem *pItemEnum );
  415. STDMETHOD(Reset)( void );
  416. /*=== Member data ===*/
  417. private:
  418. CComPtr<IUnknown> m_cpOwner;
  419. CItemList m_ItemList;
  420. SPLISTPOS m_ListPos;
  421. CSentItemMemory m_MemoryManager;
  422. };
  423. /*** CStdSentEnum COM object
  424. */
  425. class ATL_NO_VTABLE CStdSentEnum :
  426. public CComObjectRootEx<CComMultiThreadModel>,
  427. public IEnumSpSentence
  428. {
  429. /*=== ATL Setup ===*/
  430. public:
  431. DECLARE_GET_CONTROLLING_UNKNOWN()
  432. DECLARE_PROTECT_FINAL_CONSTRUCT()
  433. BEGIN_COM_MAP(CStdSentEnum)
  434. COM_INTERFACE_ENTRY(IEnumSpSentence)
  435. END_COM_MAP()
  436. /*=== Methods =======*/
  437. public:
  438. /*--- Constructors/Destructors ---*/
  439. HRESULT FinalConstruct();
  440. void FinalRelease();
  441. /*--- Non interface methods ---*/
  442. HRESULT InitAggregateLexicon( void );
  443. HRESULT AddLexiconToAggregate( ISpLexicon *pAddLexicon, DWORD dwFlags );
  444. HRESULT InitMorphLexicon( void );
  445. void fNamesLTS( bool );
  446. //--- Abbreviation Sentence Breaking Disambiguation Functions
  447. HRESULT IsAbbreviationEOS( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos,
  448. CSentItemMemory& MemoryManager, BOOL* pfIsEOS );
  449. HRESULT IfEOSNotAbbreviation( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos,
  450. CSentItemMemory& MemoryManager, BOOL* pfIsEOS );
  451. HRESULT IfEOSAndLowercaseNotAbbreviation( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos,
  452. CSentItemMemory& MemoryManager, BOOL* pfIsEOS );
  453. //--- Abbreviation Pronunciation Disambiguation Functions
  454. HRESULT SingleOrPluralAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  455. CItemList& ItemList, SPLISTPOS ListPos );
  456. HRESULT DoctorDriveAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  457. CItemList& ItemList, SPLISTPOS ListPos );
  458. HRESULT AbbreviationFollowedByDigit( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  459. CItemList& ItemList, SPLISTPOS ListPos );
  460. HRESULT AllCapsAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  461. CItemList& ItemList, SPLISTPOS ListPos );
  462. HRESULT CapitalizedAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  463. CItemList& ItemList, SPLISTPOS ListPos );
  464. HRESULT SECAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  465. CItemList& ItemList, SPLISTPOS ListPos );
  466. HRESULT DegreeAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  467. CItemList& ItemList, SPLISTPOS ListPos );
  468. HRESULT AbbreviationModifier( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  469. CItemList& ItemList, SPLISTPOS ListPos );
  470. HRESULT ADisambig( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  471. CItemList& ItemList, SPLISTPOS ListPos );
  472. HRESULT PolishDisambig( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  473. CItemList& ItemList, SPLISTPOS ListPos );
  474. //--- Word Pronunciation Disambiguation Functions
  475. HRESULT MeasurementDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList,
  476. SPLISTPOS ListPos, CSentItemMemory& MemoryManager );
  477. HRESULT TheDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList,
  478. SPLISTPOS ListPos, CSentItemMemory& MemoryManager );
  479. HRESULT ReadDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList,
  480. SPLISTPOS ListPos, CSentItemMemory& MemoryManager );
  481. private:
  482. //--- Pronunciation Table init helper
  483. HRESULT InitPron( WCHAR** OriginalPron );
  484. //--- Sentence breaking helpers ---//
  485. HRESULT GetNextSentence( IEnumSENTITEM** pItemEnum );
  486. HRESULT AddNextSentItem( CItemList& ItemList, CSentItemMemory& MemoryManager, BOOL* pfIsEOS );
  487. HRESULT SkipWhiteSpaceAndTags( const WCHAR*& pStartChar, const WCHAR*& pEndChar,
  488. const SPVTEXTFRAG*& pCurrFrag, CSentItemMemory& pMemoryManager,
  489. BOOL fAddToItemList = false, CItemList* pItemList = NULL );
  490. const WCHAR* FindTokenEnd( const WCHAR* pStartChar, const WCHAR* pEndChar );
  491. //--- Lexicon and POS helpers ---//
  492. HRESULT DetermineProns( CItemList& ItemList, CSentItemMemory& MemoryManager );
  493. HRESULT Pronounce( PRONRECORD *pPron );
  494. //--- Normalization helpers ---//
  495. HRESULT Normalize( CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager );
  496. HRESULT MatchCategory( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  497. HRESULT ExpandCategory( TTSItemInfo*& pItemNormInfo, CItemList& ItemList, SPLISTPOS ListPos,
  498. CSentItemMemory& MemoryManager );
  499. HRESULT DoUnicodeToAsciiMap( const WCHAR *pUnicodeString, ULONG ulUnicodeStringLength,
  500. WCHAR *ppConvertedString );
  501. HRESULT IsAlphaWord( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo,
  502. CSentItemMemory& MemoryManager );
  503. HRESULT IsInitialism( CItemList& ItemList, SPLISTPOS ItemPos, CSentItemMemory& MemoryManager,
  504. BOOL* pfIsEOS );
  505. //--- Various Number Related Normalization helpers ---//
  506. HRESULT IsNumberCategory( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager );
  507. HRESULT IsNumber( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager,
  508. BOOL fMultiItem = true );
  509. HRESULT IsInteger( const WCHAR* pStartChar, TTSIntegerItemInfo*& pIntegerInfo,
  510. CSentItemMemory& MemoryManager );
  511. HRESULT IsDigitString( const WCHAR* pStartChar, TTSDigitsItemInfo*& pDigitsInfo,
  512. CSentItemMemory& MemoryManager );
  513. HRESULT ExpandNumber( TTSNumberItemInfo* pItemInfo, CWordList& WordList );
  514. HRESULT ExpandPercent( TTSNumberItemInfo* pItemInfo, CWordList& WordList );
  515. HRESULT ExpandDegrees( TTSNumberItemInfo* pItemInfo, CWordList& WordList );
  516. HRESULT ExpandSquare( TTSNumberItemInfo* pItemInfo, CWordList& WordList );
  517. HRESULT ExpandCube( TTSNumberItemInfo* pItemInfo, CWordList& WordList );
  518. void ExpandInteger( TTSIntegerItemInfo* pItemInfo, const WCHAR* Context, CWordList &WordList );
  519. void ExpandDigit( const WCHAR Number, NumberGroup& NormGroupInfo, CWordList& WordList );
  520. void ExpandTwoDigits( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList );
  521. void ExpandThreeDigits( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList );
  522. void ExpandDigitOrdinal( const WCHAR Number, NumberGroup& NormGroupInfo, CWordList& WordList );
  523. void ExpandTwoOrdinal( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList );
  524. void ExpandThreeOrdinal( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList );
  525. void ExpandDigits( TTSDigitsItemInfo* pItemInfo, CWordList& WordList );
  526. HRESULT IsFraction( const WCHAR* pStartChar, TTSFractionItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager );
  527. HRESULT ExpandFraction( TTSFractionItemInfo* pItemInfo, CWordList& WordList );
  528. HRESULT IsRomanNumeral( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager );
  529. HRESULT IsPhoneNumber( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager, CWordList& WordList );
  530. HRESULT IsZipCode( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager );
  531. HRESULT ExpandZipCode( TTSZipCodeItemInfo* pItemInfo, CWordList& WordList );
  532. HRESULT IsCurrency( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager,
  533. CWordList& WordList );
  534. HRESULT IsNumberRange( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager );
  535. HRESULT ExpandNumberRange( TTSNumberRangeItemInfo* pItemInfo, CWordList& WordList );
  536. HRESULT IsCurrencyRange( TTSItemInfo*& pItemInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  537. //--- Date Related Normalization helpers ---//
  538. HRESULT IsNumericCompactDate( TTSItemInfo*& pItemNormInfo, const WCHAR* Context,
  539. CSentItemMemory& MemoryManager );
  540. HRESULT IsMonthStringCompactDate( TTSItemInfo*& pItemNormInfo, const WCHAR* Context,
  541. CSentItemMemory& MemoryManager );
  542. HRESULT IsLongFormDate_DMDY( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  543. HRESULT IsLongFormDate_DDMY( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  544. HRESULT ExpandDate( TTSDateItemInfo* pItemInfo, CWordList& WordList );
  545. HRESULT ExpandYear( TTSYearItemInfo* pItemInfo, CWordList& WordList );
  546. HRESULT IsDecade( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager );
  547. HRESULT ExpandDecade( TTSDecadeItemInfo* pItemInfo, CWordList& WordList );
  548. ULONG MatchMonthString( WCHAR*& pMonth, ULONG ulLength );
  549. ULONG MatchDayString( WCHAR*& pDayString, WCHAR* pEndChar );
  550. bool MatchDateDelimiter( WCHAR **DateString );
  551. //--- Time Related Normalization helpers ---//
  552. HRESULT IsTimeOfDay( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList, BOOL fMultiItem = true );
  553. HRESULT IsTime( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager );
  554. HRESULT ExpandTime( TTSTimeItemInfo* pItemInfo, CWordList& WordList );
  555. HRESULT IsTimeRange( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  556. //--- SPELL tag normalization helper
  557. HRESULT SpellOutString( CWordList& WordList );
  558. void ExpandPunctuation( CWordList& WordList, WCHAR wc );
  559. //--- Default normalization helper
  560. HRESULT ExpandUnrecognizedString( CWordList& WordList, CSentItemMemory& MemoryManager );
  561. //--- Misc. normalization helpers
  562. HRESULT IsStateAndZipcode( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  563. HRESULT IsHyphenatedString( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo,
  564. CSentItemMemory& MemoryManager );
  565. HRESULT ExpandHyphenatedString( TTSHyphenatedStringInfo* pItemInfo, CWordList& WordList );
  566. HRESULT IsSuffix( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo,
  567. CSentItemMemory& MemoryManager );
  568. HRESULT ExpandSuffix( TTSSuffixItemInfo* pItemInfo, CWordList& WordList );
  569. bool Zeroes( const WCHAR* );
  570. bool ThreeZeroes( const WCHAR* );
  571. bool IsPunctuation(const TTSSentItem *Item);
  572. /*=== Interfaces ====*/
  573. public:
  574. //--- IEnumSpSentence ----------------------------------------
  575. STDMETHOD(SetFragList)( const SPVTEXTFRAG* pTextFragList, DWORD dwFlags );
  576. STDMETHOD(Next)( IEnumSENTITEM **ppSentItemEnum );
  577. STDMETHOD(Previous)( IEnumSENTITEM **ppSentItemEnum );
  578. STDMETHOD(Reset)( void );
  579. //=== Data members ===
  580. private:
  581. CComPtr<ISpContainerLexicon> m_cpAggregateLexicon;
  582. CComPtr<ISpPhoneConverter> m_cpPhonemeConverter;
  583. CSMorph* m_pMorphLexicon;
  584. DWORD m_dwSpeakFlags;
  585. const SPVTEXTFRAG* m_pTextFragList;
  586. const SPVTEXTFRAG* m_pCurrFrag;
  587. const WCHAR* m_pNextChar;
  588. const WCHAR* m_pEndChar;
  589. const WCHAR* m_pEndOfCurrToken;
  590. const WCHAR* m_pEndOfCurrItem;
  591. CSentenceStack m_SentenceStack;
  592. SEPARATOR_AND_DECIMAL m_eSeparatorAndDecimal;
  593. SHORT_DATE_ORDER m_eShortDateOrder;
  594. static CComAutoCriticalSection m_AbbrevTableCritSec;
  595. bool m_fNameItem;
  596. bool m_fHaveNamesLTS;
  597. };
  598. //--- Structs and typedefs used for abbreviation stuff
  599. typedef HRESULT (CStdSentEnum::* SentBreakDisambigFunc)(const AbbrevRecord*, CItemList& , SPLISTPOS,
  600. CSentItemMemory&, BOOL*);
  601. typedef HRESULT (CStdSentEnum::* PronDisambigFunc) ( const AbbrevRecord*, PRONRECORD*, CItemList&, SPLISTPOS );
  602. typedef HRESULT (CStdSentEnum::* PostLexLookupDisambigFunc) ( const AbbrevRecord*, CItemList&, SPLISTPOS, CSentItemMemory& );
  603. extern AbbrevRecord g_AbbreviationTable[177];
  604. extern const PronDisambigFunc g_PronDisambigTable[];
  605. extern const SentBreakDisambigFunc g_SentBreakDisambigTable[];
  606. extern AbbrevRecord g_AmbiguousWordTable[72];
  607. extern const PronDisambigFunc g_AmbiguousWordDisambigTable[];
  608. extern AbbrevRecord g_PostLexLookupWordTable[41];
  609. extern const PostLexLookupDisambigFunc g_PostLexLookupDisambigTable[];
  610. extern WCHAR *g_pOfA;
  611. extern WCHAR *g_pOfAn;
  612. extern BOOL g_fAbbrevTablesInitialized;
  613. extern void CleanupAbbrevTables( void );
  614. //--- First words table - used in sentence breaking
  615. extern const SPLSTR g_FirstWords[163];
  616. //
  617. //=== Inlines
  618. //
  619. inline ULONG my_wcstoul( const WCHAR *pStartChar, WCHAR **ppEndChar )
  620. {
  621. if ( iswdigit( *pStartChar ) )
  622. {
  623. return wcstoul( pStartChar, ppEndChar, 10 );
  624. }
  625. else
  626. {
  627. if ( ppEndChar )
  628. {
  629. *ppEndChar = (WCHAR*) pStartChar;
  630. }
  631. return 0;
  632. }
  633. }
  634. inline ENGPARTOFSPEECH ConvertItemTypeToPartOfSp( TTSItemType ItemType )
  635. {
  636. switch ( ItemType )
  637. {
  638. case eOPEN_PARENTHESIS:
  639. case eOPEN_BRACKET:
  640. case eOPEN_BRACE:
  641. return MS_GroupBegin;
  642. case eCLOSE_PARENTHESIS:
  643. case eCLOSE_BRACKET:
  644. case eCLOSE_BRACE:
  645. return MS_GroupEnd;
  646. case eSINGLE_QUOTE:
  647. case eDOUBLE_QUOTE:
  648. return MS_Quotation;
  649. case ePERIOD:
  650. case eQUESTION:
  651. case eEXCLAMATION:
  652. return MS_EOSItem;
  653. case eCOMMA:
  654. case eCOLON:
  655. case eSEMICOLON:
  656. case eHYPHEN:
  657. case eELLIPSIS:
  658. return MS_MiscPunc;
  659. default:
  660. return MS_Unknown;
  661. }
  662. }
  663. inline bool MatchPhoneNumberDelimiter( const WCHAR wc )
  664. {
  665. return ( wc == L' ' || wc == L'-' || wc == L'.' );
  666. }
  667. inline bool NeedsToBeNormalized( const AbbrevRecord* pAbbreviation )
  668. {
  669. if( !wcscmp( pAbbreviation->pOrth, L"jan" ) ||
  670. !wcscmp( pAbbreviation->pOrth, L"feb" ) ||
  671. !wcscmp( pAbbreviation->pOrth, L"mar" ) ||
  672. !wcscmp( pAbbreviation->pOrth, L"apr" ) ||
  673. !wcscmp( pAbbreviation->pOrth, L"jun" ) ||
  674. !wcscmp( pAbbreviation->pOrth, L"jul" ) ||
  675. !wcscmp( pAbbreviation->pOrth, L"aug" ) ||
  676. !wcscmp( pAbbreviation->pOrth, L"sep" ) ||
  677. !wcscmp( pAbbreviation->pOrth, L"sept" ) ||
  678. !wcscmp( pAbbreviation->pOrth, L"oct" ) ||
  679. !wcscmp( pAbbreviation->pOrth, L"nov" ) ||
  680. !wcscmp( pAbbreviation->pOrth, L"dec" ) ||
  681. !wcscmp( pAbbreviation->pOrth, L"mon" ) ||
  682. !wcscmp( pAbbreviation->pOrth, L"tue" ) ||
  683. !wcscmp( pAbbreviation->pOrth, L"tues" ) ||
  684. !wcscmp( pAbbreviation->pOrth, L"wed" ) ||
  685. !wcscmp( pAbbreviation->pOrth, L"thu" ) ||
  686. !wcscmp( pAbbreviation->pOrth, L"thur" ) ||
  687. !wcscmp( pAbbreviation->pOrth, L"thurs" ) ||
  688. !wcscmp( pAbbreviation->pOrth, L"fri" ) ||
  689. !wcscmp( pAbbreviation->pOrth, L"sat" ) ||
  690. !wcscmp( pAbbreviation->pOrth, L"sun" ) )
  691. {
  692. return true;
  693. }
  694. else
  695. {
  696. return false;
  697. }
  698. }
  699. inline HRESULT SetWordList( CSentItem& Item, CWordList& WordList, CSentItemMemory& MemoryManager )
  700. {
  701. HRESULT hr = S_OK;
  702. SPLISTPOS WordListPos = WordList.GetHeadPosition();
  703. Item.ulNumWords = WordList.GetCount();
  704. Item.Words = (TTSWord*) MemoryManager.GetMemory( Item.ulNumWords * sizeof(TTSWord), &hr );
  705. if ( SUCCEEDED( hr ) )
  706. {
  707. ULONG ulIndex = 0;
  708. while ( WordListPos )
  709. {
  710. SPDBG_ASSERT( ulIndex < Item.ulNumWords );
  711. Item.Words[ulIndex++] = WordList.GetNext( WordListPos );
  712. }
  713. }
  714. return hr;
  715. }
  716. inline int CompareStringAndSPLSTR( const void* _String, const void* _SPLSTR )
  717. {
  718. int _StringLen = wcslen( (const WCHAR*) _String );
  719. int _SPLSTRLen = ( (const SPLSTR*) _SPLSTR )->Len;
  720. if ( _StringLen < _SPLSTRLen )
  721. {
  722. int Result = wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _StringLen );
  723. if ( Result != 0 )
  724. {
  725. return Result;
  726. }
  727. else
  728. {
  729. return -1;
  730. }
  731. }
  732. else if ( _StringLen > _SPLSTRLen )
  733. {
  734. int Result = wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _SPLSTRLen );
  735. if ( Result != 0 )
  736. {
  737. return Result;
  738. }
  739. else
  740. {
  741. return 1;
  742. }
  743. }
  744. else
  745. {
  746. return ( wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _StringLen ) );
  747. }
  748. }
  749. inline int CompareStringAndStateStruct( const void* _String, const void* _StateStruct )
  750. {
  751. int _StringLen = wcslen( (const WCHAR*) _String );
  752. int _StateStructLen = ( (const StateStruct*) _StateStruct )->Abbreviation.Len;
  753. if ( _StringLen < _StateStructLen )
  754. {
  755. int Result = wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr,
  756. _StringLen );
  757. if ( Result != 0 )
  758. {
  759. return Result;
  760. }
  761. else
  762. {
  763. return -1;
  764. }
  765. }
  766. else if ( _StringLen > _StateStructLen )
  767. {
  768. int Result = wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr,
  769. _StateStructLen );
  770. if ( Result != 0 )
  771. {
  772. return Result;
  773. }
  774. else
  775. {
  776. return 1;
  777. }
  778. }
  779. else
  780. {
  781. return ( wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr,
  782. _StringLen ) );
  783. }
  784. }
  785. inline int CompareStringAndAbbrevRecord( const void* _String, const void* _AbbrevRecord )
  786. {
  787. return ( _wcsicmp( (const WCHAR*) _String, ( (const AbbrevRecord*) _AbbrevRecord )->pOrth ) );
  788. }
  789. inline int CompareWCHARAndWCHAR( const void *pWCHAR_1, const void *pWCHAR_2 )
  790. {
  791. return ( *( (WCHAR*) pWCHAR_1) - *( (WCHAR*) pWCHAR_2) );
  792. }
  793. inline BOOL IsSpace( WCHAR wc )
  794. {
  795. return ( ( wc == 0x20 ) || ( wc == 0x9 ) || ( wc == 0xD ) ||
  796. ( wc == 0xA ) || ( wc == 0x200B ) );
  797. }
  798. inline BOOL IsCapital( WCHAR wc )
  799. {
  800. return ( ( wc >= L'A' ) && ( wc <= L'Z' ) );
  801. }
  802. inline TTSItemType IsGroupBeginning( WCHAR wc )
  803. {
  804. if ( wc == L'(' )
  805. {
  806. return eOPEN_PARENTHESIS;
  807. }
  808. else if ( wc == L'[' )
  809. {
  810. return eOPEN_BRACKET;
  811. }
  812. else if ( wc == L'{' )
  813. {
  814. return eOPEN_BRACE;
  815. }
  816. else
  817. {
  818. return eUNMATCHED;
  819. }
  820. }
  821. inline TTSItemType IsGroupEnding( WCHAR wc )
  822. {
  823. if ( wc == L')' )
  824. {
  825. return eCLOSE_PARENTHESIS;
  826. }
  827. else if ( wc == L']' )
  828. {
  829. return eCLOSE_BRACKET;
  830. }
  831. else if ( wc == L'}' )
  832. {
  833. return eCLOSE_BRACE;
  834. }
  835. else
  836. {
  837. return eUNMATCHED;
  838. }
  839. }
  840. inline TTSItemType IsQuotationMark( WCHAR wc )
  841. {
  842. if ( wc == L'\'' )
  843. {
  844. return eSINGLE_QUOTE;
  845. }
  846. else if ( wc == L'\"' )
  847. {
  848. return eDOUBLE_QUOTE;
  849. }
  850. else
  851. {
  852. return eUNMATCHED;
  853. }
  854. }
  855. inline TTSItemType IsEOSItem( WCHAR wc )
  856. {
  857. if ( wc == L'.' )
  858. {
  859. return ePERIOD;
  860. }
  861. else if ( wc == L'!' )
  862. {
  863. return eEXCLAMATION;
  864. }
  865. else if ( wc == L'?' )
  866. {
  867. return eQUESTION;
  868. }
  869. else
  870. {
  871. return eUNMATCHED;
  872. }
  873. }
  874. inline TTSItemType IsMiscPunctuation( WCHAR wc )
  875. {
  876. if ( wc == L',' )
  877. {
  878. return eCOMMA;
  879. }
  880. else if ( wc == L';' )
  881. {
  882. return eSEMICOLON;
  883. }
  884. else if ( wc == L':' )
  885. {
  886. return eCOLON;
  887. }
  888. else if ( wc == L'-' )
  889. {
  890. return eHYPHEN;
  891. }
  892. else
  893. {
  894. return eUNMATCHED;
  895. }
  896. }
  897. #endif //--- This must be the last line in the file