Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1008 lines
32 KiB

  1. /******************************************************************************
  2. * StdSentEnum.h *
  3. *---------------*
  4. * This is the header file for the CStdSentEnum implementation.
  5. *------------------------------------------------------------------------------
  6. * Copyright (C) 1999 Microsoft Corporation Date: 03/01/99
  7. * All Rights Reserved
  8. *
  9. *********************************************************************** EDC ***/
  10. #ifndef StdSentEnum_h
  11. #define StdSentEnum_h
  12. //--- Additional includes
  13. #include "stdafx.h"
  14. #include "spttseng.h"
  15. #include "resource.h"
  16. #include "SentItemMemory.h"
  17. #include "morph.h"
  18. //=== Constants ====================================================
  //--- Number punctuation convention, encoded as distinct single-bit values.
  //    CStdSentEnum stores one of these in m_eSeparatorAndDecimal.
  //    NOTE(review): the names suggest "<group separator>_<decimal point>" order - confirm
  //    against the normalization code before relying on it.
  19. typedef enum SEPARATOR_AND_DECIMAL
  20. {
  21. PERIOD_COMMA = (1L << 0),
  22. COMMA_PERIOD = (1L << 1)
  23. } SEPARATOR_AND_DECIMAL;
  //--- Field order for short (numeric) dates, encoded as distinct single-bit values.
  //    CStdSentEnum stores one of these in m_eShortDateOrder.
  24. typedef enum SHORT_DATE_ORDER
  25. {
  26. MONTH_DAY_YEAR = (1L << 0),
  27. DAY_MONTH_YEAR = (1L << 1),
  28. YEAR_MONTH_DAY = (1L << 2)
  29. } SHORT_DATE_ORDER;
  30. //--- Vowel WCHAR values - used to disambiguate pronunciations of certain words
  //--- Table of 16 vowel phoneme codes; the trailing comment on each entry names the phoneme.
  //    The values are listed in strictly ascending order.
  //    NOTE(review): presumably kept sorted so the table can be binary-searched (e.g. with
  //    CompareWCHARAndWCHAR) - confirm at the call site before reordering.
  31. const WCHAR g_Vowels[] =
  32. {
  33. 0x0a, // AA
  34. 0x0b, // AE
  35. 0x0c, // AH
  36. 0x0d, // AO
  37. 0x0e, // AW
  38. 0x0f, // AX
  39. 0x10, // AY
  40. 0x15, // EH
  41. 0x16, // ER
  42. 0x17, // EY
  43. 0x1b, // IH
  44. 0x1c, // IY
  45. 0x23, // OW
  46. 0x24, // OY
  47. 0x2a, // UH
  48. 0x2b, // UW
  49. };
  50. //--- Normalization constants - see NormData.cpp
  //--- Declarations only: the definitions live in another translation unit (the comment
  //    above this group points at NormData.cpp). Array bounds declared here must match
  //    the definitions exactly.
  51. extern const char g_pFlagCharacter;
  //--- 256-entry lookup table indexed by an 8-bit character value
  52. extern const unsigned char g_AnsiToAscii[256];
  //--- Individual words used when expanding numbers
  53. extern const SPLSTR g_O;
  54. extern const SPLSTR g_negative;
  55. extern const SPLSTR g_decimalpoint;
  56. extern const SPLSTR g_to;
  57. extern const SPLSTR g_a;
  58. extern const SPLSTR g_of;
  59. extern const SPLSTR g_percent;
  60. extern const SPLSTR g_degree;
  61. extern const SPLSTR g_degrees;
  62. extern const SPLSTR g_squared;
  63. extern const SPLSTR g_cubed;
  //--- Ten-entry word tables (cardinal, then ordinal forms)
  64. extern const SPLSTR g_ones[10];
  65. extern const SPLSTR g_tens[10];
  66. extern const SPLSTR g_teens[10];
  67. extern const SPLSTR g_onesOrdinal[10];
  68. extern const SPLSTR g_tensOrdinal[10];
  69. extern const SPLSTR g_teensOrdinal[10];
  //--- Six entries each; TTSIntegerItemInfo::Groups also has six entries
  70. extern const SPLSTR g_quantifiers[6];
  71. extern const SPLSTR g_quantifiersOrdinal[6];
  72. extern const SPLSTR g_dash;
  //--- Note: the only non-const object in this group
  73. extern WCHAR g_Euro[2];
  //--- One record of the g_CurrencySigns table: a currency sign plus two unit strings.
  //    NOTE(review): MainUnit / SecondaryUnit presumably hold names such as
  //    "dollars" / "cents" - confirm against the table definition.
  74. struct CurrencySign
  75. {
  76. SPLSTR Sign;
  77. SPLSTR MainUnit;
  78. SPLSTR SecondaryUnit;
  79. };
  //--- One record of the g_StateAbbreviations table: an abbreviation paired with its full
  //    name. CompareStringAndStateStruct compares a string against the Abbreviation member.
  80. struct StateStruct
  81. {
  82. SPLSTR Abbreviation;
  83. SPLSTR FullName;
  84. };
  //--- More normalization tables and words, declared here and defined elsewhere.
  //    Array bounds declared here must match the definitions exactly.
  85. extern const StateStruct g_StateAbbreviations[63];
  //--- The three currency tables share the same bound (14)
  86. extern const CurrencySign g_CurrencySigns[14];
  87. extern const SPLSTR g_SingularPrimaryCurrencySigns[14];
  88. extern const SPLSTR g_SingularSecondaryCurrencySigns[14];
  89. extern const WCHAR g_DateDelimiters[3];
  //--- Note: the abbreviation tables are larger than the full-name tables (13 vs 12
  //    months, 10 vs 7 days), so an index into one is not an index into the other.
  90. extern const SPLSTR g_months[12];
  91. extern const SPLSTR g_monthAbbreviations[13];
  92. extern const SPLSTR g_days[7];
  93. extern const SPLSTR g_dayAbbreviations[10];
  94. extern const SPLSTR g_Area;
  95. extern const SPLSTR g_Country;
  96. extern const SPLSTR g_Code;
  97. extern const SPLSTR g_Half;
  98. extern const SPLSTR g_Tenths;
  99. extern const SPLSTR g_Sixteenths;
  100. extern const SPLSTR g_Hundredths;
  101. extern const SPLSTR g_Over;
  102. extern const SPLSTR g_PluralDenominators[10];
  103. extern const SPLSTR g_A;
  104. extern const SPLSTR g_M;
  105. extern const SPLSTR g_P;
  106. extern const SPLSTR g_OClock;
  107. extern const SPLSTR g_hundred;
  108. extern const SPLSTR g_hour;
  109. extern const SPLSTR g_hours;
  110. extern const SPLSTR g_minute;
  111. extern const SPLSTR g_minutes;
  112. extern const SPLSTR g_second;
  113. extern const SPLSTR g_seconds;
  //--- 256-entry table, one slot per 8-bit character value
  114. extern const SPLSTR g_ANSICharacterProns[256];
  115. extern const SPVSTATE g_DefaultXMLState;
  116. extern const SPLSTR g_And;
  117. extern const SPLSTR g_comma;
  118. extern const SPLSTR g_period;
  119. extern const SPLSTR g_periodString;
  120. extern const SPLSTR g_slash;
  //--- Declared with an unknown bound: sizeof cannot be applied to g_Decades from this header
  121. extern const SPLSTR g_Decades[];
  122. extern const SPLSTR g_Zeroes;
  123. extern const SPLSTR g_Hundreds;
  //--- Bounds for the numeric fields of dates and times.
  //    NOTE(review): the MIN/MAX naming suggests inclusive ranges - confirm at the uses.
  124. #define DAYMAX 31
  125. #define DAYMIN 1
  126. #define MONTHMAX 12
  127. #define MONTHMIN 1
  128. #define YEARMAX 9999
  129. #define YEARMIN 0
  //--- NOTE(review): HOURMIN is 1 while MINUTEMIN and SECONDMIN are 0, so an hour value
  //    of 0 (e.g. "00:30") falls outside [HOURMIN, HOURMAX] - confirm this is intended.
  130. #define HOURMIN 1
  131. #define HOURMAX 23
  132. #define MINUTEMIN 0
  133. #define MINUTEMAX 59
  134. #define SECONDMIN 0
  135. #define SECONDMAX 59
  136. //--- POS Tagger Constants - see MiscData.cpp
  //--- Kind of context test carried by a BrillPatch (see BrillPatch::eTemplateType).
  //    Enumerators have no explicit values, so they number 0..29 in declaration order;
  //    do not reorder them if anything depends on the numeric values.
  //    NOTE(review): naming appears to be PREV/NEXT/CURR = relative position, trailing
  //    T = tag (part of speech), trailing W = word, CAP = capitalization - confirm
  //    against the tagger implementation.
  137. typedef enum TEMPLATETYPE
  138. {
  139. PREV1T,
  140. NEXT1T,
  141. PREV2T,
  142. NEXT2T,
  143. PREV1OR2T,
  144. NEXT1OR2T,
  145. PREV1OR2OR3T,
  146. NEXT1OR2OR3T,
  147. PREV1TNEXT1T,
  148. PREV1TNEXT2T,
  149. PREV2TNEXT1T,
  150. NOTCAP,
  151. CAP,
  152. PREVNOTCAP,
  153. PREVCAP,
  154. PREV1W,
  155. NEXT1W,
  156. PREV2W,
  157. NEXT2W,
  158. PREV1OR2W,
  159. NEXT1OR2W,
  160. CURRWPREV1W,
  161. CURRWNEXT1W,
  162. CURRWPREV1T,
  163. CURRWNEXT1T,
  164. CURRW,
  165. PREV1WT,
  166. NEXT1WT,
  167. CURRWPREV1WT,
  168. CURRWNEXT1WT
  169. } TEMPLATETYPE;
  //--- One record of g_POSTaggerPatches: a part-of-speech pair, a template type, and up
  //    to two template tags and two template words.
  //    NOTE(review): presumably a Brill-style rule "retag eCurrentPOS as eConvertToPOS
  //    when the template matches" - confirm against the tagger implementation.
  170. struct BrillPatch
  171. {
  172. ENGPARTOFSPEECH eCurrentPOS;
  173. ENGPARTOFSPEECH eConvertToPOS;
  174. TEMPLATETYPE eTemplateType;
  175. ENGPARTOFSPEECH eTemplatePOS1;
  176. ENGPARTOFSPEECH eTemplatePOS2;
  177. const WCHAR* pTemplateWord1;
  178. const WCHAR* pTemplateWord2;
  179. };
  //--- Patch table, defined elsewhere; the bound (63) must match the definition
  180. extern const BrillPatch g_POSTaggerPatches [63];
  181. //=== Class, Enum, Struct and Union Declarations ===================
  //--- List types used throughout this header for words and sentence items
  182. typedef CSPList<TTSWord,TTSWord&> CWordList;
  183. typedef CSPList<TTSSentItem,TTSSentItem&> CItemList;
  184. //--- Structs used for normalization
  //--- Position result reported by MatchCurrencySign through its ePosition out-parameter.
  //    NOTE(review): presumably where the sign sits relative to the number - confirm.
  185. typedef enum
  186. {
  187. PRECEDING,
  188. FOLLOWING,
  189. UNATTACHED
  190. } NORM_POSITION;
  //--- Flags describing which spoken components one digit group needs; the per-field
  //    comments give the words each flag corresponds to.
  //    TTSIntegerItemInfo holds an array of six of these.
  191. struct NumberGroup
  192. {
  193. BOOL fOnes; // "one" through "nineteen"
  194. BOOL fTens; // "twenty" through "ninety"
  195. BOOL fHundreds; // "one hundred" through "nine hundred"
  196. BOOL fQuantifier; // "thousand" through "quadrillion"
  197. };
  //--- Description of an integer found in the text. Unlike most *ItemInfo structs in this
  //    header, this one does not derive from TTSItemInfo.
  198. struct TTSIntegerItemInfo
  199. {
  200. long lNumGroups;
  //--- Fixed capacity of six groups (g_quantifiers also has six entries)
  201. NumberGroup Groups[6];
  202. BOOL fOrdinal;
  203. BOOL fDigitByDigit;
  204. ULONG ulNumDigits;
  205. //--- Normalization internal only
  206. long lLeftOver;
  207. BOOL fSeparators;
  208. const WCHAR* pStartChar;
  209. const WCHAR* pEndChar;
  210. };
  //--- A run of digits described as a pointer plus a count. pFirstDigit is a const pointer
  //    and the struct owns no storage of its own.
  //    NOTE(review): presumably points into the source text - confirm.
  211. struct TTSDigitsItemInfo : TTSItemInfo
  212. {
  213. const WCHAR* pFirstDigit;
  214. ULONG ulNumDigits;
  215. };
  //--- Forward declaration: TTSFractionItemInfo and TTSNumberItemInfo refer to each other
  216. struct TTSNumberItemInfo;
  //--- A fraction expressed as numerator and denominator number records.
  //    Does not derive from TTSItemInfo.
  217. struct TTSFractionItemInfo
  218. {
  219. BOOL fIsStandard;
  220. TTSNumberItemInfo* pNumerator;
  221. TTSNumberItemInfo* pDenominator;
  222. //--- Normalization internal only
  //--- NOTE(review): presumably points at a single "vulgar fraction" character - confirm
  223. const WCHAR* pVulgar;
  224. };
  //--- A number made of an optional sign and up to three parts (integer, decimal,
  //    fractional), each held through a pointer.
  //    NOTE(review): a part that is absent is presumably NULL - confirm in IsNumber.
  225. struct TTSNumberItemInfo : TTSItemInfo
  226. {
  227. BOOL fNegative;
  228. TTSIntegerItemInfo* pIntegerPart;
  229. TTSDigitsItemInfo* pDecimalPart;
  230. TTSFractionItemInfo* pFractionalPart;
  231. //--- Normalization internal only
  232. const WCHAR* pStartChar;
  233. const WCHAR* pEndChar;
  234. CWordList* pWordList;
  235. };
  //--- A phone number split into country code, area code and main-number groups.
  236. struct TTSPhoneNumberItemInfo : TTSItemInfo
  237. {
  238. //--- Country code members
  239. TTSNumberItemInfo* pCountryCode;
  240. //--- Area code members
  241. TTSDigitsItemInfo* pAreaCode;
  242. BOOL fIs800;
  243. BOOL fOne;
  244. //--- Main number members
  //--- Array of pointers to digit groups; ulNumGroups holds the accompanying count
  245. TTSDigitsItemInfo** ppGroups;
  246. ULONG ulNumGroups;
  247. };
  //--- A zip code held as two digit runs.
  //    NOTE(review): pLastFour is presumably NULL for a plain five-digit code - confirm.
  248. struct TTSZipCodeItemInfo : TTSItemInfo
  249. {
  250. TTSDigitsItemInfo* pFirstFive;
  251. TTSDigitsItemInfo* pLastFour;
  252. };
  //--- Item info for a state-plus-zip-code match; only the zip code part is stored here.
  253. struct TTSStateAndZipCodeItemInfo : TTSItemInfo
  254. {
  255. TTSZipCodeItemInfo* pZipCode;
  256. };
  //--- A currency amount: a primary and a secondary number part plus bookkeeping counters.
  //    NOTE(review): presumably primary = main unit, secondary = sub-unit (compare
  //    CurrencySign::MainUnit / SecondaryUnit) - confirm in IsCurrency.
  257. struct TTSCurrencyItemInfo : TTSItemInfo
  258. {
  259. TTSNumberItemInfo* pPrimaryNumberPart;
  260. TTSNumberItemInfo* pSecondaryNumberPart;
  261. BOOL fQuantifier;
  262. long lNumPostNumberStates;
  263. long lNumPostSymbolStates;
  264. };
  //--- A year described as a pointer to its digits plus a digit count.
  265. struct TTSYearItemInfo : TTSItemInfo
  266. {
  267. const WCHAR* pYear;
  268. ULONG ulNumDigits;
  269. };
  //--- A roman numeral; wraps a pointer to another item info record.
  //    NOTE(review): presumably the info for the numeral's numeric value - confirm.
  270. struct TTSRomanNumeralItemInfo : TTSItemInfo
  271. {
  272. TTSItemInfo* pNumberInfo;
  273. };
  //--- A decade: a pointer for the century part and a number for the decade part.
  //    NOTE(review): ulDecade is presumably an index into g_Decades - confirm.
  274. struct TTSDecadeItemInfo : TTSItemInfo
  275. {
  276. const WCHAR* pCentury;
  277. ULONG ulDecade;
  278. };
  //--- A date: day-of-week and month as indices, day and year as pointers to records.
  //    NOTE(review): the base (0 or 1) of the indices and which tables they index are not
  //    visible in this header - confirm before use.
  279. struct TTSDateItemInfo : TTSItemInfo
  280. {
  281. ULONG ulDayIndex;
  282. ULONG ulMonthIndex;
  283. TTSIntegerItemInfo* pDay;
  284. TTSYearItemInfo* pYear;
  285. };
  //--- Time-of-day abbreviation tag. No explicit values are given, so AM = 0, PM = 1,
  //    UNDEFINED = 2.
  286. typedef enum
  287. {
  288. AM,
  289. PM,
  290. UNDEFINED
  291. } TIMEABBREVIATION;
  //--- Flags describing how a time of day was written.
  //    Note: fTimeAbbreviation is a BOOL, not a TIMEABBREVIATION value.
  292. struct TTSTimeOfDayItemInfo : TTSItemInfo
  293. {
  294. BOOL fTimeAbbreviation;
  295. BOOL fTwentyFourHour;
  296. BOOL fMinutes;
  297. };
  //--- A time value: hours and minutes as number records, seconds as a raw text pointer.
  298. struct TTSTimeItemInfo : TTSItemInfo
  299. {
  300. TTSNumberItemInfo* pHours;
  301. TTSNumberItemInfo* pMinutes;
  302. const WCHAR* pSeconds;
  303. };
  //--- A string split into two chunks; for each chunk the struct keeps an item info
  //    pointer and a text pointer.
  304. struct TTSHyphenatedStringInfo : TTSItemInfo
  305. {
  306. TTSItemInfo* pFirstChunkInfo;
  307. TTSItemInfo* pSecondChunkInfo;
  308. const WCHAR* pFirstChunk;
  309. const WCHAR* pSecondChunk;
  310. };
  //--- A suffix described as a pointer to its first character plus a character count.
  311. struct TTSSuffixItemInfo : TTSItemInfo
  312. {
  313. const WCHAR* pFirstChar;
  314. ULONG ulNumChars;
  315. };
  //--- A range made of two items, held as generic TTSItemInfo pointers.
  316. struct TTSNumberRangeItemInfo : TTSItemInfo
  317. {
  318. TTSItemInfo *pFirstNumberInfo;
  319. TTSItemInfo *pSecondNumberInfo;
  320. };
  //--- A range made of two time-of-day records.
  321. struct TTSTimeRangeItemInfo : TTSItemInfo
  322. {
  323. TTSTimeOfDayItemInfo *pFirstTimeInfo;
  324. TTSTimeOfDayItemInfo *pSecondTimeInfo;
  325. };
  //--- One record of the abbreviation / ambiguous-word tables: the spelling, up to three
  //    pronunciation and part-of-speech pairs, and two integer disambiguation selectors.
  326. struct AbbrevRecord
  327. {
  //--- Spelling; CompareStringAndAbbrevRecord compares against this case-insensitively
  328. const WCHAR* pOrth;
  //--- Pronunciation pointers are non-const, unlike pOrth
  329. WCHAR* pPron1;
  330. ENGPARTOFSPEECH POS1;
  331. WCHAR* pPron2;
  332. ENGPARTOFSPEECH POS2;
  333. WCHAR* pPron3;
  334. ENGPARTOFSPEECH POS3;
  //--- NOTE(review): presumably indices into g_SentBreakDisambigTable and
  //    g_PronDisambigTable (or the corresponding table for the record's owner) - confirm
  335. int iSentBreakDisambig;
  336. int iPronDisambig;
  337. };
  //--- Item info for an abbreviation: a const pointer to the matching table record.
  338. struct TTSAbbreviationInfo : TTSItemInfo
  339. {
  340. const AbbrevRecord* pAbbreviation;
  341. };
  342. //--- Structs used for Lex Lookup
  //--- Names for the two pronunciation slots. The original wrote "typedef enum { ... };"
  //    with no typedef-name, which declares nothing extra and only draws a compiler
  //    warning, so the stray keyword is dropped. The enumerators are unchanged.
  //    NOTE(review): presumably used as indices into PRONRECORD::pronArray, which has
  //    exactly two entries - confirm at the uses.
  enum { PRON_A = 0, PRON_B = 1 };
  //--- One pronunciation: a phone string with its length, and a list of parts of speech
  //    with its count. Both arrays are fixed-size, so the counts must stay within
  //    SP_MAX_PRON_LENGTH and POS_MAX respectively.
  344. struct PRONUNIT
  345. {
  346. ULONG phon_Len;
  347. WCHAR phon_Str[SP_MAX_PRON_LENGTH]; // Allo string
  348. ULONG POScount;
  349. ENGPARTOFSPEECH POScode[POS_MAX];
  350. };
  //--- Lookup record for one word: its spelling and root form, two pronunciation slots,
  //    and the part-of-speech / alternate selections.
  351. struct PRONRECORD
  352. {
  353. WCHAR orthStr[SP_MAX_WORD_LENGTH]; // Orth text
  354. WCHAR lemmaStr[SP_MAX_WORD_LENGTH]; // Root word
  355. ULONG pronType; // Pronunciation is lex or LTS
  //--- Exactly two slots (compare the PRON_A / PRON_B enumerators)
  356. PRONUNIT pronArray[2];
  357. ENGPARTOFSPEECH POSchoice;
  358. ENGPARTOFSPEECH XMLPartOfSpeech;
  //--- Note: C++ bool here, whereas fUsePron below is a Win32 BOOL
  359. bool hasAlt;
  360. ULONG altChoice;
  361. BOOL fUsePron;
  362. };
  363. //--- Miscellaneous structs and typedefs
  //--- A sentence start position: a text pointer together with a text fragment pointer.
  //    This is the element type stored by CSentenceStack.
  364. struct SentencePointer
  365. {
  366. const WCHAR *pSentenceStart;
  367. const SPVTEXTFRAG *pSentenceFrag;
  368. };
  369. //=== Function Definitions ===========================================
  370. // Misc Number Normalization functions and helpers
  //--- Free function, defined elsewhere. All three parameters are non-const references,
  //    so the callee may update the caller's start/end pointers and position.
  //    NOTE(review): the int result is presumably an index into g_CurrencySigns, with some
  //    sentinel for "no match" - confirm against the definition.
  371. int MatchCurrencySign( const WCHAR*& pStartChar, const WCHAR*& pEndChar, NORM_POSITION& ePosition );
  372. //=== Classes
  373. /*** CSentenceStack *************************************************
  374. * This class is used to maintain a stack of sentences for the Skip
  375. * call to utilize.
  376. */
  377. class CSentenceStack
  378. {
  379. public:
  380. /*--- Methods ---*/
  381. CSentenceStack() { m_StackPtr = -1; }
  382. int GetCount( void ) { return m_StackPtr + 1; }
  383. virtual SentencePointer& Pop( void ) { SPDBG_ASSERT( m_StackPtr > -1 ); return m_Stack[m_StackPtr--]; }
  384. virtual HRESULT Push( const SentencePointer& val ) { ++m_StackPtr; return m_Stack.SetAtGrow( m_StackPtr, val ); }
  385. virtual void Reset( void ) { m_StackPtr = -1; }
  386. protected:
  387. /*--- Member data ---*/
  388. CSPArray<SentencePointer,SentencePointer> m_Stack;
  389. int m_StackPtr;
  390. };
  391. /*** CSentItem
  392. * This object is a helper class
  393. */
  394. class CSentItem : public TTSSentItem
  395. {
  396. public:
  397. CSentItem() { memset( this, 0, sizeof(*this) ); }
  398. CSentItem( TTSSentItem& Other ) { memcpy( this, &Other, sizeof( Other ) ); }
  399. };
  400. /*** CSentItemEnum
  401. * This object is designed to be used by a single thread.
  402. */
  403. class ATL_NO_VTABLE CSentItemEnum :
  404. public CComObjectRootEx<CComMultiThreadModel>,
  405. public IEnumSENTITEM
  406. {
  407. /*=== ATL Setup ===*/
  408. public:
  409. DECLARE_PROTECT_FINAL_CONSTRUCT()
  410. BEGIN_COM_MAP(CSentItemEnum)
  411. COM_INTERFACE_ENTRY(IEnumSENTITEM)
  412. END_COM_MAP()
  413. /*=== Methods =======*/
  414. public:
  415. /*--- Constructors/Destructors ---*/
  416. /*--- Non interface methods ---*/
  417. void _SetOwner( IUnknown* pOwner ) { m_cpOwner = pOwner; }
  418. CItemList& _GetList( void ) { return m_ItemList; }
  419. CSentItemMemory& _GetMemoryManager( void ) { return m_MemoryManager; }
  420. /*=== Interfaces ====*/
  421. public:
  422. //--- IEnumSpSentence ----------------------------------------
  423. STDMETHOD(Next)( TTSSentItem *pItemEnum );
  424. STDMETHOD(Reset)( void );
  425. /*=== Member data ===*/
  426. private:
  427. CComPtr<IUnknown> m_cpOwner;
  428. CItemList m_ItemList;
  429. SPLISTPOS m_ListPos;
  430. CSentItemMemory m_MemoryManager;
  431. };
  432. /*** CStdSentEnum COM object
  433. */
  //--- Sentence enumerator COM object (implements IEnumSpSentence). This is a declaration
  //    only: every method below is defined in another translation unit.
  //    The class has no C++ constructor, so the raw pointer and enum members are not
  //    initialised at construction.
  //    NOTE(review): presumably FinalConstruct() / SetFragList() set them - confirm.
  434. class ATL_NO_VTABLE CStdSentEnum :
  435. public CComObjectRootEx<CComMultiThreadModel>,
  436. public IEnumSpSentence
  437. {
  438. /*=== ATL Setup ===*/
  439. public:
  440. DECLARE_GET_CONTROLLING_UNKNOWN()
  441. DECLARE_PROTECT_FINAL_CONSTRUCT()
  442. BEGIN_COM_MAP(CStdSentEnum)
  443. COM_INTERFACE_ENTRY(IEnumSpSentence)
  444. END_COM_MAP()
  445. /*=== Methods =======*/
  446. public:
  447. /*--- Constructors/Destructors ---*/
  448. HRESULT FinalConstruct();
  449. void FinalRelease();
  450. /*--- Non interface methods ---*/
  451. HRESULT InitAggregateLexicon( void );
  452. HRESULT AddLexiconToAggregate( ISpLexicon *pAddLexicon, DWORD dwFlags );
  453. HRESULT InitMorphLexicon( void );
  //--- The three functions below share the SentBreakDisambigFunc signature declared after
  //    this class, and they report their decision through pfIsEOS.
  454. //--- Abbreviation Sentence Breaking Disambiguation Functions
  455. HRESULT IsAbbreviationEOS( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos,
  456. CSentItemMemory& MemoryManager, BOOL* pfIsEOS );
  457. HRESULT IfEOSNotAbbreviation( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos,
  458. CSentItemMemory& MemoryManager, BOOL* pfIsEOS );
  459. HRESULT IfEOSAndLowercaseNotAbbreviation( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos,
  460. CSentItemMemory& MemoryManager, BOOL* pfIsEOS );
  //--- The functions in this group share the PronDisambigFunc signature declared after
  //    this class.
  461. //--- Abbreviation Pronunciation Disambiguation Functions
  462. HRESULT SingleOrPluralAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  463. CItemList& ItemList, SPLISTPOS ListPos );
  464. HRESULT DoctorDriveAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  465. CItemList& ItemList, SPLISTPOS ListPos );
  466. HRESULT AbbreviationFollowedByDigit( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  467. CItemList& ItemList, SPLISTPOS ListPos );
  468. HRESULT AllCapsAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  469. CItemList& ItemList, SPLISTPOS ListPos );
  470. HRESULT CapitalizedAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  471. CItemList& ItemList, SPLISTPOS ListPos );
  472. HRESULT SECAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  473. CItemList& ItemList, SPLISTPOS ListPos );
  474. HRESULT DegreeAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  475. CItemList& ItemList, SPLISTPOS ListPos );
  476. HRESULT AbbreviationModifier( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  477. CItemList& ItemList, SPLISTPOS ListPos );
  478. HRESULT ADisambig( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  479. CItemList& ItemList, SPLISTPOS ListPos );
  480. HRESULT PolishDisambig( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron,
  481. CItemList& ItemList, SPLISTPOS ListPos );
  //--- The functions in this group share the PostLexLookupDisambigFunc signature declared
  //    after this class.
  482. //--- Word Pronunciation Disambiguation Functions
  483. HRESULT MeasurementDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList,
  484. SPLISTPOS ListPos, CSentItemMemory& MemoryManager );
  485. HRESULT TheDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList,
  486. SPLISTPOS ListPos, CSentItemMemory& MemoryManager );
  487. HRESULT ReadDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList,
  488. SPLISTPOS ListPos, CSentItemMemory& MemoryManager );
  489. private:
  490. //--- Pronunciation Table init helper
  491. HRESULT InitPron( WCHAR** OriginalPron );
  492. //--- Sentence breaking helpers ---//
  493. HRESULT GetNextSentence( IEnumSENTITEM** pItemEnum );
  494. HRESULT AddNextSentItem( CItemList& ItemList, CSentItemMemory& MemoryManager, BOOL* pfIsEOS );
  //--- Takes its position arguments by reference, so the caller's pointers may be updated.
  //    pItemList defaults to NULL and fAddToItemList to false.
  495. HRESULT SkipWhiteSpaceAndTags( const WCHAR*& pStartChar, const WCHAR*& pEndChar,
  496. const SPVTEXTFRAG*& pCurrFrag, CSentItemMemory& pMemoryManager,
  497. BOOL fAddToItemList = false, CItemList* pItemList = NULL );
  498. const WCHAR* FindTokenEnd( const WCHAR* pStartChar, const WCHAR* pEndChar );
  499. //--- Lexicon and POS helpers ---//
  500. HRESULT DetermineProns( CItemList& ItemList, CSentItemMemory& MemoryManager );
  501. HRESULT Pronounce( PRONRECORD *pPron );
  //--- NOTE(review): by naming, the Is* helpers below test for a category and fill in an
  //    item info record, and the Expand* helpers append words to a CWordList - confirm
  //    against the definitions. pItemNormInfo is a pointer passed by reference.
  502. //--- Normalization helpers ---//
  503. HRESULT Normalize( CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager );
  504. HRESULT MatchCategory( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  505. HRESULT ExpandCategory( TTSItemInfo*& pItemNormInfo, CItemList& ItemList, SPLISTPOS ListPos,
  506. CSentItemMemory& MemoryManager );
  //--- Note: despite the "pp" prefix, ppConvertedString is a single-level WCHAR pointer
  507. HRESULT DoUnicodeToAsciiMap( const WCHAR *pUnicodeString, ULONG ulUnicodeStringLength,
  508. WCHAR *ppConvertedString );
  509. HRESULT IsAlphaWord( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo,
  510. CSentItemMemory& MemoryManager );
  511. HRESULT IsInitialism( CItemList& ItemList, SPLISTPOS ItemPos, CSentItemMemory& MemoryManager,
  512. BOOL* pfIsEOS );
  513. //--- Various Number Related Normalization helpers ---//
  514. HRESULT IsNumberCategory( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager );
  515. HRESULT IsNumber( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager,
  516. BOOL fMultiItem = true );
  517. HRESULT IsInteger( const WCHAR* pStartChar, TTSIntegerItemInfo*& pIntegerInfo,
  518. CSentItemMemory& MemoryManager );
  519. HRESULT IsDigitString( const WCHAR* pStartChar, TTSDigitsItemInfo*& pDigitsInfo,
  520. CSentItemMemory& MemoryManager );
  521. HRESULT ExpandNumber( TTSNumberItemInfo* pItemInfo, CWordList& WordList );
  522. HRESULT ExpandPercent( TTSNumberItemInfo* pItemInfo, CWordList& WordList );
  523. HRESULT ExpandDegrees( TTSNumberItemInfo* pItemInfo, CWordList& WordList );
  524. HRESULT ExpandSquare( TTSNumberItemInfo* pItemInfo, CWordList& WordList );
  525. HRESULT ExpandCube( TTSNumberItemInfo* pItemInfo, CWordList& WordList );
  //--- Note: the expanders from here through ExpandDigits return void, so they have no
  //    way to report failure
  526. void ExpandInteger( TTSIntegerItemInfo* pItemInfo, const WCHAR* Context, CWordList &WordList );
  527. void ExpandDigit( const WCHAR Number, NumberGroup& NormGroupInfo, CWordList& WordList );
  528. void ExpandTwoDigits( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList );
  529. void ExpandThreeDigits( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList );
  530. void ExpandDigitOrdinal( const WCHAR Number, NumberGroup& NormGroupInfo, CWordList& WordList );
  531. void ExpandTwoOrdinal( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList );
  532. void ExpandThreeOrdinal( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList );
  533. void ExpandDigits( TTSDigitsItemInfo* pItemInfo, CWordList& WordList );
  534. HRESULT IsFraction( const WCHAR* pStartChar, TTSFractionItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager );
  535. HRESULT ExpandFraction( TTSFractionItemInfo* pItemInfo, CWordList& WordList );
  536. HRESULT IsRomanNumeral( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager );
  537. HRESULT IsPhoneNumber( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager, CWordList& WordList );
  538. HRESULT IsZipCode( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager );
  539. HRESULT ExpandZipCode( TTSZipCodeItemInfo* pItemInfo, CWordList& WordList );
  540. HRESULT IsCurrency( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager,
  541. CWordList& WordList );
  542. HRESULT IsNumberRange( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager );
  543. HRESULT ExpandNumberRange( TTSNumberRangeItemInfo* pItemInfo, CWordList& WordList );
  544. HRESULT IsCurrencyRange( TTSItemInfo*& pItemInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  545. //--- Date Related Normalization helpers ---//
  546. HRESULT IsNumericCompactDate( TTSItemInfo*& pItemNormInfo, const WCHAR* Context,
  547. CSentItemMemory& MemoryManager );
  548. HRESULT IsMonthStringCompactDate( TTSItemInfo*& pItemNormInfo, const WCHAR* Context,
  549. CSentItemMemory& MemoryManager );
  550. HRESULT IsLongFormDate_DMDY( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  551. HRESULT IsLongFormDate_DDMY( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  552. HRESULT ExpandDate( TTSDateItemInfo* pItemInfo, CWordList& WordList );
  553. HRESULT ExpandYear( TTSYearItemInfo* pItemInfo, CWordList& WordList );
  554. HRESULT IsDecade( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager );
  555. HRESULT ExpandDecade( TTSDecadeItemInfo* pItemInfo, CWordList& WordList );
  //--- The three matchers below take non-const pointers by reference or by address, so
  //    they can move the caller's position.
  //    NOTE(review): the meaning of the ULONG results (index? sentinel on failure?) is not
  //    visible in this header - confirm against the definitions.
  556. ULONG MatchMonthString( WCHAR*& pMonth, ULONG ulLength );
  557. ULONG MatchDayString( WCHAR*& pDayString, WCHAR* pEndChar );
  558. bool MatchDateDelimiter( WCHAR **DateString );
  559. //--- Time Related Normalization helpers ---//
  560. HRESULT IsTimeOfDay( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList, BOOL fMultiItem = true );
  561. HRESULT IsTime( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager );
  562. HRESULT ExpandTime( TTSTimeItemInfo* pItemInfo, CWordList& WordList );
  563. HRESULT IsTimeRange( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  564. //--- SPELL tag normalization helper
  565. HRESULT SpellOutString( CWordList& WordList );
  566. void ExpandPunctuation( CWordList& WordList, WCHAR wc );
  567. //--- Default normalization helper
  568. HRESULT ExpandUnrecognizedString( CWordList& WordList, CSentItemMemory& MemoryManager );
  569. //--- Misc. normalization helpers
  570. HRESULT IsStateAndZipcode( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
  571. HRESULT IsHyphenatedString( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo,
  572. CSentItemMemory& MemoryManager );
  573. HRESULT ExpandHyphenatedString( TTSHyphenatedStringInfo* pItemInfo, CWordList& WordList );
  574. HRESULT IsSuffix( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo,
  575. CSentItemMemory& MemoryManager );
  576. HRESULT ExpandSuffix( TTSSuffixItemInfo* pItemInfo, CWordList& WordList );
  577. bool Zeroes( const WCHAR* );
  578. bool ThreeZeroes( const WCHAR* );
  579. bool IsPunctuation(const TTSSentItem *Item);
  580. /*=== Interfaces ====*/
  581. public:
  582. //--- IEnumSpSentence ----------------------------------------
  583. STDMETHOD(SetFragList)( const SPVTEXTFRAG* pTextFragList, DWORD dwFlags );
  584. STDMETHOD(Next)( IEnumSENTITEM **ppSentItemEnum );
  585. STDMETHOD(Previous)( IEnumSENTITEM **ppSentItemEnum );
  586. STDMETHOD(Reset)( void );
  587. //=== Data members ===
  588. private:
  589. CComPtr<ISpContainerLexicon> m_cpAggregateLexicon;
  590. CComPtr<ISpPhoneConverter> m_cpPhonemeConverter;
  //--- Raw pointer (not a smart pointer)
  //    NOTE(review): presumably created in InitMorphLexicon and freed in FinalRelease
  591. CSMorph* m_pMorphLexicon;
  592. DWORD m_dwSpeakFlags;
  //--- Text position state: fragment list, current fragment, and character cursors
  593. const SPVTEXTFRAG* m_pTextFragList;
  594. const SPVTEXTFRAG* m_pCurrFrag;
  595. const WCHAR* m_pNextChar;
  596. const WCHAR* m_pEndChar;
  597. const WCHAR* m_pEndOfCurrToken;
  598. const WCHAR* m_pEndOfCurrItem;
  599. CSentenceStack m_SentenceStack;
  600. SEPARATOR_AND_DECIMAL m_eSeparatorAndDecimal;
  601. SHORT_DATE_ORDER m_eShortDateOrder;
  //--- Static: one critical section shared by every instance; needs a definition in
  //    exactly one translation unit
  602. static CComAutoCriticalSection m_AbbrevTableCritSec;
  603. };
  604. //--- Structs and typedefs used for abbreviation stuff
  //--- Pointer-to-member-function types matching the three families of disambiguation
  //    methods declared on CStdSentEnum. A value of one of these types is invoked on an
  //    instance with the ->* or .* operator.
  605. typedef HRESULT (CStdSentEnum::* SentBreakDisambigFunc)(const AbbrevRecord*, CItemList& , SPLISTPOS,
  606. CSentItemMemory&, BOOL*);
  607. typedef HRESULT (CStdSentEnum::* PronDisambigFunc) ( const AbbrevRecord*, PRONRECORD*, CItemList&, SPLISTPOS );
  608. typedef HRESULT (CStdSentEnum::* PostLexLookupDisambigFunc) ( const AbbrevRecord*, CItemList&, SPLISTPOS, CSentItemMemory& );
  //--- Word tables are non-const, while the function tables beside them are const.
  //    NOTE(review): the word tables are presumably patched at start-up (see
  //    g_fAbbrevTablesInitialized and CleanupAbbrevTables) - confirm. The function tables
  //    are declared with unknown bounds.
  609. extern AbbrevRecord g_AbbreviationTable[177];
  610. extern const PronDisambigFunc g_PronDisambigTable[];
  611. extern const SentBreakDisambigFunc g_SentBreakDisambigTable[];
  612. extern AbbrevRecord g_AmbiguousWordTable[72];
  613. extern const PronDisambigFunc g_AmbiguousWordDisambigTable[];
  614. extern AbbrevRecord g_PostLexLookupWordTable[41];
  615. extern const PostLexLookupDisambigFunc g_PostLexLookupDisambigTable[];
  616. extern WCHAR *g_pOfA;
  617. extern WCHAR *g_pOfAn;
  618. extern BOOL g_fAbbrevTablesInitialized;
  619. extern void CleanupAbbrevTables( void );
  620. //--- First words table - used in sentence breaking
  621. extern const SPLSTR g_FirstWords[163];
  622. //
  623. //=== Inlines
  624. //
  /*** my_wcstoul
  *   Strict wrapper around wcstoul( ..., 10 ): conversion is attempted only when the very
  *   first character is a digit. Plain wcstoul would also accept leading white space or
  *   a sign, which this wrapper rejects.
  *
  *   pStartChar - text to parse; must not be NULL (it is dereferenced unconditionally).
  *   ppEndChar  - optional; receives the position following the digits, or pStartChar
  *                itself when nothing was parsed.
  *   Returns the parsed value, or 0 when the text does not start with a digit.
  */
  inline ULONG my_wcstoul( const WCHAR *pStartChar, WCHAR **ppEndChar )
  {
      if ( !iswdigit( *pStartChar ) )
      {
          //--- Nothing consumed: report the start as the end, like wcstoul does on failure
          if ( ppEndChar )
          {
              *ppEndChar = (WCHAR*) pStartChar;
          }
          return 0;
      }

      return wcstoul( pStartChar, ppEndChar, 10 );
  }
  640. inline ENGPARTOFSPEECH ConvertItemTypeToPartOfSp( TTSItemType ItemType )
  641. {
  642. switch ( ItemType )
  643. {
  644. case eOPEN_PARENTHESIS:
  645. case eOPEN_BRACKET:
  646. case eOPEN_BRACE:
  647. return MS_GroupBegin;
  648. case eCLOSE_PARENTHESIS:
  649. case eCLOSE_BRACKET:
  650. case eCLOSE_BRACE:
  651. return MS_GroupEnd;
  652. case eSINGLE_QUOTE:
  653. case eDOUBLE_QUOTE:
  654. return MS_Quotation;
  655. case ePERIOD:
  656. case eQUESTION:
  657. case eEXCLAMATION:
  658. return MS_EOSItem;
  659. case eCOMMA:
  660. case eCOLON:
  661. case eSEMICOLON:
  662. case eHYPHEN:
  663. case eELLIPSIS:
  664. return MS_MiscPunc;
  665. default:
  666. return MS_Unknown;
  667. }
  668. }
  /*** MatchPhoneNumberDelimiter
  *   Returns true when wc is one of the three characters accepted between the groups of
  *   a phone number: space, hyphen or period.
  */
  inline bool MatchPhoneNumberDelimiter( const WCHAR wc )
  {
      switch ( wc )
      {
      case L' ':
      case L'-':
      case L'.':
          return true;

      default:
          return false;
      }
  }
  673. inline bool NeedsToBeNormalized( const AbbrevRecord* pAbbreviation )
  674. {
  675. if( !wcscmp( pAbbreviation->pOrth, L"jan" ) ||
  676. !wcscmp( pAbbreviation->pOrth, L"feb" ) ||
  677. !wcscmp( pAbbreviation->pOrth, L"mar" ) ||
  678. !wcscmp( pAbbreviation->pOrth, L"apr" ) ||
  679. !wcscmp( pAbbreviation->pOrth, L"jun" ) ||
  680. !wcscmp( pAbbreviation->pOrth, L"jul" ) ||
  681. !wcscmp( pAbbreviation->pOrth, L"aug" ) ||
  682. !wcscmp( pAbbreviation->pOrth, L"sep" ) ||
  683. !wcscmp( pAbbreviation->pOrth, L"sept" ) ||
  684. !wcscmp( pAbbreviation->pOrth, L"oct" ) ||
  685. !wcscmp( pAbbreviation->pOrth, L"nov" ) ||
  686. !wcscmp( pAbbreviation->pOrth, L"dec" ) ||
  687. !wcscmp( pAbbreviation->pOrth, L"mon" ) ||
  688. !wcscmp( pAbbreviation->pOrth, L"tue" ) ||
  689. !wcscmp( pAbbreviation->pOrth, L"tues" ) ||
  690. !wcscmp( pAbbreviation->pOrth, L"wed" ) ||
  691. !wcscmp( pAbbreviation->pOrth, L"thu" ) ||
  692. !wcscmp( pAbbreviation->pOrth, L"thur" ) ||
  693. !wcscmp( pAbbreviation->pOrth, L"thurs" ) ||
  694. !wcscmp( pAbbreviation->pOrth, L"fri" ) ||
  695. !wcscmp( pAbbreviation->pOrth, L"sat" ) ||
  696. !wcscmp( pAbbreviation->pOrth, L"sun" ) )
  697. {
  698. return true;
  699. }
  700. else
  701. {
  702. return false;
  703. }
  704. }
  705. inline HRESULT SetWordList( CSentItem& Item, CWordList& WordList, CSentItemMemory& MemoryManager )
  706. {
  707. HRESULT hr = S_OK;
  708. SPLISTPOS WordListPos = WordList.GetHeadPosition();
  709. Item.ulNumWords = WordList.GetCount();
  710. Item.Words = (TTSWord*) MemoryManager.GetMemory( Item.ulNumWords * sizeof(TTSWord), &hr );
  711. if ( SUCCEEDED( hr ) )
  712. {
  713. ULONG ulIndex = 0;
  714. while ( WordListPos )
  715. {
  716. SPDBG_ASSERT( ulIndex < Item.ulNumWords );
  717. Item.Words[ulIndex++] = WordList.GetNext( WordListPos );
  718. }
  719. }
  720. return hr;
  721. }
  722. inline int CompareStringAndSPLSTR( const void* _String, const void* _SPLSTR )
  723. {
  724. int _StringLen = wcslen( (const WCHAR*) _String );
  725. int _SPLSTRLen = ( (const SPLSTR*) _SPLSTR )->Len;
  726. if ( _StringLen < _SPLSTRLen )
  727. {
  728. int Result = wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _StringLen );
  729. if ( Result != 0 )
  730. {
  731. return Result;
  732. }
  733. else
  734. {
  735. return -1;
  736. }
  737. }
  738. else if ( _StringLen > _SPLSTRLen )
  739. {
  740. int Result = wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _SPLSTRLen );
  741. if ( Result != 0 )
  742. {
  743. return Result;
  744. }
  745. else
  746. {
  747. return 1;
  748. }
  749. }
  750. else
  751. {
  752. return ( wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _StringLen ) );
  753. }
  754. }
  755. inline int CompareStringAndStateStruct( const void* _String, const void* _StateStruct )
  756. {
  757. int _StringLen = wcslen( (const WCHAR*) _String );
  758. int _StateStructLen = ( (const StateStruct*) _StateStruct )->Abbreviation.Len;
  759. if ( _StringLen < _StateStructLen )
  760. {
  761. int Result = wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr,
  762. _StringLen );
  763. if ( Result != 0 )
  764. {
  765. return Result;
  766. }
  767. else
  768. {
  769. return -1;
  770. }
  771. }
  772. else if ( _StringLen > _StateStructLen )
  773. {
  774. int Result = wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr,
  775. _StateStructLen );
  776. if ( Result != 0 )
  777. {
  778. return Result;
  779. }
  780. else
  781. {
  782. return 1;
  783. }
  784. }
  785. else
  786. {
  787. return ( wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr,
  788. _StringLen ) );
  789. }
  790. }
  791. inline int CompareStringAndAbbrevRecord( const void* _String, const void* _AbbrevRecord )
  792. {
  793. return ( _wcsicmp( (const WCHAR*) _String, ( (const AbbrevRecord*) _AbbrevRecord )->pOrth ) );
  794. }
  /*** CompareWCHARAndWCHAR
  *   Comparison callback with a bsearch-style signature for single WCHAR values: returns
  *   the first character's value minus the second's, i.e. negative, zero or positive.
  */
  inline int CompareWCHARAndWCHAR( const void *pWCHAR_1, const void *pWCHAR_2 )
  {
      const WCHAR wcFirst  = *( (const WCHAR*) pWCHAR_1 );
      const WCHAR wcSecond = *( (const WCHAR*) pWCHAR_2 );

      return ( wcFirst - wcSecond );
  }
  /*** IsSpace
  *   Returns nonzero when wc is one of the five code points this header treats as white
  *   space: space (0x20), tab (0x9), carriage return (0xD), line feed (0xA) or
  *   U+200B (zero width space). Returns zero for everything else.
  */
  inline BOOL IsSpace( WCHAR wc )
  {
      switch ( wc )
      {
      case 0x20:
      case 0x9:
      case 0xD:
      case 0xA:
      case 0x200B:
          return true;

      default:
          return false;
      }
  }
  /*** IsCapital
  *   Returns nonzero when wc is an ASCII capital letter, 'A' through 'Z' inclusive.
  *   Capital letters outside that range (accented letters, for example) yield zero.
  */
  inline BOOL IsCapital( WCHAR wc )
  {
      if ( wc < L'A' )
      {
          return false;
      }
      return ( wc <= L'Z' );
  }
  808. inline TTSItemType IsGroupBeginning( WCHAR wc )
  809. {
  810. if ( wc == L'(' )
  811. {
  812. return eOPEN_PARENTHESIS;
  813. }
  814. else if ( wc == L'[' )
  815. {
  816. return eOPEN_BRACKET;
  817. }
  818. else if ( wc == L'{' )
  819. {
  820. return eOPEN_BRACE;
  821. }
  822. else
  823. {
  824. return eUNMATCHED;
  825. }
  826. }
  827. inline TTSItemType IsGroupEnding( WCHAR wc )
  828. {
  829. if ( wc == L')' )
  830. {
  831. return eCLOSE_PARENTHESIS;
  832. }
  833. else if ( wc == L']' )
  834. {
  835. return eCLOSE_BRACKET;
  836. }
  837. else if ( wc == L'}' )
  838. {
  839. return eCLOSE_BRACE;
  840. }
  841. else
  842. {
  843. return eUNMATCHED;
  844. }
  845. }
  846. inline TTSItemType IsQuotationMark( WCHAR wc )
  847. {
  848. if ( wc == L'\'' )
  849. {
  850. return eSINGLE_QUOTE;
  851. }
  852. else if ( wc == L'\"' )
  853. {
  854. return eDOUBLE_QUOTE;
  855. }
  856. else
  857. {
  858. return eUNMATCHED;
  859. }
  860. }
  861. inline TTSItemType IsEOSItem( WCHAR wc )
  862. {
  863. if ( wc == L'.' )
  864. {
  865. return ePERIOD;
  866. }
  867. else if ( wc == L'!' )
  868. {
  869. return eEXCLAMATION;
  870. }
  871. else if ( wc == L'?' )
  872. {
  873. return eQUESTION;
  874. }
  875. else
  876. {
  877. return eUNMATCHED;
  878. }
  879. }
  880. inline TTSItemType IsMiscPunctuation( WCHAR wc )
  881. {
  882. if ( wc == L',' )
  883. {
  884. return eCOMMA;
  885. }
  886. else if ( wc == L';' )
  887. {
  888. return eSEMICOLON;
  889. }
  890. else if ( wc == L':' )
  891. {
  892. return eCOLON;
  893. }
  894. else if ( wc == L'-' )
  895. {
  896. return eHYPHEN;
  897. }
  898. else
  899. {
  900. return eUNMATCHED;
  901. }
  902. }
  903. #endif //--- This must be the last line in the file