|
|
/******************************************************************************
* StdSentEnum.h * *---------------* * This is the header file for the CStdSentEnum implementation. *------------------------------------------------------------------------------ * Copyright (C) 1999 Microsoft Corporation Date: 03/01/99 * All Rights Reserved * *********************************************************************** EDC ***/ #ifndef StdSentEnum_h
#define StdSentEnum_h
//--- Additional includes
#include "stdafx.h"
#include "ms_entropicengine.h"
#include "resource.h"
#include "SentItemMemory.h"
#include "morph.h"
#include "TTSPropertiesDialog.h"
//=== Constants ====================================================
//--- Vowel WCHAR values - used to disambiguate pronunciations of certain words.
//    The trailing comment on each entry names the vowel it stands for
//    (presumably the engine's phoneme symbol - confirm against the phone set).
//    The values are listed in ascending order.
//    NOTE(review): a namespace-scope const array has internal linkage, so each
//    translation unit that includes this header gets its own copy.
const WCHAR g_Vowels[] = { 0x0a, // AA
0x0b, // AE
0x0c, // AH
0x0d, // AO
0x0e, // AW
0x0f, // AX
0x10, // AY
0x15, // EH
0x16, // ER
0x17, // EY
0x1b, // IH
0x1c, // IY
0x23, // OW
0x24, // OY
0x2a, // UH
0x2b, // UW
};
//--- Normalization constants - see NormData.cpp
extern const char g_pFlagCharacter; extern const unsigned char g_AnsiToAscii[256]; extern const SPLSTR g_O; extern const SPLSTR g_negative; extern const SPLSTR g_decimalpoint; extern const SPLSTR g_to; extern const SPLSTR g_a; extern const SPLSTR g_of; extern const SPLSTR g_percent; extern const SPLSTR g_degree; extern const SPLSTR g_degrees; extern const SPLSTR g_squared; extern const SPLSTR g_cubed; extern const SPLSTR g_ones[10]; extern const SPLSTR g_tens[10]; extern const SPLSTR g_teens[10]; extern const SPLSTR g_onesOrdinal[10]; extern const SPLSTR g_tensOrdinal[10]; extern const SPLSTR g_teensOrdinal[10]; extern const SPLSTR g_quantifiers[6]; extern const SPLSTR g_quantifiersOrdinal[6]; extern const SPLSTR g_dash; extern WCHAR g_Euro[2];
//--- One currency table entry: the sign plus two unit strings.
//    NOTE(review): MainUnit / SecondaryUnit are presumably the spoken names of
//    the major and minor units - confirm against g_CurrencySigns.
struct CurrencySign
{
    SPLSTR Sign;
    SPLSTR MainUnit;
    SPLSTR SecondaryUnit;
};
//--- One state table entry: an abbreviation paired with the full name.
struct StateStruct
{
    SPLSTR Abbreviation;
    SPLSTR FullName;
};
extern const StateStruct g_StateAbbreviations[63]; extern const CurrencySign g_CurrencySigns[14]; extern const SPLSTR g_SingularPrimaryCurrencySigns[14]; extern const SPLSTR g_SingularSecondaryCurrencySigns[14]; extern const WCHAR g_DateDelimiters[3]; extern const SPLSTR g_months[12]; extern const SPLSTR g_monthAbbreviations[13]; extern const SPLSTR g_days[7]; extern const SPLSTR g_dayAbbreviations[10]; extern const SPLSTR g_Area; extern const SPLSTR g_Country; extern const SPLSTR g_Code; extern const SPLSTR g_Half; extern const SPLSTR g_Tenths; extern const SPLSTR g_Sixteenths; extern const SPLSTR g_Hundredths; extern const SPLSTR g_Over; extern const SPLSTR g_PluralDenominators[10]; extern const SPLSTR g_A; extern const SPLSTR g_M; extern const SPLSTR g_P; extern const SPLSTR g_OClock; extern const SPLSTR g_hundred; extern const SPLSTR g_hour; extern const SPLSTR g_hours; extern const SPLSTR g_minute; extern const SPLSTR g_minutes; extern const SPLSTR g_second; extern const SPLSTR g_seconds; extern const SPLSTR g_ANSICharacterProns[256]; extern const SPVSTATE g_DefaultXMLState; extern const SPLSTR g_And; extern const SPLSTR g_comma; extern const SPLSTR g_period; extern const SPLSTR g_periodString; extern const SPLSTR g_slash; extern const SPLSTR g_Decades[]; extern const SPLSTR g_Zeroes; extern const SPLSTR g_Hundreds;
//--- Numeric limits for the day, month, year, hour, minute and second fields.
//    NOTE(review): presumably inclusive bounds used when validating parsed
//    dates and times - confirm against the callers.
//    NOTE(review): HOURMIN is 1 while HOURMAX is 23, so an hour value of 0
//    falls outside [HOURMIN, HOURMAX] - confirm that this is intended.
#define DAYMAX 31
#define DAYMIN 1
#define MONTHMAX 12
#define MONTHMIN 1
#define YEARMAX 9999
#define YEARMIN 0
#define HOURMIN 1
#define HOURMAX 23
#define MINUTEMIN 0
#define MINUTEMAX 59
#define SECONDMIN 0
#define SECONDMAX 59
//--- POS Tagger Constants - see MiscData.cpp
//--- Template kinds a BrillPatch can use. Values are assigned implicitly,
//    starting at 0, so the order of the enumerators must not change.
//    NOTE(review): the T / W suffixes presumably stand for "tag" (part of
//    speech) and "word" - confirm against the tagger in MiscData.cpp.
enum TEMPLATETYPE
{
    PREV1T,
    NEXT1T,
    PREV2T,
    NEXT2T,
    PREV1OR2T,
    NEXT1OR2T,
    PREV1OR2OR3T,
    NEXT1OR2OR3T,
    PREV1TNEXT1T,
    PREV1TNEXT2T,
    PREV2TNEXT1T,
    NOTCAP,
    CAP,
    PREVNOTCAP,
    PREVCAP,
    PREV1W,
    NEXT1W,
    PREV2W,
    NEXT2W,
    PREV1OR2W,
    NEXT1OR2W,
    CURRWPREV1W,
    CURRWNEXT1W,
    CURRWPREV1T,
    CURRWNEXT1T,
    CURRW,
    PREV1WT,
    NEXT1WT,
    CURRWPREV1WT,
    CURRWNEXT1WT
};
//--- One entry of the POS tagger patch table: a current and a convert-to part
//    of speech, the template type, and up to two template POS values and
//    two template words.
//    NOTE(review): which of the template fields is consulted presumably
//    depends on eTemplateType - confirm against the tagger implementation.
struct BrillPatch
{
    ENGPARTOFSPEECH eCurrentPOS;
    ENGPARTOFSPEECH eConvertToPOS;
    TEMPLATETYPE    eTemplateType;
    ENGPARTOFSPEECH eTemplatePOS1;
    ENGPARTOFSPEECH eTemplatePOS2;
    const WCHAR*    pTemplateWord1;
    const WCHAR*    pTemplateWord2;
};

//--- The patch table itself (declaration only)
extern const BrillPatch g_POSTaggerPatches [62];
//=== Class, Enum, Struct and Union Declarations ===================
//--- List types used throughout normalization
typedef CSPList<TTSWord,TTSWord&>         CWordList;
typedef CSPList<TTSSentItem,TTSSentItem&> CItemList;
//--- Structs used for normalization
//--- Position values; passed by reference to MatchCurrencySign.
//    Values are implicit: PRECEDING = 0, FOLLOWING = 1, UNATTACHED = 2.
typedef enum
{
    PRECEDING,
    FOLLOWING,
    UNATTACHED
} NORM_POSITION;
//--- Flags recording which parts of a number group are present
struct NumberGroup
{
    BOOL fOnes;        // "one" through "nineteen"
    BOOL fTens;        // "twenty" through "ninety"
    BOOL fHundreds;    // "one hundred" through "nine hundred"
    BOOL fQuantifier;  // "thousand" through "quadrillion"
};
//--- Normalization info for an integer: up to six NumberGroup entries plus
//    ordinal / digit-by-digit flags and the digit count.
struct TTSIntegerItemInfo
{
    long            lNumGroups;
    NumberGroup     Groups[6];
    BOOL            fOrdinal;
    BOOL            fDigitByDigit;
    ULONG           ulNumDigits;

    //--- Normalization internal only
    long            lLeftOver;
    BOOL            fSeparators;
    const WCHAR*    pStartChar;
    const WCHAR*    pEndChar;
};
//--- Normalization info for a digit string: pointer to the first digit and
//    the number of digits.
struct TTSDigitsItemInfo : TTSItemInfo
{
    const WCHAR*    pFirstDigit;
    ULONG           ulNumDigits;
};
//--- Forward declaration: a fraction refers to numbers and a number may in
//    turn hold a fraction.
struct TTSNumberItemInfo;

//--- Normalization info for a fraction: numerator and denominator numbers.
//    NOTE(review): pVulgar presumably points at a single-character "vulgar
//    fraction" in the text - confirm against IsFraction.
struct TTSFractionItemInfo
{
    BOOL                fIsStandard;
    TTSNumberItemInfo*  pNumerator;
    TTSNumberItemInfo*  pDenominator;

    //--- Normalization internal only
    const WCHAR*        pVulgar;
};
//--- Normalization info for a number: sign flag plus optional integer,
//    decimal and fractional parts (each held by pointer).
struct TTSNumberItemInfo : TTSItemInfo
{
    BOOL                    fNegative;
    TTSIntegerItemInfo*     pIntegerPart;
    TTSDigitsItemInfo*      pDecimalPart;
    TTSFractionItemInfo*    pFractionalPart;

    //--- Normalization internal only
    const WCHAR*            pStartChar;
    const WCHAR*            pEndChar;
    CWordList*              pWordList;
};
//--- Normalization info for a phone number, split into country code, area
//    code and the groups of the main number.
struct TTSPhoneNumberItemInfo : TTSItemInfo
{
    //--- Country code members
    TTSNumberItemInfo*  pCountryCode;

    //--- Area code members
    TTSDigitsItemInfo*  pAreaCode;
    BOOL                fIs800;
    BOOL                fOne;

    //--- Main number members
    TTSDigitsItemInfo** ppGroups;
    ULONG               ulNumGroups;
};
//--- Normalization info for a zip code: two digit strings, named for the
//    first five and the last four digits.
struct TTSZipCodeItemInfo : TTSItemInfo
{
    TTSDigitsItemInfo*  pFirstFive;
    TTSDigitsItemInfo*  pLastFour;
};
//--- Normalization info for a state followed by a zip code; only the zip
//    code part is stored here.
struct TTSStateAndZipCodeItemInfo : TTSItemInfo
{
    TTSZipCodeItemInfo* pZipCode;
};
//--- Normalization info for a currency amount: primary and secondary number
//    parts, a quantifier flag and two state counts.
//    NOTE(review): the exact meaning of the two lNumPost* counters is not
//    visible from this header - see IsCurrency.
struct TTSCurrencyItemInfo : TTSItemInfo
{
    TTSNumberItemInfo*  pPrimaryNumberPart;
    TTSNumberItemInfo*  pSecondaryNumberPart;
    BOOL                fQuantifier;
    long                lNumPostNumberStates;
    long                lNumPostSymbolStates;
};
//--- Normalization info for a year: pointer to the year text and its number
//    of digits.
struct TTSYearItemInfo : TTSItemInfo
{
    const WCHAR*    pYear;
    ULONG           ulNumDigits;
};
//--- Normalization info for a Roman numeral; wraps the info of the number
//    it was matched as.
struct TTSRomanNumeralItemInfo : TTSItemInfo
{
    TTSItemInfo*    pNumberInfo;
};
//--- Normalization info for a decade: pointer to the century text plus a
//    decade value.
struct TTSDecadeItemInfo : TTSItemInfo
{
    const WCHAR*    pCentury;
    ULONG           ulDecade;
};
//--- Normalization info for a date.
//    NOTE(review): ulDayIndex / ulMonthIndex presumably index g_days and
//    g_months - confirm against ExpandDate.
struct TTSDateItemInfo : TTSItemInfo
{
    ULONG               ulDayIndex;
    ULONG               ulMonthIndex;
    TTSIntegerItemInfo* pDay;
    TTSYearItemInfo*    pYear;
};
//--- Time abbreviation values (implicit: AM = 0, PM = 1, UNDEFINED = 2)
typedef enum
{
    AM,
    PM,
    UNDEFINED
} TIMEABBREVIATION;
//--- Normalization info for a time of day, kept as three flags.
struct TTSTimeOfDayItemInfo : TTSItemInfo
{
    BOOL    fTimeAbbreviation;
    BOOL    fTwentyFourHour;
    BOOL    fMinutes;
};
//--- Normalization info for a time: hours and minutes are number infos,
//    seconds is a pointer into the text.
struct TTSTimeItemInfo : TTSItemInfo
{
    TTSNumberItemInfo*  pHours;
    TTSNumberItemInfo*  pMinutes;
    const WCHAR*        pSeconds;
};
//--- Normalization info for a hyphenated string: item info and text pointer
//    for each of its two chunks.
struct TTSHyphenatedStringInfo : TTSItemInfo
{
    TTSItemInfo*    pFirstChunkInfo;
    TTSItemInfo*    pSecondChunkInfo;
    const WCHAR*    pFirstChunk;
    const WCHAR*    pSecondChunk;
};
//--- Normalization info for a suffix: pointer to its first character and
//    the number of characters.
struct TTSSuffixItemInfo : TTSItemInfo
{
    const WCHAR*    pFirstChar;
    ULONG           ulNumChars;
};
//--- Normalization info for a number range: the infos of its two numbers.
struct TTSNumberRangeItemInfo : TTSItemInfo
{
    TTSItemInfo*    pFirstNumberInfo;
    TTSItemInfo*    pSecondNumberInfo;
};
//--- Normalization info for a time range: the infos of its two times of day.
struct TTSTimeRangeItemInfo : TTSItemInfo
{
    TTSTimeOfDayItemInfo*   pFirstTimeInfo;
    TTSTimeOfDayItemInfo*   pSecondTimeInfo;
};
//--- One abbreviation table entry: the orthography, up to three
//    pronunciation / part-of-speech pairs, and two disambiguation values.
//    NOTE(review): iSentBreakDisambig / iPronDisambig presumably index the
//    disambiguation function tables declared later in this header - confirm.
struct AbbrevRecord
{
    const WCHAR*    pOrth;
    WCHAR*          pPron1;
    ENGPARTOFSPEECH POS1;
    WCHAR*          pPron2;
    ENGPARTOFSPEECH POS2;
    WCHAR*          pPron3;
    ENGPARTOFSPEECH POS3;
    int             iSentBreakDisambig;
    int             iPronDisambig;
};
//--- Normalization info for an abbreviation; points at its table record.
struct TTSAbbreviationInfo : TTSItemInfo
{
    const AbbrevRecord* pAbbreviation;
};
//--- Structs used for Lex Lookup
//--- Pronunciation selectors. This used to be written "typedef enum {...};"
//    with no typedef name, which declares nothing extra and only draws a
//    compiler warning, so the stray "typedef" is dropped.
//    NOTE(review): presumably these index PRONRECORD::pronArray - confirm.
enum
{
    PRON_A = 0,
    PRON_B = 1
};
//--- One pronunciation: a phone string with its length, plus a list of
//    part-of-speech codes with its count.
struct PRONUNIT
{
    ULONG           phon_Len;
    WCHAR           phon_Str[SP_MAX_PRON_LENGTH];   // Allo string
    ULONG           POScount;
    ENGPARTOFSPEECH POScode[POS_MAX];
};
//--- Lexicon lookup record for one word; holds two PRONUNIT entries.
struct PRONRECORD
{
    WCHAR           orthStr[SP_MAX_WORD_LENGTH];    // Orth text
    WCHAR           lemmaStr[SP_MAX_WORD_LENGTH];   // Root word
    ULONG           pronType;                       // Pronunciation is lex or LTS
    PRONUNIT        pronArray[2];
    ENGPARTOFSPEECH POSchoice;
    ENGPARTOFSPEECH XMLPartOfSpeech;
    bool            hasAlt;
    ULONG           altChoice;
    BOOL            fUsePron;
    WCHAR           CustomLtsToken[SP_MAX_WORD_LENGTH];
};
//--- Miscellaneous structs and typedefs
//--- Records a sentence start: the character pointer together with the text
//    fragment pointer. This is the element type held by CSentenceStack.
struct SentencePointer
{
    const WCHAR*        pSentenceStart;
    const SPVTEXTFRAG*  pSentenceFrag;
};
//=== Function Definitions ===========================================
// Misc Number Normalization functions and helpers
//--- MatchCurrencySign is only declared here; its definition is in another
//    file. pStartChar, pEndChar and ePosition are all taken by non-const
//    reference, so the callee is able to update them.
//    NOTE(review): the int result is presumably an index into
//    g_CurrencySigns - confirm against the definition.
int MatchCurrencySign( const WCHAR*& pStartChar, const WCHAR*& pEndChar, NORM_POSITION& ePosition );
//=== Classes
/*** CSentenceStack *************************************************
*   This class is used to maintain a stack of sentences for the Skip
*   call to utilize.
*
*   m_StackPtr is the index of the top entry in m_Stack, or -1 while the
*   stack is empty.
*/
class CSentenceStack
{
  public:
    /*--- Methods ---*/
    CSentenceStack() { m_StackPtr = -1; }

    //--- Number of entries currently on the stack
    int GetCount( void ) { return m_StackPtr + 1; }

    //--- Removes the top entry and returns a reference to it. The stack
    //    must not be empty; that is only checked by the debug assertion.
    virtual SentencePointer& Pop( void )
    {
        SPDBG_ASSERT( m_StackPtr > -1 );
        return m_Stack[m_StackPtr--];
    }

    //--- Stores val above the current top and returns the HRESULT of the
    //    array store. The top index is advanced only after the store has
    //    succeeded, so a failed push leaves the stack exactly as it was
    //    (previously the index was bumped first and stayed bumped on
    //    failure, making GetCount/Pop refer to a slot that was never set).
    virtual HRESULT Push( const SentencePointer& val )
    {
        const HRESULT hr = m_Stack.SetAtGrow( m_StackPtr + 1, val );
        if ( SUCCEEDED( hr ) )
        {
            ++m_StackPtr;
        }
        return hr;
    }

    //--- Marks the stack as empty; m_Stack itself is not touched
    virtual void Reset( void ) { m_StackPtr = -1; }

  protected:
    /*--- Member data ---*/
    CSPArray<SentencePointer,SentencePointer> m_Stack;
    int m_StackPtr;
};
/*** CSentItem
*   This object is a helper class: a TTSSentItem that is either zero filled
*   or byte-copied from another TTSSentItem when it is constructed.
*/
class CSentItem : public TTSSentItem
{
  public:
    //--- Every byte of the new object starts out as zero
    CSentItem()
    {
        memset( this, 0, sizeof( CSentItem ) );
    }

    //--- Byte-wise copy of an existing item
    CSentItem( TTSSentItem& Other )
    {
        memcpy( this, &Other, sizeof( TTSSentItem ) );
    }
};
/*** CSentItemEnum
*   This object is designed to be used by a single thread.
*
*   Implements IEnumSENTITEM over an item list that it owns, together with
*   the memory manager object handed out by _GetMemoryManager().
*/
class ATL_NO_VTABLE CSentItemEnum :
    public CComObjectRootEx<CComMultiThreadModel>,
    public IEnumSENTITEM
{
  /*=== ATL Setup ===*/
  public:
    DECLARE_PROTECT_FINAL_CONSTRUCT()

    BEGIN_COM_MAP(CSentItemEnum)
        COM_INTERFACE_ENTRY(IEnumSENTITEM)
    END_COM_MAP()

  /*=== Methods =======*/
  public:
    /*--- Constructors/Destructors ---*/

    /*--- Non interface methods ---*/
    //--- Stores pOwner in a CComPtr member
    void _SetOwner( IUnknown* pOwner ) { m_cpOwner = pOwner; }
    //--- Direct access to the item list
    CItemList& _GetList( void ) { return m_ItemList; }
    //--- Direct access to the memory manager
    CSentItemMemory& _GetMemoryManager( void ) { return m_MemoryManager; }

  /*=== Interfaces ====*/
  public:
    //--- IEnumSENTITEM ------------------------------------------
    STDMETHOD(Next)( TTSSentItem *pItemEnum );
    STDMETHOD(Reset)( void );

  /*=== Member data ===*/
  private:
    CComPtr<IUnknown>   m_cpOwner;
    CItemList           m_ItemList;
    SPLISTPOS           m_ListPos;
    CSentItemMemory     m_MemoryManager;
};
/*** CStdSentEnum COM object
*   ATL COM class whose COM map exposes a single interface, IEnumSpSentence
*   (SetFragList / Next / Previous / Reset).
*
*   Besides the interface methods the class declares the sentence breaking,
*   lexicon / pronunciation and text normalization helpers, and the
*   abbreviation and word disambiguation callbacks whose signatures match the
*   member function pointer typedefs declared after this class.
*
*   State kept per object: the aggregate lexicon and phoneme converter, a
*   morph lexicon pointer, the speak flags, pointers into the current text
*   fragment list, and a CSentenceStack of sentence starts.
*   m_AbbrevTableCritSec is static and therefore shared by all instances.
*/ class ATL_NO_VTABLE CStdSentEnum : public CComObjectRootEx<CComMultiThreadModel>, public IEnumSpSentence { /*=== ATL Setup ===*/ public: DECLARE_GET_CONTROLLING_UNKNOWN() DECLARE_PROTECT_FINAL_CONSTRUCT()
BEGIN_COM_MAP(CStdSentEnum) COM_INTERFACE_ENTRY(IEnumSpSentence) END_COM_MAP()
/*=== Methods =======*/ public: /*--- Constructors/Destructors ---*/ HRESULT FinalConstruct(); void FinalRelease();
/*--- Non interface methods ---*/ HRESULT InitAggregateLexicon( void ); HRESULT AddLexiconToAggregate( ISpLexicon *pAddLexicon, DWORD dwFlags ); HRESULT InitMorphLexicon( void ); void fNamesLTS( bool );
//--- Abbreviation Sentence Breaking Disambiguation Functions
//    (all three share the SentBreakDisambigFunc signature)
HRESULT IsAbbreviationEOS( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos, CSentItemMemory& MemoryManager, BOOL* pfIsEOS ); HRESULT IfEOSNotAbbreviation( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos, CSentItemMemory& MemoryManager, BOOL* pfIsEOS ); HRESULT IfEOSAndLowercaseNotAbbreviation( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos, CSentItemMemory& MemoryManager, BOOL* pfIsEOS );
//--- Abbreviation Pronunciation Disambiguation Functions
//    (all share the PronDisambigFunc signature)
HRESULT SingleOrPluralAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT DoctorDriveAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT AbbreviationFollowedByDigit( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT AllCapsAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT CapitalizedAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT SECAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT DegreeAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT AbbreviationModifier( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT ADisambig( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT PolishDisambig( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos );
//--- Word Pronunciation Disambiguation Functions
//    (all three share the PostLexLookupDisambigFunc signature)
HRESULT MeasurementDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager ); HRESULT TheDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager ); HRESULT ReadDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager );
private: //--- Pronunciation Table init helper
HRESULT InitPron( WCHAR** OriginalPron );
//--- Sentence breaking helpers ---//
//    Note that SkipWhiteSpaceAndTags takes its character and fragment
//    pointers by reference, and that its last two parameters are optional.
HRESULT GetNextSentence( IEnumSENTITEM** pItemEnum ); HRESULT AddNextSentItem( CItemList& ItemList, CSentItemMemory& MemoryManager, BOOL* pfIsEOS ); HRESULT SkipWhiteSpaceAndTags( const WCHAR*& pStartChar, const WCHAR*& pEndChar, const SPVTEXTFRAG*& pCurrFrag, CSentItemMemory& pMemoryManager, BOOL fAddToItemList = false, CItemList* pItemList = NULL ); const WCHAR* FindTokenEnd( const WCHAR* pStartChar, const WCHAR* pEndChar );
//--- Lexicon and POS helpers ---//
HRESULT DetermineProns( CItemList& ItemList, CSentItemMemory& MemoryManager ); HRESULT Pronounce( PRONRECORD *pPron );
//--- Normalization helpers ---//
//    Naming pattern used below: an IsXxx function takes the item info by
//    reference (TTSItemInfo*&) and a matching ExpandXxx function takes the
//    typed info plus the word list.
//    NOTE(review): presumably IsXxx recognizes a category and fills in the
//    info, and ExpandXxx appends the spoken words - confirm in the .cpp files.
HRESULT Normalize( CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager ); HRESULT MatchCategory( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT ExpandCategory( TTSItemInfo*& pItemNormInfo, CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager ); HRESULT DoUnicodeToAsciiMap( const WCHAR *pUnicodeString, ULONG ulUnicodeStringLength, WCHAR *ppConvertedString ); HRESULT IsAlphaWord( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT IsInitialism( CItemList& ItemList, SPLISTPOS ItemPos, CSentItemMemory& MemoryManager, BOOL* pfIsEOS ); //--- Various Number Related Normalization helpers ---//
HRESULT IsNumberCategory( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT IsNumber( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager, BOOL fMultiItem = true ); HRESULT IsInteger( const WCHAR* pStartChar, TTSIntegerItemInfo*& pIntegerInfo, CSentItemMemory& MemoryManager ); HRESULT IsDigitString( const WCHAR* pStartChar, TTSDigitsItemInfo*& pDigitsInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandNumber( TTSNumberItemInfo* pItemInfo, CWordList& WordList ); HRESULT ExpandPercent( TTSNumberItemInfo* pItemInfo, CWordList& WordList ); HRESULT ExpandDegrees( TTSNumberItemInfo* pItemInfo, CWordList& WordList ); HRESULT ExpandSquare( TTSNumberItemInfo* pItemInfo, CWordList& WordList ); HRESULT ExpandCube( TTSNumberItemInfo* pItemInfo, CWordList& WordList ); void ExpandInteger( TTSIntegerItemInfo* pItemInfo, const WCHAR* Context, CWordList &WordList ); void ExpandDigit( const WCHAR Number, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandTwoDigits( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandThreeDigits( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandDigitOrdinal( const WCHAR Number, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandTwoOrdinal( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandThreeOrdinal( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandDigits( TTSDigitsItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsFraction( const WCHAR* pStartChar, TTSFractionItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandFraction( TTSFractionItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsRomanNumeral( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT IsPhoneNumber( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& 
MemoryManager, CWordList& WordList ); HRESULT IsZipCode( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT ExpandZipCode( TTSZipCodeItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsCurrency( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT IsNumberRange( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandNumberRange( TTSNumberRangeItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsCurrencyRange( TTSItemInfo*& pItemInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
//--- Date Related Normalization helpers ---//
HRESULT IsNumericCompactDate( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT IsMonthStringCompactDate( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT IsLongFormDate_DMDY( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT IsLongFormDate_DDMY( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT ExpandDate( TTSDateItemInfo* pItemInfo, CWordList& WordList ); HRESULT ExpandYear( TTSYearItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsDecade( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandDecade( TTSDecadeItemInfo* pItemInfo, CWordList& WordList ); ULONG MatchMonthString( WCHAR*& pMonth, ULONG ulLength ); ULONG MatchDayString( WCHAR*& pDayString, WCHAR* pEndChar ); bool MatchDateDelimiter( WCHAR **DateString );
//--- Time Related Normalization helpers ---//
HRESULT IsTimeOfDay( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList, BOOL fMultiItem = true ); HRESULT IsTime( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT ExpandTime( TTSTimeItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsTimeRange( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList );
//--- SPELL tag normalization helper
HRESULT SpellOutString( CWordList& WordList ); void ExpandPunctuation( CWordList& WordList, WCHAR wc );
//--- Default normalization helper
HRESULT ExpandUnrecognizedString( CWordList& WordList, CSentItemMemory& MemoryManager );
//--- Misc. normalization helpers
HRESULT IsStateAndZipcode( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT IsHyphenatedString( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandHyphenatedString( TTSHyphenatedStringInfo* pItemInfo, CWordList& WordList ); HRESULT IsSuffix( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandSuffix( TTSSuffixItemInfo* pItemInfo, CWordList& WordList ); bool Zeroes( const WCHAR* ); bool ThreeZeroes( const WCHAR* ); bool IsPunctuation(const TTSSentItem *Item);
/*=== Interfaces ====*/ public: //--- IEnumSpSentence ----------------------------------------
STDMETHOD(SetFragList)( const SPVTEXTFRAG* pTextFragList, DWORD dwFlags ); STDMETHOD(Next)( IEnumSENTITEM **ppSentItemEnum ); STDMETHOD(Previous)( IEnumSENTITEM **ppSentItemEnum ); STDMETHOD(Reset)( void );
//=== Data members ===
//    NOTE(review): m_pMorphLexicon is a raw pointer, unlike the two CComPtr
//    members - presumably created in InitMorphLexicon and released in
//    FinalRelease; confirm ownership in the implementation.
private: CComPtr<ISpContainerLexicon> m_cpAggregateLexicon; CComPtr<ISpPhoneConverter> m_cpPhonemeConverter; CSMorph* m_pMorphLexicon; DWORD m_dwSpeakFlags; const SPVTEXTFRAG* m_pTextFragList; const SPVTEXTFRAG* m_pCurrFrag; const WCHAR* m_pNextChar; const WCHAR* m_pEndChar; const WCHAR* m_pEndOfCurrToken; const WCHAR* m_pEndOfCurrItem; CSentenceStack m_SentenceStack; SEPARATOR_AND_DECIMAL m_eSeparatorAndDecimal; SHORT_DATE_ORDER m_eShortDateOrder; static CComAutoCriticalSection m_AbbrevTableCritSec; bool m_fNameItem; bool m_fHaveNamesLTS; };
//--- Structs and typedefs used for abbreviation stuff
typedef HRESULT (CStdSentEnum::* SentBreakDisambigFunc)(const AbbrevRecord*, CItemList& , SPLISTPOS, CSentItemMemory&, BOOL*); typedef HRESULT (CStdSentEnum::* PronDisambigFunc) ( const AbbrevRecord*, PRONRECORD*, CItemList&, SPLISTPOS ); typedef HRESULT (CStdSentEnum::* PostLexLookupDisambigFunc) ( const AbbrevRecord*, CItemList&, SPLISTPOS, CSentItemMemory& ); extern AbbrevRecord g_AbbreviationTable[177]; extern const PronDisambigFunc g_PronDisambigTable[]; extern const SentBreakDisambigFunc g_SentBreakDisambigTable[]; extern AbbrevRecord g_AmbiguousWordTable[72]; extern const PronDisambigFunc g_AmbiguousWordDisambigTable[]; extern AbbrevRecord g_PostLexLookupWordTable[41]; extern const PostLexLookupDisambigFunc g_PostLexLookupDisambigTable[]; extern WCHAR *g_pOfA; extern WCHAR *g_pOfAn; extern BOOL g_fAbbrevTablesInitialized; extern void CleanupAbbrevTables( void );
//--- First words table - used in sentence breaking
//    (declaration only; the 163-entry table is defined in another file)
extern const SPLSTR g_FirstWords[163];
//
//=== Inlines
//
//--- Base-10 wcstoul wrapper that only converts when the very first character
//    is a digit (wcstoul on its own would skip leading white space and accept
//    a sign). Otherwise returns 0 and, if ppEndChar was supplied, sets
//    *ppEndChar to pStartChar.
inline ULONG my_wcstoul( const WCHAR *pStartChar, WCHAR **ppEndChar )
{
    if ( !iswdigit( *pStartChar ) )
    {
        //--- Nothing was consumed: report the start as the end position
        if ( ppEndChar )
        {
            *ppEndChar = const_cast<WCHAR*>( pStartChar );
        }
        return 0;
    }

    return wcstoul( pStartChar, ppEndChar, 10 );
}
//--- Maps a punctuation item type onto its part of speech value. Any item
//    type not listed below maps to MS_Unknown.
inline ENGPARTOFSPEECH ConvertItemTypeToPartOfSp( TTSItemType ItemType )
{
    ENGPARTOFSPEECH ePartOfSpeech = MS_Unknown;

    switch ( ItemType )
    {
    case eOPEN_PARENTHESIS:
    case eOPEN_BRACKET:
    case eOPEN_BRACE:
        ePartOfSpeech = MS_GroupBegin;
        break;

    case eCLOSE_PARENTHESIS:
    case eCLOSE_BRACKET:
    case eCLOSE_BRACE:
        ePartOfSpeech = MS_GroupEnd;
        break;

    case eSINGLE_QUOTE:
    case eDOUBLE_QUOTE:
        ePartOfSpeech = MS_Quotation;
        break;

    case ePERIOD:
    case eQUESTION:
    case eEXCLAMATION:
        ePartOfSpeech = MS_EOSItem;
        break;

    case eCOMMA:
    case eCOLON:
    case eSEMICOLON:
    case eHYPHEN:
    case eELLIPSIS:
        ePartOfSpeech = MS_MiscPunc;
        break;

    default:
        break;
    }

    return ePartOfSpeech;
}
//--- True when wc is a space, a hyphen or a period
inline bool MatchPhoneNumberDelimiter( const WCHAR wc )
{
    switch ( wc )
    {
    case L' ':
    case L'-':
    case L'.':
        return true;

    default:
        return false;
    }
}
inline bool NeedsToBeNormalized( const AbbrevRecord* pAbbreviation ) { if( !wcscmp( pAbbreviation->pOrth, L"jan" ) || !wcscmp( pAbbreviation->pOrth, L"feb" ) || !wcscmp( pAbbreviation->pOrth, L"mar" ) || !wcscmp( pAbbreviation->pOrth, L"apr" ) || !wcscmp( pAbbreviation->pOrth, L"jun" ) || !wcscmp( pAbbreviation->pOrth, L"jul" ) || !wcscmp( pAbbreviation->pOrth, L"aug" ) || !wcscmp( pAbbreviation->pOrth, L"sep" ) || !wcscmp( pAbbreviation->pOrth, L"sept" ) || !wcscmp( pAbbreviation->pOrth, L"oct" ) || !wcscmp( pAbbreviation->pOrth, L"nov" ) || !wcscmp( pAbbreviation->pOrth, L"dec" ) || !wcscmp( pAbbreviation->pOrth, L"mon" ) || !wcscmp( pAbbreviation->pOrth, L"tue" ) || !wcscmp( pAbbreviation->pOrth, L"tues" ) || !wcscmp( pAbbreviation->pOrth, L"wed" ) || !wcscmp( pAbbreviation->pOrth, L"thu" ) || !wcscmp( pAbbreviation->pOrth, L"thur" ) || !wcscmp( pAbbreviation->pOrth, L"thurs" ) || !wcscmp( pAbbreviation->pOrth, L"fri" ) || !wcscmp( pAbbreviation->pOrth, L"sat" ) || !wcscmp( pAbbreviation->pOrth, L"sun" ) ) { return true; } else { return false; } }
//--- Copies the contents of WordList into an array obtained from
//    MemoryManager and attaches it to Item (Item.Words / Item.ulNumWords).
//    Returns the HRESULT reported by GetMemory; the words are copied only
//    when that call succeeded.
inline HRESULT SetWordList( CSentItem& Item, CWordList& WordList, CSentItemMemory& MemoryManager )
{
    HRESULT hr = S_OK;
    SPLISTPOS Pos = WordList.GetHeadPosition();

    Item.ulNumWords = WordList.GetCount();
    Item.Words = (TTSWord*) MemoryManager.GetMemory( Item.ulNumWords * sizeof(TTSWord), &hr );

    if ( SUCCEEDED( hr ) )
    {
        //--- Walk the list front to back, filling the array in order
        for ( ULONG ulNext = 0; Pos; ++ulNext )
        {
            SPDBG_ASSERT( ulNext < Item.ulNumWords );
            Item.Words[ulNext] = WordList.GetNext( Pos );
        }
    }

    return hr;
}
inline int CompareStringAndSPLSTR( const void* _String, const void* _SPLSTR ) { int _StringLen = wcslen( (const WCHAR*) _String ); int _SPLSTRLen = ( (const SPLSTR*) _SPLSTR )->Len; if ( _StringLen < _SPLSTRLen ) { int Result = wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _StringLen ); if ( Result != 0 ) { return Result; } else { return -1; } } else if ( _StringLen > _SPLSTRLen ) { int Result = wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _SPLSTRLen ); if ( Result != 0 ) { return Result; } else { return 1; } } else { return ( wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _StringLen ) ); } }
inline int CompareStringAndStateStruct( const void* _String, const void* _StateStruct ) { int _StringLen = wcslen( (const WCHAR*) _String ); int _StateStructLen = ( (const StateStruct*) _StateStruct )->Abbreviation.Len; if ( _StringLen < _StateStructLen ) { int Result = wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr, _StringLen ); if ( Result != 0 ) { return Result; } else { return -1; } } else if ( _StringLen > _StateStructLen ) { int Result = wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr, _StateStructLen ); if ( Result != 0 ) { return Result; } else { return 1; } } else { return ( wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr, _StringLen ) ); } }
inline int CompareStringAndAbbrevRecord( const void* _String, const void* _AbbrevRecord ) { return ( _wcsicmp( (const WCHAR*) _String, ( (const AbbrevRecord*) _AbbrevRecord )->pOrth ) ); }
inline int CompareWCHARAndWCHAR( const void *pWCHAR_1, const void *pWCHAR_2 ) { return ( *( (WCHAR*) pWCHAR_1) - *( (WCHAR*) pWCHAR_2) ); }
//--- Non-zero when wc is one of the five code points treated as white space
inline BOOL IsSpace( WCHAR wc )
{
    switch ( wc )
    {
    case 0x20:      // space
    case 0x9:       // horizontal tab
    case 0xD:       // carriage return
    case 0xA:       // line feed
    case 0x200B:    // zero width space
        return true;

    default:
        return false;
    }
}
//--- Non-zero when wc lies in the range L'A' through L'Z'; no other
//    upper-case letters are recognized.
inline BOOL IsCapital( WCHAR wc )
{
    const bool fOutsideRange = ( wc < L'A' ) || ( wc > L'Z' );
    return !fOutsideRange;
}
//--- Returns the item type for an opening parenthesis, bracket or brace, and
//    eUNMATCHED for any other character.
inline TTSItemType IsGroupBeginning( WCHAR wc )
{
    switch ( wc )
    {
    case L'(':  return eOPEN_PARENTHESIS;
    case L'[':  return eOPEN_BRACKET;
    case L'{':  return eOPEN_BRACE;
    default:    return eUNMATCHED;
    }
}
//--- Returns the item type for a closing parenthesis, bracket or brace, and
//    eUNMATCHED for any other character.
inline TTSItemType IsGroupEnding( WCHAR wc )
{
    switch ( wc )
    {
    case L')':  return eCLOSE_PARENTHESIS;
    case L']':  return eCLOSE_BRACKET;
    case L'}':  return eCLOSE_BRACE;
    default:    return eUNMATCHED;
    }
}
//--- Returns the item type for a single or double quote character, and
//    eUNMATCHED for any other character.
inline TTSItemType IsQuotationMark( WCHAR wc )
{
    switch ( wc )
    {
    case L'\'': return eSINGLE_QUOTE;
    case L'\"': return eDOUBLE_QUOTE;
    default:    return eUNMATCHED;
    }
}
//--- Returns the item type for a period, exclamation mark or question mark,
//    and eUNMATCHED for any other character.
inline TTSItemType IsEOSItem( WCHAR wc )
{
    switch ( wc )
    {
    case L'.':  return ePERIOD;
    case L'!':  return eEXCLAMATION;
    case L'?':  return eQUESTION;
    default:    return eUNMATCHED;
    }
}
//--- Returns the item type for a comma, semicolon, colon or hyphen, and
//    eUNMATCHED for any other character.
inline TTSItemType IsMiscPunctuation( WCHAR wc )
{
    switch ( wc )
    {
    case L',':  return eCOMMA;
    case L';':  return eSEMICOLON;
    case L':':  return eCOLON;
    case L'-':  return eHYPHEN;
    default:    return eUNMATCHED;
    }
}
#endif //--- This must be the last line in the file
|