mirror of https://github.com/tongzx/nt5src
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1996 lines
92 KiB
1996 lines
92 KiB
/*******************************************************************************
|
|
* Disambig.cpp *
|
|
*--------------*
|
|
* Description:
|
|
* This module contains the methods to disambiguate part of speech and
|
|
* select the correct pronunciation from the lexicon.
|
|
*-------------------------------------------------------------------------------
|
|
* Created By: EDC Date: 07/15/99
|
|
* Copyright (C) 1999 Microsoft Corporation
|
|
* All Rights Reserved
|
|
*
|
|
*******************************************************************************/
|
|
|
|
//--- Additional includes
|
|
#include "stdafx.h"
|
|
#include "commonlx.h"
|
|
#ifndef StdSentEnum_h
|
|
#include "stdsentenum.h"
|
|
#endif
|
|
#include "spttsengdebug.h"
|
|
|
|
/*****************************************************************************
|
|
* TryPOSConversion *
|
|
*------------------*
|
|
*
|
|
* Description:
|
|
* Checks to see whether the argument PRONRECORD contains the argument
|
|
* ENGPARTOFSPEECH as an option. If so, sets the PRONRECORD alternate
|
|
* choice and part of speech choice, and returns true. If not, just returns
|
|
* false without modifying the PRONRECORD at all.
|
|
*
|
|
***************************************************************** AH *********/
|
|
bool TryPOSConversion( PRONRECORD& pPron, ENGPARTOFSPEECH PartOfSpeech )
|
|
{
|
|
|
|
//--- Check first pronunciation
|
|
for ( ULONG i = 0; i < pPron.pronArray[0].POScount; i++ )
|
|
{
|
|
if ( pPron.pronArray[0].POScode[i] == PartOfSpeech )
|
|
{
|
|
pPron.altChoice = 0;
|
|
pPron.POSchoice = PartOfSpeech;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
//--- Check second pronunciation
|
|
if ( pPron.hasAlt )
|
|
{
|
|
for ( ULONG i = 0; i < pPron.pronArray[1].POScount; i++ )
|
|
{
|
|
if ( pPron.pronArray[1].POScode[i] == PartOfSpeech )
|
|
{
|
|
pPron.altChoice = 1;
|
|
pPron.POSchoice = PartOfSpeech;
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return false;
|
|
} /* TryPOS Conversion */
|
|
|
|
/*****************************************************************************
|
|
* DisambiguatePOS *
|
|
*-----------------*
|
|
*
|
|
* Description:
|
|
* Disambiguate parts of speech by applying patches in order... This
|
|
* work is an implementation of Eric Brill's rule-based part of speech
|
|
* tagger - see, for example:
|
|
*
|
|
* Brill, Eric. 1992. A simple rule-based part of speech tagger.
|
|
* In Proceedings of the Third Conference on Applied Natural
|
|
* Language Processing, ACL. Trento, Italy.
|
|
*
|
|
***************************************************************** AH *********/
|
|
void DisambiguatePOS( PRONRECORD *pProns, ULONG cNumOfWords )
|
|
{
|
|
SPDBG_FUNC( "DisambiguatePOS" );
|
|
|
|
//--- Iterate over the patches, applying each (where applicable) to the
|
|
//--- entire sentence. For each patch, iterate over each word in the
|
|
//--- sentence to which the patch could apply (from left to right).
|
|
for ( int i = 0; i < sp_countof( g_POSTaggerPatches ); i++ )
|
|
{
|
|
switch ( g_POSTaggerPatches[i].eTemplateType )
|
|
{
|
|
case PREV1T:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the previous POS matches, and
|
|
//--- the conversion POS is a possibility for this word, convert the
|
|
//--- POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case NEXT1T:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the next POS matches, and
|
|
//--- the conversion POS is a possibility for this word, convert the
|
|
//--- POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j + 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREV2T:
|
|
{
|
|
if ( cNumOfWords > 2 )
|
|
{
|
|
for ( ULONG j = 2; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the POS two previous matches, and
|
|
//--- the conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 2].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case NEXT2T:
|
|
{
|
|
if ( cNumOfWords > 2 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 2; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the POS two after matches, and
|
|
//--- the conversion POS is a possibility for this word, convert the
|
|
//--- POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j + 2].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREV1OR2T:
|
|
{
|
|
if ( cNumOfWords > 2 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the previous POS matches OR the
|
|
//--- POS two previous matches, and the conversion POS is a possibility
|
|
//--- for this word, convert the POS.
|
|
if ( ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 ) ||
|
|
( j > 1 &&
|
|
pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 2].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 ) )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case NEXT1OR2T:
|
|
{
|
|
if ( cNumOfWords > 2 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the next POS matches OR the POS
|
|
//--- two after matches, and the conversion POS is a possibility for this
|
|
//--- word, convert the POS.
|
|
if ( ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j + 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 ) ||
|
|
( j < cNumOfWords - 2 &&
|
|
pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j + 2].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 ) )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREV1OR2OR3T:
|
|
{
|
|
if ( cNumOfWords > 3 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the previous POS matches OR the
|
|
//--- POS two previous matches OR the POS three previous matches, and
|
|
//--- the conversion POS is a possibility for this word, convert the POS.
|
|
if ( ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 ) ||
|
|
( j > 1 &&
|
|
pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 2].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 ) ||
|
|
( j > 2 &&
|
|
pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 3].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 ) )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case NEXT1OR2OR3T:
|
|
{
|
|
if ( cNumOfWords > 3 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the next POS matches OR the POS
|
|
//--- two after matches OR the POS three after matches, and the conversion
|
|
//--- POS is a possibility for this word, convert the POS.
|
|
if ( ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j + 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 ) ||
|
|
( j < cNumOfWords - 2 &&
|
|
pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j + 2].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 ) ||
|
|
( j < cNumOfWords - 3 &&
|
|
pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j + 3].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 ) )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREV1TNEXT1T:
|
|
{
|
|
if ( cNumOfWords > 2 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the next POS matches, and the
|
|
//--- previous POS matches, and the conversion POS is a possibility
|
|
//--- for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 &&
|
|
pProns[j + 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS2 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREV1TNEXT2T:
|
|
{
|
|
if ( cNumOfWords > 3 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords - 2; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the POS two after matches, and the
|
|
//--- previous POS matches, and the conversion POS is a possibility
|
|
//--- for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 &&
|
|
pProns[j + 2].POSchoice == g_POSTaggerPatches[i].eTemplatePOS2 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREV2TNEXT1T:
|
|
{
|
|
if ( cNumOfWords > 3 )
|
|
{
|
|
for ( ULONG j = 2; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the next POS matches, and the
|
|
//--- POS two previous matches, and the conversion POS is a possibility
|
|
//--- for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 2].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 &&
|
|
pProns[j + 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS2 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case CAP:
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the word is capitalized, and the
|
|
//--- conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
iswupper( pProns[j].orthStr[0] ) )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case NOTCAP:
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the word is not capitalized, and the
|
|
//--- conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
!iswupper( pProns[j].orthStr[0] ) )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREVCAP:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the previous word is capitalized,
|
|
//--- and the conversion POS is a possibility for this word, convert the
|
|
//--- POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
iswupper( pProns[j - 1].orthStr[0] ) )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREVNOTCAP:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the word is capitalized, and the
|
|
//--- conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
!iswupper( pProns[j - 1].orthStr[0] ) )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREV1W:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the previous word matches, and the
|
|
//--- conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j - 1].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case NEXT1W:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the next word matches, and the
|
|
//--- conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j + 1].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREV2W:
|
|
{
|
|
if ( cNumOfWords > 2 )
|
|
{
|
|
for ( ULONG j = 2; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the word two previous matches, and the
|
|
//--- conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j - 2].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case NEXT2W:
|
|
{
|
|
if ( cNumOfWords > 2 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 2; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the word two after matches, and the
|
|
//--- conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j + 2].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREV1OR2W:
|
|
{
|
|
if ( cNumOfWords > 2 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the previous word OR the word two
|
|
//--- previous matches, and the conversion POS is a possibility for this word,
|
|
//--- convert the POS.
|
|
if ( ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j - 1].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 ) ||
|
|
( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j - 2].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 ) )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case NEXT1OR2W:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the next word matches OR the word two after
|
|
//--- matches, and the conversion POS is a possibility for this word, convert the
|
|
//--- POS.
|
|
if ( ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j + 1].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 ) ||
|
|
( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j + 2].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 ) )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case CURRWPREV1W:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the current word matches, and the previous
|
|
//--- word matches, and the conversion POS is a possibility for this word, convert
|
|
//--- the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 &&
|
|
_wcsicmp( pProns[j - 1].orthStr, g_POSTaggerPatches[i].pTemplateWord2 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case CURRWNEXT1W:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the current word matches, and the next
|
|
//--- word matches, and the conversion POS is a possibility for this word, convert
|
|
//--- the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 &&
|
|
_wcsicmp( pProns[j + 1].orthStr, g_POSTaggerPatches[i].pTemplateWord2 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case CURRWPREV1T:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the current word matches, and the previous
|
|
//--- POS matches, and the conversion POS is a possibility for this word, convert
|
|
//--- the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 &&
|
|
pProns[j - 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case CURRWNEXT1T:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the current word matches, and the next
|
|
//--- POS matches, and the conversion POS is a possibility for this word, convert
|
|
//--- the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 &&
|
|
pProns[j + 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case CURRW:
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the current word matches, and the
|
|
//--- conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS ) ;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case PREV1WT:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the previous word and POS match, and
|
|
//--- the conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j - 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 &&
|
|
_wcsicmp( pProns[j - 1].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case NEXT1WT:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the next word and POS match, and
|
|
//--- the conversion POS is a possibility for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
pProns[j + 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 &&
|
|
_wcsicmp( pProns[j + 1].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case CURRWPREV1WT:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 1; j < cNumOfWords; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the current words matches, and the
|
|
//--- previous word and POS match, and the conversion POS is a possibility
|
|
//--- for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 &&
|
|
pProns[j - 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 &&
|
|
_wcsicmp( pProns[j - 1].orthStr, g_POSTaggerPatches[i].pTemplateWord2 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case CURRWNEXT1WT:
|
|
{
|
|
if ( cNumOfWords > 1 )
|
|
{
|
|
for ( ULONG j = 0; j < cNumOfWords - 1; j++ )
|
|
{
|
|
if ( pProns[j].XMLPartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- If the current POS matches, and the current words matches, and the
|
|
//--- next word and POS match, and the conversion POS is a possibility
|
|
//--- for this word, convert the POS.
|
|
if ( pProns[j].POSchoice == g_POSTaggerPatches[i].eCurrentPOS &&
|
|
_wcsicmp( pProns[j].orthStr, g_POSTaggerPatches[i].pTemplateWord1 ) == 0 &&
|
|
pProns[j + 1].POSchoice == g_POSTaggerPatches[i].eTemplatePOS1 &&
|
|
_wcsicmp( pProns[j + 1].orthStr, g_POSTaggerPatches[i].pTemplateWord2 ) == 0 )
|
|
{
|
|
TryPOSConversion( pProns[j], g_POSTaggerPatches[i].eConvertToPOS );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
} /* DisambiguatePOS */
|
|
|
|
/*****************************************************************************
|
|
* Pronounce *
|
|
*-----------*
|
|
* Description:
|
|
* Get lexicon or letter-to-sound (LTS) pronunciations
|
|
*
|
|
********************************************************************** MC ***/
|
|
HRESULT CStdSentEnum::Pronounce( PRONRECORD *pPron )
|
|
{
|
|
SPDBG_FUNC( "Pronounce" );
|
|
SPWORDPRONUNCIATIONLIST SPList;
|
|
HRESULT hr = SPERR_NOT_IN_LEX;
|
|
ULONG cPhonLen;
|
|
DWORD dwFlags = eLEXTYPE_USER | eLEXTYPE_APP | eLEXTYPE_PRIVATE1 | eLEXTYPE_PRIVATE2;
|
|
BOOL fPOSExists = false;
|
|
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
|
|
//--- Special Case - XML Provided Part Of Speech. Search for exact match first...
|
|
if ( pPron->XMLPartOfSpeech != MS_Unknown )
|
|
{
|
|
//--- Try User Lexicon
|
|
hr = m_cpAggregateLexicon->GetPronunciations( pPron->orthStr, 1033, eLEXTYPE_USER, &SPList );
|
|
if ( SUCCEEDED( hr ) &&
|
|
SPList.pFirstWordPronunciation )
|
|
{
|
|
for ( SPWORDPRONUNCIATION *pPronunciation = SPList.pFirstWordPronunciation; pPronunciation;
|
|
pPronunciation = pPronunciation->pNextWordPronunciation )
|
|
{
|
|
if ( pPronunciation->ePartOfSpeech == pPron->XMLPartOfSpeech )
|
|
{
|
|
fPOSExists = true;
|
|
break;
|
|
}
|
|
}
|
|
if ( !fPOSExists )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
}
|
|
}
|
|
//--- Handle empty pronunciation
|
|
else if ( !SPList.pFirstWordPronunciation )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
hr = SPERR_NOT_IN_LEX;
|
|
}
|
|
//--- Try App Lexicon
|
|
if ( !fPOSExists )
|
|
{
|
|
hr = m_cpAggregateLexicon->GetPronunciations( pPron->orthStr, 1033, eLEXTYPE_APP, &SPList );
|
|
if ( SUCCEEDED( hr ) &&
|
|
SPList.pFirstWordPronunciation )
|
|
{
|
|
for ( SPWORDPRONUNCIATION *pPronunciation = SPList.pFirstWordPronunciation; pPronunciation;
|
|
pPronunciation = pPronunciation->pNextWordPronunciation )
|
|
{
|
|
if ( pPronunciation->ePartOfSpeech == pPron->XMLPartOfSpeech )
|
|
{
|
|
fPOSExists = true;
|
|
break;
|
|
}
|
|
}
|
|
if ( !fPOSExists )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
}
|
|
}
|
|
//--- Handle empty pronunciation
|
|
else if ( !SPList.pFirstWordPronunciation )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
hr = SPERR_NOT_IN_LEX;
|
|
}
|
|
}
|
|
//--- Try Vendor Lexicon
|
|
if ( !fPOSExists )
|
|
{
|
|
hr = m_cpAggregateLexicon->GetPronunciations( pPron->orthStr, 1033, eLEXTYPE_PRIVATE1, &SPList );
|
|
if ( SUCCEEDED( hr ) &&
|
|
SPList.pFirstWordPronunciation )
|
|
{
|
|
for ( SPWORDPRONUNCIATION *pPronunciation = SPList.pFirstWordPronunciation; pPronunciation;
|
|
pPronunciation = pPronunciation->pNextWordPronunciation )
|
|
{
|
|
if ( pPronunciation->ePartOfSpeech == pPron->XMLPartOfSpeech )
|
|
{
|
|
fPOSExists = true;
|
|
break;
|
|
}
|
|
}
|
|
if ( !fPOSExists )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
}
|
|
}
|
|
//--- Handle empty pronunciation
|
|
else if ( !SPList.pFirstWordPronunciation )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
hr = SPERR_NOT_IN_LEX;
|
|
}
|
|
}
|
|
//--- Try Morph Lexicon
|
|
if ( !fPOSExists )
|
|
{
|
|
hr = m_pMorphLexicon->DoSuffixMorph( pPron->orthStr, pPron->lemmaStr, 1033, dwFlags, &SPList );
|
|
if ( SUCCEEDED( hr ) &&
|
|
SPList.pFirstWordPronunciation )
|
|
{
|
|
for ( SPWORDPRONUNCIATION *pPronunciation = SPList.pFirstWordPronunciation; pPronunciation;
|
|
pPronunciation = pPronunciation->pNextWordPronunciation )
|
|
{
|
|
if ( pPronunciation->ePartOfSpeech == pPron->XMLPartOfSpeech )
|
|
{
|
|
fPOSExists = true;
|
|
break;
|
|
}
|
|
}
|
|
if ( !fPOSExists )
|
|
{
|
|
//--- Need to do this the last time, to make sure we hit the default code below...
|
|
//--- RAID 5078
|
|
hr = SPERR_NOT_IN_LEX;
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
}
|
|
}
|
|
//--- Handle empty pronunciation
|
|
else if ( !SPList.pFirstWordPronunciation )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
hr = SPERR_NOT_IN_LEX;
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- Default case - just look up orthography and go with first match.
|
|
if ( hr == SPERR_NOT_IN_LEX )
|
|
{
|
|
hr = m_cpAggregateLexicon->GetPronunciations( pPron->orthStr, 1033, eLEXTYPE_USER, &SPList );
|
|
|
|
//--- Handle empty pronunciation
|
|
if ( SUCCEEDED( hr ) &&
|
|
!SPList.pFirstWordPronunciation )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
hr = SPERR_NOT_IN_LEX;
|
|
}
|
|
}
|
|
if ( hr == SPERR_NOT_IN_LEX )
|
|
{
|
|
hr = m_cpAggregateLexicon->GetPronunciations( pPron->orthStr, 1033, eLEXTYPE_APP, &SPList );
|
|
|
|
//--- Handle empty pronunciation
|
|
if ( SUCCEEDED( hr ) &&
|
|
!SPList.pFirstWordPronunciation )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
hr = SPERR_NOT_IN_LEX;
|
|
}
|
|
}
|
|
if ( hr == SPERR_NOT_IN_LEX )
|
|
{
|
|
hr = m_cpAggregateLexicon->GetPronunciations( pPron->orthStr, 1033, eLEXTYPE_PRIVATE1, &SPList );
|
|
|
|
//--- Handle empty pronunciation
|
|
if ( SUCCEEDED( hr ) &&
|
|
!SPList.pFirstWordPronunciation )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
hr = SPERR_NOT_IN_LEX;
|
|
}
|
|
}
|
|
if ( hr == SPERR_NOT_IN_LEX )
|
|
{
|
|
hr = m_pMorphLexicon->DoSuffixMorph( pPron->orthStr, pPron->lemmaStr, 1033,
|
|
dwFlags, &SPList );
|
|
|
|
//--- Handle empty pronunciation
|
|
if ( SUCCEEDED( hr ) &&
|
|
!SPList.pFirstWordPronunciation )
|
|
{
|
|
if ( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
ZeroMemory( &SPList, sizeof(SPWORDPRONUNCIATIONLIST) );
|
|
}
|
|
hr = SPERR_NOT_IN_LEX;
|
|
}
|
|
}
|
|
if ( hr == SPERR_NOT_IN_LEX )
|
|
{
|
|
hr = m_cpAggregateLexicon->GetPronunciations( pPron->orthStr, 1033, eLEXTYPE_PRIVATE2, &SPList );
|
|
|
|
//--- Make all LTS words Nouns...
|
|
for ( SPWORDPRONUNCIATION *pPronunciation = SPList.pFirstWordPronunciation; pPronunciation;
|
|
pPronunciation = pPronunciation->pNextWordPronunciation )
|
|
{
|
|
pPronunciation->ePartOfSpeech = SPPS_Noun;
|
|
}
|
|
}
|
|
|
|
if (SUCCEEDED(hr))
|
|
{
|
|
//--- WARNING - this assumes pronunciations will only come from one type of lexicon, an assumption
|
|
//--- which was true as of July, 2000
|
|
pPron->pronType = SPList.pFirstWordPronunciation->eLexiconType;
|
|
|
|
//------------------------------------------------------------
|
|
// SAPI unrolls pronunciations from their POS.
|
|
// So roll them back into the original collapsed array
|
|
// of one or two candidates with sorted POS (argh...)
|
|
//------------------------------------------------------------
|
|
SPWORDPRONUNCIATION *firstPron, *pCurPron, *pNextPron;
|
|
|
|
//------------------------------------------
|
|
// Init pronunciation A
|
|
//------------------------------------------
|
|
pCurPron = firstPron = SPList.pFirstWordPronunciation;
|
|
pPron->pronArray[PRON_A].POScount = 1;
|
|
//----------------------------
|
|
// Get phoneme length
|
|
//----------------------------
|
|
cPhonLen = wcslen( firstPron->szPronunciation ) + 1; // include delimiter
|
|
//----------------------------
|
|
// Clip phoneme string to max
|
|
//----------------------------
|
|
if( cPhonLen > SP_MAX_PRON_LENGTH )
|
|
{
|
|
cPhonLen = SP_MAX_PRON_LENGTH;
|
|
}
|
|
//----------------------------
|
|
// Copy unicode phoneme string
|
|
//----------------------------
|
|
memcpy( pPron->pronArray[PRON_A].phon_Str, firstPron->szPronunciation, cPhonLen * sizeof(WCHAR) );
|
|
pPron->pronArray[PRON_A].phon_Len = cPhonLen -1; // minus delimiter
|
|
pPron->pronArray[PRON_A].POScode[0] = (ENGPARTOFSPEECH)firstPron->ePartOfSpeech;
|
|
|
|
//------------------------------------------
|
|
// Init pronunciation B
|
|
//------------------------------------------
|
|
pPron->pronArray[PRON_B].POScount = 0;
|
|
pPron->pronArray[PRON_B].phon_Len = 0;
|
|
|
|
pNextPron = pCurPron->pNextWordPronunciation;
|
|
|
|
while( pNextPron )
|
|
{
|
|
int isDiff;
|
|
|
|
isDiff = wcscmp( firstPron->szPronunciation, pNextPron->szPronunciation );
|
|
if( isDiff )
|
|
{
|
|
//------------------------------------------------
|
|
// Next pronunciation is different from 1st
|
|
//------------------------------------------------
|
|
if( pPron->pronArray[PRON_B].POScount < POS_MAX )
|
|
{
|
|
//---------------------------------------
|
|
// Gather POS B into array
|
|
//---------------------------------------
|
|
pPron->pronArray[PRON_B].POScode[pPron->pronArray[PRON_B].POScount] =
|
|
(ENGPARTOFSPEECH)pNextPron->ePartOfSpeech;
|
|
pPron->pronArray[PRON_B].POScount++;
|
|
if( pPron->pronArray[PRON_B].phon_Len == 0 )
|
|
{
|
|
//-----------------------------------------
|
|
// If there's no B pron yet, make one
|
|
//-----------------------------------------
|
|
cPhonLen = wcslen( pNextPron->szPronunciation ) + 1; // include delimiter
|
|
//----------------------------
|
|
// Clip phoneme string to max
|
|
//----------------------------
|
|
if( cPhonLen > SP_MAX_PRON_LENGTH )
|
|
{
|
|
cPhonLen = SP_MAX_PRON_LENGTH;
|
|
}
|
|
//----------------------------
|
|
// Copy unicode phoneme string
|
|
//----------------------------
|
|
memcpy( pPron->pronArray[PRON_B].phon_Str,
|
|
pNextPron->szPronunciation,
|
|
cPhonLen * sizeof(WCHAR) );
|
|
pPron->pronArray[PRON_B].phon_Len = cPhonLen -1; // minus delimiter
|
|
pPron->hasAlt = true;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//------------------------------------------------
|
|
// Next pronunciation is same as 1st
|
|
//------------------------------------------------
|
|
if( pPron->pronArray[PRON_A].POScount < POS_MAX )
|
|
{
|
|
//---------------------------------------
|
|
// Gather POS A into array
|
|
//---------------------------------------
|
|
pPron->pronArray[PRON_A].POScode[pPron->pronArray[PRON_A].POScount] =
|
|
(ENGPARTOFSPEECH)pNextPron->ePartOfSpeech;
|
|
pPron->pronArray[PRON_A].POScount++;
|
|
}
|
|
}
|
|
pCurPron = pNextPron;
|
|
pNextPron = pCurPron->pNextWordPronunciation;
|
|
}
|
|
}
|
|
|
|
//--- If XML POS provided, set selection now as it won't be touched by the POS Tagger
|
|
if ( pPron->XMLPartOfSpeech != MS_Unknown )
|
|
{
|
|
BOOL fMadeMatch = false;
|
|
|
|
//--- Check first pronunciation
|
|
for ( ULONG i = 0; i < pPron->pronArray[0].POScount; i++ )
|
|
{
|
|
if ( pPron->pronArray[0].POScode[i] == pPron->XMLPartOfSpeech )
|
|
{
|
|
pPron->altChoice = 0;
|
|
pPron->POSchoice = pPron->XMLPartOfSpeech;
|
|
fMadeMatch = true;
|
|
}
|
|
}
|
|
|
|
//--- Check second pronunciation
|
|
if ( pPron->hasAlt )
|
|
{
|
|
for ( ULONG i = 0; i < pPron->pronArray[1].POScount; i++ )
|
|
{
|
|
if ( pPron->pronArray[1].POScode[i] == pPron->XMLPartOfSpeech )
|
|
{
|
|
pPron->altChoice = 1;
|
|
pPron->POSchoice = pPron->XMLPartOfSpeech;
|
|
fMadeMatch = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- If this POS didn't exist for the word, let POS Tagger do its thing
|
|
//--- to determine a pronunciation, and then reassign the POS later...
|
|
if ( !fMadeMatch )
|
|
{
|
|
pPron->XMLPartOfSpeech = MS_Unknown;
|
|
pPron->POSchoice = pPron->pronArray[PRON_A].POScode[0];
|
|
}
|
|
}
|
|
//--- Set default POS, for later refinement by POS Tagger
|
|
else
|
|
{
|
|
pPron->POSchoice = pPron->pronArray[PRON_A].POScode[0];
|
|
pPron->altChoice = PRON_A;
|
|
}
|
|
|
|
if( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
}
|
|
|
|
return hr;
|
|
} /* Pronounce */
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::DetermineProns *
|
|
*------------------------------*
|
|
* Description:
|
|
* This method determines POS and looks up the pronounciation
|
|
********************************************************************* MC ****/
|
|
HRESULT CStdSentEnum::DetermineProns( CItemList& ItemList, CSentItemMemory& MemoryManager )
|
|
{
|
|
SPDBG_FUNC( "CStdSentEnum::DetermineProns" );
|
|
HRESULT hr = S_OK;
|
|
ULONG cNumOfProns, cPronIndex;
|
|
PRONRECORD* pProns = NULL;
|
|
|
|
//--- Count the total number of pronunciations needed
|
|
cNumOfProns = 0;
|
|
SPLISTPOS ListPos = ItemList.GetHeadPosition();
|
|
while( ListPos )
|
|
{
|
|
TTSSentItem& Item = ItemList.GetNext( ListPos );
|
|
for ( ULONG i = 0; i < Item.ulNumWords; i++ )
|
|
{
|
|
if( Item.Words[i].pWordText &&
|
|
( Item.Words[i].pXmlState->eAction == SPVA_Speak ||
|
|
Item.Words[i].pXmlState->eAction == SPVA_SpellOut ||
|
|
Item.Words[i].pXmlState->eAction == SPVA_Pronounce ) )
|
|
{
|
|
++cNumOfProns;
|
|
}
|
|
}
|
|
}
|
|
|
|
if ( cNumOfProns )
|
|
{
|
|
pProns = new PRONRECORD[cNumOfProns];
|
|
|
|
if( !pProns )
|
|
{
|
|
hr = E_OUTOFMEMORY;
|
|
}
|
|
else
|
|
{
|
|
//--- First, get item pronunciation(s)
|
|
ZeroMemory( pProns, cNumOfProns * sizeof(PRONRECORD) );
|
|
cPronIndex = 0;
|
|
ListPos = ItemList.GetHeadPosition();
|
|
|
|
//--- Iterate through ItemList
|
|
while( ListPos && SUCCEEDED( hr ) )
|
|
{
|
|
TTSSentItem& Item = ItemList.GetNext( ListPos );
|
|
//--- Iterate over Words
|
|
for ( ULONG i = 0; i < Item.ulNumWords; i++ )
|
|
{
|
|
//--- Get pronunciations and parts of speech for spoken items only
|
|
if ( Item.Words[i].pWordText &&
|
|
( Item.Words[i].pXmlState->eAction == SPVA_Speak ||
|
|
Item.Words[i].pXmlState->eAction == SPVA_SpellOut ||
|
|
Item.Words[i].pXmlState->eAction == SPVA_Pronounce ) )
|
|
{
|
|
SPDBG_ASSERT( cPronIndex < cNumOfProns );
|
|
ULONG cItemLen = Item.Words[i].ulWordLen;
|
|
//--- Clip at max text length
|
|
if( cItemLen > ( SP_MAX_WORD_LENGTH-1 ) )
|
|
{
|
|
cItemLen = SP_MAX_WORD_LENGTH - 1;
|
|
}
|
|
//--- Copy item text
|
|
memcpy( pProns[cPronIndex].orthStr,
|
|
Item.Words[i].pWordText,
|
|
cItemLen * sizeof(WCHAR) );
|
|
pProns[cPronIndex].orthStr[cItemLen] = 0;
|
|
//--- Set Part of Speech, if given in XML
|
|
if ( Item.Words[i].pXmlState->ePartOfSpeech != MS_Unknown )
|
|
{
|
|
pProns[cPronIndex].XMLPartOfSpeech = (ENGPARTOFSPEECH)Item.Words[i].pXmlState->ePartOfSpeech;
|
|
}
|
|
//--- Do Lex Lookup, if necessary
|
|
if ( Item.Words[i].pXmlState->pPhoneIds == NULL ||
|
|
Item.Words[i].pXmlState->ePartOfSpeech == MS_Unknown )
|
|
{
|
|
//--- Special Case - Disambiguate Abbreviations
|
|
if ( Item.pItemInfo->Type == eABBREVIATION ||
|
|
Item.pItemInfo->Type == eABBREVIATION_NORMALIZE )
|
|
{
|
|
const AbbrevRecord *pAbbrevInfo =
|
|
( (TTSAbbreviationInfo*) Item.pItemInfo )->pAbbreviation;
|
|
if ( pAbbrevInfo->iPronDisambig < 0 )
|
|
{
|
|
//--- Default case - just take the first (and only) pronunciation
|
|
pProns[cPronIndex].pronArray[PRON_A].POScount = 1;
|
|
wcscpy( pProns[cPronIndex].pronArray[PRON_A].phon_Str, pAbbrevInfo->pPron1 );
|
|
pProns[cPronIndex].pronArray[PRON_A].phon_Len =
|
|
wcslen( pProns[cPronIndex].pronArray[PRON_A].phon_Str );
|
|
pProns[cPronIndex].pronArray[PRON_A].POScode[0] = pAbbrevInfo->POS1;
|
|
pProns[cPronIndex].pronArray[PRON_B].POScount = 0;
|
|
pProns[cPronIndex].pronArray[PRON_B].phon_Len = 0;
|
|
pProns[cPronIndex].hasAlt = false;
|
|
pProns[cPronIndex].altChoice = PRON_A;
|
|
pProns[cPronIndex].POSchoice = pAbbrevInfo->POS1;
|
|
//--- Abbreviation table prons are basically just vendor lex prons...
|
|
pProns[cPronIndex].pronType = eLEXTYPE_PRIVATE1;
|
|
}
|
|
else
|
|
{
|
|
hr = ( this->*g_PronDisambigTable[pAbbrevInfo->iPronDisambig] )
|
|
( pAbbrevInfo, &pProns[cPronIndex], ItemList, ListPos );
|
|
}
|
|
pProns[cPronIndex].fUsePron = true;
|
|
}
|
|
//--- Default case
|
|
else
|
|
{
|
|
//--- Check disambiguation list
|
|
const AbbrevRecord* pAbbrevRecord =
|
|
(AbbrevRecord*) bsearch( (void*) pProns[cPronIndex].orthStr, (void*) g_AmbiguousWordTable,
|
|
sp_countof( g_AmbiguousWordTable ), sizeof( AbbrevRecord ),
|
|
CompareStringAndAbbrevRecord );
|
|
if ( pAbbrevRecord )
|
|
{
|
|
hr = ( this->*g_AmbiguousWordDisambigTable[pAbbrevRecord->iPronDisambig] )
|
|
( pAbbrevRecord, &pProns[cPronIndex], ItemList, ListPos );
|
|
pProns[cPronIndex].fUsePron = true;
|
|
}
|
|
//--- Do Lex Lookup, if necessary
|
|
else
|
|
{
|
|
hr = Pronounce( &pProns[cPronIndex] );
|
|
}
|
|
}
|
|
}
|
|
cPronIndex++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (SUCCEEDED(hr))
|
|
{
|
|
//--- Next, disambiguate part-of-speech
|
|
DisambiguatePOS( pProns, cNumOfProns );
|
|
|
|
//--- Output debugging information
|
|
TTSDBG_LOGPOSPOSSIBILITIES( pProns, cNumOfProns, STREAM_POSPOSSIBILITIES );
|
|
|
|
//--- Finally, copy selected pronunciation to 'ItemList'
|
|
PRONUNIT *selectedUnit;
|
|
cPronIndex = 0;
|
|
ListPos = ItemList.GetHeadPosition();
|
|
|
|
while( ListPos && SUCCEEDED(hr) )
|
|
{
|
|
TTSSentItem& Item = ItemList.GetNext( ListPos );
|
|
for ( ULONG i = 0; i < Item.ulNumWords; i++ )
|
|
{
|
|
//--- Set pronunciation and part-of-speech for spoken items only
|
|
if( Item.Words[i].pWordText &&
|
|
( Item.Words[i].pXmlState->eAction == SPVA_Speak ||
|
|
Item.Words[i].pXmlState->eAction == SPVA_SpellOut ||
|
|
Item.Words[i].pXmlState->eAction == SPVA_Pronounce ) )
|
|
{
|
|
SPDBG_ASSERT( cPronIndex < cNumOfProns );
|
|
//--- Use XML specified pronunciation, if given.
|
|
if ( Item.Words[i].pXmlState->pPhoneIds )
|
|
{
|
|
Item.Words[i].pWordPron = Item.Words[i].pXmlState->pPhoneIds;
|
|
}
|
|
else
|
|
{
|
|
selectedUnit = &pProns[cPronIndex].pronArray[pProns[cPronIndex].altChoice];
|
|
Item.Words[i].pWordPron =
|
|
(SPPHONEID*) MemoryManager.GetMemory( (selectedUnit->phon_Len + 1) *
|
|
sizeof(SPPHONEID), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
wcscpy( Item.Words[i].pWordPron, selectedUnit->phon_Str );
|
|
}
|
|
}
|
|
|
|
//--- Use XML specified part-of-speech, if given. This will override the case
|
|
//--- where the POS didn't exist as an option and the POS Tagger did its thing
|
|
//--- to find a pronunciation.
|
|
if ( Item.Words[i].pXmlState->ePartOfSpeech != MS_Unknown )
|
|
{
|
|
Item.Words[i].eWordPartOfSpeech = (ENGPARTOFSPEECH)Item.Words[i].pXmlState->ePartOfSpeech;
|
|
}
|
|
else
|
|
{
|
|
Item.Words[i].eWordPartOfSpeech = pProns[cPronIndex].POSchoice;
|
|
}
|
|
|
|
//--- Root word
|
|
if ( pProns[cPronIndex].lemmaStr[0] )
|
|
{
|
|
Item.Words[i].ulLemmaLen = wcslen( pProns[cPronIndex].lemmaStr );
|
|
Item.Words[i].pLemma =
|
|
(WCHAR*) MemoryManager.GetMemory( Item.Words[i].ulLemmaLen * sizeof(WCHAR), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
wcsncpy( (WCHAR*) Item.Words[i].pLemma, pProns[cPronIndex].lemmaStr,
|
|
Item.Words[i].ulLemmaLen );
|
|
}
|
|
}
|
|
|
|
//--- Insert pron in text, if appropriate - RAID #4746
|
|
if ( pProns[cPronIndex].fUsePron )
|
|
{
|
|
ULONG ulNumChars = wcslen( Item.Words[i].pWordPron );
|
|
Item.Words[i].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) Item.Words[i].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) Item.Words[i].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) Item.Words[i].pWordText + 1 ), Item.Words[i].pWordPron );
|
|
(WCHAR) Item.Words[i].pWordText[ ulNumChars + 1 ] = L'*';
|
|
Item.Words[i].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
|
|
cPronIndex++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
//--- Check Post POS disambiguation list
|
|
SPLISTPOS ListPos = ItemList.GetHeadPosition();
|
|
while ( ListPos && SUCCEEDED( hr ) )
|
|
{
|
|
TTSSentItem& Item = ItemList.GetNext( ListPos );
|
|
if ( Item.pItemInfo->Type == eALPHA_WORD ||
|
|
Item.pItemInfo->Type == eABBREVIATION ||
|
|
Item.pItemInfo->Type == eABBREVIATION_NORMALIZE )
|
|
{
|
|
WCHAR temp;
|
|
BOOL fPeriod = false;
|
|
if ( Item.pItemSrcText[Item.ulItemSrcLen - 1] == L'.' &&
|
|
Item.ulItemSrcLen > 1 )
|
|
{
|
|
temp = Item.pItemSrcText[Item.ulItemSrcLen - 1];
|
|
*( (WCHAR*) Item.pItemSrcText + Item.ulItemSrcLen - 1 ) = 0;
|
|
fPeriod = true;
|
|
}
|
|
else
|
|
{
|
|
temp = Item.pItemSrcText[Item.ulItemSrcLen];
|
|
*( (WCHAR*) Item.pItemSrcText + Item.ulItemSrcLen ) = 0;
|
|
}
|
|
|
|
const AbbrevRecord* pAbbrevRecord =
|
|
(AbbrevRecord*) bsearch( (void*) Item.pItemSrcText, (void*) g_PostLexLookupWordTable,
|
|
sp_countof( g_PostLexLookupWordTable ), sizeof( AbbrevRecord ),
|
|
CompareStringAndAbbrevRecord );
|
|
if ( pAbbrevRecord )
|
|
{
|
|
hr = ( this->*g_PostLexLookupDisambigTable[pAbbrevRecord->iPronDisambig] )
|
|
( pAbbrevRecord, ItemList, ListPos, MemoryManager );
|
|
}
|
|
|
|
if ( fPeriod )
|
|
{
|
|
*( (WCHAR*) Item.pItemSrcText + Item.ulItemSrcLen - 1 ) = temp;
|
|
}
|
|
else
|
|
{
|
|
*( (WCHAR*) Item.pItemSrcText + Item.ulItemSrcLen ) = temp;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (pProns)
|
|
{
|
|
delete [] pProns;
|
|
}
|
|
|
|
return hr;
|
|
} /* CStdSentEnum::DetermineProns */
|
|
|
|
/***********************************************************************************************
|
|
* MeasurementDisambig *
|
|
*---------------------*
|
|
* Description:
|
|
* This overrides initial pronunciations of measurement abbreviations when they are used
|
|
* as modifiers - e.g. "a 7 ft. pole" vs. "the pole was 7 ft. long"
|
|
*
|
|
********************************************************************* AH **********************/
|
|
HRESULT CStdSentEnum::MeasurementDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList,
|
|
SPLISTPOS ListPos, CSentItemMemory& MemoryManager )
|
|
{
|
|
SPDBG_FUNC( "CStdSentEnum::MeasurementDisambig" );
|
|
HRESULT hr = S_OK;
|
|
|
|
//--- Get previous two items
|
|
SPLISTPOS TempPos = ListPos;
|
|
if ( TempPos )
|
|
{
|
|
ItemList.GetPrev( TempPos );
|
|
if ( TempPos )
|
|
{
|
|
ItemList.GetPrev( TempPos );
|
|
if ( TempPos )
|
|
{
|
|
TTSSentItem TempItem = ItemList.GetPrev( TempPos );
|
|
//--- Previous must be a number
|
|
if ( TempItem.pItemInfo->Type == eNUM_CARDINAL )
|
|
{
|
|
//--- Get next item
|
|
TempPos = ListPos;
|
|
TempItem = ItemList.GetNext( TempPos );
|
|
//--- Next must be a noun or adj
|
|
if ( TempItem.eItemPartOfSpeech == MS_Noun )
|
|
{
|
|
//--- Matched a 7 ft. pole type example - go with singular
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& MeasurementItem = ItemList.GetPrev( TempPos );
|
|
//--- Singular will always be shorter than plural, so this should never overwrite
|
|
//--- anything...
|
|
wcscpy( MeasurementItem.Words[0].pWordPron, pAbbrevInfo->pPron1 );
|
|
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( MeasurementItem.Words[0].pWordPron );
|
|
MeasurementItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) MeasurementItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) MeasurementItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) MeasurementItem.Words[0].pWordText + 1 ), MeasurementItem.Words[0].pWordPron );
|
|
(WCHAR) MeasurementItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
MeasurementItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
else if ( TempItem.eItemPartOfSpeech == MS_Adj &&
|
|
TempPos )
|
|
{
|
|
//--- Next must be a noun
|
|
TempItem = ItemList.GetNext( TempPos );
|
|
{
|
|
if ( TempItem.eItemPartOfSpeech == MS_Noun )
|
|
{
|
|
//--- Matched a 7 ft. pole type example - go with singular
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& MeasurementItem = ItemList.GetPrev( TempPos );
|
|
//--- Singular will always be shorter than plural, so this should never overwrite
|
|
//--- anything...
|
|
wcscpy( MeasurementItem.Words[0].pWordPron, pAbbrevInfo->pPron1 );
|
|
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( MeasurementItem.Words[0].pWordPron );
|
|
MeasurementItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) MeasurementItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) MeasurementItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) MeasurementItem.Words[0].pWordText + 1 ), MeasurementItem.Words[0].pWordPron );
|
|
(WCHAR) MeasurementItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
MeasurementItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return hr;
|
|
} /* MeasurementDisambig */
|
|
|
|
/***********************************************************************************************
|
|
* TheDisambig *
|
|
*-------------*
|
|
* Description:
|
|
* This function disambiguates the word the - before a vowel it becomes "thee", before a
|
|
* consonant it is "thuh"...
|
|
*
|
|
********************************************************************* AH **********************/
|
|
HRESULT CStdSentEnum::TheDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList,
|
|
SPLISTPOS ListPos, CSentItemMemory& MemoryManager )
|
|
{
|
|
SPDBG_FUNC( "CStdSentEnum::TheDisambig" );
|
|
HRESULT hr = S_OK;
|
|
|
|
//--- Get next item
|
|
SPLISTPOS TempPos = ListPos;
|
|
if ( TempPos )
|
|
{
|
|
TTSSentItem NextItem = ItemList.GetNext( TempPos );
|
|
|
|
if ( NextItem.Words[0].pWordPron &&
|
|
bsearch( (void*) NextItem.Words[0].pWordPron, (void*) g_Vowels, sp_countof( g_Vowels ),
|
|
sizeof( WCHAR ), CompareWCHARAndWCHAR ) )
|
|
{
|
|
//--- Matched a vowel - go with / DH IY 1 /
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& TheItem = ItemList.GetPrev( TempPos );
|
|
//--- The two pronunciations are exactly the same length, so this should never overwrite
|
|
//--- anything
|
|
wcscpy( TheItem.Words[0].pWordPron, pAbbrevInfo->pPron1 );
|
|
TheItem.Words[0].eWordPartOfSpeech = pAbbrevInfo->POS1;
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( TheItem.Words[0].pWordPron );
|
|
TheItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) TheItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) TheItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) TheItem.Words[0].pWordText + 1 ), TheItem.Words[0].pWordPron );
|
|
(WCHAR) TheItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
TheItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//--- Didn't match a vowel - go with / DH AX 1 /
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& TheItem = ItemList.GetPrev( TempPos );
|
|
//--- The two pronunciations are exactly the same length, so this should never overwrite
|
|
//--- anything
|
|
wcscpy( TheItem.Words[0].pWordPron, pAbbrevInfo->pPron2 );
|
|
TheItem.Words[0].eWordPartOfSpeech = pAbbrevInfo->POS2;
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( TheItem.Words[0].pWordPron );
|
|
TheItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) TheItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) TheItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) TheItem.Words[0].pWordText + 1 ), TheItem.Words[0].pWordPron );
|
|
(WCHAR) TheItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
TheItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
}
|
|
|
|
return hr;
|
|
} /* TheDisambig */
|
|
|
|
/***********************************************************************************************
|
|
* ADisambig *
|
|
*-----------*
|
|
* Description:
|
|
* This function disambiguates the word "a" - / EY 1 - Noun / vs. / AX - Det /
|
|
*
|
|
********************************************************************* AH **********************/
|
|
HRESULT CStdSentEnum::ADisambig( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList,
|
|
SPLISTPOS ListPos )
|
|
{
|
|
SPDBG_FUNC( "CStdSentEnum::ADisambig" );
|
|
HRESULT hr = S_OK;
|
|
BOOL fNoun = false;
|
|
|
|
//--- Get Current Item...
|
|
SPLISTPOS TempPos = ListPos;
|
|
if ( TempPos )
|
|
{
|
|
ItemList.GetPrev( TempPos );
|
|
if ( TempPos )
|
|
{
|
|
TTSSentItem CurrentItem = ItemList.GetPrev( TempPos );
|
|
|
|
//--- If "a" is part of a multi-word item, use the Noun pronunciation...
|
|
//--- If "a" is not an AlphaWord, use the Noun pronunciation...
|
|
if ( CurrentItem.ulNumWords > 1 ||
|
|
CurrentItem.pItemInfo->Type != eALPHA_WORD )
|
|
{
|
|
fNoun = true;
|
|
wcscpy( pPron->pronArray[PRON_A].phon_Str, pAbbrevInfo->pPron1 );
|
|
pPron->pronArray[PRON_A].phon_Len = wcslen( pPron->pronArray[PRON_A].phon_Str );
|
|
pPron->pronArray[PRON_A].POScode[0] = pAbbrevInfo->POS1;
|
|
pPron->POSchoice = pAbbrevInfo->POS1;
|
|
}
|
|
}
|
|
}
|
|
|
|
if ( !fNoun )
|
|
{
|
|
//--- Get Next Item...
|
|
TempPos = ListPos;
|
|
if ( TempPos )
|
|
{
|
|
TTSSentItem NextItem = ItemList.GetNext( TempPos );
|
|
|
|
//--- If "a" is followed by punctuation, use the Noun pronunciation...
|
|
if ( !( NextItem.pItemInfo->Type & eWORDLIST_IS_VALID ) )
|
|
{
|
|
fNoun = true;
|
|
wcscpy( pPron->pronArray[PRON_A].phon_Str, pAbbrevInfo->pPron1 );
|
|
pPron->pronArray[PRON_A].phon_Len = wcslen( pPron->pronArray[PRON_A].phon_Str );
|
|
pPron->pronArray[PRON_A].POScode[0] = pAbbrevInfo->POS1;
|
|
pPron->POSchoice = pAbbrevInfo->POS1;
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- Default - use the Determiner pronunciation (but include Noun pronunciation as well,
|
|
//--- so that POS tagger rules will work properly)...
|
|
if ( !fNoun )
|
|
{
|
|
wcscpy( pPron->pronArray[PRON_A].phon_Str, pAbbrevInfo->pPron2 );
|
|
pPron->pronArray[PRON_A].phon_Len = wcslen( pPron->pronArray[PRON_A].phon_Str );
|
|
pPron->pronArray[PRON_A].POScode[0] = pAbbrevInfo->POS2;
|
|
pPron->pronArray[PRON_A].POScount = 1;
|
|
pPron->POSchoice = pAbbrevInfo->POS2;
|
|
wcscpy( pPron->pronArray[PRON_B].phon_Str, pAbbrevInfo->pPron1 );
|
|
pPron->pronArray[PRON_B].phon_Len = wcslen( pPron->pronArray[PRON_B].phon_Str );
|
|
pPron->pronArray[PRON_B].POScode[0] = pAbbrevInfo->POS1;
|
|
pPron->pronArray[PRON_B].POScount = 1;
|
|
pPron->hasAlt = true;
|
|
}
|
|
|
|
return hr;
|
|
} /* ADisambig */
|
|
|
|
/***********************************************************************************************
|
|
* PolishDisambig *
|
|
*----------------*
|
|
* Description:
|
|
* This function disambiguates the word "polish" - [p ow 1 l - ax sh - Noun] vs.
|
|
* [p ow 1 l - ax sh - Adj] vs. [p aa 1 l - ih sh - Verb] vs. [p aa 1 l - ih sh - Noun]
|
|
*
|
|
********************************************************************* AH **********************/
|
|
HRESULT CStdSentEnum::PolishDisambig( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList,
|
|
SPLISTPOS ListPos )
|
|
{
|
|
SPDBG_FUNC( "CStdSentEnum::PolishDisambig" );
|
|
HRESULT hr = S_OK;
|
|
BOOL fMatch = false;
|
|
|
|
//--- Get Current Item...
|
|
SPLISTPOS TempPos = ListPos;
|
|
if ( TempPos )
|
|
{
|
|
ItemList.GetPrev( TempPos );
|
|
if ( TempPos )
|
|
{
|
|
TTSSentItem CurrentItem = ItemList.GetPrev( TempPos );
|
|
|
|
//--- If "Polish" is capitalized and not sentence-initial, and not preceded immediately
|
|
//--- by an open double-quote or parenthesis, use Noun...
|
|
if ( iswupper( CurrentItem.pItemSrcText[0] ) )
|
|
{
|
|
BOOL fSentenceInitial = false;
|
|
if ( !TempPos )
|
|
{
|
|
fSentenceInitial = true;
|
|
}
|
|
else
|
|
{
|
|
TTSSentItem PrevItem = ItemList.GetPrev( TempPos );
|
|
if ( PrevItem.pItemInfo->Type == eOPEN_PARENTHESIS ||
|
|
PrevItem.pItemInfo->Type == eOPEN_BRACKET ||
|
|
PrevItem.pItemInfo->Type == eOPEN_BRACE ||
|
|
PrevItem.pItemInfo->Type == eSINGLE_QUOTE ||
|
|
PrevItem.pItemInfo->Type == eDOUBLE_QUOTE )
|
|
{
|
|
fSentenceInitial = true;
|
|
}
|
|
}
|
|
if ( fSentenceInitial )
|
|
{
|
|
fMatch = true;
|
|
wcscpy( pPron->pronArray[PRON_A].phon_Str, pAbbrevInfo->pPron2 );
|
|
pPron->pronArray[PRON_A].phon_Len = wcslen( pPron->pronArray[PRON_A].phon_Str );
|
|
pPron->pronArray[PRON_A].POScode[0] = pAbbrevInfo->POS2;
|
|
pPron->POSchoice = pAbbrevInfo->POS2;
|
|
}
|
|
else
|
|
{
|
|
fMatch = true;
|
|
wcscpy( pPron->pronArray[PRON_A].phon_Str, pAbbrevInfo->pPron1 );
|
|
pPron->pronArray[PRON_A].phon_Len = wcslen( pPron->pronArray[PRON_A].phon_Str );
|
|
pPron->pronArray[PRON_A].POScode[0] = MS_Noun;
|
|
pPron->POSchoice = MS_Noun;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- Default - use the Verb pronunciation (but include the others as well,
|
|
//--- so that POS tagger rules will work properly)...
|
|
if ( !fMatch )
|
|
{
|
|
//--- Verb, Noun
|
|
wcscpy( pPron->pronArray[PRON_A].phon_Str, pAbbrevInfo->pPron2 );
|
|
pPron->pronArray[PRON_A].phon_Len = wcslen( pPron->pronArray[PRON_A].phon_Str );
|
|
pPron->pronArray[PRON_A].POScode[0] = pAbbrevInfo->POS2;
|
|
pPron->pronArray[PRON_A].POScode[1] = pAbbrevInfo->POS3;
|
|
pPron->pronArray[PRON_A].POScount = 2;
|
|
//--- Adj
|
|
wcscpy( pPron->pronArray[PRON_B].phon_Str, pAbbrevInfo->pPron1 );
|
|
pPron->pronArray[PRON_B].phon_Len = wcslen( pPron->pronArray[PRON_B].phon_Str );
|
|
pPron->pronArray[PRON_B].POScode[0] = pAbbrevInfo->POS1;
|
|
pPron->pronArray[PRON_B].POScount = 1;
|
|
//--- Set initial choice to Verb...
|
|
pPron->POSchoice = pAbbrevInfo->POS2;
|
|
pPron->hasAlt = true;
|
|
}
|
|
|
|
return hr;
|
|
} /* PolishDisambig */
|
|
|
|
/***********************************************************************************************
|
|
* ReadDisambig *
|
|
*--------------*
|
|
* Description:
|
|
* This function disambiguates the word Read - past tense vs. present...
|
|
*
|
|
********************************************************************* AH **********************/
|
|
HRESULT CStdSentEnum::ReadDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList,
|
|
SPLISTPOS ListPos, CSentItemMemory& MemoryManager )
|
|
{
|
|
SPDBG_FUNC( "CStdSentEnum::ReadDisambig" );
|
|
HRESULT hr = S_OK;
|
|
BOOL fMatch = false;
|
|
|
|
//--- Get prev item
|
|
SPLISTPOS TempPos = ListPos;
|
|
if ( TempPos )
|
|
{
|
|
ItemList.GetPrev( TempPos );
|
|
if ( TempPos )
|
|
{
|
|
ItemList.GetPrev( TempPos );
|
|
if ( TempPos )
|
|
{
|
|
TTSSentItem PrevItem = ItemList.GetPrev( TempPos );
|
|
|
|
//--- Check for closest auxiliary
|
|
while ( PrevItem.Words[0].eWordPartOfSpeech != MS_VAux &&
|
|
PrevItem.Words[0].eWordPartOfSpeech != MS_Contr &&
|
|
TempPos )
|
|
{
|
|
PrevItem = ItemList.GetPrev( TempPos );
|
|
}
|
|
|
|
if ( PrevItem.Words[0].eWordPartOfSpeech == MS_VAux )
|
|
{
|
|
fMatch = true;
|
|
if ( wcsnicmp( PrevItem.Words[0].pWordText, L"have", 4 ) == 0 ||
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"has", 3 ) == 0 ||
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"had", 3 ) == 0 ||
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"am", 2 ) == 0 ||
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"ain't", 5 ) == 0 ||
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"are", 3 ) == 0 ||
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"aren't", 6 ) == 0 ||
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"be", 2 ) == 0 ||
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"is", 2 ) == 0 ||
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"was", 3 ) == 0 ||
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"were", 4 ) == 0 )
|
|
{
|
|
//--- Matched have or haven't (has or hasn't, had or hadn't) - go with "red"
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& ReadItem = ItemList.GetPrev( TempPos );
|
|
//--- The two pronunciations are exactly the same length, so this should never overwrite
|
|
//--- anything
|
|
wcscpy( ReadItem.Words[0].pWordPron, pAbbrevInfo->pPron2 );
|
|
ReadItem.Words[0].eWordPartOfSpeech = pAbbrevInfo->POS2;
|
|
ReadItem.eItemPartOfSpeech = pAbbrevInfo->POS2;
|
|
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( ReadItem.Words[0].pWordPron );
|
|
ReadItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) ReadItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) ReadItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) ReadItem.Words[0].pWordText + 1 ), ReadItem.Words[0].pWordPron );
|
|
(WCHAR) ReadItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
ReadItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//--- Some other auxiliary - go with "reed"
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& ReadItem = ItemList.GetPrev( TempPos );
|
|
//--- The two pronunciations are exactly the same length, so this should never overwrite
|
|
//--- anything
|
|
wcscpy( ReadItem.Words[0].pWordPron, pAbbrevInfo->pPron1 );
|
|
ReadItem.Words[0].eWordPartOfSpeech = pAbbrevInfo->POS1;
|
|
ReadItem.eItemPartOfSpeech = pAbbrevInfo->POS1;
|
|
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( ReadItem.Words[0].pWordPron );
|
|
ReadItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) ReadItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) ReadItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) ReadItem.Words[0].pWordText + 1 ), ReadItem.Words[0].pWordPron );
|
|
(WCHAR) ReadItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
ReadItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
}
|
|
//--- Check for pronoun aux contractions
|
|
else if ( PrevItem.Words[0].eWordPartOfSpeech == MS_Contr )
|
|
{
|
|
fMatch = true;
|
|
const WCHAR *pApostrophe = wcsstr( PrevItem.Words[0].pWordText, L"'" );
|
|
if ( pApostrophe &&
|
|
wcsnicmp( pApostrophe, L"'ll", 3 ) == 0 )
|
|
{
|
|
//--- Matched an 'll form - go with "reed"
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& ReadItem = ItemList.GetPrev( TempPos );
|
|
wcscpy( ReadItem.Words[0].pWordPron, pAbbrevInfo->pPron1 );
|
|
ReadItem.Words[0].eWordPartOfSpeech = pAbbrevInfo->POS1;
|
|
ReadItem.eItemPartOfSpeech = pAbbrevInfo->POS1;
|
|
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( ReadItem.Words[0].pWordPron );
|
|
ReadItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) ReadItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) ReadItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) ReadItem.Words[0].pWordText + 1 ), ReadItem.Words[0].pWordPron );
|
|
(WCHAR) ReadItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
ReadItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//--- Some other form - go with "red"
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& ReadItem = ItemList.GetPrev( TempPos );
|
|
wcscpy( ReadItem.Words[0].pWordPron, pAbbrevInfo->pPron2 );
|
|
ReadItem.Words[0].eWordPartOfSpeech = pAbbrevInfo->POS2;
|
|
ReadItem.eItemPartOfSpeech = pAbbrevInfo->POS2;
|
|
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( ReadItem.Words[0].pWordPron );
|
|
ReadItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) ReadItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) ReadItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) ReadItem.Words[0].pWordText + 1 ), ReadItem.Words[0].pWordPron );
|
|
(WCHAR) ReadItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
ReadItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
}
|
|
//--- Check for infinitival form
|
|
else
|
|
{
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
ItemList.GetPrev( TempPos );
|
|
PrevItem = ItemList.GetPrev( TempPos );
|
|
|
|
if ( PrevItem.Words[0].ulWordLen == 2 &&
|
|
wcsnicmp( PrevItem.Words[0].pWordText, L"to", 2 ) == 0 )
|
|
{
|
|
fMatch = true;
|
|
|
|
//--- Matched infinitival form - go with "reed"
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& ReadItem = ItemList.GetPrev( TempPos );
|
|
wcscpy( ReadItem.Words[0].pWordPron, pAbbrevInfo->pPron1 );
|
|
ReadItem.Words[0].eWordPartOfSpeech = pAbbrevInfo->POS1;
|
|
ReadItem.eItemPartOfSpeech = pAbbrevInfo->POS1;
|
|
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( ReadItem.Words[0].pWordPron );
|
|
ReadItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) ReadItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) ReadItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) ReadItem.Words[0].pWordText + 1 ), ReadItem.Words[0].pWordPron );
|
|
(WCHAR) ReadItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
ReadItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
//--- Sentence initial - go with "reed"
|
|
else
|
|
{
|
|
fMatch = true;
|
|
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& ReadItem = ItemList.GetPrev( TempPos );
|
|
wcscpy( ReadItem.Words[0].pWordPron, pAbbrevInfo->pPron1 );
|
|
ReadItem.Words[0].eWordPartOfSpeech = pAbbrevInfo->POS1;
|
|
ReadItem.eItemPartOfSpeech = pAbbrevInfo->POS1;
|
|
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( ReadItem.Words[0].pWordPron );
|
|
ReadItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) ReadItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) ReadItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) ReadItem.Words[0].pWordText + 1 ), ReadItem.Words[0].pWordPron );
|
|
(WCHAR) ReadItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
ReadItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if ( !fMatch )
|
|
{
|
|
TempPos = ListPos;
|
|
ItemList.GetPrev( TempPos );
|
|
TTSSentItem& ReadItem = ItemList.GetPrev( TempPos );
|
|
//--- Default - go with past tense...
|
|
wcscpy( ReadItem.Words[0].pWordPron, pAbbrevInfo->pPron2 );
|
|
ReadItem.Words[0].eWordPartOfSpeech = pAbbrevInfo->POS2;
|
|
ReadItem.eItemPartOfSpeech = pAbbrevInfo->POS2;
|
|
|
|
//--- Insert pron into word text - RAID #4746
|
|
ULONG ulNumChars = wcslen( ReadItem.Words[0].pWordPron );
|
|
ReadItem.Words[0].pWordText =
|
|
(WCHAR*) MemoryManager.GetMemory( ( ulNumChars + 3 ) * sizeof( WCHAR ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( (WCHAR*) ReadItem.Words[0].pWordText, ( ulNumChars + 3 ) * sizeof( WCHAR ) );
|
|
(WCHAR) ReadItem.Words[0].pWordText[0] = L'*';
|
|
wcscpy( ( (WCHAR*) ReadItem.Words[0].pWordText + 1 ), ReadItem.Words[0].pWordPron );
|
|
(WCHAR) ReadItem.Words[0].pWordText[ ulNumChars + 1 ] = L'*';
|
|
ReadItem.Words[0].ulWordLen = ulNumChars + 2;
|
|
}
|
|
}
|
|
|
|
return hr;
|
|
} /* ReadDisambig */
|
|
|