You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1075 lines
23 KiB
1075 lines
23 KiB
//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
|
|
//
|
|
// Purpose:
|
|
//
|
|
// $NoKeywords: $
|
|
//=============================================================================//
|
|
#include <stdio.h>
|
|
#include <stdarg.h>
|
|
#include <memory.h>
|
|
#include <windows.h>
|
|
#include <mmsystem.h>
|
|
#include <mmreg.h>
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
|
|
#include "phonemeextractor/PhonemeExtractor.h"
|
|
#include "ims_helper/ims_helper.h"
|
|
|
|
#include "tier0/dbg.h"
|
|
#include "sentence.h"
|
|
#include "PhonemeConverter.h"
|
|
#include "tier1/strtools.h"
|
|
|
|
#define TEXTLESS_WORDNAME "[Textless]"
|
|
|
|
static IImsHelper *talkback = NULL;
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose: Expose the interface
|
|
//-----------------------------------------------------------------------------
|
|
class CPhonemeExtractorLipSinc : public IPhonemeExtractor
|
|
{
|
|
public:
|
|
virtual PE_APITYPE GetAPIType() const
|
|
{
|
|
return SPEECH_API_LIPSINC;
|
|
}
|
|
|
|
// Used for menus, etc
|
|
virtual char const *GetName() const
|
|
{
|
|
return "IMS (LipSinc)";
|
|
}
|
|
|
|
SR_RESULT Extract(
|
|
const char *wavfile,
|
|
int numsamples,
|
|
void (*pfnPrint)( const char *fmt, ... ),
|
|
CSentence& inwords,
|
|
CSentence& outwords );
|
|
|
|
|
|
CPhonemeExtractorLipSinc( void );
|
|
~CPhonemeExtractorLipSinc( void );
|
|
|
|
enum
|
|
{
|
|
MAX_WORD_LENGTH = 128,
|
|
};
|
|
private:
|
|
|
|
|
|
class CAnalyzedWord
|
|
{
|
|
public:
|
|
char buffer[ MAX_WORD_LENGTH ];
|
|
double starttime;
|
|
double endtime;
|
|
};
|
|
|
|
class CAnalyzedPhoneme
|
|
{
|
|
public:
|
|
char phoneme[ 32 ];
|
|
double starttime;
|
|
double endtime;
|
|
};
|
|
|
|
bool InitLipSinc( void );
|
|
void ShutdownLipSinc( void );
|
|
|
|
void DescribeError( TALKBACK_ERR err );
|
|
void Printf( char const *fmt, ... );
|
|
|
|
bool CheckSoundFile( char const *filename );
|
|
bool GetInitialized( void );
|
|
void SetInitialized( bool init );
|
|
|
|
void (*m_pfnPrint)( const char *fmt, ... );
|
|
|
|
char const *ConstructInputSentence( CSentence& inwords );
|
|
bool AttemptAnalysis( TALKBACK_ANALYSIS **ppAnalysis, char const *wavfile, CSentence& inwords );
|
|
|
|
char const *ApplyTBWordRules( char const *word );
|
|
|
|
void ProcessWords( TALKBACK_ANALYSIS *analysis, CSentence& inwords, CSentence& outwords );
|
|
void ProcessWordsTextless( TALKBACK_ANALYSIS *analysis, CSentence& outwords );
|
|
|
|
int GetPhonemeIndexAtWord( TALKBACK_ANALYSIS *analysis, double time, bool checkstart );
|
|
|
|
int GetPhonemeIndexAtWordStart( TALKBACK_ANALYSIS *analysis, double starttime );
|
|
int GetPhonemeIndexAtWordEnd( TALKBACK_ANALYSIS *analysis, double endtime );
|
|
|
|
CAnalyzedWord *GetAnalyzedWord( TALKBACK_ANALYSIS *analysis, int index );
|
|
CAnalyzedPhoneme *GetAnalyzedPhoneme( TALKBACK_ANALYSIS *analysis, int index );
|
|
|
|
int ComputeByteFromTime( float time );
|
|
|
|
bool m_bInitialized;
|
|
|
|
float m_flSampleCount;
|
|
float m_flDuration;
|
|
|
|
float m_flSamplesPerSecond;
|
|
|
|
int m_nBytesPerSample;
|
|
|
|
HMODULE m_hHelper;
|
|
};
|
|
|
|
CPhonemeExtractorLipSinc::CPhonemeExtractorLipSinc( void )
|
|
{
|
|
m_hHelper = (HMODULE)0;
|
|
m_pfnPrint = NULL;
|
|
|
|
m_bInitialized = false;
|
|
|
|
m_flSampleCount = 0.0f;
|
|
m_flDuration = 0.0f;
|
|
|
|
m_flSamplesPerSecond = 0.0f;
|
|
|
|
m_nBytesPerSample = 0;
|
|
}
|
|
|
|
CPhonemeExtractorLipSinc::~CPhonemeExtractorLipSinc( void )
|
|
{
|
|
if ( GetInitialized() )
|
|
{
|
|
ShutdownLipSinc();
|
|
}
|
|
}
|
|
|
|
bool CPhonemeExtractorLipSinc::GetInitialized( void )
|
|
{
|
|
return m_bInitialized;
|
|
}
|
|
|
|
void CPhonemeExtractorLipSinc::SetInitialized( bool init )
|
|
{
|
|
m_bInitialized = init;
|
|
}
|
|
|
|
int CPhonemeExtractorLipSinc::ComputeByteFromTime( float time )
|
|
{
|
|
if ( !m_flDuration )
|
|
return 0;
|
|
|
|
float frac = time / m_flDuration;
|
|
|
|
float sampleNumber = frac * m_flSampleCount;
|
|
|
|
int bytenumber = sampleNumber * m_nBytesPerSample;
|
|
|
|
return bytenumber;
|
|
}
|
|
|
|
void CPhonemeExtractorLipSinc::DescribeError( TALKBACK_ERR err )
|
|
{
|
|
Assert( m_pfnPrint );
|
|
|
|
// Get the error description.
|
|
char errorDesc[256] = "";
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
talkback->TalkBackGetErrorString( err, sizeof(errorDesc), errorDesc );
|
|
}
|
|
|
|
// Report or log the error...
|
|
(*m_pfnPrint)( "LIPSINC ERROR: %s\n", errorDesc );
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose:
|
|
// Input : *fmt -
|
|
// .. -
|
|
//-----------------------------------------------------------------------------
|
|
void CPhonemeExtractorLipSinc::Printf( char const *fmt, ... )
|
|
{
|
|
Assert( m_pfnPrint );
|
|
|
|
char string[ 4096 ];
|
|
|
|
va_list argptr;
|
|
va_start( argptr, fmt );
|
|
vsprintf( string, fmt, argptr );
|
|
va_end( argptr );
|
|
|
|
(*m_pfnPrint)( "%s", string );
|
|
}
|
|
|
|
bool CPhonemeExtractorLipSinc::CheckSoundFile( char const *filename )
|
|
{
|
|
TALKBACK_SOUND_FILE_METRICS fm;
|
|
memset( &fm, 0, sizeof( fm ) );
|
|
fm.m_size = sizeof( fm );
|
|
|
|
TALKBACK_ERR err = talkback->TalkBackGetSoundFileMetrics( filename, &fm );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return false;
|
|
}
|
|
|
|
if ( fm.m_canBeAnalyzed )
|
|
{
|
|
Printf( "%s: %.2f s, rate %i, bits %i, channels %i\n",
|
|
filename,
|
|
fm.m_duration,
|
|
fm.m_sampleRate,
|
|
fm.m_bitsPerSample,
|
|
fm.m_channelCount );
|
|
}
|
|
|
|
m_flDuration = fm.m_duration;
|
|
if ( m_flDuration > 0 )
|
|
{
|
|
m_flSamplesPerSecond = m_flSampleCount / m_flDuration;
|
|
}
|
|
else
|
|
{
|
|
m_flSamplesPerSecond = 0.0f;
|
|
}
|
|
|
|
m_nBytesPerSample = ( fm.m_bitsPerSample >> 3 );
|
|
|
|
m_flSampleCount /= m_nBytesPerSample;
|
|
|
|
m_nBytesPerSample /= fm.m_channelCount;
|
|
|
|
return fm.m_canBeAnalyzed ? true : false;
|
|
}
|
|
|
|
typedef IImsHelper *(*pfnImsHelper)(void);
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose:
|
|
// Output : Returns true on success, false on failure.
|
|
//-----------------------------------------------------------------------------
|
|
bool CPhonemeExtractorLipSinc::InitLipSinc( void )
|
|
{
|
|
if ( GetInitialized() )
|
|
{
|
|
return true;
|
|
}
|
|
|
|
m_hHelper = LoadLibrary( "ims_helper.dll" );
|
|
if ( !m_hHelper )
|
|
{
|
|
return false;
|
|
}
|
|
|
|
pfnImsHelper factory = (pfnImsHelper)::GetProcAddress( m_hHelper, "GetImsHelper" );
|
|
if ( !factory )
|
|
{
|
|
FreeLibrary( m_hHelper );
|
|
return false;
|
|
}
|
|
|
|
talkback = reinterpret_cast< IImsHelper * >( (*factory)() );
|
|
if ( !talkback )
|
|
{
|
|
FreeLibrary( m_hHelper );
|
|
return false;
|
|
}
|
|
|
|
char szExeName[ MAX_PATH ];
|
|
szExeName[0] = 0;
|
|
GetModuleFileName( (HMODULE)0, szExeName, sizeof( szExeName ) );
|
|
|
|
char szBaseDir[ MAX_PATH ];
|
|
Q_strncpy( szBaseDir, szExeName, sizeof( szBaseDir ) );
|
|
|
|
Q_StripLastDir( szBaseDir, sizeof( szBaseDir ) );
|
|
Q_StripTrailingSlash( szBaseDir );
|
|
Q_strlower( szBaseDir );
|
|
|
|
char coreDataDir[ 512 ];
|
|
Q_snprintf( coreDataDir, sizeof( coreDataDir ), "%s\\lipsinc_data\\",
|
|
szBaseDir );
|
|
Q_FixSlashes( coreDataDir );
|
|
|
|
char szCheck[ 512 ];
|
|
Q_snprintf( szCheck, sizeof( szCheck ), "%sDtC6dal.dat", coreDataDir );
|
|
struct __stat64 buf;
|
|
|
|
if ( _stat64( szCheck, &buf ) != 0 )
|
|
{
|
|
Q_snprintf( coreDataDir, sizeof( coreDataDir ), "%s\\bin\\lipsinc_data\\",
|
|
szBaseDir );
|
|
Q_FixSlashes( coreDataDir );
|
|
Q_snprintf( szCheck, sizeof( szCheck ), "%sDtC6dal.dat", coreDataDir );
|
|
|
|
if ( _stat64( szCheck, &buf ) != 0 )
|
|
{
|
|
Error( "Unable to find talkback data files in %s.", coreDataDir );
|
|
}
|
|
}
|
|
|
|
TALKBACK_ERR err;
|
|
|
|
err = talkback->TalkBackStartupLibrary( coreDataDir );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
FreeLibrary( m_hHelper );
|
|
return false;
|
|
}
|
|
|
|
long verMajor = 0;
|
|
long verMinor = 0;
|
|
long verRevision = 0;
|
|
|
|
err = talkback->TalkBackGetVersion(
|
|
&verMajor,
|
|
&verMinor,
|
|
&verRevision);
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
FreeLibrary( m_hHelper );
|
|
return false;
|
|
}
|
|
|
|
Printf( "Lipsinc TalkBack Version %i.%i.%i\n", verMajor, verMinor, verRevision );
|
|
|
|
m_bInitialized = true;
|
|
|
|
return true;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose:
|
|
//-----------------------------------------------------------------------------
|
|
void CPhonemeExtractorLipSinc::ShutdownLipSinc( void )
|
|
{
|
|
// HACK HACK: This seems to crash on exit sometimes
|
|
__try
|
|
{
|
|
talkback->TalkBackShutdownLibrary();
|
|
|
|
FreeLibrary( m_hHelper );
|
|
}
|
|
__except(EXCEPTION_EXECUTE_HANDLER )
|
|
{
|
|
OutputDebugString( "----> Crash shutting down TALKBACK sdk, exception caught and ignored\n" );
|
|
}
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose:
|
|
// Input : inwords -
|
|
// Output : char const
|
|
//-----------------------------------------------------------------------------
|
|
char const *CPhonemeExtractorLipSinc::ConstructInputSentence( CSentence& inwords )
|
|
{
|
|
static char sentence[ 16384 ];
|
|
|
|
sentence[ 0 ] = 0;
|
|
|
|
int last = inwords.m_Words.Count() - 1;
|
|
|
|
for ( int i = 0 ; i <= last; i++ )
|
|
{
|
|
CWordTag *w = inwords.m_Words[ i ];
|
|
|
|
strcat( sentence, w->GetWord() );
|
|
if ( i != last )
|
|
{
|
|
strcat( sentence, " " );
|
|
}
|
|
}
|
|
|
|
if ( inwords.m_Words.Count() == 1 &&
|
|
!Q_strnicmp( inwords.GetText(), TEXTLESS_WORDNAME, Q_strlen( TEXTLESS_WORDNAME ) ) )
|
|
{
|
|
sentence[ 0 ] = 0;
|
|
}
|
|
|
|
return sentence;
|
|
}
|
|
|
|
bool CPhonemeExtractorLipSinc::AttemptAnalysis( TALKBACK_ANALYSIS **ppAnalysis, char const *wavfile, CSentence& inwords )
|
|
{
|
|
*ppAnalysis = NULL;
|
|
|
|
TALKBACK_ANALYSIS_SETTINGS settings;
|
|
memset( &settings, 0, sizeof( settings ) );
|
|
|
|
// Set this field to sizeof(TALKBACK_ANALYSIS_SETTINGS) before using the
|
|
// structure.
|
|
settings.fSize = sizeof( TALKBACK_ANALYSIS_SETTINGS );
|
|
|
|
|
|
// Default value: 30 (frames per second).
|
|
settings.fFrameRate = 100;
|
|
// Set this to 1 to optimize for flipbook output, 0 to do analysis normally.
|
|
//
|
|
// Default value: 0 (normal analysis).
|
|
settings.fOptimizeForFlipbook = 0;
|
|
// Set this to -1 to seed the random number generator with the current time.
|
|
// Any other number will be used directly for the random number seed, which
|
|
// is useful if you want repeatable speech gestures. This value does not
|
|
// influence lip-synching at all.
|
|
//
|
|
// Default value: -1 (use current time).
|
|
settings.fRandomSeed = -1;
|
|
// Path to the configuration (.INI) file with phoneme-to-speech-target
|
|
// mapping. Set this to NULL to use the default mapping.
|
|
//
|
|
// Default value: NULL (use default mapping).
|
|
settings.fConfigFile = NULL;
|
|
|
|
char const *text = ConstructInputSentence( inwords );
|
|
|
|
Printf( "Analyzing: \"%s\"\n", text[ 0 ] ? text : TEXTLESS_WORDNAME );
|
|
|
|
TALKBACK_ERR err = talkback->TalkBackGetAnalysis(
|
|
ppAnalysis,
|
|
wavfile,
|
|
text,
|
|
&settings );
|
|
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return false;
|
|
}
|
|
|
|
Printf( "Analysis successful...\n" );
|
|
|
|
return true;
|
|
}
|
|
|
|
typedef struct
|
|
{
|
|
TALKBACK_PHONEME phoneme;
|
|
char const *string;
|
|
} TBPHONEMES_t;
|
|
|
|
static TBPHONEMES_t g_TBPhonemeList[]=
|
|
{
|
|
{ TALKBACK_PHONEME_IY, "iy" },
|
|
{ TALKBACK_PHONEME_IH, "ih" },
|
|
{ TALKBACK_PHONEME_EH, "eh" },
|
|
{ TALKBACK_PHONEME_EY, "ey" },
|
|
{ TALKBACK_PHONEME_AE, "ae" },
|
|
{ TALKBACK_PHONEME_AA, "aa" },
|
|
{ TALKBACK_PHONEME_AW, "aw" },
|
|
{ TALKBACK_PHONEME_AY, "ay" },
|
|
{ TALKBACK_PHONEME_AH, "ah" },
|
|
{ TALKBACK_PHONEME_AO, "ao" },
|
|
{ TALKBACK_PHONEME_OY, "oy" },
|
|
{ TALKBACK_PHONEME_OW, "ow" },
|
|
{ TALKBACK_PHONEME_UH, "uh" },
|
|
{ TALKBACK_PHONEME_UW, "uw" },
|
|
{ TALKBACK_PHONEME_ER, "er" },
|
|
{ TALKBACK_PHONEME_AX, "ax" },
|
|
{ TALKBACK_PHONEME_S, "s" },
|
|
{ TALKBACK_PHONEME_SH, "sh" },
|
|
{ TALKBACK_PHONEME_Z, "z" },
|
|
{ TALKBACK_PHONEME_ZH, "zh" },
|
|
{ TALKBACK_PHONEME_F, "f" },
|
|
{ TALKBACK_PHONEME_TH, "th" },
|
|
{ TALKBACK_PHONEME_V, "v" },
|
|
{ TALKBACK_PHONEME_DH, "dh" },
|
|
{ TALKBACK_PHONEME_M, "m" },
|
|
{ TALKBACK_PHONEME_N, "n" },
|
|
{ TALKBACK_PHONEME_NG, "ng" },
|
|
{ TALKBACK_PHONEME_L, "l" },
|
|
{ TALKBACK_PHONEME_R, "r" },
|
|
{ TALKBACK_PHONEME_W, "w" },
|
|
{ TALKBACK_PHONEME_Y, "y" },
|
|
{ TALKBACK_PHONEME_HH, "hh" },
|
|
{ TALKBACK_PHONEME_B, "b" },
|
|
{ TALKBACK_PHONEME_D, "d" },
|
|
{ TALKBACK_PHONEME_JH, "jh" },
|
|
{ TALKBACK_PHONEME_G, "g" },
|
|
{ TALKBACK_PHONEME_P, "p" },
|
|
{ TALKBACK_PHONEME_T, "t" },
|
|
{ TALKBACK_PHONEME_K, "k" },
|
|
{ TALKBACK_PHONEME_CH, "ch" },
|
|
{ TALKBACK_PHONEME_SIL, "<sil>" },
|
|
{ -1, NULL }
|
|
};
|
|
|
|
char const *TBPhonemeToString( TALKBACK_PHONEME phoneme )
|
|
{
|
|
if ( phoneme < TALKBACK_PHONEME_FIRST || phoneme > TALKBACK_PHONEME_LAST )
|
|
{
|
|
return "Bogus";
|
|
}
|
|
|
|
TBPHONEMES_t *item = &g_TBPhonemeList[ phoneme ];
|
|
return item->string;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose:
|
|
// Input : *analysis -
|
|
// time -
|
|
// start -
|
|
// Output : int
|
|
//-----------------------------------------------------------------------------
|
|
int CPhonemeExtractorLipSinc::GetPhonemeIndexAtWord( TALKBACK_ANALYSIS *analysis, double time, bool start )
|
|
{
|
|
long count;
|
|
|
|
TALKBACK_ERR err = talkback->TalkBackGetNumPhonemes( analysis, &count );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return -1;
|
|
}
|
|
|
|
if ( count <= 0L )
|
|
return -1;
|
|
|
|
// Bogus
|
|
if ( count >= 100000L )
|
|
return -1;
|
|
|
|
for ( int i = 0; i < (int)count; i++ )
|
|
{
|
|
TALKBACK_PHONEME tbPhoneme = TALKBACK_PHONEME_INVALID;
|
|
err = talkback->TalkBackGetPhonemeEnum( analysis, i, &tbPhoneme );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
continue;
|
|
}
|
|
|
|
double t;
|
|
|
|
if ( start )
|
|
{
|
|
err = talkback->TalkBackGetPhonemeStartTime( analysis, i, &t );
|
|
}
|
|
else
|
|
{
|
|
err = talkback->TalkBackGetPhonemeEndTime( analysis, i, &t );
|
|
}
|
|
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
continue;
|
|
}
|
|
|
|
if ( t == time )
|
|
{
|
|
return i;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose:
|
|
// Input : *analysis -
|
|
// starttime -
|
|
// Output : int
|
|
//-----------------------------------------------------------------------------
|
|
int CPhonemeExtractorLipSinc::GetPhonemeIndexAtWordStart( TALKBACK_ANALYSIS *analysis, double starttime )
|
|
{
|
|
return GetPhonemeIndexAtWord( analysis, starttime, true );
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose:
|
|
// Input : *analysis -
|
|
// endtime -
|
|
// Output : int
|
|
//-----------------------------------------------------------------------------
|
|
int CPhonemeExtractorLipSinc::GetPhonemeIndexAtWordEnd( TALKBACK_ANALYSIS *analysis, double endtime )
|
|
{
|
|
return GetPhonemeIndexAtWord( analysis, endtime, false );
|
|
}
|
|
|
|
CPhonemeExtractorLipSinc::CAnalyzedPhoneme *CPhonemeExtractorLipSinc::GetAnalyzedPhoneme( TALKBACK_ANALYSIS *analysis, int index )
|
|
{
|
|
static CAnalyzedPhoneme p;
|
|
|
|
memset( &p, 0, sizeof( p ) );
|
|
|
|
TALKBACK_PHONEME tb;
|
|
|
|
TALKBACK_ERR err = talkback->TalkBackGetPhonemeEnum( analysis, index, &tb );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return NULL;
|
|
}
|
|
|
|
strcpy( p.phoneme, TBPhonemeToString( tb ) );
|
|
|
|
err = talkback->TalkBackGetPhonemeStartTime( analysis, index, &p.starttime );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return NULL;
|
|
}
|
|
err = talkback->TalkBackGetPhonemeEndTime( analysis, index, &p.endtime );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return NULL;
|
|
}
|
|
|
|
return &p;
|
|
}
|
|
|
|
CPhonemeExtractorLipSinc::CAnalyzedWord *CPhonemeExtractorLipSinc::GetAnalyzedWord( TALKBACK_ANALYSIS *analysis, int index )
|
|
{
|
|
static CAnalyzedWord w;
|
|
|
|
memset( &w, 0, sizeof( w ) );
|
|
|
|
long chars = sizeof( w.buffer );
|
|
|
|
TALKBACK_ERR err = talkback->TalkBackGetWord( analysis, index, chars, w.buffer );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return NULL;
|
|
}
|
|
|
|
err = talkback->TalkBackGetWordStartTime( analysis, index, &w.starttime );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return NULL;
|
|
}
|
|
err = talkback->TalkBackGetWordEndTime( analysis, index, &w.endtime );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return NULL;
|
|
}
|
|
|
|
return &w;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose:
|
|
// Input : *w1 -
|
|
// *w2 -
|
|
// Output : Returns true on success, false on failure.
|
|
//-----------------------------------------------------------------------------
|
|
bool FuzzyWordMatch( char const *w1, char const *w2 )
|
|
{
|
|
int len1 = strlen( w1 );
|
|
int len2 = strlen( w2 );
|
|
|
|
int minlen = min( len1, len2 );
|
|
|
|
// Found a match
|
|
if ( !strnicmp( w1, w2, minlen ) )
|
|
return true;
|
|
|
|
int letterdiff = abs( len1 - len2 );
|
|
// More than three letters different, don't bother
|
|
if ( letterdiff > 5 )
|
|
return false;
|
|
|
|
// Compute a "delta"
|
|
char *p1 = (char *)w1;
|
|
char *p2 = (char *)w2;
|
|
|
|
CUtlVector <char> word1;
|
|
CUtlVector <char> word2;
|
|
|
|
while ( *p1 )
|
|
{
|
|
if ( V_isalpha( *p1 ) )
|
|
{
|
|
word1.AddToTail( *p1 );
|
|
}
|
|
p1++;
|
|
}
|
|
|
|
while ( *p2 )
|
|
{
|
|
if ( V_isalpha( *p2 ) )
|
|
{
|
|
word2.AddToTail( *p2 );
|
|
}
|
|
p2++;
|
|
}
|
|
|
|
int i;
|
|
for ( i = 0; i < word1.Count(); i++ )
|
|
{
|
|
char c = word1[ i ];
|
|
|
|
// See if c is in word 2, if so subtract it out
|
|
int idx = word2.Find( c );
|
|
|
|
if ( idx != word2.InvalidIndex() )
|
|
{
|
|
word2.Remove( idx );
|
|
}
|
|
}
|
|
|
|
if ( word2.Count() <= letterdiff )
|
|
return true;
|
|
|
|
word2.RemoveAll();
|
|
|
|
while ( *p2 )
|
|
{
|
|
if ( V_isalpha( *p2 ) )
|
|
{
|
|
word2.AddToTail( *p2 );
|
|
}
|
|
p2++;
|
|
}
|
|
|
|
for ( i = 0; i < word2.Count(); i++ )
|
|
{
|
|
char c = word2[ i ];
|
|
|
|
// See if c is in word 2, if so subtract it out
|
|
int idx = word1.Find( c );
|
|
|
|
if ( idx != word1.InvalidIndex() )
|
|
{
|
|
word1.Remove( idx );
|
|
}
|
|
}
|
|
|
|
if ( word1.Count() <= letterdiff )
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose: For foreign language stuff, if inwords is empty, process anyway...
|
|
// Input : *analysis -
|
|
// outwords -
|
|
//-----------------------------------------------------------------------------
|
|
void CPhonemeExtractorLipSinc::ProcessWordsTextless( TALKBACK_ANALYSIS *analysis, CSentence& outwords )
|
|
{
|
|
long count;
|
|
|
|
TALKBACK_ERR err = talkback->TalkBackGetNumPhonemes( analysis, &count );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return;
|
|
}
|
|
|
|
CWordTag *newWord = new CWordTag;
|
|
|
|
newWord->SetWord( TEXTLESS_WORDNAME );
|
|
|
|
float starttime = 0.0f;
|
|
float endtime = 1.0f;
|
|
|
|
|
|
for ( int i = 0; i < count; ++i )
|
|
{
|
|
// Get phoneme and timing info
|
|
CAnalyzedPhoneme *ph = GetAnalyzedPhoneme( analysis, i );
|
|
if ( !ph )
|
|
continue;
|
|
|
|
CPhonemeTag *ptag = new CPhonemeTag;
|
|
|
|
if ( i == 0 || ( ph->starttime < starttime ) )
|
|
{
|
|
starttime = ph->starttime;
|
|
}
|
|
|
|
if ( i == 0 || ( ph->endtime > endtime ) )
|
|
{
|
|
endtime = ph->endtime;
|
|
}
|
|
|
|
ptag->SetStartTime( ph->starttime );
|
|
ptag->SetEndTime( ph->endtime );
|
|
|
|
ptag->m_uiStartByte = ComputeByteFromTime( ph->starttime );
|
|
ptag->m_uiEndByte = ComputeByteFromTime( ph->endtime );
|
|
|
|
ptag->SetTag( ph->phoneme );
|
|
ptag->SetPhonemeCode( TextToPhoneme( ptag->GetTag() ) );
|
|
|
|
newWord->m_Phonemes.AddToTail( ptag );
|
|
}
|
|
|
|
newWord->m_flStartTime = starttime;
|
|
newWord->m_flEndTime = endtime;
|
|
|
|
newWord->m_uiStartByte = ComputeByteFromTime( starttime );
|
|
newWord->m_uiEndByte = ComputeByteFromTime( endtime );
|
|
|
|
outwords.Reset();
|
|
outwords.AddWordTag( newWord );
|
|
outwords.SetTextFromWords();
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose:
|
|
// Input : *analysis -
|
|
// inwords -
|
|
// outwords -
|
|
//-----------------------------------------------------------------------------
|
|
void CPhonemeExtractorLipSinc::ProcessWords( TALKBACK_ANALYSIS *analysis, CSentence& inwords, CSentence& outwords )
|
|
{
|
|
long count;
|
|
|
|
TALKBACK_ERR err = talkback->TalkBackGetNumWords( analysis, &count );
|
|
if ( err != TALKBACK_NOERR )
|
|
{
|
|
DescribeError( err );
|
|
return;
|
|
}
|
|
|
|
if ( count <= 0L )
|
|
{
|
|
if ( inwords.m_Words.Count() == 0 ||
|
|
!Q_strnicmp( inwords.GetText(), TEXTLESS_WORDNAME, Q_strlen( TEXTLESS_WORDNAME ) ) )
|
|
{
|
|
ProcessWordsTextless( analysis, outwords );
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Bogus
|
|
if ( count >= 100000L )
|
|
return;
|
|
|
|
int inwordpos = 0;
|
|
int awordpos = 0;
|
|
|
|
outwords.Reset();
|
|
|
|
char previous[ 256 ];
|
|
previous[ 0 ] = 0;
|
|
|
|
while ( inwordpos < inwords.m_Words.Count() )
|
|
{
|
|
CWordTag *in = inwords.m_Words[ inwordpos ];
|
|
|
|
if ( awordpos >= count )
|
|
{
|
|
// Just copy the rest over without phonemes
|
|
CWordTag *copy = new CWordTag( *in );
|
|
|
|
outwords.AddWordTag( copy );
|
|
|
|
inwordpos++;
|
|
continue;
|
|
}
|
|
|
|
// Should never fail
|
|
CAnalyzedWord *w = GetAnalyzedWord( analysis, awordpos );
|
|
if ( !w )
|
|
{
|
|
return;
|
|
}
|
|
|
|
if ( !stricmp( w->buffer, "<SIL>" ) )
|
|
{
|
|
awordpos++;
|
|
continue;
|
|
}
|
|
|
|
char const *check = ApplyTBWordRules( in->GetWord() );
|
|
if ( !FuzzyWordMatch( check, w->buffer ) )
|
|
{
|
|
bool advance_input = true;
|
|
if ( previous[ 0 ] )
|
|
{
|
|
if ( FuzzyWordMatch( previous, w->buffer ) )
|
|
{
|
|
advance_input = false;
|
|
}
|
|
}
|
|
|
|
if ( advance_input )
|
|
{
|
|
inwordpos++;
|
|
}
|
|
awordpos++;
|
|
continue;
|
|
}
|
|
strcpy( previous, check );
|
|
|
|
CWordTag *newWord = new CWordTag;
|
|
|
|
newWord->SetWord( in->GetWord() );
|
|
|
|
newWord->m_flStartTime = w->starttime;
|
|
newWord->m_flEndTime = w->endtime;
|
|
|
|
newWord->m_uiStartByte = ComputeByteFromTime( w->starttime );
|
|
newWord->m_uiEndByte = ComputeByteFromTime( w->endtime );
|
|
|
|
int phonemestart, phonemeend;
|
|
|
|
phonemestart = GetPhonemeIndexAtWordStart( analysis, w->starttime );
|
|
phonemeend = GetPhonemeIndexAtWordEnd( analysis, w->endtime );
|
|
|
|
if ( phonemestart >= 0 && phonemeend >= 0 )
|
|
{
|
|
for ( ; phonemestart <= phonemeend; phonemestart++ )
|
|
{
|
|
// Get phoneme and timing info
|
|
CAnalyzedPhoneme *ph = GetAnalyzedPhoneme( analysis, phonemestart );
|
|
if ( !ph )
|
|
continue;
|
|
|
|
CPhonemeTag *ptag = new CPhonemeTag;
|
|
ptag->SetStartTime( ph->starttime );
|
|
ptag->SetEndTime( ph->endtime );
|
|
|
|
ptag->m_uiStartByte = ComputeByteFromTime( ph->starttime );
|
|
ptag->m_uiEndByte = ComputeByteFromTime( ph->endtime );
|
|
|
|
ptag->SetTag( ph->phoneme );
|
|
ptag->SetPhonemeCode( TextToPhoneme( ptag->GetTag() ) );
|
|
|
|
newWord->m_Phonemes.AddToTail( ptag );
|
|
}
|
|
}
|
|
|
|
outwords.AddWordTag( newWord );
|
|
inwordpos++;
|
|
awordpos++;
|
|
}
|
|
}
|
|
|
|
char const *CPhonemeExtractorLipSinc::ApplyTBWordRules( char const *word )
|
|
{
|
|
static char outword[ 256 ];
|
|
|
|
char const *in = word;
|
|
char *out = outword;
|
|
|
|
while ( *in && ( ( out - outword ) <= 255 ) )
|
|
{
|
|
if ( *in == '\t' ||
|
|
*in == ' ' ||
|
|
*in == '\n' ||
|
|
*in == '-' ||
|
|
*in == '.' ||
|
|
*in == ',' ||
|
|
*in == ';' ||
|
|
*in == '?' ||
|
|
*in == '"' ||
|
|
*in == ':' ||
|
|
*in == '(' ||
|
|
*in == ')' )
|
|
{
|
|
in++;
|
|
*out++ = ' ';
|
|
continue;
|
|
}
|
|
|
|
if ( !V_isprint( *in ) )
|
|
{
|
|
in++;
|
|
continue;
|
|
}
|
|
|
|
if ( *in >= 128 )
|
|
{
|
|
in++;
|
|
continue;
|
|
}
|
|
|
|
// Skip numbers
|
|
if ( *in >= '0' && *in <= '9' )
|
|
{
|
|
in++;
|
|
continue;
|
|
}
|
|
|
|
// Convert all letters to upper case
|
|
if ( *in >= 'a' && *in <= 'z' )
|
|
{
|
|
*out++ = ( *in++ ) - 'a' + 'A';
|
|
continue;
|
|
}
|
|
|
|
if ( *in >= 'A' && *in <= 'Z' )
|
|
{
|
|
*out++ = *in++;
|
|
continue;
|
|
}
|
|
|
|
if ( *in == '\'' )
|
|
{
|
|
*out++ = *in++;
|
|
continue;
|
|
}
|
|
|
|
in++;
|
|
}
|
|
|
|
*out = 0;
|
|
|
|
return outword;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Purpose: Given a wavfile and a list of inwords, determines the word/phonene
|
|
// sample counts for the sentce
|
|
// Output : SR_RESULT
|
|
//-----------------------------------------------------------------------------
|
|
SR_RESULT CPhonemeExtractorLipSinc::Extract(
|
|
const char *wavfile,
|
|
int numsamples,
|
|
void (*pfnPrint)( const char *fmt, ... ),
|
|
CSentence& inwords,
|
|
CSentence& outwords )
|
|
{
|
|
// g_enableTalkBackDebuggingOutput = 1;
|
|
|
|
m_pfnPrint = pfnPrint;
|
|
|
|
if ( !InitLipSinc() )
|
|
{
|
|
return SR_RESULT_ERROR;
|
|
}
|
|
|
|
m_flSampleCount = numsamples;
|
|
|
|
if ( !CheckSoundFile( wavfile ) )
|
|
{
|
|
FreeLibrary( m_hHelper );
|
|
return SR_RESULT_ERROR;
|
|
}
|
|
|
|
TALKBACK_ANALYSIS *analysis = NULL;
|
|
|
|
if ( !AttemptAnalysis( &analysis, wavfile, inwords ) )
|
|
{
|
|
FreeLibrary( m_hHelper );
|
|
return SR_RESULT_FAILED;
|
|
}
|
|
|
|
if ( strlen( inwords.GetText() ) <= 0 )
|
|
{
|
|
inwords.SetTextFromWords();
|
|
}
|
|
|
|
outwords = inwords;
|
|
|
|
// Examine data
|
|
ProcessWords( analysis, inwords, outwords );
|
|
|
|
if ( analysis )
|
|
{
|
|
talkback->TalkBackFreeAnalysis( &analysis );
|
|
}
|
|
|
|
return SR_RESULT_SUCCESS;
|
|
}
|
|
|
|
EXPOSE_SINGLE_INTERFACE( CPhonemeExtractorLipSinc, IPhonemeExtractor, VPHONEME_EXTRACTOR_INTERFACE );
|