Counter Strike : Global Offensive Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1426 lines
36 KiB

  1. //========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
  2. //
  3. // Purpose:
  4. //
  5. // $NoKeywords: $
  6. //
  7. //=============================================================================//
  8. // extracephonemes.cpp : Defines the entry point for the console application.
  9. //
  10. #define PROTECTED_THINGS_DISABLE
  11. #include "tier0/wchartypes.h"
  12. #include <stdio.h>
  13. #include <windows.h>
  14. #include <tchar.h>
  15. #include "sphelper.h"
  16. #include "spddkhlp.h"
  17. // ATL Header Files
  18. #include <atlbase.h>
  19. // Face poser and util includes
  20. #include "utlvector.h"
  21. #include "phonemeextractor/PhonemeExtractor.h"
  22. #include "PhonemeConverter.h"
  23. #include "sentence.h"
  24. #include "tier0/dbg.h"
  25. #include "tier0/icommandline.h"
  26. #include "FileSystem.h"
  27. // Extract phoneme grammar id
  28. #define EP_GRAM_ID 101
  29. // First rule of dynamic sentence rule set
  30. #define DYN_SENTENCERULE 102
  31. // # of milliseconds to allow for processing before timeout
  32. #define SR_WAVTIMEOUT 4000
  33. // Weight tag for rule to rule word/rule transitions
  34. #define CONFIDENCE_WEIGHT 0.0f
  35. //#define LOGGING 1
  36. #define LOGFILE "c:\\fp.log"
  // Truncates the debug log file (LOGFILE) by opening it for writing and closing it again.
  // Compiles to an empty function unless LOGGING is defined to a non-zero value.
  void LogReset( void )
  {
  #if LOGGING
      FILE *logHandle = fopen( LOGFILE, "w" );
      if ( logHandle != NULL )
      {
          fclose( logHandle );
      }
  #endif
  }
  // printf-style string formatter; only declared here (its definition is not in this part of the file).
  // Used below to build the messages handed to OutputDebugString.
  45. char *va( const char *fmt, ... );
  // Defines the LOG_PhonemeExtractor logging channel (named "PhonemeExtractor") that the Log_Msg calls below write to.
  46. DEFINE_LOGGING_CHANNEL_NO_TAGS( LOG_PhonemeExtractor, "PhonemeExtractor" );
  47. //-----------------------------------------------------------------------------
  48. // Purpose:
  49. // Input : *words -
  50. //-----------------------------------------------------------------------------
  51. void LogWords( CSentence& sentence )
  52. {
  53. Log_Msg( LOG_PhonemeExtractor, "Wordcount == %i\n", sentence.m_Words.Count() );
  54. for ( int i = 0; i < sentence.m_Words.Count(); i++ )
  55. {
  56. const CWordTag *w = sentence.m_Words[ i ];
  57. Log_Msg( LOG_PhonemeExtractor, "Word %s %u to %u\n", w->GetWord(), w->m_uiStartByte, w->m_uiEndByte );
  58. }
  59. }
  60. //-----------------------------------------------------------------------------
  61. // Purpose:
  62. // Input : *phonemes -
  63. //-----------------------------------------------------------------------------
  64. void LogPhonemes( CSentence& sentence )
  65. {
  66. return;
  67. Log_Msg( LOG_PhonemeExtractor, "Phonemecount == %i\n", sentence.CountPhonemes() );
  68. for ( int i = 0; i < sentence.m_Words.Count(); i++ )
  69. {
  70. const CWordTag *w = sentence.m_Words[ i ];
  71. for ( int j = 0; j < w->m_Phonemes.Count(); j++ )
  72. {
  73. const CPhonemeTag *p = w->m_Phonemes[ j ];
  74. Log_Msg( LOG_PhonemeExtractor, "Phoneme %s %u to %u\n", p->GetTag(), p->m_uiStartByte, p->m_uiEndByte );
  75. }
  76. }
  77. }
  // Scale factor of ten million.  No trailing semicolon, so the macro can be used
  // inside larger expressions and argument lists.
  // NOTE(review): presumably converts 100ns units to seconds - confirm at the use sites.
  #define NANO_CONVERT 10000000.0f
  79. //-----------------------------------------------------------------------------
  80. // Purpose: Walk list of words and phonemes and create phoneme tags in CSentence object
  81. // FIXME: Right now, phonemes are assumed to evenly space out across a word.
  82. // Input : *converter -
  83. // result -
  84. // sentence -
  85. //-----------------------------------------------------------------------------
  86. void EnumeratePhonemes( ISpPhoneConverter *converter, const ISpRecoResult* result, CSentence& sentence )
  87. {
  88. USES_CONVERSION;
  89. // Grab access to element container
  90. ISpPhrase *phrase = ( ISpPhrase * )result;
  91. if ( !phrase )
  92. return;
  93. SPPHRASE *pElements;
  94. if ( !SUCCEEDED( phrase->GetPhrase( &pElements ) ) )
  95. return;
  96. // Only use it if it's better/same size as what we already had on-hand
  97. if ( pElements->Rule.ulCountOfElements > 0 )
  98. //(unsigned int)( sentence.m_Words.Size() - sentence.GetWordBase() ) )
  99. {
  100. sentence.ResetToBase();
  101. // Walk list of words
  102. for ( ULONG i = 0; i < pElements->Rule.ulCountOfElements; i++ )
  103. {
  104. unsigned int wordstart, wordend;
  105. // Get start/end sample index
  106. wordstart = pElements->pElements[i].ulAudioStreamOffset + (unsigned int)pElements->ullAudioStreamPosition;
  107. wordend = wordstart + pElements->pElements[i].ulAudioSizeBytes;
  108. // Create word tag
  109. CWordTag *w = new CWordTag( W2T( pElements->pElements[i].pszDisplayText ) );
  110. Assert( w );
  111. w->m_uiStartByte = wordstart;
  112. w->m_uiEndByte = wordend;
  113. sentence.AddWordTag( w );
  114. // Count # of phonemes in this word
  115. SPPHONEID pstr[ 2 ];
  116. pstr[ 1 ] = 0;
  117. WCHAR wszPhoneme[ SP_MAX_PRON_LENGTH ];
  118. const SPPHONEID *current;
  119. SPPHONEID phoneme;
  120. current = pElements->pElements[i].pszPronunciation;
  121. float total_weight = 0.0f;
  122. while ( 1 )
  123. {
  124. phoneme = *current++;
  125. if ( !phoneme )
  126. break;
  127. pstr[ 0 ] = phoneme;
  128. wszPhoneme[ 0 ] = L'\0';
  129. converter->IdToPhone( pstr, wszPhoneme );
  130. total_weight += WeightForPhoneme( W2A( wszPhoneme ) );
  131. }
  132. current = pElements->pElements[i].pszPronunciation;
  133. // Decide # of bytes/phoneme weight
  134. float psize = 0;
  135. if ( total_weight )
  136. {
  137. psize = ( wordend - wordstart ) / total_weight;
  138. }
  139. int number = 0;
  140. // Re-walk the phoneme list and create true phoneme tags
  141. float startWeight = 0.0f;
  142. while ( 1 )
  143. {
  144. phoneme = *current++;
  145. if ( !phoneme )
  146. break;
  147. pstr[ 0 ] = phoneme;
  148. wszPhoneme[ 0 ] = L'\0';
  149. converter->IdToPhone( pstr, wszPhoneme );
  150. CPhonemeTag *p = new CPhonemeTag( W2A( wszPhoneme ) );
  151. Assert( p );
  152. float weight = WeightForPhoneme( W2A( wszPhoneme ) );
  153. p->m_uiStartByte = wordstart + (int)( startWeight * psize );
  154. p->m_uiEndByte = p->m_uiStartByte + (int)( psize * weight );
  155. startWeight += weight;
  156. // Convert to IPA phoneme code
  157. p->SetPhonemeCode( TextToPhoneme( p->GetTag() ) );
  158. sentence.AddPhonemeTag( w, p );
  159. number++;
  160. }
  161. }
  162. }
  163. // Free memory
  164. ::CoTaskMemFree(pElements);
  165. }
  166. //-----------------------------------------------------------------------------
  167. // Purpose: Create rules for each word in the reference sentence
  168. //-----------------------------------------------------------------------------
  169. typedef struct
  170. {
  171. int ruleId;
  172. SPSTATEHANDLE hRule;
  173. CSpDynamicString word;
  174. char plaintext[ 256 ];
  175. } WORDRULETYPE;
  176. //-----------------------------------------------------------------------------
  177. // Purpose: Creates start for word of sentence
  178. // Input : cpRecoGrammar -
  179. // *root -
  180. // *rules -
  181. // word -
  182. //-----------------------------------------------------------------------------
  183. void AddWordRule( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules, CSpDynamicString& word )
  184. {
  185. USES_CONVERSION;
  186. HRESULT hr;
  187. WORDRULETYPE *newrule;
  188. int idx = (*rules).AddToTail();
  189. newrule = &(*rules)[ idx ];
  190. newrule->ruleId = DYN_SENTENCERULE + idx + 1;
  191. newrule->word = word;
  192. strcpy( newrule->plaintext, W2T( word ) );
  193. // Create empty rule
  194. hr = cpRecoGrammar->CreateNewState( *root, &newrule->hRule );
  195. Assert( !FAILED( hr ) );
  196. }
  197. //-----------------------------------------------------------------------------
  198. // Purpose:
  199. // Input : cpRecoGrammar -
  200. // *from -
  201. // *to -
  202. //-----------------------------------------------------------------------------
  203. void AddWordTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
  204. {
  205. USES_CONVERSION;
  206. HRESULT hr;
  207. Assert( from );
  208. if ( from && !to )
  209. {
  210. OutputDebugString( va( "Transition from %s to TERM\r\n", from->plaintext ) );
  211. }
  212. else
  213. {
  214. OutputDebugString( va( "Transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
  215. }
  216. hr = cpRecoGrammar->AddWordTransition( from->hRule, to ? to->hRule : NULL, (WCHAR *)from->word, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
  217. Assert( !FAILED( hr ) );
  218. }
  219. //-----------------------------------------------------------------------------
  220. // Purpose:
  221. // Input : cpRecoGrammar -
  222. // *from -
  223. // *to -
  224. //-----------------------------------------------------------------------------
  225. void AddOptionalTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
  226. {
  227. USES_CONVERSION;
  228. HRESULT hr;
  229. Assert( from );
  230. if ( from && !to )
  231. {
  232. OutputDebugString( va( "Opt transition from %s to TERM\r\n", from->plaintext ) );
  233. }
  234. else
  235. {
  236. OutputDebugString( va( "Opt transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
  237. }
  238. hr = cpRecoGrammar->AddWordTransition( from->hRule, to ? to->hRule : NULL, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
  239. Assert( !FAILED( hr ) );
  240. }
  241. #define MAX_WORD_SKIP 1
  242. //-----------------------------------------------------------------------------
  243. // Purpose: Links together all word rule states into a sentence rule CFG
  244. // Input : singleword -
  245. // cpRecoGrammar -
  246. // *root -
  247. // *rules -
  248. //-----------------------------------------------------------------------------
  249. bool BuildRules( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules )
  250. {
  251. HRESULT hr;
  252. WORDRULETYPE *rule, *next;
  253. int numrules = (*rules).Count();
  254. rule = &(*rules)[ 0 ];
  255. // Add transition
  256. hr = cpRecoGrammar->AddWordTransition( *root, rule->hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
  257. Assert( !FAILED( hr ) );
  258. for ( int i = 0; i < numrules; i++ )
  259. {
  260. rule = &(*rules)[ i ];
  261. if ( i < numrules - 1 )
  262. {
  263. next = &(*rules)[ i + 1 ];
  264. }
  265. else
  266. {
  267. next = NULL;
  268. }
  269. AddWordTransitionRule( cpRecoGrammar, rule, next );
  270. }
  271. if ( numrules > 1 )
  272. {
  273. for ( int skip = 1; skip <= min( MAX_WORD_SKIP, numrules ); skip++ )
  274. {
  275. OutputDebugString( va( "Opt transition from Root to %s\r\n", (*rules)[ 0 ].plaintext ) );
  276. hr = cpRecoGrammar->AddWordTransition( *root, (*rules)[ 0 ].hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
  277. // Now build rules where you can skip 1 to N intervening words
  278. for ( int i = 1; i < numrules; i++ )
  279. {
  280. // Start at the beginning?
  281. rule = &(*rules)[ i ];
  282. if ( i < numrules - skip )
  283. {
  284. next = &(*rules)[ i + skip ];
  285. }
  286. else
  287. {
  288. continue;
  289. }
  290. // Add transition
  291. AddOptionalTransitionRule( cpRecoGrammar, rule, next );
  292. }
  293. // Go from final rule to end point
  294. AddOptionalTransitionRule( cpRecoGrammar, rule, NULL );
  295. }
  296. }
  297. // Store it
  298. hr = cpRecoGrammar->Commit(NULL);
  299. if ( FAILED( hr ) )
  300. return false;
  301. return true;
  302. }
  303. //-----------------------------------------------------------------------------
  304. // Purpose: Debugging, prints alternate list if one is created
  305. // Input : cpResult -
  306. // (*pfnPrint -
  307. //-----------------------------------------------------------------------------
  308. void PrintAlternates( ISpRecoResult* cpResult, void (*pfnPrint)( const char *fmt, ... ) )
  309. {
  310. ISpPhraseAlt *rgPhraseAlt[ 32 ];
  311. memset( rgPhraseAlt, 0, sizeof( rgPhraseAlt ) );
  312. ULONG ulCount;
  313. ISpPhrase *phrase = ( ISpPhrase * )cpResult;
  314. if ( phrase )
  315. {
  316. SPPHRASE *pElements;
  317. if ( SUCCEEDED( phrase->GetPhrase( &pElements ) ) )
  318. {
  319. if ( pElements->Rule.ulCountOfElements > 0 )
  320. {
  321. HRESULT hr = cpResult->GetAlternates(
  322. pElements->Rule.ulFirstElement,
  323. pElements->Rule.ulCountOfElements,
  324. 32,
  325. rgPhraseAlt,
  326. &ulCount);
  327. Assert( !FAILED( hr ) );
  328. for ( ULONG r = 0 ; r < ulCount; r++ )
  329. {
  330. CSpDynamicString dstrText;
  331. hr = rgPhraseAlt[ r ]->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
  332. Assert( !FAILED( hr ) );
  333. pfnPrint( "[ ALT ]" );
  334. pfnPrint( dstrText.CopyToChar() );
  335. pfnPrint( "\r\n" );
  336. }
  337. }
  338. }
  339. }
  340. for ( int i = 0; i < 32; i++ )
  341. {
  342. if ( rgPhraseAlt[ i ] )
  343. {
  344. rgPhraseAlt[ i ]->Release();
  345. rgPhraseAlt[ i ] = NULL;
  346. }
  347. }
  348. }
  349. void PrintWordsAndPhonemes( CSentence& sentence, void (*pfnPrint)( const char *fmt, ... ) )
  350. {
  351. char sz[ 256 ];
  352. int i;
  353. pfnPrint( "WORDS\r\n\r\n" );
  354. for ( i = 0 ; i < sentence.m_Words.Count(); i++ )
  355. {
  356. CWordTag *word = sentence.m_Words[ i ];
  357. if ( !word )
  358. continue;
  359. sprintf( sz, "<%u - %u> %s\r\n",
  360. word->m_uiStartByte, word->m_uiEndByte, word->GetWord() );
  361. pfnPrint( sz );
  362. for ( int j = 0 ; j < word->m_Phonemes.Count(); j++ )
  363. {
  364. CPhonemeTag *phoneme = word->m_Phonemes[ j ];
  365. if ( !phoneme )
  366. continue;
  367. sprintf( sz, " <%u - %u> %s\r\n",
  368. phoneme->m_uiStartByte, phoneme->m_uiEndByte, phoneme->GetTag() );
  369. pfnPrint( sz );
  370. }
  371. }
  372. pfnPrint( "\r\n" );
  373. }
  374. //-----------------------------------------------------------------------------
  375. // Purpose: Given a wave file and a string of words "text", creates a CFG from the
  376. // sentence and stores the resulting words/phonemes in CSentence
  377. // Input : *wavname -
  378. // text -
  379. // sentence -
  380. // (*pfnPrint -
  381. // Output : SR_RESULT
  382. //-----------------------------------------------------------------------------
  383. SR_RESULT ExtractPhonemes( const char *wavname, CSpDynamicString& text, CSentence& sentence, void (*pfnPrint)( const char *fmt, ...) )
  384. {
  385. // Assume failure
  386. SR_RESULT result = SR_RESULT_ERROR;
  387. if ( text.Length() <= 0 )
  388. {
  389. pfnPrint( "Error: no rule / text specified\n" );
  390. return result;
  391. }
  392. USES_CONVERSION;
  393. HRESULT hr;
  394. CUtlVector < WORDRULETYPE > wordRules;
  395. CComPtr<ISpStream> cpInputStream;
  396. CComPtr<ISpRecognizer> cpRecognizer;
  397. CComPtr<ISpRecoContext> cpRecoContext;
  398. CComPtr<ISpRecoGrammar> cpRecoGrammar;
  399. CComPtr<ISpPhoneConverter> cpPhoneConv;
  400. // Create basic SAPI stream object
  401. // NOTE: The helper SpBindToFile can be used to perform the following operations
  402. hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
  403. if ( FAILED( hr ) )
  404. {
  405. pfnPrint( "Error: SAPI 5.1 Stream object not installed?\n" );
  406. return result;
  407. }
  408. CSpStreamFormat sInputFormat;
  409. // setup stream object with wav file MY_WAVE_AUDIO_FILENAME
  410. // for read-only access, since it will only be access by the SR engine
  411. hr = cpInputStream->BindToFile(
  412. T2W(wavname),
  413. SPFM_OPEN_READONLY,
  414. NULL,
  415. sInputFormat.WaveFormatExPtr(),
  416. SPFEI_ALL_EVENTS );
  417. if ( FAILED( hr ) )
  418. {
  419. pfnPrint( "Error: couldn't open wav file %s\n", wavname );
  420. return result;
  421. }
  422. // Create in-process speech recognition engine
  423. hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
  424. if ( FAILED( hr ) )
  425. {
  426. pfnPrint( "Error: SAPI 5.1 In process recognizer object not installed?\n" );
  427. return result;
  428. }
  429. // Create recognition context to receive events
  430. hr = cpRecognizer->CreateRecoContext(&cpRecoContext);
  431. if ( FAILED( hr ) )
  432. {
  433. pfnPrint( "Error: SAPI 5.1 Unable to create recognizer context\n" );
  434. return result;
  435. }
  436. // Create a grammar
  437. hr = cpRecoContext->CreateGrammar( EP_GRAM_ID, &cpRecoGrammar );
  438. if ( FAILED( hr ) )
  439. {
  440. pfnPrint( "Error: SAPI 5.1 Unable to create recognizer grammar\n" );
  441. return result;
  442. }
  443. LANGID englishID = 0x409; // 1033 decimal
  444. bool userSpecified = false;
  445. LANGID langID = SpGetUserDefaultUILanguage();
  446. // Allow commandline override
  447. if ( CommandLine()->FindParm( "-languageid" ) != 0 )
  448. {
  449. userSpecified = true;
  450. langID = CommandLine()->ParmValue( "-languageid", langID );
  451. }
  452. // Create a phoneme converter ( so we can convert to IPA codes )
  453. hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
  454. if ( FAILED( hr ) )
  455. {
  456. if ( langID != englishID )
  457. {
  458. if ( userSpecified )
  459. {
  460. pfnPrint( "Warning: SAPI 5.1 Unable to create phoneme converter for command line override -languageid %i\n", langID );
  461. }
  462. else
  463. {
  464. pfnPrint( "Warning: SAPI 5.1 Unable to create phoneme converter for default UI language %i\n",langID );
  465. }
  466. // Try english!!!
  467. langID = englishID;
  468. hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
  469. }
  470. if ( FAILED( hr ) )
  471. {
  472. pfnPrint( "Error: SAPI 5.1 Unable to create phoneme converter for English language id %i\n", langID );
  473. return result;
  474. }
  475. else
  476. {
  477. pfnPrint( "Note: SAPI 5.1 Falling back to use english -languageid %i\n", langID );
  478. }
  479. }
  480. else if ( userSpecified )
  481. {
  482. pfnPrint( "Note: SAPI 5.1 Using user specified -languageid %i\n",langID );
  483. }
  484. SPSTATEHANDLE hStateRoot;
  485. // create/re-create Root level rule of grammar
  486. hr = cpRecoGrammar->GetRule(L"Root", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &hStateRoot);
  487. if ( FAILED( hr ) )
  488. {
  489. pfnPrint( "Error: SAPI 5.1 Unable to create root rule\n" );
  490. return result;
  491. }
  492. // Inactivate it so we can alter it
  493. hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
  494. if ( FAILED( hr ) )
  495. {
  496. pfnPrint( "Error: SAPI 5.1 Unable to deactivate grammar rules\n" );
  497. return result;
  498. }
  499. // Create the rule set from the words in text
  500. {
  501. CSpDynamicString currentWord;
  502. WCHAR *pos = ( WCHAR * )text;
  503. WCHAR str[ 2 ];
  504. str[1]= 0;
  505. while ( *pos )
  506. {
  507. if ( *pos == L' ' /*|| *pos == L'.' || *pos == L'-'*/ )
  508. {
  509. // Add word to rule set
  510. if ( currentWord.Length() > 0 )
  511. {
  512. AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
  513. currentWord.Clear();
  514. }
  515. pos++;
  516. continue;
  517. }
  518. // Skip anything that's inside a [ xxx ] pair.
  519. if ( *pos == L'[' )
  520. {
  521. while ( *pos && *pos != L']' )
  522. {
  523. pos++;
  524. }
  525. if ( *pos )
  526. {
  527. pos++;
  528. }
  529. continue;
  530. }
  531. str[ 0 ] = *pos;
  532. currentWord.Append( str );
  533. pos++;
  534. }
  535. if ( currentWord.Length() > 0 )
  536. {
  537. AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
  538. }
  539. if ( wordRules.Count() <= 0 )
  540. {
  541. pfnPrint( "Error: Text %s contained no usable words\n", text );
  542. return result;
  543. }
  544. // Build all word to word transitions in the grammar
  545. if ( !BuildRules( cpRecoGrammar, &hStateRoot, &wordRules ) )
  546. {
  547. pfnPrint( "Error: Rule set for %s could not be generated\n", text );
  548. return result;
  549. }
  550. }
  551. // check for recognitions and end of stream event
  552. const ULONGLONG ullInterest =
  553. SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM) | SPFEI(SPEI_FALSE_RECOGNITION) |
  554. SPFEI(SPEI_PHRASE_START ) | SPFEI(SPEI_HYPOTHESIS ) | SPFEI(SPEI_INTERFERENCE) ;
  555. hr = cpRecoContext->SetInterest( ullInterest, ullInterest );
  556. if ( FAILED( hr ) )
  557. {
  558. pfnPrint( "Error: SAPI 5.1 Unable to set interest level\n" );
  559. return result;
  560. }
  561. // use Win32 events for command-line style application
  562. hr = cpRecoContext->SetNotifyWin32Event();
  563. if ( FAILED( hr ) )
  564. {
  565. pfnPrint( "Error: SAPI 5.1 Unable to set win32 notify event\n" );
  566. return result;
  567. }
  568. // connect wav input to recognizer
  569. // SAPI will negotiate mismatched engine/input audio formats using system audio codecs, so second parameter is not important - use default of TRUE
  570. hr = cpRecognizer->SetInput(cpInputStream, TRUE);
  571. if ( FAILED( hr ) )
  572. {
  573. pfnPrint( "Error: SAPI 5.1 Unable to associate input stream\n" );
  574. return result;
  575. }
  576. // Activate the CFG ( rather than using dictation )
  577. hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_ACTIVE );
  578. if ( FAILED( hr ) )
  579. {
  580. switch ( hr )
  581. {
  582. case E_INVALIDARG:
  583. pfnPrint( "pszName is invalid or bad. Alternatively, pReserved is non-NULL\n" );
  584. break;
  585. case SP_STREAM_UNINITIALIZED:
  586. pfnPrint( "ISpRecognizer::SetInput has not been called with the InProc recognizer\n" );
  587. break;
  588. case SPERR_UNINITIALIZED:
  589. pfnPrint( "The object has not been properly initialized.\n");
  590. break;
  591. case SPERR_UNSUPPORTED_FORMAT:
  592. pfnPrint( "Audio format is bad or is not recognized. Alternatively, the device driver may be busy by another application and cannot be accessed.\n" );
  593. break;
  594. case SPERR_NOT_TOPLEVEL_RULE:
  595. pfnPrint( "The rule pszName exists, but is not a top-level rule.\n" );
  596. break;
  597. default:
  598. pfnPrint( "Unknown error\n" );
  599. break;
  600. }
  601. pfnPrint( "Error: SAPI 5.1 Unable to activate rule set\n" );
  602. return result;
  603. }
  604. // while events occur, continue processing
  605. // timeout should be greater than the audio stream length, or a reasonable amount of time expected to pass before no more recognitions are expected in an audio stream
  606. BOOL fEndStreamReached = FALSE;
  607. while (!fEndStreamReached && S_OK == cpRecoContext->WaitForNotifyEvent( SR_WAVTIMEOUT ))
  608. {
  609. CSpEvent spEvent;
  610. // pull all queued events from the reco context's event queue
  611. while (!fEndStreamReached && S_OK == spEvent.GetFrom(cpRecoContext))
  612. {
  613. // Check event type
  614. switch (spEvent.eEventId)
  615. {
  616. case SPEI_INTERFERENCE:
  617. {
  618. SPINTERFERENCE interference = spEvent.Interference();
  619. switch ( interference )
  620. {
  621. case SPINTERFERENCE_NONE:
  622. pfnPrint( "[ I None ]\r\n" );
  623. break;
  624. case SPINTERFERENCE_NOISE:
  625. pfnPrint( "[ I Noise ]\r\n" );
  626. break;
  627. case SPINTERFERENCE_NOSIGNAL:
  628. pfnPrint( "[ I No Signal ]\r\n" );
  629. break;
  630. case SPINTERFERENCE_TOOLOUD:
  631. pfnPrint( "[ I Too Loud ]\r\n" );
  632. break;
  633. case SPINTERFERENCE_TOOQUIET:
  634. pfnPrint( "[ I Too Quiet ]\r\n" );
  635. break;
  636. case SPINTERFERENCE_TOOFAST:
  637. pfnPrint( "[ I Too Fast ]\r\n" );
  638. break;
  639. case SPINTERFERENCE_TOOSLOW:
  640. pfnPrint( "[ I Too Slow ]\r\n" );
  641. break;
  642. default:
  643. break;
  644. }
  645. }
  646. break;
  647. case SPEI_PHRASE_START:
  648. pfnPrint( "Phrase Start\r\n" );
  649. sentence.MarkNewPhraseBase();
  650. break;
  651. case SPEI_HYPOTHESIS:
  652. case SPEI_RECOGNITION:
  653. case SPEI_FALSE_RECOGNITION:
  654. {
  655. CComPtr<ISpRecoResult> cpResult;
  656. cpResult = spEvent.RecoResult();
  657. CSpDynamicString dstrText;
  658. if (spEvent.eEventId == SPEI_FALSE_RECOGNITION)
  659. {
  660. dstrText = L"(Unrecognized)";
  661. result = SR_RESULT_FAILED;
  662. // It's possible that the failed recog might have more words, so see if that's the case
  663. EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
  664. }
  665. else
  666. {
  667. // Hypothesis or recognition success
  668. cpResult->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
  669. EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
  670. if ( spEvent.eEventId == SPEI_RECOGNITION )
  671. {
  672. result = SR_RESULT_SUCCESS;
  673. }
  674. pfnPrint( va( "%s%s\r\n", spEvent.eEventId == SPEI_HYPOTHESIS ? "[ Hypothesis ] " : "", dstrText.CopyToChar() ) );
  675. }
  676. cpResult.Release();
  677. }
  678. break;
  679. // end of the wav file was reached by the speech recognition engine
  680. case SPEI_END_SR_STREAM:
  681. fEndStreamReached = TRUE;
  682. break;
  683. }
  684. // clear any event data/object references
  685. spEvent.Clear();
  686. }// END event pulling loop - break on empty event queue OR end stream
  687. }// END event polling loop - break on event timeout OR end stream
  688. // Deactivate rule
  689. hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
  690. if ( FAILED( hr ) )
  691. {
  692. pfnPrint( "Error: SAPI 5.1 Unable to deactivate rule set\n" );
  693. return result;
  694. }
  695. // close the input stream, since we're done with it
  696. // NOTE: smart pointer will call SpStream's destructor, and consequently ::Close, but code may want to check for errors on ::Close operation
  697. hr = cpInputStream->Close();
  698. if ( FAILED( hr ) )
  699. {
  700. pfnPrint( "Error: SAPI 5.1 Unable to close input stream\n" );
  701. return result;
  702. }
  703. return result;
  704. }
  705. //-----------------------------------------------------------------------------
  706. // Purpose: HACK HACK: We have to delete the RecoContext key or sapi starts to train
  707. // itself on each iteration which was causing some problems.
  708. // Input : hKey -
  709. //-----------------------------------------------------------------------------
  710. void RecursiveRegDelKey(HKEY hKey)
  711. {
  712. char keyname[256]={0};
  713. DWORD namesize=256;
  714. //base case: no subkeys when RegEnumKeyEx returns error on index 0
  715. LONG lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
  716. if (lResult!=ERROR_SUCCESS)
  717. {
  718. return;
  719. }
  720. do
  721. {
  722. HKEY subkey;
  723. LONG lResult2;
  724. LONG lDelResult;
  725. lResult2=RegOpenKeyEx(hKey,keyname,0,KEY_ALL_ACCESS,&subkey);
  726. if (lResult2==ERROR_SUCCESS)
  727. {
  728. RecursiveRegDelKey(subkey);
  729. RegCloseKey(subkey);
  730. lDelResult=RegDeleteKey(hKey,keyname);
  731. namesize=256;
  732. //use 0 in the next function call because when you delete one, the rest shift down!
  733. lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
  734. }
  735. else
  736. {
  737. break;
  738. }
  739. } while (lResult!=ERROR_NO_MORE_ITEMS);
  740. }
  741. bool IsUseable( CWordTag *word )
  742. {
  743. if ( word->m_uiStartByte || word->m_uiEndByte )
  744. return true;
  745. return false;
  746. }
  747. int FindLastUsableWord( CSentence& outwords )
  748. {
  749. int numwords = outwords.m_Words.Count();
  750. if ( numwords < 1 )
  751. {
  752. Assert( 0 );
  753. return -1;
  754. }
  755. for ( int i = numwords-1; i >= 0; i-- )
  756. {
  757. CWordTag *check = outwords.m_Words[ i ];
  758. if ( IsUseable( check ) )
  759. {
  760. return i;
  761. }
  762. }
  763. return -1;
  764. }
  765. int FindFirstUsableWord( CSentence& outwords )
  766. {
  767. int numwords = outwords.m_Words.Count();
  768. if ( numwords < 1 )
  769. {
  770. Assert( 0 );
  771. return -1;
  772. }
  773. for ( int i = 0; i < numwords; i++ )
  774. {
  775. CWordTag *check = outwords.m_Words[ i ];
  776. if ( IsUseable( check ) )
  777. {
  778. return i;
  779. }
  780. }
  781. return -1;
  782. }
  783. //-----------------------------------------------------------------------------
  784. // Purpose: Counts words which have either a valid start or end byte
  785. // Input : *outwords -
  786. // Output : int
  787. //-----------------------------------------------------------------------------
  788. int CountUsableWords( CSentence& outwords )
  789. {
  790. int count = 0;
  791. int numwords = outwords.m_Words.Count();
  792. // Nothing to do
  793. if ( numwords <= 0 )
  794. return count;
  795. for ( int i = 0; i < numwords; i++ )
  796. {
  797. CWordTag *word = outwords.m_Words[ i ];
  798. if ( !IsUseable( word ) )
  799. continue;
  800. count++;
  801. }
  802. return count;
  803. }
  804. //-----------------------------------------------------------------------------
  805. // Purpose: Counts words which have either a valid start or end byte
  806. // Input : *outwords -
  807. // Output : int
  808. //-----------------------------------------------------------------------------
  809. int CountUnuseableWords( CSentence& outwords )
  810. {
  811. int count = 0;
  812. int numwords = outwords.m_Words.Count();
  813. // Nothing to do
  814. if ( numwords <= 0 )
  815. return count;
  816. for ( int i = 0; i < numwords; i++ )
  817. {
  818. CWordTag *word = outwords.m_Words[ i ];
  819. if ( IsUseable( word ) )
  820. continue;
  821. count++;
  822. }
  823. return count;
  824. }
  825. // Keeps same relative spacing, but rebases list
  826. void RepartitionPhonemes( CWordTag *word, unsigned int oldStart, unsigned int oldEnd )
  827. {
  828. // Repartition phonemes based on old range
  829. float oldRange = ( float )( oldEnd - oldStart );
  830. float newRange = ( float )( word->m_uiEndByte - word->m_uiStartByte );
  831. for ( int i = 0; i < word->m_Phonemes.Count(); i++ )
  832. {
  833. CPhonemeTag *tag = word->m_Phonemes[ i ];
  834. Assert( tag );
  835. float frac1 = 0.0f, frac2 = 0.0f;
  836. float delta1, delta2;
  837. delta1 = ( float ) ( tag->m_uiStartByte - oldStart );
  838. delta2 = ( float ) ( tag->m_uiEndByte - oldStart );
  839. if ( oldRange > 0.0f )
  840. {
  841. frac1 = delta1 / oldRange;
  842. frac2 = delta2 / oldRange;
  843. }
  844. tag->m_uiStartByte = word->m_uiStartByte + ( unsigned int ) ( frac1 * newRange );
  845. tag->m_uiEndByte = word->m_uiStartByte + ( unsigned int ) ( frac2 * newRange );
  846. }
  847. }
  848. void PartitionWords( CSentence& outwords, int start, int end, int sampleStart, int sampleEnd )
  849. {
  850. int wordCount = end - start + 1;
  851. Assert( wordCount >= 1 );
  852. int stepSize = ( sampleEnd - sampleStart ) / wordCount;
  853. int currentStart = sampleStart;
  854. for ( int i = start; i <= end; i++ )
  855. {
  856. CWordTag *word = outwords.m_Words[ i ];
  857. Assert( word );
  858. unsigned int oldStart = word->m_uiStartByte;
  859. unsigned int oldEnd = word->m_uiEndByte;
  860. word->m_uiStartByte = currentStart;
  861. word->m_uiEndByte = currentStart + stepSize;
  862. RepartitionPhonemes( word, oldStart, oldEnd );
  863. currentStart += stepSize;
  864. }
  865. }
  866. void MergeWords( CWordTag *w1, CWordTag *w2 )
  867. {
  868. unsigned int start, end;
  869. start = min( w1->m_uiStartByte, w2->m_uiStartByte );
  870. end = max( w1->m_uiEndByte, w2->m_uiEndByte );
  871. unsigned int mid = ( start + end ) / 2;
  872. unsigned int oldw1start, oldw2start, oldw1end, oldw2end;
  873. oldw1start = w1->m_uiStartByte;
  874. oldw2start = w2->m_uiStartByte;
  875. oldw1end = w1->m_uiEndByte;
  876. oldw2end = w2->m_uiEndByte;
  877. w1->m_uiStartByte = start;
  878. w1->m_uiEndByte = mid;
  879. w2->m_uiStartByte = mid;
  880. w2->m_uiEndByte = end;
  881. RepartitionPhonemes( w1, oldw1start, oldw1end );
  882. RepartitionPhonemes( w2, oldw2start, oldw2end );
  883. }
  884. void FixupZeroLengthWords( CSentence& outwords )
  885. {
  886. while ( 1 )
  887. {
  888. int i;
  889. for ( i = 0 ; i < outwords.m_Words.Count() - 1; i++ )
  890. {
  891. CWordTag *current, *next;
  892. current = outwords.m_Words[ i ];
  893. next = outwords.m_Words[ i + 1 ];
  894. if ( current->m_uiEndByte - current->m_uiStartByte <= 0 )
  895. {
  896. MergeWords( current, next );
  897. break;
  898. }
  899. if ( next->m_uiEndByte - next->m_uiStartByte <= 0 )
  900. {
  901. MergeWords( current, next );
  902. break;
  903. }
  904. }
  905. if ( i >= outwords.m_Words.Count() - 1 )
  906. {
  907. break;
  908. }
  909. }
  910. }
  911. void ComputeMissingByteSpans( int numsamples, CSentence& outwords )
  912. {
  913. int numwords = outwords.m_Words.Count();
  914. // Nothing to do
  915. if ( numwords <= 0 )
  916. return;
  917. int interationcount = 1;
  918. while( 1 )
  919. {
  920. Log_Msg( LOG_PhonemeExtractor, "\nCompute %i\n", interationcount++ );
  921. LogWords( outwords );
  922. int wordNumber;
  923. // Done!
  924. if ( !CountUnuseableWords( outwords ) )
  925. {
  926. FixupZeroLengthWords( outwords );
  927. break;
  928. }
  929. if ( !CountUsableWords( outwords ) )
  930. {
  931. // Evenly space words across full sample time
  932. PartitionWords( outwords, 0, numwords - 1, 0, numsamples );
  933. break;
  934. }
  935. wordNumber = FindFirstUsableWord( outwords );
  936. // Not the first word
  937. if ( wordNumber > 0 )
  938. {
  939. // Repartition all of the unusables and the first one starting at zero over the range
  940. CWordTag *firstUsable = outwords.m_Words[ wordNumber ];
  941. Assert( firstUsable );
  942. if ( firstUsable->m_uiStartByte != 0 )
  943. {
  944. PartitionWords( outwords, 0, wordNumber - 1, 0, firstUsable->m_uiStartByte );
  945. }
  946. else
  947. {
  948. PartitionWords( outwords, 0, wordNumber, 0, firstUsable->m_uiEndByte );
  949. }
  950. // Start over
  951. continue;
  952. }
  953. wordNumber = FindLastUsableWord( outwords );
  954. // Not the last word
  955. if ( wordNumber >= 0 && wordNumber < numwords - 1 )
  956. {
  957. // Repartition all of the unusables and the first one starting at zero over the range
  958. CWordTag *lastUsable = outwords.m_Words[ wordNumber ];
  959. Assert( lastUsable );
  960. if ( lastUsable->m_uiEndByte != (unsigned int)numsamples )
  961. {
  962. PartitionWords( outwords, wordNumber + 1, numwords-1, lastUsable->m_uiEndByte, numsamples );
  963. }
  964. else
  965. {
  966. PartitionWords( outwords, wordNumber, numwords-1, lastUsable->m_uiStartByte, numsamples );
  967. }
  968. // Start over
  969. continue;
  970. }
  971. // If we get here it means that the start and end of the list are okay and we just have to
  972. // iterate across the list and fix things in the middle
  973. int startByte = 0;
  974. int endByte = 0;
  975. for ( int i = 0; i < numwords ; i++ )
  976. {
  977. CWordTag *word = outwords.m_Words[ i ];
  978. if ( IsUseable( word ) )
  979. {
  980. startByte = word->m_uiEndByte;
  981. continue;
  982. }
  983. // Found the start of a chain of 1 or more unusable words
  984. // Find the startbyte of the next usable word and count how many words we check
  985. int wordCount = 1;
  986. for ( int j = i + 1; j < numwords; j++ )
  987. {
  988. CWordTag *next = outwords.m_Words[ j ];
  989. if ( IsUseable( next ) )
  990. {
  991. endByte = next->m_uiStartByte;
  992. break;
  993. }
  994. wordCount++;
  995. }
  996. // Now partition words across the gap and go to start again
  997. PartitionWords( outwords, i, i + wordCount - 1, startByte, endByte );
  998. break;
  999. }
  1000. }
  1001. }
  1002. //-----------------------------------------------------------------------------
  1003. // Purpose: Given a wavfile and a list of inwords, determines the word/phonene
  1004. // sample counts for the sentce
  1005. // Input : *wavfile -
  1006. // *inwords -
  1007. // *outphonemes{ text.Clear( -
  1008. // Output : SR_RESULT
  1009. //-----------------------------------------------------------------------------
  1010. static SR_RESULT SAPI_ExtractPhonemes(
  1011. const char *wavfile,
  1012. int numsamples,
  1013. void (*pfnPrint)( const char *fmt, ... ),
  1014. CSentence& inwords,
  1015. CSentence& outwords )
  1016. {
  1017. LogReset();
  1018. USES_CONVERSION;
  1019. CSpDynamicString text;
  1020. text.Clear();
  1021. HKEY hkwipe;
  1022. LONG lResult = RegOpenKeyEx( HKEY_CURRENT_USER, "Software\\Microsoft\\Speech\\RecoProfiles", 0, KEY_ALL_ACCESS, &hkwipe );
  1023. if ( lResult == ERROR_SUCCESS )
  1024. {
  1025. RecursiveRegDelKey( hkwipe );
  1026. RegCloseKey( hkwipe );
  1027. }
  1028. if ( strlen( inwords.GetText() ) <= 0 )
  1029. {
  1030. inwords.SetTextFromWords();
  1031. }
  1032. // Construct a string from the inwords array
  1033. text.Append( T2W( inwords.GetText() ) );
  1034. // Assume failure
  1035. SR_RESULT result = SR_RESULT_ERROR;
  1036. if ( text.Length() > 0 )
  1037. {
  1038. CSentence sentence;
  1039. pfnPrint( "Processing...\r\n" );
  1040. // Give it a try
  1041. result = ExtractPhonemes( wavfile, text, sentence, pfnPrint );
  1042. pfnPrint( "Finished.\r\n" );
  1043. // PrintWordsAndPhonemes( sentence, pfnPrint );
  1044. // Copy results to outputs
  1045. outwords.Reset();
  1046. outwords.SetText( inwords.GetText() );
  1047. Log_Msg( LOG_PhonemeExtractor, "Starting\n" );
  1048. LogWords( inwords );
  1049. if ( SR_RESULT_ERROR != result )
  1050. {
  1051. int i;
  1052. Log_Msg( LOG_PhonemeExtractor, "Hypothesized\n" );
  1053. LogWords( sentence );
  1054. for( i = 0 ; i < sentence.m_Words.Count(); i++ )
  1055. {
  1056. CWordTag *tag = sentence.m_Words[ i ];
  1057. if ( tag )
  1058. {
  1059. // Skip '...' tag
  1060. if ( stricmp( tag->GetWord(), "..." ) )
  1061. {
  1062. CWordTag *newTag = new CWordTag( *tag );
  1063. outwords.m_Words.AddToTail( newTag );
  1064. }
  1065. }
  1066. }
  1067. // Now insert unrecognized/skipped words from original list
  1068. //
  1069. int frompos = 0, topos = 0;
  1070. while( 1 )
  1071. {
  1072. // End of source list
  1073. if ( frompos >= inwords.m_Words.Count() )
  1074. break;
  1075. const CWordTag *fromTag = inwords.m_Words[ frompos ];
  1076. // Reached end of destination list, just copy words over from from source list until
  1077. // we run out of source words
  1078. if ( topos >= outwords.m_Words.Count() )
  1079. {
  1080. // Just copy words over
  1081. CWordTag *newWord = new CWordTag( *fromTag );
  1082. // Remove phonemes
  1083. while ( newWord->m_Phonemes.Count() > 0 )
  1084. {
  1085. CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
  1086. newWord->m_Phonemes.Remove( 0 );
  1087. delete kill;
  1088. }
  1089. outwords.m_Words.AddToTail( newWord );
  1090. frompos++;
  1091. topos++;
  1092. continue;
  1093. }
  1094. // Destination word
  1095. const CWordTag *toTag = outwords.m_Words[ topos ];
  1096. // Words match, just skip ahead
  1097. if ( !stricmp( fromTag->GetWord(), toTag->GetWord() ) )
  1098. {
  1099. frompos++;
  1100. topos++;
  1101. continue;
  1102. }
  1103. // The only case we handle is that something in the source wasn't in the destination
  1104. // Find the next source word that appears in the destination
  1105. int skipAhead = frompos + 1;
  1106. bool found = false;
  1107. while ( skipAhead < inwords.m_Words.Count() )
  1108. {
  1109. const CWordTag *sourceWord = inwords.m_Words[ skipAhead ];
  1110. if ( !stricmp( sourceWord->GetWord(), toTag->GetWord() ) )
  1111. {
  1112. found = true;
  1113. break;
  1114. }
  1115. skipAhead++;
  1116. }
  1117. // Uh oh destination has words that are not in source, just skip to next destination word?
  1118. if ( !found )
  1119. {
  1120. topos++;
  1121. }
  1122. else
  1123. {
  1124. // Copy words from from source list into destination
  1125. //
  1126. int skipCount = skipAhead - frompos;
  1127. while ( --skipCount>= 0 )
  1128. {
  1129. const CWordTag *sourceWord = inwords.m_Words[ frompos++ ];
  1130. CWordTag *newWord = new CWordTag( *sourceWord );
  1131. // Remove phonemes
  1132. while ( newWord->m_Phonemes.Count() > 0 )
  1133. {
  1134. CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
  1135. newWord->m_Phonemes.Remove( 0 );
  1136. delete kill;
  1137. }
  1138. outwords.m_Words.InsertBefore( topos, newWord );
  1139. topos++;
  1140. }
  1141. frompos++;
  1142. topos++;
  1143. }
  1144. }
  1145. Log_Msg( LOG_PhonemeExtractor, "\nDone simple check\n" );
  1146. LogWords( outwords );
  1147. LogPhonemes( outwords );
  1148. ComputeMissingByteSpans( numsamples, outwords );
  1149. Log_Msg( LOG_PhonemeExtractor, "\nFinal check\n" );
  1150. LogWords( outwords );
  1151. LogPhonemes( outwords );
  1152. }
  1153. }
  1154. else
  1155. {
  1156. pfnPrint( "Input sentence is empty!\n" );
  1157. }
  1158. // Return results
  1159. return result;
  1160. }
  1161. //-----------------------------------------------------------------------------
  1162. // Purpose: Expose the interface
  1163. //-----------------------------------------------------------------------------
  1164. class CPhonemeExtractorSAPI : public IPhonemeExtractor
  1165. {
  1166. public:
  1167. virtual PE_APITYPE GetAPIType() const
  1168. {
  1169. return SPEECH_API_SAPI;
  1170. }
  1171. // Used for menus, etc
  1172. virtual char const *GetName() const
  1173. {
  1174. return "MS SAPI 5.1";
  1175. }
  1176. SR_RESULT Extract(
  1177. const char *wavfile,
  1178. int numsamples,
  1179. void (*pfnPrint)( const char *fmt, ... ),
  1180. CSentence& inwords,
  1181. CSentence& outwords )
  1182. {
  1183. return SAPI_ExtractPhonemes( wavfile, numsamples, pfnPrint, inwords, outwords );
  1184. }
  1185. };
  1186. EXPOSE_SINGLE_INTERFACE( CPhonemeExtractorSAPI, IPhonemeExtractor, VPHONEME_EXTRACTOR_INTERFACE );