Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

466 lines
16 KiB

  1. // DebugSupport.cpp : Defines the entry point for the console application.
  2. //
  3. #include "stdafx.h"
  //--- Index into StreamTypeStrings of the stream named on the command line.
  //--- Written by ParseCommandLine; read by main and by the Extract* routines.
  4. int g_StreamIndex = 0;
  //--- Text file that receives the extracted output; opened by ParseCommandLine.
  5. FILE *g_fpOutputFile = NULL;
  //--- Storage object for the debug file; opened by ParseCommandLine via StgOpenStorage.
  6. IStorage *g_pDebugFile = NULL;
  //--- Forward declarations of the helpers defined later in this file.
  7. WCHAR* ConvertPOSToString( DWORD dwPartOfSpeech );
  8. bool ParseCommandLine( int argc, char* argv[] );
  9. void ExtractSentenceBreaks( void );
  10. void ExtractNormalizedText( void );
  11. void ExtractLexLookup( void );
  12. void ExtractPOSPossibilities( void );
  13. void ExtractMorphology( void );
  14. int main(int argc, char* argv[])
  15. {
  16. bool fSuccess = false;
  17. CoInitialize( NULL );
  18. fSuccess = ParseCommandLine( argc, argv );
  19. if ( fSuccess )
  20. {
  21. switch ( g_StreamIndex )
  22. {
  23. case STREAM_SENTENCEBREAKS:
  24. ExtractSentenceBreaks();
  25. break;
  26. case STREAM_NORMALIZEDTEXT:
  27. ExtractNormalizedText();
  28. break;
  29. case STREAM_LEXLOOKUP:
  30. ExtractLexLookup();
  31. break;
  32. case STREAM_POSPOSSIBILITIES:
  33. ExtractPOSPossibilities();
  34. break;
  35. case STREAM_MORPHOLOGY:
  36. ExtractMorphology();
  37. break;
  38. }
  39. }
  40. CoUninitialize();
  41. return 0;
  42. }
  43. bool ParseCommandLine( int argc, char* argv[] )
  44. {
  45. bool fSuccess = true;
  46. //--- Check number of parameters
  47. if ( argc < 4 )
  48. {
  49. goto USAGE;
  50. }
  51. //--- Check streamname validity
  52. fSuccess = false;
  53. WCHAR StreamName[MAX_PATH];
  54. if ( !MultiByteToWideChar( CP_ACP, 0, argv[2], strlen( argv[2] ) + 1, StreamName, MAX_PATH ) )
  55. {
  56. goto MISC_ERROR;
  57. }
  58. else
  59. {
  60. for ( int i = 0; i < STREAM_LASTTYPE; i++ )
  61. {
  62. if ( wcscmp( StreamName, StreamTypeStrings[i].pStr ) == 0 )
  63. {
  64. fSuccess = true;
  65. g_StreamIndex = i;
  66. break;
  67. }
  68. }
  69. }
  70. if ( !fSuccess )
  71. {
  72. goto USAGE;
  73. }
  74. //--- Try to open debug info file
  75. WCHAR DebugFilename[MAX_PATH];
  76. if ( !MultiByteToWideChar( CP_ACP, 0, argv[1], strlen( argv[1] ) + 1, DebugFilename, MAX_PATH ) )
  77. {
  78. goto MISC_ERROR;
  79. }
  80. if ( FAILED( StgOpenStorage( DebugFilename, NULL, STGM_READ | STGM_SHARE_DENY_WRITE,
  81. NULL, 0, &g_pDebugFile ) ) )
  82. {
  83. goto MISC_ERROR;
  84. }
  85. //--- Try to open file for output
  86. WCHAR OutputFilename[MAX_PATH];
  87. if ( !MultiByteToWideChar( CP_ACP, 0, argv[3], strlen( argv[3] ) + 1, OutputFilename, MAX_PATH ) )
  88. {
  89. goto MISC_ERROR;
  90. }
  91. g_fpOutputFile = _wfopen( OutputFilename, L"w" );
  92. if ( !g_fpOutputFile )
  93. {
  94. printf( "\n\nUnable to open file: %s\n", argv[3] );
  95. goto MISC_ERROR;
  96. }
  97. return true;
  98. USAGE:
  99. printf( "\n\nUSAGE:\n\n\tDebugSupport [debug filename] [streamname] [output filename]\n" );
  100. printf( "\tStream names are:\n\t\tSentenceBreaks\n\t\tNormalizedText\n\t\tMorphology" );
  101. printf( "\n\t\tLexLookup\n\n" );
  102. return false;
  103. MISC_ERROR:
  104. printf( "\n\n\tERROR in ParseCommandLine(...)\n\n" );
  105. return false;
  106. }
  107. //--- Just print the original text out, with a newline character between each sentence.
  108. void ExtractSentenceBreaks( void )
  109. {
  110. IStream *pStgStream = NULL;
  111. if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE,
  112. 0, &pStgStream) == S_OK )
  113. {
  114. DebugSentItem Item, EmptyItem;
  115. ULONG cbRead = 0, ulOffset = 0;
  116. bool fResetOffset = true;
  117. while ( SUCCEEDED( pStgStream->Read( (void*) &Item, sizeof( Item ), &cbRead ) ) &&
  118. cbRead == sizeof( Item ) )
  119. {
  120. //--- Check for delimiter
  121. if ( memcmp( &Item, &EmptyItem, sizeof( Item ) ) == 0 )
  122. {
  123. fwprintf( g_fpOutputFile, L"\n" );
  124. }
  125. else
  126. {
  127. //--- Print item
  128. fwprintf ( g_fpOutputFile, L"%s ", Item.ItemSrcText );
  129. }
  130. }
  131. }
  132. }
  133. //--- Just print the normalized text of each item out, separated by single spaces,
  134. //--- with a newline character between each sentence.
  135. void ExtractNormalizedText( void )
  136. {
  137. IStream *pStgStream = NULL;
  138. if ( g_pDebugFile->OpenStream( StreamTypeStrings[5].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE,
  139. 0, &pStgStream) == S_OK )
  140. {
  141. DebugSentItem Item, EmptyItem;
  142. ULONG cbRead = 0;
  143. while ( SUCCEEDED( pStgStream->Read( (void*) &Item, sizeof( Item ), &cbRead ) ) &&
  144. cbRead == sizeof( Item ) )
  145. {
  146. //--- Check for delimiter
  147. if ( memcmp( &Item, &EmptyItem, sizeof( Item ) ) == 0 )
  148. {
  149. fwprintf( g_fpOutputFile, L"\n" );
  150. }
  151. else
  152. {
  153. //--- Print item
  154. if ( Item.ItemInfo.Type != eALPHA_WORD &&
  155. Item.ItemInfo.Type != eOPEN_PARENTHESIS &&
  156. Item.ItemInfo.Type != eOPEN_BRACKET &&
  157. Item.ItemInfo.Type != eOPEN_BRACE &&
  158. Item.ItemInfo.Type != eCLOSE_PARENTHESIS &&
  159. Item.ItemInfo.Type != eCLOSE_BRACKET &&
  160. Item.ItemInfo.Type != eCLOSE_BRACE &&
  161. Item.ItemInfo.Type != eSINGLE_QUOTE &&
  162. Item.ItemInfo.Type != eDOUBLE_QUOTE &&
  163. Item.ItemInfo.Type != ePERIOD &&
  164. Item.ItemInfo.Type != eEXCLAMATION &&
  165. Item.ItemInfo.Type != eQUESTION &&
  166. Item.ItemInfo.Type != eCOMMA &&
  167. Item.ItemInfo.Type != eSEMICOLON &&
  168. Item.ItemInfo.Type != eCOLON &&
  169. Item.ItemInfo.Type != eHYPHEN )
  170. {
  171. fwprintf( g_fpOutputFile, L"[ " );
  172. }
  173. for ( ULONG i = 0; i < Item.ulNumWords; i++ )
  174. {
  175. if ( Item.Words[i].ulWordLen > 0 )
  176. {
  177. fwprintf( g_fpOutputFile, L"%s ", Item.Words[i].WordText );
  178. }
  179. else
  180. {
  181. fwprintf( g_fpOutputFile, L"%s ", Item.ItemSrcText );
  182. }
  183. }
  184. if ( Item.ItemInfo.Type != eALPHA_WORD &&
  185. Item.ItemInfo.Type != eOPEN_PARENTHESIS &&
  186. Item.ItemInfo.Type != eOPEN_BRACKET &&
  187. Item.ItemInfo.Type != eOPEN_BRACE &&
  188. Item.ItemInfo.Type != eCLOSE_PARENTHESIS &&
  189. Item.ItemInfo.Type != eCLOSE_BRACKET &&
  190. Item.ItemInfo.Type != eCLOSE_BRACE &&
  191. Item.ItemInfo.Type != eSINGLE_QUOTE &&
  192. Item.ItemInfo.Type != eDOUBLE_QUOTE &&
  193. Item.ItemInfo.Type != ePERIOD &&
  194. Item.ItemInfo.Type != eEXCLAMATION &&
  195. Item.ItemInfo.Type != eQUESTION &&
  196. Item.ItemInfo.Type != eCOMMA &&
  197. Item.ItemInfo.Type != eSEMICOLON &&
  198. Item.ItemInfo.Type != eCOLON &&
  199. Item.ItemInfo.Type != eHYPHEN )
  200. {
  201. fwprintf( g_fpOutputFile, L"] " );
  202. }
  203. }
  204. }
  205. }
  206. }
  207. //--- Print the text of each item, and then its Pronunciation and Part of Speech.
  208. //--- Separate each with a newline character.
  209. void ExtractLexLookup( void )
  210. {
  211. IStream *pStgStream = NULL;
  212. if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE,
  213. 0, &pStgStream) == S_OK )
  214. {
  215. DebugSentItem Item, EmptyItem;
  216. ULONG cbRead = 0;
  217. while ( SUCCEEDED( pStgStream->Read( (void*) &Item, sizeof( Item ), &cbRead ) ) &&
  218. cbRead == sizeof( Item ) )
  219. {
  220. if ( memcmp( &Item, &EmptyItem, sizeof( Item ) ) == 0 )
  221. {
  222. fwprintf( g_fpOutputFile, L"\n" );
  223. }
  224. else
  225. {
  226. //--- Print Normalization delimiter
  227. if ( Item.ItemInfo.Type != eALPHA_WORD &&
  228. Item.ItemInfo.Type != eOPEN_PARENTHESIS &&
  229. Item.ItemInfo.Type != eOPEN_BRACKET &&
  230. Item.ItemInfo.Type != eOPEN_BRACE &&
  231. Item.ItemInfo.Type != eCLOSE_PARENTHESIS &&
  232. Item.ItemInfo.Type != eCLOSE_BRACKET &&
  233. Item.ItemInfo.Type != eCLOSE_BRACE &&
  234. Item.ItemInfo.Type != eSINGLE_QUOTE &&
  235. Item.ItemInfo.Type != eDOUBLE_QUOTE &&
  236. Item.ItemInfo.Type != ePERIOD &&
  237. Item.ItemInfo.Type != eEXCLAMATION &&
  238. Item.ItemInfo.Type != eQUESTION &&
  239. Item.ItemInfo.Type != eCOMMA &&
  240. Item.ItemInfo.Type != eSEMICOLON &&
  241. Item.ItemInfo.Type != eCOLON &&
  242. Item.ItemInfo.Type != eHYPHEN )
  243. {
  244. fwprintf( g_fpOutputFile, L"[ " );
  245. }
  246. for ( ULONG i = 0; i < Item.ulNumWords; i++ )
  247. {
  248. //--- Print item
  249. if ( Item.Words[i].WordText[0] != 0 )
  250. {
  251. fwprintf ( g_fpOutputFile, L"%s ", Item.Words[i].WordText );
  252. }
  253. else
  254. {
  255. fwprintf ( g_fpOutputFile, L"%s ", Item.ItemSrcText );
  256. }
  257. //--- Print pronunciation
  258. //CComPtr<ISpPhoneConverter> pPhoneConv;
  259. //if ( SUCCEEDED( SpCreatePhoneConverter(1033, NULL, NULL, &pPhoneConv) ) )
  260. //{
  261. // if ( SUCCEEDED( pPhoneConv->IdToPhone( Item.Words[i].WordPron, Item.Words[i].WordPron ) ) )
  262. // {
  263. // fwprintf( g_fpOutputFile, L"%s", Item.Words[i].WordPron );
  264. // for ( long j = 0; j < (long)( (long)45 - (long)wcslen( Item.Words[i].WordPron ) ); j++ )
  265. // {
  266. // fwprintf( g_fpOutputFile, L" " );
  267. // }
  268. // }
  269. //}
  270. //--- Print POS
  271. fwprintf ( g_fpOutputFile, L"(%s) ", ConvertPOSToString( Item.Words[i].eWordPartOfSpeech ) );
  272. }
  273. //--- Print Normalization delimiter
  274. if ( Item.ItemInfo.Type != eALPHA_WORD &&
  275. Item.ItemInfo.Type != eOPEN_PARENTHESIS &&
  276. Item.ItemInfo.Type != eOPEN_BRACKET &&
  277. Item.ItemInfo.Type != eOPEN_BRACE &&
  278. Item.ItemInfo.Type != eCLOSE_PARENTHESIS &&
  279. Item.ItemInfo.Type != eCLOSE_BRACKET &&
  280. Item.ItemInfo.Type != eCLOSE_BRACE &&
  281. Item.ItemInfo.Type != eSINGLE_QUOTE &&
  282. Item.ItemInfo.Type != eDOUBLE_QUOTE &&
  283. Item.ItemInfo.Type != ePERIOD &&
  284. Item.ItemInfo.Type != eEXCLAMATION &&
  285. Item.ItemInfo.Type != eQUESTION &&
  286. Item.ItemInfo.Type != eCOMMA &&
  287. Item.ItemInfo.Type != eSEMICOLON &&
  288. Item.ItemInfo.Type != eCOLON &&
  289. Item.ItemInfo.Type != eHYPHEN )
  290. {
  291. fwprintf( g_fpOutputFile, L"] " );
  292. }
  293. }
  294. }
  295. }
  296. }
  297. void ExtractPOSPossibilities( void )
  298. {
  299. IStream *pStgStream = NULL;
  300. if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE,
  301. 0, &pStgStream) == S_OK )
  302. {
  303. DebugPronRecord PronRecord, EmptyPronRecord;
  304. ULONG cbRead = 0;
  305. while ( SUCCEEDED( pStgStream->Read( (void*) &PronRecord, sizeof( PronRecord ), &cbRead ) ) &&
  306. cbRead == sizeof( PronRecord ) )
  307. {
  308. //--- Check for delimiter
  309. if ( memcmp( &PronRecord, &EmptyPronRecord, sizeof( PronRecord ) ) == 0 )
  310. {
  311. fwprintf( g_fpOutputFile, L"\n" );
  312. }
  313. else
  314. {
  315. fwprintf( g_fpOutputFile, PronRecord.orthStr );
  316. fwprintf( g_fpOutputFile, L" [ " );
  317. fwprintf( g_fpOutputFile, L"%s - ", ConvertPOSToString( PronRecord.POSchoice ) );
  318. for ( ULONG i = 0; i < PronRecord.pronArray[0].POScount; i++ )
  319. {
  320. fwprintf( g_fpOutputFile, L"%s,", ConvertPOSToString( (DWORD)PronRecord.pronArray[0].POScode[i] ) );
  321. }
  322. for ( i = 0; i < PronRecord.pronArray[1].POScount; i++ )
  323. {
  324. fwprintf( g_fpOutputFile, L"%s,", ConvertPOSToString( (DWORD)PronRecord.pronArray[1].POScode[i] ) );
  325. }
  326. fwprintf( g_fpOutputFile, L" ]\n" );
  327. }
  328. }
  329. }
  330. }
  331. void ExtractMorphology( void )
  332. {
  333. IStream *pStgStream = NULL;
  334. if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE,
  335. 0, &pStgStream ) == S_OK )
  336. {
  337. CComPtr<ISpPhoneConverter> pPhoneConv;
  338. if ( SUCCEEDED( SpCreatePhoneConverter( 1033, NULL, NULL, &pPhoneConv ) ) )
  339. {
  340. WCHAR Buffer[SP_MAX_WORD_LENGTH], EmptyBuffer[SP_MAX_WORD_LENGTH];
  341. ULONG cbRead = 0;
  342. ZeroMemory( EmptyBuffer, SP_MAX_WORD_LENGTH * sizeof( WCHAR ) );
  343. BOOL fRoot = true;
  344. while ( SUCCEEDED( pStgStream->Read( (void*) &Buffer, SP_MAX_WORD_LENGTH * sizeof( WCHAR ), &cbRead ) ) &&
  345. cbRead == SP_MAX_WORD_LENGTH * sizeof( WCHAR ) )
  346. {
  347. //--- Check for delimiter
  348. if ( memcmp( &Buffer, &EmptyBuffer, SP_MAX_WORD_LENGTH * sizeof( WCHAR ) ) == 0 )
  349. {
  350. fwprintf( g_fpOutputFile, L"\n" );
  351. fRoot = true;
  352. }
  353. else if ( fRoot )
  354. {
  355. fwprintf( g_fpOutputFile, L"%s ", Buffer );
  356. fRoot = false;
  357. }
  358. else
  359. {
  360. if ( SUCCEEDED( pPhoneConv->IdToPhone( Buffer, Buffer ) ) )
  361. {
  362. fwprintf( g_fpOutputFile, L"- %s ", Buffer );
  363. }
  364. }
  365. }
  366. }
  367. }
  368. }
  369. WCHAR* ConvertPOSToString( DWORD dwPartOfSpeech )
  370. {
  371. switch (dwPartOfSpeech)
  372. {
  373. case MS_NotOverriden:
  374. return L"Noun";
  375. case MS_Unknown:
  376. return L"Unknown";
  377. case MS_Punctuation:
  378. return L"Punctuation";
  379. case MS_Noun:
  380. return L"Noun";
  381. case MS_Verb:
  382. return L"Verb";
  383. case MS_Modifier:
  384. return L"Modifier";
  385. case MS_Function:
  386. return L"Function";
  387. case MS_Interjection:
  388. return L"Interj";
  389. case MS_Pron:
  390. return L"Pron";
  391. case MS_SubjPron:
  392. return L"SubjPron";
  393. case MS_ObjPron:
  394. return L"ObjPron";
  395. case MS_RelPron:
  396. return L"RelPron";
  397. // case MS_PPron:
  398. // return L"PPron";
  399. // case MS_IPron:
  400. // return L"IPron";
  401. // case MS_RPron:
  402. // return L"RPron";
  403. // case MS_DPron:
  404. // return L"DPron";
  405. case MS_Adj:
  406. return L"Adj";
  407. case MS_Adv:
  408. return L"Adv";
  409. case MS_VAux:
  410. return L"VAux";
  411. // case MS_RVAux:
  412. // return L"RVAux";
  413. case MS_Conj:
  414. return L"Conj";
  415. case MS_CConj:
  416. return L"CConj";
  417. case MS_Interr:
  418. return L"WHWord";
  419. case MS_Det:
  420. return L"Det";
  421. case MS_Contr:
  422. return L"Contr";
  423. // case MS_VPart:
  424. // return L"VPart";
  425. case MS_Prep:
  426. return L"Prep";
  427. // case MS_Quant:
  428. // return L"Quant";
  429. default:
  430. return L"Unknown";
  431. }
  432. }