Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

627 lines
17 KiB

4 years ago
  1. /*****************************************************************************
  2. * *
  3. * MAKEPHR.CPP *
  4. * *
  5. * Copyright (C) Microsoft Corporation 1990-1994 *
  6. * All Rights reserved. *
  7. * *
  8. ******************************************************************************
  9. * *
  10. * Module Intent *
  11. * *
  12. * Main processing code that parcels out the work to all the other units *
  13. * *
  14. *****************************************************************************/
  15. #include "stdafx.h"
  16. #include "..\common\coutput.h"
  17. #include <fcntl.h>
  18. #include "cphrase.h"
  19. #ifdef _DEBUG
  20. #undef THIS_FILE
  21. static char THIS_FILE[] = __FILE__;
  22. #endif
  23. /*****************************************************************************
  24. * *
  25. * Defines *
  26. * *
  27. *****************************************************************************/
  28. const int CB_SAVEMIN_DEFAULT = 40; // Initial cbSaveMin used by GenerateKeyPhrases()
  29. // REVIEW: if WinHelp 4.0 ends up being 32-bit, then this limit could
  30. // be raised.
  31. const int MAX_PHRASE_FILE = 0xFFF0; // Size limit for the phrase file (just under 64K)
  32. const int CCHBUF = 2048; // Generic buffer size
  33. const int MAX_LOOP = 35; // NOTE(review): not referenced in the visible code
  34. const int CNIL = -1; // Nil count (used for invalid cMaxNKeyPh)
  35. // Maximum size of a record you can pass to the sorter.
  36. const int cbMAX_REC_SIZE = 520; // must be at least as large as CCH_MAX_PHRASE
  37. const int DB_SIZE = 8000; // NOTE(review): not referenced in the visible code
  // Describes one phrase: either a candidate read from the phrase list or
  // the prefix shared by the current club (see club()).
  38. typedef struct {
  39. PSTR sz; // Pointer to phrase string
  40. int coc; // Number of occurrences of phrase
  41. int cch; // Length of phrase, not counting the terminating '\0'
  42. int cbSavings; // Number of bytes saved by suppressing phrase (see CbSaveCchC)
  43. } FR_INFO;
  // The three buffer pointers below are set by RcMakePhr() to CMem objects
  // that are local to a block inside that function.
  44. static PSTR pszClubBuffer; // buffer for club's prefix
  45. static PSTR pszCandBuffer; // buffer for candidate
  46. static PSTR pszInputBuffer; // buffer for current line
  47. static int lTotal; // total savings over all clubs; reset at the start of each club() call
  // Forward declarations.
  // NOTE(review): FSlice, RcRemoveDuplicates and SzGetPhrase have no
  // definition in the visible part of this file -- confirm they are still needed.
  48. static INLINE int STDCALL CbPrefixPch2(PSTR pch1, PSTR pch2);
  49. static INLINE BOOL STDCALL FSlice(PSTR szInFile, PSTR szOutFile);
  50. static INLINE RC_TYPE STDCALL RcRemoveDuplicates(PSTR szInput, PSTR szOutput);
  51. static INLINE PSTR STDCALL SzGetPhrase(PSTR szBuf, int cchMax, FILE* pfile);
  52. static INLINE VOID STDCALL GenerateKeyPhrases(CTable* ptbl, int cMaxNKeyPh);
  53. static int STDCALL club(int cbSaveMin, CTable* ptbl, int* plcbFileSave);
  54. INLINE static void STDCALL SelectPhrase(FR_INFO fr, CTable* ptbl);
  55. static BOOL STDCALL FGetNextCandidate(FR_INFO* pfrCand);
  56. static int FASTCALL DcbConverge(int cInit, int cTarget, int cbSize);
  57. /***************************************************************************
  58. FUNCTION: CbSaveCchC
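  59. PURPOSE: To calculate the savings expected from compressing out a
  60. string given its length and number of occurrences.
  61. PARAMETERS:
  62. cch - length of the string, in bytes
  63. c - number of times the string occurs
  64. RETURNS: the number of bytes saved (may be negative), or 0 if cch is less than 2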
  65. COMMENTS:
  66. Each phrase, when compressed is represented in the keyphrase table
  67. (with an extra length byte) and is replaced in the plaintext by two
  68. bytes. Hence, the formula for calculating savings is as follows:
  69. c = count of occurrences
  70. cch = length of string
  71. 1 = one byte for index into keyphrase array
  72. 2 = two bytes per keyphrase magic cookie
  73. cost of keyphrase = cch + 2*c + 1
  74. benefit of keyphrase = cch * c
  75. savings = benefit - cost = cch * (c - 1) - 2c - 1
  76. Note: keyphrases may be a maximum of 255 bytes long, due to storage
  77. of their lengths in a byte.
  78. MODIFICATION DATES:
  79. 12-Jun-1994 [ralphw]
  80. ***************************************************************************/
  81. INLINE static int STDCALL CbSaveCchC(int cch, int c) {
  82. if (cch < 2)
  83. return 0;
  84. else
  85. return (cch * (c - 1) - 2 * c - 1);
  86. };
  87. /***************************************************************************
  88. FUNCTION: RcMakePhr
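  89. PURPOSE: Generates the key phrases, sorts them, and writes them to the output file, one per line.
  90. PARAMETERS:
  91. szOutputFile - name of the phrase file to write
  92. cMaxNKeyPh - maximum number of key phrases, or CNIL for the default (MAX_PHRASES)
  93. (there is no input file parameter; candidates are read through pphrase)
  94. RETURNS: RC_Success, or RC_Failure if the output file cannot be opened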
  95. COMMENTS:
  96. MODIFICATION DATES:
  97. 19-Jul-1993 [ralphw]
  98. ***************************************************************************/
  99. RC_TYPE STDCALL RcMakePhr(PSTR szOutputFile, int cMaxNKeyPh)
  100. {
  101. CTable tbl;
  102. ASSERT(szOutputFile != NULL);
  103. {
  104. CMem memClubBuffer(CCHBUF); // buffer for club's prefix
  105. CMem memCandBuffer(CCHBUF); // buffer for candidate
  106. CMem memInputBuffer(CCHBUF); // buffer for current line
  107. pszClubBuffer = memClubBuffer.psz;
  108. pszCandBuffer = memCandBuffer.psz;
  109. pszInputBuffer = memInputBuffer.psz;
  110. GenerateKeyPhrases(&tbl, cMaxNKeyPh);
  111. }
  112. tbl.SortTable();
  113. COutput output(szOutputFile);
  114. if (!output.fInitialized) {
  115. // REVIEW: can we get more useful information about the problem?
  116. errHpj.ep = epNoFile;
  117. VReportError(HCERR_CANNOT_OPEN, &errHpj, szOutputFile);
  118. return RC_Failure;
  119. }
  120. #ifdef _DEBUG
  121. DWORD cb = 0;
  122. #endif
  123. for (int pos = 1; pos <= tbl.CountStrings(); pos++) {
  124. #ifdef _DEBUG
  125. /*
  126. * We only check during debug, because this should have already
  127. * been prevented -- we just want to make absolutely certain.
  128. */
  129. // add 2 for CR/LF
  130. cb += strlen(tbl.GetPointer(pos)) + 2;
  131. ASSERT(cb < MAX_PHRASE_FILE + 15);
  132. #endif
  133. output.outstring_eol(tbl.GetPointer(pos));
  134. }
  135. return RC_Success;
  136. }
  137. /***************************************************************************
  138. FUNCTION: FGetNextCandidate
  139. PURPOSE:
  140. This function fetches the next candidate phrase by calling
  141. pphrase->GetPhrase(). A "candidate" is a phrase string together
  142. with the number of times it occurs. The result, put into
  143. pfrCand, contains the string, its length, number of
  144. occurrences, and bytes saved if it is suppressed (see
  145. CbSaveCchC). Returns TRUE if a candidate was found, and FALSE
  146. if GetPhrase() returned (DWORD) -1, which is taken to mean
  147. that no candidates remain. NOTE: the pfile argument and the
  148. static szInput "put back" buffer described by earlier
  149. revisions of this comment do not exist in this version.
  150. PARAMETERS:
  151. pfrCand - receives the candidate; pfrCand->sz must already
  152. point to a buffer for the phrase text
  153. RETURNS: TRUE if a candidate was found, FALSE otherwise
  154. COMMENTS:
  155. MODIFICATION DATES:
  156. 19-Jul-1993 [ralphw]
  157. ***************************************************************************/
  158. static BOOL STDCALL FGetNextCandidate(FR_INFO* pfrCand)
  159. {
  160. DWORD count = pphrase->GetPhrase(pfrCand->sz);
  161. if (count == (DWORD) -1)
  162. return FALSE;
  163. pfrCand->coc = count;
  164. pfrCand->cch = strlen(pfrCand->sz);
  165. pfrCand->cbSavings = CbSaveCchC(pfrCand->cch, pfrCand->coc);
  166. return TRUE;
  167. }
  168. /***************************************************************************
  169. FUNCTION: CbPrefixPch2
  170. PURPOSE:
  171. To calculate the number of characters shared as a prefix by
  172. the two given strings.
  173. PARAMETERS:
  174. pch1
  175. pch2
  176. RETURNS: length of the prefix
  177. COMMENTS:
  178. MODIFICATION DATES:
  179. 19-Jul-1993 [ralphw]
  180. ***************************************************************************/
  181. static INLINE int STDCALL CbPrefixPch2(PSTR pch1, PSTR pch2)
  182. {
  183. int cch;
  184. for (cch = 0; pch1[cch] == pch2[cch] && pch1[cch] != '\0'; cch++)
  185. ;
  186. if (cch > 1 && options.fDBCS) {
  187. /*
  188. * If what doesn't match is the second character of a DBCS word,
  189. * then we must back up so as not to include the DBCS lead byte.
  190. */
  191. if (IsFirstByte(pch1[cch - 1]))
  192. --cch;
  193. }
  194. return cch;
  195. }
  196. /*-----------------------------------------------------------------------
  197. | Name: club() |
  198. | Purpose:
  199. | Usage: |
  200. | Assumptions: We have sorted the phrases in input file. |
  201. | We only care about phrases saving more than |
  202. | cbSaveMin bytes. |
  203. | |
  204. | Method: |
  205. | We are looking for the phrases that save the most space.|
  206. | We could just look for multiple occurrences of a single phrase. |
  207. | (In fact, this was our old method). Unfortunately, this means |
  208. | that if we have two 43 character phrases that differ in the 42nd|
  209. | character, we are probably wasting space by not using the common|
  210. | prefix. The following algorithm attempts to deal with common |
  211. | prefixes in an intelligent way. |
  212. | A group of phrases that effectively share a common |
  213. | prefix is called a "club." As we scan through the input, we are|
  214. | looking for new members to add to the current club. For example|
  215. | the current club may share the prefix "It's faster to." When we|
  216. | get the string "It's faster not to," we must decide whether |
  217. | or not to accept it as a club member. Accepting it would mean |
  218. | that we must now lower our standards to use the prefix "It's |
  219. | faster" in order to increase membership. Rejecting a string |
  220. | causes us to close the club to future members (it's in the by- |
  221. | laws). |
  222. | |
  223. | Our membership decision is based on two criteria. Both |
  224. | the club and the candidate must profit from the transaction in |
  225. | order for us to grow the club. |
  226. | 1) The candidate must earn more compression by joining |
  227. | the club, and using only the prefix he shares |
  228. | with the club than he would earn by retaining |
  229. | his suffix but having to pay the overhead of |
  230. | being a separate keyphrase. |
  231. | 2) The profit earned by the club by growing its member- |
  232. | ship must outweigh the loss of any shortening |
  233. | of its prefix. |
  234. | |
  235. | Once we have closed membership to a club, we decide if |
  236. | the club saved enough to be worth keeping, and if so we add it |
  237. | to the phrase table. We then start the next club by having it consist |
  238. | solely of the candidate whose rejection closed the previous club|
  239. | Returns: count of phrases selected. |
  240. -----------------------------------------------------------------------*/
  241. const int cbClubMin = 1;
  242. static int STDCALL club(int cbSaveMin, CTable* ptbl, int* plcbFileSave)
  243. {
  244. CTable* pfileSave = ptbl;
  245. FR_INFO frClub; // Current club phrase
  246. FR_INFO frCand; // Current candidate for club
  247. int cClub; // count of clubs
  248. int cchT;
  249. int cocT;
  250. int cbSavingsT;
  251. pfileSave->Empty();
  252. frCand.sz = pszCandBuffer;
  253. frClub.sz = pszClubBuffer;
  254. //----------open-files-for-reading-and-writing-------------*/
  255. //---initialize-1st-club,-1st-Candidate,-and-variables-------*/
  256. frClub.sz[0] = '\0';
  257. frClub.cch = 0;
  258. frClub.coc = 0;
  259. frClub.cbSavings = 0;
  260. cClub = 0;
  261. lTotal = 0;
  262. pphrase->SetPosition();
  263. *plcbFileSave = 0;
  264. while (FGetNextCandidate(&frCand)) {
  265. // Get temporary values for combination of frCand and frClub
  266. cchT = CbPrefixPch2(frClub.sz, frCand.sz);
  267. cocT = frClub.coc + frCand.coc;
  268. cbSavingsT = CbSaveCchC(cchT, cocT);
  269. //---------IF-it-is-worthwhile-to-admit-candidate-----------*/
  270. if (cchT > 2
  271. && cbSavingsT > frCand.cbSavings + frClub.cbSavings + cbClubMin)
  272. {
  273. // Admit candidate into club:
  274. frClub.cch = cchT;
  275. frClub.coc = cocT;
  276. frClub.cbSavings = cbSavingsT;
  277. frClub.sz[cchT] = '\0';
  278. }
  279. else
  280. //---------ELSE-the-club-is-closed-------------------------*/
  281. {
  282. //---------IF-Club-saves-enuf-make-it-a-keyphrase---------*/
  283. if (frClub.cbSavings >= cbSaveMin) {
  284. SelectPhrase(frClub, pfileSave);
  285. // add 2 for CR/LF
  286. *plcbFileSave += strlen(frClub.sz) + 2;
  287. cClub++;
  288. ASSERT(cClub > 0);
  289. lTotal += frClub.cbSavings;
  290. }
  291. // Make candidate start of next club
  292. frClub = frCand;
  293. frClub.sz = pszClubBuffer; // Keep buffers separate
  294. strcpy(frClub.sz, frCand.sz);
  295. }
  296. } // End while (more candidates)
  297. // Check last club for worthwhile savings
  298. if (frClub.cbSavings >= cbSaveMin) {
  299. SelectPhrase(frClub, pfileSave);
  300. *plcbFileSave += strlen(frClub.sz);
  301. cClub++;
  302. lTotal += frClub.cbSavings;
  303. }
  304. //--------------------print-data---------------------*/
  305. #ifdef REPORT
  306. {
  307. wsprintf(szParentString,
  308. "DEBUG: cbSaveMin %d yields %s keyphrases, saving %s bytes\r\n",
  309. cbSaveMin, FormatNumber(cClub), FormatNumber(lTotal));
  310. SendStringToParent(szParentString);
  311. }
  312. #endif
  313. //---------------close-all-files---------------------------*/
  314. // Return number of phrases generated:
  315. return (cClub);
  316. }
  317. /***************************************************************************
  318. *
  319. - Name: DcbConverge
  320. -
  321. * Purpose:
  322. * This function is used to compute the delta to the cbSaveMin
  323. * parameter to get cInit to converge to cTarget. Lots of hocus pocus.
  324. *
  325. * Arguments:
  326. * cInit - Current count of phrases
  327. * cTarget - Limit that count of phrases violates
  328. * cbSize - Current cbSaveMin, used to scale size of delta
  329. *
  330. * Returns:
  331. * Value to be added to cbSaveMin for next club attempt. This value
  332. * will be negative if cbSaveMin should be reduced.
  333. *
  334. ***************************************************************************/
  335. static int FASTCALL DcbConverge(int cInit, int cTarget, int cbSize)
  336. {
  337. int dcb;
  338. dcb = ((cInit - cTarget) / 50);
  339. if (dcb == 0)
  340. dcb = (cInit > cTarget ? 1 : -1);
  341. return dcb * (1 + cbSize / 32);
  342. }
  343. /***************************************************************************
  344. FUNCTION: GenerateKeyPhrases
  345. PURPOSE:
  346. Generates a key phrase list from the candidate phrases, using
  347. the club method, and stores the selected phrases in the table
  348. ptbl via SelectPhrase(). The target phrase count is cMaxNKeyPh,
  349. or MAX_PHRASES when cMaxNKeyPh is CNIL; the starting savings
  350. threshold is CB_SAVEMIN_DEFAULT.
  351. Calls club() repeatedly with different values for cbSaveMin
  352. until club() returns an acceptable number of phrases and the
  353. estimated phrase file size is below MAX_PHRASE_FILE. NOTE: no
  354. MAX_LOOP iteration cap is applied in this version of the code.
  355. PARAMETERS:
  356. ptbl - table that receives the selected key phrases
  357. cMaxNKeyPh - maximum number of key phrases, or CNIL for the
  358. default (MAX_PHRASES)
  359. (szInput, szData and szOutput are no longer parameters)
  360. RETURNS: nothing
  361. COMMENTS:
  362. MODIFICATION DATES:
  363. 19-Jul-1993 [ralphw]
  364. ***************************************************************************/
  365. const int MIN_SAVINGS = 1;
  366. #define GRIND_TIME 4
  367. static INLINE void STDCALL GenerateKeyPhrases(CTable* ptbl,
  368. int cMaxNKeyPh)
  369. {
  370. int cPhrases;
  371. int cMinNKeyPh;
  372. int dcbSaveMin, dcbSaveMinOld;
  373. int lcbFile = 0;
  374. int cbSaveMin = CB_SAVEMIN_DEFAULT;
  375. if (cMaxNKeyPh == CNIL)
  376. cMaxNKeyPh = MAX_PHRASES;
  377. dcbSaveMin = 0;
  378. cMinNKeyPh = cMaxNKeyPh - (cMaxNKeyPh >> 4);
  379. cGrind = 0;
  380. /*
  381. * While we don't have the right amount of phrases, or while the
  382. * output file is to big, we will do this loop and adjust the phrase
  383. * file.
  384. */
  385. while ((cPhrases = club(cbSaveMin, ptbl, &lcbFile))
  386. > cMaxNKeyPh || cPhrases < cMinNKeyPh || lcbFile >= MAX_PHRASE_FILE) {
  387. if (++cGrind == GRIND_TIME) {
  388. doGrind();
  389. cGrind = 0;
  390. }
  391. dcbSaveMinOld = dcbSaveMin;
  392. // Check for output file > 64K -- WinHelp limitation
  393. if (lcbFile >= MAX_PHRASE_FILE && cPhrases <= cMaxNKeyPh) {
  394. ASSERT(cMaxNKeyPh > ((cPhrases * MAX_PHRASE_FILE) / lcbFile));
  395. cMaxNKeyPh = (cPhrases * MAX_PHRASE_FILE) / lcbFile;
  396. cMinNKeyPh = cMaxNKeyPh - (cMaxNKeyPh >> 4);
  397. dcbSaveMin = DcbConverge(cPhrases, cMaxNKeyPh, cbSaveMin);
  398. cbSaveMin += dcbSaveMin;
  399. continue;
  400. }
  401. if (cPhrases <= cMaxNKeyPh && cPhrases >= cMinNKeyPh)
  402. break;
  403. // Can't go beyond 1,1:
  404. if (cbSaveMin == MIN_SAVINGS && cPhrases < cMinNKeyPh)
  405. break;
  406. // Adjust cbSaveMin
  407. if (cPhrases > cMaxNKeyPh) {
  408. dcbSaveMin = DcbConverge(cPhrases, cMaxNKeyPh, cbSaveMin);
  409. ASSERT(dcbSaveMin > 0);
  410. }
  411. else {
  412. ASSERT(cPhrases < cMinNKeyPh);
  413. dcbSaveMin = DcbConverge(cPhrases, cMinNKeyPh, cbSaveMin);
  414. ASSERT(dcbSaveMin < 0);
  415. // If we are not converging, then break
  416. if (dcbSaveMinOld > 0 && (-dcbSaveMin >= dcbSaveMinOld))
  417. break;
  418. }
  419. cbSaveMin = max(cbSaveMin + dcbSaveMin, MIN_SAVINGS);
  420. }
  421. if (cbSaveMin == 1)
  422. return; // Can't do any better then this
  423. // Try to squeeze a bit more out of the phrase file. This will increase
  424. // compile time, but can generate a bit better compression.
  425. #ifdef _DEBUG
  426. int cOrgTotal = lTotal;
  427. #endif
  428. int cBestTotal = lTotal;
  429. int cbBestSaveMin = cbSaveMin;
  430. while (lcbFile < MAX_PHRASE_FILE && cbSaveMin > MIN_SAVINGS) {
  431. if (club(--cbSaveMin, ptbl, &lcbFile) >= MAX_PHRASES)
  432. break;
  433. if (++cGrind == GRIND_TIME) {
  434. doGrind();
  435. cGrind = 0;
  436. }
  437. if (lcbFile < MAX_PHRASE_FILE && lTotal > cBestTotal) {
  438. // Keep track of best compression to date
  439. cBestTotal = lTotal;
  440. cbBestSaveMin = cbSaveMin;
  441. }
  442. else
  443. break;
  444. }
  445. club(cbBestSaveMin, ptbl, &lcbFile);
  446. }
  447. /***************************************************************************
  448. FUNCTION: SelectPhrase
  449. PURPOSE:
  450. This function puts the given phrase into the list of phrases
  451. to suppress. The ptbl argument is the table to save the
  452. phrases in; a phrase already in the table is not added again.
  453. First, one trailing space is removed, if present, from the
  454. end of the string. This is because, in the phrase
  455. replacement algorithm, phrases from the list with trailing
  456. spaces added are available for free.
  457. PARAMETERS:
  458. fr - the phrase to add; its string may be shortened in place
  459. ptbl - table that receives the phrase
  460. RETURNS: nothing
  461. COMMENTS:
  462. MODIFICATION DATES:
  463. 19-Jul-1993 [ralphw]
  464. ***************************************************************************/
  465. INLINE static void STDCALL SelectPhrase(FR_INFO fr, CTable* ptbl)
  466. {
  467. /*
  468. * If phrase contains a trailing space, we remove it, but only if the
  469. * resulting phrase is longer than 2 characters.
  470. */
  471. if (fr.sz[fr.cch - 1] == ' ' && fr.cch > 3)
  472. fr.sz[--fr.cch] = '\0';
  473. if (!ptbl->IsCSStringInTable(fr.sz))
  474. ptbl->AddString(fr.sz);
  475. }