Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1981 lines
46 KiB

  1. #include "stdafx.h"
  2. #pragma hdrstop
  3. /*
  4. * jchuff.c
  5. *
  6. * Copyright (C) 1991-1996, Thomas G. Lane.
  7. * This file is part of the Independent JPEG Group's software.
  8. * For conditions of distribution and use, see the accompanying README file.
  9. *
  10. * This file contains Huffman entropy encoding routines.
  11. *
  12. * Much of the complexity here has to do with supporting output suspension.
  13. * If the data destination module demands suspension, we want to be able to
  14. * back up to the start of the current MCU. To do this, we copy state
  15. * variables into local working storage, and update them back to the
  16. * permanent JPEG objects only upon successful completion of an MCU.
  17. */
  18. // MMx Optimisation disabled 5/29/97 Gromit Bug 4375 -Tiling error. - ajais.
  19. #pragma warning( disable : 4799 )
  20. #define JPEG_INTERNALS
  21. #include "jinclude.h"
  22. #include "jpeglib.h"
  23. #include "jchuff.h" /* Declarations shared with jcphuff.c */
  24. /* Expanded entropy encoder object for Huffman encoding.
  25. *
  26. * The savable_state subrecord contains fields that change within an MCU,
  27. * but must not be updated permanently until we complete the MCU.
  28. */
  29. typedef struct
  30. {
  31. __int64 put_buffer_64; //mmx bit-accumulation buffer
  32. INT32 put_buffer; /* current bit-accumulation buffer */
  33. int put_bits; /* # of bits now in it */
  34. int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
  35. } savable_state;
  /*
   * ASSIGN_STATE(dest, src) copies the per-component DC predictions
   * (last_dc_val[]) from src to dest.  Both arguments are struct lvalues that
   * have a last_dc_val[MAX_COMPS_IN_SCAN] member.
   *
   * The bit-accumulator fields (put_buffer, put_buffer_64, put_bits) are
   * deliberately NOT copied by this macro: they are implementation dependent
   * (C path vs. MMX path), so callers transfer them explicitly.
   *
   * A plain structure assignment is avoided to stay compatible with compilers
   * that have missing or broken structure assignment.  For the usual
   * MAX_COMPS_IN_SCAN == 4 the copy is written out element by element; for any
   * other value the whole array is copied with MEMCOPY, so the macro is always
   * defined.
   */
  #if MAX_COMPS_IN_SCAN == 4
  #define ASSIGN_STATE(dest,src) \
    ((dest).last_dc_val[0] = (src).last_dc_val[0], \
     (dest).last_dc_val[1] = (src).last_dc_val[1], \
     (dest).last_dc_val[2] = (src).last_dc_val[2], \
     (dest).last_dc_val[3] = (src).last_dc_val[3])
  #else
  #define ASSIGN_STATE(dest,src) \
    MEMCOPY((dest).last_dc_val, (src).last_dc_val, SIZEOF((dest).last_dc_val))
  #endif
  54. typedef struct
  55. {
  56. struct jpeg_entropy_encoder pub; /* public fields */
  57. savable_state saved; /* Bit buffer & DC state at start of MCU */
  58. /* These fields are NOT loaded into local working state. */
  59. unsigned int restarts_to_go; /* MCUs left in this restart interval */
  60. int next_restart_num; /* next restart number to write (0-7) */
  61. /* Pointers to derived tables (these workspaces have image lifespan) */
  62. c_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
  63. c_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
  64. #ifdef ENTROPY_OPT_SUPPORTED /* Statistics tables for optimization */
  65. long * dc_count_ptrs[NUM_HUFF_TBLS];
  66. long * ac_count_ptrs[NUM_HUFF_TBLS];
  67. #endif
  68. } huff_entropy_encoder;
  69. typedef huff_entropy_encoder * huff_entropy_ptr;
  70. /* Working state while writing an MCU.
  71. * This struct contains all the fields that are needed by subroutines.
  72. */
  73. typedef struct
  74. {
  75. // make the next two variables global for easy access in mmx version
  76. // JOCTET * next_output_byte; /* => next byte to write in buffer */
  77. // size_t free_in_buffer; /* # of byte spaces remaining in buffer */
  78. // savable_state cur; /* Current bit buffer & DC state */
  79. // flatten (instantiate) savable state here
  80. __int64 put_buffer_64; // mmx bit accumulation buffer
  81. INT32 put_buffer; /* current bit-accumulation buffer */
  82. int put_bits; /* # of bits now in it */
  83. int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
  84. j_compress_ptr cinfo; /* dump_buffer needs access to this */
  85. } working_state;
  86. //global vaiables
  87. __int64 put_buffer_64;
  88. // INT32 put_buffer;
  89. int put_bits;
  90. JOCTET * next_output_byte; /* => next byte to write in buffer */
  91. size_t free_in_buffer; /* # of byte spaces remaining in buffer */
  92. boolean mmx_cpu=1;
  93. /* Forward declarations */
  94. METHODDEF(boolean) encode_mcu_huff JPP((j_compress_ptr cinfo,
  95. JBLOCKROW *MCU_data));
  96. METHODDEF(void) finish_pass_huff JPP((j_compress_ptr cinfo));
  97. #ifdef ENTROPY_OPT_SUPPORTED
  98. METHODDEF(boolean) encode_mcu_gather JPP((j_compress_ptr cinfo,
  99. JBLOCKROW *MCU_data));
  100. METHODDEF(void) finish_pass_gather JPP((j_compress_ptr cinfo));
  101. #endif
  102. void countZeros(int *indexBlock,short *coefBlock,short *outBlock,int *lastZeros,int *numElements);
  103. boolean emit_bits_fast (working_state * state, unsigned int code, int bsize, int only1);
  104. //extern boolean emit_bits (working_state * state, unsigned int code, int size);
  105. boolean encode_one_block_fast (working_state * state, JCOEFPTR block, int last_dc_val,
  106. c_derived_tbl *dctbl, c_derived_tbl *actbl);
  107. //extern boolean encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
  108. // c_derived_tbl *dctbl, c_derived_tbl *actbl);
  109. /*
  110. * Initialize for a Huffman-compressed scan.
  111. * If gather_statistics is TRUE, we do not output anything during the scan,
  112. * just count the Huffman symbols used and generate Huffman code tables.
  113. */
  114. METHODDEF(void)
  115. start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
  116. {
  117. huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
  118. int ci, dctbl, actbl;
  119. jpeg_component_info * compptr;
  120. if (gather_statistics) {
  121. #ifdef ENTROPY_OPT_SUPPORTED
  122. entropy->pub.encode_mcu = encode_mcu_gather;
  123. entropy->pub.finish_pass = finish_pass_gather;
  124. #else
  125. ERREXIT(cinfo, JERR_NOT_COMPILED);
  126. #endif
  127. } else {
  128. entropy->pub.encode_mcu = encode_mcu_huff;
  129. entropy->pub.finish_pass = finish_pass_huff;
  130. }
  131. for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
  132. compptr = cinfo->cur_comp_info[ci];
  133. dctbl = compptr->dc_tbl_no;
  134. actbl = compptr->ac_tbl_no;
  135. /* Make sure requested tables are present */
  136. /* (In gather mode, tables need not be allocated yet) */
  137. if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS ||
  138. (cinfo->dc_huff_tbl_ptrs[dctbl] == NULL && !gather_statistics))
  139. ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl);
  140. if (actbl < 0 || actbl >= NUM_HUFF_TBLS ||
  141. (cinfo->ac_huff_tbl_ptrs[actbl] == NULL && !gather_statistics))
  142. ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl);
  143. if (gather_statistics) {
  144. #ifdef ENTROPY_OPT_SUPPORTED
  145. /* Allocate and zero the statistics tables */
  146. /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
  147. if (entropy->dc_count_ptrs[dctbl] == NULL)
  148. entropy->dc_count_ptrs[dctbl] = (long *)
  149. (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
  150. 257 * SIZEOF(long));
  151. MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * SIZEOF(long));
  152. if (entropy->ac_count_ptrs[actbl] == NULL)
  153. entropy->ac_count_ptrs[actbl] = (long *)
  154. (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
  155. 257 * SIZEOF(long));
  156. MEMZERO(entropy->ac_count_ptrs[actbl], 257 * SIZEOF(long));
  157. #endif
  158. } else {
  159. /* Compute derived values for Huffman tables */
  160. /* We may do this more than once for a table, but it's not expensive */
  161. jpeg_make_c_derived_tbl(cinfo, cinfo->dc_huff_tbl_ptrs[dctbl],
  162. & entropy->dc_derived_tbls[dctbl]);
  163. jpeg_make_c_derived_tbl(cinfo, cinfo->ac_huff_tbl_ptrs[actbl],
  164. & entropy->ac_derived_tbls[actbl]);
  165. }
  166. /* Initialize DC predictions to 0 */
  167. entropy->saved.last_dc_val[ci] = 0;
  168. }
  169. /* Initialize bit buffer to empty */
  170. entropy->saved.put_buffer_64 = 0;
  171. entropy->saved.put_buffer = 0;
  172. entropy->saved.put_bits = 0;
  173. /* Initialize restart stuff */
  174. entropy->restarts_to_go = cinfo->restart_interval;
  175. entropy->next_restart_num = 0;
  176. }
  177. /*
  178. * Compute the derived values for a Huffman table.
  179. * Note this is also used by jcphuff.c.
  180. */
  181. GLOBAL(void)
  182. jpeg_make_c_derived_tbl (j_compress_ptr cinfo, JHUFF_TBL * htbl,
  183. c_derived_tbl ** pdtbl)
  184. {
  185. c_derived_tbl *dtbl;
  186. int p, i, l, lastp, si;
  187. char huffsize[257];
  188. unsigned int huffcode[257];
  189. unsigned int code;
  190. /* Allocate a workspace if we haven't already done so. */
  191. if (*pdtbl == NULL)
  192. *pdtbl = (c_derived_tbl *)
  193. (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
  194. SIZEOF(c_derived_tbl));
  195. dtbl = *pdtbl;
  196. /* Figure C.1: make table of Huffman code length for each symbol */
  197. /* Note that this is in code-length order. */
  198. p = 0;
  199. for (l = 1; l <= 16; l++) {
  200. for (i = 1; i <= (int) htbl->bits[l]; i++)
  201. huffsize[p++] = (char) l;
  202. }
  203. huffsize[p] = 0;
  204. lastp = p;
  205. /* Figure C.2: generate the codes themselves */
  206. /* Note that this is in code-length order. */
  207. code = 0;
  208. si = huffsize[0];
  209. p = 0;
  210. while (huffsize[p]) {
  211. while (((int) huffsize[p]) == si) {
  212. huffcode[p++] = code;
  213. code++;
  214. }
  215. code <<= 1;
  216. si++;
  217. }
  218. /* Figure C.3: generate encoding tables */
  219. /* These are code and size indexed by symbol value */
  220. /* Set any codeless symbols to have code length 0;
  221. * this allows emit_bits to detect any attempt to emit such symbols.
  222. */
  223. MEMZERO(dtbl->ehufsi, SIZEOF(dtbl->ehufsi));
  224. for (p = 0; p < lastp; p++) {
  225. dtbl->ehufco[htbl->huffval[p]] = huffcode[p];
  226. dtbl->ehufsi[htbl->huffval[p]] = huffsize[p];
  227. }
  228. }
  /* Outputting bytes to the file */

  /*
   * emit_byte(state, val, action): store one byte at the global
   * next_output_byte and count it against the global free_in_buffer.  When
   * that fills the buffer, dump_buffer(state) is called; if it reports that
   * the destination must suspend, 'action' is executed (callers pass
   * "return FALSE").
   */
  #define emit_byte(state,val,action) \
    { *next_output_byte++ = (JOCTET) (val); \
      if (--free_in_buffer == 0 && ! dump_buffer(state)) \
        { action; } }
  236. GLOBAL(boolean)
  237. dump_buffer (working_state * state)
  238. /* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
  239. {
  240. struct jpeg_destination_mgr * dest = state->cinfo->dest;
  241. if (! (*dest->empty_output_buffer) (state->cinfo))
  242. return FALSE;
  243. /* After a successful buffer dump, must reset buffer pointers */
  244. next_output_byte = dest->next_output_byte;
  245. free_in_buffer = dest->free_in_buffer;
  246. return TRUE;
  247. }
  248. /* Outputting bits to the file */
  249. /* Only the right 24 bits of put_buffer are used; the valid bits are
  250. * left-justified in this part. At most 16 bits can be passed to emit_bits
  251. * in one call, and we never retain more than 7 bits in put_buffer
  252. * between calls, so 24 bits are sufficient.
  253. */
  254. //INLINE
  255. LOCAL(boolean)
  256. emit_bits (working_state * state, unsigned int code, int size)
  257. /* Emit some bits; return TRUE if successful, FALSE if must suspend */
  258. {
  259. /* This routine is heavily used, so it's worth coding tightly. */
  260. register INT32 put_buffer = (INT32) code;
  261. register int put_bits = state->put_bits;
  262. /* if size is 0, caller used an invalid Huffman table entry */
  263. if (size == 0)
  264. ERREXIT(state->cinfo, JERR_HUFF_MISSING_CODE);
  265. put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
  266. put_bits += size; /* new number of bits in buffer */
  267. put_buffer <<= 24 - put_bits; /* align incoming bits */
  268. put_buffer |= state->put_buffer; /* and merge with old buffer contents */
  269. while (put_bits >= 8) {
  270. int c = (int) ((put_buffer >> 16) & 0xFF);
  271. emit_byte(state, c, return FALSE);
  272. if (c == 0xFF) { /* need to stuff a zero byte? */
  273. emit_byte(state, 0, return FALSE);
  274. }
  275. put_buffer <<= 8;
  276. put_bits -= 8;
  277. }
  278. state->put_buffer = put_buffer; /* update state variables */
  279. state->put_bits = put_bits;
  280. return TRUE;
  281. }
  /*
   * flush_bit_buffer_64 -- MMX helper that writes out whatever is pending in
   * the global 64-bit bit accumulator.
   *
   * (Original author's note, MJB: the routine was adapted from another one and
   * is only reached from flush_bits, which is called infrequently, so its
   * overhead is not a concern.)
   *
   * Globals read:    put_bits, put_buffer_64, next_output_byte
   * Globals written: next_output_byte, free_in_buffer, put_bits, put_buffer_64
   *
   * If put_bits is zero nothing is written.  Otherwise the accumulator is
   * drained from its most significant byte downward, one byte for every 8
   * pending bits (rounded up).  A 0x00 byte is stored after each 0xFF byte.
   * free_in_buffer is decremented once per byte stored, and afterwards
   * put_bits and put_buffer_64 are cleared.
   *
   * This routine never tests free_in_buffer for zero and never calls
   * dump_buffer.  It does not execute EMMS either (the instruction is
   * commented out below).
   *
   * Registers used: eax = output pointer, ebx = bits left, ecx = current
   * byte, mm0 = accumulator, mm1 = scratch copy, mm2 = zero.
   *
   * x86 only; an #ifdef still needs to be added for the Alpha port.
   */
  #if defined (_X86_)
  void flush_bit_buffer_64()
  {
    __asm {
          mov     ebx, [put_bits]             // ebx = number of pending bits
          mov     eax, [next_output_byte]     // eax = output cursor
          test    ebx, ebx
          je      store_cursor                // nothing pending

          movq    mm0, [put_buffer_64]        // mm0 = accumulator
          pxor    mm2, mm2                    // mm2 = 0, used to clear it later

    next_byte:
          movq    mm1, mm0                    // keep a copy of the accumulator
          psrlq   mm0, 56                     // move the top byte to the bottom
          movd    ecx, mm0                    // ecx = byte to write
          movq    mm0, mm1                    // restore the accumulator
          mov     byte ptr [eax], cl
          inc     eax
          cmp     ecx, 0xFF
          jne     byte_counted
          mov     byte ptr [eax], 0x00        // stuff a zero after 0xFF
          dec     [free_in_buffer]            // account for the stuffed byte
          inc     eax
          nop

    byte_counted:
          dec     [free_in_buffer]            // account for the data byte
          psllq   mm0, 8                      // bring up the next byte
          sub     ebx, 8
          jg      next_byte                   // loop while bits remain

          mov     [put_bits], 0
          //mov   [eb_ptr], eax
          movq    [put_buffer_64], mm2        // accumulator = 0
          //emms

    store_cursor:
          mov     [next_output_byte], eax
    }
  }
  #endif /* defined (_X86_) */
  326. LOCAL(boolean)
  327. flush_bits (working_state * state)
  328. {
  329. //
  330. // Need to add #ifdef for Alpha port
  331. //
  332. #if defined (_X86_)
  333. if (0)//vfMMXMachine)
  334. {
  335. //if (! emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */
  336. if (! emit_bits_fast(state, 0x7F, 7, 1)) /* fill any partial byte with ones */
  337. return FALSE;
  338. if (put_bits)
  339. {
  340. flush_bit_buffer_64(); // New stuff to write the last, few bits . . .
  341. }
  342. }
  343. else
  344. #endif
  345. {
  346. if (! emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */
  347. return FALSE;
  348. }
  349. state->put_buffer_64 = 0; /* and reset bit-buffer to empty */
  350. state->put_buffer = 0; /* and reset bit-buffer to empty */
  351. state->put_bits = 0;
  352. return TRUE;
  353. }
  354. /* Encode a single block's worth of coefficients */
  355. LOCAL(boolean)
  356. encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
  357. c_derived_tbl *dctbl, c_derived_tbl *actbl)
  358. {
  359. register int temp, temp2;
  360. register int nbits;
  361. register int k, r, i;
  362. /* Encode the DC coefficient difference per section F.1.2.1 */
  363. temp = temp2 = block[0] - last_dc_val;
  364. if (temp < 0)
  365. {
  366. temp = -temp; /* temp is abs value of input */
  367. /* For a negative input, want temp2 = bitwise complement of abs(input) */
  368. /* This code assumes we are on a two's complement machine */
  369. temp2--;
  370. }
  371. /* Find the number of bits needed for the magnitude of the coefficient */
  372. nbits = 0;
  373. while (temp)
  374. {
  375. nbits++;
  376. temp >>= 1;
  377. }
  378. /* Emit the Huffman-coded symbol for the number of bits */
  379. if (! emit_bits(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
  380. return FALSE;
  381. /* Emit that number of bits of the value, if positive, */
  382. /* or the complement of its magnitude, if negative. */
  383. if (nbits) /* emit_bits rejects calls with size 0 */
  384. if (! emit_bits(state, (unsigned int) temp2, nbits))
  385. return FALSE;
  386. /* Encode the AC coefficients per section F.1.2.2 */
  387. r = 0; /* r = run length of zeros */
  388. for (k = 1; k < DCTSIZE2; k++)
  389. {
  390. if ((temp = block[jpeg_natural_order[k]]) == 0)
  391. {
  392. r++;
  393. }
  394. else
  395. {
  396. /* if run length > 15, must emit special run-length-16 codes (0xF0) */
  397. while (r > 15)
  398. {
  399. if (! emit_bits(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
  400. return FALSE;
  401. r -= 16;
  402. }
  403. temp2 = temp;
  404. if (temp < 0)
  405. {
  406. temp = -temp; /* temp is abs value of input */
  407. /* This code assumes we are on a two's complement machine */
  408. temp2--;
  409. }
  410. /* Find the number of bits needed for the magnitude of the coefficient */
  411. nbits = 1; /* there must be at least one 1 bit */
  412. while ((temp >>= 1))
  413. nbits++;
  414. /* Emit Huffman symbol for run length / number of bits */
  415. i = (r << 4) + nbits;
  416. if (! emit_bits(state, actbl->ehufco[i], actbl->ehufsi[i]))
  417. return FALSE;
  418. /* Emit that number of bits of the value, if positive, */
  419. /* or the complement of its magnitude, if negative. */
  420. if (! emit_bits(state, (unsigned int) temp2, nbits))
  421. return FALSE;
  422. r = 0;
  423. }
  424. }
  425. /* If the last coef(s) were zero, emit an end-of-block code */
  426. if (r > 0)
  427. if (! emit_bits(state, actbl->ehufco[0], actbl->ehufsi[0]))
  428. return FALSE;
  429. return TRUE;
  430. }
  431. /*
  432. * Emit a restart marker & resynchronize predictions.
  433. */
  434. LOCAL(boolean)
  435. emit_restart (working_state * state, int restart_num)
  436. {
  437. int ci;
  438. if (! flush_bits(state))
  439. return FALSE;
  440. emit_byte(state, 0xFF, return FALSE);
  441. emit_byte(state, JPEG_RST0 + restart_num, return FALSE);
  442. /* Re-initialize DC predictions to 0 */
  443. for (ci = 0; ci < state->cinfo->comps_in_scan; ci++)
  444. state->last_dc_val[ci] = 0;
  445. /* The restart counter is not updated until we successfully write the MCU. */
  446. return TRUE;
  447. }
  448. /*
  449. * Encode and output one MCU's worth of Huffman-compressed coefficients.
  450. */
  451. METHODDEF(boolean)
  452. encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  453. {
  454. huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
  455. working_state state;
  456. int blkn, ci;
  457. jpeg_component_info * compptr;
  458. /* Load up working state */
  459. next_output_byte = cinfo->dest->next_output_byte;
  460. free_in_buffer = cinfo->dest->free_in_buffer;
  461. if (0)//vfMMXMachine)
  462. {
  463. state.put_buffer_64=entropy->saved.put_buffer_64;
  464. }
  465. else
  466. {
  467. state.put_buffer=entropy->saved.put_buffer;
  468. }
  469. state.put_bits=entropy->saved.put_bits;
  470. ASSIGN_STATE(state, entropy->saved);
  471. state.cinfo = cinfo;
  472. /* Emit restart marker if needed */
  473. if (cinfo->restart_interval)
  474. {
  475. if (entropy->restarts_to_go == 0)
  476. if (! emit_restart(&state, entropy->next_restart_num))
  477. return FALSE;
  478. }
  479. /* Encode the MCU data blocks */
  480. for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++)
  481. {
  482. ci = cinfo->MCU_membership[blkn];
  483. compptr = cinfo->cur_comp_info[ci];
  484. //
  485. // Need to add #ifdef for Alpha port
  486. //
  487. #if defined (_X86_)
  488. if (0)//vfMMXMachine)
  489. {
  490. if (! encode_one_block_fast(&state,
  491. MCU_data[blkn][0], state.last_dc_val[ci],
  492. entropy->dc_derived_tbls[compptr->dc_tbl_no],
  493. entropy->ac_derived_tbls[compptr->ac_tbl_no]))
  494. return FALSE;
  495. }
  496. else
  497. #endif
  498. {
  499. if (! encode_one_block(&state,
  500. MCU_data[blkn][0], state.last_dc_val[ci],
  501. entropy->dc_derived_tbls[compptr->dc_tbl_no],
  502. entropy->ac_derived_tbls[compptr->ac_tbl_no]))
  503. return FALSE;
  504. }
  505. /* Update last_dc_val */
  506. state.last_dc_val[ci] = MCU_data[blkn][0][0];
  507. }
  508. /* Completed MCU, so update state */
  509. cinfo->dest->next_output_byte = next_output_byte;
  510. cinfo->dest->free_in_buffer = free_in_buffer;
  511. if (0)//vfMMXMachine)
  512. {
  513. entropy->saved.put_buffer_64=state.put_buffer_64;
  514. }
  515. else
  516. {
  517. entropy->saved.put_buffer=state.put_buffer;
  518. }
  519. entropy->saved.put_bits=state.put_bits;
  520. ASSIGN_STATE(entropy->saved, state);
  521. /* Update restart-interval state too */
  522. if (cinfo->restart_interval)
  523. {
  524. if (entropy->restarts_to_go == 0)
  525. {
  526. entropy->restarts_to_go = cinfo->restart_interval;
  527. entropy->next_restart_num++;
  528. entropy->next_restart_num &= 7;
  529. }
  530. entropy->restarts_to_go--;
  531. }
  532. //
  533. // Need to add #ifdef for Alpha port
  534. //
  535. #if defined (_X86_)
  536. if (0)//vfMMXMachine)
  537. {
  538. __asm emms
  539. }
  540. #endif
  541. return TRUE;
  542. }
  543. /*
  544. * Finish up at the end of a Huffman-compressed scan.
  545. */
  546. METHODDEF(void)
  547. finish_pass_huff (j_compress_ptr cinfo)
  548. {
  549. huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
  550. working_state state;
  551. /* Load up working state ... flush_bits needs it */
  552. next_output_byte = cinfo->dest->next_output_byte;
  553. free_in_buffer = cinfo->dest->free_in_buffer;
  554. if (0)//vfMMXMachine)
  555. {
  556. state.put_buffer_64=entropy->saved.put_buffer_64;
  557. }
  558. else
  559. {
  560. state.put_buffer=entropy->saved.put_buffer;
  561. }
  562. state.put_bits=entropy->saved.put_bits;
  563. ASSIGN_STATE(state, entropy->saved);
  564. state.cinfo = cinfo;
  565. /* Flush out the last data */
  566. if (!flush_bits(&state))
  567. ERREXIT(cinfo, JERR_CANT_SUSPEND);
  568. /* Update state */
  569. cinfo->dest->next_output_byte = next_output_byte;
  570. cinfo->dest->free_in_buffer = free_in_buffer;
  571. if (0)//vfMMXMachine)
  572. {
  573. entropy->saved.put_buffer_64=state.put_buffer_64;
  574. }
  575. else
  576. {
  577. entropy->saved.put_buffer=state.put_buffer;
  578. }
  579. entropy->saved.put_bits=state.put_bits;
  580. ASSIGN_STATE(entropy->saved, state);
  581. //
  582. // Need to add #ifdef for Alpha port
  583. //
  584. #if defined (_X86_)
  585. if (0)//vfMMXMachine)
  586. {
  587. __asm emms
  588. }
  589. #endif
  590. }
  591. /*
  592. * Huffman coding optimization.
  593. *
  594. * This actually is optimization, in the sense that we find the best possible
  595. * Huffman table(s) for the given data. We first scan the supplied data and
  596. * count the number of uses of each symbol that is to be Huffman-coded.
  597. * (This process must agree with the code above.) Then we build an
  598. * optimal Huffman coding tree for the observed counts.
  599. *
  600. * The JPEG standard requires Huffman codes to be no more than 16 bits long.
  601. * If some symbols have a very small but nonzero probability, the Huffman tree
  602. * must be adjusted to meet the code length restriction. We currently use
  603. * the adjustment method suggested in the JPEG spec. This method is *not*
  604. * optimal; it may not choose the best possible limited-length code. But
  605. * since the symbols involved are infrequently used, it's not clear that
  606. * going to extra trouble is worthwhile.
  607. */
  608. #ifdef ENTROPY_OPT_SUPPORTED
  609. /* Process a single block's worth of coefficients */
  610. LOCAL(void)
  611. htest_one_block (JCOEFPTR block, int last_dc_val,
  612. long dc_counts[], long ac_counts[])
  613. {
  614. register int temp;
  615. register int nbits;
  616. register int k, r;
  617. /* Encode the DC coefficient difference per section F.1.2.1 */
  618. temp = block[0] - last_dc_val;
  619. if (temp < 0)
  620. temp = -temp;
  621. /* Find the number of bits needed for the magnitude of the coefficient */
  622. nbits = 0;
  623. while (temp)
  624. {
  625. nbits++;
  626. temp >>= 1;
  627. }
  628. /* Count the Huffman symbol for the number of bits */
  629. dc_counts[nbits]++;
  630. /* Encode the AC coefficients per section F.1.2.2 */
  631. r = 0; /* r = run length of zeros */
  632. for (k = 1; k < DCTSIZE2; k++)
  633. {
  634. if ((temp = block[jpeg_natural_order[k]]) == 0)
  635. {
  636. r++;
  637. }
  638. else
  639. {
  640. /* if run length > 15, must emit special run-length-16 codes (0xF0) */
  641. while (r > 15)
  642. {
  643. ac_counts[0xF0]++;
  644. r -= 16;
  645. }
  646. /* Find the number of bits needed for the magnitude of the coefficient */
  647. if (temp < 0) temp = -temp;
  648. /* Find the number of bits needed for the magnitude of the coefficient */
  649. nbits = 1; /* there must be at least one 1 bit */
  650. while ((temp >>= 1)) nbits++;
  651. /* Count Huffman symbol for run length / number of bits */
  652. ac_counts[(r << 4) + nbits]++;
  653. r = 0;
  654. }
  655. }
  656. /* If the last coef(s) were zero, emit an end-of-block code */
  657. if (r > 0)
  658. ac_counts[0]++;
  659. }
  660. /*
  661. * Trial-encode one MCU's worth of Huffman-compressed coefficients.
  662. * No data is actually output, so no suspension return is possible.
  663. */
  664. METHODDEF(boolean)
  665. encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  666. {
  667. huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
  668. int blkn, ci;
  669. jpeg_component_info * compptr;
  670. /* Take care of restart intervals if needed */
  671. if (cinfo->restart_interval) {
  672. if (entropy->restarts_to_go == 0) {
  673. /* Re-initialize DC predictions to 0 */
  674. for (ci = 0; ci < cinfo->comps_in_scan; ci++)
  675. entropy->saved.last_dc_val[ci] = 0;
  676. /* Update restart state */
  677. entropy->restarts_to_go = cinfo->restart_interval;
  678. }
  679. entropy->restarts_to_go--;
  680. }
  681. for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
  682. ci = cinfo->MCU_membership[blkn];
  683. compptr = cinfo->cur_comp_info[ci];
  684. htest_one_block(MCU_data[blkn][0], entropy->saved.last_dc_val[ci],
  685. entropy->dc_count_ptrs[compptr->dc_tbl_no],
  686. entropy->ac_count_ptrs[compptr->ac_tbl_no]);
  687. entropy->saved.last_dc_val[ci] = MCU_data[blkn][0][0];
  688. }
  689. return TRUE;
  690. }
  691. /*
  692. * Generate the optimal coding for the given counts, fill htbl.
  693. * Note this is also used by jcphuff.c.
  694. */
  695. GLOBAL(void)
  696. jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[])
  697. {
  698. #define MAX_CLEN 32 /* assumed maximum initial code length */
  699. UINT8 bits[MAX_CLEN+1]; /* bits[k] = # of symbols with code length k */
  700. int codesize[257]; /* codesize[k] = code length of symbol k */
  701. int others[257]; /* next symbol in current branch of tree */
  702. int c1, c2;
  703. int p, i, j;
  704. long v;
  705. /* This algorithm is explained in section K.2 of the JPEG standard */
  706. MEMZERO(bits, SIZEOF(bits));
  707. MEMZERO(codesize, SIZEOF(codesize));
  708. for (i = 0; i < 257; i++)
  709. others[i] = -1; /* init links to empty */
  710. freq[256] = 1; /* make sure there is a nonzero count */
  711. /* Including the pseudo-symbol 256 in the Huffman procedure guarantees
  712. * that no real symbol is given code-value of all ones, because 256
  713. * will be placed in the largest codeword category.
  714. */
  715. /* Huffman's basic algorithm to assign optimal code lengths to symbols */
  716. for (;;) {
  717. /* Find the smallest nonzero frequency, set c1 = its symbol */
  718. /* In case of ties, take the larger symbol number */
  719. c1 = -1;
  720. v = 1000000000L;
  721. for (i = 0; i <= 256; i++) {
  722. if (freq[i] && freq[i] <= v) {
  723. v = freq[i];
  724. c1 = i;
  725. }
  726. }
  727. /* Find the next smallest nonzero frequency, set c2 = its symbol */
  728. /* In case of ties, take the larger symbol number */
  729. c2 = -1;
  730. v = 1000000000L;
  731. for (i = 0; i <= 256; i++) {
  732. if (freq[i] && freq[i] <= v && i != c1) {
  733. v = freq[i];
  734. c2 = i;
  735. }
  736. }
  737. /* Done if we've merged everything into one frequency */
  738. if (c2 < 0)
  739. break;
  740. /* Else merge the two counts/trees */
  741. freq[c1] += freq[c2];
  742. freq[c2] = 0;
  743. /* Increment the codesize of everything in c1's tree branch */
  744. codesize[c1]++;
  745. while (others[c1] >= 0) {
  746. c1 = others[c1];
  747. codesize[c1]++;
  748. }
  749. others[c1] = c2; /* chain c2 onto c1's tree branch */
  750. /* Increment the codesize of everything in c2's tree branch */
  751. codesize[c2]++;
  752. while (others[c2] >= 0) {
  753. c2 = others[c2];
  754. codesize[c2]++;
  755. }
  756. }
  757. /* Now count the number of symbols of each code length */
  758. for (i = 0; i <= 256; i++) {
  759. if (codesize[i]) {
  760. /* The JPEG standard seems to think that this can't happen, */
  761. /* but I'm paranoid... */
  762. if (codesize[i] > MAX_CLEN)
  763. ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW);
  764. bits[codesize[i]]++;
  765. }
  766. }
  767. /* JPEG doesn't allow symbols with code lengths over 16 bits, so if the pure
  768. * Huffman procedure assigned any such lengths, we must adjust the coding.
  769. * Here is what the JPEG spec says about how this next bit works:
  770. * Since symbols are paired for the longest Huffman code, the symbols are
  771. * removed from this length category two at a time. The prefix for the pair
  772. * (which is one bit shorter) is allocated to one of the pair; then,
  773. * skipping the BITS entry for that prefix length, a code word from the next
  774. * shortest nonzero BITS entry is converted into a prefix for two code words
  775. * one bit longer.
  776. */
  777. for (i = MAX_CLEN; i > 16; i--) {
  778. while (bits[i] > 0) {
  779. j = i - 2; /* find length of new prefix to be used */
  780. while (bits[j] == 0)
  781. j--;
  782. bits[i] -= 2; /* remove two symbols */
  783. bits[i-1]++; /* one goes in this length */
  784. bits[j+1] += 2; /* two new symbols in this length */
  785. bits[j]--; /* symbol of this length is now a prefix */
  786. }
  787. }
  788. /* Remove the count for the pseudo-symbol 256 from the largest codelength */
  789. while (bits[i] == 0) /* find largest codelength still in use */
  790. i--;
  791. bits[i]--;
  792. /* Return final symbol counts (only for lengths 0..16) */
  793. MEMCOPY(htbl->bits, bits, SIZEOF(htbl->bits));
  794. /* Return a list of the symbols sorted by code length */
  795. /* It's not real clear to me why we don't need to consider the codelength
  796. * changes made above, but the JPEG spec seems to think this works.
  797. */
  798. p = 0;
  799. for (i = 1; i <= MAX_CLEN; i++) {
  800. for (j = 0; j <= 255; j++) {
  801. if (codesize[j] == i) {
  802. htbl->huffval[p] = (UINT8) j;
  803. p++;
  804. }
  805. }
  806. }
  807. /* Set sent_table FALSE so updated table will be written to JPEG file. */
  808. htbl->sent_table = FALSE;
  809. }
  810. /*
  811. * Finish up a statistics-gathering pass and create the new Huffman tables.
  812. */
  813. METHODDEF(void)
  814. finish_pass_gather (j_compress_ptr cinfo)
  815. {
  816. huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
  817. int ci, dctbl, actbl;
  818. jpeg_component_info * compptr;
  819. JHUFF_TBL **htblptr;
  820. boolean did_dc[NUM_HUFF_TBLS];
  821. boolean did_ac[NUM_HUFF_TBLS];
  822. /* It's important not to apply jpeg_gen_optimal_table more than once
  823. * per table, because it clobbers the input frequency counts!
  824. */
  825. MEMZERO(did_dc, SIZEOF(did_dc));
  826. MEMZERO(did_ac, SIZEOF(did_ac));
  827. for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
  828. compptr = cinfo->cur_comp_info[ci];
  829. dctbl = compptr->dc_tbl_no;
  830. actbl = compptr->ac_tbl_no;
  831. if (! did_dc[dctbl]) {
  832. htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl];
  833. if (*htblptr == NULL)
  834. *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
  835. jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);
  836. did_dc[dctbl] = TRUE;
  837. }
  838. if (! did_ac[actbl]) {
  839. htblptr = & cinfo->ac_huff_tbl_ptrs[actbl];
  840. if (*htblptr == NULL)
  841. *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
  842. jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);
  843. did_ac[actbl] = TRUE;
  844. }
  845. }
  846. }
  847. #endif /* ENTROPY_OPT_SUPPORTED */
  848. /*
  849. * Module initialization routine for Huffman entropy encoding.
  850. */
  851. GLOBAL(void)
  852. jinit_huff_encoder (j_compress_ptr cinfo)
  853. {
  854. huff_entropy_ptr entropy;
  855. int i;
  856. entropy = (huff_entropy_ptr)
  857. (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
  858. SIZEOF(huff_entropy_encoder));
  859. cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
  860. entropy->pub.start_pass = start_pass_huff;
  861. /* Mark tables unallocated */
  862. for (i = 0; i < NUM_HUFF_TBLS; i++) {
  863. entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
  864. #ifdef ENTROPY_OPT_SUPPORTED
  865. entropy->dc_count_ptrs[i] = entropy->ac_count_ptrs[i] = NULL;
  866. #endif
  867. }
  868. }
  869. //mark buxton's new emit_bits:
  870. static unsigned int onlynbits[] = {
  871. 0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x000000F, 0x0000001F,
  872. 0x0000003F, 0x0000007F, 0x000000FF, 0x000001FF, 0x000003FF, 0x000007FF,
  873. 0x00000FFF, 0x00001FFF, 0x00003FFF, 0x00007FFF, 0x0000FFFF, 0x0001FFFF,
  874. 0x0003FFFF, 0x0007FFFF, 0x000FFFFF, 0x001FFFFF, 0x003FFFFF, 0x007FFFFF,
  875. 0x00FFFFFF, 0x01FFFFFF, 0x03FFFFFF, 0x07FFFFFF, 0x0FFFFFFF, 0x1FFFFFFF,
  876. 0x3FFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF
  877. };
  878. //
  879. // Need to add #ifdef for Alpha port
  880. //
  881. #if defined (_X86_)
/*
 * emit_bits_fast -- append `bsize` bits of `code` to the 64-bit MMX bit
 * accumulator and flush completed bytes to the output buffer.
 *
 *   state  - working state, handed to emit_byte() on the byte-wise path.
 *   code   - bits to emit; they are shifted left by (64 - new put_bits) and
 *            OR-ed into the accumulator, so they land directly below the
 *            bits already pending at the top of the accumulator.
 *   bsize  - number of bits being added.
 *   only1  - non-zero forces the byte-at-a-time path (the 32-bit fast store
 *            is skipped).
 *
 * Returns TRUE on success.  FALSE is returned only from inside emit_byte()
 * (its third argument is `return FALSE`).
 *
 * NOTE(review): put_bits, put_buffer_64, free_in_buffer and next_output_byte
 * are not declared in this function; presumably they are macros or
 * file-scope names defined earlier in the file -- confirm before touching.
 *
 * Register assumptions visible in this file:
 *   - mm5 must hold all ones on entry (encode_one_block_fast sets it with
 *     pcmpeqb mm5,mm5); it is used to detect 0xFF bytes on the fast path.
 *   - No emms is executed here; encode_one_block_fast issues it.
 */
GLOBAL(boolean)
emit_bits_fast (working_state * state, unsigned int code, int bsize, int only1){
// Emit some bits; return TRUE if successful, FALSE if must suspend
// This routine is heavily used, so it's worth coding tightly.
//unsigned int put_buffer = code;
// int put_bits = state->put_bits;
    unsigned c;                     // byte extracted from the top of the accumulator
    __asm{
        mov edx,64
        mov esi,[put_bits]
        add esi,dword ptr[bsize]    // esi = put_bits + bsize (new bit count)
        mov [put_bits],esi
        sub edx,esi                 // edx = 64 - new put_bits = left-shift amount
        mov ebx,[free_in_buffer]    // ebx = bytes free in the output buffer
        movd mm3,edx
        mov edx,[next_output_byte]  // edx = output write pointer
        movd mm0,[code]
        movq mm7,[put_buffer_64]    // mm7 = current 64-bit accumulator
        psllq mm0,mm3 //put_buffer <<= 64-put_bits;
        por mm7,mm0                 // merge the new bits into the accumulator
        cmp [only1],0
        movq mm0,mm7                // (movq does not touch the flags from the cmp above)
        jne got_FF                  // only1 != 0: always use the byte-wise path
        cmp ebx,8
        jng got_FF                  // 8 or fewer bytes free: use the byte-wise path
        cmp esi,32
        jng buffer_not_full         // 32 or fewer bits pending: just store the accumulator
// test [next_output_byte],0x3
// jnz byte_write
//test to see if the data is on a 4-byte boundary. If not, don't use the
//integer write.
//integer_write: //Write 32 bits.
        movq mm1,mm5                // mm1 = all ones (mm5 is preset by the caller)
        psrlq mm0,32                // top 32 bits of the accumulator -> low dword
        pcmpeqb mm1,mm0             // per-byte compare against 0xFF
        sub ebx,4                   // tentatively consume 4 bytes (only stored on success)
        movd eax,mm1
        movq mm2,mm0 // | - | - | - | - | D | C | B | A | MM2
        test eax,eax
        jne got_FF                  // one of the 4 bytes is 0xFF: needs stuffing, go byte-wise
// big-endian data
// The shuffle below reverses the byte order of the low dword so that the
// most significant byte is written first in memory.
        psrlq mm0,8 // | - | - | - | - | - | D | C | B | MM0
        mov [free_in_buffer],ebx
        punpcklbw mm0,mm2 // | - | - | C | D | B | C | A | B | MM0
        add edx,4
        pslld mm0,16 // | C | D | B | C | A | B | - | - | MM0
        sub esi,32                  // 32 bits have been consumed
        psrad mm0,16 //have to use this pair because packssdw expects 16-bit _signed_ data.
        psllq mm7,32                // drop the written bits from the accumulator
        packssdw mm0,mm0 // | - | - | - | - | C | D | A | B | MM0
        mov [put_bits],esi
        movq mm2,mm0 // | - | - | - | - | C | D | A | B | MM2
        movq [put_buffer_64],mm7
        psrlq mm0,16 // | - | - | - | - | - | - | C | D | MM0
        mov [next_output_byte],edx
        punpcklwd mm0,mm2 // | - | - | - | - | A | B | C | D | MM0
        movd [edx-4],mm0            // store the 4 byte-swapped output bytes
        nop
    }
    return TRUE;
got_FF: // byte-wise path: reached when only1 is set, when 8 or fewer bytes are
        // free, or when the next 4 output bytes contain an 0xFF.
    while (put_bits >=8) {
        __asm{
            movq mm4,mm7
            psrlq mm7,56            // top byte of the accumulator -> low byte
            movd [c],mm7
            psllq mm4,8             // shift that byte out of the accumulator
            movq mm7,mm4
            sub [put_bits],8
        }
        emit_byte(state, c, return FALSE);
        //emit_byte_fast(c);
        if (c == 0xFF) emit_byte(state, 0, return FALSE);   // a zero byte follows every 0xFF
        //if (c==0xFF) emit_byte_fast(0);
        //put_bits -= 8;
    }
buffer_not_full:
    // Save the (possibly shifted) accumulator held in mm7.
    __asm movq [put_buffer_64],mm7
    return TRUE;
}
  962. /* Encode a single block's worth of coefficients */
  963. GLOBAL(boolean)
  964. encode_one_block_fast (working_state * state, JCOEFPTR block, int last_dc_val,
  965. c_derived_tbl *dctbl, c_derived_tbl *actbl)
  966. {
  967. JCOEF temp ;
  968. int nbits;
  969. int k, i ,j;
  970. /*unsigned <-Buxton BUG??*/ int l;
  971. int lastzeros = 0, numElements = 0 ;
  972. short dummy_outblock[192+4] ; // 64 values, 64 corresponding zero & bit counts
  973. short *outblock;
  974. outblock= (short *)(((unsigned int)dummy_outblock+7)&0xFFFFFFF8);
  975. // Encode the DC coefficient difference per section F.1.2.1
  976. //starttimer();
  977. temp = block[0] ;
  978. block[0] = (JCOEF) (block[0] - last_dc_val) ;
  979. countZeros((int *)jpeg_natural_order, block, outblock, &lastzeros, &numElements) ;
  980. //formatting for frequently used MMX registers in emit_bits_fast
  981. __asm{
  982. //provide two constants: mm5 = 0xFFFF|FFFF|FFFF|FFFF
  983. // mm6 = 0x0000|FFFF|0000|FFFF
  984. pcmpeqb mm5,mm5
  985. pxor mm0,mm0
  986. movq mm6,mm5
  987. punpcklwd mm6,mm0
  988. }
  989. if(block[0] == 0) {
  990. // Emit the Huffman-coded symbol for the number of bits
  991. //changed to only1 - dshade
  992. if (! emit_bits_fast(state, dctbl->ehufco[0], dctbl->ehufsi[0],1)) return FALSE;
  993. outblock[64] -- ;
  994. k=0;
  995. } else {
  996. nbits = outblock[128] ;
  997. k=1;
  998. //changed to only1 - dshade
  999. if (!emit_bits_fast(state,dctbl->ehufco[nbits]<<nbits | ((unsigned int) outblock[0]&onlynbits[nbits]),nbits + dctbl->ehufsi[nbits],1)) return FALSE;
  1000. }
  1001. block[0] = temp ; // get the original value of block[0] back in.
  1002. if( (numElements == 0) || (numElements == 1) ) {
  1003. lastzeros = 63 ; // DC element handled outside of the AC loop below
  1004. }
  1005. // Encode the AC coefficients per section F.1.2.2
  1006. for (; k < numElements; k++) {
  1007. l = outblock[64+k];//<<4;
  1008. //store frequently used lookups:
  1009. j = outblock[128+k];
  1010. // if run length > 15, must emit special run-length-16 codes (0xF0)
  1011. while (l > 15/*240*/) {
  1012. // changed to only1 - dshade
  1013. if (! emit_bits_fast(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0],1))
  1014. return FALSE;
  1015. l -= 16; //256;
  1016. }
  1017. // if (l < 0) // dshade
  1018. // l = 0; // dshade
  1019. l = l << 4;
  1020. i = l + j;
  1021. //hufandval = actbl->ehufco[i]<<j | ((unsigned int) outblock[k]&onlynbits[j]);
  1022. //hufandvallen = j + actbl->ehufsi[i];
  1023. //changed to only1 - dshade
  1024. if (!emit_bits_fast(state,actbl->ehufco[i]<<j | ((unsigned int) outblock[k]&onlynbits[j]),j + actbl->ehufsi[i],1)) return FALSE;
  1025. }
  1026. // If the last coef(s) were zero, emit an end-of-block code
  1027. if (lastzeros > 0)
  1028. { //changed to only1 - dshade
  1029. if (! emit_bits_fast(state, actbl->ehufco[0], actbl->ehufsi[0],1))
  1030. return FALSE;
  1031. }
  1032. //cumulative_time += stoptimer();
  1033. _asm emms // dshade
  1034. return TRUE;
  1035. }
  1036. //used above as:
  1037. //countZeros((int *)jpeg_natural_order, block, outblock, &lastzeros, &numElements) ;
// NOTE(review): jmpswitch is not referenced anywhere in the visible part of
// this file -- presumably a leftover; confirm before removing.
__int32 jmpswitch;

/*
 * countZeros -- scan 64 coefficients in the order given by dwindexBlock and
 * produce a compact description of the non-zero ones.
 *
 *   dwindexBlock  64 32-bit indexes into dwcoefBlock (the caller passes
 *                 jpeg_natural_order), read 4 per iteration for 16 iterations.
 *   dwcoefBlock   16-bit coefficients, addressed as [dwcoefBlock + 2*index].
 *   dwoutBlock    output area of 16-bit elements, three parallel arrays:
 *                   byte offset   0: the non-zero coefficients, in scan order
 *                   byte offset 128: count of zeros preceding each of them
 *                   byte offset 256: number of bits needed for each of them
 *                 After the bit-count pass each stored coefficient v is left
 *                 unchanged if v > 0 and replaced by v - 1 otherwise.
 *                 The bit-count pass uses 8-byte movq accesses and works in
 *                 groups of four elements, so up to three elements past the
 *                 last valid one are read and written as well.
 *   dwlastZeros   receives the number of zeros after the last non-zero
 *                 coefficient.
 *   dwnumElem     receives the number of non-zero coefficients stored.
 *
 * MMX registers mm0-mm7 are used and no emms is executed here (it is
 * commented out at the end); the caller is responsible for it.
 */
void countZeros(
    int *dwindexBlock,
    short *dwcoefBlock,
    short *dwoutBlock,
    int *dwlastZeros,
    int *dwnumElem
){
    // Packed-word constants for the bit-count pass (four 16-bit lanes each).
    static __int64 const_1 = 0x0001000100010001;
    static __int64 const_2 = 0x0002000200020002;
    static __int64 const_3 = 0x0003000300030003;
    static __int64 const_4 = 0x0004000400040004;
    static __int64 const_8 = 0x0008000800080008;
    static __int64 const_15 = 0x000f000f000f000f;
    static __int64 const_255 = 0x00ff00ff00ff00ff;
    // Remnants of the original stand-alone assembler version follow, kept
    // for reference only:
    //#define sizLOCALS 8
    //_countZeros proc USES eax ebx ecx edx esi edi
    // Move all paramters to be based on esp
    // Must REMEMBER NOT to use the stack for any push/pops
    // the following are the parameters passed into the routine.
    //#define dwindexBlock dword ptr [esp+32+sizLOCALS] // 32 bit elements
    //#define dwcoefBlock dword ptr [esp+36+sizLOCALS] // 16 bit elements
    //#define dwoutBlock dword ptr [esp+40+sizLOCALS] // 16 bit elements
    //#define dwlastZeros dword ptr [esp+44+sizLOCALS] // address of a 32 bit element
    //#define dwnumElem dword ptr [esp+48+sizLOCALS] // number of non-zero elements
    // dwlastZeros stores the number of trailing zero values.
    //;;;;; LOCALS :;;;;;;;;;;;;;;;
    __int32 locdwoutBlock;      // running write pointer into dwoutBlock
    __int32 locdwZeroCount;     // zeros seen since the last non-zero coefficient
    __int32 loopctr;            // remaining groups of four coefficients
    // these are used as scratchpad registers.
    // right now a new local has been added on an as needed
    // basis. There's potential for reducing the number of locals.
    //#define locdwoutBlock dword ptr [esp+0]
    //#define locdwZeroCount dword ptr [esp+4]
    //;;;;; END OF LOCALS ;;;;;;;;;;;;;;;
    __asm{
        //sub esp, sizLOCALS
        mov esi, dwindexBlock ; load the input array pointer
        mov edi, dwcoefBlock
        mov eax, dwoutBlock
        nop //************************;;
        mov locdwoutBlock, eax
        nop //************************;;
        mov dword ptr[loopctr], 10h // loop count of 16 : four elements handled per loop
        mov locdwZeroCount, 0h // initialize zero counter to 0
CountZeroLoop:
        // align the zigzag elements of inblock into MMX register, four words
        // at a time.
        // get index for next four elements in coeff array from the zigzag array
        mov eax, [esi]
        mov ebx, [esi+4]
        mov ecx, [esi+8]
        mov edx, [esi+12]
        // get the next four coeff. words
        // (each load is 32 bits wide; only the low 16 bits are kept below, so
        // two bytes beyond each addressed coefficient are read and discarded)
        mov eax, [edi+2*eax]
        mov ebx, [edi+2*ebx]
        mov ecx, [edi+2*ecx]
        mov edx, [edi+2*edx]
        // pack first two words in eax (first word in LS 16 bits)
        shl ebx, 16
        and eax, 0ffffh
        // pack next two words in ecx (third word in LS 16 bits)
        shl edx, 16
        and ecx, 0ffffh
        or eax, ebx
        or ecx, edx
        mov ebx, eax                // ebx = words 0,1   ecx = words 2,3
        or eax, ecx // check to see if all 4 elems. are zero
        cmp eax, 0h
        jz caseAllZeros
        movd mm0, ebx // move LS two words into mm0
        pxor mm2, mm2 // initialize mm2 to zero
        movd mm1, ecx // move MS two words into mm1
        nop //************************;;
        pcmpeqw mm0, mm2            // 0xFFFF in each lane of words 0,1 that is zero
        pcmpeqw mm1, mm2            // 0xFFFF in each lane of words 2,3 that is zero
        movq mm3,mm0
        por mm0,mm1
        movd eax,mm0
        pcmpeqw mm2,mm2             // mm2 = all ones
        cmp eax,0h
        jz caseNoZeros              // none of the four words is zero
        // Build a 4-bit mask in eax: bit n is set when word n is non-zero.
        movd eax,mm3
        pandn mm1,mm2               // invert: 0xFFFF where words 2,3 are non-zero
        movd edx,mm1
        not eax                     // 0xFFFF where words 0,1 are non-zero
        and eax,0x00020001
        and edx,0x00080004
        or eax,edx
        add esi,16                  // advance the index pointer for the next group
        mov edx,eax
        shr eax,16
        or eax,edx
        mov edx, locdwZeroCount     // edx = zeros pending before this group
        and eax,0xFFFF
        nop
        dec eax                     // mask 1..14 -> table slot 0..13
        // Computed jump: every table slot must occupy exactly 8 bytes, i.e.
        // the assembler must encode each jmp in its 5-byte form followed by
        // three 1-byte nops.  Do not insert or remove anything in the table.
        lea eax,[JmpTable+eax*8]
        jmp eax
JmpTable:
        jmp case0001
        nop
        nop
        nop
        jmp case0010
        nop
        nop
        nop
        jmp case0011
        nop
        nop
        nop
        jmp case0100
        nop
        nop
        nop
        jmp case0101
        nop
        nop
        nop
        jmp case0110
        nop
        nop
        nop
        jmp case0111
        nop
        nop
        nop
        jmp case1000
        nop
        nop
        nop
        jmp case1001
        nop
        nop
        nop
        jmp case1010
        nop
        nop
        nop
        jmp case1011
        nop
        nop
        nop
        jmp case1100
        nop
        nop
        nop
        jmp case1101
        nop
        nop
        nop
        jmp case1110
        nop
        nop
        nop
        // Slot for mask 1111.  That pattern is already diverted by the
        // "jz caseNoZeros" above, so this entry is not expected to be taken.
        jmp caseNoZeros
caseAllZeros:
        // All four words are zero: extend the pending zero run.
        add locdwZeroCount, 4
        add esi, 16
        dec [loopctr] // decrement loop counter
        jnz CountZeroLoop
        jmp AllDone
caseNoZeros:
        // All four words are non-zero: store them all; zero counts are
        // (pending, 0, 0, 0).
        mov eax, locdwoutBlock
        mov edx, locdwZeroCount
        mov locdwZeroCount, 0h
        add esi, 16 // esi points to a 32 bit quantity
        mov [eax], ebx // store the LS two words
        mov [eax+4], ecx // store the MS two words
        add locdwoutBlock, 8
        mov [eax+128], edx          // zero counts for words 0 and 1
        mov dword ptr [eax+132], 0  // zero counts for words 2 and 3
        nop //************************;;
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
        // case0000:
        // this case is taken care of by caseAllZero
        //
        // In the caseXXXX labels below the binary digits are the mask built
        // above, written word3..word0 (a 1 means that word is non-zero).
        // On entry: ebx = words 0,1  ecx = words 2,3  edx = pending zero
        // count, and esi has already been advanced.
case0001:
        // only word 0 non-zero; three zeros trail it
        mov eax, locdwoutBlock
        mov locdwZeroCount, 3
        mov [eax], bx // store the LS word
        mov [eax+128], dx //; store the corresponding zero count
        add locdwoutBlock, 2
        nop //************************;;
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case0010:
        // only word 1 non-zero; one extra zero before it, two after
        mov eax, locdwoutBlock
        mov locdwZeroCount, 2
        shr ebx, 16 // get the MS word into LS 16 bits
        add edx, 1 // increment zero count
        mov [eax+128], dx // store the corresponding zero count
        nop //************************;;
        mov [eax], bx // store the LS word
        add locdwoutBlock, 2
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case0011:
        // words 0 and 1 non-zero; two zeros trail them
        mov eax, locdwoutBlock
        mov locdwZeroCount, 2
        mov [eax], ebx // store words 0 and 1
        mov [eax+128], edx // zero counts: pending for word 0, 0 for word 1
        add locdwoutBlock, 4
        nop //************************;;
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case0100:
        // only word 2 non-zero; two extra zeros before it, one after
        mov eax, locdwoutBlock
        add edx, 2
        mov [eax], cx // store word 2
        mov [eax+128], dx // store the corresponding zero count
        add locdwoutBlock, 2
        mov locdwZeroCount, 1
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case0101:
        // words 0 and 2 non-zero
        mov eax, locdwoutBlock
        or edx, 10000h // zero count is 1 for second stored word
        mov [eax], bx // store word 0
        mov [eax+2], cx // store word 2
        mov [eax+128], edx // store the corresponding zero counts
        nop //************************;;
        add locdwoutBlock, 4
        mov locdwZeroCount, 1
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case0110:
        // words 1 and 2 non-zero
        mov eax, locdwoutBlock
        add edx, 1 // zero count is incremented for first stored word
        shr ebx, 16 // move word 1 into LS 16 bits
        mov [eax], bx // store word 1
        mov [eax+2], cx // store word 2
        add locdwoutBlock, 4
        mov [eax+128], edx // zero counts: pending+1, then 0
        mov locdwZeroCount, 1
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case0111:
        // words 0, 1 and 2 non-zero
        mov eax, locdwoutBlock
        nop
        add locdwoutBlock, 6
        mov locdwZeroCount, 1
        mov [eax], ebx // store words 0 and 1
        mov [eax+4], cx // store word 2
        mov [eax+128], edx // zero counts: pending, then 0
        mov word ptr [eax+132], 0 // zerocount of 0 for third word
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case1000:
        // only word 3 non-zero; three extra zeros before it, none after
        mov eax, locdwoutBlock
        add edx, 3
        shr ecx, 16
        nop //************************;;
        mov [eax], cx // store word 3
        mov [eax+128], dx // store the corresponding zero count
        add locdwoutBlock, 2
        mov locdwZeroCount, 0
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case1001:
        // words 0 and 3 non-zero
        mov eax, locdwoutBlock
        shr ecx, 16 // word 3 into LS bits
        or edx, 00020000h // zero count of two for second stored word
        mov [eax], bx // store word 0
        mov [eax+2], cx // store word 3
        add locdwoutBlock, 4
        mov [eax+128], edx // store the corresponding zero counts
        mov locdwZeroCount, 0
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case1010:
        // words 1 and 3 non-zero
        mov eax, locdwoutBlock
        nop
        add edx, 1 // increment zero count
        shr ecx, 16 // word 3 into LS bits
        shr ebx, 16 // word 1 into LS bits
        or edx, 00010000h // zero count of one for second stored word
        mov [eax], bx // store word 1
        mov [eax+2], cx // store word 3
        mov [eax+128], edx // store the corresponding zero counts
        //add esi, 16 // esi points to a 32 bit quantity
        add locdwoutBlock, 4
        mov locdwZeroCount, 0
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case1011:
        // words 0, 1 and 3 non-zero
        mov eax, locdwoutBlock
        shr ecx, 16 // word 3 into LS bits
        mov [eax], ebx // store words 0 and 1
        mov [eax+4], cx // store word 3
        mov [eax+128], edx // zero counts: pending, then 0
        mov word ptr [eax+132], 1   // one zero (word 2) precedes word 3
        add locdwoutBlock, 6
        mov locdwZeroCount, 0
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case1100:
        // words 2 and 3 non-zero
        mov eax, locdwoutBlock
        add edx, 2 // add 2 to zero count
        mov [eax], ecx // store words 2 and 3
        mov [eax+128], edx // zero counts: pending+2, then 0
        add locdwoutBlock, 4
        mov locdwZeroCount, 0
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case1101:
        // words 0, 2 and 3 non-zero
        mov eax, locdwoutBlock
        nop ////************************////
        mov [eax], bx // store word 0
        mov [eax+128], dx // store the corresponding zero count
        mov [eax+2], ecx            // store words 2 and 3
        mov dword ptr [eax+130], 1 // zero count of 1 for 2nd stored word, 0 for 3rd
        add locdwoutBlock, 6
        mov locdwZeroCount, 0
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
case1110:
        // words 1, 2 and 3 non-zero
        mov eax, locdwoutBlock
        shr ebx, 16 // get word 1 in LS word
        add dx, 1 // add 1 to zerocount
        nop ////************************////
        mov [eax], bx // store word 1
        mov [eax+128], dx // store the corresponding zero count
        mov [eax+2], ecx            // store words 2 and 3
        mov dword ptr [eax+130], 0 // zero count of 0 for 2nd & 3rd word
        add locdwoutBlock, 6
        mov locdwZeroCount, 0
        dec [loopctr]
        jnz CountZeroLoop
        jmp AllDone
        // case1111:
        // this case is handled by caseNoZeros
////////////////////////////////////////////////////////////////////////////////////////////////////////////
AllDone:
        // at this point all zero counting is done
        // now get the number of non-zero elements and round the byte count
        // up to a multiple of 8 (four elements) for the bit-count pass
        mov eax, locdwoutBlock
        sub eax, dwoutBlock // bytes written = 2 * number of non-zero elements
        mov ebx, dwnumElem // address to store number of non-zero elements
        //*** small bug fix by dshade 4/8/97 to eliminate case where loop count went negative below
        //*** all changes have //*** by them
        mov edx, eax //*** make a copy before shifting
        shr eax, 1 // a/c for each element being 2 bytes
        mov [ebx], eax              // *dwnumElem = number of non-zero elements
        //add eax, 4
        add edx, 7 //*** round up to a multiple of 8
        and edx, 0x1f8 //*** edx holds the number of bytes written rounded up to 8
        //and eax, 0fch // get the LS 2 bits to be zero
        mov esi, dwoutBlock
        //// This loop counts the number of bits needed to represent each
        //// stored value. It handles four values in each iteration and always
        //// runs at least once, even when nothing was stored.
        ////
        //// Method: take abs(value) and narrow it in steps -- if it is
        //// > 255 add 8 and shift right 8; if the result is > 15 add 4 and
        //// shift right 4; > 3 add 2 and shift right 2; > 1 add 1 and shift
        //// right 1 -- then add the remaining 0 or 1.
CountBitsLoop:
        movq mm0, [esi] // get the first four input data
        pxor mm7, mm7 // clear mm7
        movq mm1, mm0
        pcmpgtw mm0, mm7 // is input number positive ?
        movq mm3, mm1
        pand mm1, mm0 // original number if greater than 0, else 0
        psubw mm7, mm3 // 0 minus input number
        movq mm2, mm0
        psubw mm3, qword ptr [const_1] // decrement input
        pandn mm0, mm7 // if number <= 0 then (- input), else 0
        por mm0, mm1 // abs(input)
        pandn mm2, mm3 // input minus 1 if input <= 0, else 0
        por mm2, mm1 // same as input, if input positive, else input minus 1.
        nop ////************************////
        movq mm3, mm0
        movq mm1, mm0
        pcmpgtw mm1, qword ptr [const_255] // split the 16bit value across bit 8 (256)
        psrlw mm0, 8 // get MS 8 bits into LS byte
        movq [esi], mm2 // store input (if it's +ve), else store input minus 1
        movq mm2, mm1
        pand mm1, qword ptr [const_8] // value > 255 implies need for 8 bits, else zero
        pand mm0, mm2 // sift the ones greater than 255, else zeros
        pandn mm2, mm3 // sift the ones less than 256, else zeros
        movq mm5, mm0
        por mm0, mm2 // get reqd. portions of data in LS 8 bits
        por mm5, mm2 // copy of above instruction
        pcmpgtw mm5, qword ptr [const_15]   // same step for the > 15 test (adds 4)
        movq mm4, mm0
        movq mm3, mm5
        psrlw mm0, 4
        pand mm5, qword ptr [const_4]
        pand mm0, mm3
        movq mm2, mm0
        pandn mm3, mm4
        por mm0, mm3
        por mm2, mm3
        pcmpgtw mm2, qword ptr [const_3]    // same step for the > 3 test (adds 2)
        movq mm4, mm0
        movq mm3, mm2
        psrlw mm0, 2
        pand mm2, qword ptr [const_2]
        pand mm0, mm3
        movq mm6, mm0
        pandn mm3, mm4
        por mm0, mm3
        por mm6, mm3
        pcmpgtw mm6, qword ptr [const_1]    // same step for the > 1 test (adds 1)
        movq mm4, mm0
        movq mm3, mm6
        psrlw mm0, 1
        pand mm6, qword ptr [const_1]
        pand mm0, mm3
        por mm1, mm5                // combine the 8 and 4 contributions
        pandn mm3, mm4
        por mm6, mm2                // combine the 1 and 2 contributions
        por mm0, mm3                // remaining value: 0 or 1
        por mm1, mm6                // all four contributions (distinct bits, so OR == sum)
        nop ////************************////
        paddw mm0, mm1              // bit count for each of the four values
        nop ////************************////
        movq [esi+256], mm0         // store the bit counts at byte offset 256
        nop ////************************////
        add esi, 8
        nop ////************************////
        //sub eax, 4
        sub edx, 8 //*** decrement byte count
        jg CountBitsLoop //*** changed loop conditions to break out if not positive
        mov eax, locdwZeroCount
        mov ebx, dwlastZeros
        //emms
        //nop ////************************////
        mov [ebx], eax              // *dwlastZeros = zeros after the last non-zero value
        //add esp, sizLOCALS
    }
    return;
}
  1484. #endif // #define (_X86_)