#include "stdafx.h"
#pragma hdrstop
/*
 * jchuff.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains Huffman entropy encoding routines.
 *
 * Much of the complexity here has to do with supporting output suspension.
 * If the data destination module demands suspension, we want to be able to
 * back up to the start of the current MCU.  To do this, we copy state
 * variables into local working storage, and update them back to the
 * permanent JPEG objects only upon successful completion of an MCU.
 */

// MMx Optimisation disabled 5/29/97 Gromit Bug 4375 -Tiling error. - ajais.

#pragma warning( disable : 4799 )

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jchuff.h"             /* Declarations shared with jcphuff.c */


/* Expanded entropy encoder object for Huffman encoding.
 *
 * The savable_state subrecord contains fields that change within an MCU,
 * but must not be updated permanently until we complete the MCU.
 *
 * Both a 64-bit and a 32-bit bit-accumulation buffer are carried; the code
 * in this file copies one or the other depending on whether the MMX path
 * is selected.
 */

typedef struct {
  __int64 put_buffer_64;        /* MMX bit-accumulation buffer */
  INT32 put_buffer;             /* current bit-accumulation buffer */
  int put_bits;                 /* # of bits now in it */
  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
} savable_state;

/* This macro is to work around compilers with missing or broken
 * structure assignment.  You'll need to fix this code if you have
 * such a compiler and you change MAX_COMPS_IN_SCAN.
*/ //#ifndef NO_STRUCT_ASSIGN //#define ASSIGN_STATE(dest,src) ((dest) = (src)) //#else // pull out the assignments to put_buffer and put_bits since they are implentation dependent #if MAX_COMPS_IN_SCAN == 4 #define ASSIGN_STATE(dest,src) \ ((dest).last_dc_val[0] = (src).last_dc_val[0], \ (dest).last_dc_val[1] = (src).last_dc_val[1], \ (dest).last_dc_val[2] = (src).last_dc_val[2], \ (dest).last_dc_val[3] = (src).last_dc_val[3]) /*((dest).put_buffer = (src).put_buffer, \ (dest).put_bits = (src).put_bits, */ #endif //#endif typedef struct { struct jpeg_entropy_encoder pub; /* public fields */ savable_state saved; /* Bit buffer & DC state at start of MCU */ /* These fields are NOT loaded into local working state. */ unsigned int restarts_to_go; /* MCUs left in this restart interval */ int next_restart_num; /* next restart number to write (0-7) */ /* Pointers to derived tables (these workspaces have image lifespan) */ c_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS]; c_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS]; #ifdef ENTROPY_OPT_SUPPORTED /* Statistics tables for optimization */ long * dc_count_ptrs[NUM_HUFF_TBLS]; long * ac_count_ptrs[NUM_HUFF_TBLS]; #endif } huff_entropy_encoder; typedef huff_entropy_encoder * huff_entropy_ptr; /* Working state while writing an MCU. * This struct contains all the fields that are needed by subroutines. 
*/ typedef struct { // make the next two variables global for easy access in mmx version // JOCTET * next_output_byte; /* => next byte to write in buffer */ // size_t free_in_buffer; /* # of byte spaces remaining in buffer */ // savable_state cur; /* Current bit buffer & DC state */ // flatten (instantiate) savable state here __int64 put_buffer_64; // mmx bit accumulation buffer INT32 put_buffer; /* current bit-accumulation buffer */ int put_bits; /* # of bits now in it */ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ j_compress_ptr cinfo; /* dump_buffer needs access to this */ } working_state; //global vaiables __int64 put_buffer_64; // INT32 put_buffer; int put_bits; JOCTET * next_output_byte; /* => next byte to write in buffer */ size_t free_in_buffer; /* # of byte spaces remaining in buffer */ boolean mmx_cpu=1; /* Forward declarations */ METHODDEF(boolean) encode_mcu_huff JPP((j_compress_ptr cinfo, JBLOCKROW *MCU_data)); METHODDEF(void) finish_pass_huff JPP((j_compress_ptr cinfo)); #ifdef ENTROPY_OPT_SUPPORTED METHODDEF(boolean) encode_mcu_gather JPP((j_compress_ptr cinfo, JBLOCKROW *MCU_data)); METHODDEF(void) finish_pass_gather JPP((j_compress_ptr cinfo)); #endif void countZeros(int *indexBlock,short *coefBlock,short *outBlock,int *lastZeros,int *numElements); boolean emit_bits_fast (working_state * state, unsigned int code, int bsize, int only1); //extern boolean emit_bits (working_state * state, unsigned int code, int size); boolean encode_one_block_fast (working_state * state, JCOEFPTR block, int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl); //extern boolean encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, // c_derived_tbl *dctbl, c_derived_tbl *actbl); /* * Initialize for a Huffman-compressed scan. * If gather_statistics is TRUE, we do not output anything during the scan, * just count the Huffman symbols used and generate Huffman code tables. 
*/ METHODDEF(void) start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics) { huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; int ci, dctbl, actbl; jpeg_component_info * compptr; if (gather_statistics) { #ifdef ENTROPY_OPT_SUPPORTED entropy->pub.encode_mcu = encode_mcu_gather; entropy->pub.finish_pass = finish_pass_gather; #else ERREXIT(cinfo, JERR_NOT_COMPILED); #endif } else { entropy->pub.encode_mcu = encode_mcu_huff; entropy->pub.finish_pass = finish_pass_huff; } for (ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; dctbl = compptr->dc_tbl_no; actbl = compptr->ac_tbl_no; /* Make sure requested tables are present */ /* (In gather mode, tables need not be allocated yet) */ if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS || (cinfo->dc_huff_tbl_ptrs[dctbl] == NULL && !gather_statistics)) ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl); if (actbl < 0 || actbl >= NUM_HUFF_TBLS || (cinfo->ac_huff_tbl_ptrs[actbl] == NULL && !gather_statistics)) ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl); if (gather_statistics) { #ifdef ENTROPY_OPT_SUPPORTED /* Allocate and zero the statistics tables */ /* Note that jpeg_gen_optimal_table expects 257 entries in each table! 
*/ if (entropy->dc_count_ptrs[dctbl] == NULL) entropy->dc_count_ptrs[dctbl] = (long *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, 257 * SIZEOF(long)); MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * SIZEOF(long)); if (entropy->ac_count_ptrs[actbl] == NULL) entropy->ac_count_ptrs[actbl] = (long *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, 257 * SIZEOF(long)); MEMZERO(entropy->ac_count_ptrs[actbl], 257 * SIZEOF(long)); #endif } else { /* Compute derived values for Huffman tables */ /* We may do this more than once for a table, but it's not expensive */ jpeg_make_c_derived_tbl(cinfo, cinfo->dc_huff_tbl_ptrs[dctbl], & entropy->dc_derived_tbls[dctbl]); jpeg_make_c_derived_tbl(cinfo, cinfo->ac_huff_tbl_ptrs[actbl], & entropy->ac_derived_tbls[actbl]); } /* Initialize DC predictions to 0 */ entropy->saved.last_dc_val[ci] = 0; } /* Initialize bit buffer to empty */ entropy->saved.put_buffer_64 = 0; entropy->saved.put_buffer = 0; entropy->saved.put_bits = 0; /* Initialize restart stuff */ entropy->restarts_to_go = cinfo->restart_interval; entropy->next_restart_num = 0; } /* * Compute the derived values for a Huffman table. * Note this is also used by jcphuff.c. */ GLOBAL(void) jpeg_make_c_derived_tbl (j_compress_ptr cinfo, JHUFF_TBL * htbl, c_derived_tbl ** pdtbl) { c_derived_tbl *dtbl; int p, i, l, lastp, si; char huffsize[257]; unsigned int huffcode[257]; unsigned int code; /* Allocate a workspace if we haven't already done so. */ if (*pdtbl == NULL) *pdtbl = (c_derived_tbl *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(c_derived_tbl)); dtbl = *pdtbl; /* Figure C.1: make table of Huffman code length for each symbol */ /* Note that this is in code-length order. */ p = 0; for (l = 1; l <= 16; l++) { for (i = 1; i <= (int) htbl->bits[l]; i++) huffsize[p++] = (char) l; } huffsize[p] = 0; lastp = p; /* Figure C.2: generate the codes themselves */ /* Note that this is in code-length order. 
*/ code = 0; si = huffsize[0]; p = 0; while (huffsize[p]) { while (((int) huffsize[p]) == si) { huffcode[p++] = code; code++; } code <<= 1; si++; } /* Figure C.3: generate encoding tables */ /* These are code and size indexed by symbol value */ /* Set any codeless symbols to have code length 0; * this allows emit_bits to detect any attempt to emit such symbols. */ MEMZERO(dtbl->ehufsi, SIZEOF(dtbl->ehufsi)); for (p = 0; p < lastp; p++) { dtbl->ehufco[htbl->huffval[p]] = huffcode[p]; dtbl->ehufsi[htbl->huffval[p]] = huffsize[p]; } } /* Outputting bytes to the file */ /* Emit a byte, taking 'action' if must suspend. */ #define emit_byte(state,val,action) \ { *next_output_byte++ = (JOCTET) (val); \ if (--free_in_buffer == 0) \ if (! dump_buffer(state)) \ { action; } } GLOBAL(boolean) dump_buffer (working_state * state) /* Empty the output buffer; return TRUE if successful, FALSE if must suspend */ { struct jpeg_destination_mgr * dest = state->cinfo->dest; if (! (*dest->empty_output_buffer) (state->cinfo)) return FALSE; /* After a successful buffer dump, must reset buffer pointers */ next_output_byte = dest->next_output_byte; free_in_buffer = dest->free_in_buffer; return TRUE; } /* Outputting bits to the file */ /* Only the right 24 bits of put_buffer are used; the valid bits are * left-justified in this part. At most 16 bits can be passed to emit_bits * in one call, and we never retain more than 7 bits in put_buffer * between calls, so 24 bits are sufficient. */ //INLINE LOCAL(boolean) emit_bits (working_state * state, unsigned int code, int size) /* Emit some bits; return TRUE if successful, FALSE if must suspend */ { /* This routine is heavily used, so it's worth coding tightly. 
*/ register INT32 put_buffer = (INT32) code; register int put_bits = state->put_bits; /* if size is 0, caller used an invalid Huffman table entry */ if (size == 0) ERREXIT(state->cinfo, JERR_HUFF_MISSING_CODE); put_buffer &= (((INT32) 1)<put_buffer; /* and merge with old buffer contents */ while (put_bits >= 8) { int c = (int) ((put_buffer >> 16) & 0xFF); emit_byte(state, c, return FALSE); if (c == 0xFF) { /* need to stuff a zero byte? */ emit_byte(state, 0, return FALSE); } put_buffer <<= 8; put_bits -= 8; } state->put_buffer = put_buffer; /* update state variables */ state->put_bits = put_bits; return TRUE; } //This is a routine to dump whatever is in put_buffer out - I salvaged it from another //routine so there is some dead-code as-used. //As flush-bits is not called frequently, there should not be much overhead to this code . . . //MJB // // Need to add #ifdef for Alpha port // #if defined (_X86_) void flush_bit_buffer_64() { // byte-align previous bits if any __asm{ mov ebx,[put_bits] mov eax,[next_output_byte] test ebx,ebx je no_ser_buf_data movq mm0,[put_buffer_64] pxor mm2,mm2 dump_loop: movq mm1,mm0 psrlq mm0,56 movd ecx,mm0 movq mm0,mm1 mov (byte ptr[eax]),cl inc eax cmp ecx,0xFF jne not_ff mov (byte ptr[eax]),0x00 dec [free_in_buffer] inc eax nop not_ff: dec [free_in_buffer] psllq mm0,8 sub ebx,8 jg dump_loop mov [put_bits],0 //mov [eb_ptr],eax movq [put_buffer_64],mm2 //emms no_ser_buf_data: mov [next_output_byte],eax } } #endif // #ifdef (_X86_) LOCAL(boolean) flush_bits (working_state * state) { // // Need to add #ifdef for Alpha port // #if defined (_X86_) if (0)//vfMMXMachine) { //if (! emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */ if (! emit_bits_fast(state, 0x7F, 7, 1)) /* fill any partial byte with ones */ return FALSE; if (put_bits) { flush_bit_buffer_64(); // New stuff to write the last, few bits . . . } } else #endif { if (! 
emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */ return FALSE; } state->put_buffer_64 = 0; /* and reset bit-buffer to empty */ state->put_buffer = 0; /* and reset bit-buffer to empty */ state->put_bits = 0; return TRUE; } /* Encode a single block's worth of coefficients */ LOCAL(boolean) encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl) { register int temp, temp2; register int nbits; register int k, r, i; /* Encode the DC coefficient difference per section F.1.2.1 */ temp = temp2 = block[0] - last_dc_val; if (temp < 0) { temp = -temp; /* temp is abs value of input */ /* For a negative input, want temp2 = bitwise complement of abs(input) */ /* This code assumes we are on a two's complement machine */ temp2--; } /* Find the number of bits needed for the magnitude of the coefficient */ nbits = 0; while (temp) { nbits++; temp >>= 1; } /* Emit the Huffman-coded symbol for the number of bits */ if (! emit_bits(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits])) return FALSE; /* Emit that number of bits of the value, if positive, */ /* or the complement of its magnitude, if negative. */ if (nbits) /* emit_bits rejects calls with size 0 */ if (! emit_bits(state, (unsigned int) temp2, nbits)) return FALSE; /* Encode the AC coefficients per section F.1.2.2 */ r = 0; /* r = run length of zeros */ for (k = 1; k < DCTSIZE2; k++) { if ((temp = block[jpeg_natural_order[k]]) == 0) { r++; } else { /* if run length > 15, must emit special run-length-16 codes (0xF0) */ while (r > 15) { if (! 
emit_bits(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0])) return FALSE; r -= 16; } temp2 = temp; if (temp < 0) { temp = -temp; /* temp is abs value of input */ /* This code assumes we are on a two's complement machine */ temp2--; } /* Find the number of bits needed for the magnitude of the coefficient */ nbits = 1; /* there must be at least one 1 bit */ while ((temp >>= 1)) nbits++; /* Emit Huffman symbol for run length / number of bits */ i = (r << 4) + nbits; if (! emit_bits(state, actbl->ehufco[i], actbl->ehufsi[i])) return FALSE; /* Emit that number of bits of the value, if positive, */ /* or the complement of its magnitude, if negative. */ if (! emit_bits(state, (unsigned int) temp2, nbits)) return FALSE; r = 0; } } /* If the last coef(s) were zero, emit an end-of-block code */ if (r > 0) if (! emit_bits(state, actbl->ehufco[0], actbl->ehufsi[0])) return FALSE; return TRUE; } /* * Emit a restart marker & resynchronize predictions. */ LOCAL(boolean) emit_restart (working_state * state, int restart_num) { int ci; if (! flush_bits(state)) return FALSE; emit_byte(state, 0xFF, return FALSE); emit_byte(state, JPEG_RST0 + restart_num, return FALSE); /* Re-initialize DC predictions to 0 */ for (ci = 0; ci < state->cinfo->comps_in_scan; ci++) state->last_dc_val[ci] = 0; /* The restart counter is not updated until we successfully write the MCU. */ return TRUE; } /* * Encode and output one MCU's worth of Huffman-compressed coefficients. 
*/ METHODDEF(boolean) encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data) { huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; working_state state; int blkn, ci; jpeg_component_info * compptr; /* Load up working state */ next_output_byte = cinfo->dest->next_output_byte; free_in_buffer = cinfo->dest->free_in_buffer; if (0)//vfMMXMachine) { state.put_buffer_64=entropy->saved.put_buffer_64; } else { state.put_buffer=entropy->saved.put_buffer; } state.put_bits=entropy->saved.put_bits; ASSIGN_STATE(state, entropy->saved); state.cinfo = cinfo; /* Emit restart marker if needed */ if (cinfo->restart_interval) { if (entropy->restarts_to_go == 0) if (! emit_restart(&state, entropy->next_restart_num)) return FALSE; } /* Encode the MCU data blocks */ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { ci = cinfo->MCU_membership[blkn]; compptr = cinfo->cur_comp_info[ci]; // // Need to add #ifdef for Alpha port // #if defined (_X86_) if (0)//vfMMXMachine) { if (! encode_one_block_fast(&state, MCU_data[blkn][0], state.last_dc_val[ci], entropy->dc_derived_tbls[compptr->dc_tbl_no], entropy->ac_derived_tbls[compptr->ac_tbl_no])) return FALSE; } else #endif { if (! 
encode_one_block(&state, MCU_data[blkn][0], state.last_dc_val[ci], entropy->dc_derived_tbls[compptr->dc_tbl_no], entropy->ac_derived_tbls[compptr->ac_tbl_no])) return FALSE; } /* Update last_dc_val */ state.last_dc_val[ci] = MCU_data[blkn][0][0]; } /* Completed MCU, so update state */ cinfo->dest->next_output_byte = next_output_byte; cinfo->dest->free_in_buffer = free_in_buffer; if (0)//vfMMXMachine) { entropy->saved.put_buffer_64=state.put_buffer_64; } else { entropy->saved.put_buffer=state.put_buffer; } entropy->saved.put_bits=state.put_bits; ASSIGN_STATE(entropy->saved, state); /* Update restart-interval state too */ if (cinfo->restart_interval) { if (entropy->restarts_to_go == 0) { entropy->restarts_to_go = cinfo->restart_interval; entropy->next_restart_num++; entropy->next_restart_num &= 7; } entropy->restarts_to_go--; } // // Need to add #ifdef for Alpha port // #if defined (_X86_) if (0)//vfMMXMachine) { __asm emms } #endif return TRUE; } /* * Finish up at the end of a Huffman-compressed scan. */ METHODDEF(void) finish_pass_huff (j_compress_ptr cinfo) { huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; working_state state; /* Load up working state ... 
flush_bits needs it */ next_output_byte = cinfo->dest->next_output_byte; free_in_buffer = cinfo->dest->free_in_buffer; if (0)//vfMMXMachine) { state.put_buffer_64=entropy->saved.put_buffer_64; } else { state.put_buffer=entropy->saved.put_buffer; } state.put_bits=entropy->saved.put_bits; ASSIGN_STATE(state, entropy->saved); state.cinfo = cinfo; /* Flush out the last data */ if (!flush_bits(&state)) ERREXIT(cinfo, JERR_CANT_SUSPEND); /* Update state */ cinfo->dest->next_output_byte = next_output_byte; cinfo->dest->free_in_buffer = free_in_buffer; if (0)//vfMMXMachine) { entropy->saved.put_buffer_64=state.put_buffer_64; } else { entropy->saved.put_buffer=state.put_buffer; } entropy->saved.put_bits=state.put_bits; ASSIGN_STATE(entropy->saved, state); // // Need to add #ifdef for Alpha port // #if defined (_X86_) if (0)//vfMMXMachine) { __asm emms } #endif } /* * Huffman coding optimization. * * This actually is optimization, in the sense that we find the best possible * Huffman table(s) for the given data. We first scan the supplied data and * count the number of uses of each symbol that is to be Huffman-coded. * (This process must agree with the code above.) Then we build an * optimal Huffman coding tree for the observed counts. * * The JPEG standard requires Huffman codes to be no more than 16 bits long. * If some symbols have a very small but nonzero probability, the Huffman tree * must be adjusted to meet the code length restriction. We currently use * the adjustment method suggested in the JPEG spec. This method is *not* * optimal; it may not choose the best possible limited-length code. But * since the symbols involved are infrequently used, it's not clear that * going to extra trouble is worthwhile. 
*/ #ifdef ENTROPY_OPT_SUPPORTED /* Process a single block's worth of coefficients */ LOCAL(void) htest_one_block (JCOEFPTR block, int last_dc_val, long dc_counts[], long ac_counts[]) { register int temp; register int nbits; register int k, r; /* Encode the DC coefficient difference per section F.1.2.1 */ temp = block[0] - last_dc_val; if (temp < 0) temp = -temp; /* Find the number of bits needed for the magnitude of the coefficient */ nbits = 0; while (temp) { nbits++; temp >>= 1; } /* Count the Huffman symbol for the number of bits */ dc_counts[nbits]++; /* Encode the AC coefficients per section F.1.2.2 */ r = 0; /* r = run length of zeros */ for (k = 1; k < DCTSIZE2; k++) { if ((temp = block[jpeg_natural_order[k]]) == 0) { r++; } else { /* if run length > 15, must emit special run-length-16 codes (0xF0) */ while (r > 15) { ac_counts[0xF0]++; r -= 16; } /* Find the number of bits needed for the magnitude of the coefficient */ if (temp < 0) temp = -temp; /* Find the number of bits needed for the magnitude of the coefficient */ nbits = 1; /* there must be at least one 1 bit */ while ((temp >>= 1)) nbits++; /* Count Huffman symbol for run length / number of bits */ ac_counts[(r << 4) + nbits]++; r = 0; } } /* If the last coef(s) were zero, emit an end-of-block code */ if (r > 0) ac_counts[0]++; } /* * Trial-encode one MCU's worth of Huffman-compressed coefficients. * No data is actually output, so no suspension return is possible. 
*/ METHODDEF(boolean) encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data) { huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; int blkn, ci; jpeg_component_info * compptr; /* Take care of restart intervals if needed */ if (cinfo->restart_interval) { if (entropy->restarts_to_go == 0) { /* Re-initialize DC predictions to 0 */ for (ci = 0; ci < cinfo->comps_in_scan; ci++) entropy->saved.last_dc_val[ci] = 0; /* Update restart state */ entropy->restarts_to_go = cinfo->restart_interval; } entropy->restarts_to_go--; } for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { ci = cinfo->MCU_membership[blkn]; compptr = cinfo->cur_comp_info[ci]; htest_one_block(MCU_data[blkn][0], entropy->saved.last_dc_val[ci], entropy->dc_count_ptrs[compptr->dc_tbl_no], entropy->ac_count_ptrs[compptr->ac_tbl_no]); entropy->saved.last_dc_val[ci] = MCU_data[blkn][0][0]; } return TRUE; } /* * Generate the optimal coding for the given counts, fill htbl. * Note this is also used by jcphuff.c. */ GLOBAL(void) jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]) { #define MAX_CLEN 32 /* assumed maximum initial code length */ UINT8 bits[MAX_CLEN+1]; /* bits[k] = # of symbols with code length k */ int codesize[257]; /* codesize[k] = code length of symbol k */ int others[257]; /* next symbol in current branch of tree */ int c1, c2; int p, i, j; long v; /* This algorithm is explained in section K.2 of the JPEG standard */ MEMZERO(bits, SIZEOF(bits)); MEMZERO(codesize, SIZEOF(codesize)); for (i = 0; i < 257; i++) others[i] = -1; /* init links to empty */ freq[256] = 1; /* make sure there is a nonzero count */ /* Including the pseudo-symbol 256 in the Huffman procedure guarantees * that no real symbol is given code-value of all ones, because 256 * will be placed in the largest codeword category. 
*/ /* Huffman's basic algorithm to assign optimal code lengths to symbols */ for (;;) { /* Find the smallest nonzero frequency, set c1 = its symbol */ /* In case of ties, take the larger symbol number */ c1 = -1; v = 1000000000L; for (i = 0; i <= 256; i++) { if (freq[i] && freq[i] <= v) { v = freq[i]; c1 = i; } } /* Find the next smallest nonzero frequency, set c2 = its symbol */ /* In case of ties, take the larger symbol number */ c2 = -1; v = 1000000000L; for (i = 0; i <= 256; i++) { if (freq[i] && freq[i] <= v && i != c1) { v = freq[i]; c2 = i; } } /* Done if we've merged everything into one frequency */ if (c2 < 0) break; /* Else merge the two counts/trees */ freq[c1] += freq[c2]; freq[c2] = 0; /* Increment the codesize of everything in c1's tree branch */ codesize[c1]++; while (others[c1] >= 0) { c1 = others[c1]; codesize[c1]++; } others[c1] = c2; /* chain c2 onto c1's tree branch */ /* Increment the codesize of everything in c2's tree branch */ codesize[c2]++; while (others[c2] >= 0) { c2 = others[c2]; codesize[c2]++; } } /* Now count the number of symbols of each code length */ for (i = 0; i <= 256; i++) { if (codesize[i]) { /* The JPEG standard seems to think that this can't happen, */ /* but I'm paranoid... */ if (codesize[i] > MAX_CLEN) ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW); bits[codesize[i]]++; } } /* JPEG doesn't allow symbols with code lengths over 16 bits, so if the pure * Huffman procedure assigned any such lengths, we must adjust the coding. * Here is what the JPEG spec says about how this next bit works: * Since symbols are paired for the longest Huffman code, the symbols are * removed from this length category two at a time. The prefix for the pair * (which is one bit shorter) is allocated to one of the pair; then, * skipping the BITS entry for that prefix length, a code word from the next * shortest nonzero BITS entry is converted into a prefix for two code words * one bit longer. 
*/ for (i = MAX_CLEN; i > 16; i--) { while (bits[i] > 0) { j = i - 2; /* find length of new prefix to be used */ while (bits[j] == 0) j--; bits[i] -= 2; /* remove two symbols */ bits[i-1]++; /* one goes in this length */ bits[j+1] += 2; /* two new symbols in this length */ bits[j]--; /* symbol of this length is now a prefix */ } } /* Remove the count for the pseudo-symbol 256 from the largest codelength */ while (bits[i] == 0) /* find largest codelength still in use */ i--; bits[i]--; /* Return final symbol counts (only for lengths 0..16) */ MEMCOPY(htbl->bits, bits, SIZEOF(htbl->bits)); /* Return a list of the symbols sorted by code length */ /* It's not real clear to me why we don't need to consider the codelength * changes made above, but the JPEG spec seems to think this works. */ p = 0; for (i = 1; i <= MAX_CLEN; i++) { for (j = 0; j <= 255; j++) { if (codesize[j] == i) { htbl->huffval[p] = (UINT8) j; p++; } } } /* Set sent_table FALSE so updated table will be written to JPEG file. */ htbl->sent_table = FALSE; } /* * Finish up a statistics-gathering pass and create the new Huffman tables. */ METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo) { huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; int ci, dctbl, actbl; jpeg_component_info * compptr; JHUFF_TBL **htblptr; boolean did_dc[NUM_HUFF_TBLS]; boolean did_ac[NUM_HUFF_TBLS]; /* It's important not to apply jpeg_gen_optimal_table more than once * per table, because it clobbers the input frequency counts! */ MEMZERO(did_dc, SIZEOF(did_dc)); MEMZERO(did_ac, SIZEOF(did_ac)); for (ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; dctbl = compptr->dc_tbl_no; actbl = compptr->ac_tbl_no; if (! did_dc[dctbl]) { htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl]; if (*htblptr == NULL) *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]); did_dc[dctbl] = TRUE; } if (! 
did_ac[actbl]) { htblptr = & cinfo->ac_huff_tbl_ptrs[actbl]; if (*htblptr == NULL) *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]); did_ac[actbl] = TRUE; } } } #endif /* ENTROPY_OPT_SUPPORTED */ /* * Module initialization routine for Huffman entropy encoding. */ GLOBAL(void) jinit_huff_encoder (j_compress_ptr cinfo) { huff_entropy_ptr entropy; int i; entropy = (huff_entropy_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(huff_entropy_encoder)); cinfo->entropy = (struct jpeg_entropy_encoder *) entropy; entropy->pub.start_pass = start_pass_huff; /* Mark tables unallocated */ for (i = 0; i < NUM_HUFF_TBLS; i++) { entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL; #ifdef ENTROPY_OPT_SUPPORTED entropy->dc_count_ptrs[i] = entropy->ac_count_ptrs[i] = NULL; #endif } } //mark buxton's new emit_bits: static unsigned int onlynbits[] = { 0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x000000F, 0x0000001F, 0x0000003F, 0x0000007F, 0x000000FF, 0x000001FF, 0x000003FF, 0x000007FF, 0x00000FFF, 0x00001FFF, 0x00003FFF, 0x00007FFF, 0x0000FFFF, 0x0001FFFF, 0x0003FFFF, 0x0007FFFF, 0x000FFFFF, 0x001FFFFF, 0x003FFFFF, 0x007FFFFF, 0x00FFFFFF, 0x01FFFFFF, 0x03FFFFFF, 0x07FFFFFF, 0x0FFFFFFF, 0x1FFFFFFF, 0x3FFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF }; // // Need to add #ifdef for Alpha port // #if defined (_X86_) GLOBAL(boolean) emit_bits_fast (working_state * state, unsigned int code, int bsize, int only1){ // Emit some bits; return TRUE if successful, FALSE if must suspend // This routine is heavily used, so it's worth coding tightly. 
//unsigned int put_buffer = code; // int put_bits = state->put_bits; unsigned c; __asm{ mov edx,64 mov esi,[put_bits] add esi,dword ptr[bsize] mov [put_bits],esi sub edx,esi mov ebx,[free_in_buffer] movd mm3,edx mov edx,[next_output_byte] movd mm0,[code] movq mm7,[put_buffer_64] psllq mm0,mm3 //put_buffer <<= 64-put_bits; por mm7,mm0 cmp [only1],0 movq mm0,mm7 jne got_FF cmp ebx,8 jng got_FF cmp esi,32 jng buffer_not_full // test [next_output_byte],0x3 // jnz byte_write //test to see if the data is on a 4-byte boundary. If not, don't use the //integer write. //integer_write: //Write 32 bits. movq mm1,mm5 psrlq mm0,32 pcmpeqb mm1,mm0 sub ebx,4 movd eax,mm1 movq mm2,mm0 // | - | - | - | - | D | C | B | A | MM2 test eax,eax jne got_FF // big-endian data psrlq mm0,8 // | - | - | - | - | - | D | C | B | MM0 mov [free_in_buffer],ebx punpcklbw mm0,mm2 // | - | - | C | D | B | C | A | B | MM0 add edx,4 pslld mm0,16 // | C | D | B | C | A | B | - | - | MM0 sub esi,32 psrad mm0,16 //have to use this pair because packssdw expects 16-byte _signed_ data. psllq mm7,32 packssdw mm0,mm0 // | - | - | - | - | C | D | A | B | MM0 mov [put_bits],esi movq mm2,mm0 // | - | - | - | - | C | D | A | B | MM2 movq [put_buffer_64],mm7 psrlq mm0,16 // | - | - | - | - | - | - | C | D | MM0 mov [next_output_byte],edx punpcklwd mm0,mm2 // | - | - | - | - | C | B | C | D | MM0 movd [edx-4],mm0 nop } return TRUE; got_FF: //only if an FF was returned. 
while (put_bits >=8) { __asm{ movq mm4,mm7 psrlq mm7,56 movd [c],mm7 psllq mm4,8 movq mm7,mm4 sub [put_bits],8 } emit_byte(state, c, return FALSE); //emit_byte_fast(c); if (c == 0xFF) emit_byte(state, 0, return FALSE); //if (c==0xFF) emit_byte_fast(0); //put_bits -= 8; } buffer_not_full: __asm movq [put_buffer_64],mm7 return TRUE; } /* Encode a single block's worth of coefficients */ GLOBAL(boolean) encode_one_block_fast (working_state * state, JCOEFPTR block, int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl) { JCOEF temp ; int nbits; int k, i ,j; /*unsigned <-Buxton BUG??*/ int l; int lastzeros = 0, numElements = 0 ; short dummy_outblock[192+4] ; // 64 values, 64 corresponding zero & bit counts short *outblock; outblock= (short *)(((unsigned int)dummy_outblock+7)&0xFFFFFFF8); // Encode the DC coefficient difference per section F.1.2.1 //starttimer(); temp = block[0] ; block[0] = (JCOEF) (block[0] - last_dc_val) ; countZeros((int *)jpeg_natural_order, block, outblock, &lastzeros, &numElements) ; //formatting for frequently used MMX registers in emit_bits_fast __asm{ //provide two constants: mm5 = 0xFFFF|FFFF|FFFF|FFFF // mm6 = 0x0000|FFFF|0000|FFFF pcmpeqb mm5,mm5 pxor mm0,mm0 movq mm6,mm5 punpcklwd mm6,mm0 } if(block[0] == 0) { // Emit the Huffman-coded symbol for the number of bits //changed to only1 - dshade if (! emit_bits_fast(state, dctbl->ehufco[0], dctbl->ehufsi[0],1)) return FALSE; outblock[64] -- ; k=0; } else { nbits = outblock[128] ; k=1; //changed to only1 - dshade if (!emit_bits_fast(state,dctbl->ehufco[nbits]<ehufsi[nbits],1)) return FALSE; } block[0] = temp ; // get the original value of block[0] back in. 
if( (numElements == 0) || (numElements == 1) ) { lastzeros = 63 ; // DC element handled outside of the AC loop below } // Encode the AC coefficients per section F.1.2.2 for (; k < numElements; k++) { l = outblock[64+k];//<<4; //store frequently used lookups: j = outblock[128+k]; // if run length > 15, must emit special run-length-16 codes (0xF0) while (l > 15/*240*/) { // changed to only1 - dshade if (! emit_bits_fast(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0],1)) return FALSE; l -= 16; //256; } // if (l < 0) // dshade // l = 0; // dshade l = l << 4; i = l + j; //hufandval = actbl->ehufco[i]<ehufsi[i]; //changed to only1 - dshade if (!emit_bits_fast(state,actbl->ehufco[i]<ehufsi[i],1)) return FALSE; } // If the last coef(s) were zero, emit an end-of-block code if (lastzeros > 0) { //changed to only1 - dshade if (! emit_bits_fast(state, actbl->ehufco[0], actbl->ehufsi[0],1)) return FALSE; } //cumulative_time += stoptimer(); _asm emms // dshade return TRUE; } //used above as: //countZeros((int *)jpeg_natural_order, block, outblock, &lastzeros, &numElements) ; __int32 jmpswitch; void countZeros( int *dwindexBlock, short *dwcoefBlock, short *dwoutBlock, int *dwlastZeros, int *dwnumElem ){ static __int64 const_1 = 0x0001000100010001; static __int64 const_2 = 0x0002000200020002; static __int64 const_3 = 0x0003000300030003; static __int64 const_4 = 0x0004000400040004; static __int64 const_8 = 0x0008000800080008; static __int64 const_15 = 0x000f000f000f000f; static __int64 const_255 = 0x00ff00ff00ff00ff; //#define sizLOCALS 8 //_countZeros proc USES eax ebx ecx edx esi edi // Move all paramters to be based on esp // Must REMEMBER NOT to use the stack for any push/pops // the following are the parameters passed into the routine. 
//#define dwindexBlock dword ptr [esp+32+sizLOCALS] // 32 bit elements
//#define dwcoefBlock dword ptr [esp+36+sizLOCALS] // 16 bit elements
//#define dwoutBlock dword ptr [esp+40+sizLOCALS] // 16 bit elements
//#define dwlastZeros dword ptr [esp+44+sizLOCALS] // address of a 32 bit element
//#define dwnumElem dword ptr [esp+48+sizLOCALS] // number of non-zero elements
// dwlastZeros stores the number of trailing zero values.

//;;;;; LOCALS :;;;;;;;;;;;;;;;

__int32 locdwoutBlock;   // output write cursor (address of the next free value slot)
__int32 locdwZeroCount;  // zeros seen since the last non-zero value
__int32 loopctr;         // groups of four still to process

// these are used as scratchpad registers.
// right now a new local has been added on an as needed
// basis. There's potential for reducing the number of locals.
//#define locdwoutBlock dword ptr [esp+0]
//#define locdwZeroCount dword ptr [esp+4]
//;;;;; END OF LOCALS ;;;;;;;;;;;;;;;

__asm{

    //sub esp, sizLOCALS
    mov esi, dwindexBlock ; load the input array pointer
    mov edi, dwcoefBlock
    mov eax, dwoutBlock
    nop //************************;;
    mov locdwoutBlock, eax
    nop //************************;;
    mov dword ptr[loopctr], 10h // loop count of 16 : four elements handled per loop
    mov locdwZeroCount, 0h // initialize zero counter to 0

    // ---- Pass 1: compact the non-zero words and record the zero runs ----
    // Each iteration handles four words, called word 0..word 3 below in the
    // order their indices are read from [esi].
CountZeroLoop:

    // align the zigzag elements of inblock into MMX register, four words
    // at a time.

    // get index for next four elements in coeff array from the zigzag array
    mov eax, [esi]
    mov ebx, [esi+4]
    mov ecx, [esi+8]
    mov edx, [esi+12]

    // get the next four coeff. words
    // (32-bit loads; only the low 16 bits of each survive the packing below)
    mov eax, [edi+2*eax]
    mov ebx, [edi+2*ebx]
    mov ecx, [edi+2*ecx]
    mov edx, [edi+2*edx]

    // pack first two words in eax (first word in LS 16 bits)
    shl ebx, 16
    and eax, 0ffffh

    // pack next two words in ecx (third word in LS 16 bits)
    shl edx, 16
    and ecx, 0ffffh

    or eax, ebx
    or ecx, edx

    // From here on: ebx = word1:word0, ecx = word3:word2.
    mov ebx, eax
    or eax, ecx // check to see if all 4 elems. are zero
    cmp eax, 0h
    jz caseAllZeros

    movd mm0, ebx // move LS two words into mm0
    pxor mm2, mm2 // initialize mm2 to zero
    movd mm1, ecx // move MS two words into mm1
    nop //************************;;

    // Per-word masks: 0xFFFF where the word is zero.
    pcmpeqw mm0, mm2
    pcmpeqw mm1, mm2

    movq mm3,mm0
    por mm0,mm1

    // The low dword of the OR-ed masks is zero only when none of the four
    // words is zero.
    movd eax,mm0
    pcmpeqw mm2,mm2 // mm2 = all ones
    cmp eax,0h
    jz caseNoZeros

    // Build a 4-bit pattern in eax: bit n set means word n is non-zero.
    movd eax,mm3
    pandn mm1,mm2 // invert: 0xFFFF where word 2 / word 3 is non-zero
    movd edx,mm1
    not eax // 0xFFFF where word 0 / word 1 is non-zero
    and eax,0x00020001
    and edx,0x00080004
    or eax,edx
    add esi,16 // advance to the next four indices
    mov edx,eax
    shr eax,16
    or eax,edx // fold the high-word bits down next to the low-word bits
    mov edx, locdwZeroCount // every case below expects the pending zero count in edx
    and eax,0xFFFF
    nop
    // Computed jump: pattern 1..15 selects table entry 0..14.
    // NOTE(review): the address arithmetic assumes every table entry
    // (jmp + three nops) occupies exactly 8 bytes, which only holds if each
    // jmp is encoded in 5 bytes -- confirm the assembler output.
    dec eax
    lea eax,[JmpTable+eax*8]
    jmp eax

    // Case labels read as a 4-digit binary pattern: the leftmost digit is
    // word 3, the rightmost digit is word 0; a 1 means that word is non-zero.
JmpTable:
    jmp case0001
    nop
    nop
    nop
    jmp case0010
    nop
    nop
    nop
    jmp case0011
    nop
    nop
    nop
    jmp case0100
    nop
    nop
    nop
    jmp case0101
    nop
    nop
    nop
    jmp case0110
    nop
    nop
    nop
    jmp case0111
    nop
    nop
    nop
    jmp case1000
    nop
    nop
    nop
    jmp case1001
    nop
    nop
    nop
    jmp case1010
    nop
    nop
    nop
    jmp case1011
    nop
    nop
    nop
    jmp case1100
    nop
    nop
    nop
    jmp case1101
    nop
    nop
    nop
    jmp case1110
    nop
    nop
    nop
    jmp caseNoZeros

    // All four words zero: nothing is written, the pending run grows by 4.
caseAllZeros:
    add locdwZeroCount, 4
    add esi, 16
    dec [loopctr] // decrement loop counter
    jnz CountZeroLoop
    jmp AllDone

    // All four words non-zero: store them all; the pending zero count goes
    // with word 0, the other three get a zero count of 0.
caseNoZeros:
    mov eax, locdwoutBlock
    mov edx, locdwZeroCount
    mov locdwZeroCount, 0h
    add esi, 16 // esi points to a 32 bit quantity
    mov [eax], ebx // store the LS two words
    mov [eax+4], ecx // store the MS two words
    add locdwoutBlock, 8
    mov [eax+128], edx
    mov dword ptr [eax+132], 0
    nop //************************;;
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // case0000:
    // this case is taken care of by caseAllZero

    // Only word 0 non-zero; words 1-3 start a new run of 3 zeros.
case0001:
    mov eax, locdwoutBlock
    mov locdwZeroCount, 3
    mov [eax], bx // store the LS word
    mov [eax+128], dx //; store the corresponding zero count
    add locdwoutBlock, 2
    nop //************************;;
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Only word 1 non-zero; word 0 adds one zero in front, words 2-3 follow.
case0010:
    mov eax, locdwoutBlock
    mov locdwZeroCount, 2
    shr ebx, 16 // get the MS word into LS 16 bits
    add edx, 1 // increment zero count
    mov [eax+128], dx // store the corresponding zero count
    nop //************************;;
    mov [eax], bx // store the LS word
    add locdwoutBlock, 2
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Words 0 and 1 non-zero (stored together as one dword); words 2-3 are zero.
case0011:
    mov eax, locdwoutBlock
    mov locdwZeroCount, 2
    mov [eax], ebx // store the LS word
    mov [eax+128], edx // store the corresponding zero count
    add locdwoutBlock, 4
    nop //************************;;
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Only word 2 non-zero; two zeros in front of it, one (word 3) after.
case0100:
    mov eax, locdwoutBlock
    add edx, 2
    mov [eax], cx // store the LS word within MS DWORD
    mov [eax+128], dx // store the corresponding zero count
    add locdwoutBlock, 2
    mov locdwZeroCount, 1
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Words 0 and 2 non-zero; the high half of edx carries the count (1) for word 2.
case0101:
    mov eax, locdwoutBlock
    or edx, 10000h // zero count is 1 for second word
    mov [eax], bx // store the LS word within LS DWORD
    mov [eax+2], cx // store the LS word within MS DWORD
    mov [eax+128], edx // store the corresponding zero count
    nop //************************;;
    add locdwoutBlock, 4
    mov locdwZeroCount, 1
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Words 1 and 2 non-zero; word 0 adds one zero in front, word 3 trails.
case0110:
    mov eax, locdwoutBlock
    add edx, 1 // zero count is incremented for first word
    shr ebx, 16 // move the word to be written into LS 16 bits
    mov [eax], bx // store the LS word within LS DWORD
    mov [eax+2], cx // store the LS word within MS DWORD
    add locdwoutBlock, 4
    mov [eax+128], edx // store the corresponding zero count
    mov locdwZeroCount, 1
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Words 0, 1 and 2 non-zero; word 3 is a trailing zero.
case0111:
    mov eax, locdwoutBlock
    nop
    add locdwoutBlock, 6
    mov locdwZeroCount, 1
    mov [eax], ebx // store the LS word within LS DWORD
    mov [eax+4], cx // store the LS word within MS DWORD
    mov [eax+128], edx // store the corresponding zero count
    mov word ptr [eax+132], 0 // zerocount of 0 for third word
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Only word 3 non-zero; three zeros in front of it, none pending after.
case1000:
    mov eax, locdwoutBlock
    add edx, 3
    shr ecx, 16
    nop //************************;;
    mov [eax], cx // store the LS word within MS DWORD
    mov [eax+128], dx // store the corresponding zero count
    add locdwoutBlock, 2
    mov locdwZeroCount, 0
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Words 0 and 3 non-zero; two zeros (words 1-2) precede word 3.
case1001:
    mov eax, locdwoutBlock
    shr ecx, 16 // word 3 into LS bits
    or edx, 00020000h //// zero count of two for MS word
    mov [eax], bx // store the LS word within MS DWORD
    mov [eax+2], cx // store the LS word within MS DWORD
    add locdwoutBlock, 4
    mov [eax+128], edx // store the corresponding zero count
    mov locdwZeroCount, 0
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Words 1 and 3 non-zero; word 0 adds one zero in front of word 1, and
    // the constant OR-ed into the high half of edx gives word 3 a count of 1.
case1010:
    mov eax, locdwoutBlock
    nop
    add edx, 1 // increment zero count
    shr ecx, 16 // word 3 into LS bits
    shr ebx, 16 // word 2 into LS bits
    or edx, 00010000h // zero count of two for MS word
    mov [eax], bx // store the LS word within MS DWORD
    mov [eax+2], cx // store the LS word within MS DWORD
    mov [eax+128], edx // store the corresponding zero count
    //add esi, 16 // esi points to a 32 bit quantity
    add locdwoutBlock, 4
    mov locdwZeroCount, 0
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Words 0, 1 and 3 non-zero; one zero (word 2) precedes word 3.
case1011:
    mov eax, locdwoutBlock
    shr ecx, 16 // word 3 into LS bits
    mov [eax], ebx // store the LS DWORD
    mov [eax+4], cx // store the LS word within MS DWORD
    mov [eax+128], edx // store the corresponding zero count
    mov word ptr [eax+132], 1
    add locdwoutBlock, 6
    mov locdwZeroCount, 0
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Words 2 and 3 non-zero (stored together); two zeros precede word 2.
case1100:
    mov eax, locdwoutBlock
    add edx, 2 // add 2 to zeroc count
    mov [eax], ecx // store the LS DWORD
    mov [eax+128], edx // store the corresponding zero count
    add locdwoutBlock, 4
    mov locdwZeroCount, 0
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Words 0, 2 and 3 non-zero; one zero (word 1) precedes word 2.
case1101:
    mov eax, locdwoutBlock
    nop ////************************////
    mov [eax], bx // store the LS DWORD
    mov [eax+128], dx // store the corresponding zero count
    mov [eax+2], ecx
    mov dword ptr [eax+130], 1 // zero count of 1 for 2nd word
    add locdwoutBlock, 6
    mov locdwZeroCount, 0
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // Words 1, 2 and 3 non-zero; word 0 adds one zero in front of word 1.
case1110:
    mov eax, locdwoutBlock
    shr ebx, 16 // get word 1 in LS word
    add dx, 1 // add 1 to zerocount
    nop ////************************////
    mov [eax], bx // store the LS DWORD
    mov [eax+128], dx // store the corresponding zero count
    mov [eax+2], ecx
    mov dword ptr [eax+130], 0 // zero count of 0 for 2nd & 3rd word
    add locdwoutBlock, 6
    mov locdwZeroCount, 0
    dec [loopctr]
    jnz CountZeroLoop
    jmp AllDone

    // case1111:
    // this case is handled by caseNoZeros

    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
AllDone:
    // at this point all zero counting is done
    // now get the number of non-zero elements and round to the
    // nearest multiple of four in the +infinity direction

    mov eax, locdwoutBlock
    sub eax, dwoutBlock // how many non-zero elements did you write
    mov ebx, dwnumElem // address to store number of non-zero elements
    //*** small bug fix by dshade 4/8/97 to eliminate case where loop count went negative below
    //*** all changes have //*** by them
    mov edx, eax //*** make a copy before shifting
    shr eax, 1 // a/c for each element being 2 bytes
    //// round the number of elements to nearest multiple of four
    mov [ebx], eax // *dwnumElem = number of values written
    //add eax, 4
    add edx, 7 //*** add get an even multiple of 8
    and edx, 0x1f8 //*** edx holds the number of writes rounded up to 8
    //and eax, 0fch // get the LS 2 bits to be zero
    mov esi, dwoutBlock

    // ---- Pass 2: bit counts ----
    // edx is the byte count of values rounded up to a multiple of 8; the
    // loop body always runs at least once, even when edx is 0.

    //// This loop should count the number of bits needed to represent
    //// the input value. It handles four inputs in each iteration
CountBitsLoop:
    movq mm0, [esi] // get the first four input data
    pxor mm7, mm7 // clear mm7
    movq mm1, mm0
    pcmpgtw mm0, mm7 // is input number positive ?
    movq mm3, mm1
    pand mm1, mm0 // original number if greater than 0, else 0
    psubw mm7, mm3 // 0 minus input number
    movq mm2, mm0
    psubw mm3, qword ptr [const_1] // decrement input
    pandn mm0, mm7 // if number < 0 then (- input), else 0
    por mm0, mm1 // abs(input)
    pandn mm2, mm3 // input minus 1 if input < 0, else 0
    por mm2, mm1 // same as input, if input positive, else input minus 1.
    nop ////************************////
    // Successive halving on abs(input): at each step a word larger than the
    // threshold contributes 8, 4, 2 or 1 to its count and is shifted right
    // by that many bits; what is left of the word is added in at the end.
    movq mm3, mm0
    movq mm1, mm0
    pcmpgtw mm1, qword ptr [const_255] // split the 16bit value across bit 8 (256)
    psrlw mm0, 8 // get MS 8 bits into LS byte
    movq [esi], mm2 // store input (if it's +ve), else store 1s comp. of input
    movq mm2, mm1
    pand mm1, qword ptr [const_8] // value > 255 implies need for 8 bits, else zero
    pand mm0, mm2 // sift the ones greater than 255, else zeros
    pandn mm2, mm3 // sift the ones less than 256, else zeros
    movq mm5, mm0
    por mm0, mm2 // get reqd. portions of data in LS 8 bits
    por mm5, mm2 // copy of above instruction
    // same step with threshold 15 / shift 4 / contribution 4 (kept in mm5)
    pcmpgtw mm5, qword ptr [const_15]
    movq mm4, mm0
    movq mm3, mm5
    psrlw mm0, 4
    pand mm5, qword ptr [const_4]
    pand mm0, mm3
    movq mm2, mm0
    pandn mm3, mm4
    por mm0, mm3
    por mm2, mm3
    // threshold 3 / shift 2 / contribution 2 (kept in mm2)
    pcmpgtw mm2, qword ptr [const_3]
    movq mm4, mm0
    movq mm3, mm2
    psrlw mm0, 2
    pand mm2, qword ptr [const_2]
    pand mm0, mm3
    movq mm6, mm0
    pandn mm3, mm4
    por mm0, mm3
    por mm6, mm3
    // threshold 1 / shift 1 / contribution 1 (kept in mm6)
    pcmpgtw mm6, qword ptr [const_1]
    movq mm4, mm0
    movq mm3, mm6
    psrlw mm0, 1
    pand mm6, qword ptr [const_1]
    pand mm0, mm3
    // combine the four contributions (mm1, mm5, mm2, mm6) and add the remainder
    por mm1, mm5
    pandn mm3, mm4
    por mm6, mm2
    por mm0, mm3
    por mm1, mm6
    nop ////************************////
    paddw mm0, mm1
    nop ////************************////
    movq [esi+256], mm0 // bit counts live 256 bytes above the values
    nop ////************************////
    add esi, 8
    nop ////************************////
    //sub eax, 4
    sub edx, 8 //*** decrement byte count
    jg CountBitsLoop //*** changed loop conditions to break out if not positive

    // Report the zero count still pending after the last group.
    mov eax, locdwZeroCount
    mov ebx, dwlastZeros
    //emms
    //nop ////************************////
    mov [ebx], eax
    //add esp, sizLOCALS
}
return;
}

#endif // #define (_X86_)