/*
 * optenc.c
 *
 * Encoder for optimal parser
 *
 *
 * Future Improvements:
 *
 * When two estimates are equal, for example, "should I output a
 * character or a match?", there should be some way of deciding
 * which to take.  Right now we force it to output a match, but
 * for text files, outputting a character results in a small
 * savings.  Even when comparing two matches, we might want to
 * force it to take one type of match over another.
 */
#include "encoder.h"
#define copymem(src,dst,size) memcpy(dst,src,size)
static bool redo_first_block(t_encoder_context *context, long *bufpos_ptr);
static void block_end(t_encoder_context *context, long BufPos);
/*
 * Encode a match of length <len> (where <len> >= 2) and position <pos>
 */
#ifdef EXTRALONGMATCHES
#define OUT_MATCH(len,pos) \
{ \
    ULONG enclen = (len); \
    ULONG extlen = 0; \
    if ( enclen > MAX_MATCH ) { \
        extlen = enclen - MAX_MATCH; \
        enclen = MAX_MATCH; \
    } \
    context->enc_ItemType   [context->enc_literals >> 3 ] |= (1 << (context->enc_literals & 7)); \
    context->enc_LitData    [context->enc_literals      ]  = (byte)(enclen-MIN_MATCH); \
    context->enc_ExtraLength[context->enc_literals++    ]  = (ushort)(extlen); \
    context->enc_DistData   [context->enc_distances++   ]  = (pos); \
}
#else
#define OUT_MATCH(len,pos) \
{ \
    context->enc_ItemType[(context->enc_literals >> 3)] |= (1 << (context->enc_literals & 7)); \
    context->enc_LitData [context->enc_literals++]       = (byte)(len-2); \
    context->enc_DistData[context->enc_distances++]      = pos; \
}
#endif
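
/*
 * Note on the item layout (an illustrative summary of the macros above, not
 * additional code): item number n is flagged as a match when bit (n & 7) of
 * byte (n >> 3) in enc_ItemType is set, so item 10 lives in byte 1, bit 2.
 * For a match, enc_LitData[n] holds the length header and enc_DistData[] the
 * position; for a literal, enc_LitData[n] simply holds the character itself.
 */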
/* encode a character */
#define OUT_CHAR(ch) \
    context->enc_LitData [context->enc_literals++] = ch;

#define TREE_CREATE_CHECK() \
    if (context->enc_literals >= context->enc_next_tree_create) \
    { \
        update_tree_estimates(context); \
        context->enc_next_tree_create += TREE_CREATE_INTERVAL; \
    }
/*
 * Returns an estimate of how many bits it would take to output
 * a given character
 */
#define CHAR_EST(c) (numbits_t) (context->enc_main_tree_len[(c)])

/*
 * Returns an estimate of how many bits it would take to output
 * a given match.
 *
 * <ml> is the match length, where ml >= 2
 * <mp> is the match position
 *
 * The result is stored in <result>
 */
#define MATCH_EST(ml,mp,result) \
{ \
    ulong mp_slot; \
    mp_slot = MP_SLOT(mp); \
    if (ml < (NUM_PRIMARY_LENGTHS+2)) \
    { \
        result = (numbits_t) \
            (context->enc_main_tree_len[(NUM_CHARS-2)+(mp_slot<<NL_SHIFT)+ml] + \
             enc_extra_bits[mp_slot]); \
    } \
    else \
    { \
        result = (numbits_t) \
            (context->enc_main_tree_len[(NUM_CHARS+NUM_PRIMARY_LENGTHS)+(mp_slot<<NL_SHIFT)] + \
             context->enc_secondary_tree_len[ml-(NUM_PRIMARY_LENGTHS+2)] + \
             enc_extra_bits[mp_slot]); \
    } \
}
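
/*
 * Worked example of the estimate above (the concrete bit counts are
 * assumptions chosen for illustration, not values taken from the real
 * trees): for a long match whose position falls in a slot with 5 extra
 * position bits, a 9-bit main-tree code and a 4-bit secondary-length code
 * give
 *
 *     result = 9 + 4 + 5 = 18 bits
 *
 * Short matches (ml < NUM_PRIMARY_LENGTHS+2) have no secondary-tree term.
 */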
#ifdef _DEBUG
static void VERIFY_MATCH( t_encoder_context *context, long bufpos, int largest_match_len )
{
    int   i, j;
    ulong match_pos;

    /*
     * Ensure match does not cross boundary
     */
    _ASSERTE( largest_match_len <= (CHUNK_SIZE-1) - (bufpos & (CHUNK_SIZE-1)) );

    for (i = MIN_MATCH; i <= largest_match_len; i++)
    {
        match_pos = context->enc_matchpos_table[i];

        if (match_pos < NUM_REPEATED_OFFSETS)
            match_pos = context->enc_last_matchpos_offset[match_pos];
        else
            match_pos -= (NUM_REPEATED_OFFSETS-1);

        _ASSERTE (match_pos <= context->enc_window_size-4);

        for (j = 0; j < i; j++)
        {
            _ASSERTE ( context->enc_MemWindow[bufpos+j] ==
                       context->enc_MemWindow[bufpos-match_pos+j] );
        }
    }
}
#else
#define VERIFY_MATCH(a,b,c) ;
#endif
void flush_all_pending_blocks(t_encoder_context *context)
{
    /*
     * Force all blocks to be output
     */
    while (context->enc_literals > 0)
        output_block(context);

    /*
     * Flush compressed data out to the caller
     */
    perform_flush_output_callback(context);
}
void encoder_start(t_encoder_context *context)
{
    long BytesRead, RealBufPos;

    /*
     * RealBufPos is our position in the window,
     * and equals [0...window_size + second_partition_size - 1]
     */
    RealBufPos = context->enc_BufPos -
                 (ulong)(context->enc_RealMemWindow - context->enc_MemWindow);

    BytesRead = comp_read_input(context, RealBufPos, CHUNK_SIZE);

    if (BytesRead > 0)
        opt_encode_top(context, BytesRead);
}
static void update_tree_estimates(t_encoder_context *context)
{
    if (context->enc_literals)
    {
        /*
         * Get stats on literals from 0...context->enc_literals
         */
        if (context->enc_need_to_recalc_stats)
        {
            /*
             * Cumulative total was destroyed, so need to
             * recalculate
             */
            get_block_stats(context, 0, 0, context->enc_literals);

            context->enc_need_to_recalc_stats = false;
        }
        else
        {
            /*
             * Add stats from last_literals...context->enc_literals
             * to cumulative total
             */
            update_cumulative_block_stats(
                context,
                context->enc_last_literals,
                context->enc_last_distances,
                context->enc_literals
            );
        }

        create_trees(context, false); /* don't generate codes */

        fix_tree_cost_estimates(context);

        /*
         * For cumulative total
         */
        context->enc_last_literals  = context->enc_literals;
        context->enc_last_distances = context->enc_distances;
    }
}
void opt_encode_top(t_encoder_context *context, long BytesRead)
{
    ulong BufPos;
    ulong RealBufPos;
    ulong BufPosEnd;
    ulong BufPosEndThisChunk;
    ulong MatchPos;
    ulong i;
    ulong end_pos;
    long  EncMatchLength;  /* must be a signed number */
    long  ExMatchOff = -1; /* initialize to prevent compiler warning */

    /*
     * Current position in encoding window
     */
    BufPos = context->enc_BufPos;

    /*
     * Stop encoding when we reach here
     */
    BufPosEnd = context->enc_BufPos + BytesRead;
    /*
     * If this is our first time in here (since a new group), then
     * when we reach this many literals, update our tree cost
     * estimates.
     *
     * Also, output the file size we're using for translation
     * (0 means no translation at all, which will speed things up
     * for the decoder).
     */
    if (context->enc_first_time_this_group)
    {
        context->enc_first_time_this_group = false;

        /*
         * Recreate trees when we reach this many literals
         */
        context->enc_next_tree_create = 10000;

        if (context->enc_file_size_for_translation)
        {
            output_bits(context, 1, 1); /* translation */
            output_bits(context, 16, context->enc_file_size_for_translation >> 16);
            output_bits(context, 16, context->enc_file_size_for_translation & 65535);
        }
        else
        {
            output_bits(context, 1, 0); /* no translation */
        }
    }
    else
    {
        /*
         * If this is our second or later time in here, then add in the
         * strings we removed last time.
         *
         * We have to be careful here, though, because end_pos is
         * equal to our current BufPos - window_size, not
         * BufPos - i - window_size; we don't have that much history
         * around.
         */
        for (i = BREAK_LENGTH; i > 0; i--)
            quick_insert_bsearch_findmatch(
                context,
                BufPos - (long) i,
                BufPos - context->enc_window_size + 4
            );
    }
while (1) {
top_of_main_loop:
        /*
         * While we haven't reached the end of the data
         */
        while (BufPos < BufPosEnd)
        {
            BufPosEndThisChunk = ( BufPos + CHUNK_SIZE ) & ~( CHUNK_SIZE - 1 );

            if ( BufPosEndThisChunk > BufPosEnd )
            {
                BufPosEndThisChunk = BufPosEnd;
            }

            /*
             * Search for matches of all different possible lengths at BufPos
             */
            EncMatchLength = binary_search_findmatch(context, BufPos);
if (EncMatchLength < MIN_MATCH) {
output_literal:
                /*
                 * No match longer than one character exists in the history
                 * window, so output the character at BufPos as a symbol.
                 */
                OUT_CHAR(context->enc_MemWindow[BufPos]);

#ifdef TRACING
                EncTracingLiteral( BufPos, context->enc_MemWindow[BufPos] );
#endif

                BufPos++;

                /*
                 * Check for exceeding the literal buffer
                 */
                if (context->enc_literals >= (MAX_LITERAL_ITEMS-8))
                    block_end(context, BufPos);

                continue;
            }
            /*
             * Found a match.
             *
             * Make sure it cannot exceed the end of the buffer.
             */
            if ( EncMatchLength > (long)( BufPosEndThisChunk - BufPos ))
            {
                EncMatchLength = (long)( BufPosEndThisChunk - BufPos );

                /*
                 * Oops, not enough for even a small match, so we
                 * have to output a literal
                 */
                if (EncMatchLength < MIN_MATCH)
                    goto output_literal;
            }

            VERIFY_MATCH(context, BufPos, EncMatchLength);
            if (EncMatchLength < FAST_DECISION_THRESHOLD)
            {
                /*
                 * A match has been found that is between MIN_MATCH and
                 * FAST_DECISION_THRESHOLD bytes in length.  The following
                 * algorithm is the optimal encoder that will determine the
                 * most efficient order of matches and unmatched characters
                 * over a span area defined by LOOK.
                 *
                 * The code is essentially a shortest path determination
                 * algorithm.  A stream of data can be encoded in a vast number
                 * of different ways depending on the match lengths and offsets
                 * chosen.  The key to good compression ratios is to choose the
                 * least expensive path.
                 */
                ulong span;
                ulong epos, bpos, NextPrevPos, MatchPos;
                decision_node *decision_node_ptr;
                long iterations;
                /*
                 * Points to the end of the area covered by this match; the span
                 * will continually be extended whenever we find more matches
                 * later on.  It will stop being extended when we reach a spot
                 * where there are no matches, which is when we decide which
                 * path to take to output the matches.
                 */
                span = BufPos + EncMatchLength;

                /*
                 * The furthest position into which we will do our lookahead parsing
                 */
                epos = BufPos + LOOK;

                /*
                 * Temporary BufPos variable
                 */
                bpos = BufPos;

                /*
                 * Calculate the path to the next character if we output
                 * an unmatched symbol.
                 */

                /* bits required to get here */
                context->enc_decision_node[1].numbits = CHAR_EST(context->enc_MemWindow[BufPos]);

                /* where we came from */
                context->enc_decision_node[1].path = BufPos;
                /*
                 * For the match found, estimate the cost of encoding the match
                 * for each possible match length, shortest offset combination.
                 *
                 * The cost, path and offset are stored at BufPos + Length.
                 */
                for (i = MIN_MATCH; i <= (ulong)EncMatchLength; i++)
                {
                    /*
                     * Get an estimate of the match cost given match length = i and
                     * match position = context->enc_matchpos_table[i], and store
                     * the result in context->enc_decision_node[i].numbits
                     */
                    MATCH_EST(i, context->enc_matchpos_table[i], context->enc_decision_node[i].numbits);

                    /*
                     * Where we came from
                     */
                    context->enc_decision_node[i].path = BufPos;

                    /*
                     * The match position associated with this path
                     */
                    context->enc_decision_node[i].link = context->enc_matchpos_table[i];
#ifdef TRACING
                    {
                        ULONG TrMatchPos = context->enc_matchpos_table[i];
                        ULONG TrMatchOff;

                        if ( TrMatchPos < NUM_REPEATED_OFFSETS )
                        {
                            TrMatchOff = context->enc_last_matchpos_offset[ TrMatchPos ];
                        }
                        else
                        {
                            TrMatchOff = TrMatchPos - ( NUM_REPEATED_OFFSETS - 1 );
                        }

                        context->enc_decision_node[i].matchoff = TrMatchOff;
                    }
#endif
}
                /*
                 * Set bit counter to zero at the start
                 */
                context->enc_decision_node[0].numbits = 0;

                /*
                 * Initialise relative match position tables
                 *
                 * Really context->enc_repeated_offset_table[BufPos-bpos][x], but here
                 * BufPos == bpos
                 */
                context->enc_decision_node[0].repeated_offset[0] = context->enc_last_matchpos_offset[0];
                context->enc_decision_node[0].repeated_offset[1] = context->enc_last_matchpos_offset[1];
                context->enc_decision_node[0].repeated_offset[2] = context->enc_last_matchpos_offset[2];
decision_node_ptr = &context->enc_decision_node[-(long) bpos];
#define rpt_offset_ptr(where,which_offset) decision_node_ptr[(where)].repeated_offset[(which_offset)]
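
                /*
                 * Note (this follows directly from the assignment above; nothing
                 * new is computed here): because decision_node_ptr is biased by
                 * -bpos, decision_node_ptr[BufPos] refers to the same element as
                 * context->enc_decision_node[BufPos - bpos], so the decision
                 * nodes can be indexed by absolute window position even though
                 * the array itself only needs to cover the lookahead span.
                 */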
                while (1)
                {
                    numbits_t est, cum_numbits;
BufPos++;
                    /*
                     * Set the proper repeated offset locations, depending on the
                     * shortest path to this location, prior to searching for a
                     * match.
                     */

                    /*
                     * If this is a match (i.e. the path skips over more
                     * than one character).
                     */
                    if (decision_node_ptr[BufPos].path != (ulong) (BufPos-1))
                    {
                        ulong LastPos = decision_node_ptr[BufPos].path;

                        /*
                         * decision_node_ptr[BufPos].link is the match position
                         * for this location
                         */
                        if (decision_node_ptr[BufPos].link >= NUM_REPEATED_OFFSETS)
                        {
                            context->enc_last_matchpos_offset[0] = decision_node_ptr[BufPos].link-(NUM_REPEATED_OFFSETS-1);
                            context->enc_last_matchpos_offset[1] = rpt_offset_ptr(LastPos,0);
                            context->enc_last_matchpos_offset[2] = rpt_offset_ptr(LastPos,1);
                        }
                        else if (decision_node_ptr[BufPos].link == 0)
                        {
                            context->enc_last_matchpos_offset[0] = rpt_offset_ptr(LastPos,0);
                            context->enc_last_matchpos_offset[1] = rpt_offset_ptr(LastPos,1);
                            context->enc_last_matchpos_offset[2] = rpt_offset_ptr(LastPos,2);
                        }
                        else if (decision_node_ptr[BufPos].link == 1)
                        {
                            context->enc_last_matchpos_offset[0] = rpt_offset_ptr(LastPos,1);
                            context->enc_last_matchpos_offset[1] = rpt_offset_ptr(LastPos,0);
                            context->enc_last_matchpos_offset[2] = rpt_offset_ptr(LastPos,2);
                        }
                        else /* == 2 */
                        {
                            context->enc_last_matchpos_offset[0] = rpt_offset_ptr(LastPos,2);
                            context->enc_last_matchpos_offset[1] = rpt_offset_ptr(LastPos,1);
                            context->enc_last_matchpos_offset[2] = rpt_offset_ptr(LastPos,0);
                        }
                    }

                    rpt_offset_ptr(BufPos,0) = context->enc_last_matchpos_offset[0];
                    rpt_offset_ptr(BufPos,1) = context->enc_last_matchpos_offset[1];
                    rpt_offset_ptr(BufPos,2) = context->enc_last_matchpos_offset[2];
                    /*
                     * The following is one of the two possible break points from
                     * the inner encoding loop.  This break will exit the loop if
                     * a point is reached that no match can incorporate; i.e. a
                     * character that does not match back to anything is a point
                     * where all possible paths will converge, and the cheapest one
                     * can be chosen.
                     */
                    if (span == BufPos)
                        break;

                    /*
                     * Search for matches at BufPos
                     */
                    EncMatchLength = binary_search_findmatch(context, BufPos);

                    /*
                     * Make sure that the match does not exceed the stop point
                     */
                    if ((ulong) EncMatchLength + BufPos > BufPosEndThisChunk)
                    {
                        EncMatchLength = BufPosEndThisChunk - BufPos;

                        if (EncMatchLength < MIN_MATCH)
                            EncMatchLength = 0;
                    }
VERIFY_MATCH(context, BufPos, EncMatchLength);
                    /*
                     * If the match is very long, or it exceeds epos (either
                     * surpassing the LOOK area or running past the end of the
                     * input buffer), then break the loop and output the path.
                     */
                    if (EncMatchLength > FAST_DECISION_THRESHOLD ||
                        BufPos + (ulong) EncMatchLength >= epos)
                    {
                        MatchPos = context->enc_matchpos_table[EncMatchLength];

#ifdef EXTRALONGMATCHES
                        if ( EncMatchLength == MAX_MATCH )
                        {
                            if ( MatchPos < NUM_REPEATED_OFFSETS )
                            {
                                ExMatchOff = context->enc_last_matchpos_offset[ MatchPos ];
                            }
                            else
                            {
                                ExMatchOff = MatchPos - ( NUM_REPEATED_OFFSETS - 1 );
                            }
                        }
#endif
#ifdef TRACING
                        {
                            ULONG TrMatchOff;

                            if ( MatchPos < NUM_REPEATED_OFFSETS )
                            {
                                TrMatchOff = context->enc_last_matchpos_offset[ MatchPos ];
                            }
                            else
                            {
                                TrMatchOff = MatchPos - ( NUM_REPEATED_OFFSETS - 1 );
                            }

                            decision_node_ptr[BufPos+EncMatchLength].matchoff = TrMatchOff;
                        }
#endif

                        decision_node_ptr[BufPos+EncMatchLength].link = MatchPos;
                        decision_node_ptr[BufPos+EncMatchLength].path = BufPos;

                        /*
                         * Quickly insert data into the search tree without
                         * returning match positions/lengths
                         */
#ifndef INSERT_NEAR_LONG_MATCHES
                        if (MatchPos == 3 && EncMatchLength > 16)
                        {
                            /*
                             * If we found a match one character away and its
                             * length is 16 or more, it's probably a string of
                             * zeroes, so don't insert that into the search
                             * engine, since doing so can slow things down
                             * significantly!
                             */
                            quick_insert_bsearch_findmatch(
                                context,
                                BufPos + 1,
                                BufPos - context->enc_window_size + (1 + 4) /* bp+1 -(ws-4) */
                            );
                        }
                        else
#endif
                        {
                            for (i = 1; i < (ulong) EncMatchLength; i++)
                                quick_insert_bsearch_findmatch(
                                    context,
                                    BufPos + i,
                                    BufPos + i - context->enc_window_size + 4
                                );
                        }
BufPos += EncMatchLength;
                        /*
                         * Update the relative match positions
                         */
                        if (MatchPos >= NUM_REPEATED_OFFSETS)
                        {
                            context->enc_last_matchpos_offset[2] = context->enc_last_matchpos_offset[1];
                            context->enc_last_matchpos_offset[1] = context->enc_last_matchpos_offset[0];
                            context->enc_last_matchpos_offset[0] = MatchPos-(NUM_REPEATED_OFFSETS-1);
                        }
                        else if (MatchPos)
                        {
                            ulong t = context->enc_last_matchpos_offset[0];

                            context->enc_last_matchpos_offset[0]        = context->enc_last_matchpos_offset[MatchPos];
                            context->enc_last_matchpos_offset[MatchPos] = t;
                        }

                        break;
                    }
                    /*
                     * The following code will extend the area spanned by the
                     * set of matches if the current match surpasses the end of
                     * the span.  A match of length two that is far away is not
                     * accepted, since it would normally be encoded as characters,
                     * thus allowing the paths to converge.
                     */
                    if (EncMatchLength > 2 ||
                        (EncMatchLength == 2 && context->enc_matchpos_table[2] < BREAK_MAX_LENGTH_TWO_OFFSET))
                    {
                        if (span < (ulong) (BufPos + EncMatchLength))
                        {
                            long end;
                            long i;

                            end = min(BufPos+EncMatchLength-bpos, LOOK-1);

                            /*
                             * These new positions are undefined for now, since we haven't
                             * gone there yet, so put in the costliest value
                             */
                            for (i = span-bpos+1; i <= end; i++)
                                context->enc_decision_node[i].numbits = (numbits_t) -1;

                            span = BufPos + EncMatchLength;
                        }
                    }

                    /*
                     * The following code will iterate through all combinations
                     * of match lengths for the current match.  It will estimate
                     * the cost of the path from the beginning of LOOK to
                     * BufPos and to every location spanned by the current
                     * match.  If the path through BufPos with the found matches
                     * is estimated to take fewer bits to encode than the
                     * previously recorded path, then the path to that location
                     * is altered.
                     *
                     * The code relies on accurate estimation of the cost of
                     * encoding a character or a match.  Furthermore, it requires
                     * a search engine that will store the smallest match offset
                     * for each possible match length.
                     *
                     * A match of length one is simply treated as an unmatched
                     * character.
                     */
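
                    /*
                     * Sketch of the relaxation performed below (pseudocode for
                     * illustration, not additional encoder logic; cost() stands
                     * for CHAR_EST()/MATCH_EST()):
                     *
                     *     for each step s leaving BufPos (a literal, or a match
                     *     of length l = MIN_MATCH..EncMatchLength):
                     *         if (numbits[BufPos] + cost(s) < numbits[BufPos + len(s)])
                     *         {
                     *             numbits[BufPos + len(s)] = numbits[BufPos] + cost(s);
                     *             path   [BufPos + len(s)] = BufPos;
                     *         }
                     */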
                    /*
                     * Get the estimated number of bits required to encode the
                     * path leading up to BufPos.
                     */
                    cum_numbits = decision_node_ptr[BufPos].numbits;

                    /*
                     * Calculate the estimated cost of outputting the path through
                     * BufPos and outputting the next character as an unmatched byte
                     */
                    est = cum_numbits + CHAR_EST(context->enc_MemWindow[BufPos]);

                    /*
                     * Check if it is more efficient to encode the next character
                     * as an unmatched character rather than the previously found
                     * match.  If so, then update the cheapest path to BufPos + 1.
                     *
                     * What happens if est == numbits[BufPos-bpos+1], i.e. it
                     * works out as well to output a character as to output a
                     * match?  It's a tough call; however, we will push the
                     * encoder to use matches where possible.
                     */
                    if (est < decision_node_ptr[BufPos+1].numbits)
                    {
                        decision_node_ptr[BufPos+1].numbits = est;
                        decision_node_ptr[BufPos+1].path    = BufPos;
                    }

                    /*
                     * Now iterate through the remaining match lengths and
                     * compare the new path to the existing one.  Change the path
                     * if it is found to be more cost effective to go through
                     * BufPos.
                     */
                    for (i = MIN_MATCH; i <= (ulong) EncMatchLength; i++)
                    {
                        MATCH_EST(i, context->enc_matchpos_table[i], est);
                        est += cum_numbits;

                        /*
                         * If est == numbits[BufPos+i] we want to leave things
                         * alone, since this will tend to force the matches
                         * to be smaller in size, which is beneficial for most
                         * data.
                         */
                        if (est < decision_node_ptr[BufPos+i].numbits)
                        {
                            decision_node_ptr[BufPos+i].numbits = est;
                            decision_node_ptr[BufPos+i].path    = BufPos;
                            decision_node_ptr[BufPos+i].link    = context->enc_matchpos_table[i];
#ifdef TRACING
                            {
                                ULONG TrMatchPos = context->enc_matchpos_table[i];
                                ULONG TrMatchOff;

                                if ( TrMatchPos < NUM_REPEATED_OFFSETS )
                                {
                                    TrMatchOff = context->enc_last_matchpos_offset[ TrMatchPos ];
                                }
                                else
                                {
                                    TrMatchOff = TrMatchPos - ( NUM_REPEATED_OFFSETS - 1 );
                                }

                                decision_node_ptr[BufPos+i].matchoff = TrMatchOff;
                            }
#endif
                        }
                    }
                } /* continue to loop through the span of matches */

                /*
                 * Here BufPos == span, i.e. a non-matchable character was found.
                 * The following code will output the path properly.
                 */

                /*
                 * Unfortunately the path is stored in reverse: it tells us how
                 * to get from where we are now back to where it all started.
                 *
                 * Traverse the path back to the original starting position
                 * of the LOOK span, inverting the path pointers so that we can
                 * traverse forwards from the start back up to the current position.
                 */

                /*
                 * Count the number of iterations we do here, so that when we go
                 * forwards we'll do the same number
                 */
                iterations = 0;
NextPrevPos = decision_node_ptr[BufPos].path;
                do
                {
                    ulong PrevPos;

                    PrevPos     = NextPrevPos;
                    NextPrevPos = decision_node_ptr[PrevPos].path;

                    decision_node_ptr[PrevPos].path = BufPos;

                    BufPos = PrevPos;

                    iterations++;
                } while (BufPos != bpos);
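
                /*
                 * Small example of the inversion just performed (positions are
                 * hypothetical): if the cheapest parse was
                 * bpos -> bpos+1 -> bpos+4 -> span, the backward links were
                 * path[span] = bpos+4, path[bpos+4] = bpos+1, path[bpos+1] = bpos;
                 * afterwards path[bpos] = bpos+1, path[bpos+1] = bpos+4 and
                 * path[bpos+4] = span, so the loop below can walk forwards.
                 */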
                if (context->enc_literals  + iterations >= (MAX_LITERAL_ITEMS-8) ||
                    context->enc_distances + iterations >= (MAX_DIST_ITEMS-8))
                {
                    block_end(context, BufPos);
                }

                /*
                 * Traverse from the beginning of the LOOK span to the end of
                 * the span along the stored path, outputting matches and
                 * characters appropriately.
                 */
                do
                {
                    if (decision_node_ptr[BufPos].path > BufPos+1)
                    {
                        /*
                         * Path skips over more than one character; therefore it's a match
                         */
#ifdef EXTRALONGMATCHES
//
// If the match length to output here is MAX_MATCH,
// this must be the last entry in the decision chain,
// and we can extend the match as far as it will go.
//
                        long ExMatchPos    = decision_node_ptr[ decision_node_ptr[BufPos].path ].link;
                        long ExMatchLength = decision_node_ptr[BufPos].path - BufPos;
if ( ExMatchLength == MAX_MATCH ) {
ulong ExBufPtr = BufPos + MAX_MATCH;
#ifdef TRACING
                            ASSERT( ExMatchOff == (long)decision_node_ptr[ decision_node_ptr[BufPos].path ].matchoff );
#endif /* TRACING */

                            while (( ExBufPtr < BufPosEndThisChunk ) &&
                                   ( context->enc_MemWindow[ ExBufPtr ] ==
                                     context->enc_MemWindow[ ExBufPtr - ExMatchOff ] ))
                            {
                                ++ExBufPtr;
                                ++ExMatchLength;
                            }
                        }
OUT_MATCH( ExMatchLength, ExMatchPos );
#ifdef TRACING
                        EncTracingMatch( BufPos,
                                         ExMatchLength,
                                         ExMatchPos,
                                         decision_node_ptr[ decision_node_ptr[BufPos].path ].matchoff );
#endif // TRACING
BufPos += ExMatchLength;
#else /* ! EXTRALONGMATCHES */
                        OUT_MATCH( decision_node_ptr[BufPos].path - BufPos,
                                   decision_node_ptr[ decision_node_ptr[BufPos].path ].link );
#ifdef TRACING
                        EncTracingMatch( BufPos,
                                         decision_node_ptr[BufPos].path - BufPos,
                                         decision_node_ptr[ decision_node_ptr[BufPos].path ].link,
                                         decision_node_ptr[ decision_node_ptr[BufPos].path ].matchoff );
#endif // TRACING
BufPos = decision_node_ptr[BufPos].path;
#endif /* ! EXTRALONGMATCHES */
                    }
                    else
                    {
                        /*
                         * Path goes to the next character; therefore it's a symbol
                         */
                        OUT_CHAR(context->enc_MemWindow[BufPos]);

#ifdef TRACING
                        EncTracingLiteral( BufPos, context->enc_MemWindow[BufPos] );
#endif

                        BufPos++;
                    }
                } while (--iterations != 0);
TREE_CREATE_CHECK();
                /*
                 * If we're filling up, and are close to outputting a block,
                 * and it's the first block, then recompress the first N
                 * literals using our accumulated stats.
                 */
                if (context->enc_first_block &&
                    (context->enc_literals  >= (MAX_LITERAL_ITEMS-512) ||
                     context->enc_distances >= (MAX_DIST_ITEMS-512)))
                {
                    if (redo_first_block(context, &BufPos))
                        goto top_of_main_loop;

                    /*
                     * Unable to redo, so output the block
                     */
                    block_end(context, BufPos);
                }
            }
            else /* EncMatchLength >= FAST_DECISION_THRESHOLD */
            {
                /*
                 * This code reflects a speed optimization that will always take
                 * a match of length >= FAST_DECISION_THRESHOLD characters.
                 */

                /*
                 * The position associated with the match we found
                 */
                MatchPos = context->enc_matchpos_table[EncMatchLength];
#ifdef EXTRALONGMATCHES
if ( EncMatchLength == MAX_MATCH ) {
//
// Extend the match length up to end of input buffer
// or the current position in the history buffer.
//
                    ulong BufPtr = BufPos + MAX_MATCH;
                    long  MatchOff;

                    if ( MatchPos < NUM_REPEATED_OFFSETS )
                    {
                        MatchOff = context->enc_last_matchpos_offset[ MatchPos ];
                    }
                    else
                    {
                        MatchOff = MatchPos - ( NUM_REPEATED_OFFSETS - 1 );
                    }

                    while (( BufPtr < BufPosEndThisChunk ) &&
                           ( context->enc_MemWindow[ BufPtr ] ==
                             context->enc_MemWindow[ BufPtr - MatchOff ] ))
                    {
                        ++BufPtr;
                        ++EncMatchLength;
                    }
                }
#endif
                /*
                 * Quickly insert match substrings into the search tree
                 * (don't look for new matches; just insert the strings)
                 */
#ifndef INSERT_NEAR_LONG_MATCHES
                if (MatchPos == 3 && EncMatchLength > 16)
                {
                    quick_insert_bsearch_findmatch(
                        context,
                        BufPos + 1,
                        BufPos - context->enc_window_size + 5 /* bp+1 -(ws-4) */
                    );
                }
                else
#endif
                {
                    for (i = 1; i < (ulong) EncMatchLength; i++)
                        quick_insert_bsearch_findmatch(
                            context,
                            BufPos + i,
                            BufPos + i - context->enc_window_size + 4
                        );
                }

                /*
                 * Output the match
                 */
                OUT_MATCH(EncMatchLength, MatchPos);
#ifdef TRACING
                {
                    ULONG TrMatchOff;

                    if ( MatchPos < NUM_REPEATED_OFFSETS )
                    {
                        TrMatchOff = context->enc_last_matchpos_offset[ MatchPos ];
                    }
                    else
                    {
                        TrMatchOff = MatchPos - ( NUM_REPEATED_OFFSETS - 1 );
                    }

                    EncTracingMatch( BufPos, EncMatchLength, MatchPos, TrMatchOff );
                }
#endif // TRACING
                /*
                 * Advance our position in the window
                 */
                BufPos += EncMatchLength;

                if (MatchPos >= NUM_REPEATED_OFFSETS)
                {
                    context->enc_last_matchpos_offset[2] = context->enc_last_matchpos_offset[1];
                    context->enc_last_matchpos_offset[1] = context->enc_last_matchpos_offset[0];
                    context->enc_last_matchpos_offset[0] = MatchPos-(NUM_REPEATED_OFFSETS-1);
                }
                else if (MatchPos)
                {
                    ulong t = context->enc_last_matchpos_offset[0];

                    context->enc_last_matchpos_offset[0]        = context->enc_last_matchpos_offset[MatchPos];
                    context->enc_last_matchpos_offset[MatchPos] = t;
                }
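
                /*
                 * Example of the repeated-offset update above (the offsets are
                 * hypothetical): with enc_last_matchpos_offset = {8, 3, 100},
                 * a new real offset of 40 (MatchPos >= NUM_REPEATED_OFFSETS)
                 * gives {40, 8, 3}; MatchPos == 2 swaps slots 0 and 2, giving
                 * {100, 3, 8}; MatchPos == 0 leaves the offsets unchanged.
                 */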
                /*
                 * Check to see if we're close to overflowing our output arrays,
                 * and output a block if this is the case
                 */
                if (context->enc_literals  >= (MAX_LITERAL_ITEMS-8) ||
                    context->enc_distances >= (MAX_DIST_ITEMS-8))
                    block_end(context, BufPos);
} /* EncMatchLength >= FAST_DECISION_THRESHOLD */
} /* end while ... BufPos < BufPosEnd */
        /*
         * Value of BufPos corresponding to the earliest window data
         */
        context->enc_earliest_window_data_remaining = BufPos - context->enc_window_size;

        /*
         * We didn't read a full CHUNK_SIZE, so we know for sure that
         * this was our last block of data.
         */
        if (BytesRead < CHUNK_SIZE)
        {
            /*
             * If we have never output a block, and we haven't
             * recalculated the stats already, then recalculate
             * the stats and recompress.
             */
            if (context->enc_first_block)
            {
                if (redo_first_block(context, &BufPos))
                    goto top_of_main_loop;
            }

            break;
        }
        /*
         * Remove the last BREAK_LENGTH nodes from the binary search tree,
         * since we have been inserting strings which contain undefined
         * data at the end.
         */
        end_pos = BufPos - (context->enc_window_size-4-BREAK_LENGTH);

        for (i = 1; (i <= BREAK_LENGTH); i++)
            binary_search_remove_node(context, BufPos-i, end_pos);

        /*
         * If we're still in the first window_size + second partition size
         * bytes in the file then we don't need to copymem() yet.
         *
         * RealBufPos is the real position in the file.
         */
        RealBufPos = BufPos - (ulong)(context->enc_RealMemWindow - context->enc_MemWindow);

        if (RealBufPos < context->enc_window_size + context->enc_encoder_second_partition_size)
            break;

        /*
         * We're about to trash a whole bunch of history with our copymem(),
         * so we'd better redo the first block now if we are ever going to.
         */
        if (context->enc_first_block)
        {
            if (redo_first_block(context, &BufPos))
                goto top_of_main_loop;
        }
        /*
         * We're about to remove a large number of symbols from the window.
         * Test to see whether, if we were to output a block now, our compressed
         * output size would be larger than our uncompressed data.  If so, then
         * we will output an uncompressed block.
         *
         * The reason we have to do this check here is that the data in the
         * window is about to be destroyed.  We can't simply put this check in
         * the block outputting code, since there is no guarantee that the
         * memory window contents corresponding to everything in that block
         * are still around - all we'd have would be a set of literals and
         * distances, when we need all the uncompressed literals to output
         * an uncompressed block.
         */

        /*
         * What value of BufPos corresponds to the oldest data we have in the
         * buffer?
         *
         * After the memory copy, that will be the current buffer position
         * minus window_size.
         */

        /*
         * The end of the data buffer has been reached; more data needs to be read
         * and the existing data must be shifted into the history window.
         *
         * MSVC 4.x generates code which does REP MOVSD, so there is no need to
         * write this in assembly.
         */
        copymem(
            &context->enc_RealMemWindow[context->enc_encoder_second_partition_size],
            &context->enc_RealMemWindow[0],
            context->enc_window_size
        );

        copymem(
            &context->enc_RealLeft[context->enc_encoder_second_partition_size],
            &context->enc_RealLeft[0],
            sizeof(ulong)*context->enc_window_size
        );

        copymem(
            &context->enc_RealRight[context->enc_encoder_second_partition_size],
            &context->enc_RealRight[0],
            sizeof(ulong)*context->enc_window_size
        );
context->enc_earliest_window_data_remaining = BufPos - context->enc_window_size;
        /*
         * The following bit of code is CRUCIAL yet unorthodox in function,
         * and serves as a speed and syntax optimization; it makes the code
         * easier to understand once grasped.
         *
         * The three main buffers, context->enc_MemWindow, context->enc_Left and
         * context->enc_Right, are referenced by BufPos and SearchPos relative
         * to the current compression window locations.  When the encoder reaches
         * the end of its block of input memory, the data in the input buffer is
         * shifted into the compression history window and the new input
         * stream is loaded.  Typically the BufPos pointer would be reduced
         * to signify the replaced data.  However, this code reduces the
         * base pointers to reflect the shift of data, and leaves the BufPos
         * pointer in its current state.  Therefore, the BufPos pointer is
         * an absolute pointer reflecting the position in the input stream,
         * and NOT the position in the buffer.  The base pointers will point
         * to invalid memory locations with addresses smaller than the
         * actual array base pointers.  However, when the two are added
         * together, as in &context->enc_MemWindow[BufPos], the result points
         * to the correct and valid position in the buffer.
         */
        context->enc_MemWindow -= context->enc_encoder_second_partition_size;
        context->enc_Left      -= context->enc_encoder_second_partition_size;
        context->enc_Right     -= context->enc_encoder_second_partition_size;
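
        /*
         * Illustrative note on the adjustment above: the copymem() calls moved
         * every byte down by second_partition_size positions in the real
         * buffers, and the three base pointers have just been decreased by the
         * same amount, so for an unchanged BufPos the expression
         * context->enc_MemWindow + BufPos still resolves to the same byte of
         * data as it did before the shift.
         */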
        break;
    }

    /*
     * Store BufPos back in the context
     */
    context->enc_BufPos = BufPos;
}
static void block_end(t_encoder_context *context, long BufPos)
{
    context->enc_first_block          = false;
    context->enc_need_to_recalc_stats = true;

    output_block(context);

    if (context->enc_literals < TREE_CREATE_INTERVAL)
    {
        context->enc_next_tree_create = TREE_CREATE_INTERVAL;
    }
    else
    {
        /* recreate right away */
        context->enc_next_tree_create = context->enc_literals + TREE_CREATE_INTERVAL;
    }

    context->enc_bufpos_last_output_block = BufPos;
}
static bool redo_first_block(t_encoder_context *context, long *bufpos_ptr)
{
    long start_at;
    long earliest_can_start_at;
    long pos_in_file;
    long history_needed;
    long history_avail;
    long BufPos;
    long split_at_literal;

    context->enc_first_block = false;

    BufPos = *bufpos_ptr;

    /*
     * For the first context->enc_window_size bytes in the file, we don't
     * need to have window_size bytes of history around.
     *
     * For anything after that, though, we do need to have window_size
     * previous bytes to look into.
     */

    /*
     * How many bytes are we into the file?
     */
    pos_in_file = BufPos - context->enc_window_size;

    /*
     * First let's figure out the total history required from
     * BufPos backwards.  For starters, we need all the bytes
     * we're going to recompress.  We get that by seeing the
     * last time we output a block.
     */
    history_needed = BufPos - context->enc_bufpos_last_output_block;

    /*
     * Plus we will need window_size bytes before that (for matching
     * into), unless we're looking within the first window_size
     * bytes of the file.
     */
    if (context->enc_bufpos_last_output_block - context->enc_window_size < context->enc_window_size)
        history_needed += context->enc_bufpos_last_output_block - context->enc_window_size;
    else
        history_needed += context->enc_window_size;
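
    /*
     * Hypothetical example: with a 32K window, if the last block was output
     * at BufPos 40000 (7232 bytes into the file), we need the bytes being
     * recompressed (BufPos - 40000) plus only those 7232 earlier bytes of
     * history; once the last output point is more than window_size bytes
     * into the file, a full window_size of history is required instead.
     */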
history_avail = (ulong)(&context->enc_MemWindow[BufPos] - &context->enc_RealMemWindow[0]);
    if (history_needed <= history_avail)
    {
        earliest_can_start_at = context->enc_bufpos_last_output_block;
    }
    else
    {
        /*
         * Not enough history available
         */
        return false;
    }

    start_at = earliest_can_start_at;

    split_block(
        context,
        0,
        context->enc_literals,
        context->enc_distances,
        &split_at_literal,
        NULL /* don't need # distances returned */
    );

    get_block_stats(context, 0, 0, split_at_literal);

    create_trees(context, false); /* don't generate codes */
    fix_tree_cost_estimates(context);
#ifdef MULTIPLE_SEARCH_TREES
    /*
     * Now set all the tree root pointers to NULL
     * (don't need to reset the left/right pointers).
     */
    memset(context->enc_tree_root, 0, NUM_SEARCH_TREES * sizeof(ulong));
#else
    context->enc_single_tree_root = 0;
#endif

    /*
     * Clear item array and reset literal and distance
     * counters
     */
    memset(context->enc_ItemType, 0, (MAX_LITERAL_ITEMS/8));

    /*
     * Reset encoder state
     */
    context->enc_last_matchpos_offset[0] = 1;
    context->enc_last_matchpos_offset[1] = 1;
    context->enc_last_matchpos_offset[2] = 1;

    context->enc_repeated_offset_at_literal_zero[0] = 1;
    context->enc_repeated_offset_at_literal_zero[1] = 1;
    context->enc_repeated_offset_at_literal_zero[2] = 1;
context->enc_input_running_total = 0;
    context->enc_literals  = 0;
    context->enc_distances = 0;
context->enc_need_to_recalc_stats = true;
context->enc_next_tree_create = split_at_literal;
*bufpos_ptr = start_at;
    return true;
}