Skip to content

Commit

Permalink
Reduce token count by 1% without changing the ratio
Browse files Browse the repository at this point in the history
  • Loading branch information
marty-emmanuel committed Apr 25, 2019
1 parent b7967c3 commit 9e75a55
Showing 1 changed file with 34 additions and 24 deletions.
58 changes: 34 additions & 24 deletions src/shrink.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
#define LAST_MATCH_OFFSET 4
#define LAST_LITERALS 1

#define MODESWITCH_PENALTY 1

/** One match */
typedef struct _lzsa_match {
unsigned short length;
Expand Down Expand Up @@ -389,24 +391,24 @@ static void lzsa_find_all_matches(lsza_compressor *pCompressor, const int nStart
}

/**
* Get the number of extra bytes required to represent a literals length
* Get the number of extra bits required to represent a literals length
*
* @param nLength literals length
*
* @return number of extra bytes required
* @return number of extra bits required
*/
static inline int lzsa_get_literals_varlen_size(const int nLength) {
if (nLength < LITERALS_RUN_LEN) {
return 0;
}
else {
if (nLength < 256)
return 1;
return 8;
else {
if (nLength < 512)
return 2;
return 16;
else
return 3;
return 24;
}
}
}
Expand Down Expand Up @@ -440,24 +442,24 @@ static inline int lzsa_write_literals_varlen(unsigned char *pOutData, int nOutOf
}

/**
* Get the number of extra bytes required to represent an encoded match length
* Get the number of extra bits required to represent an encoded match length
*
* @param nLength encoded match length (actual match length - MIN_MATCH_SIZE)
*
* @return number of extra bytes required
* @return number of extra bits required
*/
static inline int lzsa_get_match_varlen_size(const int nLength) {
if (nLength < MATCH_RUN_LEN) {
return 0;
}
else {
if ((nLength + MIN_MATCH_SIZE) < 256)
return 1;
return 8;
else {
if ((nLength + MIN_MATCH_SIZE) < 512)
return 2;
return 16;
else
return 3;
return 24;
}
}
}
Expand Down Expand Up @@ -503,27 +505,29 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
int nMinMatchSize = pCompressor->min_match_size;
int i;

cost[nEndOffset - 1] = 1;
cost[nEndOffset - 1] = 8;
nLastLiteralsOffset = nEndOffset;

for (i = nEndOffset - 2; i != (nStartOffset - 1); i--) {
int nBestCost, nBestMatchLen, nBestMatchOffset;

int nLiteralsLen = nLastLiteralsOffset - i;
nBestCost = 1 + cost[i + 1];
nBestCost = 8 + cost[i + 1];
if (nLiteralsLen == LITERALS_RUN_LEN || nLiteralsLen == 256 || nLiteralsLen == 512) {
/* Add to the cost of encoding literals as their number crosses a variable length encoding boundary.
* The cost automatically accumulates down the chain. */
nBestCost++;
nBestCost += 8;
}
if (pCompressor->match[(i + 1) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
nBestCost += MODESWITCH_PENALTY;
nBestMatchLen = 0;
nBestMatchOffset = 0;

lzsa_match *pMatch = pCompressor->match + (i << MATCHES_PER_OFFSET_SHIFT);
int m;

for (m = 0; m < NMATCHES_PER_OFFSET && pMatch[m].length >= nMinMatchSize; m++) {
int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 1 : 2;
int nMatchOffsetSize = (pMatch[m].offset <= 256) ? 8 : 16;

if (pMatch[m].length >= LEAVE_ALONE_MATCH_SIZE) {
int nCurCost;
Expand All @@ -532,8 +536,10 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
if ((i + nMatchLen) > (nEndOffset - LAST_LITERALS))
nMatchLen = nEndOffset - LAST_LITERALS - i;

nCurCost = 1 + nMatchOffsetSize + lzsa_get_match_varlen_size(nMatchLen - MIN_MATCH_SIZE);
nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size(nMatchLen - MIN_MATCH_SIZE);
nCurCost += cost[i + nMatchLen];
if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
nCurCost += MODESWITCH_PENALTY;

if (nBestCost >= nCurCost) {
nBestCost = nCurCost;
Expand All @@ -555,8 +561,10 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
for (k = nMinMatchSize; k < nMatchRunLen; k++) {
int nCurCost;

nCurCost = 1 + nMatchOffsetSize /* no extra match len bytes */;
nCurCost = 8 + nMatchOffsetSize /* no extra match len bytes */;
nCurCost += cost[i + k];
if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
nCurCost += MODESWITCH_PENALTY;

if (nBestCost >= nCurCost) {
nBestCost = nCurCost;
Expand All @@ -568,8 +576,10 @@ static void lzsa_optimize_matches(lsza_compressor *pCompressor, const int nStart
for (; k <= nMatchLen; k++) {
int nCurCost;

nCurCost = 1 + nMatchOffsetSize + lzsa_get_match_varlen_size(k - MIN_MATCH_SIZE);
nCurCost = 8 + nMatchOffsetSize + lzsa_get_match_varlen_size(k - MIN_MATCH_SIZE);
nCurCost += cost[i + k];
if (pCompressor->match[(i + k) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE)
nCurCost += MODESWITCH_PENALTY;

if (nBestCost >= nCurCost) {
nBestCost = nCurCost;
Expand Down Expand Up @@ -614,10 +624,10 @@ static int lzsa_optimize_command_count(lsza_compressor *pCompressor, const int n
if (nMatchLen <= 9 && (i + nMatchLen) < nEndOffset) /* max reducable command size: <token> <FF> <ll> <ll> <offset> <offset> <FF> <mm> <mm> */ {
int nMatchOffset = pMatch->offset;
int nEncodedMatchLen = nMatchLen - MIN_MATCH_SIZE;
int nCommandSize = 1 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + ((nMatchOffset <= 256) ? 1 : 2) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + ((nMatchOffset <= 256) ? 8 : 16) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);

if (pCompressor->match[(i + nMatchLen) << MATCHES_PER_OFFSET_SHIFT].length >= MIN_MATCH_SIZE) {
if (nCommandSize >= (nMatchLen + lzsa_get_literals_varlen_size(nNumLiterals + nMatchLen))) {
if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size(nNumLiterals + nMatchLen))) {
/* This command is a match; the next command is also a match. The next command currently has no literals; replacing this command by literals will
* make the next command eat the cost of encoding the current number of literals, + nMatchLen extra literals. The size of the current match command is
* at least as much as the number of literal bytes + the extra cost of encoding them in the next match command, so we can safely replace the current
Expand All @@ -634,7 +644,7 @@ static int lzsa_optimize_command_count(lsza_compressor *pCompressor, const int n
nNextNumLiterals++;
} while (nCurIndex < nEndOffset && pCompressor->match[nCurIndex << MATCHES_PER_OFFSET_SHIFT].length < MIN_MATCH_SIZE);

if (nCommandSize >= (nMatchLen + lzsa_get_literals_varlen_size(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size(nNextNumLiterals))) {
if (nCommandSize >= ((nMatchLen << 3) + lzsa_get_literals_varlen_size(nNumLiterals + nNextNumLiterals + nMatchLen) - lzsa_get_literals_varlen_size(nNextNumLiterals))) {
/* This command is a match, and is followed by literals, and then another match or the end of the input data. If encoding this match as literals doesn't take
* more room than the match, and doesn't grow the next match command's literals encoding, go ahead and remove the command. */
nReduce = 1;
Expand Down Expand Up @@ -706,9 +716,9 @@ static int lzsa_write_block(lsza_compressor *pCompressor, const unsigned char *p
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN) ? LITERALS_RUN_LEN : nNumLiterals;
int nTokenMatchLen = (nEncodedMatchLen >= MATCH_RUN_LEN) ? MATCH_RUN_LEN : nEncodedMatchLen;
int nTokenLongOffset = (nMatchOffset <= 256) ? 0x00 : 0x80;
int nCommandSize = 1 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + nNumLiterals + (nTokenLongOffset ? 2 : 1) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + (nNumLiterals << 3) + (nTokenLongOffset ? 16 : 8) /* match offset */ + lzsa_get_match_varlen_size(nEncodedMatchLen);

if ((nOutOffset + nCommandSize) > nMaxOutDataSize)
if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
return -1;
if (nMatchOffset < MIN_OFFSET || nMatchOffset > MAX_OFFSET)
return -1;
Expand Down Expand Up @@ -741,9 +751,9 @@ static int lzsa_write_block(lsza_compressor *pCompressor, const unsigned char *p

{
int nTokenLiteralsLen = (nNumLiterals >= LITERALS_RUN_LEN) ? LITERALS_RUN_LEN : nNumLiterals;
int nCommandSize = 1 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + nNumLiterals;
int nCommandSize = 8 /* token */ + lzsa_get_literals_varlen_size(nNumLiterals) + (nNumLiterals << 3);

if ((nOutOffset + nCommandSize) > nMaxOutDataSize)
if ((nOutOffset + (nCommandSize >> 3)) > nMaxOutDataSize)
return -1;

pOutData[nOutOffset++] = (nTokenLiteralsLen << 4) | 0x0f;
Expand Down

0 comments on commit 9e75a55

Please sign in to comment.