summaryrefslogtreecommitdiff
path: root/src/support/huffman.c
diff options
context:
space:
mode:
authorKeith Bostic <keith.bostic@wiredtiger.com>2011-05-19 14:57:16 -0400
committerKeith Bostic <keith.bostic@wiredtiger.com>2011-05-19 14:57:16 -0400
commitb3e45e60b2db88b323409e8d05b4e89f928283b3 (patch)
treec6da7eb62af4382156e070e5d05570b0e095d59f /src/support/huffman.c
parent8aaf428e6e72eff48718296e4c6d255afff2927c (diff)
downloadmongo-b3e45e60b2db88b323409e8d05b4e89f928283b3.tar.gz
Integrate Tom's new code drop into the tree.
Diffstat (limited to 'src/support/huffman.c')
-rw-r--r--src/support/huffman.c456
1 files changed, 263 insertions, 193 deletions
diff --git a/src/support/huffman.c b/src/support/huffman.c
index c55fe8981eb..a7a5079c610 100644
--- a/src/support/huffman.c
+++ b/src/support/huffman.c
@@ -5,29 +5,22 @@
* All rights reserved.
*/
-#include "XXX.h"
-
-#define DETAIL
-/* "The assert() macro may be removed at compile time with the cc(1) option -DNDEBUG." */
-#include <assert.h>
+#include "wt_internal.h"
+#define __HUFFMAN_DETAIL 0 /* Set to 1 for debugging output. */
/* Length of header in compressed message, in bits. */
-#define WT_HUFFMAN_HEADER 3
-
-
-/* Escape symbol. */
-static uint16_t escape;
-
+#define WT_HUFFMAN_HEADER 3
/*
- * Maximum allowed length of Huffman code words, which otherwise can range up to (#symbols - 1) bits long.
- * Lower value to use less memory for tables, higher value for better compression.
- * Max value = 16 (or 32-7=25 or 64-7=57 if adjust data types). FYI, JPEG uses 16.
- * A side effect of limiting max code length is that the worst case compression (a message of the least frequent symbols) is shorter.
+ * Maximum allowed length of Huffman code words, which otherwise can range up
+ * to (#symbols - 1) bits long. Lower value to use less memory for tables,
+ * higher value for better compression. Max value = 16 (or 32-7=25 or 64-7=57
+ * if adjust data types). FYI, JPEG uses 16. A side effect of limiting max
+ * code length is that the worst case compression (a message of the least
+ * frequent symbols) is shorter.
*/
-#define MAX_CODE_LENGTH 16
-
+#define MAX_CODE_LENGTH 16
typedef struct __wt_freqtree_node {
/*
@@ -42,7 +35,9 @@ typedef struct __wt_freqtree_node {
} WT_FREQTREE_NODE;
typedef struct __wt_huffman_code {
- uint16_t pattern; /* requirement: length of field's type in bits >= max_code_length */
+ uint16_t pattern; /* requirement: length of field's type
+ * in bits >= max_code_length.
+ */
uint8_t length;
} WT_HUFFMAN_CODE;
@@ -58,16 +53,24 @@ typedef struct __wt_huffman_obj {
/* Tree in static array reprentation */
uint16_t max_depth, min_depth;
+ uint16_t escape; /* Escape symbol. */
+
/*
- * use: codes[symbol] = struct with pattern and length. Used in encoding and decoding.
+ * use: codes[symbol] = struct with pattern and length.
+ * Used in encoding and decoding.
* memory: codes[0-to-(number of symbols - 1)]
*/
WT_HUFFMAN_CODE *codes;
+
/*
- * use: code2symbol[Huffman_code] = symbol. Used in decoding.
+ * use: code2symbol[Huffman_code] = symbol.
+ * Used in decoding.
* memory: code2symbol[1 << max_code_length]
*/
- uint16_t *code2symbol; /* If <=256 symbols use uint8_t, if <=64K symbols use uint16_t, else uint32_t. */
+ uint16_t *code2symbol; /* If <= 256 symbols use uint8_t, if
+ * <= 64K symbols use uint16_t, else
+ * uint32_t.
+ */
} WT_HUFFMAN_OBJ;
/*
@@ -99,13 +102,18 @@ typedef struct node_queue {
typedef struct __indexed_byte {
uint8_t frequency;
uint16_t symbol;
-} INDEXED_BYTE;
+} INDEXED_SYMBOL;
static int indexed_byte_comparator(const void *, const void *);
+static void make_table(
+ SESSION *, uint16_t *, uint16_t, WT_HUFFMAN_CODE *, u_int);
static void node_queue_close(SESSION *, NODE_QUEUE *);
static void node_queue_dequeue(SESSION *, NODE_QUEUE *, WT_FREQTREE_NODE **);
static int node_queue_enqueue(SESSION *, NODE_QUEUE *, WT_FREQTREE_NODE *);
+static uint32_t profile_tree(
+ WT_FREQTREE_NODE *, uint16_t, uint16_t *, uint16_t *);
static void recursive_free_node(SESSION *, WT_FREQTREE_NODE *);
+static void set_codes(WT_FREQTREE_NODE *, WT_HUFFMAN_CODE *, uint16_t, uint8_t);
#define node_queue_is_empty(queue) \
(((queue) == NULL || (queue)->first == NULL) ? 1 : 0)
@@ -117,57 +125,52 @@ static void recursive_free_node(SESSION *, WT_FREQTREE_NODE *);
static int
indexed_byte_comparator(const void *elem1, const void *elem2)
{
- return (((INDEXED_BYTE *)
- elem1)->frequency) - (((INDEXED_BYTE *)elem2)->frequency);
+ return (((INDEXED_SYMBOL *)
+ elem1)->frequency) - (((INDEXED_SYMBOL *)elem2)->frequency);
}
-
-#if 1
-/* Keith: take this out when you hook up your buffer */
-static int buflen = 0;
-static char* buf = NULL;
-
-/* return a buffer of at least min_len bytes */
-char* getbuf(int min_len) {
- if (buflen < min_len) {
- if (buf!=NULL) free(buf);
- buf = malloc(min_len + 10);
- }
- return buf;
-}
-#endif
-
-
/*
* profile_tree --
- * Traverses tree to determine #leaves under each node, max depth, min depth of leaf.
+ * Traverses tree to determine #leaves under each node, max depth, min
+ * depth of leaf.
*/
static uint32_t
-profile_tree(WT_FREQTREE_NODE *node, uint16_t len, uint16_t *max_depth, uint16_t *min_depth)
+profile_tree(WT_FREQTREE_NODE *node,
+ uint16_t len, uint16_t *max_depth, uint16_t *min_depth)
{
uint32_t leaf_cnt;
+
if (node->left == NULL && node->right == NULL) { /* leaf */
leaf_cnt = 1;
- if (*max_depth < len) *max_depth = len;
- if (*min_depth > len) *min_depth = len;
- } else { /* internal node -- way tree constructed internal always has left and right children */
- leaf_cnt = profile_tree(node->left, len+1, max_depth, min_depth) + profile_tree(node->right, len+1, max_depth, min_depth);
+ if (*max_depth < len)
+ *max_depth = len;
+ if (*min_depth > len)
+ *min_depth = len;
+ } else {
+ /*
+ * internal node -- way tree constructed internal always has
+ * left and right children
+ */
+ leaf_cnt =
+ profile_tree(node->left, len + 1, max_depth, min_depth) +
+ profile_tree(node->right, len + 1, max_depth, min_depth);
}
- node->weight = leaf_cnt; /* abuse weight field */
- return leaf_cnt;
+ node->weight = leaf_cnt; /* abuse weight field */
+ return (leaf_cnt);
}
-
/*
* set_codes --
* Computes Huffman code for each symbol in tree.
- * Method is standard way in the literature, except that limits maximum code length.
- * A known max code length is important for limiting memory use by the tables
- * and for knowing how large data types need to be such as the field that holds the code pattern.
+ *
+ * Method is standard way in the literature, except that limits maximum code
+ * length. A known max code length is important for limiting memory use by
+ * the tables and for knowing how large data types need to be such as the field
+ * that holds the code pattern.
*/
static void
-set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes,
- uint16_t pattern, uint8_t len)
+set_codes(WT_FREQTREE_NODE *node,
+ WT_HUFFMAN_CODE *codes, uint16_t pattern, uint8_t len)
{
WT_HUFFMAN_CODE *code;
uint16_t patternleft, patternright, half;
@@ -177,14 +180,25 @@ set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes,
code = &codes[node->symbol];
code->pattern = pattern;
code->length = len;
- /*printf("%d: code %x, len %d\n", node->symbol, code, len);*/
+ /*
+ * printf("%d: code %x, len %d\n", node->symbol, code, len);
+ */
} else {
- /* Check each subtree individually to see if can afford here to split up bits into possibly shorter codes,
- or if need to employ all remaining bits up to MAX_CODE_LENGTH to consecutively number leaves. */
+ /*
+ * Check each subtree individually to see if can afford here
+ * to split up bits into possibly shorter codes, or if need
+ * to employ all remaining bits up to MAX_CODE_LENGTH to
+ * consecutively number leaves.
+ */
remaining = MAX_CODE_LENGTH - len;
- if (len < MAX_CODE_LENGTH /* not alraedy in "low-bit mode"... */
- && ((half = 1 << (remaining - 1)) < node->left->weight || half < node->right->weight)) { /* ...but need to be */
- pattern = pattern << remaining; /* open up lower-order bits for consecutive numbering */
+ /*
+ * If not already in "low-bit mode", but need to be, open up
+ * lower-order bits for consecutive numbering.
+ */
+ if (len < MAX_CODE_LENGTH &&
+ ((half = 1 << (remaining - 1)) < node->left->weight ||
+ half < node->right->weight)) {
+ pattern = pattern << remaining;
len = MAX_CODE_LENGTH;
}
@@ -192,10 +206,10 @@ set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes,
patternleft = (pattern << 1) | 0;
patternright = (pattern << 1) | 1;
len++;
- } else { /* "low bit mode" */
+ } else { /* "low bit mode" */
patternleft = pattern;
patternright = pattern + node->left->weight;
- /* len unchanged */
+ /* len unchanged */
}
set_codes(node->left, codes, patternleft, len);
@@ -203,16 +217,15 @@ set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes,
}
}
-
/*
* make_table --
- * Computes Huffman table used for subsequent lookups in encoding and decoding.
- * With the table, encoding from a symbol to Huffman code
- * and decoding from a code to a symbol are simple array lookups.
+ * Computes Huffman table used for subsequent lookups in encoding and
+ * decoding. With the table, encoding from a symbol to Huffman code and
+ * decoding from a code to a symbol are simple array lookups.
*/
static void
-make_table(uint16_t *code2symbol,
- uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int nbytes)
+make_table(SESSION *session, uint16_t *code2symbol,
+ uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int symcnt)
{
uint16_t c, c1, c2, i, j;
uint8_t len, shift;
@@ -221,7 +234,7 @@ make_table(uint16_t *code2symbol,
* Here's the magic: flood all bit patterns for lower-order bits to
* point to same symbol.
*/
- for (i = 0; i < nbytes; i++) {
+ for (i = 0; i < symcnt; i++) {
if ((len = codes[i].length) == 0)
continue;
@@ -237,13 +250,12 @@ make_table(uint16_t *code2symbol,
c1 = c << shift;
c2 = (c + 1) << shift;
for (j = c1; j < c2; j++) {
- assert (code2symbol[j] == 0);
+ WT_ASSERT(session, code2symbol[j] == 0);
code2symbol[j] = i;
}
}
}
-
/*
* recursive_free_node --
* Recursively free the huffman frequency tree's nodes.
@@ -267,15 +279,14 @@ recursive_free_node(SESSION *session, WT_FREQTREE_NODE *node)
* lowest rank is 1 (0 means the byte never appears in the input), so 1 byte
* is needed to hold the rank and the input table must be 1 byte x 256 values.
*
- * For UTF-16 (nbytes == 2) the range is 0 - 65535 and the max rank is 65535.
+ * For UTF-16 (symcnt == 2) the range is 0 - 65535 and the max rank is 65535.
* The table should be 2 bytes x 65536 values.
*/
int
__wt_huffman_open(SESSION *session,
- uint8_t const *byte_frequency_array, u_int nbytes, void *retp)
+ uint8_t const *byte_frequency_array, u_int symcnt, void *retp)
{
- INDEXED_BYTE *indexed_freqs;
- INDEXED_BYTE esc;
+ INDEXED_SYMBOL esc, *indexed_freqs;
NODE_QUEUE *combined_nodes, *leaves;
WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
WT_HUFFMAN_OBJ *huffman;
@@ -289,42 +300,51 @@ __wt_huffman_open(SESSION *session,
ret = 0;
WT_RET(__wt_calloc_def(session, 1, &huffman));
- WT_ERR(__wt_calloc_def(session, (size_t)nbytes, &indexed_freqs));
+ WT_ERR(__wt_calloc_def(session, (size_t)symcnt, &indexed_freqs));
huffman->session = session;
/*
* The frequency array must be sorted to be able to use linear time
* construction algorithm.
*/
- for (i = 0; i < nbytes; i++) {
+ for (i = 0; i < symcnt; i++) {
indexed_freqs[i].frequency = byte_frequency_array[i];
indexed_freqs[i].symbol = i;
}
/*
- * The escape symbol dynamically seizes the lowest-frequency symbol, which may have 0 frequency.
- * We use the escape code only for cases that would otherwise be errors, which are hopefully rare.
- * We choose the lowest-frequency to have minimal impact (close to no impact) on compression ratios,
- * but if there are lots of errors we pay the longest codeword length in addition to the raw bits of the symbol.
+ * The escape symbol dynamically seizes the lowest-frequency symbol,
+ * which may have 0 frequency. We use the escape code only for cases
+ * that would otherwise be errors, which are hopefully rare. Choose
+ * the lowest-frequency to have minimal impact (close to no impact) on
+ * compression ratios, but if there are lots of errors we pay the
+ * longest codeword length in addition to the raw bits of the symbol.
*/
esc = indexed_freqs[0];
- for (i = 0+1; i < nbytes; i++)
- if (indexed_freqs[i].frequency < esc.frequency) esc = indexed_freqs[i];
- escape = esc.symbol;
-#ifdef DETAIL
- printf("taking over symbol %d (freq %d) as escape\n", escape, esc.frequency);
+ for (i = 0+1; i < symcnt; i++)
+ if (indexed_freqs[i].frequency < esc.frequency)
+ esc = indexed_freqs[i];
+ huffman->escape = esc.symbol;
+#if __HUFFMAN_DETAIL
+ printf("taking over symbol %d (freq %d) as escape\n",
+ huffman->escape, esc.frequency);
#endif
- if (esc.frequency == 0) esc.frequency++; /* zero frequency left out of codes, so make sure this doesn't happen to the escape symbol */
+ /*
+ * Zero frequency left out of codes, so make sure this doesn't happen
+ * to the escape symbol.
+ */
+ if (esc.frequency == 0)
+ esc.frequency++;
qsort(indexed_freqs,
- nbytes, sizeof(INDEXED_BYTE), indexed_byte_comparator);
+ symcnt, sizeof(INDEXED_SYMBOL), indexed_byte_comparator);
/* We need two node queues to build the tree. */
WT_ERR(__wt_calloc_def(session, 1, &leaves));
WT_ERR(__wt_calloc_def(session, 1, &combined_nodes));
/* Adding the leaves to the queue */
- for (i = 0; i < nbytes; ++i) {
+ for (i = 0; i < symcnt; ++i) {
/*
* We are leaving out symbols with a frequency of 0. This
* assumes these symbols will NEVER occur in the source stream,
@@ -394,35 +414,52 @@ __wt_huffman_open(SESSION *session,
/*
* The remaining node is in the node variable, this is the root of the
- * tree. Calculate the number of bytes it takes to hold nbytes bits.
+ * tree. Calculate how many bytes it takes to hold symcnt bytes bits.
*/
- huffman->numSymbols = nbytes;
- huffman->numBytes = nbytes > 256 ? 2 : 1;
+ huffman->numSymbols = symcnt;
+ huffman->numBytes = symcnt > 256 ? 2 : 1;
huffman->max_depth = 0; huffman->min_depth = MAX_CODE_LENGTH;
- profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth);
- if (huffman->max_depth > MAX_CODE_LENGTH) huffman->max_depth = MAX_CODE_LENGTH;
+ (void)profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth);
+ if (huffman->max_depth > MAX_CODE_LENGTH)
+ huffman->max_depth = MAX_CODE_LENGTH;
WT_ERR(__wt_calloc_def(session, huffman->numSymbols, &huffman->codes));
set_codes(node, huffman->codes, 0, 0);
- WT_ERR(__wt_calloc_def(session, 1U << huffman->max_depth, &huffman->code2symbol));
- make_table(huffman->code2symbol, huffman->max_depth, huffman->codes, huffman->numSymbols);
- /*huffman->code2symbol = make_table(huffman->codes, nbytes, huffman->max_depth); => have to have access to "err:" label */
-#ifdef DETAIL
- printf("leaf depth %d..%d, memory use: codes %d# * %u size + code2symbol %d# * %u size\n",
- huffman->min_depth, huffman->max_depth,
- nbytes, sizeof(WT_HUFFMAN_CODE),
- (1U << huffman->max_depth), sizeof(uint16_t));
-
- /* measure quality of computed Huffman codes, for different max bit lengths (say, 16 vs 24 vs 32) */
- uint32_t weighted_length = 0;
- for (i = 0; i<nbytes; i++) {
- uint16_t sym = indexed_freqs[i].symbol;
- weighted_length += indexed_freqs[i].frequency * huffman->codes[sym].length;
- /*printf("\t%d->%d. %d * %d\n", i,sym, indexed_freqs[i].frequency, huffman->codes[sym].length);*/
+ WT_ERR(__wt_calloc_def(
+ session, 1U << huffman->max_depth, &huffman->code2symbol));
+ make_table(session, huffman->code2symbol,
+ huffman->max_depth, huffman->codes, huffman->numSymbols);
+
+#if __HUFFMAN_DETAIL
+ {
+ uint32_t weighted_length;
+ uint16_t sym;
+
+ printf("leaf depth %lu..%lu, memory use: "
+ "codes %u# * %u size + code2symbol %lu# * %u size\n",
+ (u_long)huffman->min_depth, (u_long)huffman->max_depth,
+ symcnt, sizeof(WT_HUFFMAN_CODE),
+ (u_long)(1U << huffman->max_depth), sizeof(uint16_t));
+
+ /*
+ * measure quality of computed Huffman codes, for different max bit
+ * lengths (say, 16 vs 24 vs 32)
+ */
+ weighted_length = 0;
+ for (i = 0; i < symcnt; i++) {
+ sym = indexed_freqs[i].symbol;
+ weighted_length +=
+ indexed_freqs[i].frequency * huffman->codes[sym].length;
+ /*
+ * printf("\t%d->%d. %d * %d\n", i, sym,
+ * indexed_freqs[i].frequency, huffman->codes[sym].length);
+ */
+ }
+ printf("weighted length of all codes (the smaller the better): %lu\n",
+ (u_long)weighted_length);
}
- printf("weighted length of all codes (the smaller the better): %d\n", weighted_length);
#endif
*(void **)retp = huffman;
@@ -460,7 +497,7 @@ __wt_huffman_close(SESSION *session, void *huffman_arg)
__wt_free(session, huffman);
}
-#ifdef HAVE_DIAGNOSTIC
+#if __HUFFMAN_DETAIL
/*
* __wt_print_huffman_code --
* Prints a symbol's Huffman code.
@@ -468,9 +505,9 @@ __wt_huffman_close(SESSION *session, void *huffman_arg)
int
__wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
{
- WT_HUFFMAN_OBJ *huffman;
WT_HUFFMAN_CODE code;
- u_int i;
+ WT_HUFFMAN_OBJ *huffman;
+
huffman = huffman_arg;
if (symbol >= huffman->numSymbols) {
@@ -478,7 +515,8 @@ __wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
} else {
code = huffman->codes[symbol];
if (code.length == 0)
- printf("symbol %d not defined -- 0 frequency\n", symbol);
+ printf(
+ "symbol %d not defined -- 0 frequency\n", symbol);
else
/* should print code as binary */
printf("%d -> code pattern %x, length %d\n",
@@ -495,8 +533,8 @@ __wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
*
* Translation from symbol to Huffman code is a simple array lookup.
*
- * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per symbol.
- * Then, given a symbol:
+ * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per
+ * symbol. Then, given a symbol:
* pattern = codes[symbol].pattern;
* length = codes[symbol].length;
*
@@ -507,19 +545,22 @@ __wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
* and write header bits.
*/
int
-__wt_huffman_encode(void *huffman_arg,
- const uint8_t *from, uint32_t from_len, WT_BUF *to_buf)
+__wt_huffman_encode(
+ void *huffman_arg, const uint8_t *from, uint32_t from_len, WT_BUF *to_buf)
{
+ WT_BUF *tmp;
SESSION *session;
- WT_HUFFMAN_OBJ *huffman;
WT_HUFFMAN_CODE code;
+ WT_HUFFMAN_OBJ *huffman;
uint32_t max_len, outlen, bitpos, bytes;
uint16_t symbol;
- uint8_t len, esc, padding_info, *out, *buf;
+ uint8_t len, esc, padding_info, *out;
+ int ret;
+
/*
* Shift register to accumulate bits from input.
- * Should be >= (MAX_CODE_LENGTH + 7),
- * but also efficient to shift bits and preferably in a machine register.
+ * Should be >= (MAX_CODE_LENGTH + 7), but also efficient to shift bits
+ * and preferably in a machine register.
*/
uint32_t bits;
@@ -528,6 +569,8 @@ __wt_huffman_encode(void *huffman_arg,
huffman = huffman_arg;
session = huffman->session;
+ tmp = NULL;
+ ret = 0;
/*
* We don't want to find all of our callers and ensure they don't pass
@@ -539,16 +582,20 @@ __wt_huffman_encode(void *huffman_arg,
}
/*
- * Compute the largest compressed output size,
- * which is if all symbols are least frequent and so have largest Huffman codes,
- * and compressed output may be larger than the input size.
- * This way we don't have to worry about resizing the buffer during compression.
- * Use the shared system buffer while compressing,
- * then allocate a new buffer of exactly the right size and copy the result into it.
+ * Compute the largest compressed output size, which is if all symbols
+ * are least frequent and so have largest Huffman codes, and compressed
+ * output may be larger than the input size. This way we don't have to
+ * worry about resizing the buffer during compression. Use the shared
+ * system buffer while compressing, then allocate a new buffer of the
+ * right size and copy the result into it.
*/
- max_len = (WT_HUFFMAN_HEADER + from_len * huffman->max_depth + 7/*round up to full byte*/) / 8;
- max_len += from_len * huffman->numBytes; /* if all symbols used escape code (ugh!) */
- buf = (uint8_t *)getbuf(max_len);
+ max_len = (WT_HUFFMAN_HEADER +
+ from_len * huffman->max_depth + 7 /* round up to full byte */) / 8;
+ /*
+ * if all symbols used escape code (ugh!)
+ */
+ max_len += from_len * huffman->numBytes;
+ WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
/*
* Leave the first 3 bits of the encoded value empty, it holds the
@@ -557,31 +604,36 @@ __wt_huffman_encode(void *huffman_arg,
bits = 0;
bitpos = WT_HUFFMAN_HEADER;
valid = WT_HUFFMAN_HEADER;
- out = buf;
+ out = tmp->mem;
esc = 0;
for (bytes = 0; bytes < from_len; bytes += huffman->numBytes) {
/* Getting the next symbol, either 1 or 2 bytes */
symbol = *from++;
- if (huffman->numBytes == 2) symbol = (symbol << 8) | *from++;
+ if (huffman->numBytes == 2)
+ symbol = (symbol << 8) | *from++;
/*
- * Huffman expects to never see symbols zero frequency and out-of-band.
- * These may happen because the data changed since we computed the frequency tables.
- * We can still encode such illegal input symbols via an escape symbol: escape + raw next 8-or-16 bits.
- * (If escape symbol does appear, it's encoded the same way as "zero frequency".)
+ * Huffman expects to never see zero frequency and out-of-band
+ * symbols. These may happen because the data changed since we
+ * computed the frequency tables. We can still encode such
+ * illegal input symbols via an escape symbol: escape + raw
+ * next 8-or-16 bits. (If the escape symbol does appear, it's
+ * encoded the same way as "zero frequency".)
*/
- if (symbol >= huffman->numSymbols /* bad input: out of band */
- || (code = huffman->codes[symbol]).length == 0 /* bad input: declared zero frequency, yet here it is */
- || symbol == escape /* legitimate input symbol that was seized as escape symbol */
- ) {
- code = huffman->codes[escape];
-#ifdef DETAIL
+ if (symbol >= huffman->numSymbols
+ /* bad input: out of band */ ||
+ (code = huffman->codes[symbol]).length == 0
+ /* bad input: declared zero frequency, yet here it is */ ||
+ symbol == huffman->escape
+ /* legitimate input symbol seized as escape symbol */) {
+ code = huffman->codes[huffman->escape];
+#if __HUFFMAN_DETAIL
printf("encoding escape for %d\n", symbol);
#endif
esc = 1;
}
- /* translate symbol into Huffman code and stuff into buffer */
+ /* Translate symbol into Huffman code and stuff into buffer. */
len = code.length;
bits = (bits << len) | code.pattern;
valid += len;
@@ -616,18 +668,22 @@ __wt_huffman_encode(void *huffman_arg,
* bits used in the last byte.
*/
padding_info = (bitpos % 8) << (8 - WT_HUFFMAN_HEADER);
- *buf |= padding_info;
+ ((uint8_t *)tmp->mem)[0] |= padding_info;
/* Copy result of exact known size into caller's buffer. */
outlen = (bitpos + 7) / 8;
- WT_RET(__wt_buf_setsize(session, to_buf, outlen));
- memcpy(to_buf->mem, buf, outlen);
-#ifdef DETAIL
- printf("compress: worst case %d bytes -> actual %d\n", max_len, outlen);
- printf("(vs old code %d, which was memory leak for common symbols and error for infrequent)\n", from_len+1);
+ WT_ERR(__wt_buf_setsize(session, to_buf, outlen));
+ memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+ printf("encode: worst case %lu bytes -> actual %lu\n",
+ (u_long)max_len, (u_long)outlen);
#endif
- return (0);
+err: if (tmp != NULL)
+ __wt_scr_release(&tmp);
+ return (ret);
+
}
/*
@@ -640,41 +696,44 @@ __wt_huffman_encode(void *huffman_arg,
* and whose value is the corresponding symbol.
* From the symbol, we index into the 'codes' array to get the code length.
*
- * When decoding a message, we don't know where the boundaries are between codes.
- * The trick is that we collect enough bits for the longest code word,
- * and construct the table such that for codes with fewer bits we flood the table
- * with all of the bit patterns in the lower order bits.
- * This works because the Huffman code is a unique prefix, and
- * by the flooding we are treating bits beyond the unique prefix as don't care bits.
+ * When decoding a message, we don't know where the boundaries are between
+ * codes. The trick is that we collect enough bits for the longest code word,
+ * and construct the table such that for codes with fewer bits we flood the
+ * table with all of the bit patterns in the lower order bits. This works
+ * because the Huffman code is a unique prefix, and by the flooding we are
+ * treating bits beyond the unique prefix as don't care bits.
*
- * For example, we have table is of length 2^max_code_length (1<<max_code_length).
- * For a code of length, max_code_length, the position code2symbol[code] = symbol.
- * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol, as well
- * as code2symbol[(code << 1) | 1] = symbol.
+ * For example, we have table of length 2^max_code_length (1<<max_code_length).
+ * For a code of length, max_code_length, the position code2symbol[code] =
+ * symbol.
+ * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol,
+ * as well as code2symbol[(code << 1) | 1] = symbol.
* And so on, so in general we fill:
- * code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive].
+ * code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive].
*
- * To decode a message, we read in enough bits from input to fill the shift register with at least
- * MAX_CODE_LENGTH bits.
+ * To decode a message, we read in enough bits from input to fill the shift
+ * register with at least MAX_CODE_LENGTH bits.
* We look up in the table code2symbol to obtain the symbol.
* We look up the symbol in 'codes' to obtain the code length
* Finally, subtract off these bits from the shift register.
*/
int
-__wt_huffman_decode(void *huffman_arg,
- const uint8_t *from, uint32_t from_len, WT_BUF *to_buf)
+__wt_huffman_decode(
+ void *huffman_arg, const uint8_t *from, uint32_t from_len, WT_BUF *to_buf)
{
+ WT_BUF *tmp;
SESSION *session;
WT_HUFFMAN_OBJ *huffman;
- uint32_t bits, from_bits, from_len_bits, len, max_len, outlen, max;
- uint32_t mask, out_bits;
- uint16_t pattern;
- uint8_t valid;
- uint16_t symbol;
- uint8_t padding_info, *to, *buf;
+ uint32_t bits, from_bits, from_len_bits, len, mask, max, max_len;
+ uint32_t out_bits, outlen;
+ uint16_t pattern, symbol;
+ uint8_t padding_info, *to, valid;
+ int ret;
huffman = huffman_arg;
session = huffman->session;
+ tmp = NULL;
+ ret = 0;
/*
* We don't want to find all of our callers and ensure they don't pass
@@ -695,38 +754,41 @@ __wt_huffman_decode(void *huffman_arg,
from_len_bits -= 8 - padding_info;
/*
- * Compute largest uncompressed output size,
- * which is if all symbols are most frequent and so have smallest Huffman codes and therefore largest expansion.
- * (If some symbols use the escape code then the output is even smaller.)
- * Use the shared system buffer while uncompressing,
- * then allocate a new buffer of exactly the right size and copy the result into it.
+ * Compute largest uncompressed output size, which is if all symbols are
+ * most frequent and so have smallest Huffman codes and therefore
+ * largest expansion. (If some symbols use the escape code then the
+ * output is even smaller.) Use the shared system buffer while
+ * uncompressing, then allocate a new buffer of exactly the right size
+ * and copy the result into it.
*/
max_len = (from_len_bits / huffman->min_depth) * huffman->numBytes;
- buf = (uint8_t *)getbuf(max_len);
- to = buf;
+ WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
+ to = tmp->mem;
bits = *from++;
valid = 8 - WT_HUFFMAN_HEADER;
from_bits = from_len_bits - 8;
out_bits = from_len_bits - WT_HUFFMAN_HEADER;
max = huffman->max_depth;
- mask = (1 << max) - 1;
+ mask = (1U << max) - 1;
for (outlen = 0; out_bits > 0; outlen += huffman->numBytes) {
while (valid < max && from_bits > 0) {
bits = (bits << 8) | *from++;
valid += 8;
from_bits -= 8;
}
- pattern = valid >= max ? /* short patterns near end */
+ pattern = valid >= max ? /* short patterns near end */
(bits >> (valid - max)) : (bits << (max - valid));
symbol = huffman->code2symbol[pattern & mask];
len = huffman->codes[symbol].length;
valid -= len;
out_bits -= len;
- if (symbol == escape) {
-#ifdef DETAIL
- printf("decoding escape\n");
+ if (symbol == huffman->escape) {
+#if __HUFFMAN_DETAIL
+ /*
+ * printf("decoding escape\n");
+ */
#endif
len = huffman->numBytes * 8;
while (valid < len) {
@@ -734,7 +796,12 @@ __wt_huffman_decode(void *huffman_arg,
valid += 8;
from_bits -= 8;
}
- symbol = bits >> (valid - len); /* different than usual case: extract symbol not pattern */
+
+ /*
+ * different than usual case: extract symbol not
+ * pattern
+ */
+ symbol = bits >> (valid - len);
valid -= len;
out_bits -= len;
}
@@ -745,14 +812,17 @@ __wt_huffman_decode(void *huffman_arg,
}
/* Return the number of bytes used. */
- WT_RET(__wt_buf_setsize(session, to_buf, outlen));
- memcpy(to_buf->mem, buf, outlen);
-#ifdef DETAIL
- printf("uncompress: worst case %d bytes -> actual %d\n", max_len, outlen);
- printf("(vs old code %d, which was usually memory leak)\n", 2*from_len+1);
+ WT_ERR(__wt_buf_setsize(session, to_buf, outlen));
+ memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+ printf("decode: worst case %lu bytes -> actual %lu\n",
+ (u_long)max_len, (u_long)outlen);
#endif
- return (0);
+err: if (tmp != NULL)
+ __wt_scr_release(&tmp);
+ return (ret);
}
/*
@@ -786,7 +856,7 @@ node_queue_enqueue(SESSION *session, NODE_QUEUE *queue, WT_FREQTREE_NODE *node)
NODE_QUEUE_ELEM *elem;
/* Allocating a new linked list element */
- WT_RET(__wt_calloc(session, 1, sizeof(NODE_QUEUE_ELEM), &elem));
+ WT_RET(__wt_calloc_def(session, 1, &elem));
/* It holds the tree node, and has no next element yet */
elem->node = node;