diff options
author | Keith Bostic <keith.bostic@wiredtiger.com> | 2011-05-19 14:57:16 -0400 |
---|---|---|
committer | Keith Bostic <keith.bostic@wiredtiger.com> | 2011-05-19 14:57:16 -0400 |
commit | b3e45e60b2db88b323409e8d05b4e89f928283b3 (patch) | |
tree | c6da7eb62af4382156e070e5d05570b0e095d59f /src/support/huffman.c | |
parent | 8aaf428e6e72eff48718296e4c6d255afff2927c (diff) | |
download | mongo-b3e45e60b2db88b323409e8d05b4e89f928283b3.tar.gz |
Integrate Tom's new code drop into the tree.
Diffstat (limited to 'src/support/huffman.c')
-rw-r--r-- | src/support/huffman.c | 456 |
1 files changed, 263 insertions, 193 deletions
diff --git a/src/support/huffman.c b/src/support/huffman.c index c55fe8981eb..a7a5079c610 100644 --- a/src/support/huffman.c +++ b/src/support/huffman.c @@ -5,29 +5,22 @@ * All rights reserved. */ -#include "XXX.h" - -#define DETAIL -/* "The assert() macro may be removed at compile time with the cc(1) option -DNDEBUG." */ -#include <assert.h> +#include "wt_internal.h" +#define __HUFFMAN_DETAIL 0 /* Set to 1 for debugging output. */ /* Length of header in compressed message, in bits. */ -#define WT_HUFFMAN_HEADER 3 - - -/* Escape symbol. */ -static uint16_t escape; - +#define WT_HUFFMAN_HEADER 3 /* - * Maximum allowed length of Huffman code words, which otherwise can range up to (#symbols - 1) bits long. - * Lower value to use less memory for tables, higher value for better compression. - * Max value = 16 (or 32-7=25 or 64-7=57 if adjust data types). FYI, JPEG uses 16. - * A side effect of limiting max code length is that the worst case compression (a message of the least frequent symbols) is shorter. + * Maximum allowed length of Huffman code words, which otherwise can range up + * to (#symbols - 1) bits long. Lower value to use less memory for tables, + * higher value for better compression. Max value = 16 (or 32-7=25 or 64-7=57 + * if adjust data types). FYI, JPEG uses 16. A side effect of limiting max + * code length is that the worst case compression (a message of the least + * frequent symbols) is shorter. */ -#define MAX_CODE_LENGTH 16 - +#define MAX_CODE_LENGTH 16 typedef struct __wt_freqtree_node { /* @@ -42,7 +35,9 @@ typedef struct __wt_freqtree_node { } WT_FREQTREE_NODE; typedef struct __wt_huffman_code { - uint16_t pattern; /* requirement: length of field's type in bits >= max_code_length */ + uint16_t pattern; /* requirement: length of field's type + * in bits >= max_code_length. + */ uint8_t length; } WT_HUFFMAN_CODE; @@ -58,16 +53,24 @@ typedef struct __wt_huffman_obj { /* Tree in static array reprentation */ uint16_t max_depth, min_depth; + uint16_t escape; /* Escape symbol. */ + /* - * use: codes[symbol] = struct with pattern and length. Used in encoding and decoding. + * use: codes[symbol] = struct with pattern and length. + * Used in encoding and decoding. * memory: codes[0-to-(number of symbols - 1)] */ WT_HUFFMAN_CODE *codes; + /* - * use: code2symbol[Huffman_code] = symbol. Used in decoding. + * use: code2symbol[Huffman_code] = symbol. + * Used in decoding. * memory: code2symbol[1 << max_code_length] */ - uint16_t *code2symbol; /* If <=256 symbols use uint8_t, if <=64K symbols use uint16_t, else uint32_t. */ + uint16_t *code2symbol; /* If <= 256 symbols use uint8_t, if + * <= 64K symbols use uint16_t, else + * uint32_t. + */ } WT_HUFFMAN_OBJ; /* @@ -99,13 +102,18 @@ typedef struct node_queue { typedef struct __indexed_byte { uint8_t frequency; uint16_t symbol; -} INDEXED_BYTE; +} INDEXED_SYMBOL; static int indexed_byte_comparator(const void *, const void *); +static void make_table( + SESSION *, uint16_t *, uint16_t, WT_HUFFMAN_CODE *, u_int); static void node_queue_close(SESSION *, NODE_QUEUE *); static void node_queue_dequeue(SESSION *, NODE_QUEUE *, WT_FREQTREE_NODE **); static int node_queue_enqueue(SESSION *, NODE_QUEUE *, WT_FREQTREE_NODE *); +static uint32_t profile_tree( + WT_FREQTREE_NODE *, uint16_t, uint16_t *, uint16_t *); static void recursive_free_node(SESSION *, WT_FREQTREE_NODE *); +static void set_codes(WT_FREQTREE_NODE *, WT_HUFFMAN_CODE *, uint16_t, uint8_t); #define node_queue_is_empty(queue) \ (((queue) == NULL || (queue)->first == NULL) ? 1 : 0) @@ -117,57 +125,52 @@ static void recursive_free_node(SESSION *, WT_FREQTREE_NODE *); static int indexed_byte_comparator(const void *elem1, const void *elem2) { - return (((INDEXED_BYTE *) - elem1)->frequency) - (((INDEXED_BYTE *)elem2)->frequency); + return (((INDEXED_SYMBOL *) + elem1)->frequency) - (((INDEXED_SYMBOL *)elem2)->frequency); } - -#if 1 -/* Keith: take this out when you hook up your buffer */ -static int buflen = 0; -static char* buf = NULL; - -/* return a buffer of at least min_len bytes */ -char* getbuf(int min_len) { - if (buflen < min_len) { - if (buf!=NULL) free(buf); - buf = malloc(min_len + 10); - } - return buf; -} -#endif - - /* * profile_tree -- - * Traverses tree to determine #leaves under each node, max depth, min depth of leaf. + * Traverses tree to determine #leaves under each node, max depth, min + * depth of leaf. */ static uint32_t -profile_tree(WT_FREQTREE_NODE *node, uint16_t len, uint16_t *max_depth, uint16_t *min_depth) +profile_tree(WT_FREQTREE_NODE *node, + uint16_t len, uint16_t *max_depth, uint16_t *min_depth) { uint32_t leaf_cnt; + if (node->left == NULL && node->right == NULL) { /* leaf */ leaf_cnt = 1; - if (*max_depth < len) *max_depth = len; - if (*min_depth > len) *min_depth = len; - } else { /* internal node -- way tree constructed internal always has left and right children */ - leaf_cnt = profile_tree(node->left, len+1, max_depth, min_depth) + profile_tree(node->right, len+1, max_depth, min_depth); + if (*max_depth < len) + *max_depth = len; + if (*min_depth > len) + *min_depth = len; + } else { + /* + * internal node -- way tree constructed internal always has + * left and right children + */ + leaf_cnt = + profile_tree(node->left, len + 1, max_depth, min_depth) + + profile_tree(node->right, len + 1, max_depth, min_depth); } - node->weight = leaf_cnt; /* abuse weight field */ - return leaf_cnt; + node->weight = leaf_cnt; /* abuse weight field */ + return (leaf_cnt); } - /* * set_codes -- * Computes Huffman code for each symbol in tree. - * Method is standard way in the literature, except that limits maximum code length. - * A known max code length is important for limiting memory use by the tables - * and for knowing how large data types need to be such as the field that holds the code pattern. + * + * Method is standard way in the literature, except that limits maximum code + * length. A known max code length is important for limiting memory use by + * the tables and for knowing how large data types need to be such as the field + * that holds the code pattern. */ static void -set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes, - uint16_t pattern, uint8_t len) +set_codes(WT_FREQTREE_NODE *node, + WT_HUFFMAN_CODE *codes, uint16_t pattern, uint8_t len) { WT_HUFFMAN_CODE *code; uint16_t patternleft, patternright, half; @@ -177,14 +180,25 @@ set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes, code = &codes[node->symbol]; code->pattern = pattern; code->length = len; - /*printf("%d: code %x, len %d\n", node->symbol, code, len);*/ + /* + * printf("%d: code %x, len %d\n", node->symbol, code, len); + */ } else { - /* Check each subtree individually to see if can afford here to split up bits into possibly shorter codes, - or if need to employ all remaining bits up to MAX_CODE_LENGTH to consecutively number leaves. */ + /* + * Check each subtree individually to see if can afford here + * to split up bits into possibly shorter codes, or if need + * to employ all remaining bits up to MAX_CODE_LENGTH to + * consecutively number leaves. + */ remaining = MAX_CODE_LENGTH - len; - if (len < MAX_CODE_LENGTH /* not alraedy in "low-bit mode"... */ - && ((half = 1 << (remaining - 1)) < node->left->weight || half < node->right->weight)) { /* ...but need to be */ - pattern = pattern << remaining; /* open up lower-order bits for consecutive numbering */ + /* + * If not already in "low-bit mode", but need to be, open up + * lower-order bits for consecutive numbering. + */ + if (len < MAX_CODE_LENGTH && + ((half = 1 << (remaining - 1)) < node->left->weight || + half < node->right->weight)) { + pattern = pattern << remaining; len = MAX_CODE_LENGTH; } @@ -192,10 +206,10 @@ set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes, patternleft = (pattern << 1) | 0; patternright = (pattern << 1) | 1; len++; - } else { /* "low bit mode" */ + } else { /* "low bit mode" */ patternleft = pattern; patternright = pattern + node->left->weight; - /* len unchanged */ + /* len unchanged */ } set_codes(node->left, codes, patternleft, len); @@ -203,16 +217,15 @@ set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes, } } - /* * make_table -- - * Computes Huffman table used for subsequent lookups in encoding and decoding. - * With the table, encoding from a symbol to Huffman code - * and decoding from a code to a symbol are simple array lookups. + * Computes Huffman table used for subsequent lookups in encoding and + * decoding. With the table, encoding from a symbol to Huffman code and + * decoding from a code to a symbol are simple array lookups. */ static void -make_table(uint16_t *code2symbol, - uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int nbytes) +make_table(SESSION *session, uint16_t *code2symbol, + uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int symcnt) { uint16_t c, c1, c2, i, j; uint8_t len, shift; @@ -221,7 +234,7 @@ make_table(uint16_t *code2symbol, * Here's the magic: flood all bit patterns for lower-order bits to * point to same symbol. */ - for (i = 0; i < nbytes; i++) { + for (i = 0; i < symcnt; i++) { if ((len = codes[i].length) == 0) continue; @@ -237,13 +250,12 @@ make_table(uint16_t *code2symbol, c1 = c << shift; c2 = (c + 1) << shift; for (j = c1; j < c2; j++) { - assert (code2symbol[j] == 0); + WT_ASSERT(session, code2symbol[j] == 0); code2symbol[j] = i; } } } - /* * recursive_free_node -- * Recursively free the huffman frequency tree's nodes. @@ -267,15 +279,14 @@ recursive_free_node(SESSION *session, WT_FREQTREE_NODE *node) * lowest rank is 1 (0 means the byte never appears in the input), so 1 byte * is needed to hold the rank and the input table must be 1 byte x 256 values. * - * For UTF-16 (nbytes == 2) the range is 0 - 65535 and the max rank is 65535. + * For UTF-16 (symcnt == 2) the range is 0 - 65535 and the max rank is 65535. * The table should be 2 bytes x 65536 values. */ int __wt_huffman_open(SESSION *session, - uint8_t const *byte_frequency_array, u_int nbytes, void *retp) + uint8_t const *byte_frequency_array, u_int symcnt, void *retp) { - INDEXED_BYTE *indexed_freqs; - INDEXED_BYTE esc; + INDEXED_SYMBOL esc, *indexed_freqs; NODE_QUEUE *combined_nodes, *leaves; WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode; WT_HUFFMAN_OBJ *huffman; @@ -289,42 +300,51 @@ __wt_huffman_open(SESSION *session, ret = 0; WT_RET(__wt_calloc_def(session, 1, &huffman)); - WT_ERR(__wt_calloc_def(session, (size_t)nbytes, &indexed_freqs)); + WT_ERR(__wt_calloc_def(session, (size_t)symcnt, &indexed_freqs)); huffman->session = session; /* * The frequency array must be sorted to be able to use linear time * construction algorithm. */ - for (i = 0; i < nbytes; i++) { + for (i = 0; i < symcnt; i++) { indexed_freqs[i].frequency = byte_frequency_array[i]; indexed_freqs[i].symbol = i; } /* - * The escape symbol dynamically seizes the lowest-frequency symbol, which may have 0 frequency. - * We use the escape code only for cases that would otherwise be errors, which are hopefully rare. - * We choose the lowest-frequency to have minimal impact (close to no impact) on compression ratios, - * but if there are lots of errors we pay the longest codeword length in addition to the raw bits of the symbol. + * The escape symbol dynamically seizes the lowest-frequency symbol, + * which may have 0 frequency. We use the escape code only for cases + * that would otherwise be errors, which are hopefully rare. Choose + * the lowest-frequency to have minimal impact (close to no impact) on + * compression ratios, but if there are lots of errors we pay the + * longest codeword length in addition to the raw bits of the symbol. */ esc = indexed_freqs[0]; - for (i = 0+1; i < nbytes; i++) - if (indexed_freqs[i].frequency < esc.frequency) esc = indexed_freqs[i]; - escape = esc.symbol; -#ifdef DETAIL - printf("taking over symbol %d (freq %d) as escape\n", escape, esc.frequency); + for (i = 0+1; i < symcnt; i++) + if (indexed_freqs[i].frequency < esc.frequency) + esc = indexed_freqs[i]; + huffman->escape = esc.symbol; +#if __HUFFMAN_DETAIL + printf("taking over symbol %d (freq %d) as escape\n", + huffman->escape, esc.frequency); #endif - if (esc.frequency == 0) esc.frequency++; /* zero frequency left out of codes, so make sure this doesn't happen to the escape symbol */ + /* + * Zero frequency left out of codes, so make sure this doesn't happen + * to the escape symbol. + */ + if (esc.frequency == 0) + esc.frequency++; qsort(indexed_freqs, - nbytes, sizeof(INDEXED_BYTE), indexed_byte_comparator); + symcnt, sizeof(INDEXED_SYMBOL), indexed_byte_comparator); /* We need two node queues to build the tree. */ WT_ERR(__wt_calloc_def(session, 1, &leaves)); WT_ERR(__wt_calloc_def(session, 1, &combined_nodes)); /* Adding the leaves to the queue */ - for (i = 0; i < nbytes; ++i) { + for (i = 0; i < symcnt; ++i) { /* * We are leaving out symbols with a frequency of 0. This * assumes these symbols will NEVER occur in the source stream, @@ -394,35 +414,52 @@ __wt_huffman_open(SESSION *session, /* * The remaining node is in the node variable, this is the root of the - * tree. Calculate the number of bytes it takes to hold nbytes bits. + * tree. Calculate how many bytes it takes to hold symcnt bytes bits. */ - huffman->numSymbols = nbytes; - huffman->numBytes = nbytes > 256 ? 2 : 1; + huffman->numSymbols = symcnt; + huffman->numBytes = symcnt > 256 ? 2 : 1; huffman->max_depth = 0; huffman->min_depth = MAX_CODE_LENGTH; - profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth); - if (huffman->max_depth > MAX_CODE_LENGTH) huffman->max_depth = MAX_CODE_LENGTH; + (void)profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth); + if (huffman->max_depth > MAX_CODE_LENGTH) + huffman->max_depth = MAX_CODE_LENGTH; WT_ERR(__wt_calloc_def(session, huffman->numSymbols, &huffman->codes)); set_codes(node, huffman->codes, 0, 0); - WT_ERR(__wt_calloc_def(session, 1U << huffman->max_depth, &huffman->code2symbol)); - make_table(huffman->code2symbol, huffman->max_depth, huffman->codes, huffman->numSymbols); - /*huffman->code2symbol = make_table(huffman->codes, nbytes, huffman->max_depth); => have to have access to "err:" label */ -#ifdef DETAIL - printf("leaf depth %d..%d, memory use: codes %d# * %u size + code2symbol %d# * %u size\n", - huffman->min_depth, huffman->max_depth, - nbytes, sizeof(WT_HUFFMAN_CODE), - (1U << huffman->max_depth), sizeof(uint16_t)); - - /* measure quality of computed Huffman codes, for different max bit lengths (say, 16 vs 24 vs 32) */ - uint32_t weighted_length = 0; - for (i = 0; i<nbytes; i++) { - uint16_t sym = indexed_freqs[i].symbol; - weighted_length += indexed_freqs[i].frequency * huffman->codes[sym].length; - /*printf("\t%d->%d. %d * %d\n", i,sym, indexed_freqs[i].frequency, huffman->codes[sym].length);*/ + WT_ERR(__wt_calloc_def( + session, 1U << huffman->max_depth, &huffman->code2symbol)); + make_table(session, huffman->code2symbol, + huffman->max_depth, huffman->codes, huffman->numSymbols); + +#if __HUFFMAN_DETAIL + { + uint32_t weighted_length; + uint16_t sym; + + printf("leaf depth %lu..%lu, memory use: " + "codes %u# * %u size + code2symbol %lu# * %u size\n", + (u_long)huffman->min_depth, (u_long)huffman->max_depth, + symcnt, sizeof(WT_HUFFMAN_CODE), + (u_long)(1U << huffman->max_depth), sizeof(uint16_t)); + + /* + * measure quality of computed Huffman codes, for different max bit + * lengths (say, 16 vs 24 vs 32) + */ + weighted_length = 0; + for (i = 0; i < symcnt; i++) { + sym = indexed_freqs[i].symbol; + weighted_length += + indexed_freqs[i].frequency * huffman->codes[sym].length; + /* + * printf("\t%d->%d. %d * %d\n", i, sym, + * indexed_freqs[i].frequency, huffman->codes[sym].length); + */ + } + printf("weighted length of all codes (the smaller the better): %lu\n", + (u_long)weighted_length); } - printf("weighted length of all codes (the smaller the better): %d\n", weighted_length); #endif *(void **)retp = huffman; @@ -460,7 +497,7 @@ __wt_huffman_close(SESSION *session, void *huffman_arg) __wt_free(session, huffman); } -#ifdef HAVE_DIAGNOSTIC +#if __HUFFMAN_DETAIL /* * __wt_print_huffman_code -- * Prints a symbol's Huffman code. @@ -468,9 +505,9 @@ __wt_huffman_close(SESSION *session, void *huffman_arg) int __wt_print_huffman_code(void *huffman_arg, uint16_t symbol) { - WT_HUFFMAN_OBJ *huffman; WT_HUFFMAN_CODE code; - u_int i; + WT_HUFFMAN_OBJ *huffman; + huffman = huffman_arg; if (symbol >= huffman->numSymbols) { @@ -478,7 +515,8 @@ __wt_print_huffman_code(void *huffman_arg, uint16_t symbol) } else { code = huffman->codes[symbol]; if (code.length == 0) - printf("symbol %d not defined -- 0 frequency\n", symbol); + printf( + "symbol %d not defined -- 0 frequency\n", symbol); else /* should print code as binary */ printf("%d -> code pattern %x, length %d\n", @@ -495,8 +533,8 @@ __wt_print_huffman_code(void *huffman_arg, uint16_t symbol) * * Translation from symbol to Huffman code is a simple array lookup. * - * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per symbol. - * Then, given a symbol: + * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per + * symbol. Then, given a symbol: * pattern = codes[symbol].pattern; * length = codes[symbol].length; * @@ -507,19 +545,22 @@ __wt_print_huffman_code(void *huffman_arg, uint16_t symbol) * and write header bits. */ int -__wt_huffman_encode(void *huffman_arg, - const uint8_t *from, uint32_t from_len, WT_BUF *to_buf) +__wt_huffman_encode( + void *huffman_arg, const uint8_t *from, uint32_t from_len, WT_BUF *to_buf) { + WT_BUF *tmp; SESSION *session; - WT_HUFFMAN_OBJ *huffman; WT_HUFFMAN_CODE code; + WT_HUFFMAN_OBJ *huffman; uint32_t max_len, outlen, bitpos, bytes; uint16_t symbol; - uint8_t len, esc, padding_info, *out, *buf; + uint8_t len, esc, padding_info, *out; + int ret; + /* * Shift register to accumulate bits from input. - * Should be >= (MAX_CODE_LENGTH + 7), - * but also efficient to shift bits and preferably in a machine register. + * Should be >= (MAX_CODE_LENGTH + 7), but also efficient to shift bits + * and preferably in a machine register. */ uint32_t bits; @@ -528,6 +569,8 @@ __wt_huffman_encode(void *huffman_arg, huffman = huffman_arg; session = huffman->session; + tmp = NULL; + ret = 0; /* * We don't want to find all of our callers and ensure they don't pass @@ -539,16 +582,20 @@ __wt_huffman_encode(void *huffman_arg, } /* - * Compute the largest compressed output size, - * which is if all symbols are least frequent and so have largest Huffman codes, - * and compressed output may be larger than the input size. - * This way we don't have to worry about resizing the buffer during compression. - * Use the shared system buffer while compressing, - * then allocate a new buffer of exactly the right size and copy the result into it. + * Compute the largest compressed output size, which is if all symbols + * are least frequent and so have largest Huffman codes, and compressed + * output may be larger than the input size. This way we don't have to + * worry about resizing the buffer during compression. Use the shared + * system buffer while compressing, then allocate a new buffer of the + * right size and copy the result into it. */ - max_len = (WT_HUFFMAN_HEADER + from_len * huffman->max_depth + 7/*round up to full byte*/) / 8; - max_len += from_len * huffman->numBytes; /* if all symbols used escape code (ugh!) */ - buf = (uint8_t *)getbuf(max_len); + max_len = (WT_HUFFMAN_HEADER + + from_len * huffman->max_depth + 7 /* round up to full byte */) / 8; + /* + * if all symbols used escape code (ugh!) + */ + max_len += from_len * huffman->numBytes; + WT_ERR(__wt_scr_alloc(session, max_len, &tmp)); /* * Leave the first 3 bits of the encoded value empty, it holds the @@ -557,31 +604,36 @@ __wt_huffman_encode(void *huffman_arg, bits = 0; bitpos = WT_HUFFMAN_HEADER; valid = WT_HUFFMAN_HEADER; - out = buf; + out = tmp->mem; esc = 0; for (bytes = 0; bytes < from_len; bytes += huffman->numBytes) { /* Getting the next symbol, either 1 or 2 bytes */ symbol = *from++; - if (huffman->numBytes == 2) symbol = (symbol << 8) | *from++; + if (huffman->numBytes == 2) + symbol = (symbol << 8) | *from++; /* - * Huffman expects to never see symbols zero frequency and out-of-band. - * These may happen because the data changed since we computed the frequency tables. - * We can still encode such illegal input symbols via an escape symbol: escape + raw next 8-or-16 bits. - * (If escape symbol does appear, it's encoded the same way as "zero frequency".) + * Huffman expects to never see zero frequency and out-of-band + * symbols. These may happen because the data changed since we + * computed the frequency tables. We can still encode such + * illegal input symbols via an escape symbol: escape + raw + * next 8-or-16 bits. (If the escape symbol does appear, it's + * encoded the same way as "zero frequency".) */ - if (symbol >= huffman->numSymbols /* bad input: out of band */ - || (code = huffman->codes[symbol]).length == 0 /* bad input: declared zero frequency, yet here it is */ - || symbol == escape /* legitimate input symbol that was seized as escape symbol */ - ) { - code = huffman->codes[escape]; -#ifdef DETAIL + if (symbol >= huffman->numSymbols + /* bad input: out of band */ || + (code = huffman->codes[symbol]).length == 0 + /* bad input: declared zero frequency, yet here it is */ || + symbol == huffman->escape + /* legitimate input symbol seized as escape symbol */) { + code = huffman->codes[huffman->escape]; +#if __HUFFMAN_DETAIL printf("encoding escape for %d\n", symbol); #endif esc = 1; } - /* translate symbol into Huffman code and stuff into buffer */ + /* Translate symbol into Huffman code and stuff into buffer. */ len = code.length; bits = (bits << len) | code.pattern; valid += len; @@ -616,18 +668,22 @@ __wt_huffman_encode(void *huffman_arg, * bits used in the last byte. */ padding_info = (bitpos % 8) << (8 - WT_HUFFMAN_HEADER); - *buf |= padding_info; + ((uint8_t *)tmp->mem)[0] |= padding_info; /* Copy result of exact known size into caller's buffer. */ outlen = (bitpos + 7) / 8; - WT_RET(__wt_buf_setsize(session, to_buf, outlen)); - memcpy(to_buf->mem, buf, outlen); -#ifdef DETAIL - printf("compress: worst case %d bytes -> actual %d\n", max_len, outlen); - printf("(vs old code %d, which was memory leak for common symbols and error for infrequent)\n", from_len+1); + WT_ERR(__wt_buf_setsize(session, to_buf, outlen)); + memcpy(to_buf->mem, tmp->mem, outlen); + +#if __HUFFMAN_DETAIL + printf("encode: worst case %lu bytes -> actual %lu\n", + (u_long)max_len, (u_long)outlen); #endif - return (0); +err: if (tmp != NULL) + __wt_scr_release(&tmp); + return (ret); + } /* @@ -640,41 +696,44 @@ __wt_huffman_encode(void *huffman_arg, * and whose value is the corresponding symbol. * From the symbol, we index into the 'codes' array to get the code length. * - * When decoding a message, we don't know where the boundaries are between codes. - * The trick is that we collect enough bits for the longest code word, - * and construct the table such that for codes with fewer bits we flood the table - * with all of the bit patterns in the lower order bits. - * This works because the Huffman code is a unique prefix, and - * by the flooding we are treating bits beyond the unique prefix as don't care bits. + * When decoding a message, we don't know where the boundaries are between + * codes. The trick is that we collect enough bits for the longest code word, + * and construct the table such that for codes with fewer bits we flood the + * table with all of the bit patterns in the lower order bits. This works + * because the Huffman code is a unique prefix, and by the flooding we are + * treating bits beyond the unique prefix as don't care bits. * - * For example, we have table is of length 2^max_code_length (1<<max_code_length). - * For a code of length, max_code_length, the position code2symbol[code] = symbol. - * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol, as well - * as code2symbol[(code << 1) | 1] = symbol. + * For example, we have table of length 2^max_code_length (1<<max_code_length). + * For a code of length, max_code_length, the position code2symbol[code] = + * symbol. + * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol, + * as well as code2symbol[(code << 1) | 1] = symbol. * And so on, so in general we fill: - * code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive]. + * code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive]. * - * To decode a message, we read in enough bits from input to fill the shift register with at least - * MAX_CODE_LENGTH bits. + * To decode a message, we read in enough bits from input to fill the shift + * register with at least MAX_CODE_LENGTH bits. * We look up in the table code2symbol to obtain the symbol. * We look up the symbol in 'codes' to obtain the code length * Finally, subtract off these bits from the shift register. */ int -__wt_huffman_decode(void *huffman_arg, - const uint8_t *from, uint32_t from_len, WT_BUF *to_buf) +__wt_huffman_decode( + void *huffman_arg, const uint8_t *from, uint32_t from_len, WT_BUF *to_buf) { + WT_BUF *tmp; SESSION *session; WT_HUFFMAN_OBJ *huffman; - uint32_t bits, from_bits, from_len_bits, len, max_len, outlen, max; - uint32_t mask, out_bits; - uint16_t pattern; - uint8_t valid; - uint16_t symbol; - uint8_t padding_info, *to, *buf; + uint32_t bits, from_bits, from_len_bits, len, mask, max, max_len; + uint32_t out_bits, outlen; + uint16_t pattern, symbol; + uint8_t padding_info, *to, valid; + int ret; huffman = huffman_arg; session = huffman->session; + tmp = NULL; + ret = 0; /* * We don't want to find all of our callers and ensure they don't pass @@ -695,38 +754,41 @@ __wt_huffman_decode(void *huffman_arg, from_len_bits -= 8 - padding_info; /* - * Compute largest uncompressed output size, - * which is if all symbols are most frequent and so have smallest Huffman codes and therefore largest expansion. - * (If some symbols use the escape code then the output is even smaller.) - * Use the shared system buffer while uncompressing, - * then allocate a new buffer of exactly the right size and copy the result into it. + * Compute largest uncompressed output size, which is if all symbols are + * most frequent and so have smallest Huffman codes and therefore + * largest expansion. (If some symbols use the escape code then the + * output is even smaller.) Use the shared system buffer while + * uncompressing, then allocate a new buffer of exactly the right size + * and copy the result into it. */ max_len = (from_len_bits / huffman->min_depth) * huffman->numBytes; - buf = (uint8_t *)getbuf(max_len); - to = buf; + WT_ERR(__wt_scr_alloc(session, max_len, &tmp)); + to = tmp->mem; bits = *from++; valid = 8 - WT_HUFFMAN_HEADER; from_bits = from_len_bits - 8; out_bits = from_len_bits - WT_HUFFMAN_HEADER; max = huffman->max_depth; - mask = (1 << max) - 1; + mask = (1U << max) - 1; for (outlen = 0; out_bits > 0; outlen += huffman->numBytes) { while (valid < max && from_bits > 0) { bits = (bits << 8) | *from++; valid += 8; from_bits -= 8; } - pattern = valid >= max ? /* short patterns near end */ + pattern = valid >= max ? /* short patterns near end */ (bits >> (valid - max)) : (bits << (max - valid)); symbol = huffman->code2symbol[pattern & mask]; len = huffman->codes[symbol].length; valid -= len; out_bits -= len; - if (symbol == escape) { -#ifdef DETAIL - printf("decoding escape\n"); + if (symbol == huffman->escape) { +#if __HUFFMAN_DETAIL + /* + * printf("decoding escape\n"); + */ #endif len = huffman->numBytes * 8; while (valid < len) { @@ -734,7 +796,12 @@ __wt_huffman_decode(void *huffman_arg, valid += 8; from_bits -= 8; } - symbol = bits >> (valid - len); /* different than usual case: extract symbol not pattern */ + + /* + * different than usual case: extract symbol not + * pattern + */ + symbol = bits >> (valid - len); valid -= len; out_bits -= len; } @@ -745,14 +812,17 @@ __wt_huffman_decode(void *huffman_arg, } /* Return the number of bytes used. */ - WT_RET(__wt_buf_setsize(session, to_buf, outlen)); - memcpy(to_buf->mem, buf, outlen); -#ifdef DETAIL - printf("uncompress: worst case %d bytes -> actual %d\n", max_len, outlen); - printf("(vs old code %d, which was usually memory leak)\n", 2*from_len+1); + WT_ERR(__wt_buf_setsize(session, to_buf, outlen)); + memcpy(to_buf->mem, tmp->mem, outlen); + +#if __HUFFMAN_DETAIL + printf("decode: worst case %lu bytes -> actual %lu\n", + (u_long)max_len, (u_long)outlen); #endif - return (0); +err: if (tmp != NULL) + __wt_scr_release(&tmp); + return (ret); } /* @@ -786,7 +856,7 @@ node_queue_enqueue(SESSION *session, NODE_QUEUE *queue, WT_FREQTREE_NODE *node) NODE_QUEUE_ELEM *elem; /* Allocating a new linked list element */ - WT_RET(__wt_calloc(session, 1, sizeof(NODE_QUEUE_ELEM), &elem)); + WT_RET(__wt_calloc_def(session, 1, &elem)); /* It holds the tree node, and has no next element yet */ elem->node = node; |