diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/support/huffman.c')
-rw-r--r-- | src/third_party/wiredtiger/src/support/huffman.c | 899 |
1 files changed, 899 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c new file mode 100644 index 00000000000..5a06b72d33e --- /dev/null +++ b/src/third_party/wiredtiger/src/support/huffman.c @@ -0,0 +1,899 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +#define __HUFFMAN_DETAIL 0 /* Set to 1 for debugging output. */ + +/* Length of header in compressed message, in bits. */ +#define WT_HUFFMAN_HEADER 3 + +/* + * Maximum allowed length of Huffman code words, which otherwise can range up + * to (#symbols - 1) bits long. Lower value to use less memory for tables, + * higher value for better compression. Max value = 16 (or 32-7=25 or 64-7=57 + * if adjust data types). FYI, JPEG uses 16. A side effect of limiting max + * code length is that the worst case compression (a message of the least + * frequent symbols) is shorter. + */ +#define MAX_CODE_LENGTH 16 + +typedef struct __wt_freqtree_node { + /* + * Data structure representing a node of the huffman tree. It holds a + * 64-bit weight and pointers to the left and right child nodes. The + * node either has two child nodes or none. + */ + uint8_t symbol; /* only used in leaf nodes */ + uint64_t weight; + struct __wt_freqtree_node *left; /* bit 0 */ + struct __wt_freqtree_node *right; /* bit 1 */ +} WT_FREQTREE_NODE; + +typedef struct __wt_huffman_code { + uint16_t pattern; /* requirement: length of field's type + * in bits >= MAX_CODE_LENGTH. + */ + uint8_t length; +} WT_HUFFMAN_CODE; + +typedef struct __wt_huffman_obj { + /* + * Data structure here defines specific instance of the encoder/decoder. + */ + u_int numSymbols; /* Symbols: UINT16_MAX or UINT8_MAX */ + + uint16_t max_depth, min_depth; /* Tree max/min depths */ + + /* + * use: codes[symbol] = struct with pattern and length. + * Used in encoding and decoding. + * memory: codes[0-to-(number of symbols - 1)] + */ + WT_HUFFMAN_CODE *codes; + + /* + * use: code2symbol[Huffman_code] = symbol. + * Used in decoding. + * memory: code2symbol[1 << max_code_length] + */ + uint8_t *code2symbol; +} WT_HUFFMAN_OBJ; + +/* + * Queue element data structure. + * + * Consists of a pointer to a huffman tree node, and a pointer to the next + * element in the queue. + */ +typedef struct node_queue_elem { + WT_FREQTREE_NODE *node; + struct node_queue_elem *next; +} NODE_QUEUE_ELEM; + +/* + * Queue of huffman tree nodes. + * + * Contains a pointer to the beginning and the end of the queue, which is + * implemented as a linked list. + */ +typedef struct node_queue { + NODE_QUEUE_ELEM *first; + NODE_QUEUE_ELEM *last; +} NODE_QUEUE; + +/* + * Internal data structure used to preserve the symbol when rearranging the + * frequency array. + */ +typedef struct __indexed_byte { + uint32_t symbol; /* not uint8_t: match external data structure */ + uint32_t frequency; +} INDEXED_SYMBOL; + +static int indexed_freq_compare(const void *, const void *); +static int indexed_symbol_compare(const void *, const void *); +static void make_table( + WT_SESSION_IMPL *, uint8_t *, uint16_t, WT_HUFFMAN_CODE *, u_int); +static void node_queue_close(WT_SESSION_IMPL *, NODE_QUEUE *); +static void node_queue_dequeue( + WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE **); +static int node_queue_enqueue( + WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE *); +static uint32_t profile_tree( + WT_FREQTREE_NODE *, uint16_t, uint16_t *, uint16_t *); +static void recursive_free_node(WT_SESSION_IMPL *, WT_FREQTREE_NODE *); +static void set_codes(WT_FREQTREE_NODE *, WT_HUFFMAN_CODE *, uint16_t, uint8_t); + +#define node_queue_is_empty(queue) \ + ((queue) == NULL || (queue)->first == NULL) + +/* + * indexed_symbol_compare -- + * Qsort comparator to order the table by symbol, lowest to highest. + */ +static int +indexed_symbol_compare(const void *a, const void *b) +{ + return (((INDEXED_SYMBOL *)a)->symbol > + ((INDEXED_SYMBOL *)b)->symbol ? 1 : + (((INDEXED_SYMBOL *)a)->symbol < + ((INDEXED_SYMBOL *)b)->symbol ? -1 : 0)); +} + +/* + * indexed_freq_compare -- + * Qsort comparator to order the table by frequency (the most frequent + * symbols will be at the end of the array). + */ +static int +indexed_freq_compare(const void *a, const void *b) +{ + return (((INDEXED_SYMBOL *)a)->frequency > + ((INDEXED_SYMBOL *)b)->frequency ? 1 : + (((INDEXED_SYMBOL *)a)->frequency < + ((INDEXED_SYMBOL *)b)->frequency ? -1 : 0)); +} + +/* + * profile_tree -- + * Traverses tree to determine #leaves under each node, max depth, min + * depth of leaf. + */ +static uint32_t +profile_tree(WT_FREQTREE_NODE *node, + uint16_t len, uint16_t *max_depth, uint16_t *min_depth) +{ + uint32_t leaf_cnt; + + if (node->left == NULL && node->right == NULL) { /* leaf */ + leaf_cnt = 1; + if (*max_depth < len) + *max_depth = len; + if (*min_depth > len) + *min_depth = len; + } else { + /* + * internal node -- way tree constructed internal always has + * left and right children + */ + leaf_cnt = + profile_tree(node->left, len + 1, max_depth, min_depth) + + profile_tree(node->right, len + 1, max_depth, min_depth); + } + node->weight = leaf_cnt; /* abuse weight field */ + return (leaf_cnt); +} + +/* + * set_codes -- + * Computes Huffman code for each symbol in tree. + * + * Method is standard way in the literature, except that limits maximum code + * length. A known max code length is important for limiting memory use by + * the tables and for knowing how large data types need to be such as the field + * that holds the code pattern. + */ +static void +set_codes(WT_FREQTREE_NODE *node, + WT_HUFFMAN_CODE *codes, uint16_t pattern, uint8_t len) +{ + WT_HUFFMAN_CODE *code; + uint16_t patternleft, patternright, half; + uint8_t remaining; + + if (node->left == NULL && node->right == NULL) { + code = &codes[node->symbol]; + code->pattern = pattern; + code->length = len; +#if __HUFFMAN_DETAIL + printf("%" PRIx16 ": code %" PRIx16 ", len %" PRIu8 "\n", + node->symbol, pattern, len); +#endif + } else { + /* + * Check each subtree individually to see if can afford to split + * up bits into possibly shorter codes, or if need to employ all + * remaining bits up to MAX_CODE_LENGTH to consecutively number + * leaves. + */ + remaining = MAX_CODE_LENGTH - len; + /* + * If not already in "low-bit mode", but need to be, open up + * lower-order bits for consecutive numbering. + */ + if (len < MAX_CODE_LENGTH && + ((half = 1 << (remaining - 1)) < node->left->weight || + half < node->right->weight)) { + pattern = pattern << remaining; + len = MAX_CODE_LENGTH; + } + + if (len < MAX_CODE_LENGTH) { + patternleft = (pattern << 1) | 0; + patternright = (pattern << 1) | 1; + len++; + } else { /* "low bit mode" */ + patternleft = pattern; + patternright = pattern + node->left->weight; + /* len unchanged */ + } + + set_codes(node->left, codes, patternleft, len); + set_codes(node->right, codes, patternright, len); + } +} + +/* + * make_table -- + * Computes Huffman table used for subsequent lookups in encoding and + * decoding. With the table, encoding from a symbol to Huffman code and + * decoding from a code to a symbol are simple array lookups. + */ +static void +make_table(WT_SESSION_IMPL *session, uint8_t *code2symbol, + uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int symcnt) +{ + uint32_t j, c1, c2; /* Exceeds uint16_t bounds at loop boundary. */ + uint16_t c, i; + uint8_t len, shift; + + /* Zero out, for assertion below. */ + for (j = 0, c2 = (1U << max_depth); j < c2; j++) + code2symbol[j] = 0; + + /* + * Here's the magic: flood all bit patterns for lower-order bits to + * point to same symbol. + */ + for (i = 0; i < symcnt; i++) { + if ((len = codes[i].length) == 0) + continue; + + /* + * The size of the array index should be enough to hold largest + * index into symbol table. Pre-existing symbols were packed + * 0-255, so 8 bits is enough. Don't want to make it larger + * than necessary, we allocate (2 ^ max-code-length) of them. + */ + c = codes[i].pattern; + shift = max_depth - len; + c1 = (uint32_t)c << shift; + c2 = (uint32_t)(c + 1) << shift; + for (j = c1; j < c2; j++) { + WT_ASSERT(session, code2symbol[j] == 0); + code2symbol[j] = i; + } + } +} + +/* + * recursive_free_node -- + * Recursively free the huffman frequency tree's nodes. + */ +static void +recursive_free_node(WT_SESSION_IMPL *session, WT_FREQTREE_NODE *node) +{ + if (node != NULL) { + recursive_free_node(session, node->left); + recursive_free_node(session, node->right); + __wt_free(session, node); + } +} + +/* + * __wt_huffman_open -- + * Take a frequency table and return a pointer to a descriptor object. + */ +int +__wt_huffman_open(WT_SESSION_IMPL *session, + void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp) +{ + INDEXED_SYMBOL *indexed_freqs, *sym; + NODE_QUEUE *combined_nodes, *leaves; + WT_DECL_RET; + WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode; + WT_HUFFMAN_OBJ *huffman; + uint64_t w1, w2; + uint16_t i; + + indexed_freqs = symbol_frequency_array; + + combined_nodes = leaves = NULL; + node = node2 = tempnode = NULL; + + WT_RET(__wt_calloc_def(session, 1, &huffman)); + + /* + * The frequency table is 4B pairs of symbol and frequency. The symbol + * is either 1 or 2 bytes and the frequency ranges from 1 to UINT32_MAX + * (a frequency of 0 means the value is never expected to appear in the + * input). Validate the symbols are within range. + */ + if (numbytes != 1 && numbytes != 2) + WT_ERR_MSG(session, EINVAL, + "illegal number of symbol bytes specified for a huffman " + "table"); + + if (symcnt == 0) + WT_ERR_MSG(session, EINVAL, + "illegal number of symbols specified for a huffman table"); + + huffman->numSymbols = numbytes == 2 ? UINT16_MAX : UINT8_MAX; + + /* + * Order the array by symbol and check for invalid symbols and + * duplicates. + */ + qsort((void *)indexed_freqs, + symcnt, sizeof(INDEXED_SYMBOL), indexed_symbol_compare); + for (i = 0; i < symcnt; ++i) { + if (i > 0 && + indexed_freqs[i].symbol == indexed_freqs[i - 1].symbol) + WT_ERR_MSG(session, EINVAL, + "duplicate symbol %" PRIx32 + " specified in a huffman table", + indexed_freqs[i].symbol); + if (indexed_freqs[i].symbol > huffman->numSymbols) + WT_ERR_MSG(session, EINVAL, + "illegal symbol %" PRIx32 + " specified in a huffman table", + indexed_freqs[i].symbol); + } + + /* + * Massage frequencies. + */ + indexed_freqs = NULL; + WT_ERR(__wt_calloc_def(session, 256, &indexed_freqs)); + + /* + * Minimum of frequency==1 so everybody gets a Huffman code, in case + * data evolves and we need to represent this value. + */ + for (i = 0; i < 256; i++) { + sym = &indexed_freqs[i]; + sym->symbol = i; + sym->frequency = 1; + } + /* + * Avoid large tables by splitting UTF-16 frequencies into high byte + * and low byte. + */ + for (i = 0; i < symcnt; i++) { + sym = &((INDEXED_SYMBOL *)symbol_frequency_array)[i]; + indexed_freqs[sym->symbol & 0xff].frequency += sym->frequency; + if (numbytes == 2) + indexed_freqs[(sym->symbol >> 8) & 0xff].frequency += + sym->frequency; + } + huffman->numSymbols = symcnt = 256; + + /* + * The array must be sorted by frequency to be able to use a linear time + * construction algorithm. + */ + qsort((void *)indexed_freqs, + symcnt, sizeof(INDEXED_SYMBOL), indexed_freq_compare); + + /* We need two node queues to build the tree. */ + WT_ERR(__wt_calloc_def(session, 1, &leaves)); + WT_ERR(__wt_calloc_def(session, 1, &combined_nodes)); + + /* + * Adding the leaves to the queue. + * + * Discard symbols with a frequency of 0; this assumes these symbols + * never occur in the source stream, and the purpose is to reduce the + * huffman tree's size. + */ + for (i = 0; i < symcnt; ++i) + if (indexed_freqs[i].frequency > 0) { + WT_ERR(__wt_calloc_def(session, 1, &tempnode)); + tempnode->symbol = (uint8_t)indexed_freqs[i].symbol; + tempnode->weight = indexed_freqs[i].frequency; + WT_ERR(node_queue_enqueue(session, leaves, tempnode)); + tempnode = NULL; + } + + while (!node_queue_is_empty(leaves) || + !node_queue_is_empty(combined_nodes)) { + /* + * We have to get the node with the smaller weight, examining + * both queues' first element. We are collecting pairs of these + * items, by alternating between node and node2: + */ + refnode = !node ? &node : &node2; + + /* + * To decide which queue must be used, we get the weights of + * the first items from both: + */ + w1 = node_queue_is_empty(leaves) ? + UINT64_MAX : leaves->first->node->weight; + w2 = node_queue_is_empty(combined_nodes) ? + UINT64_MAX : combined_nodes->first->node->weight; + + /* + * Based on the two weights we finally can dequeue the smaller + * element and place it to the alternating target node pointer: + */ + if (w1 < w2) + node_queue_dequeue(session, leaves, refnode); + else + node_queue_dequeue(session, combined_nodes, refnode); + + /* + * In every second run, we have both node and node2 initialized. + */ + if (node != NULL && node2 != NULL) { + WT_ERR(__wt_calloc_def(session, 1, &tempnode)); + + /* The new weight is the sum of the two weights. */ + tempnode->weight = node->weight + node2->weight; + tempnode->left = node; + tempnode->right = node2; + + /* Enqueue it to the combined nodes queue */ + WT_ERR(node_queue_enqueue( + session, combined_nodes, tempnode)); + tempnode = NULL; + + /* Reset the state pointers */ + node = node2 = NULL; + } + } + + /* + * The remaining node is in the node variable, this is the root of the + * tree. Calculate how many bytes it takes to hold numSymbols bytes + * bits. + */ + huffman->max_depth = 0; + huffman->min_depth = MAX_CODE_LENGTH; + (void)profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth); + if (huffman->max_depth > MAX_CODE_LENGTH) + huffman->max_depth = MAX_CODE_LENGTH; + + WT_ERR(__wt_calloc_def(session, huffman->numSymbols, &huffman->codes)); + set_codes(node, huffman->codes, 0, 0); + + WT_ERR(__wt_calloc_def( + session, 1U << huffman->max_depth, &huffman->code2symbol)); + make_table(session, huffman->code2symbol, + huffman->max_depth, huffman->codes, huffman->numSymbols); + +#if __HUFFMAN_DETAIL + { + uint8_t symbol; + uint32_t weighted_length; + + printf("leaf depth %" PRIu16 "..%" PRIu16 ", memory use: " + "codes %u# * %uB + code2symbol %u# * %uB\n", + huffman->min_depth, huffman->max_depth, + huffman->numSymbols, (u_int)sizeof(WT_HUFFMAN_CODE), + 1U << huffman->max_depth, (u_int)sizeof(uint16_t)); + + /* + * measure quality of computed Huffman codes, for different max bit + * lengths (say, 16 vs 24 vs 32) + */ + weighted_length = 0; + for (i = 0; i < symcnt; i++) { + symbol = indexed_freqs[i].symbol; + weighted_length += + indexed_freqs[i].frequency * huffman->codes[symbol].length; + printf( + "\t%" PRIu16 "->%" PRIu16 ". %" PRIu32 " * %" PRIu8 "\n", + i, symbol, + indexed_freqs[i].frequency, huffman->codes[symbol].length); + } + printf("weighted length of all codes (the smaller the better): " + "%" PRIu32 "\n", weighted_length); + } +#endif + + *(void **)retp = huffman; + + if (0) { +err: if (ret == 0) + ret = WT_ERROR; + } + __wt_free(session, indexed_freqs); + if (leaves != NULL) + node_queue_close(session, leaves); + if (combined_nodes != NULL) + node_queue_close(session, combined_nodes); + if (node != NULL) + recursive_free_node(session, node); + if (node2 != NULL) + recursive_free_node(session, node2); + __wt_free(session, tempnode); + if (ret != 0) + __wt_huffman_close(session, huffman); + return (ret); +} + +/* + * __wt_huffman_close -- + * Discard a Huffman descriptor object. + */ +void +__wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg) +{ + WT_HUFFMAN_OBJ *huffman; + + huffman = huffman_arg; + + __wt_free(session, huffman->code2symbol); + __wt_free(session, huffman->codes); + __wt_free(session, huffman); +} + +#if __HUFFMAN_DETAIL +/* + * __wt_print_huffman_code -- + * Prints a symbol's Huffman code. + */ +int +__wt_print_huffman_code(void *huffman_arg, uint16_t symbol) +{ + WT_HUFFMAN_CODE code; + WT_HUFFMAN_OBJ *huffman; + + huffman = huffman_arg; + + if (symbol >= huffman->numSymbols) + printf("symbol %" PRIu16 " out of range\n", symbol); + else { + code = huffman->codes[symbol]; + if (code.length == 0) + printf( + "symbol %" PRIu16 " not defined -- 0 frequency\n", + symbol); + else + /* should print code as binary */ + printf( + "%" PRIu16 " -> code pattern " + "%" PRIx16 ", length %" PRIu8 "\n", + symbol, code.pattern, code.length); + } + + return (0); +} +#endif + +/* + * __wt_huffman_encode -- + * Take a byte string, encode it into the target. + * + * Translation from symbol to Huffman code is a simple array lookup. + * + * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per + * symbol. Then, given a symbol: + * pattern = codes[symbol].pattern; + * length = codes[symbol].length; + * + * To encode byte-string, we iterate over the input symbols. For each symbol, + * look it up via table, shift bits onto a shift register (an int long enough + * to hold the longest code word + up to 7 bits remaining from the previous), + * then drain out full bytes. Finally, at the end flush remaining bits + * and write header bits. + */ +int +__wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, + const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) +{ + WT_DECL_RET; + WT_HUFFMAN_CODE code; + WT_HUFFMAN_OBJ *huffman; + WT_ITEM *tmp; + size_t max_len, outlen, bytes; + uint64_t bitpos; + const uint8_t *from; + uint8_t len, *out, padding_info, symbol; + + /* + * Shift register to accumulate bits from input. + * Should be >= (MAX_CODE_LENGTH + 7), but also efficient to shift bits + * and preferably in a machine register. + */ + uint32_t bits; + + /* Count of bits in shift register ('bits' above). */ + uint8_t valid; + + huffman = huffman_arg; + from = from_arg; + tmp = NULL; + + /* + * We don't want to find all of our callers and ensure they don't pass + * 0-length byte strings, but there's no reason to do any work. + */ + if (from_len == 0) { + to_buf->size = 0; + return (0); + } + + /* + * Compute the largest compressed output size, which is if all symbols + * are least frequent and so have largest Huffman codes, and compressed + * output may be larger than the input size. This way we don't have to + * worry about resizing the buffer during compression. Use the shared + * system buffer while compressing, then allocate a new buffer of the + * right size and copy the result into it. + */ + max_len = (WT_HUFFMAN_HEADER + + from_len * huffman->max_depth + 7 /* round up to full byte */) / 8; + WT_ERR(__wt_scr_alloc(session, max_len, &tmp)); + + /* + * Leave the first 3 bits of the encoded value empty, it holds the + * number of bits actually used in the last byte of the encoded value. + */ + bits = 0; + bitpos = WT_HUFFMAN_HEADER; + valid = WT_HUFFMAN_HEADER; + out = tmp->mem; + for (bytes = 0; bytes < from_len; bytes++) { + WT_ASSERT(session, WT_PTR_IN_RANGE(from, from_arg, from_len)); + + symbol = *from++; + + /* Translate symbol into Huffman code and stuff into buffer. */ + code = huffman->codes[symbol]; + len = code.length; + bits = (bits << len) | code.pattern; + valid += len; + bitpos += len; + while (valid >= 8) { + WT_ASSERT(session, + WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize)); + *out++ = (uint8_t)(bits >> (valid - 8)); + valid -= 8; + } + } + if (valid > 0) { /* Flush shift register. */ + WT_ASSERT(session, + WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize)); + *out = (uint8_t)(bits << (8 - valid)); + } + + /* + * At this point, bitpos is the total number of used bits (including + * the 3 bits at the beginning of the buffer, which we'll set now to + * the number of bits used in the last byte). Note if the number of + * bits used in the last byte is 8, we set the 3 bits to 0, in other + * words, the first 3 bits of the encoded value are the number of bits + * used in the last byte, unless they're 0, in which case there are 8 + * bits used in the last byte. + */ + padding_info = (bitpos % 8) << (8 - WT_HUFFMAN_HEADER); + ((uint8_t *)tmp->mem)[0] |= padding_info; + + /* Copy result of exact known size into caller's buffer. */ + outlen = (uint32_t)((bitpos + 7) / 8); + WT_ERR(__wt_buf_initsize(session, to_buf, outlen)); + memcpy(to_buf->mem, tmp->mem, outlen); + +#if __HUFFMAN_DETAIL + printf("encode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n", + max_len, outlen); +#endif + +err: __wt_scr_free(&tmp); + return (ret); + +} + +/* + * __wt_huffman_decode -- + * Take a byte string, decode it into the target. + * + * Translation from Huffman code to symbol is a simple array lookup. + * + * WT_HUFFMAN_OBJ contains an array called 'code2symbol' indexed by code word + * and whose value is the corresponding symbol. + * From the symbol, we index into the 'codes' array to get the code length. + * + * When decoding a message, we don't know where the boundaries are between + * codes. The trick is that we collect enough bits for the longest code word, + * and construct the table such that for codes with fewer bits we flood the + * table with all of the bit patterns in the lower order bits. This works + * because the Huffman code is a unique prefix, and by the flooding we are + * treating bits beyond the unique prefix as don't care bits. + * + * For example, we have table of length 2^max_code_length (1<<max_code_length). + * For a code of length, max_code_length, the position code2symbol[code] = + * symbol. + * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol, + * as well as code2symbol[(code << 1) | 1] = symbol. + * And so on, so in general we fill: + * code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive]. + * + * To decode a message, we read in enough bits from input to fill the shift + * register with at least MAX_CODE_LENGTH bits. + * We look up in the table code2symbol to obtain the symbol. + * We look up the symbol in 'codes' to obtain the code length + * Finally, subtract off these bits from the shift register. + */ +int +__wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, + const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) +{ + WT_DECL_RET; + WT_ITEM *tmp; + WT_HUFFMAN_OBJ *huffman; + size_t from_bytes, len, max_len, outlen; + uint64_t from_len_bits; + uint32_t bits, mask, max; + uint16_t pattern; + const uint8_t *from; + uint8_t padding_info, symbol, *to, valid; + + huffman = huffman_arg; + from = from_arg; + tmp = NULL; + + /* + * We don't want to find all of our callers and ensure they don't pass + * 0-length byte strings, but there's no reason to do any work. + */ + if (from_len == 0) { + to_buf->size = 0; + return (0); + } + + /* + * The first 3 bits are the number of used bits in the last byte, unless + * they're 0, in which case there are 8 bits used in the last byte. + */ + padding_info = (*from & 0xE0) >> (8 - WT_HUFFMAN_HEADER); + from_len_bits = from_len * 8; + if (padding_info != 0) + from_len_bits -= 8U - padding_info; + + /* Number of bits that have codes. */ + from_len_bits -= WT_HUFFMAN_HEADER; + + /* + * Compute largest uncompressed output size, which is if all symbols are + * most frequent and so have smallest Huffman codes and therefore + * largest expansion. Use the shared system buffer while uncompressing, + * then allocate a new buffer of exactly the right size and copy the + * result into it. + */ + max_len = (uint32_t)(from_len_bits / huffman->min_depth); + WT_ERR(__wt_scr_alloc(session, max_len, &tmp)); + to = tmp->mem; + + /* The first byte of input is a special case because of header bits. */ + bits = *from++; + valid = 8 - WT_HUFFMAN_HEADER; + from_bytes = from_len - 1; + + max = huffman->max_depth; + mask = (1U << max) - 1; + for (outlen = 0; from_len_bits > 0; outlen++) { + while (valid < max && from_bytes > 0) { + WT_ASSERT(session, + WT_PTR_IN_RANGE(from, from_arg, from_len)); + bits = (bits << 8) | *from++; + valid += 8; + from_bytes--; + } + pattern = valid >= max ? /* short patterns near end */ + (bits >> (valid - max)) : (bits << (max - valid)); + symbol = huffman->code2symbol[pattern & mask]; + len = huffman->codes[symbol].length; + valid -= len; + WT_ASSERT(session, from_len_bits >= len); + from_len_bits -= len; + + WT_ASSERT(session, + WT_PTR_IN_RANGE(to, tmp->mem, tmp->memsize)); + *to++ = symbol; + } + + /* Return the number of bytes used. */ + WT_ERR(__wt_buf_initsize(session, to_buf, outlen)); + memcpy(to_buf->mem, tmp->mem, outlen); + +#if __HUFFMAN_DETAIL + printf("decode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n", + max_len, outlen); +#endif + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * node_queue_close -- + * Delete a queue from memory. + * + * It does not delete the pointed huffman tree nodes! + */ +static void +node_queue_close(WT_SESSION_IMPL *session, NODE_QUEUE *queue) +{ + NODE_QUEUE_ELEM *elem, *next_elem; + + /* Freeing each element of the queue's linked list. */ + for (elem = queue->first; elem != NULL; elem = next_elem) { + next_elem = elem->next; + __wt_free(session, elem); + } + + /* Freeing the queue record itself. */ + __wt_free(session, queue); +} + +/* + * node_queue_enqueue -- + * Push a tree node to the end of the queue. + */ +static int +node_queue_enqueue( + WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE *node) +{ + NODE_QUEUE_ELEM *elem; + + /* Allocating a new linked list element */ + WT_RET(__wt_calloc_def(session, 1, &elem)); + + /* It holds the tree node, and has no next element yet */ + elem->node = node; + elem->next = NULL; + + /* If the queue is empty, the first element will be the new one. */ + if (queue->first == NULL) + queue->first = elem; + + /* + * If the queue is not empty, the last element's next pointer must be + * updated. + */ + if (queue->last != NULL) + queue->last->next = elem; + + /* The last element is the new one */ + queue->last = elem; + + return (0); +} + +/* + * node_queue_dequeue -- + * Removes a node from the beginning of the queue and copies the node's + * pointer to the location referred by the retp parameter. + */ +static void +node_queue_dequeue( + WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE **retp) +{ + NODE_QUEUE_ELEM *first_elem; + + /* + * Getting the first element of the queue and updating it to point to + * the next element as first. + */ + first_elem = queue->first; + *retp = first_elem->node; + queue->first = first_elem->next; + + /* + * If the last element was the dequeued element, we have to update it + * to NULL. + */ + if (queue->last == first_elem) + queue->last = NULL; + + /* Freeing the linked list element that has been dequeued */ + __wt_free(session, first_elem); +} |