Integrate Tom's new code drop into the tree.

author: Keith Bostic <keith.bostic@wiredtiger.com> 2011-05-19 14:57:16 -0400
committer: Keith Bostic <keith.bostic@wiredtiger.com> 2011-05-19 14:57:16 -0400
commit: b3e45e60b2db88b323409e8d05b4e89f928283b3 (patch)
tree: c6da7eb62af4382156e070e5d05570b0e095d59f /src/support/huffman.c
parent: 8aaf428e6e72eff48718296e4c6d255afff2927c (diff)
download: mongo-b3e45e60b2db88b323409e8d05b4e89f928283b3.tar.gz
1 files changed, 263 insertions, 193 deletions
diff --git a/src/support/huffman.c b/src/support/huffman.c
index c55fe8981eb..a7a5079c610 100644
--- a/src/support/huffman.c
+++ b/src/support/huffman.c
@@ -5,29 +5,22 @@
  *	All rights reserved.
  */
 
-#include "XXX.h"
-
-#define DETAIL
-/* "The assert() macro may be removed at compile time with the cc(1) option -DNDEBUG." */
-#include <assert.h>
+#include "wt_internal.h"
 
+#define	__HUFFMAN_DETAIL	0	/* Set to 1 for debugging output. */
 
 /* Length of header in compressed message, in bits. */
-#define	WT_HUFFMAN_HEADER 3
-
-
-/* Escape symbol. */
-static uint16_t escape;
-
+#define	WT_HUFFMAN_HEADER 	3
 
 /*
- * Maximum allowed length of Huffman code words, which otherwise can range up to (#symbols - 1) bits long.
- * Lower value to use less memory for tables, higher value for better compression.
- * Max value = 16 (or 32-7=25 or 64-7=57 if adjust data types).  FYI, JPEG uses 16.
- * A side effect of limiting max code length is that the worst case compression (a message of the least frequent symbols) is shorter.
+ * Maximum allowed length of Huffman code words, which otherwise can range up
+ * to (#symbols - 1) bits long.  Lower value to use less memory for tables,
+ * higher value for better compression.  Max value = 16 (or 32-7=25 or 64-7=57
+ * if adjust data types).  FYI, JPEG uses 16.  A side effect of limiting max
+ * code length is that the worst case compression (a message of the least
+ * frequent symbols) is shorter.
  */
-#define MAX_CODE_LENGTH 16
-
+#define	MAX_CODE_LENGTH		16
 
 typedef struct __wt_freqtree_node {
 	/*
@@ -42,7 +35,9 @@ typedef struct __wt_freqtree_node {
 } WT_FREQTREE_NODE;
 
 typedef struct __wt_huffman_code {
-	uint16_t pattern;	/* requirement: length of field's type in bits >= max_code_length */
+	uint16_t pattern;		/* requirement: length of field's type
+					 * in bits >= max_code_length.
+					 */
 	uint8_t length;
 } WT_HUFFMAN_CODE;
 
@@ -58,16 +53,24 @@ typedef struct __wt_huffman_obj {
 					/* Tree in static array reprentation */
 	uint16_t max_depth, min_depth;
 
+	uint16_t escape;		/* Escape symbol. */
+
 	/*
-	 * use: codes[symbol] = struct with pattern and length.  Used in encoding and decoding.
+	 * use: codes[symbol] = struct with pattern and length.
+	 * Used in encoding and decoding.
 	 * memory: codes[0-to-(number of symbols - 1)]
 	 */
 	WT_HUFFMAN_CODE *codes;
+
 	/*
-	 * use: code2symbol[Huffman_code] = symbol.  Used in decoding.
+	 * use: code2symbol[Huffman_code] = symbol.
+	 * Used in decoding.
 	 * memory: code2symbol[1 << max_code_length]
 	 */
-	uint16_t *code2symbol;	/* If <=256 symbols use uint8_t, if <=64K symbols use uint16_t, else uint32_t. */
+	uint16_t *code2symbol;		/* If <= 256 symbols use uint8_t, if
+					 * <= 64K symbols use uint16_t, else
+					 * uint32_t.
+					 */
 } WT_HUFFMAN_OBJ;
 
 /*
@@ -99,13 +102,18 @@ typedef struct node_queue {
 typedef struct __indexed_byte {
 	uint8_t frequency;
 	uint16_t symbol;
-} INDEXED_BYTE;
+} INDEXED_SYMBOL;
 
 static int  indexed_byte_comparator(const void *, const void *);
+static void make_table(
+	SESSION *, uint16_t *, uint16_t, WT_HUFFMAN_CODE *, u_int);
 static void node_queue_close(SESSION *, NODE_QUEUE *);
 static void node_queue_dequeue(SESSION *, NODE_QUEUE *, WT_FREQTREE_NODE **);
 static int  node_queue_enqueue(SESSION *, NODE_QUEUE *, WT_FREQTREE_NODE *);
+static uint32_t profile_tree(
+	WT_FREQTREE_NODE *, uint16_t, uint16_t *, uint16_t *);
 static void recursive_free_node(SESSION *, WT_FREQTREE_NODE *);
+static void set_codes(WT_FREQTREE_NODE *, WT_HUFFMAN_CODE *, uint16_t, uint8_t);
 
 #define	node_queue_is_empty(queue)					\
 	(((queue) == NULL || (queue)->first == NULL) ? 1 : 0)
@@ -117,57 +125,52 @@ static void recursive_free_node(SESSION *, WT_FREQTREE_NODE *);
 static int
 indexed_byte_comparator(const void *elem1, const void *elem2)
 {
-	return (((INDEXED_BYTE *)
-	    elem1)->frequency) - (((INDEXED_BYTE *)elem2)->frequency);
+	return (((INDEXED_SYMBOL *)
+	    elem1)->frequency) - (((INDEXED_SYMBOL *)elem2)->frequency);
 }
 
-
-#if 1
-/* Keith: take this out when you hook up your buffer */
-static int buflen = 0;
-static char* buf = NULL;
-
-/* return a buffer of at least min_len bytes */
-char* getbuf(int min_len) {
-  if (buflen < min_len) {
-	if (buf!=NULL) free(buf);
-	buf = malloc(min_len + 10);
-  }
-  return buf;
-}
-#endif
-
-
 /*
  * profile_tree --
- *	Traverses tree to determine #leaves under each node, max depth, min depth of leaf.
+ *	Traverses tree to determine #leaves under each node, max depth, min
+ *	depth of leaf.
  */
 static uint32_t
-profile_tree(WT_FREQTREE_NODE *node, uint16_t len, uint16_t *max_depth, uint16_t *min_depth)
+profile_tree(WT_FREQTREE_NODE *node,
+    uint16_t len, uint16_t *max_depth, uint16_t *min_depth)
 {
 	uint32_t leaf_cnt;
+
 	if (node->left == NULL && node->right == NULL) {	/* leaf */
 		leaf_cnt = 1;
-		if (*max_depth < len) *max_depth = len;
-		if (*min_depth > len) *min_depth = len;
-	} else {	/* internal node -- way tree constructed internal always has left and right children */
-		leaf_cnt = profile_tree(node->left, len+1, max_depth, min_depth) + profile_tree(node->right, len+1, max_depth, min_depth);
+		if (*max_depth < len)
+			*max_depth = len;
+		if (*min_depth > len)
+			*min_depth = len;
+	} else {
+		/*
+		 * internal node -- way tree constructed internal always has
+		 * left and right children
+		 */
+		leaf_cnt =
+		    profile_tree(node->left, len + 1, max_depth, min_depth) +
+		    profile_tree(node->right, len + 1, max_depth, min_depth);
 	}
-	node->weight = leaf_cnt;	/* abuse weight field */
-	return leaf_cnt;
+	node->weight = leaf_cnt;		/* abuse weight field */
+	return (leaf_cnt);
 }
 
-
 /*
  * set_codes --
  *	Computes Huffman code for each symbol in tree.
- *  Method is standard way in the literature, except that limits maximum code length.
- *  A known max code length is important for limiting memory use by the tables
- *  and for knowing how large data types need to be such as the field that holds the code pattern.
+ *
+ * Method is standard way in the literature, except that limits maximum code
+ * length.  A known max code length is important for limiting memory use by
+ * the tables and for knowing how large data types need to be such as the field
+ * that holds the code pattern.
  */
 static void
-set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes,
-		  uint16_t pattern, uint8_t len)
+set_codes(WT_FREQTREE_NODE *node,
+    WT_HUFFMAN_CODE *codes, uint16_t pattern, uint8_t len)
 {
 	WT_HUFFMAN_CODE *code;
 	uint16_t patternleft, patternright, half;
@@ -177,14 +180,25 @@ set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes,
 		code = &codes[node->symbol];
 		code->pattern = pattern;
 		code->length = len;
-		/*printf("%d: code %x, len %d\n", node->symbol, code, len);*/
+		/*
+		 * printf("%d: code %x, len %d\n", node->symbol, code, len);
+		 */
 	} else {
-		/* Check each subtree individually to see if can afford here to split up bits into possibly shorter codes,
-			or if need to employ all remaining bits up to MAX_CODE_LENGTH to consecutively number leaves. */
+		/*
+		 * Check each subtree individually to see if can afford here
+		 * to split up bits into possibly shorter codes, or if need
+		 * to employ all remaining bits up to MAX_CODE_LENGTH to
+		 * consecutively number leaves.
+		 */
 		remaining = MAX_CODE_LENGTH - len;
-		if (len < MAX_CODE_LENGTH	/* not alraedy in "low-bit mode"... */
-			&& ((half = 1 << (remaining - 1)) < node->left->weight || half < node->right->weight)) {	/* ...but need to be */
-			pattern = pattern << remaining;	/* open up lower-order bits for consecutive numbering */
+		/*
+		 * If not already in "low-bit mode", but need to be, open up
+		 * lower-order bits for consecutive numbering.
+		 */
+		if (len < MAX_CODE_LENGTH &&
+		    ((half = 1 << (remaining - 1)) < node->left->weight ||
+		    half < node->right->weight)) {
+			pattern = pattern << remaining;
 			len = MAX_CODE_LENGTH;
 		}
 
@@ -192,10 +206,10 @@ set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes,
 			patternleft =  (pattern << 1) | 0;
 			patternright = (pattern << 1) | 1;
 			len++;
-		} else {	/* "low bit mode" */
+		} else {			/* "low bit mode" */
 			patternleft =  pattern;
 			patternright = pattern + node->left->weight;
-			/* len unchanged */
+						/* len unchanged */
 		}
 
 		set_codes(node->left, codes, patternleft, len);
@@ -203,16 +217,15 @@ set_codes(WT_FREQTREE_NODE *node, WT_HUFFMAN_CODE *codes,
 	}
 }
 
-
 /*
  * make_table --
- *	Computes Huffman table used for subsequent lookups in encoding and decoding.
- *  With the table, encoding from a symbol to Huffman code 
- *  and decoding from a code to a symbol are simple array lookups.
+ *	Computes Huffman table used for subsequent lookups in encoding and
+ * decoding.  With the table, encoding from a symbol to Huffman code and
+ * decoding from a code to a symbol are simple array lookups.
  */
 static void
-make_table(uint16_t *code2symbol,
-    uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int nbytes)
+make_table(SESSION *session, uint16_t *code2symbol,
+    uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int symcnt)
 {
 	uint16_t c, c1, c2, i, j;
 	uint8_t len, shift;
@@ -221,7 +234,7 @@ make_table(uint16_t *code2symbol,
 	 * Here's the magic: flood all bit patterns for lower-order bits to
 	 * point to same symbol.
 	 */
-	for (i = 0; i < nbytes; i++) {
+	for (i = 0; i < symcnt; i++) {
 		if ((len = codes[i].length) == 0)
 			continue;
 
@@ -237,13 +250,12 @@ make_table(uint16_t *code2symbol,
 		c1 = c << shift;
 		c2 = (c + 1) << shift;
 		for (j = c1; j < c2; j++) {
-			assert (code2symbol[j] == 0);
+			WT_ASSERT(session, code2symbol[j] == 0);
 			code2symbol[j] = i;
 		}
 	}
 }
 
-
 /*
  * recursive_free_node --
  *	Recursively free the huffman frequency tree's nodes.
@@ -267,15 +279,14 @@ recursive_free_node(SESSION *session, WT_FREQTREE_NODE *node)
  * lowest rank is 1 (0 means the byte never appears in the input), so 1 byte
  * is needed to hold the rank and the input table must be 1 byte x 256 values.
  *
- *  For UTF-16 (nbytes == 2) the range is 0 - 65535 and the max rank is 65535.
+ *  For UTF-16 (symcnt == 2) the range is 0 - 65535 and the max rank is 65535.
  *  The table should be 2 bytes x 65536 values.
  */
 int
 __wt_huffman_open(SESSION *session,
-    uint8_t const *byte_frequency_array, u_int nbytes, void *retp)
+    uint8_t const *byte_frequency_array, u_int symcnt, void *retp)
 {
-	INDEXED_BYTE *indexed_freqs;
-	INDEXED_BYTE esc;
+	INDEXED_SYMBOL esc, *indexed_freqs;
 	NODE_QUEUE *combined_nodes, *leaves;
 	WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
 	WT_HUFFMAN_OBJ *huffman;
@@ -289,42 +300,51 @@ __wt_huffman_open(SESSION *session,
 	ret = 0;
 
 	WT_RET(__wt_calloc_def(session, 1, &huffman));
-	WT_ERR(__wt_calloc_def(session, (size_t)nbytes, &indexed_freqs));
+	WT_ERR(__wt_calloc_def(session, (size_t)symcnt, &indexed_freqs));
 	huffman->session = session;
 
 	/*
 	 * The frequency array must be sorted to be able to use linear time
 	 * construction algorithm.
 	 */
-	for (i = 0; i < nbytes; i++) {
+	for (i = 0; i < symcnt; i++) {
 		indexed_freqs[i].frequency = byte_frequency_array[i];
 		indexed_freqs[i].symbol = i;
 	}
 
 	/*
-	 * The escape symbol dynamically seizes the lowest-frequency symbol, which may have 0 frequency.
-	 * We use the escape code only for cases that would otherwise be errors, which are hopefully rare.
-	 * We choose the lowest-frequency to have minimal impact (close to no impact) on compression ratios,
-	 * but if there are lots of errors we pay the longest codeword length in addition to the raw bits of the symbol.
+	 * The escape symbol dynamically seizes the lowest-frequency symbol,
+	 * which may have 0 frequency.  We use the escape code only for cases
+	 * that would otherwise be errors, which are hopefully rare.  Choose
+	 * the lowest-frequency to have minimal impact (close to no impact) on
+	 * compression ratios, but if there are lots of errors we pay the
+	 * longest codeword length in addition to the raw bits of the symbol.
 	 */
 	esc = indexed_freqs[0];
-	for (i = 0+1; i < nbytes; i++)
-		if (indexed_freqs[i].frequency < esc.frequency) esc = indexed_freqs[i];
-	escape = esc.symbol;
-#ifdef DETAIL
-	printf("taking over symbol %d (freq %d) as escape\n", escape, esc.frequency);
+	for (i = 0+1; i < symcnt; i++)
+		if (indexed_freqs[i].frequency < esc.frequency)
+			esc = indexed_freqs[i];
+	huffman->escape = esc.symbol;
+#if __HUFFMAN_DETAIL
+	printf("taking over symbol %d (freq %d) as escape\n",
+	    huffman->escape, esc.frequency);
 #endif
-	if (esc.frequency == 0) esc.frequency++;	/* zero frequency left out of codes, so make sure this doesn't happen to the escape symbol */
+	/*
+	 * Zero frequency left out of codes, so make sure this doesn't happen
+	 * to the escape symbol.
+	 */
+	if (esc.frequency == 0)
+		esc.frequency++;
 		
 	qsort(indexed_freqs,
-	    nbytes, sizeof(INDEXED_BYTE), indexed_byte_comparator);
+	    symcnt, sizeof(INDEXED_SYMBOL), indexed_byte_comparator);
 
 	/* We need two node queues to build the tree. */
 	WT_ERR(__wt_calloc_def(session, 1, &leaves));
 	WT_ERR(__wt_calloc_def(session, 1, &combined_nodes));
 
 	/* Adding the leaves to the queue */
-	for (i = 0; i < nbytes; ++i) {
+	for (i = 0; i < symcnt; ++i) {
 		/*
 		 * We are leaving out symbols with a frequency of 0.  This
 		 * assumes these symbols will NEVER occur in the source stream,
@@ -394,35 +414,52 @@ __wt_huffman_open(SESSION *session,
 
 	/*
 	 * The remaining node is in the node variable, this is the root of the
-	 * tree.   Calculate the number of bytes it takes to hold nbytes bits.
+	 * tree.   Calculate how many bytes it takes to hold symcnt bytes bits.
 	 */
-	huffman->numSymbols = nbytes;
-	huffman->numBytes = nbytes > 256 ? 2 : 1;
+	huffman->numSymbols = symcnt;
+	huffman->numBytes = symcnt > 256 ? 2 : 1;
 
 	huffman->max_depth = 0; huffman->min_depth = MAX_CODE_LENGTH;
-	profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth);
-	if (huffman->max_depth > MAX_CODE_LENGTH) huffman->max_depth = MAX_CODE_LENGTH;
+	(void)profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth);
+	if (huffman->max_depth > MAX_CODE_LENGTH)
+		huffman->max_depth = MAX_CODE_LENGTH;
 
 	WT_ERR(__wt_calloc_def(session, huffman->numSymbols, &huffman->codes));
 	set_codes(node, huffman->codes, 0, 0);
 
-	WT_ERR(__wt_calloc_def(session, 1U << huffman->max_depth, &huffman->code2symbol));
-	make_table(huffman->code2symbol, huffman->max_depth, huffman->codes, huffman->numSymbols);
-	/*huffman->code2symbol = make_table(huffman->codes, nbytes, huffman->max_depth); => have to have access to "err:" label */
-#ifdef DETAIL
-	printf("leaf depth %d..%d, memory use: codes %d# * %u size  +  code2symbol %d# * %u size\n",
-		huffman->min_depth, huffman->max_depth,
-		nbytes, sizeof(WT_HUFFMAN_CODE),
-	    (1U << huffman->max_depth), sizeof(uint16_t));
-
-	/* measure quality of computed Huffman codes, for different max bit lengths (say, 16 vs 24 vs 32) */
-	uint32_t weighted_length = 0;
-	for (i = 0; i<nbytes; i++) {
-	  uint16_t sym = indexed_freqs[i].symbol;
-	  weighted_length += indexed_freqs[i].frequency * huffman->codes[sym].length;
-	  /*printf("\t%d->%d. %d * %d\n", i,sym, indexed_freqs[i].frequency, huffman->codes[sym].length);*/
+	WT_ERR(__wt_calloc_def(
+	    session, 1U << huffman->max_depth, &huffman->code2symbol));
+	make_table(session, huffman->code2symbol,
+	    huffman->max_depth, huffman->codes, huffman->numSymbols);
+
+#if __HUFFMAN_DETAIL
+	{
+	uint32_t weighted_length;
+	uint16_t sym;
+
+	printf("leaf depth %lu..%lu, memory use: "
+	    "codes %u# * %u size  +  code2symbol %lu# * %u size\n",
+	    (u_long)huffman->min_depth, (u_long)huffman->max_depth,
+	    symcnt, sizeof(WT_HUFFMAN_CODE),
+	    (u_long)(1U << huffman->max_depth), sizeof(uint16_t));
+
+	/*
+	 * measure quality of computed Huffman codes, for different max bit
+	 * lengths (say, 16 vs 24 vs 32)
+	 */
+	weighted_length = 0;
+	for (i = 0; i < symcnt; i++) {
+		sym = indexed_freqs[i].symbol;
+		weighted_length +=
+		    indexed_freqs[i].frequency * huffman->codes[sym].length;
+		/*
+		 * printf("\t%d->%d. %d * %d\n", i, sym,
+		 *    indexed_freqs[i].frequency, huffman->codes[sym].length);
+		 */
+	}
+	printf("weighted length of all codes (the smaller the better): %lu\n",
+	    (u_long)weighted_length);
 	}
-	printf("weighted length of all codes (the smaller the better): %d\n", weighted_length);
 #endif
 
 	*(void **)retp = huffman;
@@ -460,7 +497,7 @@ __wt_huffman_close(SESSION *session, void *huffman_arg)
 	__wt_free(session, huffman);
 }
 
-#ifdef HAVE_DIAGNOSTIC
+#if __HUFFMAN_DETAIL
 /*
  * __wt_print_huffman_code --
  *	Prints a symbol's Huffman code.
@@ -468,9 +505,9 @@ __wt_huffman_close(SESSION *session, void *huffman_arg)
 int
 __wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
 {
-	WT_HUFFMAN_OBJ *huffman;
 	WT_HUFFMAN_CODE code;
-	u_int i;
+	WT_HUFFMAN_OBJ *huffman;
+
 	huffman = huffman_arg;
 
 	if (symbol >= huffman->numSymbols) {
@@ -478,7 +515,8 @@ __wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
 	} else {
 		code = huffman->codes[symbol];
 		if (code.length == 0)
-			printf("symbol %d not defined -- 0 frequency\n", symbol);
+			printf(
+			    "symbol %d not defined -- 0 frequency\n", symbol);
 		else
 			/* should print code as binary */
 			printf("%d -> code pattern %x, length %d\n",
@@ -495,8 +533,8 @@ __wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
  * 
  * Translation from symbol to Huffman code is a simple array lookup.
  *
- * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per symbol.
- * Then, given a symbol:
+ * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per
+ * symbol.  Then, given a symbol:
  *	pattern = codes[symbol].pattern;
  *	length = codes[symbol].length;
  *
@@ -507,19 +545,22 @@ __wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
  * and write header bits.
  */
 int
-__wt_huffman_encode(void *huffman_arg,
-    const uint8_t *from, uint32_t from_len, WT_BUF *to_buf)
+__wt_huffman_encode(
+    void *huffman_arg, const uint8_t *from, uint32_t from_len, WT_BUF *to_buf)
 {
+	WT_BUF *tmp;
 	SESSION *session;
-	WT_HUFFMAN_OBJ *huffman;
 	WT_HUFFMAN_CODE code;
+	WT_HUFFMAN_OBJ *huffman;
 	uint32_t max_len, outlen, bitpos, bytes;
 	uint16_t symbol;
-	uint8_t len, esc, padding_info, *out, *buf;
+	uint8_t len, esc, padding_info, *out;
+	int ret;
+
 	/*
 	 * Shift register to accumulate bits from input.
-	 * Should be >= (MAX_CODE_LENGTH + 7),
-	 * but also efficient to shift bits and preferably in a machine register.
+	 * Should be >= (MAX_CODE_LENGTH + 7), but also efficient to shift bits
+	 * and preferably in a machine register.
 	 */
 	uint32_t bits;
 
@@ -528,6 +569,8 @@ __wt_huffman_encode(void *huffman_arg,
 
 	huffman = huffman_arg;
 	session = huffman->session;
+	tmp = NULL;
+	ret = 0;
 
 	/*
 	 * We don't want to find all of our callers and ensure they don't pass
@@ -539,16 +582,20 @@ __wt_huffman_encode(void *huffman_arg,
 	}
 
 	/*
-	 * Compute the largest compressed output size,
-	 * which is if all symbols are least frequent and so have largest Huffman codes,
-	 * and compressed output may be larger than the input size.
-	 * This way we don't have to worry about resizing the buffer during compression.
-	 * Use the shared system buffer while compressing,
-	 * then allocate a new buffer of exactly the right size and copy the result into it.
+	 * Compute the largest compressed output size, which is if all symbols
+	 * are least frequent and so have largest Huffman codes, and compressed
+	 * output may be larger than the input size.  This way we don't have to
+	 * worry about resizing the buffer during compression.  Use the shared
+	 * system buffer while compressing, then allocate a new buffer of the
+	 * right size and copy the result into it.
 	 */
-	max_len = (WT_HUFFMAN_HEADER + from_len * huffman->max_depth + 7/*round up to full byte*/) / 8;
-	max_len += from_len * huffman->numBytes;	/* if all symbols used escape code (ugh!) */
-	buf = (uint8_t *)getbuf(max_len);
+	max_len = (WT_HUFFMAN_HEADER +
+	    from_len * huffman->max_depth + 7 /* round up to full byte */) / 8;
+	/*
+	 * if all symbols used escape code (ugh!)
+	 */
+	max_len += from_len * huffman->numBytes;
+	WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
 
 	/*
 	 * Leave the first 3 bits of the encoded value empty, it holds the
@@ -557,31 +604,36 @@ __wt_huffman_encode(void *huffman_arg,
 	bits = 0;
 	bitpos = WT_HUFFMAN_HEADER;
 	valid = WT_HUFFMAN_HEADER;
-	out = buf;
+	out = tmp->mem;
 	esc = 0;
 	for (bytes = 0; bytes < from_len; bytes += huffman->numBytes) {
 		/* Getting the next symbol, either 1 or 2 bytes */
 		symbol = *from++;
-		if (huffman->numBytes == 2) symbol = (symbol << 8) | *from++;
+		if (huffman->numBytes == 2)
+			symbol = (symbol << 8) | *from++;
 
 		/*
-		 * Huffman expects to never see symbols zero frequency and out-of-band.
-		 * These may happen because the data changed since we computed the frequency tables.
-		 * We can still encode such illegal input symbols via an escape symbol: escape + raw next 8-or-16 bits.
-		 * (If escape symbol does appear, it's encoded the same way as "zero frequency".)
+		 * Huffman expects to never see zero frequency and out-of-band
+		 * symbols.  These may happen because the data changed since we
+		 * computed the frequency tables.  We can still encode such
+		 * illegal input symbols via an escape symbol: escape + raw
+		 * next 8-or-16 bits.  (If the escape symbol does appear, it's
+		 * encoded the same way as "zero frequency".)
 		 */
-		if (symbol >= huffman->numSymbols	/* bad input: out of band */
-			|| (code = huffman->codes[symbol]).length == 0	/* bad input: declared zero frequency, yet here it is */
-			|| symbol == escape	/* legitimate input symbol that was seized as escape symbol */
-			) {
-			code = huffman->codes[escape];
-#ifdef DETAIL
+		if (symbol >= huffman->numSymbols
+		    /* bad input: out of band */ ||
+		    (code = huffman->codes[symbol]).length == 0
+		    /* bad input: declared zero frequency, yet here it is */ ||
+		    symbol == huffman->escape
+		    /* legitimate input symbol seized as escape symbol */) {
+			code = huffman->codes[huffman->escape];
+#if __HUFFMAN_DETAIL
 			printf("encoding escape for %d\n", symbol);
 #endif
 			esc = 1;
 		}
 
-		/* translate symbol into Huffman code and stuff into buffer */
+		/* Translate symbol into Huffman code and stuff into buffer. */
 		len = code.length;
 		bits = (bits << len) | code.pattern;
 		valid += len;
@@ -616,18 +668,22 @@ __wt_huffman_encode(void *huffman_arg,
 	 * bits used in the last byte.
 	 */
 	padding_info = (bitpos % 8) << (8 - WT_HUFFMAN_HEADER);
-	*buf |= padding_info;
+	((uint8_t *)tmp->mem)[0] |= padding_info;
 
 	/* Copy result of exact known size into caller's buffer. */
 	outlen = (bitpos + 7) / 8;
-	WT_RET(__wt_buf_setsize(session, to_buf, outlen));
-	memcpy(to_buf->mem, buf, outlen);
-#ifdef DETAIL
-	printf("compress: worst case %d bytes -> actual %d\n", max_len, outlen);
-	printf("(vs old code %d, which was memory leak for common symbols and error for infrequent)\n", from_len+1);
+	WT_ERR(__wt_buf_setsize(session, to_buf, outlen));
+	memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+	printf("encode: worst case %lu bytes -> actual %lu\n",
+	    (u_long)max_len, (u_long)outlen);
 #endif
 
-	return (0);
+err:	if (tmp != NULL)
+		__wt_scr_release(&tmp);
+	return (ret);
+
 }
 
 /*
@@ -640,41 +696,44 @@ __wt_huffman_encode(void *huffman_arg,
  * and whose value is the corresponding symbol.
  * From the symbol, we index into the 'codes' array to get the code length.
  *
- * When decoding a message, we don't know where the boundaries are between codes.
- * The trick is that we collect enough bits for the longest code word, 
- * and construct the table such that for codes with fewer bits we flood the table
- *  with all of the bit patterns in the lower order bits.
- * This works because the Huffman code is a unique prefix, and 
- * by the flooding we are treating bits beyond the unique prefix as don't care bits.
+ * When decoding a message, we don't know where the boundaries are between
+ * codes.  The trick is that we collect enough bits for the longest code word, 
+ * and construct the table such that for codes with fewer bits we flood the
+ * table with all of the bit patterns in the lower order bits.  This works
+ * because the Huffman code is a unique prefix, and by the flooding we are
+ * treating bits beyond the unique prefix as don't care bits.
  *
- * For example, we have table is of length 2^max_code_length (1<<max_code_length).  
- * For a code of length, max_code_length, the position code2symbol[code] = symbol.
- * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol, as well 
- * as code2symbol[(code << 1) | 1] = symbol.
+ * For example, we have table of length 2^max_code_length (1<<max_code_length). 
+ * For a code of length, max_code_length, the position code2symbol[code] =
+ *	symbol.
+ * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol,
+ * as well as code2symbol[(code << 1) | 1] = symbol.
  * And so on, so in general we fill:
- * 		code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive].
+ * 	code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive].
  *
- * To decode a message, we read in enough bits from input to fill the shift register with at least 
- * MAX_CODE_LENGTH bits.
+ * To decode a message, we read in enough bits from input to fill the shift
+ * register with at least MAX_CODE_LENGTH bits.
  * We look up in the table code2symbol to obtain the symbol.
  * We look up the symbol in 'codes' to obtain the code length
  * Finally, subtract off these bits from the shift register.
  */
 int
-__wt_huffman_decode(void *huffman_arg,
-    const uint8_t *from, uint32_t from_len, WT_BUF *to_buf)
+__wt_huffman_decode(
+    void *huffman_arg, const uint8_t *from, uint32_t from_len, WT_BUF *to_buf)
 {
+	WT_BUF *tmp;
 	SESSION *session;
 	WT_HUFFMAN_OBJ *huffman;
-	uint32_t bits, from_bits, from_len_bits, len, max_len, outlen, max;
-	uint32_t mask, out_bits;
-	uint16_t pattern;
-	uint8_t valid;
-	uint16_t symbol;
-	uint8_t padding_info, *to, *buf;
+	uint32_t bits, from_bits, from_len_bits, len, mask, max, max_len;
+	uint32_t out_bits, outlen;
+	uint16_t pattern, symbol;
+	uint8_t padding_info, *to, valid;
+	int ret;
 
 	huffman = huffman_arg;
 	session = huffman->session;
+	tmp = NULL;
+	ret = 0;
 
 	/*
 	 * We don't want to find all of our callers and ensure they don't pass
@@ -695,38 +754,41 @@ __wt_huffman_decode(void *huffman_arg,
 		from_len_bits -= 8 - padding_info;
 
 	/*
-	 * Compute largest uncompressed output size,
-	 * which is if all symbols are most frequent and so have smallest Huffman codes and therefore largest expansion.
-	 * (If some symbols use the escape code then the output is even smaller.)
-	 * Use the shared system buffer while uncompressing,
-	 * then allocate a new buffer of exactly the right size and copy the result into it.
+	 * Compute largest uncompressed output size, which is if all symbols are
+	 * most frequent and so have smallest Huffman codes and therefore
+	 * largest expansion.  (If some symbols use the escape code then the
+	 * output is even smaller.)  Use the shared system buffer while
+	 * uncompressing, then allocate a new buffer of exactly the right size
+	 * and copy the result into it.
 	 */
 	max_len = (from_len_bits / huffman->min_depth) * huffman->numBytes;
-	buf = (uint8_t *)getbuf(max_len);
-	to = buf;
+	WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
+	to = tmp->mem;
 
 	bits = *from++;
 	valid = 8 - WT_HUFFMAN_HEADER;
 	from_bits = from_len_bits - 8;
 	out_bits = from_len_bits - WT_HUFFMAN_HEADER;
 	max = huffman->max_depth;
-	mask = (1 << max) - 1;
+	mask = (1U << max) - 1;
 	for (outlen = 0; out_bits > 0; outlen += huffman->numBytes) {
 		while (valid < max && from_bits > 0) {
 			bits = (bits << 8) | *from++;
 			valid += 8;
 			from_bits -= 8;
 		}
-		pattern = valid >= max ?		/* short patterns near end */
+		pattern = valid >= max ?	/* short patterns near end */
 		    (bits >> (valid - max)) : (bits << (max - valid));
 		symbol = huffman->code2symbol[pattern & mask];
 		len = huffman->codes[symbol].length;
 		valid -= len;
 		out_bits -= len;
 
-		if (symbol == escape) {
-#ifdef DETAIL
-			printf("decoding escape\n");
+		if (symbol == huffman->escape) {
+#if __HUFFMAN_DETAIL
+			/*
+			 * printf("decoding escape\n");
+			 */
 #endif
 			len = huffman->numBytes * 8;
 			while (valid < len) {
@@ -734,7 +796,12 @@ __wt_huffman_decode(void *huffman_arg,
 				valid += 8;
 				from_bits -= 8;
 			}
-			symbol = bits >> (valid - len);	/* different than usual case: extract symbol not pattern */
+
+			/*
+			 * different than usual case: extract symbol not
+			 * pattern
+			 */
+			symbol = bits >> (valid - len);
 			valid -= len;
 			out_bits -= len;
 		}
@@ -745,14 +812,17 @@ __wt_huffman_decode(void *huffman_arg,
 	}
 
 	/* Return the number of bytes used. */
-	WT_RET(__wt_buf_setsize(session, to_buf, outlen));
-	memcpy(to_buf->mem, buf, outlen);
-#ifdef DETAIL
-	printf("uncompress: worst case %d bytes -> actual %d\n", max_len, outlen);
-	printf("(vs old code %d, which was usually memory leak)\n", 2*from_len+1);
+	WT_ERR(__wt_buf_setsize(session, to_buf, outlen));
+	memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+	printf("decode: worst case %lu bytes -> actual %lu\n",
+	    (u_long)max_len, (u_long)outlen);
 #endif
 
-	return (0);
+err:	if (tmp != NULL)
+		__wt_scr_release(&tmp);
+	return (ret);
 }
 
 /*
@@ -786,7 +856,7 @@ node_queue_enqueue(SESSION *session, NODE_QUEUE *queue, WT_FREQTREE_NODE *node)
 	NODE_QUEUE_ELEM *elem;
 
 	/* Allocating a new linked list element */
-	WT_RET(__wt_calloc(session, 1, sizeof(NODE_QUEUE_ELEM), &elem));
+	WT_RET(__wt_calloc_def(session, 1, &elem));
 
 	/* It holds the tree node, and has no next element yet */
 	elem->node = node;
author	Keith Bostic <keith.bostic@wiredtiger.com>	2011-05-19 14:57:16 -0400
committer	Keith Bostic <keith.bostic@wiredtiger.com>	2011-05-19 14:57:16 -0400
commit	b3e45e60b2db88b323409e8d05b4e89f928283b3 (patch)
tree	c6da7eb62af4382156e070e5d05570b0e095d59f /src/support/huffman.c
parent	8aaf428e6e72eff48718296e4c6d255afff2927c (diff)
download	mongo-b3e45e60b2db88b323409e8d05b4e89f928283b3.tar.gz