Support large row-store reconciliation dictionaries: add a skiplist

as the indexing mechanism.
author: Keith Bostic <keith@wiredtiger.com> 2012-09-21 07:17:13 +0000
committer: Keith Bostic <keith@wiredtiger.com> 2012-09-21 07:17:13 +0000
commit: 171084c8359dea415d0e14df9d0d64376a7072a6 (patch)
tree: cc688bda0eb9f3b0e9ceb8f748ef716b5dda755b
parent: 6234acd24828049279d4bff3f22223d4afb4bdfd (diff)
download: mongo-171084c8359dea415d0e14df9d0d64376a7072a6.tar.gz
4 files changed, 175 insertions, 83 deletions
diff --git a/dist/api_data.py b/dist/api_data.py
index 143a031a7aa..ea443d82a63 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -119,9 +119,9 @@ file_config = format_meta + lsm_config + [
 		configure custom collation for keys.  Value must be a collator
 		name created with WT_CONNECTION::add_collator'''),
 	Config('dictionary', '0', r'''
-		the number of entries maintained in the Btree row-store leaf
-		page value dictionary; see @ref file_formats_compression for
-		more information''',
+		the maximum number of unique values remembered in the Btree
+		row-store leaf page value dictionary; see
+		@ref file_formats_compression for more information''',
 		min='0'),
 	Config('huffman_key', '', r'''
 		configure Huffman encoding for keys.  Permitted values
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index 3238bcecbcc..ab1a8258d31 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -147,12 +147,19 @@ typedef struct {
 	 *	We optionally build a dictionary of row-store values for leaf
 	 * pages.  Where two value cells are identical, only write the value
 	 * once, the second and subsequent copies point to the original cell.
+	 * The dictionary is fixed size, but organized in a skip-list to make
+	 * searches faster.
 	 */
 	struct __rec_dictionary {
 		uint64_t hash;				/* Hash value */
 		uint8_t *cell;				/* Matching cell */
-	} *dictionary;
-	long dictionary_slots;				/* Maximum entries */
+
+		u_int depth;				/* Skiplist */
+		WT_DICTIONARY *next[0];
+	} **dictionary;					/* Dictionary */
+	u_int dictionary_next, dictionary_slots;	/* Next, max entries */
+							/* Skiplist head. */
+	WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH];
 
 	/*
 	 * WT_KV--
@@ -190,8 +197,6 @@ static int  __rec_col_merge(WT_SESSION_IMPL *, WT_PAGE *);
 static int  __rec_col_var(WT_SESSION_IMPL *, WT_PAGE *, WT_SALVAGE_COOKIE *);
 static int  __rec_col_var_helper(WT_SESSION_IMPL *,
 		WT_SALVAGE_COOKIE *, WT_ITEM *, int, int, uint64_t);
-static WT_DICTIONARY *__rec_dictionary_lookup(WT_SESSION_IMPL *, WT_KV *);
-static void __rec_dictionary_reset(WT_RECONCILE *);
 static int  __rec_page_deleted(WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, int *);
 static int  __rec_page_modified(WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, int *);
 static int  __rec_row_int(WT_SESSION_IMPL *, WT_PAGE *);
@@ -211,6 +216,12 @@ static int  __rec_write_init(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
 static int  __rec_write_wrapup(WT_SESSION_IMPL *, WT_PAGE *);
 static int  __rec_write_wrapup_err(WT_SESSION_IMPL *, WT_PAGE *);
 
+static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *);
+static int  __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *);
+static WT_DICTIONARY *
+	    __rec_dictionary_lookup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *);
+static void __rec_dictionary_reset(WT_RECONCILE *);
+
 /*
  * __rec_page_modified --
  *	Return if the given WT_REF references any modifications.
@@ -573,15 +584,10 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
 		 * Sanity check the size: 100 slots is the smallest dictionary
 		 * we use.
 		 */
-		if (btree->dictionary < 100)
-			btree->dictionary = 100;
-		if (r->dictionary != NULL) {
-			r->dictionary_slots = 0;
-			__wt_free(session, r->dictionary);
-		}
-		WT_RET(__wt_calloc_def(
-		    session, btree->dictionary, &r->dictionary));
-		r->dictionary_slots = btree->dictionary;
+		r->dictionary_slots =
+		    btree->dictionary < 100 ? 100 : btree->dictionary;
+
+		WT_RET(__rec_dictionary_init(session, r));
 	}
 
 	/* Per-page reconciliation: reset the dictionary. */
@@ -631,7 +637,7 @@ __wt_rec_destroy(WT_SESSION_IMPL *session)
 	__wt_buf_free(session, &r->_cur);
 	__wt_buf_free(session, &r->_last);
 
-	__wt_free(session, r->dictionary);
+	__rec_dictionary_free(session, r);
 
 	__wt_free(session, session->reconcile);
 }
@@ -712,7 +718,7 @@ __rec_dict_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv)
 	 * end of the buffer's memory.
 	 */
 	if (kv->buf.size > WT_INTPACK32_MAXSIZE &&
-	    (dp = __rec_dictionary_lookup(session, kv)) != NULL) {
+	    (dp = __rec_dictionary_lookup(session, r, kv)) != NULL) {
 		/*
 		 * If the dictionary cell reference is not set, we're
 		 * creating a new entry in the dictionary, update it.
@@ -3884,33 +3890,141 @@ err:	__wt_scr_free(&tmp);
 }
 
 /*
+ * The dictionary --
+ *	The rest of this file is support for dictionaries.
+ *
+ * It's difficult to write generic skiplist functions without turning a single
+ * memory allocation into two, or requiring a function call instead of a simple
+ * comparison.  Fortunately, skiplists are relatively simple things and we can
+ * include them in-place.  If you need generic skip-list functions to modify,
+ * this set wouldn't be a bad place to start.
+ *
+ * __rec_dictionary_skip_search --
+ *	Search a dictionary skiplist.
+ */
+static WT_DICTIONARY *
+__rec_dictionary_skip_search(WT_DICTIONARY **head, uint64_t hash)
+{
+	WT_DICTIONARY **e;
+	int i;
+
+	/*
+	 * Start at the highest skip level, then go as far as possible at each
+	 * level before stepping down to the next.
+	 */
+	for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;)
+		if (*e == NULL) {
+			--i;
+			--e;
+		} else {
+			if ((*e)->hash == hash)
+				return (*e);
+			if ((*e)->hash > hash)
+				return (NULL);
+			e = &(*e)->next[i];
+		}
+
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * __rec_dictionary_skip_search_stack --
+ *	Search a dictionary skiplist, returning an insert/remove stack.
+ */
+static void
+__rec_dictionary_skip_search_stack(
+    WT_DICTIONARY **head, WT_DICTIONARY ***stack, uint64_t hash)
+{
+	WT_DICTIONARY **e;
+	int i;
+
+	/*
+	 * Start at the highest skip level, then go as far as possible at each
+	 * level before stepping down to the next.
+	 */
+	for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;)
+		if (*e == NULL || (*e)->hash >= hash)
+			stack[i--] = e--;
+		else
+			e = &(*e)->next[i];
+}
+
+/*
+ * __rec_dictionary_skip_insert --
+ *	Insert an entry into the dictionary skip-list.
+ */
+static void
+__rec_dictionary_skip_insert(
+    WT_DICTIONARY **head, WT_DICTIONARY *e, uint64_t hash)
+{
+	WT_DICTIONARY **stack[WT_SKIP_MAXDEPTH];
+	u_int i;
+
+	/* Insert the new entry into the skiplist. */
+	__rec_dictionary_skip_search_stack(head, stack, hash);
+	for (i = 0; i < e->depth; ++i) {
+		e->next[i] = *stack[i];
+		*stack[i] = e;
+	}
+}
+
+/*
+ * __rec_dictionary_init --
+ *	Allocate and initialize the dictionary.
+ */
+static int
+__rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+	u_int depth, i;
+
+	/* Free any previous dictionary. */
+	__rec_dictionary_free(session, r);
+
+	WT_RET(__wt_calloc(session,
+	    r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary));
+	for (i = 0; i < r->dictionary_slots; ++i) {
+		depth = __wt_skip_choose_depth();
+		WT_RET(__wt_calloc(session, 1,
+		    sizeof(WT_DICTIONARY) + depth * sizeof(WT_DICTIONARY *),
+		    &r->dictionary[i]));
+		r->dictionary[i]->depth = depth;
+	}
+	return (0);
+}
+
+/*
+ * __rec_dictionary_free --
+ *	Free the dictionary.
+ */
+static void
+__rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+	u_int i;
+
+	if (r->dictionary == NULL)
+		return;
+
+	/*
+	 * We don't correct dictionary_slots when we fail during allocation,
+	 * but that's OK, the value is either NULL or a memory reference to
+	 * be free'd.
+	 */
+	for (i = 0; i < r->dictionary_slots; ++i)
+		__wt_free(session, r->dictionary[i]);
+	__wt_free(session, r->dictionary);
+}
+
+/*
  * __rec_dictionary_reset --
- *	Clear the dictionary when reconciliation restarts and when crossing a
+ *	Reset the dictionary when reconciliation restarts and when crossing a
  * page boundary (a potential split).
  */
 static void
 __rec_dictionary_reset(WT_RECONCILE *r)
 {
-	/*
-	 * Reset the dictionary when starting a new reconciliation or crossing
-	 * a page split boundary.   I'm concerned about the latter: if it's a
-	 * dictionary with 10K slots and a small page size boundary, this is an
-	 * expensive operation we might be calling a lot; if it's a dictionary
-	 * with 100 slots and a large page size boundary it's a cheap operation
-	 * we won't call often.   The alternative is to clear the dictionary
-	 * when starting a new reconciliation and don't clear it when crossing
-	 * a split boundary.  If we don't clear the dictionary when crossing a
-	 * split boundary, then we have to add some way to detect references to
-	 * cells in other chunks of the page.  A simple way is to add the split
-	 * boundary's start address into the WT_DICTIONARY entry and test it
-	 * against the current boundary's start address when doing the lookup.
-	 * That grows the WT_DICTIONARY size by a pointer, of course; it might
-	 * be possible to test the referenced cell's address relationship to
-	 * the current boundary's start address instead, but that's a little
-	 * trickier if we actually split and the boundary moves around.
-	 */
-	memset(r->dictionary,
-	    0, (size_t)r->dictionary_slots * sizeof(r->dictionary[0]));
+	r->dictionary_next = 0;
+	memset(r->dictionary_head, 0, sizeof(r->dictionary_head));
 }
 
 /*
@@ -3918,7 +4032,7 @@ __rec_dictionary_reset(WT_RECONCILE *r)
  *	Check to see if two cells (one in a WT_ITEM, and one laid out
  * in a buffer), match.
  */
-static inline int
+static int
 __rec_dictionary_cell_match(uint8_t *p, WT_KV *val)
 {
 	/*
@@ -3938,27 +4052,21 @@ __rec_dictionary_cell_match(uint8_t *p, WT_KV *val)
  *	Check the dictionary for a matching value on this page.
  */
 static WT_DICTIONARY *
-__rec_dictionary_lookup(WT_SESSION_IMPL *session, WT_KV *val)
+__rec_dictionary_lookup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *val)
 {
-	WT_DICTIONARY *dp, *empty;
-	WT_RECONCILE *r;
-	long i;
+	WT_DICTIONARY *dp, *next;
 	uint64_t hash;
 
-	r = session->reconcile;
-
+	/* Search the dictionary, and return any match we find. */
 	hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
+	for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
+	     dp != NULL && dp->hash == hash; dp = dp->next[0])
+		if (__rec_dictionary_cell_match(dp->cell, val)) {
+			WT_BSTAT_INCR(session, rec_dictionary);
+			return (dp);
+		}
 
 	/*
-	 * Walk the dictionary looking for a matching hash value.  We're not
-	 * sorting the dictionary or doing anything to make the lookup fast.
-	 * That's fine for the small dictionaries I expect to see, but bad if
-	 * we see big dictionaries with lots of unique page values.  It's an
-	 * obvious trade-off: this implementation only uses two pointers per
-	 * dictionary slot and building the dictionary takes almost no work,
-	 * vs. more memory and more work to build large dictionaries that we
-	 * can still search quickly.
-	 *
 	 * We're not doing value replacement in the dictionary.  We stop adding
 	 * new entries if we run out of empty dictionary slots (but continue to
 	 * use the existing entries).  I can't think of any reason a leaf page
@@ -3967,30 +4075,17 @@ __rec_dictionary_lookup(WT_SESSION_IMPL *session, WT_KV *val)
 	 * case, it shouldn't be too difficult to maintain a pointer which is
 	 * the next dictionary slot to re-use.
 	 */
-	empty = NULL;
-	for (dp = r->dictionary, i = r->dictionary_slots; i > 0; --i, ++dp) {
-		if (dp->cell == NULL) {
-			empty = dp;
-			break;
-		}
-		if (dp->hash != hash)
-			continue;
-
-		/* If we find a match, replace our cell with a copy cell. */
-		if (__rec_dictionary_cell_match(dp->cell, val)) {
-			WT_BSTAT_INCR(session, rec_dictionary);
-			return (dp);
-		}
-	}
+	if (r->dictionary_next >= r->dictionary_slots)
+		return (NULL);
 
 	/*
 	 * Set the hash value, we'll add this entry into the dictionary when we
-	 * write it into the page's buffer (because that's when we know where
-	 * it will be written).
-	 */
-	if (empty != NULL) {
-		empty->cell = NULL;	/* Not necessary, just cautious. */
-		empty->hash = hash;
-	}
-	return (empty);
+	 * write it into the page's disk image buffer (because that's when we
+	 * know where on the page it will be written).
+	 */
+	next = r->dictionary[r->dictionary_next++];
+	next->cell = NULL;		/* Not necessary, just cautious. */
+	next->hash = hash;
+	__rec_dictionary_skip_insert(r->dictionary_head, next, hash);
+	return (next);
 }
diff --git a/src/docs/file-formats.dox b/src/docs/file-formats.dox
index 8a35de4b782..294026b895b 100644
--- a/src/docs/file-formats.dox
+++ b/src/docs/file-formats.dox
@@ -50,9 +50,6 @@ tree and when writing pages to disk.
 in-memory and on-disk objects by storing any identical value only once
 per page.  The cost is minor additional CPU and memory use when
 returning values from the in-memory tree and when writing pages to disk.
-Note that dictionary compression in WiredTiger is intended for
-relatively small dictionaries of unique values; configuring dictionaries
-of tens of thousands of values might result in a higher CPU cost.
 
 - Huffman encoding reduces the size requirement of both the in-memory
 and on-disk objects by compressing individual key/value items, and can
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 367b52a55cd..de67ba595f2 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -608,8 +608,8 @@ struct __wt_session {
 	 * value_format.  For colgroups and indices\, all column names must
 	 * appear in the list of columns for the table.,a list of strings;
 	 * default empty.}
-	 * @config{dictionary, the number of entries maintained in the Btree
-	 * row-store leaf page value dictionary; see @ref
+	 * @config{dictionary, the maximum number of unique values remembered in
+	 * the Btree row-store leaf page value dictionary; see @ref
 	 * file_formats_compression for more information.,an integer greater
 	 * than or equal to 0; default \c 0.}
 	 * @config{exclusive, fail if the object exists.  When false (the
author	Keith Bostic <keith@wiredtiger.com>	2012-09-21 07:17:13 +0000
committer	Keith Bostic <keith@wiredtiger.com>	2012-09-21 07:17:13 +0000
commit	171084c8359dea415d0e14df9d0d64376a7072a6 (patch)
tree	cc688bda0eb9f3b0e9ceb8f748ef716b5dda755b
parent	6234acd24828049279d4bff3f22223d4afb4bdfd (diff)
download	mongo-171084c8359dea415d0e14df9d0d64376a7072a6.tar.gz