diff options
author | Keith Bostic <keith@wiredtiger.com> | 2012-09-21 07:17:13 +0000 |
---|---|---|
committer | Keith Bostic <keith@wiredtiger.com> | 2012-09-21 07:17:13 +0000 |
commit | 171084c8359dea415d0e14df9d0d64376a7072a6 (patch) | |
tree | cc688bda0eb9f3b0e9ceb8f748ef716b5dda755b | |
parent | 6234acd24828049279d4bff3f22223d4afb4bdfd (diff) | |
download | mongo-171084c8359dea415d0e14df9d0d64376a7072a6.tar.gz |
Support large row-store reconciliation dictionaries: add a skiplist
as the indexing mechanism.
-rw-r--r-- | dist/api_data.py | 6 | ||||
-rw-r--r-- | src/btree/rec_write.c | 245 | ||||
-rw-r--r-- | src/docs/file-formats.dox | 3 | ||||
-rw-r--r-- | src/include/wiredtiger.in | 4 |
4 files changed, 175 insertions, 83 deletions
diff --git a/dist/api_data.py b/dist/api_data.py index 143a031a7aa..ea443d82a63 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -119,9 +119,9 @@ file_config = format_meta + lsm_config + [ configure custom collation for keys. Value must be a collator name created with WT_CONNECTION::add_collator'''), Config('dictionary', '0', r''' - the number of entries maintained in the Btree row-store leaf - page value dictionary; see @ref file_formats_compression for - more information''', + the maximum number of unique values remembered in the Btree + row-store leaf page value dictionary; see + @ref file_formats_compression for more information''', min='0'), Config('huffman_key', '', r''' configure Huffman encoding for keys. Permitted values diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c index 3238bcecbcc..ab1a8258d31 100644 --- a/src/btree/rec_write.c +++ b/src/btree/rec_write.c @@ -147,12 +147,19 @@ typedef struct { * We optionally build a dictionary of row-store values for leaf * pages. Where two value cells are identical, only write the value * once, the second and subsequent copies point to the original cell. + * The dictionary is fixed size, but organized in a skip-list to make + * searches faster. */ struct __rec_dictionary { uint64_t hash; /* Hash value */ uint8_t *cell; /* Matching cell */ - } *dictionary; - long dictionary_slots; /* Maximum entries */ + + u_int depth; /* Skiplist */ + WT_DICTIONARY *next[0]; + } **dictionary; /* Dictionary */ + u_int dictionary_next, dictionary_slots; /* Next, max entries */ + /* Skiplist head. */ + WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH]; /* * WT_KV-- @@ -190,8 +197,6 @@ static int __rec_col_merge(WT_SESSION_IMPL *, WT_PAGE *); static int __rec_col_var(WT_SESSION_IMPL *, WT_PAGE *, WT_SALVAGE_COOKIE *); static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_SALVAGE_COOKIE *, WT_ITEM *, int, int, uint64_t); -static WT_DICTIONARY *__rec_dictionary_lookup(WT_SESSION_IMPL *, WT_KV *); -static void __rec_dictionary_reset(WT_RECONCILE *); static int __rec_page_deleted(WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, int *); static int __rec_page_modified(WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, int *); static int __rec_row_int(WT_SESSION_IMPL *, WT_PAGE *); @@ -211,6 +216,12 @@ static int __rec_write_init(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_PAGE *); static int __rec_write_wrapup_err(WT_SESSION_IMPL *, WT_PAGE *); +static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *); +static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *); +static WT_DICTIONARY * + __rec_dictionary_lookup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *); +static void __rec_dictionary_reset(WT_RECONCILE *); + /* * __rec_page_modified -- * Return if the given WT_REF references any modifications. @@ -573,15 +584,10 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) * Sanity check the size: 100 slots is the smallest dictionary * we use. */ - if (btree->dictionary < 100) - btree->dictionary = 100; - if (r->dictionary != NULL) { - r->dictionary_slots = 0; - __wt_free(session, r->dictionary); - } - WT_RET(__wt_calloc_def( - session, btree->dictionary, &r->dictionary)); - r->dictionary_slots = btree->dictionary; + r->dictionary_slots = + btree->dictionary < 100 ? 100 : btree->dictionary; + + WT_RET(__rec_dictionary_init(session, r)); } /* Per-page reconciliation: reset the dictionary. */ @@ -631,7 +637,7 @@ __wt_rec_destroy(WT_SESSION_IMPL *session) __wt_buf_free(session, &r->_cur); __wt_buf_free(session, &r->_last); - __wt_free(session, r->dictionary); + __rec_dictionary_free(session, r); __wt_free(session, session->reconcile); } @@ -712,7 +718,7 @@ __rec_dict_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv) * end of the buffer's memory. */ if (kv->buf.size > WT_INTPACK32_MAXSIZE && - (dp = __rec_dictionary_lookup(session, kv)) != NULL) { + (dp = __rec_dictionary_lookup(session, r, kv)) != NULL) { /* * If the dictionary cell reference is not set, we're * creating a new entry in the dictionary, update it. @@ -3884,33 +3890,141 @@ err: __wt_scr_free(&tmp); } /* + * The dictionary -- + * The rest of this file is support for dictionaries. + * + * It's difficult to write generic skiplist functions without turning a single + * memory allocation into two, or requiring a function call instead of a simple + * comparison. Fortunately, skiplists are relatively simple things and we can + * include them in-place. If you need generic skip-list functions to modify, + * this set wouldn't be a bad place to start. + * + * __rec_dictionary_skip_search -- + * Search a dictionary skiplist. + */ +static WT_DICTIONARY * +__rec_dictionary_skip_search(WT_DICTIONARY **head, uint64_t hash) +{ + WT_DICTIONARY **e; + int i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) + if (*e == NULL) { + --i; + --e; + } else { + if ((*e)->hash == hash) + return (*e); + if ((*e)->hash > hash) + return (NULL); + e = &(*e)->next[i]; + } + + /* NOTREACHED */ + return (NULL); +} + +/* + * __rec_dictionary_skip_search_stack -- + * Search a dictionary skiplist, returning an insert/remove stack. + */ +static void +__rec_dictionary_skip_search_stack( + WT_DICTIONARY **head, WT_DICTIONARY ***stack, uint64_t hash) +{ + WT_DICTIONARY **e; + int i; + + /* + * Start at the highest skip level, then go as far as possible at each + * level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) + if (*e == NULL || (*e)->hash >= hash) + stack[i--] = e--; + else + e = &(*e)->next[i]; +} + +/* + * __rec_dictionary_skip_insert -- + * Insert an entry into the dictionary skip-list. + */ +static void +__rec_dictionary_skip_insert( + WT_DICTIONARY **head, WT_DICTIONARY *e, uint64_t hash) +{ + WT_DICTIONARY **stack[WT_SKIP_MAXDEPTH]; + u_int i; + + /* Insert the new entry into the skiplist. */ + __rec_dictionary_skip_search_stack(head, stack, hash); + for (i = 0; i < e->depth; ++i) { + e->next[i] = *stack[i]; + *stack[i] = e; + } +} + +/* + * __rec_dictionary_init -- + * Allocate and initialize the dictionary. + */ +static int +__rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + u_int depth, i; + + /* Free any previous dictionary. */ + __rec_dictionary_free(session, r); + + WT_RET(__wt_calloc(session, + r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary)); + for (i = 0; i < r->dictionary_slots; ++i) { + depth = __wt_skip_choose_depth(); + WT_RET(__wt_calloc(session, 1, + sizeof(WT_DICTIONARY) + depth * sizeof(WT_DICTIONARY *), + &r->dictionary[i])); + r->dictionary[i]->depth = depth; + } + return (0); +} + +/* + * __rec_dictionary_free -- + * Free the dictionary. + */ +static void +__rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + u_int i; + + if (r->dictionary == NULL) + return; + + /* + * We don't correct dictionary_slots when we fail during allocation, + * but that's OK, the value is either NULL or a memory reference to + * be free'd. + */ + for (i = 0; i < r->dictionary_slots; ++i) + __wt_free(session, r->dictionary[i]); + __wt_free(session, r->dictionary); +} + +/* * __rec_dictionary_reset -- - * Clear the dictionary when reconciliation restarts and when crossing a + * Reset the dictionary when reconciliation restarts and when crossing a * page boundary (a potential split). */ static void __rec_dictionary_reset(WT_RECONCILE *r) { - /* - * Reset the dictionary when starting a new reconciliation or crossing - * a page split boundary. I'm concerned about the latter: if it's a - * dictionary with 10K slots and a small page size boundary, this is an - * expensive operation we might be calling a lot; if it's a dictionary - * with 100 slots and a large page size boundary it's a cheap operation - * we won't call often. The alternative is to clear the dictionary - * when starting a new reconciliation and don't clear it when crossing - * a split boundary. If we don't clear the dictionary when crossing a - * split boundary, then we have to add some way to detect references to - * cells in other chunks of the page. A simple way is to add the split - * boundary's start address into the WT_DICTIONARY entry and test it - * against the current boundary's start address when doing the lookup. - * That grows the WT_DICTIONARY size by a pointer, of course; it might - * be possible to test the referenced cell's address relationship to - * the current boundary's start address instead, but that's a little - * trickier if we actually split and the boundary moves around. - */ - memset(r->dictionary, - 0, (size_t)r->dictionary_slots * sizeof(r->dictionary[0])); + r->dictionary_next = 0; + memset(r->dictionary_head, 0, sizeof(r->dictionary_head)); } /* @@ -3918,7 +4032,7 @@ __rec_dictionary_reset(WT_RECONCILE *r) * Check to see if two cells (one in a WT_ITEM, and one laid out * in a buffer), match. */ -static inline int +static int __rec_dictionary_cell_match(uint8_t *p, WT_KV *val) { /* @@ -3938,27 +4052,21 @@ __rec_dictionary_cell_match(uint8_t *p, WT_KV *val) * Check the dictionary for a matching value on this page. */ static WT_DICTIONARY * -__rec_dictionary_lookup(WT_SESSION_IMPL *session, WT_KV *val) +__rec_dictionary_lookup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *val) { - WT_DICTIONARY *dp, *empty; - WT_RECONCILE *r; - long i; + WT_DICTIONARY *dp, *next; uint64_t hash; - r = session->reconcile; - + /* Search the dictionary, and return any match we find. */ hash = __wt_hash_fnv64(val->buf.data, val->buf.size); + for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash); + dp != NULL && dp->hash == hash; dp = dp->next[0]) + if (__rec_dictionary_cell_match(dp->cell, val)) { + WT_BSTAT_INCR(session, rec_dictionary); + return (dp); + } /* - * Walk the dictionary looking for a matching hash value. We're not - * sorting the dictionary or doing anything to make the lookup fast. - * That's fine for the small dictionaries I expect to see, but bad if - * we see big dictionaries with lots of unique page values. It's an - * obvious trade-off: this implementation only uses two pointers per - * dictionary slot and building the dictionary takes almost no work, - * vs. more memory and more work to build large dictionaries that we - * can still search quickly. - * * We're not doing value replacement in the dictionary. We stop adding * new entries if we run out of empty dictionary slots (but continue to * use the existing entries). I can't think of any reason a leaf page @@ -3967,30 +4075,17 @@ __rec_dictionary_lookup(WT_SESSION_IMPL *session, WT_KV *val) * case, it shouldn't be too difficult to maintain a pointer which is * the next dictionary slot to re-use. */ - empty = NULL; - for (dp = r->dictionary, i = r->dictionary_slots; i > 0; --i, ++dp) { - if (dp->cell == NULL) { - empty = dp; - break; - } - if (dp->hash != hash) - continue; - - /* If we find a match, replace our cell with a copy cell. */ - if (__rec_dictionary_cell_match(dp->cell, val)) { - WT_BSTAT_INCR(session, rec_dictionary); - return (dp); - } - } + if (r->dictionary_next >= r->dictionary_slots) + return (NULL); /* * Set the hash value, we'll add this entry into the dictionary when we - * write it into the page's buffer (because that's when we know where - * it will be written). - */ - if (empty != NULL) { - empty->cell = NULL; /* Not necessary, just cautious. */ - empty->hash = hash; - } - return (empty); + * write it into the page's disk image buffer (because that's when we + * know where on the page it will be written). + */ + next = r->dictionary[r->dictionary_next++]; + next->cell = NULL; /* Not necessary, just cautious. */ + next->hash = hash; + __rec_dictionary_skip_insert(r->dictionary_head, next, hash); + return (next); } diff --git a/src/docs/file-formats.dox b/src/docs/file-formats.dox index 8a35de4b782..294026b895b 100644 --- a/src/docs/file-formats.dox +++ b/src/docs/file-formats.dox @@ -50,9 +50,6 @@ tree and when writing pages to disk. in-memory and on-disk objects by storing any identical value only once per page. The cost is minor additional CPU and memory use when returning values from the in-memory tree and when writing pages to disk. -Note that dictionary compression in WiredTiger is intended for -relatively small dictionaries of unique values; configuring dictionaries -of tens of thousands of values might result in a higher CPU cost. - Huffman encoding reduces the size requirement of both the in-memory and on-disk objects by compressing individual key/value items, and can diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 367b52a55cd..de67ba595f2 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -608,8 +608,8 @@ struct __wt_session { * value_format. For colgroups and indices\, all column names must * appear in the list of columns for the table.,a list of strings; * default empty.} - * @config{dictionary, the number of entries maintained in the Btree - * row-store leaf page value dictionary; see @ref + * @config{dictionary, the maximum number of unique values remembered in + * the Btree row-store leaf page value dictionary; see @ref * file_formats_compression for more information.,an integer greater * than or equal to 0; default \c 0.} * @config{exclusive, fail if the object exists. When false (the |