diff options
-rw-r--r-- | dist/s_string.ok | 2 | ||||
-rw-r--r-- | src/btree/bt_debug.c | 29 | ||||
-rw-r--r-- | src/btree/bt_discard.c | 14 | ||||
-rw-r--r-- | src/btree/bt_page.c | 62 | ||||
-rw-r--r-- | src/btree/bt_ret.c | 2 | ||||
-rw-r--r-- | src/btree/bt_slvg.c | 22 | ||||
-rw-r--r-- | src/btree/bt_vrfy_dsk.c | 6 | ||||
-rw-r--r-- | src/btree/rec_write.c | 56 | ||||
-rw-r--r-- | src/btree/row_key.c | 198 | ||||
-rw-r--r-- | src/include/btmem.h | 11 | ||||
-rw-r--r-- | src/include/btree.i | 229 | ||||
-rw-r--r-- | src/include/cell.i | 35 | ||||
-rw-r--r-- | src/include/cursor.i | 117 |
13 files changed, 362 insertions, 421 deletions
diff --git a/dist/s_string.ok b/dist/s_string.ok index bb4ef607d49..fd98b3b2526 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -127,6 +127,7 @@ INTL INUSE ISSET ITEMs +Inline Ippokratis JPEG JSON @@ -571,6 +572,7 @@ init initn initsize inline +inlined inmem insertK insertV diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 9e3d87ae4fb..9415f4ad790 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -779,15 +779,17 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) { WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; - WT_IKEY *ikey; + WT_DECL_ITEM(key); + WT_DECL_RET; WT_INSERT_HEAD *insert; - WT_ITEM key; WT_ROW *rip; + WT_SESSION_IMPL *session; WT_UPDATE *upd; uint32_t i; - void *copy; + session = ds->session; unpack = &_unpack; + WT_RET(__wt_scr_alloc(session, 256, &key)); /* * Dump any K/V pairs inserted into the page before the first from-disk @@ -798,24 +800,14 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) /* Dump the page's K/V pairs. */ WT_ROW_FOREACH(page, rip, i) { - copy = WT_ROW_KEY_COPY(rip); - if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) { - __wt_row_leaf_direct(page, copy, &key); - __debug_item(ds, "K", key.data, key.size); - } else if (__wt_off_page(page, copy)) { - ikey = copy; - __debug_item(ds, "K", WT_IKEY_DATA(ikey), ikey->size); - } else { - __wt_cell_unpack(copy, unpack); - WT_RET(__debug_cell_data( - ds, page, WT_PAGE_ROW_LEAF, "K", unpack)); - } + WT_RET(__wt_row_leaf_key(session, page, rip, key, 0)); + __debug_item(ds, "K", key->data, key->size); - if ((cell = __wt_row_leaf_value(page, rip)) == NULL) + if ((cell = __wt_row_leaf_value(page, rip, NULL)) == NULL) __dmsg(ds, "\tV {}\n"); else { __wt_cell_unpack(cell, unpack); - WT_RET(__debug_cell_data( + WT_ERR(__debug_cell_data( ds, page, WT_PAGE_ROW_LEAF, "V", unpack)); } @@ -826,7 +818,8 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) __debug_row_skip(ds, insert); } - return (0); +err: __wt_scr_free(&key); + return (ret); } /* diff --git 
a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 03d954d9d76..01aa46a89df 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -290,6 +290,7 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) WT_IKEY *ikey; WT_ROW *rip; uint32_t i; + void *copy; /* * Free the in-memory index array. @@ -298,12 +299,13 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) * points somewhere other than the original page), and if so, free * the memory. */ - if (!F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) - WT_ROW_FOREACH(page, rip, i) { - ikey = WT_ROW_KEY_COPY(rip); - if (ikey != NULL && __wt_off_page(page, ikey)) - __wt_free(session, ikey); - } + WT_ROW_FOREACH(page, rip, i) { + copy = WT_ROW_KEY_COPY(rip); + (void)__wt_row_leaf_key_info( + page, copy, &ikey, NULL, NULL, NULL); + if (ikey != NULL) + __wt_free(session, ikey); + } /* * Free the insert array. diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index a9c77370df4..2a20c642872 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -11,9 +11,9 @@ static void __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *); static void __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *); static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *); static int __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *); -static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, int); +static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *); static int __inmem_row_leaf_entries( - WT_SESSION_IMPL *, const WT_PAGE_HEADER *, int *, uint32_t *); + WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *); /* * __wt_page_in_func -- @@ -231,20 +231,16 @@ int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep) { - WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; const WT_PAGE_HEADER *dsk; uint32_t alloc_entries; size_t size; - int direct_key; *pagep = NULL; - btree = S2BT(session); dsk = image; alloc_entries = 0; - direct_key = 0; /* * Figure out how 
many underlying objects the page references so we can @@ -275,18 +271,6 @@ __wt_page_inmem(WT_SESSION_IMPL *session, break; case WT_PAGE_ROW_LEAF: /* - * High-performance applications will turn off Huffman encoding - * and prefix-compression, and won't have overflow keys. In - * those cases, we'd like to reference the key on the leaf page - * from our row-store index instead of the cell, then we don't - * have to unpack the cell every time we look at a key. Assume - * the fast configuration is more likely (note it's the default - * configuration), and correct course if we're wrong. - */ - direct_key = - btree->huffman_key || btree->prefix_compression ? 0 : 1; - - /* * If the "no empty values" flag is set, row-store leaf page * entries map one-to-one to the number of physical entries * on the page (each physical entry is a key or value item). @@ -299,7 +283,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session, alloc_entries = dsk->u.entries / 2; else WT_RET(__inmem_row_leaf_entries( - session, dsk, &direct_key, &alloc_entries)); + session, dsk, &alloc_entries)); break; WT_ILLEGAL_VALUE(session); } @@ -330,7 +314,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_ERR(__inmem_row_int(session, page, &size)); break; case WT_PAGE_ROW_LEAF: - WT_ERR(__inmem_row_leaf(session, page, direct_key)); + WT_ERR(__inmem_row_leaf(session, page)); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -608,8 +592,8 @@ err: __wt_scr_free(&current); * Return the number of entries for row-store leaf pages. 
*/ static int -__inmem_row_leaf_entries(WT_SESSION_IMPL *session, - const WT_PAGE_HEADER *dsk, int *direct_keyp, uint32_t *nindxp) +__inmem_row_leaf_entries( + WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint32_t *nindxp) { WT_BTREE *btree; WT_CELL *cell; @@ -635,8 +619,6 @@ __inmem_row_leaf_entries(WT_SESSION_IMPL *session, __wt_cell_unpack(cell, unpack); switch (unpack->type) { case WT_CELL_KEY_OVFL: - *direct_keyp = 0; - /* FALLTHROUGH */ case WT_CELL_KEY: ++nindx; break; @@ -656,7 +638,7 @@ __inmem_row_leaf_entries(WT_SESSION_IMPL *session, * Build in-memory index for row-store leaf pages. */ static int -__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, int direct_key) +__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_CELL *cell; @@ -669,28 +651,25 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, int direct_key) dsk = page->dsk; unpack = &_unpack; -restart: /* Walk the page, building indices. */ rip = page->pg_row_d; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { __wt_cell_unpack(cell, unpack); switch (unpack->type) { case WT_CELL_KEY_OVFL: + __wt_row_leaf_key_set_cell(page, rip, cell); + ++rip; + break; + case WT_CELL_KEY: /* - * If we've been preparing a fast-path to instantiating - * leaf page keys, we have a problem, overflow keys make - * that impossible. Restart without direct-key set. + * Simple keys without compression (not Huffman encoded + * or prefix compressed), can be directly referenced on + * the page to avoid repeatedly unpacking their cells. 
*/ - if (direct_key) { - direct_key = 0; - goto restart; - } - /* FALLTHROUGH */ - case WT_CELL_KEY: - if (direct_key) - __wt_row_leaf_key_onpage_set(page, rip, unpack); + if (!btree->huffman_key && unpack->prefix == 0) + __wt_row_leaf_key_set(page, rip, unpack); else - __wt_row_leaf_key_onpage_set_cell(rip, cell); + __wt_row_leaf_key_set_cell(page, rip, cell); ++rip; break; case WT_CELL_VALUE: @@ -701,13 +680,6 @@ restart: } /* - * Set the direct access flag if we read the page's keys and found no - * problems. - */ - if (direct_key) - F_SET_ATOMIC(page, WT_PAGE_DIRECT_KEY); - - /* * We do not currently instantiate keys on leaf pages when the page is * loaded, they're instantiated on demand. */ diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index d7631a972ca..19fff653d3d 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -93,7 +93,7 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) * Take the value from the original page cell (which may be * empty). */ - if ((cell = __wt_row_leaf_value(page, rip)) == NULL) { + if ((cell = __wt_row_leaf_value(page, rip, NULL)) == NULL) { cursor->value.size = 0; return (0); } diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 7d38ee83875..5f898a9151d 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1891,7 +1891,8 @@ err: WT_TRET(__wt_page_release(session, ref)); /* * __slvg_row_merge_ovfl -- - * Free file blocks referenced from keys discarded from merged pages. + * Free file blocks referenced from key/value pairs discarded from merged + * pages. 
*/ static int __slvg_row_merge_ovfl(WT_SESSION_IMPL *session, @@ -1900,20 +1901,17 @@ __slvg_row_merge_ovfl(WT_SESSION_IMPL *session, WT_BM *bm; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; - WT_IKEY *ikey; WT_ROW *rip; + void *copy; bm = S2BT(session)->bm; unpack = &_unpack; for (rip = page->pg_row_d + start; start < stop; ++start) { - if (!F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) { - ikey = WT_ROW_KEY_COPY(rip); - if (__wt_off_page(page, ikey)) - cell = - WT_PAGE_REF_OFFSET(page, ikey->cell_offset); - else - cell = (WT_CELL *)ikey; + copy = WT_ROW_KEY_COPY(rip); + (void)__wt_row_leaf_key_info( + page, copy, NULL, &cell, NULL, NULL); + if (cell != NULL) { __wt_cell_unpack(cell, unpack); if (unpack->type == WT_CELL_KEY_OVFL) { WT_RET(__wt_verbose(session, WT_VERB_SALVAGE, @@ -1924,12 +1922,12 @@ __slvg_row_merge_ovfl(WT_SESSION_IMPL *session, __wt_addr_string(session, unpack->data, unpack->size, trk->ss->tmp2))); - WT_RET(bm->free( - bm, session, unpack->data, unpack->size)); + WT_RET(bm->free( + bm, session, unpack->data, unpack->size)); } } - if ((cell = __wt_row_leaf_value(page, rip)) == NULL) + if ((cell = __wt_row_leaf_value(page, rip, NULL)) == NULL) continue; __wt_cell_unpack(cell, unpack); if (unpack->type == WT_CELL_VALUE_OVFL) { diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c index 93a872754e4..6c4d28a03ce 100644 --- a/src/btree/bt_vrfy_dsk.c +++ b/src/btree/bt_vrfy_dsk.c @@ -200,7 +200,7 @@ __verify_dsk_row( ++cell_num; /* Carefully unpack the cell. */ - if (__wt_cell_unpack_safe(NULL, cell, unpack, end) != 0) { + if (__wt_cell_unpack_safe(cell, unpack, end) != 0) { ret = __err_cell_corrupted(session, cell_num, addr); goto err; } @@ -467,7 +467,7 @@ __verify_dsk_col_int( ++cell_num; /* Carefully unpack the cell. */ - if (__wt_cell_unpack_safe(NULL, cell, unpack, end) != 0) + if (__wt_cell_unpack_safe(cell, unpack, end) != 0) return (__err_cell_corrupted(session, cell_num, addr)); /* Check the raw and collapsed cell types. 
*/ @@ -534,7 +534,7 @@ __verify_dsk_col_var( ++cell_num; /* Carefully unpack the cell. */ - if (__wt_cell_unpack_safe(NULL, cell, unpack, end) != 0) + if (__wt_cell_unpack_safe(cell, unpack, end) != 0) return (__err_cell_corrupted(session, cell_num, addr)); /* Check the raw and collapsed cell types. */ diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c index 417f71d4f72..db90e2119f9 100644 --- a/src/btree/rec_write.c +++ b/src/btree/rec_write.c @@ -4046,28 +4046,21 @@ __rec_row_leaf(WT_SESSION_IMPL *session, } /* - * Set the WT_IKEY reference (if the key was instantiated), and - * the key cell reference, unpack the key cell. + * Figure out the key: set any cell reference (and unpack it), + * set any instantiated key reference. */ copy = WT_ROW_KEY_COPY(rip); - if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) { - ikey = NULL; - cell = NULL; + (void)__wt_row_leaf_key_info( + page, copy, &ikey, &cell, NULL, NULL); + if (cell == NULL) kpack = NULL; - } else if (__wt_off_page(page, copy)) { - ikey = copy; - cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset); - kpack = &_kpack; - __wt_cell_unpack(cell, kpack); - } else { - ikey = NULL; - cell = (WT_CELL *)copy; + else { kpack = &_kpack; __wt_cell_unpack(cell, kpack); } /* Unpack the on-page value cell, and look for an update. */ - if ((val_cell = __wt_row_leaf_value(page, rip)) == NULL) + if ((val_cell = __wt_row_leaf_value(page, rip, NULL)) == NULL) vpack = NULL; else { vpack = &_vpack; @@ -4256,22 +4249,17 @@ __rec_row_leaf(WT_SESSION_IMPL *session, r->ovfl_items = 1; } else { /* - * Use a direct-key from the page, or - * Use an already instantiated key, or - * Use the key from the disk image, or - * Build a key from a previous key, or - * Instantiate the key from scratch. + * Get the key from the page or an instantiated key, or + * inline building the key from a previous key (it's a + * fast path for simple, prefix-compressed keys), or + * by building the key from scratch. 
*/ - if (kpack == NULL) - __wt_row_leaf_direct(page, copy, tmpkey); - else if (ikey != NULL) { - tmpkey->data = WT_IKEY_DATA(ikey); - tmpkey->size = ikey->size; - } else if (btree->huffman_key == NULL && - kpack->type == WT_CELL_KEY && kpack->prefix == 0) { - tmpkey->data = kpack->data; - tmpkey->size = kpack->size; - } else if (btree->huffman_key == NULL && + if (__wt_row_leaf_key_info(page, copy, + NULL, &cell, &tmpkey->data, &tmpkey->size)) + goto build; + + __wt_cell_unpack(cell, kpack); + if (btree->huffman_key == NULL && kpack->type == WT_CELL_KEY && tmpkey->size >= kpack->prefix) { /* @@ -4283,10 +4271,10 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_ASSERT(session, tmpkey->size != 0); /* - * Grow the buffer as necessary as well as - * ensure data has been copied into local buffer - * space, then append the suffix to the prefix - * already in the buffer. + * Grow the buffer as necessary, ensuring data + * has been copied into local buffer space, + * then append the suffix to the prefix already + * in the buffer. * * Don't grow the buffer unnecessarily or copy * data we don't need, truncate the item's data @@ -4301,7 +4289,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, } else WT_ERR(__wt_row_leaf_key_copy( session, page, rip, tmpkey)); - +build: WT_ERR(__rec_cell_build_leaf_key(session, r, tmpkey->data, tmpkey->size, &ovfl_key)); } diff --git a/src/btree/row_key.c b/src/btree/row_key.c index e82140d509a..a816747b9cb 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -25,8 +25,7 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) btree = S2BT(session); - if (page->pg_row_entries == 0 || /* Just checking... */ - F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) { + if (page->pg_row_entries == 0) { /* Just checking... 
*/ F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS); return (0); } @@ -132,6 +131,7 @@ __wt_row_leaf_key_work(WT_SESSION_IMPL *session, { enum { FORWARD, BACKWARD } direction; WT_BTREE *btree; + WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_DECL_ITEM(tmp); WT_DECL_RET; @@ -140,7 +140,7 @@ __wt_row_leaf_key_work(WT_SESSION_IMPL *session, size_t size; u_int last_prefix; int jump_slot_offset, slot_offset; - void *key; + void *copy; const void *p; /* @@ -149,7 +149,6 @@ __wt_row_leaf_key_work(WT_SESSION_IMPL *session, * front-end, __wt_row_leaf_key, be careful if you're calling this code * directly. */ - WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)); btree = S2BT(session); unpack = &_unpack; @@ -161,51 +160,68 @@ __wt_row_leaf_key_work(WT_SESSION_IMPL *session, direction = BACKWARD; for (slot_offset = 0;;) { -jump_slot: key = WT_ROW_KEY_COPY(rip); + if (0) { +switch_and_jump: /* Switching to a forward roll. */ + WT_ASSERT(session, direction == BACKWARD); + direction = FORWARD; + + /* Skip list of keys with compatible prefixes. */ + rip = jump_rip; + slot_offset = jump_slot_offset; + } + copy = WT_ROW_KEY_COPY(rip); /* - * Key copied. - * - * If another thread instantiated the key while we were doing - * that, we don't have any work to do. Figure this out using - * the key's value: - * - * If the key points off-page, another thread updated the key, - * we can just use it. - * - * If the key points on-page, we have a copy of a WT_CELL value - * that can be processed, regardless of what any other thread is - * doing. - * - * Overflow keys are not prefix-compressed, we don't want to - * read/write them during reconciliation simply because their - * prefix might change. That means we can't use instantiated - * overflow keys to figure out the prefix for other keys, - * specifically, in this code when we're looking for a key we - * can roll-forward to figure out the target key's prefix, - * instantiated overflow keys aren't useful. 
- * - * 1: the test for an on/off page reference. + * Figure out what the key looks like. */ - if (__wt_off_page(page, key)) { -off_page: ikey = key; + (void)__wt_row_leaf_key_info( + page, copy, &ikey, &cell, &p, &size); + + /* 1: the test for a directly referenced on-page key. */ + if (cell == NULL) { + keyb->data = p; + keyb->size = size; + + /* + * If this is the key we originally wanted, we don't + * care if we're rolling forward or backward, or if + * it's an overflow key or not, it's what we wanted. + * This shouldn't normally happen, the fast-path code + * that front-ends this function will have figured it + * out before we were called. + * + * The key doesn't need to be instantiated, skip past + * that test. + */ + if (slot_offset == 0) + goto done; + + /* + * This key is not an overflow key by definition and + * isn't compressed in any way, we can use it to roll + * forward. + * If rolling backward, switch directions. + * If rolling forward: there's a bug somewhere, + * we should have hit this key when rolling backward. + */ + goto switch_and_jump; + } + /* 2: the test for an instantiated off-page key. */ + if (ikey != NULL) { /* * If this is the key we originally wanted, we don't * care if we're rolling forward or backward, or if * it's an overflow key or not, it's what we wanted. * Take a copy and wrap up. + * + * The key doesn't need to be instantiated, skip past + * that test. */ if (slot_offset == 0) { - keyb->data = WT_IKEY_DATA(ikey); - keyb->size = ikey->size; - - /* - * The key is already instantiated, ignore the - * caller's suggestion. - */ - instantiate = 0; - break; + keyb->data = p; + keyb->size = size; + goto done; } /* @@ -218,8 +234,7 @@ off_page: ikey = key; * done because prefixes skip overflow keys: keep * rolling forward. 
*/ - if (__wt_cell_type(WT_PAGE_REF_OFFSET( - page, ikey->cell_offset)) == WT_CELL_KEY_OVFL) + if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) goto next; /* @@ -233,16 +248,18 @@ off_page: ikey = key; * In short: if it's not an overflow key, take a copy * and roll forward. */ - keyb->data = WT_IKEY_DATA(ikey); - keyb->size = ikey->size; + keyb->data = p; + keyb->size = size; direction = FORWARD; goto next; } - /* Unpack the key's cell. */ - __wt_cell_unpack(key, unpack); + /* + * It must be an on-page cell, unpack it. + */ + __wt_cell_unpack(cell, unpack); - /* 2: the test for an on-page reference to an overflow key. */ + /* 3: the test for an on-page reference to an overflow key. */ if (unpack->type == WT_CELL_KEY_OVFL) { /* * If this is the key we wanted from the start, we don't @@ -260,25 +277,23 @@ off_page: ikey = key; * the tracking cache. */ if (slot_offset == 0) { - WT_ERR(__wt_readlock( - session, btree->ovfl_lock)); - key = WT_ROW_KEY_COPY(rip); - if (__wt_off_page(page, key)) { - WT_ERR(__wt_rwunlock( - session, btree->ovfl_lock)); - goto off_page; + WT_ERR( + __wt_readlock(session, btree->ovfl_lock)); + copy = WT_ROW_KEY_COPY(rip); + if (!__wt_row_leaf_key_info(page, copy, + NULL, &cell, &keyb->data, &keyb->size)) { + __wt_cell_unpack(cell, unpack); + ret = __wt_dsk_cell_data_ref(session, + WT_PAGE_ROW_LEAF, unpack, keyb); } - ret = __wt_dsk_cell_data_ref( - session, WT_PAGE_ROW_LEAF, unpack, keyb); - WT_TRET(__wt_rwunlock( - session, btree->ovfl_lock)); + WT_TRET( + __wt_rwunlock(session, btree->ovfl_lock)); WT_ERR(ret); break; } /* - * If we wanted a different key and this key is an - * overflow key: + * If we wanted a different key: * If we're rolling backward, this key is useless * to us because it doesn't have a valid prefix: keep * rolling backward. @@ -290,11 +305,19 @@ off_page: ikey = key; } /* - * 3: the test for an on-page reference to a key that isn't + * 4: the test for an on-page reference to a key that isn't * prefix compressed. 
*/ if (unpack->prefix == 0) { /* + * The only reason to be here is a Huffman encoded key, + * a non-encoded key with no prefix compression should + * have been directly referenced, and we should not have + * needed to unpack its cell. + */ + WT_ASSERT(session, btree->huffman_key != NULL); + + /* * If this is the key we originally wanted, we don't * care if we're rolling forward or backward, it's * what we want. Take a copy and wrap up. @@ -306,47 +329,19 @@ off_page: ikey = key; * If rolling forward there's a bug, we should have * found this key while rolling backwards and switched * directions then. + * + * The key doesn't need to be instantiated, skip past + * that test. */ - if (btree->huffman_key == NULL) { - keyb->data = unpack->data; - keyb->size = unpack->size; - } else - WT_ERR(__wt_dsk_cell_data_ref( - session, WT_PAGE_ROW_LEAF, unpack, keyb)); - - if (slot_offset == 0) { - /* - * If we have an uncompressed, on-page key with - * no prefix, don't bother instantiating it, - * regardless of what our caller thought. The - * memory cost is greater than the performance - * cost of finding the key each time we need it. - */ - if (btree->huffman_key == NULL) - instantiate = 0; - break; - } - - WT_ASSERT(session, direction == BACKWARD); - direction = FORWARD; - - /* - * Switching to the forward roll; skip over any list of - * keys with compatible prefixes. - */ - rip = jump_rip; - slot_offset = jump_slot_offset; - - /* - * I'm using an explicit branch instead of a continue, - * it needs to be obvious that new code at the top of - * this loop is problematical. - */ - goto jump_slot; + WT_ERR(__wt_dsk_cell_data_ref( + session, WT_PAGE_ROW_LEAF, unpack, keyb)); + if (slot_offset == 0) + goto done; + goto switch_and_jump; } /* - * 4: an on-page reference to a key that's prefix compressed. + * 5: an on-page reference to a key that's prefix compressed. * If rolling backward, keep looking for something we can * use. 
* If rolling forward, build the full key and keep rolling @@ -436,10 +431,12 @@ next: switch (direction) { * that half of the page). */ if (instantiate) { - key = WT_ROW_KEY_COPY(rip_arg); - if (!__wt_off_page(page, key)) { + copy = WT_ROW_KEY_COPY(rip_arg); + (void)__wt_row_leaf_key_info( + page, copy, &ikey, &cell, NULL, NULL); + if (ikey == NULL) { WT_ERR(__wt_row_ikey(session, - WT_PAGE_DISK_OFFSET(page, key), + WT_PAGE_DISK_OFFSET(page, cell), keyb->data, keyb->size, &ikey)); /* @@ -447,7 +444,7 @@ next: switch (direction) { * update the page's memory footprint, on failure, free * the allocated memory. */ - if (WT_ATOMIC_CAS(WT_ROW_KEY_COPY(rip), key, ikey)) + if (WT_ATOMIC_CAS(WT_ROW_KEY_COPY(rip), copy, ikey)) __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + ikey->size); else @@ -455,6 +452,7 @@ next: switch (direction) { } } +done: err: __wt_scr_free(&tmp); return (ret); } diff --git a/src/include/btmem.h b/src/include/btmem.h index d9986e38d98..4056ec9ed08 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -531,12 +531,11 @@ struct __wt_page { uint8_t type; /* Page type */ #define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */ -#define WT_PAGE_DIRECT_KEY 0x02 /* Row-store leaf keys direct access */ -#define WT_PAGE_DISK_ALLOC 0x04 /* Disk image in allocated memory */ -#define WT_PAGE_DISK_MAPPED 0x08 /* Disk image in mapped memory */ -#define WT_PAGE_EVICT_LRU 0x10 /* Page is on the LRU queue */ -#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */ -#define WT_PAGE_SPLITTING 0x40 /* An internal page is growing. */ +#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ +#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ +#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ +#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ +#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing. 
*/ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ }; diff --git a/src/include/btree.i b/src/include/btree.i index 688324f5e45..0f81103862f 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -486,7 +486,7 @@ __wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack) /* * See the comment in __wt_ref_key for an explanation of the magic. */ - v = (uint64_t)unpack->size << 32 | + v = (uintptr_t)unpack->size << 32 | (uint32_t)WT_PAGE_DISK_OFFSET(page, unpack->data) << 1 | 0x01; ref->key.ikey = (void *)v; @@ -520,67 +520,124 @@ __wt_ref_key_clear(WT_REF *ref) } /* - * __wt_row_leaf_direct -- - * Return an encoded row-store leaf page key. + * __wt_row_leaf_key_info -- + * Return a row-store leaf page key referenced by a WT_ROW if it can be + * had without unpacking a cell, and information about the cell, if the key + * isn't cheaply available. */ -static inline void -__wt_row_leaf_direct(WT_PAGE *page, void *ripkey, WT_ITEM *key) +static inline int +__wt_row_leaf_key_info(WT_PAGE *page, void *copy, + WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep) { + WT_IKEY *ikey; uintptr_t v; + v = (uintptr_t)copy; + /* * A row-store leaf page key is in one of two places: if instantiated, * the WT_ROW pointer references a WT_IKEY structure, otherwise, it * references an on-page location. However, on-page keys are in one of - * two states and the reference is in one of two forms: if a row-store - * doesn't configure prefix compression or Huffman encoding, and there - * were no overflow keys found when reading the page into memory (all - * of which is likely, those are the default configurations), the key's - * location and size was encoded in the pointer and a page flag was set. - * If we found overflow keys or one of those features is configured, the - * reference is to the key's on-page cell, which we'll unpack (we're - * trying to avoid that cell unpack per key read in the fast path). 
- * The test is if the page flag is set, we're done, it's an encoding; - * otherwise, if the pointer is off-page it's an instantiated key, else - * an on-page cell. + * two states: if the key is a simple key (not an overflow key, prefix + * compressed or Huffman encoded, all of which are likely), the key's + * offset/size is encoded in the pointer. Otherwise, the offset is to + * the key's on-page cell. + * + * Now the magic: Any allocated memory will have a low-order bit of 0 + * (the return from malloc must be aligned to store any standard type, + * and we assume there's always going to be a standard type requiring + * even-byte alignment). An on-page key consists of an offset/length + * pair. We can fit the maximum page size into 31 bits, so we use the + * low-order bit in the on-page value to flag the next 31 bits as a + * page offset and the other 32 bits as the key's length, not a WT_IKEY + * pointer. This breaks if allocation chunks aren't even-byte aligned. * - * This function cracks an encoded key and returns a real pointer. The - * encoding magic is simpler than internal page key encoding because we - * are using the page's flag rather than per-key information to decide - * if the key is encoded. The key's page offset is the bottom 4B, and - * the key size is the top 4B. + * To distinguish between an on-page key and an on-page cell, we set + * the size to 0 in the case of an on-page cell. + * + * Perform the tests in the order we think most probable, this call is + * all about speed. + * + * This function returns a list of things about the key (instantiation + * reference, cell reference, unpacked cell, and key/length pair). Our + * callers sometimes want some things, and sometimes others, we fill in + * the information we have based on the arguments we're passed; since + * this is an inlined function, we're depending on the compiler to drop + * code we don't need. 
*/ - v = (uintptr_t)ripkey; - key->data = WT_PAGE_REF_OFFSET(page, (v & 0xFFFFFFFF)); - key->size = v >> 32; + + /* On-page key: no instantiated key, no cell. */ + if (v & 0x01 && (v & 0xFFFFFFFF00000000) != 0) { + if (cellp != NULL) + *cellp = NULL; + if (ikeyp != NULL) + *ikeyp = NULL; + if (datap != NULL) { + *(void **)datap = + WT_PAGE_REF_OFFSET(page, (v & 0xFFFFFFFF) >> 1); + *sizep = v >> 32; + return (1); + } + return (0); + } + + /* On-page cell: no instantiated key. */ + if (v & 0x01) { + if (ikeyp != NULL) + *ikeyp = NULL; + if (cellp != NULL) + *cellp = + WT_PAGE_REF_OFFSET(page, (v & 0xFFFFFFFF) >> 1); + return (0); + } + + /* Instantiated key. */ + ikey = copy; + if (ikeyp != NULL) + *ikeyp = copy; + if (cellp != NULL) + *cellp = WT_PAGE_REF_OFFSET(page, ikey->cell_offset); + if (datap != NULL) { + *(void **)datap = WT_IKEY_DATA(ikey); + *sizep = ikey->size; + return (1); + } + return (0); } /* - * __wt_row_leaf_key_onpage_set -- - * Set a WT_ROW to reference an on-page key. + * __wt_row_leaf_key_set -- + * Set a WT_ROW to reference an on-page row-store leaf key. */ static inline void -__wt_row_leaf_key_onpage_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) +__wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) { uintptr_t v; /* - * See the comment in __wt_row_leaf_direct for an explanation of the + * See the comment in __wt_row_leaf_key_info for an explanation of the * magic. */ v = (uintptr_t)unpack->size << 32 | - (uint32_t)WT_PAGE_DISK_OFFSET(page, unpack->data); + (uint32_t)WT_PAGE_DISK_OFFSET(page, unpack->data) << 1 | 0x01; WT_ROW_KEY_SET(rip, v); } /* - * __wt_row_leaf_key_onpage_set_cell -- - * Set a WT_ROW to reference an on-page key's cell. + * __wt_row_leaf_key_set_cell -- + * Set a WT_ROW to reference an on-page row-store leaf cell. 
*/ static inline void -__wt_row_leaf_key_onpage_set_cell(WT_ROW *rip, WT_CELL *cell) +__wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell) { - WT_ROW_KEY_SET(rip, cell); + uintptr_t v; + + /* + * See the comment in __wt_row_leaf_key_info for an explanation of the + * magic. + */ + v = (uintptr_t)WT_PAGE_DISK_OFFSET(page, cell) << 1 | 0x01; + WT_ROW_KEY_SET(rip, v); } /* @@ -592,13 +649,8 @@ static inline int __wt_row_leaf_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, int instantiate) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; - WT_IKEY *ikey; void *copy; - btree = S2BT(session); - /* * A front-end for __wt_row_leaf_key_work, here to inline fast paths. * @@ -606,36 +658,19 @@ __wt_row_leaf_key(WT_SESSION_IMPL *session, */ copy = WT_ROW_KEY_COPY(rip); - /* First, check for an encoded key. */ - if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) { - __wt_row_leaf_direct(page, copy, key); - return (0); - } - - /* Second, check for an instantiated key. */ - if (__wt_off_page(page, copy)) { - ikey = copy; - key->data = WT_IKEY_DATA(ikey); - key->size = ikey->size; - return (0); - } - /* - * Third, if the key isn't compressed or an overflow, unpack the cell - * and take it from the page. + * All we handle here are on-page keys (which should be a common case), + * and instantiated keys (which start out rare, but become more common + * as a leaf page is searched, instantiating prefix-compressed keys). */ - if (btree->huffman_key == NULL) { - __wt_cell_unpack(copy, &unpack); - if (unpack.type == WT_CELL_KEY && unpack.prefix == 0) { - key->data = unpack.data; - key->size = unpack.size; - return (0); - } - } + if (__wt_row_leaf_key_info( + page, copy, NULL, NULL, &key->data, &key->size)) + return (0); /* - * We have to build the key (it's never been instantiated, and it's some - * kind of compressed or overflow key). + * The alternative is an on-page cell with some kind of compressed or + * overflow key that's never been instantiated. 
Call the underlying + * worker function to figure it out. */ return (__wt_row_leaf_key_work(session, page, rip, key, instantiate)); } @@ -673,48 +708,42 @@ __wt_cursor_row_leaf_key(WT_CURSOR_BTREE *cbt, WT_ITEM *key) * NULL if there isn't one. */ static inline WT_CELL * -__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip) +__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack) { - WT_CELL *cell; + WT_CELL *kcell, *vcell; WT_CELL_UNPACK unpack; - WT_IKEY *ikey; - void *copy; - uintptr_t v; + void *copy, *key; + size_t size; - /* - * The row-store key can change underfoot; explicitly take a copy. - */ - copy = WT_ROW_KEY_COPY(rip); + /* If we already have an unpacked key cell, use it. */ + if (kpack != NULL) + vcell = (WT_CELL *) + ((uint8_t *)kpack->cell + __wt_cell_total_len(kpack)); + else { + /* + * The row-store key can change underfoot; explicitly take a + * copy. + */ + copy = WT_ROW_KEY_COPY(rip); - /* - * See the comment in __wt_row_leaf_direct for an explanation of the - * magic; we know where the key is, step past it to the value's cell. - */ - if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) { - v = (uintptr_t)copy; - cell = (WT_CELL *) - ((uint8_t *)WT_PAGE_REF_OFFSET(page, (v & 0xFFFFFFFF)) + - (v >> 32)); - return (__wt_cell_leaf_value_parse(page, cell)); + /* + * Figure out where the key is, step past it to the value cell. + * The test for a cell not being set tells us that we have an + * on-page key, otherwise we're looking at an instantiated key + * or on-page cell, both of which require an unpack of the key's + * cell to find the value cell that follows. + */ + if (__wt_row_leaf_key_info( + page, copy, NULL, &kcell, &key, &size) && kcell == NULL) + vcell = (WT_CELL *)((uint8_t *)key + size); + else { + __wt_cell_unpack(kcell, &unpack); + vcell = (WT_CELL *)((uint8_t *) + unpack.cell + __wt_cell_total_len(&unpack)); + } } - /* - * Cell now either references a WT_IKEY structure with a cell offset, or - * references the on-page key WT_CELL. 
Both can be processed no matter - * what other threads are doing. If it's the former, use it to get the - * latter. - */ - if (__wt_off_page(page, copy)) { - ikey = copy; - cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset); - } else - cell = copy; - - /* Unpack the key cell, then return its associated value cell. */ - __wt_cell_unpack(cell, &unpack); - cell = (WT_CELL *)((uint8_t *)cell + __wt_cell_total_len(&unpack)); - - return (__wt_cell_leaf_value_parse(page, cell)); + return (__wt_cell_leaf_value_parse(page, vcell)); } /* diff --git a/src/include/cell.i b/src/include/cell.i index 071206c4b49..f5303644bef 100644 --- a/src/include/cell.i +++ b/src/include/cell.i @@ -555,8 +555,7 @@ __wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell) * Unpack a WT_CELL into a structure during verification. */ static inline int -__wt_cell_unpack_safe( - WT_PAGE *page, WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) +__wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) { uint64_t saved_v, v; uint32_t saved_len; @@ -706,20 +705,6 @@ done: WT_CELL_LEN_CHK(cell, unpack->__len); unpack->v = saved_v; } - /* - * If we just unpacked a key cell for an in-memory page, set the value - * field to the next cell, interpreting it as a value cell, so cursors - * can return a key/value pair without unpacking the key cell multiple - * times. - * - * !!! - * This function is only called with a non-NULL page when unpacking a - * row-store leaf page key, which is why we don't check further. - */ - if (page != NULL) { - cell = (WT_CELL *)((uint8_t *)cell + unpack->__len); - unpack->value = __wt_cell_leaf_value_parse(page, cell); - } return (0); } @@ -730,23 +715,7 @@ done: WT_CELL_LEN_CHK(cell, unpack->__len); static inline void __wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack) { - (void)__wt_cell_unpack_safe(NULL, cell, unpack, NULL); -} - -/* - * __wt_cell_unpack_with_value -- - * Unpack a WT_CELL into a structure, and check for an associated value. 
- */ -static inline void -__wt_cell_unpack_with_value( - WT_PAGE *page, WT_CELL *cell, WT_CELL_UNPACK *unpack) -{ - /* - * This routine exists so we don't have pass in a NULL page reference - * whenever we're unpacking cells from disk images (rather than from - * in-memory pages). - */ - (void)__wt_cell_unpack_safe(page, cell, unpack, NULL); + (void)__wt_cell_unpack_safe(cell, unpack, NULL); } /* diff --git a/src/include/cursor.i b/src/include/cursor.i index e3bb225cbcb..e3834fb95ee 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -178,7 +178,6 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd) WT_ITEM *kb, *vb; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; - WT_IKEY *ikey; WT_PAGE *page; WT_SESSION_IMPL *session; int key_unpacked; @@ -200,83 +199,75 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd) copy = WT_ROW_KEY_COPY(rip); /* - * Get a reference to the key, ideally without doing a copy: we could - * call __wt_row_leaf_key, but if a cursor is running through the tree, - * we actually have more information here than that function has, we - * may have the prefix-compressed key that comes immediately before the - * one we want. + * Get a key: we could just call __wt_row_leaf_key, but as a cursor + * is running through the tree, we may have additional information + * here (we may have the fully-built key that's immediately before + * the prefix-compressed key one we want). * - * If the key can be accessed directly, or has been instantiated (the - * key points off-page), we don't have any work to do. - * - * If the key points on-page, we have a copy of a WT_CELL value that can - * be processed, regardless of what any other thread is doing. + * First, check for an immediately available key. 
*/ - if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) - __wt_row_leaf_direct(page, copy, kb); - else if (__wt_off_page(page, copy)) { - ikey = copy; - kb->data = WT_IKEY_DATA(ikey); - kb->size = ikey->size; + if (__wt_row_leaf_key_info( + page, copy, NULL, &cell, &kb->data, &kb->size)) + goto value; + + /* Huffman encoded keys are a slow path in all cases. */ + if (btree->huffman_key != NULL) + goto slow; + + /* + * Unpack the cell and deal with overflow and prefix-compressed keys. + * Inline building simple prefix-compressed keys from a previous key, + * otherwise build from scratch. + */ + __wt_cell_unpack(cell, unpack); + key_unpacked = 1; + if (unpack->type == WT_CELL_KEY && + cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) { + WT_ASSERT(session, cbt->tmp.size >= unpack->prefix); + + /* + * Grow the buffer as necessary as well as ensure data has been + * copied into local buffer space, then append the suffix to the + * prefix already in the buffer. + * + * Don't grow the buffer unnecessarily or copy data we don't + * need, truncate the item's data length to the prefix bytes. + */ + cbt->tmp.size = unpack->prefix; + WT_RET(__wt_buf_grow( + session, &cbt->tmp, cbt->tmp.size + unpack->size)); + memcpy((uint8_t *)cbt->tmp.data + cbt->tmp.size, + unpack->data, unpack->size); + cbt->tmp.size += unpack->size; } else { /* - * If the key is simple and on-page and not prefix-compressed, - * or we have the previous expanded key in the cursor buffer, - * reference or build it. Else, call __wt_row_leaf_key_work to - * do it the hard way. + * Call __wt_row_leaf_key_work instead of __wt_row_leaf_key: we + * already did __wt_row_leaf_key's fast-path checks inline. 
*/ - if (btree->huffman_key != NULL) - goto slow; - __wt_cell_unpack_with_value(page, copy, unpack); - key_unpacked = 1; - if (unpack->type == WT_CELL_KEY && unpack->prefix == 0) { - cbt->tmp.data = unpack->data; - cbt->tmp.size = unpack->size; - } else if (unpack->type == WT_CELL_KEY && - cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) { - WT_ASSERT(session, cbt->tmp.size >= unpack->prefix); - - /* - * Grow the buffer as necessary as well as ensure data - * has been copied into local buffer space, then append - * the suffix to the prefix already in the buffer. - * - * Don't grow the buffer unnecessarily or copy data we - * don't need, truncate the item's data length to the - * prefix bytes. - */ - cbt->tmp.size = unpack->prefix; - WT_RET(__wt_buf_grow( - session, &cbt->tmp, cbt->tmp.size + unpack->size)); - memcpy((uint8_t *)cbt->tmp.data + cbt->tmp.size, - unpack->data, unpack->size); - cbt->tmp.size += unpack->size; - } else { - /* - * __wt_row_leaf_key_work instead of __wt_row_leaf_key: - * we do __wt_row_leaf_key's fast-path checks inline. - */ -slow: WT_RET(__wt_row_leaf_key_work( - session, page, rip, &cbt->tmp, 0)); - } - kb->data = cbt->tmp.data; - kb->size = cbt->tmp.size; - cbt->rip_saved = rip; +slow: WT_RET( + __wt_row_leaf_key_work(session, page, rip, &cbt->tmp, 0)); } + kb->data = cbt->tmp.data; + kb->size = cbt->tmp.size; + cbt->rip_saved = rip; +value: /* - * If the item was ever modified, use the WT_UPDATE data. Note that - * the caller passes us the update: it has already resolved which one + * If the item was ever modified, use the WT_UPDATE data. Note the + * caller passes us the update: it has already resolved which one * (if any) is visible. - * Else, check for empty data. - * Else, use the value from the original disk image. */ if (upd != NULL) { vb->data = WT_UPDATE_DATA(upd); vb->size = upd->size; return (0); } - cell = key_unpacked ? 
unpack->value : __wt_row_leaf_value(page, rip); + + /* + * Else, find the value cell and check for empty data. + * Else, use the value from the original disk image. + */ + cell = __wt_row_leaf_value(page, rip, key_unpacked ? unpack : NULL); if (cell == NULL) { vb->data = ""; vb->size = 0; |