summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Keith Bostic <keith@wiredtiger.com> 2014-05-30 09:54:20 -0400
committer Keith Bostic <keith@wiredtiger.com> 2014-05-30 09:54:20 -0400
commit 983da1ff448fdfa5b2ee1508f1f6601afc4d6f73 (patch)
tree f0a0024d1f0aaab86fb47eb02786e69e936550c9
parent 8b2027b80b8ce0226489d6a94738ae721847ea6c (diff)
download mongo-983da1ff448fdfa5b2ee1508f1f6601afc4d6f73.tar.gz
Experimental change to avoid unpacking cells in order to access row-store leaf page keys.
If a page has no Huffman or prefix compression, and no overflow keys, encode the key's page offset and size in the WT_ROW pointer and set a flag on the page. From that point on, when the flag is set, we can go directly to the key instead of unpacking the key's cell each time.
-rw-r--r-- src/btree/bt_debug.c 29
-rw-r--r-- src/btree/bt_discard.c 11
-rw-r--r-- src/btree/bt_page.c 59
-rw-r--r-- src/btree/bt_slvg.c 40
-rw-r--r-- src/btree/rec_write.c 34
-rw-r--r-- src/btree/row_key.c 23
-rw-r--r-- src/include/btmem.h 28
-rw-r--r-- src/include/btree.i 135
-rw-r--r-- src/include/cursor.i 18
-rw-r--r-- src/include/extern.h 4
-rw-r--r-- src/include/txn.i 1
11 files changed, 266 insertions, 116 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 061a19ad9d9..9e3d87ae4fb 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -34,7 +34,6 @@ static void __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, int);
static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *);
static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *);
static void __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *);
-static void __debug_ikey(WT_DBG *, WT_IKEY *);
static void __debug_item(WT_DBG *, const char *, const void *, size_t);
static int __debug_page(WT_DBG *, WT_PAGE *, uint32_t);
static void __debug_page_col_fix(WT_DBG *, WT_PAGE *);
@@ -780,11 +779,13 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
{
WT_CELL *cell;
WT_CELL_UNPACK *unpack, _unpack;
+ WT_IKEY *ikey;
WT_INSERT_HEAD *insert;
+ WT_ITEM key;
WT_ROW *rip;
WT_UPDATE *upd;
uint32_t i;
- void *ripkey;
+ void *copy;
unpack = &_unpack;
@@ -797,11 +798,15 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
/* Dump the page's K/V pairs. */
WT_ROW_FOREACH(page, rip, i) {
- ripkey = WT_ROW_KEY_COPY(rip);
- if (__wt_off_page(page, ripkey))
- __debug_ikey(ds, ripkey);
- else {
- __wt_cell_unpack(ripkey, unpack);
+ copy = WT_ROW_KEY_COPY(rip);
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) {
+ __wt_row_leaf_direct(page, copy, &key);
+ __debug_item(ds, "K", key.data, key.size);
+ } else if (__wt_off_page(page, copy)) {
+ ikey = copy;
+ __debug_item(ds, "K", WT_IKEY_DATA(ikey), ikey->size);
+ } else {
+ __wt_cell_unpack(copy, unpack);
WT_RET(__debug_cell_data(
ds, page, WT_PAGE_ROW_LEAF, "K", unpack));
}
@@ -1056,16 +1061,6 @@ __debug_cell_data(WT_DBG *ds,
}
/*
- * __debug_ikey --
- * Dump a single WT_IKEY in debugging mode, with an optional tag.
- */
-static void
-__debug_ikey(WT_DBG *ds, WT_IKEY *ikey)
-{
- __debug_item(ds, "K", WT_IKEY_DATA(ikey), ikey->size);
-}
-
-/*
* __debug_item --
* Dump a single data/size pair, with an optional tag.
*/
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index be3b7d9d269..03d954d9d76 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -298,11 +298,12 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
* points somewhere other than the original page), and if so, free
* the memory.
*/
- WT_ROW_FOREACH(page, rip, i) {
- ikey = WT_ROW_KEY_COPY(rip);
- if (ikey != NULL && __wt_off_page(page, ikey))
- __wt_free(session, ikey);
- }
+ if (!F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY))
+ WT_ROW_FOREACH(page, rip, i) {
+ ikey = WT_ROW_KEY_COPY(rip);
+ if (ikey != NULL && __wt_off_page(page, ikey))
+ __wt_free(session, ikey);
+ }
/*
* Free the insert array.
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 45f5f430f69..a9c77370df4 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -11,9 +11,9 @@ static void __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *);
static void __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *);
static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
static int __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
-static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
+static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, int);
static int __inmem_row_leaf_entries(
- WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *);
+ WT_SESSION_IMPL *, const WT_PAGE_HEADER *, int *, uint32_t *);
/*
* __wt_page_in_func --
@@ -231,16 +231,20 @@ int
__wt_page_inmem(WT_SESSION_IMPL *session,
WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
{
+ WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *page;
const WT_PAGE_HEADER *dsk;
uint32_t alloc_entries;
size_t size;
+ int direct_key;
*pagep = NULL;
+ btree = S2BT(session);
dsk = image;
alloc_entries = 0;
+ direct_key = 0;
/*
* Figure out how many underlying objects the page references so we can
@@ -271,6 +275,18 @@ __wt_page_inmem(WT_SESSION_IMPL *session,
break;
case WT_PAGE_ROW_LEAF:
/*
+ * High-performance applications will turn off Huffman encoding
+ * and prefix-compression, and won't have overflow keys. In
+ * those cases, we'd like to reference the key on the leaf page
+ * from our row-store index instead of the cell, then we don't
+ * have to unpack the cell every time we look at a key. Assume
+ * the fast configuration is more likely (note it's the default
+ * configuration), and correct course if we're wrong.
+ */
+ direct_key =
+ btree->huffman_key || btree->prefix_compression ? 0 : 1;
+
+ /*
* If the "no empty values" flag is set, row-store leaf page
* entries map one-to-one to the number of physical entries
* on the page (each physical entry is a key or value item).
@@ -283,7 +299,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session,
alloc_entries = dsk->u.entries / 2;
else
WT_RET(__inmem_row_leaf_entries(
- session, dsk, &alloc_entries));
+ session, dsk, &direct_key, &alloc_entries));
break;
WT_ILLEGAL_VALUE(session);
}
@@ -314,7 +330,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session,
WT_ERR(__inmem_row_int(session, page, &size));
break;
case WT_PAGE_ROW_LEAF:
- WT_ERR(__inmem_row_leaf(session, page));
+ WT_ERR(__inmem_row_leaf(session, page, direct_key));
break;
WT_ILLEGAL_VALUE_ERR(session);
}
@@ -592,8 +608,8 @@ err: __wt_scr_free(&current);
* Return the number of entries for row-store leaf pages.
*/
static int
-__inmem_row_leaf_entries(
- WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint32_t *nindxp)
+__inmem_row_leaf_entries(WT_SESSION_IMPL *session,
+ const WT_PAGE_HEADER *dsk, int *direct_keyp, uint32_t *nindxp)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -618,8 +634,10 @@ __inmem_row_leaf_entries(
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
__wt_cell_unpack(cell, unpack);
switch (unpack->type) {
- case WT_CELL_KEY:
case WT_CELL_KEY_OVFL:
+ *direct_keyp = 0;
+ /* FALLTHROUGH */
+ case WT_CELL_KEY:
++nindx;
break;
case WT_CELL_VALUE:
@@ -638,7 +656,7 @@ __inmem_row_leaf_entries(
* Build in-memory index for row-store leaf pages.
*/
static int
-__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
+__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, int direct_key)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -651,14 +669,28 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
dsk = page->dsk;
unpack = &_unpack;
+restart:
/* Walk the page, building indices. */
rip = page->pg_row_d;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
__wt_cell_unpack(cell, unpack);
switch (unpack->type) {
- case WT_CELL_KEY:
case WT_CELL_KEY_OVFL:
- WT_ROW_KEY_SET(rip, cell);
+ /*
+ * If we've been preparing a fast-path to instantiating
+ * leaf page keys, we have a problem, overflow keys make
+ * that impossible. Restart without direct-key set.
+ */
+ if (direct_key) {
+ direct_key = 0;
+ goto restart;
+ }
+ /* FALLTHROUGH */
+ case WT_CELL_KEY:
+ if (direct_key)
+ __wt_row_leaf_key_onpage_set(page, rip, unpack);
+ else
+ __wt_row_leaf_key_onpage_set_cell(rip, cell);
++rip;
break;
case WT_CELL_VALUE:
@@ -669,6 +701,13 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
+ * Set the direct access flag if we read the page's keys and found no
+ * problems.
+ */
+ if (direct_key)
+ F_SET_ATOMIC(page, WT_PAGE_DIRECT_KEY);
+
+ /*
* We do not currently instantiate keys on leaf pages when the page is
* loaded, they're instantiated on demand.
*/
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 628eaa44c75..7d38ee83875 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -1598,7 +1598,7 @@ __slvg_row_trk_update_start(
*/
WT_ERR(__wt_scr_alloc(session, 0, &key));
WT_ROW_FOREACH(page, rip, i) {
- WT_ERR(__wt_row_leaf_key_work(session, page, rip, key, 0));
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
WT_ERR(WT_LEX_CMP(session, btree->collator, key, stop, cmp));
if (cmp > 0) {
found = 1;
@@ -1764,8 +1764,7 @@ __slvg_row_build_leaf(
skip_start = skip_stop = 0;
if (F_ISSET(trk, WT_TRACK_CHECK_START))
WT_ROW_FOREACH(page, rip, i) {
- WT_ERR(
- __wt_row_leaf_key_work(session, page, rip, key, 0));
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
/*
* >= is correct: see the comment above.
@@ -1788,8 +1787,7 @@ __slvg_row_build_leaf(
}
if (F_ISSET(trk, WT_TRACK_CHECK_STOP))
WT_ROW_FOREACH_REVERSE(page, rip, i) {
- WT_ERR(
- __wt_row_leaf_key_work(session, page, rip, key, 0));
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
/*
* < is correct: see the comment above.
@@ -1822,7 +1820,7 @@ __slvg_row_build_leaf(
* a copy from the page.
*/
rip = page->pg_row_d + skip_start;
- WT_ERR(__wt_row_leaf_key_work(session, page, rip, key, 0));
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
WT_ERR(__wt_row_ikey_incr(session,
ref->home, 0, key->data, key->size, &ref->key.ikey));
@@ -1909,22 +1907,26 @@ __slvg_row_merge_ovfl(WT_SESSION_IMPL *session,
unpack = &_unpack;
for (rip = page->pg_row_d + start; start < stop; ++start) {
- ikey = WT_ROW_KEY_COPY(rip);
- if (__wt_off_page(page, ikey))
- cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
- else
- cell = (WT_CELL *)ikey;
- __wt_cell_unpack(cell, unpack);
- if (unpack->type == WT_CELL_KEY_OVFL) {
- WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
- "%s merge discard freed overflow reference %s",
- __wt_addr_string(session,
- trk->addr.addr, trk->addr.size, trk->ss->tmp1),
- __wt_addr_string(session,
- unpack->data, unpack->size, trk->ss->tmp2)));
+ if (!F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) {
+ ikey = WT_ROW_KEY_COPY(rip);
+ if (__wt_off_page(page, ikey))
+ cell =
+ WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ else
+ cell = (WT_CELL *)ikey;
+ __wt_cell_unpack(cell, unpack);
+ if (unpack->type == WT_CELL_KEY_OVFL) {
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discard freed overflow "
+ "reference %s",
+ __wt_addr_string(session, trk->addr.addr,
+ trk->addr.size, trk->ss->tmp1),
+ __wt_addr_string(session, unpack->data,
+ unpack->size, trk->ss->tmp2)));
WT_RET(bm->free(
bm, session, unpack->data, unpack->size));
+ }
}
if ((cell = __wt_row_leaf_value(page, rip)) == NULL)
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index e9d60c7726c..bd0c1d57845 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -4000,6 +4000,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
uint32_t i;
int dictionary, onpage_ovfl, ovfl_key;
const void *p;
+ void *copy;
btree = S2BT(session);
slvg_skip = salvage == NULL ? 0 : salvage->skip;
@@ -4046,14 +4047,22 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* Set the WT_IKEY reference (if the key was instantiated), and
* the key cell reference, unpack the key cell.
*/
- ikey = WT_ROW_KEY_COPY(rip);
- if (__wt_off_page(page, ikey))
+ copy = WT_ROW_KEY_COPY(rip);
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) {
+ ikey = NULL;
+ cell = NULL;
+ kpack = NULL;
+ } else if (__wt_off_page(page, copy)) {
+ ikey = copy;
cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
- else {
- cell = (WT_CELL *)ikey;
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ } else {
ikey = NULL;
+ cell = (WT_CELL *)copy;
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
}
- __wt_cell_unpack(cell, kpack);
/* Unpack the on-page value cell, and look for an update. */
if ((val_cell = __wt_row_leaf_value(page, rip)) == NULL)
@@ -4132,7 +4141,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* to imagine a real workload where this test is
* worth the effort, but it's a simple test.
*/
- if (kpack->raw == WT_CELL_KEY_OVFL_RM)
+ if (kpack != NULL &&
+ kpack->raw == WT_CELL_KEY_OVFL_RM)
goto leaf_insert;
/*
@@ -4173,7 +4183,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* keys from a row-store page reconciliation
* seems unlikely enough to ignore.
*/
- if (kpack->ovfl &&
+ if (kpack != NULL && kpack->ovfl &&
kpack->raw != WT_CELL_KEY_OVFL_RM) {
/*
* Keys are part of the name-space, we
@@ -4184,7 +4194,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* it now.
*/
if (ikey == NULL)
- WT_ERR(__wt_row_leaf_key_work(
+ WT_ERR(__wt_row_leaf_key(
session,
page, rip, tmpkey, 1));
@@ -4225,7 +4235,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* If the key is an overflow key that hasn't been removed, use
* the original backing blocks.
*/
- onpage_ovfl = kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ onpage_ovfl = kpack != NULL &&
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
if (onpage_ovfl) {
key->buf.data = cell;
key->buf.size = __wt_cell_total_len(kpack);
@@ -4243,12 +4254,15 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
r->ovfl_items = 1;
} else {
/*
+ * Use a direct-key from the page, or
* Use an already instantiated key, or
* Use the key from the disk image, or
* Build a key from a previous key, or
* Instantiate the key from scratch.
*/
- if (ikey != NULL) {
+ if (kpack == NULL)
+ __wt_row_leaf_direct(page, copy, tmpkey);
+ else if (ikey != NULL) {
tmpkey->data = WT_IKEY_DATA(ikey);
tmpkey->size = ikey->size;
} else if (btree->huffman_key == NULL &&
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index fd198812fd1..e82140d509a 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -25,7 +25,8 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
btree = S2BT(session);
- if (page->pg_row_entries == 0) { /* Just checking... */
+ if (page->pg_row_entries == 0 || /* Just checking... */
+ F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) {
F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
return (0);
}
@@ -109,21 +110,21 @@ __inmem_row_leaf_slots(
*/
int
__wt_row_leaf_key_copy(
- WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb)
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key)
{
- WT_RET(__wt_row_leaf_key_work(session, page, rip_arg, keyb, 0));
+ WT_RET(__wt_row_leaf_key(session, page, rip, key, 0));
/* The return buffer may only hold a reference to a key, copy it. */
- if (!WT_DATA_IN_ITEM(keyb))
- WT_RET(__wt_buf_set(session, keyb, keyb->data, keyb->size));
+ if (!WT_DATA_IN_ITEM(key))
+ WT_RET(__wt_buf_set(session, key, key->data, key->size));
return (0);
}
/*
* __wt_row_leaf_key_work --
- * Return a reference to, or copy of, a row-store leaf-page key.
- * Optionally instantiate the key into the in-memory page.
+ * Return a reference to a row-store leaf-page key, optionally instantiating
+ * the key into the in-memory page.
*/
int
__wt_row_leaf_key_work(WT_SESSION_IMPL *session,
@@ -142,6 +143,14 @@ __wt_row_leaf_key_work(WT_SESSION_IMPL *session,
void *key;
const void *p;
+ /*
+ * !!!
+ * It is unusual to call this function: most code should be calling the
+ * front-end, __wt_row_leaf_key, be careful if you're calling this code
+ * directly.
+ */
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY));
+
btree = S2BT(session);
unpack = &_unpack;
rip = rip_arg;
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 66031ed24d8..c5cb88ca0fa 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -519,11 +519,12 @@ struct __wt_page {
uint8_t type; /* Page type */
#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */
-#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
-#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
-#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
-#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
-#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing. */
+#define WT_PAGE_DIRECT_KEY 0x02 /* Row-store leaf keys direct access */
+#define WT_PAGE_DISK_ALLOC 0x04 /* Disk image in allocated memory */
+#define WT_PAGE_DISK_MAPPED 0x08 /* Disk image in mapped memory */
+#define WT_PAGE_EVICT_LRU 0x10 /* Page is on the LRU queue */
+#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */
+#define WT_PAGE_SPLITTING 0x40 /* An internal page is growing. */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
};
@@ -657,7 +658,7 @@ struct __wt_ref {
* WT_ROW --
* Each in-memory page row-store leaf page has an array of WT_ROW structures:
* this is created from on-page data when a page is read from the file. It's
- * sorted by key, fixed in size, and references data on the page.
+ * sorted by key, fixed in size, and starts with a reference to on-page data.
*
* Multiple threads of control may be searching the in-memory row-store pages,
* and the key may be instantiated at any time. Code must be able to handle
@@ -673,18 +674,15 @@ struct __wt_ref {
* }
*
* The field is declared volatile (so the compiler knows it shouldn't read it
- * multiple times), and we obscure the field name and use a copy macro in all
- * references to the field (so the code doesn't read it multiple times), all
- * to make sure we don't introduce this bug (again).
- *
- * Casting the read to a (void *) is safe as we are not taking the address of
- * the object.
+ * multiple times), and we obscure the field name and use macros or functions
+ * for references to the field (so the code doesn't read it multiple times),
+ * just to ensure we don't introduce this bug (again).
*/
-struct __wt_row {
- void * volatile __key; /* On-page cell or off-page WT_IKEY */
+struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */
+ void * volatile __key;
};
#define WT_ROW_KEY_COPY(rip) ((rip)->__key)
-#define WT_ROW_KEY_SET(rip, v) ((rip)->__key) = (v)
+#define WT_ROW_KEY_SET(rip, v) ((rip)->__key) = (void *)(v)
/*
* WT_ROW_FOREACH --
diff --git a/src/include/btree.i b/src/include/btree.i
index 43418d65b96..5ddab20dfd6 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -517,6 +517,70 @@ __wt_ref_key_clear(WT_REF *ref)
}
/*
+ * __wt_row_leaf_direct --
+ * Return an encoded row-store leaf page key.
+ */
+static inline void
+__wt_row_leaf_direct(WT_PAGE *page, void *ripkey, WT_ITEM *key)
+{
+ uintptr_t v;
+
+ /*
+ * A row-store leaf page key is in one of two places: if instantiated,
+ * the WT_ROW pointer references a WT_IKEY structure, otherwise, it
+ * references an on-page location. However, on-page keys are in one of
+ * two states and the reference is in one of two forms: if a row-store
+ * doesn't configure prefix compression or Huffman encoding, and there
+ * were no overflow keys found when reading the page into memory (all
+ * of which is likely, those are the default configurations), the key's
+ * location and size were encoded in the pointer and a page flag was set.
+ * If we found overflow keys or one of those features is configured, the
+ * reference is to the key's on-page cell, which we'll unpack (we're
+ * trying to avoid that cell unpack per key read in the fast path).
+ * The test is if the page flag is set, we're done, it's an encoding;
+ * otherwise, if the pointer is off-page it's an instantiated key, else
+ * an on-page cell.
+ *
+ * This function cracks an encoded key and returns a real pointer. The
+ * encoding magic is simpler than internal page key encoding because we
+ * are using the page's flag rather than per-key information to decide
+ * if the key is encoded. The key's page offset is the bottom 4B, and
+ * the key size is the top 4B.
+ */
+ v = (uintptr_t)ripkey;
+ key->data = WT_PAGE_REF_OFFSET(page, (v & 0xFFFFFFFF));
+ key->size = v >> 32;
+}
+
+/*
+ * __wt_row_leaf_key_onpage_set --
+ * Set a WT_ROW to reference an on-page key.
+ */
+static inline void
+__wt_row_leaf_key_onpage_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_row_leaf_direct for an explanation of the
+ * magic.
+ */
+ v = (uintptr_t)unpack->size << 32 |
+ (uint32_t)WT_PAGE_DISK_OFFSET(page, unpack->data);
+ WT_ROW_KEY_SET(rip, v);
+}
+
+/*
+ * __wt_row_leaf_key_onpage_set_cell --
+ * Set a WT_ROW to reference an on-page key's cell.
+ */
+static inline void
+__wt_row_leaf_key_onpage_set_cell(WT_ROW *rip, WT_CELL *cell)
+{
+ WT_ROW_KEY_SET(rip, cell);
+}
+
+/*
* __wt_row_leaf_key --
* Set a buffer to reference a row-store leaf page key as cheaply as
* possible.
@@ -526,32 +590,39 @@ __wt_row_leaf_key(WT_SESSION_IMPL *session,
WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, int instantiate)
{
WT_BTREE *btree;
- WT_IKEY *ikey;
WT_CELL_UNPACK unpack;
+ WT_IKEY *ikey;
+ void *copy;
btree = S2BT(session);
/*
- * A subset of __wt_row_leaf_key_work, that is, calling that function
- * should give you the same results as calling this one; this function
- * exists to inline fast-path checks for already instantiated keys and
- * on-page uncompressed keys.
+ * A front-end for __wt_row_leaf_key_work, here to inline fast paths.
+ *
+ * The row-store key can change underfoot; explicitly take a copy.
*/
- ikey = WT_ROW_KEY_COPY(rip);
+ copy = WT_ROW_KEY_COPY(rip);
- /*
- * Key copied.
- * If the key has been instantiated for any reason, off-page, use it.
- */
- if (__wt_off_page(page, ikey)) {
+ /* First, check for an encoded key. */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) {
+ __wt_row_leaf_direct(page, copy, key);
+ return (0);
+ }
+
+ /* Second, check for an instantiated key. */
+ if (__wt_off_page(page, copy)) {
+ ikey = copy;
key->data = WT_IKEY_DATA(ikey);
key->size = ikey->size;
return (0);
}
- /* If the key isn't compressed or an overflow, take it from the page. */
+ /*
+ * Third, if the key isn't compressed or an overflow, unpack the cell
+ * and take it from the page.
+ */
if (btree->huffman_key == NULL) {
- __wt_cell_unpack((WT_CELL *)ikey, &unpack);
+ __wt_cell_unpack(copy, &unpack);
if (unpack.type == WT_CELL_KEY && unpack.prefix == 0) {
key->data = unpack.data;
key->size = unpack.size;
@@ -603,23 +674,43 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip)
{
WT_CELL *cell;
WT_CELL_UNPACK unpack;
+ WT_IKEY *ikey;
+ void *copy;
+ uintptr_t v;
- cell = WT_ROW_KEY_COPY(rip);
+ /*
+ * The row-store key can change underfoot; explicitly take a copy.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
/*
- * Key copied.
- *
- * Cell now either references a WT_IKEY structure with a cell offset,
- * or references the on-page key WT_CELL. Both can be processed
- * regardless of what other threads are doing. If it's the former,
- * use it to get the latter.
+ * See the comment in __wt_row_leaf_direct for an explanation of the
+ * magic; we know where the key is, step past it to the value's cell.
*/
- if (__wt_off_page(page, cell))
- cell = WT_PAGE_REF_OFFSET(page, ((WT_IKEY *)cell)->cell_offset);
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) {
+ v = (uintptr_t)copy;
+ cell = (WT_CELL *)
+ ((uint8_t *)WT_PAGE_REF_OFFSET(page, (v & 0xFFFFFFFF)) +
+ (v >> 32));
+ return (__wt_cell_leaf_value_parse(page, cell));
+ }
+
+ /*
+ * Cell now either references a WT_IKEY structure with a cell offset, or
+ * references the on-page key WT_CELL. Both can be processed no matter
+ * what other threads are doing. If it's the former, use it to get the
+ * latter.
+ */
+ if (__wt_off_page(page, copy)) {
+ ikey = copy;
+ cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ } else
+ cell = copy;
/* Unpack the key cell, then return its associated value cell. */
__wt_cell_unpack(cell, &unpack);
cell = (WT_CELL *)((uint8_t *)cell + __wt_cell_total_len(&unpack));
+
return (__wt_cell_leaf_value_parse(page, cell));
}
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 5505bc1b702..c9026c2ee8f 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -177,6 +177,7 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
WT_PAGE *page;
WT_SESSION_IMPL *session;
int key_unpacked;
+ void *copy;
session = (WT_SESSION_IMPL *)cbt->iface.session;
btree = S2BT(session);
@@ -189,26 +190,27 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
vb = &cbt->iface.value;
/*
- * Return the WT_ROW slot's K/V pair.
+ * The row-store key can change underfoot; explicitly take a copy.
*/
+ copy = WT_ROW_KEY_COPY(rip);
- ikey = WT_ROW_KEY_COPY(rip);
/*
- * Key copied.
- *
* Get a reference to the key, ideally without doing a copy: we could
* call __wt_row_leaf_key, but if a cursor is running through the tree,
* we actually have more information here than that function has, we
* may have the prefix-compressed key that comes immediately before the
* one we want.
*
- * If the key has been instantiated (the key points off-page), we don't
- * have any work to do.
+ * If the key can be accessed directly, or has been instantiated (the
+ * key points off-page), we don't have any work to do.
*
* If the key points on-page, we have a copy of a WT_CELL value that can
* be processed, regardless of what any other thread is doing.
*/
- if (__wt_off_page(page, ikey)) {
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY))
+ __wt_row_leaf_direct(page, copy, kb);
+ else if (__wt_off_page(page, copy)) {
+ ikey = copy;
kb->data = WT_IKEY_DATA(ikey);
kb->size = ikey->size;
} else {
@@ -220,7 +222,7 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
*/
if (btree->huffman_key != NULL)
goto slow;
- __wt_cell_unpack_with_value(page, (WT_CELL *)ikey, unpack);
+ __wt_cell_unpack_with_value(page, copy, unpack);
key_unpacked = 1;
if (unpack->type == WT_CELL_KEY && unpack->prefix == 0) {
cbt->tmp.data = unpack->data;
diff --git a/src/include/extern.h b/src/include/extern.h
index 3008729d158..7495507cd31 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -466,8 +466,8 @@ extern int __wt_rec_col_var_bulk_insert(WT_CURSOR_BULK *cbulk);
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session,
WT_PAGE *page,
- WT_ROW *rip_arg,
- WT_ITEM *keyb);
+ WT_ROW *rip,
+ WT_ITEM *key);
extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session,
WT_PAGE *page,
WT_ROW *rip_arg,
diff --git a/src/include/txn.i b/src/include/txn.i
index 4f039bcc408..46386afd4e7 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -5,7 +5,6 @@
* See the file LICENSE for redistribution information.
*/
-static inline int __wt_cursor_row_leaf_key(WT_CURSOR_BTREE *, WT_ITEM *);
static inline int __wt_txn_id_check(WT_SESSION_IMPL *session);
static inline void __wt_txn_read_first(WT_SESSION_IMPL *session);
static inline void __wt_txn_read_last(WT_SESSION_IMPL *session);