diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/btree/bt_cell.c | 72 | ||||
-rw-r--r-- | src/btree/bt_cursor.c | 20 | ||||
-rw-r--r-- | src/btree/bt_debug.c | 63 | ||||
-rw-r--r-- | src/btree/bt_dump.c | 23 | ||||
-rw-r--r-- | src/btree/bt_misc.c | 8 | ||||
-rw-r--r-- | src/btree/bt_page.c | 93 | ||||
-rw-r--r-- | src/btree/bt_reconcile.c | 191 | ||||
-rw-r--r-- | src/btree/bt_ret.c | 28 | ||||
-rw-r--r-- | src/btree/bt_salvage.c | 64 | ||||
-rw-r--r-- | src/btree/bt_stat.c | 10 | ||||
-rw-r--r-- | src/btree/bt_vrfy.c | 33 | ||||
-rw-r--r-- | src/btree/bt_vrfy_dsk.c | 156 | ||||
-rw-r--r-- | src/btree/col_srch.c | 10 | ||||
-rw-r--r-- | src/btree/row_key.c | 75 | ||||
-rw-r--r-- | src/include/btree.h | 8 | ||||
-rw-r--r-- | src/include/cell.i | 368 | ||||
-rw-r--r-- | src/include/extern.h | 11 | ||||
-rw-r--r-- | src/include/intpack.i | 31 | ||||
-rw-r--r-- | src/include/wt_internal.in | 2 |
19 files changed, 645 insertions, 621 deletions
diff --git a/src/btree/bt_cell.c b/src/btree/bt_cell.c index e2c17bb40ee..976e431841e 100644 --- a/src/btree/bt_cell.c +++ b/src/btree/bt_cell.c @@ -8,88 +8,48 @@ #include "wt_internal.h" /* - * __wt_cell_set -- - * Set a WT_CELL's contents based on a type, prefix and data size. + * __wt_cell_copy -- + * Copy an on-page cell into a return buffer, processing as needed. */ -void -__wt_cell_set(WT_SESSION_IMPL *session, - WT_CELL *cell, u_int type, u_int prefix, uint32_t size, uint32_t *cell_lenp) +int +__wt_cell_copy(WT_SESSION_IMPL *session, WT_CELL *cell, WT_BUF *retb) { - uint8_t byte, *p; - - /* - * Delete and off-page items have known sizes, we don't store length - * bytes. Short key/data items have 6- or 7-bits of length in the - * descriptor byte and no length bytes. - */ - WT_ASSERT(session, type == WT_CELL_DATA || type == WT_CELL_KEY); - if (type == WT_CELL_DATA && size < 0x7f) { - /* - * Bit 0 is the WT_CELL_DATA_SHORT flag; the other 7 bits are - * the size. - */ - byte = (uint8_t)size; - cell->__chunk[0] = (byte << 1) | WT_CELL_DATA_SHORT; - *cell_lenp = 1; /* Cell byte */ - return; - } else if (size < 0x3f) { - /* - * Bit 0 is 0, bit 1 is the WT_CELL_KEY_SHORT flag; the other - * 6 bits are the size. - */ - byte = (uint8_t)size; - cell->__chunk[0] = (byte << 2) | WT_CELL_KEY_SHORT; - cell->__chunk[1] = (uint8_t)prefix; - *cell_lenp = 2; /* Cell byte + prefix byte */ - return; - } + WT_CELL_UNPACK *unpack, _unpack; - p = cell->__chunk; - *p++ = (uint8_t)type; /* Type */ + unpack = &_unpack; - if (type == WT_CELL_KEY) /* Prefix byte */ - *p++ = (uint8_t)prefix; - - /* Pack the data length. */ - (void)__wt_vpack_uint( - session, &p, sizeof(cell->__chunk) - 1, (uint64_t)size); - - /* Return the cell's length */ - *cell_lenp = WT_PTRDIFF32(p, cell->__chunk); + __wt_cell_unpack(session, cell, unpack); + return (__wt_cell_unpack_copy(session, unpack, retb)); } /* - * __wt_cell_copy -- - * Copy an on-page cell into a return buffer, processing as needed. 
+ * __wt_cell_unpack_copy -- + * Copy an unpacked cell into a return buffer, processing as needed. */ int -__wt_cell_copy(WT_SESSION_IMPL *session, WT_CELL *cell, WT_BUF *retb) +__wt_cell_unpack_copy( + WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, WT_BUF *retb) { WT_BTREE *btree; - WT_OFF ovfl; - uint32_t size; - const void *p; void *huffman; btree = session->btree; /* Get the cell's data. */ - switch (__wt_cell_type(cell)) { + switch (unpack->type) { case WT_CELL_DATA: case WT_CELL_KEY: - __wt_cell_data_and_len(session, cell, &p, &size); - WT_RET(__wt_buf_set(session, retb, p, size)); + WT_RET(__wt_buf_set(session, retb, unpack->data, unpack->size)); break; case WT_CELL_DATA_OVFL: case WT_CELL_KEY_OVFL: - __wt_cell_off(session, cell, &ovfl); - WT_RET(__wt_ovfl_in(session, &ovfl, retb)); + WT_RET(__wt_ovfl_in(session, &unpack->off, retb)); break; WT_ILLEGAL_FORMAT(session); } /* Select a Huffman encoding function. */ - switch (__wt_cell_type(cell)) { + switch (unpack->type) { case WT_CELL_DATA: case WT_CELL_DATA_OVFL: if ((huffman = btree->huffman_value) == NULL) diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 6eb52b31612..fc2ecea9ecb 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -108,10 +108,12 @@ __btcur_next_var( { WT_SESSION_IMPL *session; WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_UPDATE *upd; int found; session = (WT_SESSION_IMPL *)cbt->iface.session; + unpack = &_unpack; /* This loop moves through a page, including after reading a record. 
*/ for (found = 0; !found; ++cbt->cip, ++cbt->recno, --cbt->nitems) { @@ -134,17 +136,13 @@ __btcur_next_var( value->data = WT_UPDATE_DATA(upd); value->size = upd->size; found = 1; - } else if ((cell = WT_COL_PTR(cbt->page, cbt->cip)) != NULL) - switch (__wt_cell_type(cell)) { - case WT_CELL_DATA: - case WT_CELL_DATA_OVFL: - WT_RET(__wt_cell_copy(session, cell, value)); - found = 1; - break; - case WT_CELL_DEL: - break; - WT_ILLEGAL_FORMAT(session); - } + } else if ((cell = WT_COL_PTR(cbt->page, cbt->cip)) != NULL) { + __wt_cell_unpack(session, cell, unpack); + if (unpack->type == WT_CELL_DEL) + continue; + WT_RET(__wt_cell_unpack_copy(session, unpack, value)); + found = 1; + } } return (0); diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index e2c257cc100..f8e1f8369f1 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -23,8 +23,8 @@ typedef struct { #ifdef HAVE_DIAGNOSTIC static void __wt_debug_byte_string(WT_DBG *, const uint8_t *, uint32_t); -static int __wt_debug_cell(WT_DBG *, WT_CELL *); -static int __wt_debug_cell_data(WT_DBG *, const char *, WT_CELL *); +static int __wt_debug_cell(WT_DBG *, WT_CELL_UNPACK *); +static int __wt_debug_cell_data(WT_DBG *, const char *, WT_CELL_UNPACK *); static void __wt_debug_col_insert(WT_DBG *, WT_INSERT *); static int __wt_debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *); static int __wt_debug_dsk_cell(WT_DBG *, WT_PAGE_DISK *); @@ -503,13 +503,22 @@ __wt_debug_page_col_rle(WT_DBG *ds, WT_PAGE *page) static int __wt_debug_page_col_var(WT_DBG *ds, WT_PAGE *page) { + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; + WT_SESSION_IMPL *session; WT_UPDATE *upd; uint32_t i; + session = ds->session; + unpack = &_unpack; + WT_COL_FOREACH(page, cip, i) { - WT_RET( - __wt_debug_cell_data(ds, "V", WT_COL_PTR(page, cip))); + if ((cell = WT_COL_PTR(page, cip)) == NULL) + unpack = NULL; + else + __wt_cell_unpack(session, cell, unpack); + WT_RET(__wt_debug_cell_data(ds, "V", unpack)); if ((upd = 
WT_COL_UPDATE(page, cip)) != NULL) __wt_debug_update(ds, upd); @@ -551,6 +560,7 @@ static int __wt_debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) { WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_INSERT *ins; WT_ROW *rip; WT_SESSION_IMPL *session; @@ -558,6 +568,7 @@ __wt_debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) uint32_t i; session = ds->session; + unpack = &_unpack; /* * Dump any K/V pairs inserted into the page before the first from-disk @@ -570,13 +581,17 @@ __wt_debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) WT_ROW_FOREACH(page, rip, i) { if (__wt_off_page(page, rip->key)) __wt_debug_ikey(ds, rip->key); - else - WT_RET(__wt_debug_cell_data(ds, "K", rip->key)); + else { + __wt_cell_unpack(session, rip->key, unpack); + WT_RET(__wt_debug_cell_data(ds, "K", unpack)); + } if ((cell = __wt_row_value(session, page, rip)) == NULL) __wt_dmsg(ds, "\tV {}\n"); - else - WT_RET(__wt_debug_cell_data(ds, "V", cell)); + else { + __wt_cell_unpack(session, cell, unpack); + WT_RET(__wt_debug_cell_data(ds, "V", unpack)); + } if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) __wt_debug_update(ds, upd); @@ -638,14 +653,18 @@ __wt_debug_update(WT_DBG *ds, WT_UPDATE *upd) static int __wt_debug_dsk_cell(WT_DBG *ds, WT_PAGE_DISK *dsk) { - WT_SESSION_IMPL *session; WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + WT_SESSION_IMPL *session; uint32_t i; session = ds->session; + unpack = &_unpack; - WT_CELL_FOREACH(session, dsk, cell, i) - WT_RET(__wt_debug_cell(ds, cell)); + WT_CELL_FOREACH(session, dsk, cell, unpack, i) { + __wt_cell_unpack(session, cell, unpack); + WT_RET(__wt_debug_cell(ds, unpack)); + } return (0); } @@ -654,35 +673,33 @@ __wt_debug_dsk_cell(WT_DBG *ds, WT_PAGE_DISK *dsk) * Dump a single WT_CELL. 
*/ static int -__wt_debug_cell(WT_DBG *ds, WT_CELL *cell) +__wt_debug_cell(WT_DBG *ds, WT_CELL_UNPACK *unpack) { WT_SESSION_IMPL *session; - WT_OFF off; session = ds->session; __wt_dmsg(ds, "\t%s: len %" PRIu32, - __wt_cell_type_string(cell), __wt_cell_datalen(session, cell)); + __wt_cell_type_string(unpack->raw), unpack->size); - switch (__wt_cell_type(cell)) { + switch (unpack->type) { case WT_CELL_DATA: case WT_CELL_DEL: break; case WT_CELL_KEY: - __wt_dmsg(ds, ", pfx: %u", __wt_cell_prefix(cell)); + __wt_dmsg(ds, ", pfx: " PRIu8, unpack->prefix); break; case WT_CELL_DATA_OVFL: case WT_CELL_KEY_OVFL: case WT_CELL_OFF: - __wt_cell_off(session, cell, &off); __wt_dmsg(ds, ", offpage: addr %" PRIu32 ", size %" PRIu32, - off.addr, off.size); + unpack->off.addr, unpack->off.size); break; WT_ILLEGAL_FORMAT(session); } __wt_dmsg(ds, "\n"); - return (__wt_debug_cell_data(ds, NULL, cell)); + return (__wt_debug_cell_data(ds, NULL, unpack)); } /* @@ -755,7 +772,7 @@ __wt_debug_dsk_col_rle(WT_DBG *ds, WT_PAGE_DISK *dsk) * Dump a single cell's data in debugging mode. */ static int -__wt_debug_cell_data(WT_DBG *ds, const char *tag, WT_CELL *cell) +__wt_debug_cell_data(WT_DBG *ds, const char *tag, WT_CELL_UNPACK *unpack) { WT_BUF *tmp; WT_SESSION_IMPL *session; @@ -771,16 +788,16 @@ __wt_debug_cell_data(WT_DBG *ds, const char *tag, WT_CELL *cell) * Column-store references to deleted cells return a NULL cell * reference. 
*/ - if (cell == NULL) + if (unpack == NULL) goto deleted; - switch (__wt_cell_type(cell)) { + switch (unpack->type) { case WT_CELL_DATA: case WT_CELL_DATA_OVFL: case WT_CELL_KEY: case WT_CELL_KEY_OVFL: WT_ERR(__wt_scr_alloc(session, 0, &tmp)); - WT_ERR(__wt_cell_copy(session, cell, tmp)); + WT_ERR(__wt_cell_unpack_copy(session, unpack, tmp)); p = tmp->data; size = tmp->size; break; diff --git a/src/btree/bt_dump.c b/src/btree/bt_dump.c index 619bc1b9bd1..5fc9e0de8fc 100644 --- a/src/btree/bt_dump.c +++ b/src/btree/bt_dump.c @@ -177,6 +177,7 @@ __wt_dump_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSTUFF *dp) WT_BUF *tmp; WT_COL *cip; WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_UPDATE *upd; int ret; uint32_t i; @@ -184,6 +185,7 @@ __wt_dump_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSTUFF *dp) btree = session->btree; huffman = btree->huffman_value; + unpack = &_unpack; ret = 0; WT_RET(__wt_scr_alloc(session, 0, &tmp)); @@ -201,17 +203,16 @@ __wt_dump_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSTUFF *dp) continue; /* Process the original data. 
*/ - switch (__wt_cell_type(cell)) { + __wt_cell_unpack(session, cell, unpack); + switch (unpack->type) { case WT_CELL_DATA: if (huffman == NULL) { - dp->p(__wt_cell_data(session, cell), - __wt_cell_datalen(session, cell), - dp->stream); + dp->p(unpack->data, unpack->size, dp->stream); break; } /* FALLTHROUGH */ case WT_CELL_DATA_OVFL: - WT_ERR(__wt_cell_copy(session, cell, tmp)); + WT_ERR(__wt_cell_unpack_copy(session, unpack, tmp)); dp->p(tmp->data, tmp->size, dp->stream); break; case WT_CELL_DEL: @@ -234,6 +235,7 @@ __wt_dump_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSTUFF *dp) WT_BTREE *btree; WT_BUF *key_tmp, *value_tmp; WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_INSERT *ins; WT_ITEM *key, _key, *value, _value; WT_ROW *rip; @@ -246,6 +248,7 @@ __wt_dump_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSTUFF *dp) key = value = NULL; key_tmp = value_tmp = NULL; huffman = btree->huffman_value; + unpack = &_unpack; ret = 0; WT_ERR(__wt_scr_alloc(session, 0, &key_tmp)); @@ -302,17 +305,19 @@ __wt_dump_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSTUFF *dp) } /* Set cell to reference the value we'll dump. */ - switch (__wt_cell_type(cell)) { + __wt_cell_unpack(session, cell, unpack); + switch (unpack->type) { case WT_CELL_DATA: if (huffman == NULL) { value = &_value; - __wt_cell_data_and_len(session, - cell, &value->data, &value->size); + value->data = unpack->data; + value->size = unpack->size; break; } /* FALLTHROUGH */ case WT_CELL_DATA_OVFL: - WT_ERR(__wt_cell_copy(session, cell, value_tmp)); + WT_ERR( + __wt_cell_unpack_copy(session, unpack, value_tmp)); value = (WT_ITEM *)value_tmp; break; WT_ILLEGAL_FORMAT_ERR(session); diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c index d5d95ad0db1..adb5b40101b 100644 --- a/src/btree/bt_misc.c +++ b/src/btree/bt_misc.c @@ -44,15 +44,15 @@ __wt_page_type_string(u_int type) * Return a string representing the cell type. 
*/ const char * -__wt_cell_type_string(WT_CELL *cell) +__wt_cell_type_string(uint8_t type) { - switch (__wt_cell_type_raw(cell)) { + switch (type) { case WT_CELL_DATA: return ("data"); case WT_CELL_DATA_OVFL: return ("data-overflow"); case WT_CELL_DATA_SHORT: - return ("data-short"); + return ("short-data"); case WT_CELL_DEL: return ("deleted"); case WT_CELL_KEY: @@ -60,7 +60,7 @@ __wt_cell_type_string(WT_CELL *cell) case WT_CELL_KEY_OVFL: return ("key-overflow"); case WT_CELL_KEY_SHORT: - return ("key-short"); + return ("short-key"); case WT_CELL_OFF: return ("off-page"); default: diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 41b15331892..f797c113ef3 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -250,10 +250,12 @@ __wt_page_inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_COL *cip; WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_PAGE_DISK *dsk; uint32_t i; dsk = page->dsk; + unpack = &_unpack; /* * Column-store page entries map one-to-one to the number of physical @@ -268,8 +270,10 @@ __wt_page_inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) * (WT_CELL_DATA_OVFL) or deleted (WT_CELL_DEL) items. 
*/ cip = page->u.col_leaf.d; - WT_CELL_FOREACH(session, dsk, cell, i) + WT_CELL_FOREACH(session, dsk, cell, unpack, i) { + __wt_cell_unpack(session, cell, unpack); (cip++)->__value = WT_DISK_OFFSET(dsk, cell); + } page->entries = dsk->u.entries; return (0); @@ -285,15 +289,16 @@ __wt_page_inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) WT_BTREE *btree; WT_BUF *current, *last, *tmp; WT_CELL *cell; - WT_OFF off; + WT_CELL_UNPACK *unpack, _unpack; WT_PAGE_DISK *dsk; WT_ROW_REF *rref; - uint32_t data_size, i, nindx, prefix; - int cell_ovfl, found_ovfl, ret; - void *data, *huffman; + uint32_t i, nindx, prefix; + int found_ovfl, ret; + void *huffman; btree = session->btree; current = last = NULL; + unpack = &_unpack; dsk = page->dsk; found_ovfl = ret = 0; huffman = btree->huffman_key; @@ -323,61 +328,36 @@ __wt_page_inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) * are WT_CELL_OFF items. */ rref = page->u.row_int.t; - WT_CELL_FOREACH(session, dsk, cell, i) { - switch (__wt_cell_type(cell)) { + WT_CELL_FOREACH(session, dsk, cell, unpack, i) { + __wt_cell_unpack(session, cell, unpack); + switch (unpack->type) { case WT_CELL_KEY_OVFL: case WT_CELL_KEY: break; case WT_CELL_OFF: - __wt_cell_off(session, cell, &off); - WT_ROW_REF_ADDR(rref) = off.addr; - WT_ROW_REF_SIZE(rref) = off.size; + WT_ROW_REF_ADDR(rref) = unpack->off.addr; + WT_ROW_REF_SIZE(rref) = unpack->off.size; ++rref; continue; WT_ILLEGAL_FORMAT(session); } - /* Get the cell's prefix and check if it's an overflow cell. */ - prefix = __wt_cell_prefix(cell); - cell_ovfl = __wt_cell_type_is_ovfl(cell); - /* * We can discard the underlying disk page if we don't have any * overflow keys. */ - if (cell_ovfl) + if (unpack->ovfl) found_ovfl = 1; /* - * Overflow keys are simple, and don't participate in prefix - * compression. - * - * If Huffman decoding, use the heavy-weight __wt_cell_copy() - * code to build the key, up to the prefix. 
Else, we can do - * it faster internally because we don't have to shuffle memory - * around as much. + * If Huffman decoding is required or it's an overflow record, + * use the heavy-weight __wt_cell_unpack_copy() call to build + * the key. Else, we can do it faster internally as we don't + * have to shuffle memory around as much. */ - if (cell_ovfl) - WT_RET(__wt_cell_copy(session, cell, current)); - else if (huffman == NULL) { - /* - * Get the cell's data/length and make sure we have - * enough buffer space. - */ - __wt_cell_data_and_len( - session, cell, &data, &data_size); - WT_ERR(__wt_buf_grow( - session, current, prefix + data_size)); - - /* Copy the prefix then the data into place. */ - if (prefix != 0) - memcpy((void *) - current->data, last->data, prefix); - memcpy((uint8_t *) - current->data + prefix, data, data_size); - current->size = prefix + data_size; - } else { - WT_RET(__wt_cell_copy(session, cell, current)); + prefix = unpack->prefix; + if (huffman != NULL || unpack->ovfl) { + WT_RET(__wt_cell_unpack_copy(session, unpack, current)); /* * If there's a prefix, make sure there's enough buffer @@ -393,6 +373,21 @@ __wt_page_inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) (void *)current->data, last->data, prefix); current->size += prefix; } + } else { + /* + * Get the cell's data/length and make sure we have + * enough buffer space. + */ + WT_ERR(__wt_buf_grow( + session, current, prefix + unpack->size)); + + /* Copy the prefix then the data into place. */ + if (prefix != 0) + memcpy((void *) + current->data, last->data, prefix); + memcpy((uint8_t *) + current->data + prefix, unpack->data, unpack->size); + current->size = prefix + unpack->size; } /* @@ -402,14 +397,14 @@ __wt_page_inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) * and will need a reference to it during reconciliation. */ WT_ERR(__wt_row_ikey_alloc(session, - cell_ovfl ? WT_DISK_OFFSET(dsk, cell) : 0, + unpack->ovfl ? 
WT_DISK_OFFSET(dsk, cell) : 0, current->data, current->size, (WT_IKEY **)&rref->key)); /* * Swap buffers if it's not an overflow key, we have a new * prefix-compressed page. */ - if (!cell_ovfl) { + if (!unpack->ovfl) { tmp = last; last = current; current = tmp; @@ -434,11 +429,13 @@ static int __wt_page_inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_PAGE_DISK *dsk; WT_ROW *rip; uint32_t i, nindx; dsk = page->dsk; + unpack = &_unpack; /* * Leaf row-store page entries map to a maximum of two-to-one to the @@ -458,8 +455,9 @@ __wt_page_inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) */ nindx = 0; rip = page->u.row_leaf.d; - WT_CELL_FOREACH(session, dsk, cell, i) - switch (__wt_cell_type(cell)) { + WT_CELL_FOREACH(session, dsk, cell, unpack, i) { + __wt_cell_unpack(session, cell, unpack); + switch (unpack->type) { case WT_CELL_KEY_OVFL: case WT_CELL_KEY: ++nindx; @@ -472,6 +470,7 @@ __wt_page_inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) break; WT_ILLEGAL_FORMAT(session); } + } page->entries = nindx; diff --git a/src/btree/bt_reconcile.c b/src/btree/bt_reconcile.c index c03c6646fdd..ea2f16bbb66 100644 --- a/src/btree/bt_reconcile.c +++ b/src/btree/bt_reconcile.c @@ -203,7 +203,8 @@ static void __hazard_copy(WT_SESSION_IMPL *); static int __hazard_exclusive(WT_SESSION_IMPL *, WT_REF *); static int __hazard_qsort_cmp(const void *, const void *); STATIN void __rec_cell_build_deleted(WT_KV *); -static int __rec_cell_build_key(WT_SESSION_IMPL *, const void *, uint32_t); +static int __rec_cell_build_key( + WT_SESSION_IMPL *, const void *, uint32_t, int *); static int __rec_cell_build_ovfl(WT_SESSION_IMPL *, WT_KV *, u_int); static int __rec_cell_build_val(WT_SESSION_IMPL *, void *, uint32_t); static void __rec_col_extend_truncate(WT_PAGE *); @@ -218,7 +219,7 @@ static int __rec_col_var(WT_SESSION_IMPL *, WT_PAGE *, uint64_t); static int __rec_col_var_bulk(WT_SESSION_IMPL *, WT_PAGE *); STATIN 
void __rec_copy_incr(WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *); static int __rec_discard_add(WT_SESSION_IMPL *, WT_PAGE *, uint32_t, uint32_t); -STATIN int __rec_discard_add_ovfl(WT_SESSION_IMPL *, WT_CELL *); +STATIN int __rec_discard_add_ovfl(WT_SESSION_IMPL *, WT_CELL_UNPACK *); static int __rec_discard_evict(WT_SESSION_IMPL *); static void __rec_discard_init(WT_RECONCILE *); static int __rec_imref_add(WT_SESSION_IMPL *, WT_REF *); @@ -864,10 +865,12 @@ static int __rec_ovfl_delete(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_PAGE_DISK *dsk; uint32_t i; dsk = page->dsk; + unpack = &_unpack; /* * For row-internal pages, the disk image was discarded because there @@ -880,8 +883,10 @@ __rec_ovfl_delete(WT_SESSION_IMPL *session, WT_PAGE *page) * We're deleting the page, which means any overflow item we ever had * is deleted as well. */ - WT_CELL_FOREACH(session, dsk, cell, i) - WT_RET(__rec_discard_add_ovfl(session, cell)); + WT_CELL_FOREACH(session, dsk, cell, unpack, i) { + __wt_cell_unpack(session, cell, unpack); + WT_RET(__rec_discard_add_ovfl(session, unpack)); + } return (0); } @@ -1390,7 +1395,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_BUF *buf) WT_ASSERT(session, buf->size < buf->mem_size); cell = (WT_CELL *)&(((uint8_t *)buf->data)[buf->size]); - __wt_cell_set_fixed(cell, WT_CELL_KEY, &notused); + __wt_cell_pack_fixed(cell, WT_CELL_KEY, &notused); ++buf->size; } @@ -1433,11 +1438,13 @@ static int __rec_split_row_promote(WT_SESSION_IMPL *session, uint8_t type) { WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_RECONCILE *r; uint32_t cnt, len, size; const uint8_t *pa, *pb; r = S2C(session)->cache->rec; + unpack = &_unpack; /* * For a column-store, the promoted key is the recno and we already have @@ -1455,14 +1462,14 @@ __rec_split_row_promote(WT_SESSION_IMPL *session, uint8_t type) /* * The cell had better have a zero-length prefix: it's the first * key on the page. 
(If it doesn't have a zero-length prefix, - * __wt_cell_copy() won't be sufficient any way, we'd only copy - * the non-prefix-compressed portion of the key.) + * __wt_cell_update_copy() won't be sufficient any way, we'd + * only copy the non-prefix-compressed portion of the key.) */ cell = WT_PAGE_DISK_BYTE(r->dsk.mem); + __wt_cell_unpack(session, cell, unpack); WT_ASSERT(session, - __wt_cell_prefix(cell) == 0 || - __wt_cell_type(cell) == WT_CELL_KEY_OVFL); - WT_RET(__wt_cell_copy(session, cell, &r->bnd[0].key)); + unpack->prefix == 0 || unpack->type == WT_CELL_KEY_OVFL); + WT_RET(__wt_cell_unpack_copy(session, unpack, &r->bnd[0].key)); } /* @@ -1965,6 +1972,7 @@ static int __rec_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_missing) { WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_KV *val; WT_RECONCILE *r; @@ -1972,6 +1980,7 @@ __rec_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_missing) uint32_t i; r = S2C(session)->cache->rec; + unpack = &_unpack; val = &r->v; WT_RET(__rec_split_init(session, page, @@ -2002,13 +2011,14 @@ __rec_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_missing) * Get a reference to the value: it's a deleted cell, an update, * or the original on-page item. */ - cell = WT_COL_PTR(page, cip); + if ((cell = WT_COL_PTR(page, cip)) != NULL) + __wt_cell_unpack(session, cell, unpack); if ((upd = WT_COL_UPDATE(page, cip)) == NULL) { if (cell == NULL) __rec_cell_build_deleted(val); else { val->buf.data = cell; - val->buf.size = __wt_cell_len(session, cell); + val->buf.size = unpack->len; val->cell_len = 0; val->len = val->buf.size; } @@ -2018,7 +2028,7 @@ __rec_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_missing) * file space. 
*/ if (cell != NULL) - WT_RET(__rec_discard_add_ovfl(session, cell)); + WT_RET(__rec_discard_add_ovfl(session, unpack)); /* * Check for deletion, else build the value's WT_CELL @@ -2092,6 +2102,7 @@ static int __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_IKEY *ikey; WT_KV *key, *val; WT_PAGE *rp; @@ -2101,6 +2112,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) int ovfl_key; r = S2C(session)->cache->rec; + unpack = &_unpack; key = &r->k; val = &r->v; @@ -2129,7 +2141,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) * The value cells all look the same -- we can set it up once and then * just reset the addr/size pairs we're writing after the cell. */ - __wt_cell_set_fixed(&val->cell, WT_CELL_OFF, &val->cell_len); + __wt_cell_pack_fixed(&val->cell, WT_CELL_OFF, &val->cell_len); val->buf.data = &val->off; val->buf.size = WT_SIZEOF32(WT_OFF); val->len = val->cell_len + WT_SIZEOF32(WT_OFF); @@ -2143,8 +2155,12 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) * the key's WT_CELL reference was set. */ ikey = rref->key; - cell = (ikey->cell_offset == 0) ? - NULL : WT_REF_OFFSET(page, ikey->cell_offset); + if (ikey->cell_offset == 0) + cell = NULL; + else { + cell = WT_REF_OFFSET(page, ikey->cell_offset); + __wt_cell_unpack(session, cell, unpack); + } /* * The page may be deleted or internally created during a split. @@ -2185,7 +2201,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) /* Delete overflow keys for merged pages. 
*/ if (cell != NULL) WT_RET(__rec_discard_add_ovfl( - session, cell)); + session, unpack)); /* Merge split subtrees */ if (F_ISSET(rp, WT_PAGE_MERGE)) { @@ -2209,15 +2225,14 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) */ if (cell != NULL) { key->buf.data = cell; - key->buf.size = __wt_cell_len(session, cell); + key->buf.size = unpack->len; key->cell_len = 0; key->len = key->buf.size; ovfl_key = 1; - } else { + } else WT_RET(__rec_cell_build_key(session, - WT_IKEY_DATA(ikey), r->cell_zero ? 1 : ikey->size)); - ovfl_key = __wt_cell_type_is_ovfl(&key->cell); - } + WT_IKEY_DATA(ikey), + r->cell_zero ? 1 : ikey->size, &ovfl_key)); r->cell_zero = 0; /* @@ -2236,10 +2251,9 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__rec_split(session)); r->key_pfx_compress = 0; - if (!ovfl_key) { - WT_RET(__rec_cell_build_key(session, NULL, 0)); - ovfl_key = __wt_cell_type_is_ovfl(&key->cell); - } + if (!ovfl_key) + WT_RET(__rec_cell_build_key( + session, NULL, 0, &ovfl_key)); } /* Copy the key onto the page. */ @@ -2309,10 +2323,9 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_PAGE *page) */ ikey = r->merge_ref == NULL ? rref->key : r->merge_ref->key; r->merge_ref = NULL; - WT_RET(__rec_cell_build_key(session, - WT_IKEY_DATA(ikey), r->cell_zero ? 1 : ikey->size)); + WT_RET(__rec_cell_build_key(session, WT_IKEY_DATA(ikey), + r->cell_zero ? 1 : ikey->size, &ovfl_key)); r->cell_zero = 0; - ovfl_key = __wt_cell_type_is_ovfl(&key->cell); /* * Boundary, split or write the page. If the K/V pair doesn't @@ -2324,10 +2337,9 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__rec_split(session)); r->key_pfx_compress = 0; - if (!ovfl_key) { - WT_RET(__rec_cell_build_key(session, NULL, 0)); - ovfl_key = __wt_cell_type_is_ovfl(&key->cell); - } + if (!ovfl_key) + WT_RET(__rec_cell_build_key( + session, NULL, 0, &ovfl_key)); } /* Copy the key onto the page. 
*/ @@ -2358,6 +2370,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_skip) { WT_BUF *tmp; WT_CELL *cell, *val_cell; + WT_CELL_UNPACK *unpack, _unpack; WT_IKEY *ikey; WT_INSERT *ins; WT_KV *key, *val; @@ -2368,6 +2381,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_skip) int ovfl_key, ret; r = S2C(session)->cache->rec; + unpack = &_unpack; tmp = NULL; ret = 0; @@ -2430,19 +2444,19 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_skip) cell = rip->key; } /* Build value cell. */ + if ((val_cell = __wt_row_value(session, page, rip)) != NULL) + __wt_cell_unpack(session, val_cell, unpack); if ((upd = WT_ROW_UPDATE(page, rip)) == NULL) { /* * Copy the item off the page -- however, when the page * was read into memory, there may not have been a value * item, that is, it may have been zero length. */ - if ((val_cell = - __wt_row_value(session, page, rip)) == NULL) + if (val_cell == NULL) val->buf.size = 0; else { val->buf.data = val_cell; - val->buf.size = - __wt_cell_len(session, val_cell); + val->buf.size = unpack->len; } val->cell_len = 0; val->len = val->buf.size; @@ -2451,10 +2465,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_skip) * If we updated an overflow value, free the underlying * file space. */ - if ((val_cell = - __wt_row_value(session, page, rip)) != NULL) - WT_ERR( - __rec_discard_add_ovfl(session, val_cell)); + if (val_cell != NULL) + WT_ERR(__rec_discard_add_ovfl(session, unpack)); /* * If this key/value pair was deleted, we're done. If @@ -2462,7 +2474,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_skip) * space. 
*/ if (WT_UPDATE_DELETED_ISSET(upd)) { - WT_ERR(__rec_discard_add_ovfl(session, cell)); + __wt_cell_unpack(session, cell, unpack); + WT_ERR(__rec_discard_add_ovfl(session, unpack)); goto leaf_insert; } @@ -2484,21 +2497,20 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_skip) * If the key is an overflow item, assume prefix compression * won't make things better, and simply copy it. */ - if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) { + __wt_cell_unpack(session, cell, unpack); + if (unpack->type == WT_CELL_KEY_OVFL) { key->buf.data = cell; - key->buf.size = __wt_cell_len(session, cell); + key->buf.size = unpack->len; key->cell_len = 0; key->len = key->buf.size; ovfl_key = 1; - } else if (ikey != NULL) { - WT_ERR(__rec_cell_build_key( - session, WT_IKEY_DATA(ikey), ikey->size)); - ovfl_key = __wt_cell_type_is_ovfl(&key->cell); - } else { + } else if (ikey != NULL) + WT_ERR(__rec_cell_build_key(session, + WT_IKEY_DATA(ikey), ikey->size, &ovfl_key)); + else { WT_ERR(__wt_row_key(session, page, rip, tmp)); WT_ERR(__rec_cell_build_key( - session, tmp->data, tmp->size)); - ovfl_key = __wt_cell_type_is_ovfl(&key->cell); + session, tmp->data, tmp->size, &ovfl_key)); } /* @@ -2516,16 +2528,15 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t slvg_skip) * We have to have a copy of any overflow key because * we're about to promote it. */ - if (ovfl_key && - __wt_cell_type(cell) == WT_CELL_KEY_OVFL) - WT_RET(__wt_cell_copy(session, cell, r->full)); + if (ovfl_key && unpack->type == WT_CELL_KEY_OVFL) + WT_RET(__wt_cell_unpack_copy( + session, unpack, r->full)); WT_ERR(__rec_split(session)); r->key_pfx_compress = 0; - if (!ovfl_key) { - WT_ERR(__rec_cell_build_key(session, NULL, 0)); - ovfl_key = __wt_cell_type_is_ovfl(&key->cell); - } + if (!ovfl_key) + WT_ERR(__rec_cell_build_key( + session, NULL, 0, &ovfl_key)); } /* Copy the key/value pair onto the page. 
*/ @@ -2575,9 +2586,8 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_INSERT *ins) WT_RET(__rec_cell_build_val( session, WT_UPDATE_DATA(upd), upd->size)); - WT_RET(__rec_cell_build_key( /* Build key cell. */ - session, WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins))); - ovfl_key = __wt_cell_type_is_ovfl(&key->cell); + WT_RET(__rec_cell_build_key(session, /* Build key cell. */ + WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); /* * Boundary, split or write the page. If the K/V pair doesn't @@ -2593,10 +2603,9 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_INSERT *ins) WT_RET(__rec_split(session)); r->key_pfx_compress = 0; - if (!ovfl_key) { - WT_RET(__rec_cell_build_key(session, NULL, 0)); - ovfl_key = __wt_cell_type_is_ovfl(&key->cell); - } + if (!ovfl_key) + WT_RET(__rec_cell_build_key( + session, NULL, 0, &ovfl_key)); } /* Copy the key/value pair onto the page. */ @@ -2867,7 +2876,7 @@ __rec_parent_update(WT_SESSION_IMPL *session, WT_PAGE *page, static inline void __rec_cell_build_deleted(WT_KV *val) { - __wt_cell_set_fixed(&val->cell, WT_CELL_DEL, &val->cell_len); + __wt_cell_pack_fixed(&val->cell, WT_CELL_DEL, &val->cell_len); val->buf.size = 0; val->len = val->cell_len; } @@ -2878,7 +2887,8 @@ __rec_cell_build_deleted(WT_KV *val) * stored on the page. */ static int -__rec_cell_build_key(WT_SESSION_IMPL *session, const void *data, uint32_t size) +__rec_cell_build_key( + WT_SESSION_IMPL *session, const void *data, uint32_t size, int *is_ovflp) { WT_BTREE *btree; WT_KV *key; @@ -2889,6 +2899,7 @@ __rec_cell_build_key(WT_SESSION_IMPL *session, const void *data, uint32_t size) r = S2C(session)->cache->rec; btree = session->btree; key = &r->k; + *is_ovflp = 0; pfx = 0; if (data == NULL) @@ -2943,12 +2954,15 @@ __rec_cell_build_key(WT_SESSION_IMPL *session, const void *data, uint32_t size) * Overflow objects aren't prefix compressed -- rebuild any * object that was prefix compressed. */ - return ((pfx == 0) ? 
- __rec_cell_build_ovfl(session, key, WT_CELL_KEY_OVFL) : - __rec_cell_build_key(session, NULL, 0)); + if (pfx == 0) { + *is_ovflp = 1; + return (__rec_cell_build_ovfl( + session, key, WT_CELL_KEY_OVFL)); + } + return (__rec_cell_build_key(session, NULL, 0, is_ovflp)); } - __wt_cell_set(session, + __wt_cell_pack(session, &key->cell, WT_CELL_KEY, pfx, key->buf.size, &key->cell_len); key->len = key->cell_len + key->buf.size; @@ -2980,7 +2994,7 @@ __rec_cell_build_val(WT_SESSION_IMPL *session, void *data, uint32_t size) /* Handle zero-length cells quickly. */ if (size == 0) { - __wt_cell_set( + __wt_cell_pack( session, &val->cell, WT_CELL_DATA, 0, 0, &val->cell_len); val->len = val->cell_len + val->buf.size; return (0); @@ -2998,7 +3012,7 @@ __rec_cell_build_val(WT_SESSION_IMPL *session, void *data, uint32_t size) return (__rec_cell_build_ovfl(session, val, WT_CELL_DATA_OVFL)); } - __wt_cell_set(session, + __wt_cell_pack(session, &val->cell, WT_CELL_DATA, 0, val->buf.size, &val->cell_len); val->len = val->cell_len + val->buf.size; return (0); @@ -3046,7 +3060,7 @@ __rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_KV *kv, u_int type) /* Set the callers K/V to reference the WT_OFF structure. */ kv->buf.data = &kv->off; kv->buf.size = sizeof(kv->off); - __wt_cell_set_fixed(&kv->cell, type, &kv->cell_len); + __wt_cell_pack_fixed(&kv->cell, type, &kv->cell_len); kv->len = kv->cell_len + kv->buf.size; ret = __wt_disk_write(session, dsk, addr, size); @@ -3391,23 +3405,6 @@ __rec_imref_add(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * __rec_discard_add_ovfl -- - * If the cell argument references an overflow chunk, schedule it for - * discard. 
- */ -static inline int -__rec_discard_add_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell) -{ - WT_OFF ovfl; - - if (__wt_cell_type_is_ovfl(cell)) { - __wt_cell_off(session, cell, &ovfl); - return (__rec_discard_add(session, NULL, ovfl.addr, ovfl.size)); - } - return (0); -} - -/* * __rec_discard_init -- * Initialize the list of discard objects. */ @@ -3418,6 +3415,18 @@ __rec_discard_init(WT_RECONCILE *r) } /* + * __rec_discard_add_ovfl -- + * If the cell argument references an overflow chunk, schedule it for + * discard. + */ +static inline int +__rec_discard_add_ovfl(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack) +{ + return (unpack->ovfl ? __rec_discard_add( + session, NULL, unpack->off.addr, unpack->off.size) : 0); +} + +/* * __rec_discard_add -- * Append an object to the list of discard objects. */ diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index e41027d1b09..ca57bd053c4 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -16,6 +16,7 @@ __wt_return_data( WT_SESSION_IMPL *session, WT_ITEM *key, WT_ITEM *value, int key_return) { WT_BTREE *btree; + WT_CELL_UNPACK *unpack, _unpack; WT_CURSOR *cursor; WT_IKEY *ikey; WT_ITEM local_key, local_value; @@ -31,7 +32,8 @@ __wt_return_data( btree = session->btree; cursor = session->cursor; - callback = NULL; /* TODO: was value->callback */ + callback = NULL; /* TODO: was value->callback */ + unpack = &_unpack; ret = 0; page = session->srch_page; @@ -120,15 +122,17 @@ __wt_return_data( break; case WT_PAGE_COL_VAR: cell = cipdata; -page_cell: if (btree->huffman_value == NULL && - __wt_cell_type(cell) == WT_CELL_DATA) - __wt_cell_data_and_len(session, - cell, &value_ret, &size_ret); - else { - WT_RET(__wt_cell_copy( - session, cell, &cursor->value)); + +page_cell: __wt_cell_unpack(session, cell, unpack); + if (btree->huffman_value != NULL || + unpack->type != WT_CELL_DATA) { + WT_RET(__wt_cell_unpack_copy( + session, unpack, &cursor->value)); value_ret = cursor->value.data; size_ret = cursor->value.size; + 
} else { + value_ret = unpack->data; + size_ret = unpack->size; } break; WT_ILLEGAL_FORMAT(session); @@ -138,15 +142,15 @@ page_cell: if (btree->huffman_value == NULL && /* * When we get here, value_ret and size_ret are set to the byte string * and the length we're going to return. That byte string has been - * decoded, we called __wt_cell_copy above in all cases where the item - * could be encoded. + * decoded, we called __wt_cell_unpack_copy above in all cases where an + * item could be encoded. */ if (callback == NULL) { /* * We're copying the key/value pair out to the caller. If we * haven't copied the value_ret/size_ret pair into the return - * WT_ITEM yet (potentially done by __wt_cell_copy()), do that - * now. + * WT_ITEM yet (potentially done by __wt_cell_unpack_copy), do + * that now. */ if (value_ret != cursor->value.data) WT_RET(__wt_buf_set( diff --git a/src/btree/bt_salvage.c b/src/btree/bt_salvage.c index 6509c1f9954..3c84939de02 100644 --- a/src/btree/bt_salvage.c +++ b/src/btree/bt_salvage.c @@ -619,17 +619,21 @@ static int __slvg_trk_leaf_ovfl(WT_SESSION_IMPL *session, WT_PAGE_DISK *dsk, WT_TRACK *trk) { WT_CELL *cell; - WT_OFF ovfl; + WT_CELL_UNPACK *unpack, _unpack; uint32_t i, ovfl_cnt; + unpack = &_unpack; + /* * Two passes: count the overflow items, then copy them into an * allocated array. 
*/ ovfl_cnt = 0; - WT_CELL_FOREACH(session, dsk, cell, i) - if (__wt_cell_type_is_ovfl(cell)) + WT_CELL_FOREACH(session, dsk, cell, unpack, i) { + __wt_cell_unpack(session, cell, unpack); + if (unpack->ovfl) ++ovfl_cnt; + } if (ovfl_cnt == 0) return (0); @@ -637,20 +641,22 @@ __slvg_trk_leaf_ovfl(WT_SESSION_IMPL *session, WT_PAGE_DISK *dsk, WT_TRACK *trk) trk->ovfl_cnt = ovfl_cnt; ovfl_cnt = 0; - WT_CELL_FOREACH(session, dsk, cell, i) - if (__wt_cell_type_is_ovfl(cell)) { - __wt_cell_off(session, cell, &ovfl); - trk->ovfl[ovfl_cnt].addr = ovfl.addr; - trk->ovfl[ovfl_cnt].size = ovfl.size; + WT_CELL_FOREACH(session, dsk, cell, unpack, i) { + __wt_cell_unpack(session, cell, unpack); + if (unpack->ovfl) { + trk->ovfl[ovfl_cnt].addr = unpack->off.addr; + trk->ovfl[ovfl_cnt].size = unpack->off.size; WT_VERBOSE(session, SALVAGE, "[%" PRIu32 "] overflow reference [%" PRIu32 "/%" PRIu32 "]", - trk->addr, ovfl.addr, ovfl.size); + trk->addr, unpack->off.addr, unpack->off.size); if (++ovfl_cnt == trk->ovfl_cnt) break; } + } + return (0); } @@ -1220,21 +1226,23 @@ static int __slvg_col_merge_ovfl(WT_SESSION_IMPL *session, uint32_t addr, WT_PAGE *page, uint32_t start, uint32_t stop) { + WT_CELL_UNPACK *unpack, _unpack; WT_CELL *cell; WT_COL *cip; - WT_OFF ovfl; + + unpack = &_unpack; for (cip = page->u.col_leaf.d + start; start < stop; ++start) { cell = WT_COL_PTR(page, cip); - if (__wt_cell_type(cell) == WT_CELL_DATA_OVFL) { - __wt_cell_off(session, cell, &ovfl); - + __wt_cell_unpack(session, cell, unpack); + if (unpack->type == WT_CELL_DATA_OVFL) { WT_VERBOSE(session, SALVAGE, "[%" PRIu32 "] merge discard freed overflow " "reference [%" PRIu32 "/%" PRIu32 "]", - addr, ovfl.addr, ovfl.size); + addr, unpack->off.addr, unpack->off.size); - WT_RET(__wt_block_free(session, ovfl.addr, ovfl.size)); + WT_RET(__wt_block_free( + session, unpack->off.addr, unpack->off.size)); } } return (0); @@ -1781,35 +1789,39 @@ __slvg_row_merge_ovfl(WT_SESSION_IMPL *session, uint32_t addr, WT_PAGE 
*page, uint32_t start, uint32_t stop) { WT_CELL *cell; - WT_OFF ovfl; + WT_CELL_UNPACK *unpack, _unpack; WT_ROW *rip; + unpack = &_unpack; + for (rip = page->u.row_leaf.d + start; start < stop; ++start) { if (__wt_off_page(page, rip->key)) cell = WT_REF_OFFSET( page, ((WT_IKEY *)rip->key)->cell_offset); else cell = rip->key; - if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) { - __wt_cell_off(session, cell, &ovfl); - + __wt_cell_unpack(session, cell, unpack); + if (unpack->type == WT_CELL_KEY_OVFL) { WT_VERBOSE(session, SALVAGE, "[%" PRIu32 "] merge discard freed overflow " "reference [%" PRIu32 "/%" PRIu32 "]", - addr, ovfl.addr, ovfl.size); + addr, unpack->off.addr, unpack->off.size); - WT_RET(__wt_block_free(session, ovfl.addr, ovfl.size)); + WT_RET(__wt_block_free( + session, unpack->off.addr, unpack->off.size)); } - if ((cell = __wt_row_value(session, page, rip)) != NULL && - __wt_cell_type(cell) == WT_CELL_DATA_OVFL) { - __wt_cell_off(session, cell, &ovfl); + if ((cell = __wt_row_value(session, page, rip)) == NULL) + continue; + __wt_cell_unpack(session, cell, unpack); + if (unpack->type == WT_CELL_DATA_OVFL) { WT_VERBOSE(session, SALVAGE, "[%" PRIu32 "] merge discard freed overflow " "reference [%" PRIu32 "/%" PRIu32 "]", - addr, ovfl.addr, ovfl.size); + addr, unpack->off.addr, unpack->off.size); - WT_RET(__wt_block_free(session, ovfl.addr, ovfl.size)); + WT_RET(__wt_block_free( + session, unpack->off.addr, unpack->off.size)); } } return (0); diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 712ba1a2b23..013fa9cea60 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -152,12 +152,14 @@ static int __wt_stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE_FILE_STATS *stats; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_UPDATE *upd; uint32_t i; - void *cipdata; stats = session->btree->fstats; + unpack = &_unpack; /* * Walk the page, counting regular and overflow data items, and checking @@ -167,12 +169,12 
@@ __wt_stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) * there's Huffman encoding). */ WT_COL_FOREACH(page, cip, i) { - cipdata = WT_COL_PTR(page, cip); - if (cipdata == NULL) { + if ((cell = WT_COL_PTR(page, cip)) == NULL) { WT_STAT_INCR(stats, file_item_col_deleted); continue; } - switch (__wt_cell_type(cipdata)) { + __wt_cell_unpack(session, cell, unpack); + switch (unpack->type) { case WT_CELL_DATA: case WT_CELL_DATA_OVFL: upd = WT_COL_UPDATE(page, cip); diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index 9a54ae7130a..0025d89effe 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -30,8 +30,9 @@ static int __wt_verify_addfrag( static int __wt_verify_checkfrag(WT_SESSION_IMPL *, WT_VSTUFF *); static int __wt_verify_freelist(WT_SESSION_IMPL *, WT_VSTUFF *); static int __wt_verify_int(WT_SESSION_IMPL *, int); -static int __wt_verify_overflow(WT_SESSION_IMPL *, WT_PAGE *, WT_VSTUFF *); -static int __wt_verify_overflow_cell(WT_SESSION_IMPL *, WT_CELL *, WT_VSTUFF *); +static int __wt_verify_overflow( + WT_SESSION_IMPL *, uint32_t, uint32_t, WT_VSTUFF *); +static int __wt_verify_overflow_cell(WT_SESSION_IMPL *, WT_PAGE *, WT_VSTUFF *); static int __wt_verify_row_int_key_order( WT_SESSION_IMPL *, WT_PAGE *, WT_ROW_REF *, uint32_t, WT_VSTUFF *); static int __wt_verify_row_leaf_key_order( @@ -295,7 +296,7 @@ recno_chk: if (parent_recno != recno) { case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - WT_RET(__wt_verify_overflow(session, page, vs)); + WT_RET(__wt_verify_overflow_cell(session, page, vs)); break; } @@ -460,17 +461,20 @@ __wt_verify_row_leaf_key_order( } /* - * __wt_verify_overflow -- + * __wt_verify_overflow_cell -- * Verify any overflow cells on the page. 
*/ static int -__wt_verify_overflow( +__wt_verify_overflow_cell( WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs) { WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; WT_PAGE_DISK *dsk; uint32_t i; + unpack = &_unpack; + /* * Row-store internal page disk images are discarded when there's no * overflow items on the page. If there's no disk image, we're done. @@ -481,13 +485,16 @@ __wt_verify_overflow( } /* Walk the disk page, verifying pages referenced by overflow cells. */ - WT_CELL_FOREACH(session, dsk, cell, i) - switch (__wt_cell_type(cell)) { + WT_CELL_FOREACH(session, dsk, cell, unpack, i) { + __wt_cell_unpack(session, cell, unpack); + switch (unpack->type) { case WT_CELL_KEY_OVFL: case WT_CELL_DATA_OVFL: - WT_RET(__wt_verify_overflow_cell(session, cell, vs)); + WT_RET(__wt_verify_overflow( + session, unpack->off.addr, unpack->off.size, vs)); break; } + } return (0); } @@ -496,22 +503,16 @@ __wt_verify_overflow( * Read in an overflow page and check it. */ static int -__wt_verify_overflow_cell( - WT_SESSION_IMPL *session, WT_CELL *cell, WT_VSTUFF *vs) +__wt_verify_overflow( + WT_SESSION_IMPL *session, uint32_t addr, uint32_t size, WT_VSTUFF *vs) { WT_BUF *tmp; - WT_OFF ovfl; WT_PAGE_DISK *dsk; - uint32_t addr, size; int ret; tmp = NULL; ret = 0; - __wt_cell_off(session, cell, &ovfl); - addr = ovfl.addr; - size = ovfl.size; - /* Allocate enough memory to hold the overflow pages. 
*/ WT_RET(__wt_scr_alloc(session, size, &tmp)); diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c index 0494e7cc4ad..5d6fe42f49b 100644 --- a/src/btree/bt_vrfy_dsk.c +++ b/src/btree/bt_vrfy_dsk.c @@ -7,13 +7,12 @@ #include "wt_internal.h" -static int __wt_err_cell_vs_page( - WT_SESSION_IMPL *, uint32_t, uint32_t, WT_CELL *, WT_PAGE_DISK *, int); +static int __wt_err_cell_corrupted(WT_SESSION_IMPL *, uint32_t, uint32_t, int); +static int __wt_err_cell_type( + WT_SESSION_IMPL *, uint32_t, uint32_t, uint8_t, WT_PAGE_DISK *, int); static int __wt_err_delfmt(WT_SESSION_IMPL *, uint32_t, uint32_t, int); static int __wt_err_eof(WT_SESSION_IMPL *, uint32_t, uint32_t, int); static int __wt_err_eop(WT_SESSION_IMPL *, uint32_t, uint32_t, int); -static int __wt_verify_cell( - WT_SESSION_IMPL *, WT_CELL *, uint32_t, uint32_t, uint8_t *, int); static int __wt_verify_dsk_col_fix( WT_SESSION_IMPL *, WT_PAGE_DISK *, uint32_t, uint32_t, int); static int __wt_verify_dsk_col_int( @@ -141,11 +140,11 @@ __wt_verify_dsk_row(WT_SESSION_IMPL *session, WT_BTREE *btree; WT_BUF *current, *last, *last_pfx, *last_ovfl; WT_CELL *cell; - WT_OFF off; + WT_CELL_UNPACK *unpack, _unpack; enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; off_t file_size; - void *huffman, *data; - uint32_t cell_num, cell_type, data_size, i, prefix; + void *huffman; + uint32_t cell_num, cell_type, i, prefix; uint8_t *end; int ret; int (*func)(WT_BTREE *, const WT_ITEM *, const WT_ITEM *); @@ -153,6 +152,7 @@ __wt_verify_dsk_row(WT_SESSION_IMPL *session, btree = session->btree; func = btree->btree_compare; huffman = btree->huffman_key; + unpack = &_unpack; ret = 0; current = last_pfx = last_ovfl = NULL; @@ -166,30 +166,38 @@ __wt_verify_dsk_row(WT_SESSION_IMPL *session, last_cell_type = FIRST; cell_num = 0; - WT_CELL_FOREACH(session, dsk, cell, i) { + WT_CELL_FOREACH(session, dsk, cell, unpack, i) { ++cell_num; - /* Check the cell itself. 
*/ - WT_ERR(__wt_verify_cell( - session, cell, cell_num, addr, end, quiet)); + /* Carefully unpack the cell. */ + if (__wt_cell_unpack_safe(session, cell, unpack, end) != 0) { + ret = __wt_err_cell_corrupted( + session, cell_num, addr, quiet); + goto err; + } /* Check the cell type. */ - cell_type = __wt_cell_type(cell); + cell_type = unpack->raw; switch (cell_type) { case WT_CELL_DATA: case WT_CELL_DATA_OVFL: + case WT_CELL_DATA_SHORT: case WT_CELL_KEY: case WT_CELL_KEY_OVFL: + case WT_CELL_KEY_SHORT: break; case WT_CELL_OFF: if (dsk->type == WT_PAGE_ROW_INT) break; /* FALLTHROUGH */ default: - return (__wt_err_cell_vs_page( - session, cell_num, addr, cell, dsk, quiet)); + return (__wt_err_cell_type( + session, cell_num, addr, unpack->type, dsk, quiet)); } + /* Collapse the short key/data types. */ + cell_type = unpack->type; + /* * Check ordering relationships between the WT_CELL entries. * For row-store internal pages, check for: @@ -245,9 +253,8 @@ __wt_verify_dsk_row(WT_SESSION_IMPL *session, case WT_CELL_DATA_OVFL: case WT_CELL_KEY_OVFL: case WT_CELL_OFF: - __wt_cell_off(session, cell, &off); if (WT_ADDR_TO_OFF(btree, - off.addr) + off.size > file_size) + unpack->off.addr) + unpack->off.size > file_size) goto eof; break; } @@ -263,7 +270,7 @@ __wt_verify_dsk_row(WT_SESSION_IMPL *session, case WT_CELL_KEY: break; case WT_CELL_KEY_OVFL: - WT_ERR(__wt_cell_copy(session, cell, current)); + WT_ERR(__wt_cell_unpack_copy(session, unpack, current)); goto key_compare; default: /* Not a key -- continue with the next cell. */ @@ -276,7 +283,7 @@ __wt_verify_dsk_row(WT_SESSION_IMPL *session, * Confirm the first non-overflow key on a page has a zero * prefix compression count. 
*/ - prefix = __wt_cell_prefix(cell); + prefix = unpack->prefix; if (last_pfx->size == 0 && prefix != 0) { WT_VRFY_ERR(session, quiet, "the %" PRIu32 " key on page at addr %" PRIu32 @@ -298,30 +305,28 @@ __wt_verify_dsk_row(WT_SESSION_IMPL *session, } /* - * If Huffman decoding, use the heavy-weight __wt_cell_copy() - * code to build the key, up to the prefix. Else, we can do - * it faster internally because we don't have to shuffle memory - * around as much. + * If Huffman decoding required, use the heavy-weight call to + * __wt_cell_unpack_copy() to build the key, up to the prefix. + * Else, we can do it faster internally because we don't have + * to shuffle memory around as much. */ if (huffman == NULL) { /* * Get the cell's data/length and make sure we have * enough buffer space. */ - __wt_cell_data_and_len( - session, cell, &data, &data_size); WT_ERR(__wt_buf_grow( - session, current, prefix + data_size)); + session, current, prefix + unpack->size)); /* Copy the prefix then the data into place. */ if (prefix != 0) memcpy((void *) current->data, last->data, prefix); memcpy((uint8_t *) - current->data + prefix, data, data_size); - current->size = prefix + data_size; + current->data + prefix, unpack->data, unpack->size); + current->size = prefix + unpack->size; } else { - WT_ERR(__wt_cell_copy(session, cell, current)); + WT_ERR(__wt_cell_unpack_copy(session, unpack, current)); /* * If there's a prefix, make sure there's enough buffer @@ -536,25 +541,27 @@ __wt_verify_dsk_col_var(WT_SESSION_IMPL *session, { WT_BTREE *btree; WT_CELL *cell; - WT_OFF off; + WT_CELL_UNPACK *unpack, _unpack; off_t file_size; uint32_t cell_num, cell_type, i; uint8_t *end; btree = session->btree; + unpack = &_unpack; file_size = btree->fh->file_size; end = (uint8_t *)dsk + size; cell_num = 0; - WT_CELL_FOREACH(session, dsk, cell, i) { + WT_CELL_FOREACH(session, dsk, cell, unpack, i) { ++cell_num; - /* Check the cell itself. 
*/ - WT_RET(__wt_verify_cell( - session, cell, cell_num, addr, end, quiet)); + /* Carefully unpack the cell. */ + if (__wt_cell_unpack_safe(session, cell, unpack, end) != 0) + return (__wt_err_cell_corrupted( + session, cell_num, addr, quiet)); /* Check the cell type. */ - cell_type = __wt_cell_type_raw(cell); + cell_type = unpack->raw; switch (cell_type) { case WT_CELL_DATA: case WT_CELL_DATA_OVFL: @@ -562,15 +569,14 @@ __wt_verify_dsk_col_var(WT_SESSION_IMPL *session, case WT_CELL_DEL: break; default: - return (__wt_err_cell_vs_page( - session, cell_num, addr, cell, dsk, quiet)); + return (__wt_err_cell_type( + session, cell_num, addr, unpack->raw, dsk, quiet)); } /* Check if any referenced item is entirely in the file. */ if (cell_type == WT_CELL_DATA_OVFL) { - __wt_cell_off(session, cell, &off); if (WT_ADDR_TO_OFF(btree, - off.addr) + off.size > file_size) + unpack->off.addr) + unpack->off.size > file_size) return (__wt_err_eof( session, cell_num, addr, quiet)); } @@ -617,81 +623,33 @@ __wt_verify_dsk_chunk(WT_SESSION_IMPL *session, } /* - * __wt_verify_cell -- - * Check to see if a cell is safe. + * __wt_err_cell_corrupted -- + * Generic corrupted cell, we couldn't read it. */ static int -__wt_verify_cell(WT_SESSION_IMPL *session, - WT_CELL *cell, uint32_t cell_num, uint32_t addr, uint8_t *end, int quiet) +__wt_err_cell_corrupted( + WT_SESSION_IMPL *session, uint32_t entry_num, uint32_t addr, int quiet) { - uint8_t *p; - - /* - * Check if this cell is on the page, and once we know the cell - * is safe, check if the cell's data is entirely on the page. - * - * Delete and off-page items have known sizes, we don't store length - * bytes. Short key/data items have 6- or 7-bits of size in the - * descriptor byte and no length bytes. In both cases, the data is - * after the single byte WT_CELL. - */ - p = (uint8_t *)cell; - -#ifdef XXX - This code needs a "safe" check of the cell & the cell data. 
- switch (__wt_cell_type_raw(cell)) { - case WT_CELL_DATA_OVFL: - case WT_CELL_DATA_SHORT: - case WT_CELL_DEL: - case WT_CELL_KEY_OVFL: - case WT_CELL_KEY_SHORT: - case WT_CELL_OFF: - p += 1; - break; - case WT_CELL_DATA: - case WT_CELL_KEY: - switch (WT_CELL_BYTES(cell)) { - case WT_CELL_1_BYTE: - p += 2; - break; - case WT_CELL_2_BYTE: - p += 3; - break; - case WT_CELL_3_BYTE: - p += 4; - break; - case WT_CELL_4_BYTE: - default: - p += 5; - break; - } - break; - default: - /* - * Don't worry about illegal types -- our caller will check, - * based on the page type. - */ - break; - } -#endif - if (p > end || (uint8_t *)(cell) + __wt_cell_len(session, cell) > end) - return (__wt_err_eop(session, cell_num, addr, quiet)); - return (0); + WT_VRFY_ERR(session, quiet, + "item %" PRIu32 + " on page at addr %" PRIu32 " is a corrupted cell", + entry_num, addr); + return (WT_ERROR); } /* - * __wt_err_cell_vs_page -- + * __wt_err_cell_type -- * Generic illegal cell type for a particular page type error. 
*/ static int -__wt_err_cell_vs_page(WT_SESSION_IMPL *session, uint32_t entry_num, - uint32_t addr, WT_CELL *cell, WT_PAGE_DISK *dsk, int quiet) +__wt_err_cell_type(WT_SESSION_IMPL *session, uint32_t entry_num, + uint32_t addr, uint8_t cell_type, WT_PAGE_DISK *dsk, int quiet) { WT_VRFY_ERR(session, quiet, "illegal cell and page type combination cell %" PRIu32 " on page at addr %" PRIu32 " is a %s cell on a %s page", entry_num, addr, - __wt_cell_type_string(cell), __wt_page_type_string(dsk->type)); + __wt_cell_type_string(cell_type), __wt_page_type_string(dsk->type)); return (WT_ERROR); } diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index 960dc3a6a6d..11a2c10e95b 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -15,6 +15,7 @@ int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, uint32_t flags) { WT_BTREE *btree; + WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_COL_REF *cref; WT_INSERT *ins; @@ -25,9 +26,10 @@ __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, uint32_t flags) int ret; void *cipdata; - cipdata = NULL; + unpack = &_unpack; cref = NULL; start_recno = 0; + cipdata = NULL; session->srch_page = NULL; /* Return values. 
*/ session->srch_write_gen = 0; @@ -153,9 +155,11 @@ __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, uint32_t flags) } else if (page->type == WT_PAGE_COL_FIX) { if (WT_FIX_DELETE_ISSET(cipdata)) goto notfound; - } else - if (__wt_cell_type(cipdata) == WT_CELL_DEL) + } else { + __wt_cell_unpack(session, cipdata, unpack); + if (unpack->type == WT_CELL_DEL) goto notfound; + } break; case WT_PAGE_COL_RLE: /* diff --git a/src/btree/row_key.c b/src/btree/row_key.c index b3350c7218e..d3a14cdebdf 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -18,14 +18,15 @@ __wt_row_key( { enum { FORWARD, BACKWARD } direction; WT_BUF *tmp; + WT_CELL_UNPACK *unpack, _unpack; WT_IKEY *ikey; WT_ROW *rip; - uint32_t pfx; int is_local, ret, slot_offset; void *key; rip = rip_arg; tmp = NULL; + unpack = &_unpack; /* * If the caller didn't pass us a buffer, create one. We don't use @@ -78,34 +79,23 @@ __wt_row_key( * 1: the test for an on/off page reference. */ if (__wt_off_page(page, key)) { + ikey = key; + /* * If this is the key we originally wanted, we don't * care if we're rolling forward or backward, or if * it's an overflow key or not, it's what we wanted. * Take a copy and wrap up. - * - * If we wanted a different key and this key is not an - * overflow key, it has a valid prefix, we can use it. - * If rolling backward, take a copy of the key and - * switch directions, we can roll forward from this key. - * If rolling forward, replace the key we've been - * building with this key, it's what we would have built - * anyway. - * To summarize, in both cases, take a copy of the key - * and roll forward. 
*/ - ikey = key; - if (slot_offset == 0 || !__wt_cell_type_is_ovfl( - WT_REF_OFFSET(page, ikey->cell_offset))) { + if (slot_offset == 0) { WT_ERR(__wt_buf_set(session, retb, WT_IKEY_DATA(ikey), ikey->size)); - if (slot_offset == 0) - break; - - direction = FORWARD; - goto next; + break; } + __wt_cell_unpack(session, + WT_REF_OFFSET(page, ikey->cell_offset), unpack); + /* * If we wanted a different key and this key is an * overflow key: @@ -116,19 +106,38 @@ __wt_row_key( * done because prefixes skip overflow keys: keep * rolling forward. */ + if (unpack->ovfl) + goto next; + + /* + * If we wanted a different key and this key is not an + * overflow key, it has a valid prefix, we can use it. + * If rolling backward, take a copy of the key and + * switch directions, we can roll forward from this key. + * If rolling forward, replace the key we've been + * building with this key, it's what we would have built + * anyway. + * In short: if it's not an overflow key, take a copy + * and roll forward. + */ + WT_ERR(__wt_buf_set( + session, retb, WT_IKEY_DATA(ikey), ikey->size)); + direction = FORWARD; goto next; } + /* Unpack the key's cell. */ + __wt_cell_unpack(session, key, unpack); + /* 2: the test for an on-page reference to an overflow key. */ - if (__wt_cell_type(key) == WT_CELL_KEY_OVFL) { + if (unpack->type == WT_CELL_KEY_OVFL) { /* * If this is the key we wanted from the start, we don't - * care if it's an overflow key. Flag the target key - * was an overflow key: the serialization function needs - * to know so it can update the overflow bit array. + * care if it's an overflow key, get a copy and wrap up. */ if (slot_offset == 0) { - WT_ERR(__wt_cell_copy(session, key, retb)); + WT_ERR(__wt_cell_unpack_copy( + session, unpack, retb)); break; } @@ -149,7 +158,7 @@ __wt_row_key( * 3: the test for an on-page reference to a key that isn't * prefix compressed. 
*/ - if ((pfx = __wt_cell_prefix(key)) == 0) { + if (unpack->prefix == 0) { /* * If this is the key we originally wanted, we don't * care if we're rolling forward or backward, it's @@ -163,7 +172,7 @@ __wt_row_key( * found this key while rolling backwards and switched * directions then. */ - WT_ERR(__wt_cell_copy(session, key, retb)); + WT_ERR(__wt_cell_unpack_copy(session, unpack, retb)); if (slot_offset == 0) break; @@ -188,11 +197,11 @@ __wt_row_key( */ if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); - WT_ERR(__wt_cell_copy(session, key, tmp)); - WT_ERR( - __wt_buf_initsize(session, retb, tmp->size + pfx)); + WT_ERR(__wt_cell_unpack_copy(session, unpack, tmp)); + WT_ERR(__wt_buf_initsize( + session, retb, tmp->size + unpack->prefix)); memcpy((uint8_t *) - retb->data + pfx, tmp->data, tmp->size); + retb->data + unpack->prefix, tmp->data, tmp->size); if (slot_offset == 0) break; @@ -255,6 +264,9 @@ WT_CELL * __wt_row_value(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip) { WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + + unpack = &_unpack; /* * Passed both WT_ROW_REF and WT_ROW structures; the first field of each @@ -283,7 +295,8 @@ __wt_row_value(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip) * key. The page reconciliation code guarantees there is always a key * cell after an empty data cell, so this is safe. */ - cell = __wt_cell_next(session, cell); + __wt_cell_unpack(session, cell, unpack); + cell = (WT_CELL *)((uint8_t *)cell + unpack->len); if (__wt_cell_type(cell) == WT_CELL_KEY || __wt_cell_type(cell) == WT_CELL_KEY_OVFL) return (NULL); diff --git a/src/include/btree.h b/src/include/btree.h index c0b8a4ead1d..0f90e7a58b0 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -676,14 +676,6 @@ struct __wt_insert { NULL : (page)->u.row_leaf.upd[WT_ROW_SLOT(page, ip)]) /* - * WT_CELL -- - * Variable-length cell type; see include/cell.i for more information. 
- */ -struct __wt_cell { - uint8_t __chunk[6]; /* cell: 1 to 6 bytes */ -}; - -/* * WT_OFF -- * Row-store internal pages reference subtrees with no record count, and * row- and column-store overflow key and data items. diff --git a/src/include/cell.i b/src/include/cell.i index d6cd193d620..07001c9498c 100644 --- a/src/include/cell.i +++ b/src/include/cell.i @@ -8,18 +8,13 @@ #undef STATIN #define STATIN static inline -STATIN void *__wt_cell_data(WT_SESSION_IMPL *, WT_CELL *); -STATIN void __wt_cell_data_and_len( - WT_SESSION_IMPL *, WT_CELL *, void *, uint32_t *); -STATIN uint32_t __wt_cell_datalen(WT_SESSION_IMPL *, WT_CELL *); -STATIN uint32_t __wt_cell_len(WT_SESSION_IMPL *, WT_CELL *); -STATIN void *__wt_cell_next(WT_SESSION_IMPL *, WT_CELL *); -STATIN void __wt_cell_off(WT_SESSION_IMPL *, WT_CELL *, WT_OFF *); -STATIN u_int __wt_cell_prefix(WT_CELL *); -STATIN void __wt_cell_set_fixed(WT_CELL *, u_int, uint32_t *); -STATIN u_int __wt_cell_type(WT_CELL *); -STATIN int __wt_cell_type_is_ovfl(WT_CELL *); -STATIN u_int __wt_cell_type_raw(WT_CELL *); +STATIN void __wt_cell_pack_fixed(WT_CELL *, u_int, uint32_t *); +STATIN void __wt_cell_pack(WT_SESSION_IMPL *, + WT_CELL *, u_int, u_int, uint32_t, uint32_t *); +STATIN void __wt_cell_unpack( + WT_SESSION_IMPL *, WT_CELL *, WT_CELL_UNPACK *); +STATIN int __wt_cell_unpack_safe( + WT_SESSION_IMPL *, WT_CELL *, WT_CELL_UNPACK *, uint8_t *); /* * WT_CELL -- @@ -101,222 +96,247 @@ STATIN u_int __wt_cell_type_raw(WT_CELL *); #define WT_CELL_OFF (5 << 5) /* Off-page ref */ #define WT_CELL_UNUSED_TYPE6 (6 << 5) /* Unused */ #define WT_CELL_UNUSED_TYPE7 (7 << 5) /* Unused */ +#define WT_CELL_TYPE_MASK (7 << 5) -/* WT_CELL_FOREACH is a loop that walks the cells on a page */ -#define WT_CELL_FOREACH(session, dsk, cell, i) \ - for ((cell) = WT_PAGE_DISK_BYTE(dsk), \ - (i) = (dsk)->u.entries; \ - (i) > 0; (cell) = __wt_cell_next(session, cell), --(i)) +/* + * WT_CELL -- + * Variable-length, on-page cell header. 
+ */ +struct __wt_cell { + /* + * Maximum of 6 bytes: + * 0: descriptor/type + * 1: prefix compression + * 2-5: data-length + */ + uint8_t __chunk[6]; +}; /* - * __wt_cell_set_fixed -- - * Set a WT_CELL's contents based on a fixed-size type. + * WT_CELL_UNPACK -- + * Unpacked cell. + */ +struct __wt_cell_unpack { + uint8_t raw; /* Raw cell type (include "shorts") */ + uint8_t type; /* Cell type */ + + uint8_t prefix; /* Cell prefix */ + + uint8_t ovfl; /* Cell is an overflow */ + + WT_OFF off; /* WT_OFF structure */ + + const void *data; /* Data */ + uint32_t size; /* Data size */ + + uint32_t len; /* Cell + data total length */ +}; + +/* + * WT_CELL_FOREACH -- + * Walk the cells on a page. + */ +#define WT_CELL_FOREACH(session, dsk, cell, unpack, i) \ + for ((cell) = WT_PAGE_DISK_BYTE(dsk), (i) = (dsk)->u.entries; \ + (i) > 0; \ + (cell) = (WT_CELL *)((uint8_t *)cell + (unpack)->len), --(i)) + +/* + * __wt_cell_pack_fixed -- + * Write a WT_CELL's contents based on a fixed-size type. */ static inline void -__wt_cell_set_fixed(WT_CELL *cell, u_int type, uint32_t *cell_lenp) +__wt_cell_pack_fixed(WT_CELL *cell, u_int type, uint32_t *cell_lenp) { cell->__chunk[0] = (u_int8_t)type; *cell_lenp = 1; /* Cell byte */ } /* - * __wt_cell_type_raw -- - * Return the cell's type. + * __wt_cell_pack -- + * Set a WT_CELL's contents based on a type, prefix and data size. */ -static inline u_int -__wt_cell_type_raw(WT_CELL *cell) +static inline void +__wt_cell_pack(WT_SESSION_IMPL *session, + WT_CELL *cell, u_int type, u_int prefix, uint32_t size, uint32_t *cell_lenp) { - if (cell->__chunk[0] & WT_CELL_DATA_SHORT) - return (WT_CELL_DATA_SHORT); - if (cell->__chunk[0] & WT_CELL_KEY_SHORT) - return (WT_CELL_KEY_SHORT); - return (cell->__chunk[0] & (7 << 5)); + uint8_t byte, *p; + + /* + * Delete and off-page items have known sizes, we don't store length + * bytes. Short key/data items have 6- or 7-bits of length in the + * descriptor byte and no length bytes. 
+ */ + WT_ASSERT(session, type == WT_CELL_DATA || type == WT_CELL_KEY); + if (type == WT_CELL_DATA && size < 0x7f) { + /* + * Bit 0 is the WT_CELL_DATA_SHORT flag; the other 7 bits are + * the size. + */ + byte = (uint8_t)size; + cell->__chunk[0] = (byte << 1) | WT_CELL_DATA_SHORT; + *cell_lenp = 1; /* Cell byte */ + return; + } + if (size < 0x3f) { + /* + * Bit 0 is 0, bit 1 is the WT_CELL_KEY_SHORT flag; the other + * 6 bits are the size. + */ + byte = (uint8_t)size; + cell->__chunk[0] = (byte << 2) | WT_CELL_KEY_SHORT; + cell->__chunk[1] = (uint8_t)prefix; + *cell_lenp = 2; /* Cell byte + prefix byte */ + return; + } + + p = cell->__chunk; + *p++ = (uint8_t)type; /* Type */ + + if (type == WT_CELL_KEY) /* Prefix byte */ + *p++ = (uint8_t)prefix; + + /* Pack the data length. */ + (void)__wt_vpack_uint( + session, &p, sizeof(cell->__chunk) - 1, (uint64_t)size); + + *cell_lenp = WT_PTRDIFF32(p, cell); } /* * __wt_cell_type -- - * Return a cell's type, mapping the short types to the normal, on-page - * types. + * Return the cell's type (collapsing "short" types). */ static inline u_int __wt_cell_type(WT_CELL *cell) { + /* + * NOTE: WT_CELL_DATA_SHORT MUST BE CHECKED BEFORE WT_CELL_KEY_SHORT. + */ if (cell->__chunk[0] & WT_CELL_DATA_SHORT) return (WT_CELL_DATA); if (cell->__chunk[0] & WT_CELL_KEY_SHORT) return (WT_CELL_KEY); - return (cell->__chunk[0] & (7 << 5)); + return (cell->__chunk[0] & WT_CELL_TYPE_MASK); } /* - * __wt_cell_type_is_ovfl -- - * Return if a cell references an overflow item. + * __wt_cell_unpack -- + * Unpack a WT_CELL into a structure. */ -static inline int -__wt_cell_type_is_ovfl(WT_CELL *cell) -{ - u_int type; - - type = __wt_cell_type(cell); - return (type == WT_CELL_DATA_OVFL || type == WT_CELL_KEY_OVFL); -} - -/* - * __wt_cell_prefix -- - * Return a cell's prefix-compression value. 
- */ -static inline u_int -__wt_cell_prefix(WT_CELL *cell) +static inline void +__wt_cell_unpack( + WT_SESSION_IMPL *session, WT_CELL *cell, WT_CELL_UNPACK *unpack) { - return (cell->__chunk[1]); + (void)__wt_cell_unpack_safe(session, cell, unpack, NULL); } /* - * __wt_cell_data -- - * Return a reference to the first byte of data for a cell. + * __wt_cell_unpack_safe -- + * Unpack a WT_CELL into a structure during verification. */ -static inline void * -__wt_cell_data(WT_SESSION_IMPL *session, WT_CELL *cell) +static inline int +__wt_cell_unpack_safe(WT_SESSION_IMPL *session, + WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) { uint64_t v; + u_int len; const uint8_t *p; /* - * Delete and off-page items have known sizes, we don't store length - * bytes. Short key/data items have 6- or 7-bits of length in the - * descriptor byte and no length bytes. + * If our caller specifies an "1 past the end-of-buffer" reference, it's + * the verification code and we have to make sure we don't go past the + * end of the buffer when reading. Don't complain on error here, our + * caller will take care of that. + * + * NOTE: WT_CELL_DATA_SHORT MUST BE CHECKED BEFORE WT_CELL_KEY_SHORT. 
*/ - switch (__wt_cell_type_raw(cell)) { - case WT_CELL_DATA_OVFL: - case WT_CELL_DATA_SHORT: - case WT_CELL_DEL: - case WT_CELL_KEY_OVFL: - case WT_CELL_OFF: - return ((uint8_t *)cell + 1); /* Cell byte */ - case WT_CELL_KEY_SHORT: - return ((uint8_t *)cell + 2); /* Cell + prefix byte */ - case WT_CELL_KEY: - p = (uint8_t *)cell + 2; /* Cell + prefix */ - (void)__wt_vunpack_uint( - session, &p, sizeof(cell->__chunk) - 2, &v); - return ((void *)p); - case WT_CELL_DATA: - default: /* Impossible */ - p = (uint8_t *)cell + 1; /* Cell */ - (void)__wt_vunpack_uint( - session, &p, sizeof(cell->__chunk) - 1, &v); - return ((void *)p); - } - /* NOTREACHED */ -} + if (cell->__chunk[0] & WT_CELL_DATA_SHORT) { + unpack->type = WT_CELL_DATA; + unpack->raw = WT_CELL_DATA_SHORT; + } else if (cell->__chunk[0] & WT_CELL_KEY_SHORT) { + unpack->type = WT_CELL_KEY; + unpack->raw = WT_CELL_KEY_SHORT; + } else + unpack->type = + unpack->raw = cell->__chunk[0] & WT_CELL_TYPE_MASK; -/* - * __wt_cell_datalen -- - * Return the number of data bytes referenced by a WT_CELL. - */ -static inline uint32_t -__wt_cell_datalen(WT_SESSION_IMPL *session, WT_CELL *cell) -{ - uint64_t v; - const uint8_t *p; + unpack->prefix = 0; + unpack->ovfl = 0; + p = (uint8_t *)cell + 1; /* skip cell */ /* - * Delete and off-page items have known sizes, we don't store length - * bytes. Short key/data items have 6- or 7-bits of length in the - * descriptor byte and no length bytes. + * Delete and off-page items have known sizes, there's no length bytes. + * Short key/data items have 6- or 7-bits of length in the descriptor + * byte and no length bytes. Normal key/data items have length bytes. 
*/ - switch (__wt_cell_type_raw(cell)) { + switch (unpack->raw) { case WT_CELL_DATA_OVFL: case WT_CELL_KEY_OVFL: + unpack->ovfl = 1; + /* FALLTHROUGH */ case WT_CELL_OFF: - return (sizeof(WT_OFF)); - case WT_CELL_DATA_SHORT: - return (cell->__chunk[0] >> 1); - case WT_CELL_DEL: - return (0); - case WT_CELL_KEY_SHORT: - return (cell->__chunk[0] >> 2); - case WT_CELL_KEY: - p = (uint8_t *)cell + 2; /* Step past cell + prefix */ + if (end != NULL && p + sizeof(WT_OFF) > end) + return (WT_ERROR); + memcpy(&unpack->off, p, sizeof(WT_OFF)); + + unpack->data = NULL; + unpack->size = sizeof(WT_OFF); + unpack->len = 1 + sizeof(WT_OFF); break; - case WT_CELL_DATA: - default: - p = (uint8_t *)cell + 1; /* Step past cell byte */ + case WT_CELL_DEL: + unpack->data = NULL; + unpack->size = 0; + unpack->len = 1; break; - } - - (void)__wt_vunpack_uint(session, &p, sizeof(cell->__chunk) - 1, &v); - return ((uint32_t)v); -} - -/* - * __wt_cell_len -- - * Return the total bytes taken up by a WT_CELL on page, including the - * trailing data. - */ -static inline uint32_t -__wt_cell_len(WT_SESSION_IMPL *session, WT_CELL *cell) -{ - uint64_t v; - const uint8_t *p; - - /* - * Delete and off-page items have known sizes, we don't store length - * bytes. Short key/data items have 6- or 7-bits of length in the - * descriptor byte and no length bytes. 
- */ - switch (__wt_cell_type_raw(cell)) { - case WT_CELL_DATA_OVFL: case WT_CELL_DATA_SHORT: - case WT_CELL_DEL: - case WT_CELL_KEY_OVFL: - case WT_CELL_OFF: /* Cell + data */ - return (1 + __wt_cell_datalen(session, cell)); - case WT_CELL_KEY_SHORT: /* Cell + prefix + data */ - return (2 + __wt_cell_datalen(session, cell)); - case WT_CELL_KEY: - p = &cell->__chunk[2]; /* Cell + prefix */ + unpack->data = p; + unpack->size = cell->__chunk[0] >> 1; + unpack->len = 1 + unpack->size; break; case WT_CELL_DATA: - default: /* Impossible */ - p = &cell->__chunk[1]; /* Cell */ - break; - } + if (end != NULL) { + WT_RET(__wt_vpack_uint_len(p, &len)); + if (end != NULL && p + len > end) + return (WT_ERROR); + } + WT_RET(__wt_vunpack_uint( + session, &p, sizeof(cell->__chunk) - 1, &v)); + if (end != NULL && p + v > end) + return (WT_ERROR); - (void)__wt_vunpack_uint(session, &p, sizeof(cell->__chunk) - 1, &v); - return ((uint32_t)(WT_PTRDIFF32(p, cell->__chunk) + v)); -} + unpack->data = p; + unpack->size = v; + unpack->len = WT_PTRDIFF32(p, cell) + v; + break; + case WT_CELL_KEY_SHORT: + unpack->prefix = cell->__chunk[1]; + ++p; /* skip prefix */ -/* - * __wt_cell_data_and_len -- - * Fill in both the first byte of data for a cell as well as the length. - */ -static inline void -__wt_cell_data_and_len( - WT_SESSION_IMPL *session, WT_CELL *cell, void *p, uint32_t *sizep) -{ - *(void **)p = __wt_cell_data(session, cell); - *sizep = __wt_cell_datalen(session, cell); -} + unpack->data = p; + unpack->size = cell->__chunk[0] >> 2; + unpack->len = 2 + unpack->size; + break; + case WT_CELL_KEY: + unpack->prefix = cell->__chunk[1]; + ++p; /* skip prefix */ -/* - * __wt_cell_off -- - * Copy out a WT_CELL that references a WT_OFF structure. - */ -static inline void -__wt_cell_off(WT_SESSION_IMPL *session, WT_CELL *cell, WT_OFF *off) -{ - /* Version for systems that support unaligned access. 
*/ - *off = *(WT_OFF *)__wt_cell_data(session, cell); -} + if (end != NULL) { + WT_RET(__wt_vpack_uint_len(p, &len)); + if (end != NULL && p + len > end) + return (WT_ERROR); + } + WT_RET(__wt_vunpack_uint( + session, &p, sizeof(cell->__chunk) - 2, &v)); + if (end != NULL && p + v > end) + return (WT_ERROR); -/* - * __wt_cell_next -- - * Return a pointer to the next WT_CELL on the page. - */ -static inline void * -__wt_cell_next(WT_SESSION_IMPL *session, WT_CELL *cell) -{ - return ((u_int8_t *)cell + __wt_cell_len(session, cell)); + unpack->data = p; + unpack->size = v; + unpack->len = WT_PTRDIFF32(p, cell) + v; + break; + default: + return (end == NULL ? __wt_file_format(session) : WT_ERROR); + } + return (0); } - diff --git a/src/include/extern.h b/src/include/extern.h index 97cf30b9fec..6860eee9c78 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -165,15 +165,12 @@ extern int __wt_bulk_end(WT_CURSOR_BULK *cbulk); extern int __wt_cache_create(WT_CONNECTION_IMPL *conn); extern void __wt_cache_stats_update(WT_CONNECTION_IMPL *conn); extern void __wt_cache_destroy(WT_CONNECTION_IMPL *conn); -extern void __wt_cell_set(WT_SESSION_IMPL *session, - WT_CELL *cell, - u_int type, - u_int prefix, - uint32_t size, - uint32_t *cell_lenp); extern int __wt_cell_copy(WT_SESSION_IMPL *session, WT_CELL *cell, WT_BUF *retb); +extern int __wt_cell_unpack_copy( WT_SESSION_IMPL *session, + WT_CELL_UNPACK *unpack, + WT_BUF *retb); extern int __wt_bt_lex_compare( WT_BTREE *btree, const WT_ITEM *user_item, const WT_ITEM *tree_item); @@ -236,7 +233,7 @@ extern int __wt_btree_close(WT_SESSION_IMPL *session); extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session); extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session); extern const char *__wt_page_type_string(u_int type); -extern const char *__wt_cell_type_string(WT_CELL *cell); +extern const char *__wt_cell_type_string(uint8_t type); extern int __wt_ovfl_in(WT_SESSION_IMPL *session, WT_OFF *ovfl, WT_BUF 
*store); extern int __wt_page_in_func( diff --git a/src/include/intpack.i b/src/include/intpack.i index 5ffc6dfb694..8289fa21443 100644 --- a/src/include/intpack.i +++ b/src/include/intpack.i @@ -360,3 +360,34 @@ __wt_vsize_int(int64_t x) /* For non-negative values, use the unsigned code above. */ return (__wt_vsize_uint((uint64_t)x)); } + +/* + * __wt_vpack_uint_len -- + * Returns the length of the variable-length unsigned integer. + */ +static inline int +__wt_vpack_uint_len(const uint8_t *p, u_int *lenp) +{ + /* + * This routine is used by the verification code to ensure it never + * oversteps the buffer because of corruption. + */ + switch (*p & 0xf0) { + case POS_1BYTE_MARKER: + case POS_1BYTE_MARKER | 0x10: + case POS_1BYTE_MARKER | 0x20: + case POS_1BYTE_MARKER | 0x30: + *lenp = 1; + break; + case POS_2BYTE_MARKER: + case POS_2BYTE_MARKER | 0x10: + *lenp = 2; + break; + case POS_MULTI_MARKER: + *lenp = *p & 0xf; + break; + default: + return (WT_ERROR); + } + return (0); +} diff --git a/src/include/wt_internal.in b/src/include/wt_internal.in index 6485f93589e..6de6a11e3ae 100644 --- a/src/include/wt_internal.in +++ b/src/include/wt_internal.in @@ -62,6 +62,8 @@ struct __wt_cache_stats; typedef struct __wt_cache_stats WT_CACHE_STATS; struct __wt_cell; typedef struct __wt_cell WT_CELL; +struct __wt_cell_unpack; + typedef struct __wt_cell_unpack WT_CELL_UNPACK; struct __wt_col; typedef struct __wt_col WT_COL; struct __wt_col_ref; |