diff options
Diffstat (limited to 'src/third_party')
45 files changed, 1365 insertions, 982 deletions
diff --git a/src/third_party/wiredtiger/.clang-format b/src/third_party/wiredtiger/.clang-format index 573572df901..79eb168c0c0 100644 --- a/src/third_party/wiredtiger/.clang-format +++ b/src/third_party/wiredtiger/.clang-format @@ -49,7 +49,8 @@ ForEachMacros: - Q_FOREACH - BOOST_FOREACH - TAILQ_FOREACH - - WT_CELL_FOREACH_BEGIN + - WT_CELL_FOREACH_ADDR + - WT_CELL_FOREACH_KV - WT_CELL_FOREACH_VRFY - WT_CKPT_FOREACH - WT_COL_FOREACH diff --git a/src/third_party/wiredtiger/bench/workgen/runner/prepare_stress.py b/src/third_party/wiredtiger/bench/workgen/runner/prepare_stress.py index a79edf71af8..18751888a03 100755 --- a/src/third_party/wiredtiger/bench/workgen/runner/prepare_stress.py +++ b/src/third_party/wiredtiger/bench/workgen/runner/prepare_stress.py @@ -96,7 +96,7 @@ for i in range(0, table_count): tname = "table:test" + str(i) table = Table(tname) s.create(tname, wtperf_table_config +\ - compress_table_config + table_config) + compress_table_config + table_config + ",log=(enabled=false)") table.options.key_size = 200 table.options.value_size = 5000 tables.append(table) diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index 6c67f53a7ee..71760c1b966 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -26,6 +26,7 @@ WT_BLOCK_HEADER_SIZE WT_CACHE_LINE_ALIGNMENT WT_CACHE_LINE_PAD_BEGIN WT_CACHE_LINE_PAD_END +WT_CELL_COMMON_FIELDS WT_CKPT_BLOCK_MODS WT_CLOCKDIFF_NS WT_CONN_CHECK_PANIC diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index e9422174821..d8fc8e08f4a 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -298,6 +298,7 @@ Obama Opcode Outfmt PARAM +PFX POSIX PPC PREDEFINE diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index a08ef7d90d9..e64af3c37d8 100644 --- 
a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "404b4a70af14e7d3aecf7f206380884af5d06786" + "commit": "bdff12c2331ab0478a22309a6d35519d2e2ca441" } diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 9ea91c6f421..47d0907a0d7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -179,7 +179,7 @@ static inline int __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { WT_CELL *cell; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; WT_COL *cip; WT_INSERT *ins; WT_PAGE *page; @@ -245,7 +245,7 @@ restart_read: */ if (cbt->cip_saved != cip) { cell = WT_COL_PTR(page, cip); - __wt_cell_unpack(session, page, cell, &unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, &unpack); if (unpack.type == WT_CELL_DEL) { if ((rle = __wt_cell_rle(&unpack)) == 1) continue; @@ -295,7 +295,7 @@ restart_read: static inline int __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { - WT_CELL_UNPACK kpack; + WT_CELL_UNPACK_KV kpack; WT_INSERT *ins; WT_ITEM *key; WT_PAGE *page; diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index f8db9cd6233..4d6f62a10b7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -318,7 +318,7 @@ static inline int __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { WT_CELL *cell; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; WT_COL *cip; WT_INSERT *ins; WT_PAGE *page; @@ -385,7 +385,7 @@ restart_read: */ if (cbt->cip_saved != cip) { cell = WT_COL_PTR(page, cip); - __wt_cell_unpack(session, page, cell, &unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, &unpack); if (unpack.type 
== WT_CELL_DEL) { if (__wt_cell_rle(&unpack) == 1) continue; @@ -435,7 +435,7 @@ restart_read: static inline int __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { - WT_CELL_UNPACK kpack; + WT_CELL_UNPACK_KV kpack; WT_INSERT *ins; WT_ITEM *key; WT_PAGE *page; diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index a835e593022..d0fb68ecb03 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -35,12 +35,8 @@ struct __wt_dbg { static const /* Output separator */ char *const sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"; -static int __debug_cell(WT_DBG *, const WT_PAGE_HEADER *, WT_CELL_UNPACK *); -static int __debug_cell_data(WT_DBG *, WT_PAGE *, int, const char *, WT_CELL_UNPACK *); static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool); static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *); -static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *); -static int __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *); static int __debug_modify(WT_DBG *, WT_UPDATE *, const char *); static int __debug_page(WT_DBG *, WT_REF *, uint32_t); static int __debug_page_col_fix(WT_DBG *, WT_REF *); @@ -410,6 +406,237 @@ err: } /* + * __debug_cell_int_data -- + * Dump a single WT_COL_INT or WT_ROW_INT disk image cell's data in debugging mode. + */ +static int +__debug_cell_int_data(WT_DBG *ds, WT_CELL_UNPACK_ADDR *unpack) +{ + const char *p; + + switch (unpack->raw) { + case WT_CELL_ADDR_DEL: + case WT_CELL_ADDR_INT: + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + p = __wt_cell_type_string(unpack->raw); + return (__debug_item(ds, NULL, p, strlen(p))); + } + return (0); +} + +/* + * __debug_cell_int -- + * Dump a single unpacked WT_COL_INT or WT_ROW_INT disk image WT_CELL. 
+ */ +static int +__debug_cell_int(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK_ADDR *unpack) +{ + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_SESSION_IMPL *session; + char time_string[WT_TIME_STRING_SIZE]; + + session = ds->session; + + WT_RET(ds->f(ds, "\t%s: len %" PRIu32, __wt_cell_type_string(unpack->raw), unpack->size)); + + /* Dump the cell's per-disk page type information. */ + switch (dsk->type) { + case WT_PAGE_COL_INT: + WT_RET(ds->f(ds, ", recno: %" PRIu64, unpack->v)); + break; + } + + /* Dump timestamps and addresses. */ + switch (unpack->raw) { + case WT_CELL_ADDR_DEL: + case WT_CELL_ADDR_INT: + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + WT_RET(ds->f(ds, ", %s", __wt_time_aggregate_to_string(&unpack->ta, time_string))); + + WT_RET(__wt_scr_alloc(session, 128, &buf)); + ret = ds->f(ds, ", %s", __wt_addr_string(session, unpack->data, unpack->size, buf)); + __wt_scr_free(session, &buf); + WT_RET(ret); + break; + } + WT_RET(ds->f(ds, "\n")); + + return (__debug_cell_int_data(ds, unpack)); +} + +/* + * __debug_dsk_int -- + * Dump a WT_COL_INT or WT_ROW_INT disk image. + */ +static int +__debug_dsk_int(WT_DBG *ds, const WT_PAGE_HEADER *dsk) +{ + WT_CELL_UNPACK_ADDR unpack; + + WT_CELL_FOREACH_ADDR (ds->session, dsk, unpack) { + WT_RET(__debug_cell_int(ds, dsk, &unpack)); + } + WT_CELL_FOREACH_END; + return (0); +} + +/* + * __debug_cell_kv_data -- + * Dump a single WT_COL_VAR or WT_ROW_LEAF disk image cell's data in debugging mode. + */ +static int +__debug_cell_kv_data( + WT_DBG *ds, WT_PAGE *page, int page_type, const char *tag, WT_CELL_UNPACK_KV *unpack) +{ + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_SESSION_IMPL *session; + const char *p; + + session = ds->session; + + /* + * Column-store references to deleted cells return a NULL cell reference. + */ + if (unpack == NULL) + return (__debug_item(ds, tag, "deleted", strlen("deleted"))); + + /* + * Row-store references to empty cells return a NULL on-page reference. 
+ */ + if (unpack->cell == NULL) + return (__debug_item(ds, tag, "", 0)); + + switch (unpack->raw) { + case WT_CELL_DEL: + p = __wt_cell_type_string(unpack->raw); + return (__debug_item(ds, tag, p, strlen(p))); + } + + WT_RET(__wt_scr_alloc(session, 256, &buf)); + WT_ERR(page == NULL ? __wt_dsk_cell_data_ref(session, page_type, unpack, buf) : + __wt_page_cell_data_ref(session, page, unpack, buf)); + + switch (unpack->raw) { + case WT_CELL_KEY: + case WT_CELL_KEY_OVFL: + case WT_CELL_KEY_PFX: + case WT_CELL_KEY_SHORT: + case WT_CELL_KEY_SHORT_PFX: + WT_ERR(__debug_item_key(ds, tag, buf->data, buf->size)); + break; + case WT_CELL_VALUE: + case WT_CELL_VALUE_COPY: + case WT_CELL_VALUE_OVFL: + case WT_CELL_VALUE_SHORT: + WT_ERR(__debug_item_value(ds, tag, buf->data, buf->size)); + break; + } + +err: + __wt_scr_free(session, &buf); + return (ret); +} + +/* + * __debug_cell_kv -- + * Dump a single unpacked WT_COL_VAR or WT_ROW_LEAF disk image WT_CELL. + */ +static int +__debug_cell_kv(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK_KV *unpack) +{ + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_SESSION_IMPL *session; + char time_string[WT_TIME_STRING_SIZE]; + + session = ds->session; + + WT_RET(ds->f(ds, "\t%s: len %" PRIu32, __wt_cell_type_string(unpack->raw), unpack->size)); + + /* Dump cell's per-disk page type information. */ + switch (dsk->type) { + case WT_PAGE_COL_VAR: + WT_RET(ds->f(ds, ", rle: %" PRIu64, __wt_cell_rle(unpack))); + break; + case WT_PAGE_ROW_LEAF: + switch (unpack->raw) { + case WT_CELL_KEY_PFX: + case WT_CELL_KEY_SHORT_PFX: + WT_RET(ds->f(ds, ", pfx: %" PRIu8, unpack->prefix)); + break; + } + break; + } + + /* Dump timestamps. */ + switch (unpack->raw) { + case WT_CELL_DEL: + case WT_CELL_VALUE: + case WT_CELL_VALUE_COPY: + case WT_CELL_VALUE_OVFL: + case WT_CELL_VALUE_OVFL_RM: + case WT_CELL_VALUE_SHORT: + WT_RET(ds->f(ds, ", %s", __wt_time_window_to_string(&unpack->tw, time_string))); + break; + } + + /* Dump overflow addresses. 
*/ + switch (unpack->raw) { + case WT_CELL_KEY_OVFL: + case WT_CELL_VALUE_OVFL: + WT_RET(__wt_scr_alloc(session, 128, &buf)); + ret = ds->f(ds, ", %s", __wt_addr_string(session, unpack->data, unpack->size, buf)); + __wt_scr_free(session, &buf); + WT_RET(ret); + break; + } + WT_RET(ds->f(ds, "\n")); + + return (__debug_cell_kv_data(ds, NULL, dsk->type, NULL, unpack)); +} + +/* + * __debug_dsk_kv -- + * Dump a WT_COL_VAR or WT_ROW_LEAF disk image. + */ +static int +__debug_dsk_kv(WT_DBG *ds, const WT_PAGE_HEADER *dsk) +{ + WT_CELL_UNPACK_KV unpack; + + WT_CELL_FOREACH_KV (ds->session, dsk, unpack) { + WT_RET(__debug_cell_kv(ds, dsk, &unpack)); + } + WT_CELL_FOREACH_END; + return (0); +} + +/* + * __debug_dsk_col_fix -- + * Dump a WT_PAGE_COL_FIX disk image. + */ +static int +__debug_dsk_col_fix(WT_DBG *ds, const WT_PAGE_HEADER *dsk) +{ + WT_BTREE *btree; + uint32_t i; + uint8_t v; + + btree = S2BT(ds->session); + + WT_FIX_FOREACH (btree, dsk, v, i) { + WT_RET(ds->f(ds, "\t{")); + WT_RET(__debug_hex_byte(ds, v)); + WT_RET(ds->f(ds, "}\n")); + } + return (0); +} + +/* * __wt_debug_disk -- * Dump a disk page in debugging mode. */ @@ -460,10 +687,12 @@ __wt_debug_disk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char WT_ERR(__debug_dsk_col_fix(ds, dsk)); break; case WT_PAGE_COL_INT: - case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: + WT_ERR(__debug_dsk_int(ds, dsk)); + break; + case WT_PAGE_COL_VAR: case WT_PAGE_ROW_LEAF: - WT_ERR(__debug_dsk_cell(ds, dsk)); + WT_ERR(__debug_dsk_kv(ds, dsk)); break; default: break; @@ -475,50 +704,6 @@ err: } /* - * __debug_dsk_col_fix -- - * Dump a WT_PAGE_COL_FIX page. 
- */ -static int -__debug_dsk_col_fix(WT_DBG *ds, const WT_PAGE_HEADER *dsk) -{ - WT_BTREE *btree; - uint32_t i; - uint8_t v; - - WT_ASSERT(ds->session, S2BT_SAFE(ds->session) != NULL); - - btree = S2BT(ds->session); - - WT_FIX_FOREACH (btree, dsk, v, i) { - WT_RET(ds->f(ds, "\t{")); - WT_RET(__debug_hex_byte(ds, v)); - WT_RET(ds->f(ds, "}\n")); - } - return (0); -} - -/* - * __debug_dsk_cell -- - * Dump a page of WT_CELL's. - */ -static int -__debug_dsk_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk) -{ - WT_BTREE *btree; - WT_CELL_UNPACK unpack; - - WT_ASSERT(ds->session, S2BT_SAFE(ds->session) != NULL); - - btree = S2BT(ds->session); - - WT_CELL_FOREACH_BEGIN (ds->session, btree, dsk, unpack) { - WT_RET(__debug_cell(ds, dsk, &unpack)); - } - WT_CELL_FOREACH_END; - return (0); -} - -/* * __debug_tree_shape_info -- * Pretty-print information about a page. */ @@ -787,7 +972,7 @@ err: */ int __wt_debug_key_value( - WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, uint64_t rle, WT_CELL_UNPACK *value) + WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, uint64_t rle, WT_CELL_UNPACK_KV *value) { WT_DBG *ds, _ds; WT_DECL_RET; @@ -801,7 +986,7 @@ __wt_debug_key_value( else WT_ERR(__debug_item_key(ds, "K", key->data, key->size)); WT_ERR(__debug_time_window(ds, "T", &value->tw)); - WT_ERR(__debug_cell_data(ds, NULL, value != NULL ? value->type : 0, "V", value)); + WT_ERR(__debug_cell_kv_data(ds, NULL, value != NULL ? 
value->type : 0, "V", value)); err: return (__debug_wrapup(ds)); @@ -1065,7 +1250,7 @@ static int __debug_page_col_var(WT_DBG *ds, WT_REF *ref) { WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_KV *unpack, _unpack; WT_COL *cip; WT_INSERT_HEAD *update; WT_PAGE *page; @@ -1079,10 +1264,10 @@ __debug_page_col_var(WT_DBG *ds, WT_REF *ref) WT_COL_FOREACH (page, cip, i) { cell = WT_COL_PTR(page, cip); - __wt_cell_unpack(ds->session, page, cell, unpack); + __wt_cell_unpack_kv(ds->session, page->dsk, cell, unpack); rle = __wt_cell_rle(unpack); WT_RET(__wt_snprintf(tag, sizeof(tag), "%" PRIu64 " %" PRIu64, recno, rle)); - WT_RET(__debug_cell_data(ds, page, WT_PAGE_COL_VAR, tag, unpack)); + WT_RET(__debug_cell_kv_data(ds, page, WT_PAGE_COL_VAR, tag, unpack)); if ((update = WT_COL_UPDATE(page, cip)) != NULL) WT_RET(__debug_col_skip(ds, update, "update", false)); @@ -1137,7 +1322,7 @@ __debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags) static int __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) { - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_KV *unpack, _unpack; WT_DECL_ITEM(key); WT_DECL_RET; WT_INSERT_HEAD *insert; @@ -1162,7 +1347,7 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) WT_ERR(__debug_item_key(ds, "K", key->data, key->size)); __wt_row_leaf_value_cell(session, page, rip, NULL, unpack); - WT_ERR(__debug_cell_data(ds, page, WT_PAGE_ROW_LEAF, "V", unpack)); + WT_ERR(__debug_cell_kv_data(ds, page, WT_PAGE_ROW_LEAF, "V", unpack)); if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) WT_ERR(__debug_update(ds, upd, false)); @@ -1361,154 +1546,4 @@ __debug_ref(WT_DBG *ds, WT_REF *ref) __wt_addr_string(session, addr.addr, addr.size, ds->t1))); return (ds->f(ds, "\n")); } - -/* - * __debug_cell -- - * Dump a single unpacked WT_CELL. 
- */ -static int -__debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack) -{ - WT_DECL_ITEM(buf); - WT_DECL_RET; - WT_SESSION_IMPL *session; - char time_string[WT_TIME_STRING_SIZE]; - - session = ds->session; - - WT_RET(ds->f(ds, "\t%s: len %" PRIu32, __wt_cell_type_string(unpack->raw), unpack->size)); - - /* Dump cell's per-disk page type information. */ - switch (dsk->type) { - case WT_PAGE_COL_INT: - switch (unpack->type) { - case WT_CELL_VALUE: - WT_RET(ds->f(ds, ", recno: %" PRIu64, unpack->v)); - break; - } - break; - case WT_PAGE_COL_VAR: - switch (unpack->type) { - case WT_CELL_DEL: - case WT_CELL_KEY_OVFL_RM: - case WT_CELL_VALUE: - case WT_CELL_VALUE_OVFL: - case WT_CELL_VALUE_OVFL_RM: - WT_RET(ds->f(ds, ", rle: %" PRIu64, __wt_cell_rle(unpack))); - break; - } - break; - case WT_PAGE_ROW_INT: - case WT_PAGE_ROW_LEAF: - switch (unpack->type) { - case WT_CELL_KEY: - WT_RET(ds->f(ds, ", pfx: %" PRIu8, unpack->prefix)); - break; - } - break; - } - - /* Dump timestamps. */ - switch (unpack->raw) { - case WT_CELL_ADDR_DEL: - case WT_CELL_ADDR_INT: - case WT_CELL_ADDR_LEAF: - case WT_CELL_ADDR_LEAF_NO: - WT_RET(ds->f(ds, ", %s", __wt_time_aggregate_to_string(&unpack->ta, time_string))); - break; - case WT_CELL_DEL: - case WT_CELL_VALUE: - case WT_CELL_VALUE_COPY: - case WT_CELL_VALUE_OVFL: - case WT_CELL_VALUE_OVFL_RM: - case WT_CELL_VALUE_SHORT: - WT_RET(ds->f(ds, ", %s", __wt_time_window_to_string(&unpack->tw, time_string))); - break; - } - - /* Dump addresses. 
*/ - switch (unpack->raw) { - case WT_CELL_ADDR_DEL: - case WT_CELL_ADDR_INT: - case WT_CELL_ADDR_LEAF: - case WT_CELL_ADDR_LEAF_NO: - case WT_CELL_KEY_OVFL: - case WT_CELL_KEY_OVFL_RM: - case WT_CELL_VALUE_OVFL: - case WT_CELL_VALUE_OVFL_RM: - WT_RET(__wt_scr_alloc(session, 128, &buf)); - ret = ds->f(ds, ", %s", __wt_addr_string(session, unpack->data, unpack->size, buf)); - __wt_scr_free(session, &buf); - WT_RET(ret); - break; - } - WT_RET(ds->f(ds, "\n")); - - return (__debug_cell_data(ds, NULL, dsk->type, NULL, unpack)); -} - -/* - * __debug_cell_data -- - * Dump a single cell's data in debugging mode. - */ -static int -__debug_cell_data(WT_DBG *ds, WT_PAGE *page, int page_type, const char *tag, WT_CELL_UNPACK *unpack) -{ - WT_DECL_ITEM(buf); - WT_DECL_RET; - WT_SESSION_IMPL *session; - const char *p; - - session = ds->session; - - /* - * Column-store references to deleted cells return a NULL cell reference. - */ - if (unpack == NULL) - return (__debug_item(ds, tag, "deleted", strlen("deleted"))); - - /* - * Row-store references to empty cells return a NULL on-page reference. - */ - if (unpack->cell == NULL) - return (__debug_item(ds, tag, "", 0)); - - switch (unpack->raw) { - case WT_CELL_ADDR_DEL: - case WT_CELL_ADDR_INT: - case WT_CELL_ADDR_LEAF: - case WT_CELL_ADDR_LEAF_NO: - case WT_CELL_DEL: - case WT_CELL_KEY_OVFL_RM: - case WT_CELL_VALUE_OVFL_RM: - p = __wt_cell_type_string(unpack->raw); - return (__debug_item(ds, tag, p, strlen(p))); - } - - WT_RET(__wt_scr_alloc(session, 256, &buf)); - WT_ERR(page == NULL ? 
__wt_dsk_cell_data_ref(session, page_type, unpack, buf) : - __wt_page_cell_data_ref(session, page, unpack, buf)); - - switch (unpack->raw) { - case WT_CELL_KEY: - case WT_CELL_KEY_OVFL: - case WT_CELL_KEY_PFX: - case WT_CELL_KEY_SHORT: - case WT_CELL_KEY_SHORT_PFX: - WT_ERR(__debug_item_key(ds, tag, buf->data, buf->size)); - break; - case WT_CELL_VALUE: - case WT_CELL_VALUE_COPY: - case WT_CELL_VALUE_OVFL: - case WT_CELL_VALUE_SHORT: - WT_ERR(__debug_item_value(ds, tag, buf->data, buf->size)); - break; - default: - WT_ERR(__wt_illegal_value(session, unpack->raw)); - } - -err: - __wt_scr_free(session, &buf); - return (ret); -} #endif diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index cccd2c628a3..855272ce1d9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -43,8 +43,8 @@ __ovfl_read(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ * Bring an overflow item into memory. */ int -__wt_ovfl_read( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) +__wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK_COMMON *unpack, + WT_ITEM *store, bool *decoded) { WT_DECL_RET; @@ -65,7 +65,6 @@ __wt_ovfl_read( */ __wt_readlock(session, &S2BT(session)->ovfl_lock); if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) { - WT_ASSERT(session, __wt_txn_visible_all(session, unpack->tw.stop_txn, unpack->tw.stop_ts)); ret = __wt_buf_setstr(session, store, "WT_CELL_VALUE_OVFL_RM"); *decoded = true; } else @@ -80,7 +79,7 @@ __wt_ovfl_read( * Remove an overflow value. */ int -__wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack) +__wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK_KV *unpack) { /* * This function solves two problems in reconciliation. 
@@ -121,13 +120,13 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell) { WT_BM *bm; WT_BTREE *btree; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_KV *unpack, _unpack; btree = S2BT(session); bm = btree->bm; unpack = &_unpack; - __wt_cell_unpack(session, page, cell, unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, unpack); /* * Remove overflow key/value objects, called when reconciliation finishes after successfully diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 1a690b24804..6d1c377a3f0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -265,14 +265,11 @@ __inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page) static void __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_ADDR unpack; WT_PAGE_INDEX *pindex; WT_REF **refp, *ref; uint32_t hint; - btree = S2BT(session); - /* * Walk the page, building references: the page contains value items. The value items are * on-page items (WT_CELL_VALUE). @@ -280,7 +277,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; hint = 0; - WT_CELL_FOREACH_BEGIN (session, btree, page->dsk, unpack) { + WT_CELL_FOREACH_ADDR (session, page->dsk, unpack) { ref = *refp++; ref->home = page; ref->pindex_hint = hint++; @@ -299,15 +296,12 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) static void __inmem_col_var_repeats(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t *np) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; *np = 0; - btree = S2BT(session); - /* Walk the page, counting entries for the repeats array. 
*/ - WT_CELL_FOREACH_BEGIN (session, btree, page->dsk, unpack) { + WT_CELL_FOREACH_KV (session, page->dsk, unpack) { if (__wt_cell_rle(&unpack) > 1) ++*np; } @@ -321,8 +315,7 @@ __inmem_col_var_repeats(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t *np) static int __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t recno, size_t *sizep) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; WT_COL *cip; WT_COL_RLE *repeats; size_t size; @@ -330,8 +323,6 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t recno, size_t uint32_t indx, n, repeat_off; void *p; - btree = S2BT(session); - repeats = NULL; repeat_off = 0; @@ -342,7 +333,7 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t recno, size_t */ indx = 0; cip = page->pg_var; - WT_CELL_FOREACH_BEGIN (session, btree, page->dsk, unpack) { + WT_CELL_FOREACH_KV (session, page->dsk, unpack) { WT_COL_PTR_SET(cip, WT_PAGE_DISK_OFFSET(page, unpack.cell)); cip++; @@ -383,7 +374,7 @@ static int __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) { WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_ADDR unpack; WT_DECL_ITEM(current); WT_DECL_RET; WT_PAGE_INDEX *pindex; @@ -403,7 +394,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) refp = pindex->index; overflow_keys = false; hint = 0; - WT_CELL_FOREACH_BEGIN (session, btree, page->dsk, unpack) { + WT_CELL_FOREACH_ADDR (session, page->dsk, unpack) { ref = *refp; ref->home = page; ref->pindex_hint = hint++; @@ -499,12 +490,9 @@ err: static int __inmem_row_leaf_entries(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint32_t *nindxp) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; uint32_t nindx; - btree = S2BT(session); - /* * Leaf row-store page entries map to a maximum of one-to-one to the number of physical entries * on the page (each physical entry might be a key without a subsequent data item). 
To avoid @@ -516,7 +504,7 @@ __inmem_row_leaf_entries(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, ui * overflow (WT_CELL_VALUE_OVFL) item. */ nindx = 0; - WT_CELL_FOREACH_BEGIN (session, btree, dsk, unpack) { + WT_CELL_FOREACH_KV (session, dsk, unpack) { switch (unpack.type) { case WT_CELL_KEY: case WT_CELL_KEY_OVFL: @@ -543,7 +531,7 @@ static int __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; WT_ITEM buf; WT_ROW *rip; WT_UPDATE **upd_array, *upd; @@ -558,7 +546,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) /* Walk the page, building indices. */ rip = page->pg_row; - WT_CELL_FOREACH_BEGIN (session, btree, page->dsk, unpack) { + WT_CELL_FOREACH_KV (session, page->dsk, unpack) { if (instantiate_prepared && !prepare && F_ISSET(&unpack, WT_CELL_UNPACK_PREPARE)) prepare = true; switch (unpack.type) { @@ -587,7 +575,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) */ if (!btree->huffman_value && unpack.tw.stop_txn == WT_TXN_MAX && unpack.tw.stop_ts == WT_TS_MAX && !F_ISSET(&unpack, WT_CELL_UNPACK_PREPARE) && - __wt_txn_visible_all(session, unpack.tw.start_txn, unpack.tw.start_ts)) + __wt_txn_visible_all(session, unpack.tw.start_txn, unpack.tw.durable_start_ts)) __wt_row_leaf_value_set(page, rip - 1, &unpack); break; case WT_CELL_VALUE_OVFL: diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c index 5f29cf08691..039a4b5c833 100644 --- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c +++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c @@ -57,7 +57,7 @@ __rebalance_discard(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs) */ static int __rebalance_leaf_append(WT_SESSION_IMPL *session, const uint8_t *key, size_t key_len, - WT_CELL_UNPACK *unpack, WT_REBALANCE_STUFF *rs) + WT_CELL_UNPACK_ADDR *unpack, WT_REBALANCE_STUFF *rs) { WT_ADDR *copy_addr; WT_REF *copy; @@ -185,13 +185,10 
@@ __rebalance_free_original(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs) static int __rebalance_col_walk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_ADDR unpack; WT_DECL_ITEM(buf); WT_DECL_RET; - btree = S2BT(session); - WT_ERR(__wt_scr_alloc(session, 0, &buf)); /* Report progress periodically. */ @@ -202,7 +199,7 @@ __rebalance_col_walk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REB * Walk the page, instantiating keys: the page contains sorted key and location cookie pairs. * Keys are on-page/overflow items and location cookies are WT_CELL_ADDR_XXX items. */ - WT_CELL_FOREACH_BEGIN (session, btree, dsk, unpack) { + WT_CELL_FOREACH_ADDR (session, dsk, unpack) { switch (unpack.type) { case WT_CELL_ADDR_INT: /* An internal page: read it and recursively walk it. */ @@ -260,8 +257,7 @@ __rebalance_row_leaf_key(WT_SESSION_IMPL *session, const uint8_t *addr, size_t a static int __rebalance_row_walk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs) { - WT_BTREE *btree; - WT_CELL_UNPACK key, unpack; + WT_CELL_UNPACK_ADDR key, unpack; WT_DECL_ITEM(buf); WT_DECL_ITEM(leafkey); WT_DECL_RET; @@ -269,7 +265,6 @@ __rebalance_row_walk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REB bool first_cell; const void *p; - btree = S2BT(session); WT_CLEAR(key); /* [-Werror=maybe-uninitialized] */ WT_ERR(__wt_scr_alloc(session, 0, &buf)); @@ -284,7 +279,7 @@ __rebalance_row_walk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REB * Keys are on-page/overflow items and location cookies are WT_CELL_ADDR_XXX items. 
*/ first_cell = true; - WT_CELL_FOREACH_BEGIN (session, btree, dsk, unpack) { + WT_CELL_FOREACH_ADDR (session, dsk, unpack) { switch (unpack.type) { case WT_CELL_KEY: key = unpack; diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index 1a2360f6d09..abffa19cf56 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -76,9 +76,9 @@ __key_return(WT_CURSOR_BTREE *cbt) static void __read_col_time_window(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, WT_TIME_WINDOW *tw) { - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; - __wt_cell_unpack(session, page, cell, &unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, &unpack); __wt_time_window_copy(tw, &unpack.tw); } @@ -89,7 +89,7 @@ __read_col_time_window(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, W void __wt_read_row_time_window(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_TIME_WINDOW *tw) { - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; __wt_time_window_init(tw); /* @@ -138,7 +138,7 @@ __wt_value_return_buf(WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_ITEM *buf, WT_TIME_W { WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; WT_CURSOR *cursor; WT_PAGE *page; WT_ROW *rip; @@ -174,7 +174,7 @@ __wt_value_return_buf(WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_ITEM *buf, WT_TIME_W if (page->type == WT_PAGE_COL_VAR) { /* Take the value from the original page cell. 
*/ cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]); - __wt_cell_unpack(session, page, cell, &unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, &unpack); if (tw != NULL) __wt_time_window_copy(tw, &unpack.tw); return (__wt_page_cell_data_ref(session, page, &unpack, buf)); diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 344c6a573d7..0571404cf89 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -567,14 +567,12 @@ static int __slvg_trk_leaf(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; WT_DECL_RET; WT_PAGE *page; WT_TRACK *trk; uint64_t stop_recno; - btree = S2BT(session); page = NULL; trk = NULL; @@ -603,7 +601,7 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint8_t *ad * stop key requires walking the page. */ stop_recno = dsk->recno; - WT_CELL_FOREACH_BEGIN (session, btree, dsk, unpack) { + WT_CELL_FOREACH_KV (session, dsk, unpack) { stop_recno += __wt_cell_rle(&unpack); } WT_CELL_FOREACH_END; @@ -683,15 +681,12 @@ __slvg_trk_ovfl(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint8_t *ad static int __slvg_trk_leaf_ovfl(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_TRACK *trk) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; uint32_t ovfl_cnt; - btree = S2BT(session); - /* Count page overflow items. 
*/ ovfl_cnt = 0; - WT_CELL_FOREACH_BEGIN (session, btree, dsk, unpack) { + WT_CELL_FOREACH_KV (session, dsk, unpack) { if (FLD_ISSET(unpack.flags, WT_CELL_UNPACK_OVERFLOW)) ++ovfl_cnt; } @@ -706,7 +701,7 @@ __slvg_trk_leaf_ovfl(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_TRA trk->trk_ovfl_cnt = ovfl_cnt; ovfl_cnt = 0; - WT_CELL_FOREACH_BEGIN (session, btree, dsk, unpack) { + WT_CELL_FOREACH_KV (session, dsk, unpack) { if (FLD_ISSET(unpack.flags, WT_CELL_UNPACK_OVERFLOW)) { WT_RET( __wt_memdup(session, unpack.data, unpack.size, &trk->trk_ovfl_addr[ovfl_cnt].addr)); @@ -1297,7 +1292,7 @@ err: * Find a single overflow record in the merge page's list, and mark it as referenced. */ static int -__slvg_col_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack) +__slvg_col_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK_KV *unpack) { WT_TRACK *ovfl; uint32_t i; @@ -1325,7 +1320,7 @@ __slvg_col_ovfl(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_PAGE *page, uint64_t uint64_t skip, uint64_t take) { WT_CELL *cell; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; WT_COL *cip; WT_DECL_RET; uint64_t start, stop; @@ -1340,7 +1335,7 @@ __slvg_col_ovfl(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_PAGE *page, uint64_t WT_COL_FOREACH (page, cip, i) { cell = WT_COL_PTR(page, cip); - __wt_cell_unpack(session, page, cell, &unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, &unpack); recno += __wt_cell_rle(&unpack); /* @@ -1957,7 +1952,7 @@ err: * Find a single overflow record in the merge page's list, and mark it as referenced. 
*/ static int -__slvg_row_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack) +__slvg_row_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK_KV *unpack) { WT_TRACK *ovfl; uint32_t i; @@ -1989,7 +1984,7 @@ __slvg_row_ovfl( WT_SESSION_IMPL *session, WT_TRACK *trk, WT_PAGE *page, uint32_t start, uint32_t stop) { WT_CELL *cell; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; WT_ROW *rip; void *copy; @@ -2001,7 +1996,7 @@ __slvg_row_ovfl( copy = WT_ROW_KEY_COPY(rip); WT_IGNORE_RET_BOOL(__wt_row_leaf_key_info(page, copy, NULL, &cell, NULL, NULL)); if (cell != NULL) { - __wt_cell_unpack(session, page, cell, &unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, &unpack); WT_RET(__slvg_row_ovfl_single(session, trk, &unpack)); } __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 2a016d6d725..b4083c16e12 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -159,7 +159,7 @@ static int __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref) { WT_CELL *cell; - WT_CELL_UNPACK kpack; + WT_CELL_UNPACK_KV kpack; WT_IKEY *ikey; uint32_t cell_offset; @@ -181,7 +181,7 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref) ikey->cell_offset = 0; cell = WT_PAGE_REF_OFFSET(page, cell_offset); - __wt_cell_unpack(session, page, cell, &kpack); + __wt_cell_unpack_kv(session, page->dsk, cell, &kpack); if (FLD_ISSET(kpack.flags, WT_CELL_UNPACK_OVERFLOW) && kpack.raw != WT_CELL_KEY_OVFL_RM) WT_RET(__wt_ovfl_discard(session, page, cell)); @@ -197,7 +197,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, WT_REF **from_ref WT_REF **to_refp, size_t *incrp) { WT_ADDR *addr, *ref_addr; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_ADDR unpack; WT_DECL_RET; WT_IKEY *ikey; WT_REF *ref; @@ -247,7 +247,7 
@@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, WT_REF **from_ref */ WT_ORDERED_READ(ref_addr, ref->addr); if (ref_addr != NULL && !__wt_off_page(from_home, ref_addr)) { - __wt_cell_unpack(session, from_home, (WT_CELL *)ref_addr, &unpack); + __wt_cell_unpack_addr(session, from_home->dsk, (WT_CELL *)ref_addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); __wt_time_aggregate_copy(&addr->ta, &unpack.ta); WT_ERR(__wt_memdup(session, unpack.data, unpack.size, &addr->addr)); diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index 2005c279771..83143b60682 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -134,7 +134,7 @@ static void __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) { WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_KV *unpack, _unpack; WT_COL *cip; WT_INSERT *ins; uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt; @@ -154,7 +154,7 @@ __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **sta */ WT_COL_FOREACH (page, cip, i) { cell = WT_COL_PTR(page, cip); - __wt_cell_unpack(session, page, cell, unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, unpack); if (unpack->type == WT_CELL_DEL) { orig_deleted = true; deleted_cnt += __wt_cell_rle(unpack); @@ -218,11 +218,9 @@ __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **sta static void __stat_page_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_ADDR unpack; uint32_t ovfl_cnt; - btree = S2BT(session); ovfl_cnt = 0; WT_STAT_INCR(session, stats, btree_row_internal); @@ -232,7 +230,7 @@ __stat_page_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **sta * representation of the page doesn't necessarily contain a reference to the original cell. 
*/ if (page->dsk != NULL) { - WT_CELL_FOREACH_BEGIN (session, btree, page->dsk, unpack) { + WT_CELL_FOREACH_ADDR (session, page->dsk, unpack) { if (__wt_cell_type(unpack.cell) == WT_CELL_KEY_OVFL) ++ovfl_cnt; } @@ -249,15 +247,13 @@ __stat_page_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **sta static void __stat_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; WT_INSERT *ins; WT_ROW *rip; WT_UPDATE *upd; uint32_t empty_values, entry_cnt, i, ovfl_cnt; bool key; - btree = S2BT(session); empty_values = entry_cnt = ovfl_cnt = 0; WT_STAT_INCR(session, stats, btree_row_leaf); @@ -298,7 +294,7 @@ __stat_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **st */ if (page->dsk != NULL) { key = false; - WT_CELL_FOREACH_BEGIN (session, btree, page->dsk, unpack) { + WT_CELL_FOREACH_KV (session, page->dsk, unpack) { switch (__wt_cell_type(unpack.cell)) { case WT_CELL_KEY_OVFL: ++ovfl_cnt; diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 100aeed5105..71885a341d0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -179,7 +179,7 @@ __sync_ref_obsolete_check(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF_LIST *rl WT_DECL_RET; WT_MULTI *multi; WT_PAGE_MODIFY *mod; - wt_timestamp_t newest_stop_ts; + wt_timestamp_t newest_stop_durable_ts; uint64_t newest_stop_txn; uint32_t i; uint8_t previous_state; @@ -214,15 +214,20 @@ __sync_ref_obsolete_check(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF_LIST *rl * dirty. This is to ensure the parent is written during the checkpoint and the child page * discarded. 
*/ - newest_stop_ts = WT_TS_NONE; + newest_stop_durable_ts = WT_TS_NONE; newest_stop_txn = WT_TXN_NONE; obsolete = false; if (previous_state == WT_REF_DISK) { /* There should be an address, but simply skip any page where we don't find one. */ if (__wt_ref_addr_copy(session, ref, &addr)) { - newest_stop_ts = addr.ta.newest_stop_ts; + /* + * Max stop timestamp is possible only when the prepared update is written to the data + * store. + */ + newest_stop_durable_ts = + addr.ta.newest_stop_ts == WT_TS_MAX ? WT_TS_MAX : addr.ta.newest_stop_durable_ts; newest_stop_txn = addr.ta.newest_stop_txn; - obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); + obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_durable_ts); } if (obsolete) { @@ -237,7 +242,7 @@ __sync_ref_obsolete_check(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF_LIST *rl "%p on-disk page obsolete check: %s" "obsolete, stop ts/txn %s", (void *)ref, obsolete ? "" : "not ", - __wt_time_pair_to_string(newest_stop_ts, newest_stop_txn, tp_string)); + __wt_time_pair_to_string(newest_stop_durable_ts, newest_stop_txn, tp_string)); return (0); } WT_REF_UNLOCK(ref, previous_state); @@ -275,21 +280,26 @@ __sync_ref_obsolete_check(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF_LIST *rl /* Calculate the max stop time pair by traversing all multi addresses. */ for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { newest_stop_txn = WT_MAX(newest_stop_txn, multi->addr.ta.newest_stop_txn); - newest_stop_ts = WT_MAX(newest_stop_ts, multi->addr.ta.newest_stop_ts); + newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, + multi->addr.ta.newest_stop_ts == WT_TS_MAX ? 
WT_TS_MAX : + multi->addr.ta.newest_stop_durable_ts); } - obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); + obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_durable_ts); } else if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) { tag = "reconciled replacement block"; newest_stop_txn = mod->mod_replace.ta.newest_stop_txn; - newest_stop_ts = mod->mod_replace.ta.newest_stop_ts; - obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); + newest_stop_durable_ts = mod->mod_replace.ta.newest_stop_ts == WT_TS_MAX ? + WT_TS_MAX : + mod->mod_replace.ta.newest_stop_durable_ts; + obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_durable_ts); } else if (__wt_ref_addr_copy(session, ref, &addr)) { tag = "WT_REF address"; newest_stop_txn = addr.ta.newest_stop_txn; - newest_stop_ts = addr.ta.newest_stop_ts; - obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); + newest_stop_durable_ts = + addr.ta.newest_stop_ts == WT_TS_MAX ? WT_TS_MAX : addr.ta.newest_stop_durable_ts; + obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_durable_ts); } else tag = "unexpected page state"; @@ -303,7 +313,7 @@ __sync_ref_obsolete_check(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF_LIST *rl "%p in-memory page obsolete check: %s %s" "obsolete, stop ts/txn %s", (void *)ref, tag, obsolete ? 
"" : "not ", - __wt_time_pair_to_string(newest_stop_ts, newest_stop_txn, tp_string)); + __wt_time_pair_to_string(newest_stop_durable_ts, newest_stop_txn, tp_string)); err: if (hazard) diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index 0b3d4da2459..c2f0ec0c3ce 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -38,11 +38,14 @@ typedef struct { } WT_VSTUFF; static void __verify_checkpoint_reset(WT_VSTUFF *); -static int __verify_page_content(WT_SESSION_IMPL *, WT_REF *, WT_CELL_UNPACK *, WT_VSTUFF *); +static int __verify_page_content_int( + WT_SESSION_IMPL *, WT_REF *, WT_CELL_UNPACK_ADDR *, WT_VSTUFF *); +static int __verify_page_content_leaf( + WT_SESSION_IMPL *, WT_REF *, WT_CELL_UNPACK_ADDR *, WT_VSTUFF *); static int __verify_row_int_key_order( WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, uint32_t, WT_VSTUFF *); static int __verify_row_leaf_key_order(WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *); -static int __verify_tree(WT_SESSION_IMPL *, WT_REF *, WT_CELL_UNPACK *, WT_VSTUFF *); +static int __verify_tree(WT_SESSION_IMPL *, WT_REF *, WT_CELL_UNPACK_ADDR *, WT_VSTUFF *); static int __verify_ts_stable_cmp( WT_SESSION_IMPL *, WT_ITEM *, WT_REF *, uint32_t, wt_timestamp_t, wt_timestamp_t, WT_VSTUFF *); @@ -164,7 +167,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; - WT_CELL_UNPACK addr_unpack; + WT_CELL_UNPACK_ADDR addr_unpack; WT_CKPT *ckptbase, *ckpt; WT_DECL_RET; WT_VSTUFF *vs, _vstuff; @@ -384,43 +387,42 @@ err: * Check an address block's timestamps. 
*/ static int -__verify_addr_ts(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *unpack, WT_VSTUFF *vs) +__verify_addr_ts(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK_ADDR *unpack, WT_VSTUFF *vs) { + WT_TIME_AGGREGATE *ta; char time_string[WT_TIME_STRING_SIZE]; - if (unpack->ta.oldest_start_ts != WT_TS_NONE && unpack->ta.newest_stop_ts == WT_TS_NONE) + ta = &unpack->ta; + + if (ta->oldest_start_ts != WT_TS_NONE && ta->newest_stop_ts == WT_TS_NONE) WT_RET_MSG(session, WT_ERROR, - "internal page reference at %s has a newest stop " - "timestamp of 0", - __verify_addr_string(session, ref, vs->tmp1)); - if (unpack->ta.oldest_start_ts > unpack->ta.newest_stop_ts) + "internal page reference at %s has a newest stop timestamp of 0; time aggregate %s", + __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(ta, time_string)); + if (ta->oldest_start_ts > ta->newest_stop_ts) WT_RET_MSG(session, WT_ERROR, - "internal page reference at %s has an oldest start " - "timestamp newer than its newest stop timestamp; time window %s", + "internal page reference at %s has an oldest start timestamp newer than its newest stop " + "timestamp; time window %s", __verify_addr_string(session, ref, vs->tmp1), - __wt_time_window_to_string(&unpack->tw, time_string)); - if (unpack->ta.oldest_start_txn > unpack->ta.newest_stop_txn) + __wt_time_aggregate_to_string(ta, time_string)); + if (ta->oldest_start_txn > ta->newest_stop_txn) WT_RET_MSG(session, WT_ERROR, - "internal page reference at %s has an oldest start " - "transaction newer than its newest stop " - "transaction; time aggregate %s", + "internal page reference at %s has an oldest start transaction newer than its newest " + "stop transaction; time aggregate %s", __verify_addr_string(session, ref, vs->tmp1), - __wt_time_aggregate_to_string(&unpack->ta, time_string)); - if (unpack->ta.oldest_start_ts > unpack->ta.newest_start_durable_ts) + __wt_time_aggregate_to_string(ta, time_string)); + if 
(ta->oldest_start_ts > ta->newest_start_durable_ts) WT_RET_MSG(session, WT_ERROR, - "internal page reference at %s has an oldest start " - "timestamp newer than its newest start durable " - "timestamp; time aggregate %s", + "internal page reference at %s has an oldest start timestamp newer than its newest start " + "durable timestamp; time aggregate %s", __verify_addr_string(session, ref, vs->tmp1), - __wt_time_aggregate_to_string(&unpack->ta, time_string)); - if (unpack->ta.newest_stop_ts != WT_TS_MAX && - unpack->ta.newest_stop_ts > unpack->ta.newest_stop_durable_ts) + __wt_time_aggregate_to_string(ta, time_string)); + if (ta->newest_stop_ts != WT_TS_MAX && ta->newest_stop_ts > ta->newest_stop_durable_ts) WT_RET_MSG(session, WT_ERROR, - "internal page reference at %s has a newest stop " - "timestamp newer than its newest stop durable " - "timestamp; time aggregate %s", + "internal page reference at %s has a newest stop timestamp newer than its newest stop " + "durable timestamp; time aggregate %s", __verify_addr_string(session, ref, vs->tmp1), - __wt_time_aggregate_to_string(&unpack->ta, time_string)); + __wt_time_aggregate_to_string(ta, time_string)); return (0); } @@ -431,10 +433,11 @@ __verify_addr_ts(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *unpack, * Our job is to check logical relationships in the page and in the tree. 
*/ static int -__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *addr_unpack, WT_VSTUFF *vs) +__verify_tree( + WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK_ADDR *addr_unpack, WT_VSTUFF *vs) { WT_BM *bm; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_ADDR *unpack, _unpack; WT_DECL_RET; WT_PAGE *page; WT_REF *child_ref; @@ -515,10 +518,12 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *addr_unpack vs->records_so_far += page->entries; break; case WT_PAGE_COL_INT: - case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: + WT_RET(__verify_page_content_int(session, ref, addr_unpack, vs)); + break; + case WT_PAGE_COL_VAR: case WT_PAGE_ROW_LEAF: - WT_RET(__verify_page_content(session, ref, addr_unpack, vs)); + WT_RET(__verify_page_content_leaf(session, ref, addr_unpack, vs)); break; } @@ -542,8 +547,7 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *addr_unpack if (addr_unpack->raw != WT_CELL_ADDR_INT) celltype_err: WT_RET_MSG(session, WT_ERROR, - "page at %s, of type %s, is referenced in " - "its parent by a cell of type %s", + "page at %s, of type %s, is referenced in its parent by a cell of type %s", __verify_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type), __wt_cell_type_string(addr_unpack->raw)); break; @@ -561,17 +565,16 @@ celltype_err: */ ++entry; if (child_ref->ref_recno != vs->records_so_far + 1) { - WT_RET_MSG(session, WT_ERROR, "the starting record number in entry %" PRIu32 - " of the column internal page at " - "%s is %" PRIu64 - " and the expected " - "starting record number is %" PRIu64, + WT_RET_MSG(session, WT_ERROR, + "the starting record number in entry %" PRIu32 + " of the column internal page at %s is %" PRIu64 + " and the expected starting record number is %" PRIu64, entry, __verify_addr_string(session, child_ref, vs->tmp1), child_ref->ref_recno, vs->records_so_far + 1); } /* Unpack the address block and check timestamps */ - __wt_cell_unpack(session, 
child_ref->home, child_ref->addr, unpack); + __wt_cell_unpack_addr(session, child_ref->home->dsk, child_ref->addr, unpack); WT_RET(__verify_addr_ts(session, child_ref, unpack, vs)); /* Verify the subtree. */ @@ -601,7 +604,7 @@ celltype_err: WT_RET(__verify_row_int_key_order(session, page, child_ref, entry, vs)); /* Unpack the address block and check timestamps */ - __wt_cell_unpack(session, child_ref->home, child_ref->addr, unpack); + __wt_cell_unpack_addr(session, child_ref->home->dsk, child_ref->addr, unpack); WT_RET(__verify_addr_ts(session, child_ref, unpack, vs)); /* Verify the subtree. */ @@ -645,9 +648,8 @@ __verify_row_int_key_order( WT_RET(__wt_compare(session, btree->collator, &item, vs->max_key, &cmp)); if (cmp <= 0) WT_RET_MSG(session, WT_ERROR, "the internal key in entry %" PRIu32 - " on the page at %s " - "sorts before the last key appearing on page %s, earlier " - "in the tree: %s, %s", + " on the page at %s sorts before the last key appearing on " + "page %s, earlier in the tree: %s, %s", entry, __verify_addr_string(session, ref, vs->tmp1), (char *)vs->max_addr->data, __wt_buf_set_printable(session, item.data, item.size, vs->tmp2), __wt_buf_set_printable(session, vs->max_key->data, vs->max_key->size, vs->tmp3)); @@ -698,9 +700,8 @@ __verify_row_leaf_key_order(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs WT_RET(__wt_compare(session, btree->collator, vs->tmp1, (WT_ITEM *)vs->max_key, &cmp)); if (cmp < 0) WT_RET_MSG(session, WT_ERROR, - "the first key on the page at %s sorts equal to " - "or less than the last key appearing on the page " - "at %s, earlier in the tree: %s, %s", + "the first key on the page at %s sorts equal to or less than the last key appearing " + "on the page at %s, earlier in the tree: %s, %s", __verify_addr_string(session, ref, vs->tmp2), (char *)vs->max_addr->data, __wt_buf_set_printable(session, vs->tmp1->data, vs->tmp1->size, vs->tmp3), __wt_buf_set_printable(session, vs->max_key->data, vs->max_key->size, vs->tmp4)); @@ 
-771,8 +772,8 @@ __verify_ts_addr_cmp(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t cell_num, c return (0); WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s failed verification with %s " - "timestamp of %s, %s the parent's %s timestamp of %s", + " on page at %s failed verification with %s timestamp of %s, %s " + "the parent's %s timestamp of %s", cell_num, __verify_addr_string(session, ref, vs->tmp1), ts1_name, __verify_timestamp_to_pretty_string(ts1, ts_string[0]), gt ? "less than" : "greater than", ts2_name, __verify_timestamp_to_pretty_string(ts2, ts_string[1])); @@ -819,8 +820,8 @@ msg: __wt_key_string(session, key->data, key->size, btree->key_format, vs->tmp2))); WT_RET_MSG(session, WT_ERROR, - "%s has failed verification with a %s" - " timestamp of %s greater than the stable_timestamp of %s", + "%s has failed verification with a %s timestamp of %s greater than the stable_timestamp of " + "%s", (char *)vs->tmp1->data, start ? "start" : "stop", __wt_timestamp_to_string(start ? start_ts : stop_ts, tp_string[0]), __wt_timestamp_to_string(vs->stable_timestamp, tp_string[1])); @@ -846,11 +847,9 @@ __verify_txn_addr_cmp(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t cell_num, if (dsk->write_gen <= S2C(session)->base_write_gen) return (0); - WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s failed verification with %s " - "transaction of %" PRIu64 - ", %s the parent's %s transaction of " - "%" PRIu64, + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 " on page at %s failed verification with %s transaction of %" PRIu64 + ", %s the parent's %s transaction of %" PRIu64, cell_num, __verify_addr_string(session, ref, vs->tmp1), txn1_name, txn1, gt ? "less than" : "greater than", txn2_name, txn2); } @@ -941,30 +940,23 @@ __verify_key_hs( } /* - * __verify_page_content -- - * Verify the page's content. + * __verify_page_content_int -- + * Verify an internal page's content. 
*/ static int -__verify_page_content( - WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *addr_unpack, WT_VSTUFF *vs) +__verify_page_content_int( + WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK_ADDR *addr_unpack, WT_VSTUFF *vs) { - WT_BTREE *btree; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_ADDR unpack; WT_DECL_RET; WT_PAGE *page; const WT_PAGE_HEADER *dsk; - WT_ROW *rip; - uint64_t recno, rle; + WT_TIME_AGGREGATE *ta; uint32_t cell_num; - uint8_t *p; char time_string[WT_TIME_STRING_SIZE]; - bool found_ovfl; - btree = S2BT(session); page = ref->page; - rip = page->pg_row; - recno = ref->ref_recno; - found_ovfl = false; + ta = &unpack.ta; /* * If a tree is empty (just created), it won't have a disk image; if there is no disk image, @@ -975,17 +967,21 @@ __verify_page_content( /* Walk the page, tracking timestamps and verifying overflow pages. */ cell_num = 0; - WT_CELL_FOREACH_BEGIN (session, btree, dsk, unpack) { + WT_CELL_FOREACH_ADDR (session, dsk, unpack) { ++cell_num; + + if (!__wt_cell_type_check(unpack.type, dsk->type)) + WT_RET_MSG(session, WT_ERROR, "illegal cell and page type combination: cell %" PRIu32 + " on page at %s is a %s cell on a %s page", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_cell_type_string(unpack.type), __wt_page_type_string(dsk->type)); + switch (unpack.type) { case WT_CELL_KEY_OVFL: - case WT_CELL_VALUE_OVFL: - found_ovfl = true; if ((ret = __verify_overflow(session, unpack.data, unpack.size, vs)) != 0) - WT_RET_MSG(session, ret, "cell %" PRIu32 - " on page at %s references " - "an overflow item at %s that failed " - "verification", + WT_RET_MSG(session, ret, + "cell %" PRIu32 + " on page at %s references an overflow item at %s that failed verification", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), __wt_addr_string(session, unpack.data, unpack.size, vs->tmp2)); break; @@ -1000,126 +996,179 @@ __verify_page_content( case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case 
WT_CELL_ADDR_LEAF_NO: - if (unpack.ta.oldest_start_ts != WT_TS_NONE && unpack.ta.newest_stop_ts == WT_TS_NONE) - WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has a " - "newest stop timestamp of 0; time window %s", + if (ta->oldest_start_ts != WT_TS_NONE && ta->newest_stop_ts == WT_TS_NONE) + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 " on page at %s has a newest stop timestamp of 0; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_time_aggregate_to_string(&unpack.ta, time_string)); - if (unpack.ta.oldest_start_ts > unpack.ta.newest_stop_ts) + __wt_time_aggregate_to_string(ta, time_string)); + if (ta->oldest_start_ts > ta->newest_stop_ts) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has an " - "oldest start timestamp newer than " - "its newest stop timestamp; time window %s", + " on page at %s has an oldest start timestamp newer " + "than its newest stop timestamp; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_time_aggregate_to_string(&unpack.ta, time_string)); - if (unpack.ta.oldest_start_txn > unpack.ta.newest_stop_txn) { - WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page " - "at %s has an oldest start transaction newer than " - "its newest stop transaction; time aggregate %s ", + __wt_time_aggregate_to_string(ta, time_string)); + if (ta->oldest_start_txn > ta->newest_stop_txn) { + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 + " on page at %s has an oldest start transaction newer than its newest stop " + "transaction; time aggregate %s ", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_time_aggregate_to_string(&unpack.ta, time_string)); + __wt_time_aggregate_to_string(ta, time_string)); } - if (unpack.ta.oldest_start_ts > unpack.ta.newest_start_durable_ts) + if (ta->oldest_start_ts > ta->newest_start_durable_ts) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has an " - "oldest start timestamp newer 
than " - "its newest start durable timestamp; time aggregate %s", + " on page at %s has an oldest start timestamp newer than its newest start " + "durable timestamp; time aggregate %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_time_aggregate_to_string(&unpack.ta, time_string)); - if (unpack.ta.newest_stop_ts != WT_TS_MAX && - unpack.ta.newest_stop_ts > unpack.ta.newest_stop_durable_ts) + __wt_time_aggregate_to_string(ta, time_string)); + if (ta->newest_stop_ts != WT_TS_MAX && ta->newest_stop_ts > ta->newest_stop_durable_ts) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has a " - "newest stop timestamp newer than " - "its newest stop durable timestamp; time aggregate %s", + " on page at %s has a newest stop timestamp newer than its newest stop durable " + "timestamp; time aggregate %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_time_aggregate_to_string(&unpack.ta, time_string)); + __wt_time_aggregate_to_string(ta, time_string)); if (addr_unpack->ta.newest_start_durable_ts != WT_TS_NONE) WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start durable", - unpack.ta.newest_start_durable_ts, "start durable", + ta->newest_start_durable_ts, "start durable", addr_unpack->ta.newest_start_durable_ts, false, vs)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "oldest start", - unpack.ta.oldest_start_ts, "oldest start", addr_unpack->ta.oldest_start_ts, true, - vs)); + ta->oldest_start_ts, "oldest start", addr_unpack->ta.oldest_start_ts, true, vs)); WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "oldest start", - unpack.ta.oldest_start_txn, "oldest start", addr_unpack->ta.oldest_start_txn, true, - dsk, vs)); + ta->oldest_start_txn, "oldest start", addr_unpack->ta.oldest_start_txn, true, dsk, + vs)); if (addr_unpack->ta.newest_stop_durable_ts != WT_TS_NONE) WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop durable", - unpack.ta.newest_stop_durable_ts, "stop durable", + 
ta->newest_stop_durable_ts, "stop durable", addr_unpack->ta.newest_stop_durable_ts, false, vs)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "newest stop", - unpack.ta.newest_stop_ts, "newest stop", addr_unpack->ta.newest_stop_ts, false, vs)); + ta->newest_stop_ts, "newest stop", addr_unpack->ta.newest_stop_ts, false, vs)); WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "newest stop", - unpack.ta.newest_stop_txn, "newest stop", addr_unpack->ta.newest_stop_txn, false, dsk, - vs)); + ta->newest_stop_txn, "newest stop", addr_unpack->ta.newest_stop_txn, false, dsk, vs)); WT_RET(__verify_ts_stable_cmp(session, NULL, ref, cell_num - 1, addr_unpack->ta.oldest_start_ts, addr_unpack->ta.newest_stop_ts, vs)); break; + } + } + WT_CELL_FOREACH_END; + + return (0); +} + +/* + * __verify_page_content_leaf -- + * Verify the page's content. + */ +static int +__verify_page_content_leaf( + WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK_ADDR *addr_unpack, WT_VSTUFF *vs) +{ + WT_CELL_UNPACK_KV unpack; + WT_DECL_RET; + WT_PAGE *page; + const WT_PAGE_HEADER *dsk; + WT_ROW *rip; + WT_TIME_WINDOW *tw; + uint64_t recno, rle; + uint32_t cell_num; + uint8_t *p; + char time_string[WT_TIME_STRING_SIZE]; + bool found_ovfl; + + page = ref->page; + rip = page->pg_row; + tw = &unpack.tw; + recno = ref->ref_recno; + found_ovfl = false; + + /* + * If a tree is empty (just created), it won't have a disk image; if there is no disk image, + * we're done. + */ + if ((dsk = page->dsk) == NULL) + return (0); + + /* Walk the page, tracking timestamps and verifying overflow pages. 
*/ + cell_num = 0; + WT_CELL_FOREACH_KV (session, dsk, unpack) { + ++cell_num; + + if (!__wt_cell_type_check(unpack.type, dsk->type)) + WT_RET_MSG(session, WT_ERROR, "illegal cell and page type combination: cell %" PRIu32 + " on page at %s is a %s cell on a %s page", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_cell_type_string(unpack.type), __wt_page_type_string(dsk->type)); + + switch (unpack.type) { + case WT_CELL_KEY_OVFL: + case WT_CELL_VALUE_OVFL: + found_ovfl = true; + if ((ret = __verify_overflow(session, unpack.data, unpack.size, vs)) != 0) + WT_RET_MSG(session, ret, + "cell %" PRIu32 + " on page at %s references an overflow item at %s that failed verification", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_addr_string(session, unpack.data, unpack.size, vs->tmp2)); + break; + } + + switch (unpack.type) { case WT_CELL_DEL: case WT_CELL_VALUE: case WT_CELL_VALUE_COPY: case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_SHORT: - if (unpack.tw.start_ts != WT_TS_NONE && unpack.tw.stop_ts == WT_TS_NONE) - WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has a stop " - "timestamp of 0; time window %s", + if (tw->start_ts != WT_TS_NONE && tw->stop_ts == WT_TS_NONE) + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 " on page at %s has a stop timestamp of 0; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_time_window_to_string(&unpack.tw, time_string)); - if (unpack.tw.start_ts > unpack.tw.stop_ts) + __wt_time_window_to_string(tw, time_string)); + if (tw->start_ts > tw->stop_ts) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has a " - "start timestamp newer than its stop " - "timestamp; time window %s", + " on page at %s has a start timestamp newer than its " + "stop timestamp; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_time_window_to_string(&unpack.tw, time_string)); - if (unpack.tw.start_txn > unpack.tw.stop_txn) + 
__wt_time_window_to_string(tw, time_string)); + if (tw->start_txn > tw->stop_txn) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has a " - "start transaction newer than " + " on page at %s has a start transaction newer than " "its stop transaction; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_time_window_to_string(&unpack.tw, time_string)); - if (unpack.tw.start_ts > unpack.tw.durable_start_ts) + __wt_time_window_to_string(tw, time_string)); + if (tw->start_ts > tw->durable_start_ts) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has a " - "start timestamp newer than its start durable " - "timestamp; time window %s", + " on page at %s has a start timestamp newer than its " + "start durable timestamp; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_time_window_to_string(&unpack.tw, time_string)); - if (unpack.tw.stop_ts != WT_TS_MAX && unpack.tw.stop_ts > unpack.tw.durable_stop_ts) + __wt_time_window_to_string(tw, time_string)); + if (tw->stop_ts != WT_TS_MAX && tw->stop_ts > tw->durable_stop_ts) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has a " - "stop timestamp newer than its stop durable " - "timestamp; time window %s", + " on page at %s has a stop timestamp newer than its " + "stop durable timestamp; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_time_window_to_string(&unpack.tw, time_string)); + __wt_time_window_to_string(tw, time_string)); if (addr_unpack->ta.newest_start_durable_ts != WT_TS_NONE) WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start durable", - unpack.tw.durable_start_ts, "newest durable start", + tw->durable_start_ts, "newest durable start", addr_unpack->ta.newest_start_durable_ts, false, vs)); - WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.tw.start_ts, + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", tw->start_ts, "oldest 
start", addr_unpack->ta.oldest_start_ts, true, vs)); - WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "start", unpack.tw.start_txn, + WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "start", tw->start_txn, "oldest start", addr_unpack->ta.oldest_start_txn, true, dsk, vs)); if (addr_unpack->ta.newest_stop_durable_ts != WT_TS_NONE) WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop durable", - unpack.tw.durable_stop_ts, "newest durable stop", + tw->durable_stop_ts, "newest durable stop", addr_unpack->ta.newest_stop_durable_ts, false, vs)); - WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop", unpack.tw.stop_ts, + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop", tw->stop_ts, "newest stop", addr_unpack->ta.newest_stop_ts, false, vs)); - WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "stop", unpack.tw.stop_txn, + WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "stop", tw->stop_txn, "newest stop", addr_unpack->ta.newest_stop_txn, false, dsk, vs)); WT_RET(__verify_ts_stable_cmp( - session, NULL, ref, cell_num - 1, unpack.tw.start_ts, unpack.tw.stop_ts, vs)); + session, NULL, ref, cell_num - 1, tw->start_ts, tw->stop_ts, vs)); break; } @@ -1128,11 +1177,12 @@ __verify_page_content( * values in debug mode. 
*/ if (page->type == WT_PAGE_ROW_LEAF) { - if (unpack.type != WT_CELL_KEY && unpack.type != WT_CELL_KEY_OVFL) + if (unpack.type != WT_CELL_VALUE && unpack.type != WT_CELL_VALUE_COPY && + unpack.type != WT_CELL_VALUE_OVFL && unpack.type != WT_CELL_VALUE_SHORT) continue; WT_RET(__wt_row_leaf_key(session, page, rip++, vs->tmp1, false)); - WT_RET(__verify_key_hs(session, vs->tmp1, unpack.tw.start_ts, vs)); + WT_RET(__verify_key_hs(session, vs->tmp1, tw->start_ts, vs)); #ifdef HAVE_DIAGNOSTIC if (vs->dump_history) @@ -1143,7 +1193,7 @@ __verify_page_content( p = vs->tmp1->mem; WT_RET(__wt_vpack_uint(&p, 0, recno)); vs->tmp1->size = WT_PTRDIFF(p, vs->tmp1->mem); - WT_RET(__verify_key_hs(session, vs->tmp1, unpack.tw.start_ts, vs)); + WT_RET(__verify_key_hs(session, vs->tmp1, tw->start_ts, vs)); #ifdef HAVE_DIAGNOSTIC if (vs->dump_history) @@ -1162,8 +1212,8 @@ __verify_page_content( */ if (found_ovfl && addr_unpack->raw == WT_CELL_ADDR_LEAF_NO) WT_RET_MSG(session, WT_ERROR, - "page at %s, of type %s and referenced in its parent by a " - "cell of type %s, contains overflow items", + "page at %s, of type %s and referenced in its parent by a cell of type %s, contains " + "overflow items", __verify_addr_string(session, ref, vs->tmp1), __wt_page_type_string(ref->page->type), __wt_cell_type_string(addr_unpack->raw)); diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c index a1e96d41dc9..0b29c3ee526 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -16,7 +16,9 @@ static int __verify_dsk_col_fix(WT_SESSION_IMPL *, const char *, const WT_PAGE_H static int __verify_dsk_col_int(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_ADDR *); static int __verify_dsk_col_var(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_ADDR *); static int __verify_dsk_memsize(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_CELL *); 
-static int __verify_dsk_row(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_ADDR *); +static int __verify_dsk_row_int(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_ADDR *); +static int __verify_dsk_row_leaf( + WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_ADDR *); #define WT_ERR_VRFY(session, ...) \ do { \ @@ -44,8 +46,8 @@ static int __verify_dsk_row(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADE * WT_CELL_FOREACH macro, created because the loop can't simply unpack cells, * verify has to do additional work to ensure that unpack is safe. */ -#define WT_CELL_FOREACH_VRFY(btree, dsk, cell, unpack, i) \ - for ((cell) = WT_PAGE_HEADER_BYTE(btree, dsk), (i) = (dsk)->u.entries; (i) > 0; \ +#define WT_CELL_FOREACH_VRFY(session, dsk, cell, unpack, i) \ + for ((cell) = WT_PAGE_HEADER_BYTE(S2BT(session), dsk), (i) = (dsk)->u.entries; (i) > 0; \ (cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->__len), --(i)) /* @@ -90,8 +92,7 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_H if (dsk->recno == WT_RECNO_OOB) break; WT_RET_VRFY(session, - "%s page at %s has a record number, which is illegal for " - "this page type", + "%s page at %s has a record number, which is illegal for this page type", __wt_page_type_string(dsk->type), tag); } @@ -167,8 +168,9 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_H case WT_PAGE_COL_VAR: return (__verify_dsk_col_var(session, tag, dsk, addr)); case WT_PAGE_ROW_INT: + return (__verify_dsk_row_int(session, tag, dsk, addr)); case WT_PAGE_ROW_LEAF: - return (__verify_dsk_row(session, tag, dsk, addr)); + return (__verify_dsk_row_leaf(session, tag, dsk, addr)); case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_OVFL: return (__verify_dsk_chunk(session, tag, dsk, dsk->u.datalen)); @@ -227,8 +229,8 @@ __verify_dsk_ts_addr_cmp(WT_SESSION_IMPL *session, uint32_t cell_num, const char break; } WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at 
%s failed verification with %s " - "timestamp of %s, %s the parent's %s timestamp of %s", + " on page at %s failed verification with %s timestamp of %s, %s " + "the parent's %s timestamp of %s", cell_num, tag, ts1_name, ts1_bp, gt ? "less than" : "greater than", ts2_name, ts2_bp); } @@ -252,24 +254,25 @@ __verify_dsk_txn_addr_cmp(WT_SESSION_IMPL *session, uint32_t cell_num, const cha if (dsk->write_gen <= S2C(session)->base_write_gen) return (0); - WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s failed verification with %s " - "transaction of %" PRIu64 - ", %s the parent's %s transaction of " - "%" PRIu64, + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 " on page at %s failed verification with %s transaction of %" PRIu64 + ", %s the parent's %s transaction of %" PRIu64, cell_num, tag, txn1_name, txn1, gt ? "less than" : "greater than", txn2_name, txn2); } /* - * __verify_dsk_validity -- - * Verify a cell's validity window. + * __verify_dsk_addr_validity -- + * Verify an address cell's validity window. */ static int -__verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t cell_num, +__verify_dsk_addr_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK_ADDR *unpack, uint32_t cell_num, WT_ADDR *addr, const char *tag, const WT_PAGE_HEADER *dsk) { + WT_TIME_AGGREGATE *ta; char time_string[WT_TIME_STRING_SIZE]; + ta = &unpack->ta; + /* * Check timestamp and transaction order, and optionally against parent values. Timestamps and * transactions in the parent address aren't necessarily an exact match, but should be within @@ -279,169 +282,192 @@ __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t * cell-unpacking code hides it by always returning durable values if they don't appear on the * page. 
*/ - switch (unpack->type) { - case WT_CELL_ADDR_DEL: - case WT_CELL_ADDR_INT: - case WT_CELL_ADDR_LEAF: - case WT_CELL_ADDR_LEAF_NO: - if (unpack->ta.oldest_start_ts != WT_TS_NONE && unpack->ta.newest_stop_ts == WT_TS_NONE) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a newest stop " - "timestamp of 0; time aggregate %s", - cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); - if (unpack->ta.oldest_start_ts > unpack->ta.newest_stop_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has an oldest " - "start timestamp newer than its newest stop " - "timestamp; time aggregate %s", - cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); - if (unpack->ta.oldest_start_txn > unpack->ta.newest_stop_txn) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has an oldest " - "start transaction newer than its " - "newest stop transaction; time aggregate %s", - cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); - if (unpack->ta.oldest_start_ts > unpack->ta.newest_start_durable_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has an oldest " - "start timestamp newer than its newest start durable " - "timestamp; time aggregate %s", - cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); - if (unpack->ta.newest_stop_ts != WT_TS_MAX && - unpack->ta.newest_stop_ts > unpack->ta.newest_stop_durable_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a newest " - "stop timestamp newer than its newest stop durable " - "timestamp; time aggregate %s", - cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); - - if (addr == NULL) - break; + if (ta->oldest_start_ts != WT_TS_NONE && ta->newest_stop_ts == WT_TS_NONE) + WT_RET_VRFY(session, + "cell %" PRIu32 " on page at %s has a newest stop timestamp of 0; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(ta, time_string)); + if (ta->oldest_start_ts 
> ta->newest_stop_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has an oldest start timestamp newer than its newest " + "stop timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(ta, time_string)); + if (ta->oldest_start_txn > ta->newest_stop_txn) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has an oldest start transaction newer than its newest " + "stop transaction; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(ta, time_string)); + if (ta->oldest_start_ts > ta->newest_start_durable_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has an oldest start timestamp newer than its newest " + "start durable timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(ta, time_string)); + if (ta->newest_stop_ts != WT_TS_MAX && ta->newest_stop_ts > ta->newest_stop_durable_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a newest stop timestamp newer than its newest " + "stop durable timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(ta, time_string)); + + if (addr == NULL) + return (0); - if (addr->ta.newest_start_durable_ts != WT_TS_NONE) - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", - unpack->ta.newest_start_durable_ts, "start durable", addr->ta.newest_start_durable_ts, - false, tag)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "oldest start", - unpack->ta.oldest_start_ts, "oldest start", addr->ta.oldest_start_ts, true, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "oldest start", - unpack->ta.oldest_start_txn, "oldest start", addr->ta.oldest_start_txn, true, tag, dsk)); - - if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", - unpack->ta.newest_stop_durable_ts, "stop durable", addr->ta.newest_stop_durable_ts, - false, tag)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, 
"newest stop", - unpack->ta.newest_stop_ts, "newest stop", addr->ta.newest_stop_ts, false, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "newest stop", - unpack->ta.newest_stop_txn, "newest stop", addr->ta.newest_stop_txn, false, tag, dsk)); - break; - case WT_CELL_DEL: - case WT_CELL_VALUE: - case WT_CELL_VALUE_COPY: - case WT_CELL_VALUE_OVFL: - case WT_CELL_VALUE_OVFL_RM: - case WT_CELL_VALUE_SHORT: - if (unpack->tw.start_ts != WT_TS_NONE && unpack->tw.stop_ts == WT_TS_NONE) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a stop " - "timestamp of 0; time window %s", - cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); - if (unpack->tw.start_ts > unpack->tw.stop_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a start " - "timestamp newer than its stop timestamp; time window %s", - cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); - if (unpack->tw.start_txn > unpack->tw.stop_txn) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a start " - "transaction newer than its stop " - "transaction; time window %s", - cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); - if (unpack->tw.start_ts > unpack->tw.durable_start_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a start " - "timestamp newer than its durable start timestamp; time window %s", - cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); - if (unpack->tw.stop_ts != WT_TS_MAX && unpack->tw.stop_ts > unpack->tw.durable_stop_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a stop " - "timestamp newer than its durable stop timestamp; time window %s", - cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); - - if (addr == NULL) - break; + if (addr->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", + ta->newest_start_durable_ts, "start durable", 
addr->ta.newest_start_durable_ts, false, + tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "oldest start", ta->oldest_start_ts, + "oldest start", addr->ta.oldest_start_ts, true, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "oldest start", ta->oldest_start_txn, + "oldest start", addr->ta.oldest_start_txn, true, tag, dsk)); + + if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", + ta->newest_stop_durable_ts, "stop durable", addr->ta.newest_stop_durable_ts, false, tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "newest stop", ta->newest_stop_ts, + "newest stop", addr->ta.newest_stop_ts, false, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "newest stop", ta->newest_stop_txn, + "newest stop", addr->ta.newest_stop_txn, false, tag, dsk)); - if (addr->ta.newest_start_durable_ts != WT_TS_NONE) - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", - unpack->tw.durable_start_ts, "newest start durable", addr->ta.newest_start_durable_ts, - false, tag)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start", unpack->tw.start_ts, - "oldest start", addr->ta.oldest_start_ts, true, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "start", unpack->tw.start_txn, - "oldest start", addr->ta.oldest_start_txn, true, tag, dsk)); - if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", - unpack->tw.durable_stop_ts, "newest stop durable", addr->ta.newest_stop_durable_ts, - false, tag)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop", unpack->tw.stop_ts, - "newest stop", addr->ta.newest_stop_ts, false, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "stop", unpack->tw.stop_txn, - "newest stop", addr->ta.newest_stop_txn, false, tag, dsk)); - break; - } + return (0); +} + +/* + * __verify_dsk_value_validity 
-- + * Verify a value cell's validity window. + */ +static int +__verify_dsk_value_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK_KV *unpack, uint32_t cell_num, + WT_ADDR *addr, const char *tag, const WT_PAGE_HEADER *dsk) +{ + WT_TIME_WINDOW *tw; + char time_string[WT_TIME_STRING_SIZE]; + + tw = &unpack->tw; + + /* + * Check timestamp and transaction order, and optionally against parent values. Timestamps and + * transactions in the parent address aren't necessarily an exact match, but should be within + * the boundaries of the parent's information. + * + * There's no checking if validity information should appear on a page because the + * cell-unpacking code hides it by always returning durable values if they don't appear on the + * page. + */ + if (unpack->tw.start_ts != WT_TS_NONE && unpack->tw.stop_ts == WT_TS_NONE) + WT_RET_VRFY(session, + "cell %" PRIu32 " on page at %s has a stop timestamp of 0; time window %s", cell_num - 1, + tag, __wt_time_window_to_string(tw, time_string)); + if (tw->start_ts > tw->stop_ts) + WT_RET_VRFY(session, + "cell %" PRIu32 + " on page at %s has a start timestamp newer than its stop timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(tw, time_string)); + if (tw->start_txn > tw->stop_txn) + WT_RET_VRFY(session, + "cell %" PRIu32 + " on page at %s has a start transaction newer than its stop transaction; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(tw, time_string)); + if (tw->start_ts > tw->durable_start_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a start timestamp newer than its durable start " + "timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(tw, time_string)); + if (tw->stop_ts != WT_TS_MAX && tw->stop_ts > tw->durable_stop_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a stop timestamp newer than its durable stop " + "timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(tw, time_string)); + + 
if (addr == NULL) + return (0); + + if (addr->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET( + __verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", tw->durable_start_ts, + "newest start durable", addr->ta.newest_start_durable_ts, false, tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start", tw->start_ts, "oldest start", + addr->ta.oldest_start_ts, true, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "start", tw->start_txn, "oldest start", + addr->ta.oldest_start_txn, true, tag, dsk)); + if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", tw->durable_stop_ts, + "newest stop durable", addr->ta.newest_stop_durable_ts, false, tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop", tw->stop_ts, "newest stop", + addr->ta.newest_stop_ts, false, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "stop", tw->stop_txn, "newest stop", + addr->ta.newest_stop_txn, false, tag, dsk)); return (0); } /* - * __verify_dsk_row -- - * Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it. + * __verify_row_key_order_check -- + * Check key ordering for row-store pages. 
*/ static int -__verify_dsk_row( +__verify_row_key_order_check(WT_SESSION_IMPL *session, WT_ITEM *last, uint32_t last_cell_num, + WT_ITEM *current, uint32_t cell_num, const char *tag) +{ + WT_DECL_ITEM(tmp1); + WT_DECL_ITEM(tmp2); + WT_DECL_RET; + int cmp; + + WT_RET(__wt_compare(session, S2BT(session)->collator, last, current, &cmp)); + if (cmp < 0) + return (0); + + WT_ERR(__wt_scr_alloc(session, 0, &tmp1)); + WT_ERR(__wt_scr_alloc(session, 0, &tmp2)); + + ret = WT_ERROR; + WT_ERR_VRFY(session, + "the %" PRIu32 " and %" PRIu32 " keys on page at %s are incorrectly sorted: %s, %s", + last_cell_num, cell_num, tag, __wt_buf_set_printable(session, last->data, last->size, tmp1), + __wt_buf_set_printable(session, current->data, current->size, tmp2)); + +err: + __wt_scr_free(session, &tmp1); + __wt_scr_free(session, &tmp2); + return (ret); +} + +/* + * __verify_dsk_row_int -- + * Walk a WT_PAGE_ROW_INT disk page and verify it. + */ +static int +__verify_dsk_row_int( WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, WT_ADDR *addr) { WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_ADDR *unpack, _unpack; WT_DECL_ITEM(current); - WT_DECL_ITEM(last_ovfl); - WT_DECL_ITEM(last_pfx); - WT_DECL_ITEM(tmp1); - WT_DECL_ITEM(tmp2); + WT_DECL_ITEM(last); + WT_DECL_ITEM(tmp); WT_DECL_RET; - WT_ITEM *last; enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; - void *huffman; - size_t prefix; uint32_t cell_num, cell_type, i, key_cnt; uint8_t *end; - int cmp; btree = S2BT(session); bm = btree->bm; unpack = &_unpack; - huffman = dsk->type == WT_PAGE_ROW_INT ? 
NULL : btree->huffman_key; WT_ERR(__wt_scr_alloc(session, 0, &current)); - WT_ERR(__wt_scr_alloc(session, 0, &last_pfx)); - WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl)); - WT_ERR(__wt_scr_alloc(session, 0, &tmp1)); - WT_ERR(__wt_scr_alloc(session, 0, &tmp2)); - last = last_ovfl; + WT_ERR(__wt_scr_alloc(session, 0, &last)); end = (uint8_t *)dsk + dsk->mem_size; last_cell_type = FIRST; cell_num = 0; key_cnt = 0; - WT_CELL_FOREACH_VRFY (btree, dsk, cell, unpack, i) { + WT_CELL_FOREACH_VRFY (session, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. */ - ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, end); + ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, NULL, end); if (ret != 0) { (void)__err_cell_corrupt(session, ret, cell_num, tag); goto err; @@ -452,15 +478,20 @@ __verify_dsk_row( WT_ERR(__err_cell_type(session, cell_num, tag, unpack->type, dsk->type)); cell_type = unpack->type; + /* Internal row-store cells should not have prefix compression or recno/rle fields. */ + if (unpack->prefix != 0) + WT_ERR_VRFY( + session, "the %" PRIu32 " cell on page at %s has a non-zero prefix", cell_num, tag); + if (unpack->v != 0) + WT_ERR_VRFY(session, + "the %" PRIu32 " cell on page at %s has a non-zero rle/recno field", cell_num, tag); + /* - * Check ordering relationships between the WT_CELL entries. - * For row-store internal pages, check for: - * two values in a row, - * two keys in a row, - * a value as the first cell on a page. - * For row-store leaf pages, check for: - * two values in a row, - * a value as the first cell on a page. + * Check ordering relationships between the WT_CELL entries. For row-store internal pages, + * check for: + * - two values in a row, + * - two keys in a row, + * - a value as the first cell on a page. 
*/ switch (cell_type) { case WT_CELL_KEY: @@ -471,12 +502,9 @@ __verify_dsk_row( case WAS_VALUE: break; case WAS_KEY: - if (dsk->type == WT_PAGE_ROW_LEAF) - break; - WT_ERR_VRFY(session, "cell %" PRIu32 - " on page at %s is the " - "first of two adjacent keys", - cell_num - 1, tag); + WT_ERR_VRFY(session, + "cell %" PRIu32 " on page at %s is the first of two adjacent keys", cell_num - 1, + tag); } last_cell_type = WAS_KEY; break; @@ -484,17 +512,14 @@ __verify_dsk_row( case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: - case WT_CELL_VALUE: - case WT_CELL_VALUE_OVFL: switch (last_cell_type) { case FIRST: WT_ERR_VRFY(session, "page at %s begins with a value", tag); case WAS_KEY: break; case WAS_VALUE: - WT_ERR_VRFY(session, "cell %" PRIu32 - " on page at %s is the " - "first of two adjacent values", + WT_ERR_VRFY(session, + "cell %" PRIu32 " on page at %s is the first of two adjacent values", cell_num - 1, tag); } last_cell_type = WAS_VALUE; @@ -502,7 +527,14 @@ __verify_dsk_row( } /* Check the validity window. */ - WT_ERR(__verify_dsk_validity(session, unpack, cell_num, addr, tag, dsk)); + switch (cell_type) { + case WT_CELL_ADDR_DEL: + case WT_CELL_ADDR_INT: + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + WT_ERR(__verify_dsk_addr_validity(session, unpack, cell_num, addr, tag, dsk)); + break; + } /* Check if any referenced item has an invalid address. */ switch (cell_type) { @@ -511,6 +543,167 @@ __verify_dsk_row( case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: case WT_CELL_KEY_OVFL: + if ((ret = bm->addr_invalid(bm, session, unpack->data, unpack->size)) == EINVAL) + (void)__err_cell_corrupt_or_eof(session, ret, cell_num, tag); + WT_ERR(ret); + break; + } + + /* + * Remaining checks are for key order. If this cell isn't a key, we're done, move to the + * next cell. If this cell is an overflow item, instantiate the key and compare it with the + * last key. 
+ */ + switch (cell_type) { + case WT_CELL_KEY: + /* Get the cell's data/length and make sure we have enough buffer space. */ + WT_ERR(__wt_buf_init(session, current, unpack->size)); + + /* Copy the data into place. */ + memcpy((uint8_t *)current->mem, unpack->data, unpack->size); + current->size = unpack->size; + break; + case WT_CELL_KEY_OVFL: + WT_ERR(__wt_dsk_cell_data_ref(session, dsk->type, unpack, current)); + break; + default: + /* Not a key -- continue with the next cell. */ + continue; + } + + /* + * Compare the current key against the last key. + * + * Be careful about the 0th key on internal pages: we only store the first byte and custom + * collators may not be able to handle truncated keys. + */ + if (cell_num > 3) + WT_ERR( + __verify_row_key_order_check(session, last, cell_num - 2, current, cell_num, tag)); + + /* Swap the buffers. */ + tmp = last; + last = current; + current = tmp; + } + WT_ERR(__verify_dsk_memsize(session, tag, dsk, cell)); + + /* + * On row-store internal pages, the key count should be equal to half the number of physical + * entries. + */ + if (key_cnt * 2 != dsk->u.entries) + WT_ERR_VRFY(session, + "%s page at %s has a key count of %" PRIu32 " and a physical entry count of %" PRIu32, + __wt_page_type_string(dsk->type), tag, key_cnt, dsk->u.entries); + + if (0) { +err: + if (ret == 0) + ret = WT_ERROR; + } + __wt_scr_free(session, &current); + __wt_scr_free(session, &last); + return (ret); +} + +/* + * __verify_dsk_row_leaf -- + * Walk a WT_PAGE_ROW_LEAF disk page and verify it. 
+ */ +static int +__verify_dsk_row_leaf( + WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, WT_ADDR *addr) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK_KV *unpack, _unpack; + WT_DECL_ITEM(current); + WT_DECL_ITEM(last_ovfl); + WT_DECL_ITEM(last_pfx); + WT_DECL_RET; + WT_ITEM *last; + enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; + void *huffman; + size_t prefix; + uint32_t cell_num, cell_type, i, key_cnt, last_cell_num; + uint8_t *end; + + btree = S2BT(session); + bm = btree->bm; + unpack = &_unpack; + huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key; + + WT_ERR(__wt_scr_alloc(session, 0, &current)); + WT_ERR(__wt_scr_alloc(session, 0, &last_pfx)); + WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl)); + last = last_ovfl; + + end = (uint8_t *)dsk + dsk->mem_size; + + last_cell_type = FIRST; + cell_num = last_cell_num = 0; + key_cnt = 0; + WT_CELL_FOREACH_VRFY (session, dsk, cell, unpack, i) { + ++cell_num; + + /* Carefully unpack the cell. */ + ret = __wt_cell_unpack_safe(session, dsk, cell, NULL, unpack, end); + if (ret != 0) { + (void)__err_cell_corrupt(session, ret, cell_num, tag); + goto err; + } + + /* Check the raw and collapsed cell types. */ + WT_ERR(__err_cell_type(session, cell_num, tag, unpack->raw, dsk->type)); + WT_ERR(__err_cell_type(session, cell_num, tag, unpack->type, dsk->type)); + cell_type = unpack->type; + + /* Leaf row-store cells should not have recno/rle fields. */ + if (unpack->v != 0) + WT_ERR_VRFY(session, + "the %" PRIu32 " cell on page at %s has a non-zero rle/recno field", cell_num, tag); + + /* + * Check ordering relationships between the WT_CELL entries. For row-store leaf pages, check + * for: + * - two values in a row, + * - a value as the first cell on a page. 
+ */ + switch (cell_type) { + case WT_CELL_KEY: + case WT_CELL_KEY_OVFL: + ++key_cnt; + last_cell_type = WAS_KEY; + break; + case WT_CELL_VALUE: + case WT_CELL_VALUE_OVFL: + switch (last_cell_type) { + case FIRST: + WT_ERR_VRFY(session, "page at %s begins with a value", tag); + case WAS_KEY: + break; + case WAS_VALUE: + WT_ERR_VRFY(session, + "cell %" PRIu32 " on page at %s is the first of two adjacent values", + cell_num - 1, tag); + } + last_cell_type = WAS_VALUE; + break; + } + + /* Check the validity window. */ + switch (cell_type) { + case WT_CELL_VALUE: + case WT_CELL_VALUE_OVFL: + WT_ERR(__verify_dsk_value_validity(session, unpack, cell_num, addr, tag, dsk)); + break; + } + + /* Check if any referenced item has an invalid address. */ + switch (cell_type) { + case WT_CELL_KEY_OVFL: case WT_CELL_VALUE_OVFL: if ((ret = bm->addr_invalid(bm, session, unpack->data, unpack->size)) == EINVAL) (void)__err_cell_corrupt_or_eof(session, ret, cell_num, tag); @@ -542,17 +735,15 @@ __verify_dsk_row( prefix = unpack->prefix; if (last_pfx->size == 0 && prefix != 0) WT_ERR_VRFY(session, "the %" PRIu32 - " key on page at %s is the first " - "non-overflow key on the page and has a non-zero " - "prefix compression value", + " key on page at %s is the first non-overflow key on the page and " + "has a non-zero prefix compression value", cell_num, tag); /* Confirm the prefix compression count is possible. */ if (cell_num > 1 && prefix > last->size) - WT_ERR_VRFY(session, "key %" PRIu32 - " on page at %s has a prefix " - "compression count of %" WT_SIZET_FMT - ", larger than the length of the previous key, %" WT_SIZET_FMT, + WT_ERR_VRFY(session, + "key %" PRIu32 " on page at %s has a prefix compression count of %" WT_SIZET_FMT + ", larger than the length of the previous key, %" WT_SIZET_FMT, cell_num, tag, prefix, last->size); /* @@ -591,21 +782,11 @@ __verify_dsk_row( key_compare: /* * Compare the current key against the last key. 
- * - * Be careful about the 0th key on internal pages: we only store the first byte and custom - * collators may not be able to handle truncated keys. */ - if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) || - (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) { - WT_ERR(__wt_compare(session, btree->collator, last, current, &cmp)); - if (cmp >= 0) - WT_ERR_VRFY(session, "the %" PRIu32 " and %" PRIu32 - " keys on " - "page at %s are incorrectly sorted: %s, %s", - cell_num - 2, cell_num, tag, - __wt_buf_set_printable(session, last->data, last->size, tmp1), - __wt_buf_set_printable(session, current->data, current->size, tmp2)); - } + if (cell_num > 1) + WT_ERR( + __verify_row_key_order_check(session, last, last_cell_num, current, cell_num, tag)); + last_cell_num = cell_num; /* * Swap the buffers: last always references the last key entry, last_pfx and last_ovfl @@ -625,28 +806,21 @@ key_compare: WT_ERR(__verify_dsk_memsize(session, tag, dsk, cell)); /* - * On row-store internal pages, and on row-store leaf pages, where the - * "no empty values" flag is set, the key count should be equal to half - * the number of physical entries. On row-store leaf pages where the - * "all empty values" flag is set, the key count should be equal to the - * number of physical entries. + * On standard row-store leaf pages there's no check to make, there may be more keys than values + * as zero-length values aren't physically stored on the page. On row-store leaf pages, where + * the "no empty values" flag is set, the key count should be equal to half the number of + * physical entries. On row-store leaf pages where the "all empty values" flag is set, the key + * count should be equal to the number of physical entries. 
*/ - if (dsk->type == WT_PAGE_ROW_INT && key_cnt * 2 != dsk->u.entries) - WT_ERR_VRFY(session, "%s page at %s has a key count of %" PRIu32 - " and a " - "physical entry count of %" PRIu32, - __wt_page_type_string(dsk->type), tag, key_cnt, dsk->u.entries); - if (dsk->type == WT_PAGE_ROW_LEAF && F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) && - key_cnt != dsk->u.entries) + if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) && key_cnt != dsk->u.entries) WT_ERR_VRFY(session, - "%s page at %s with the 'all empty values' flag set has a " - "key count of %" PRIu32 " and a physical entry count of %" PRIu32, + "%s page at %s with the 'all empty values' flag set has a key count of %" PRIu32 + " and a physical entry count of %" PRIu32, __wt_page_type_string(dsk->type), tag, key_cnt, dsk->u.entries); - if (dsk->type == WT_PAGE_ROW_LEAF && F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) && - key_cnt * 2 != dsk->u.entries) + if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) && key_cnt * 2 != dsk->u.entries) WT_ERR_VRFY(session, - "%s page at %s with the 'no empty values' flag set has a " - "key count of %" PRIu32 " and a physical entry count of %" PRIu32, + "%s page at %s with the 'no empty values' flag set has a key count of %" PRIu32 + " and a physical entry count of %" PRIu32, __wt_page_type_string(dsk->type), tag, key_cnt, dsk->u.entries); if (0) { @@ -657,8 +831,6 @@ err: __wt_scr_free(session, &current); __wt_scr_free(session, &last_pfx); __wt_scr_free(session, &last_ovfl); - __wt_scr_free(session, &tmp1); - __wt_scr_free(session, &tmp2); return (ret); } @@ -673,7 +845,7 @@ __verify_dsk_col_int( WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_ADDR *unpack, _unpack; WT_DECL_RET; uint32_t cell_num, i; uint8_t *end; @@ -684,11 +856,11 @@ __verify_dsk_col_int( end = (uint8_t *)dsk + dsk->mem_size; cell_num = 0; - WT_CELL_FOREACH_VRFY (btree, dsk, cell, unpack, i) { + WT_CELL_FOREACH_VRFY (session, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. 
*/ - ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, end); + ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, NULL, end); if (ret != 0) return (__err_cell_corrupt(session, ret, cell_num, tag)); @@ -697,7 +869,7 @@ __verify_dsk_col_int( WT_RET(__err_cell_type(session, cell_num, tag, unpack->type, dsk->type)); /* Check the validity window. */ - WT_RET(__verify_dsk_validity(session, unpack, cell_num, addr, tag, dsk)); + WT_RET(__verify_dsk_addr_validity(session, unpack, cell_num, addr, tag, dsk)); /* Check if any referenced item is entirely in the file. */ ret = bm->addr_invalid(bm, session, unpack->data, unpack->size); @@ -743,7 +915,7 @@ __verify_dsk_col_var( WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_KV *unpack, _unpack; WT_DECL_RET; uint32_t cell_num, cell_type, i; uint8_t *end; @@ -759,11 +931,11 @@ __verify_dsk_col_var( last.deleted = false; cell_num = 0; - WT_CELL_FOREACH_VRFY (btree, dsk, cell, unpack, i) { + WT_CELL_FOREACH_VRFY (session, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. */ - ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, end); + ret = __wt_cell_unpack_safe(session, dsk, cell, NULL, unpack, end); if (ret != 0) return (__err_cell_corrupt(session, ret, cell_num, tag)); @@ -773,7 +945,7 @@ __verify_dsk_col_var( cell_type = unpack->type; /* Check the validity window. */ - WT_RET(__verify_dsk_validity(session, unpack, cell_num, addr, tag, dsk)); + WT_RET(__verify_dsk_value_validity(session, unpack, cell_num, addr, tag, dsk)); /* Check if any referenced item is entirely in the file. 
*/ if (cell_type == WT_CELL_VALUE_OVFL) { @@ -797,8 +969,7 @@ __verify_dsk_col_var( memcmp(last.data, unpack->data, last.size) == 0) match_err: WT_RET_VRFY(session, "data entries %" PRIu32 " and %" PRIu32 - " on page at %s are identical and should " - "have been run-length encoded", + " on page at %s are identical and should have been run-length encoded", cell_num - 1, cell_num, tag); __wt_time_window_copy(&last.tw, &unpack->tw); @@ -841,9 +1012,8 @@ __verify_dsk_memsize( len = WT_PTRDIFF((uint8_t *)dsk + dsk->mem_size, cell); if (len == 0) return (0); - WT_RET_VRFY(session, "%s page at %s has %" WT_SIZET_FMT - " unexpected bytes of data " - "after the last cell", + WT_RET_VRFY(session, + "%s page at %s has %" WT_SIZET_FMT " unexpected bytes of data after the last cell", __wt_page_type_string(dsk->type), tag, len); } @@ -862,7 +1032,7 @@ __verify_dsk_chunk( end = (uint8_t *)dsk + dsk->mem_size; /* - * Fixed-length column-store and overflow pages are simple chunks of data. Verify the data + * Fixed-length column-store and overflow pages are simple chunks of data-> Verify the data * doesn't overflow the end of the page. */ p = WT_PAGE_HEADER_BYTE(btree, dsk); @@ -896,19 +1066,17 @@ __err_cell_corrupt(WT_SESSION_IMPL *session, int retval, uint32_t entry_num, con static int __err_cell_corrupt_or_eof(WT_SESSION_IMPL *session, int retval, uint32_t entry_num, const char *tag) { - WT_RET_VRFY_RETVAL(session, retval, "item %" PRIu32 - " on page at %s is a corrupted cell or references " - "non-existent file pages", + WT_RET_VRFY_RETVAL(session, retval, + "item %" PRIu32 " on page at %s is a corrupted cell or references non-existent file pages", entry_num, tag); } /* - * __err_cell_type -- - * Generic illegal cell type for a particular page type error. + * __wt_cell_type_check -- + * Check the cell type against the page type. 
*/ -static int -__err_cell_type(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag, uint8_t cell_type, - uint8_t dsk_type) +bool +__wt_cell_type_check(uint8_t cell_type, uint8_t dsk_type) { switch (cell_type) { case WT_CELL_ADDR_DEL: @@ -916,22 +1084,22 @@ __err_cell_type(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag, u case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: if (dsk_type == WT_PAGE_COL_INT || dsk_type == WT_PAGE_ROW_INT) - return (0); + return (true); break; case WT_CELL_DEL: if (dsk_type == WT_PAGE_COL_VAR) - return (0); + return (true); break; case WT_CELL_KEY: case WT_CELL_KEY_OVFL: case WT_CELL_KEY_SHORT: if (dsk_type == WT_PAGE_ROW_INT || dsk_type == WT_PAGE_ROW_LEAF) - return (0); + return (true); break; case WT_CELL_KEY_PFX: case WT_CELL_KEY_SHORT_PFX: if (dsk_type == WT_PAGE_ROW_LEAF) - return (0); + return (true); break; case WT_CELL_KEY_OVFL_RM: case WT_CELL_VALUE_OVFL_RM: @@ -944,13 +1112,23 @@ __err_cell_type(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag, u case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_SHORT: if (dsk_type == WT_PAGE_COL_VAR || dsk_type == WT_PAGE_ROW_LEAF) - return (0); - break; - default: + return (true); break; } + return (false); +} - WT_RET_VRFY(session, "illegal cell and page type combination: cell %" PRIu32 - " on page at %s is a %s cell on a %s page", - entry_num, tag, __wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type)); +/* + * __err_cell_type -- + * Generic illegal cell type for a particular page type error. 
+ */ +static int +__err_cell_type(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag, uint8_t cell_type, + uint8_t dsk_type) +{ + if (!__wt_cell_type_check(cell_type, dsk_type)) + WT_RET_VRFY(session, "illegal cell and page type combination: cell %" PRIu32 + " on page at %s is a %s cell on a %s page", + entry_num, tag, __wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type)); + return (0); } diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c index 8db18398831..b9113df8f44 100644 --- a/src/third_party/wiredtiger/src/btree/row_key.c +++ b/src/third_party/wiredtiger/src/btree/row_key.c @@ -120,7 +120,7 @@ __wt_row_leaf_key_copy(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ /* * __wt_row_leaf_key_work -- - * Return a reference to, a row-store leaf-page key, optionally instantiate the key into the + * Return a reference to a row-store leaf-page key, optionally instantiate the key into the * in-memory page. */ int @@ -130,7 +130,7 @@ __wt_row_leaf_key_work( enum { FORWARD, BACKWARD } direction; WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_KV *unpack, _unpack; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_IKEY *ikey; @@ -253,7 +253,7 @@ switch_and_jump: /* * It must be an on-page cell, unpack it. */ - __wt_cell_unpack(session, page, cell, unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, unpack); /* 3: the test for an on-page reference to an overflow key. 
*/ if (unpack->type == WT_CELL_KEY_OVFL) { @@ -272,7 +272,7 @@ switch_and_jump: __wt_readlock(session, &btree->ovfl_lock); copy = WT_ROW_KEY_COPY(rip); if (!__wt_row_leaf_key_info(page, copy, NULL, &cell, &keyb->data, &keyb->size)) { - __wt_cell_unpack(session, page, cell, unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, unpack); ret = __wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb); } __wt_readunlock(session, &btree->ovfl_lock); diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c index ed1db846793..ea4858b0c30 100644 --- a/src/third_party/wiredtiger/src/history/hs.c +++ b/src/third_party/wiredtiger/src/history/hs.c @@ -1070,7 +1070,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma */ WT_ERR(__wt_buf_set(session, &upd_value->buf, hs_value->data, hs_value->size)); skip_buf: - upd_value->start_ts = hs_start_ts; + upd_value->durable_ts = durable_timestamp; upd_value->txnid = WT_TXN_NONE; upd_value->type = upd_type; upd_value->prepare_state = diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 1bcca8dc686..e466f5716f8 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -1111,7 +1111,7 @@ struct __wt_update { */ struct __wt_update_value { WT_ITEM buf; - wt_timestamp_t start_ts; + wt_timestamp_t durable_ts; uint64_t txnid; uint8_t type; uint8_t prepare_state; diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 7b3ff5b8f3d..4e6780566d7 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -728,7 +728,7 @@ __wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep) * Set a WT_REF to reference an on-page key. 
*/ static inline void -__wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack) +__wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK_ADDR *unpack) { uintptr_t v; @@ -930,7 +930,7 @@ __wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell) * Set a WT_ROW to reference an on-page row-store leaf key. */ static inline void -__wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) +__wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack) { uintptr_t v; @@ -948,7 +948,7 @@ __wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) * Set a WT_ROW to reference an on-page row-store leaf value. */ static inline void -__wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) +__wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack) { uintptr_t key_len, key_offset, value_offset, v; @@ -1017,10 +1017,10 @@ __wt_row_leaf_key( */ static inline void __wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, - WT_CELL_UNPACK *kpack, WT_CELL_UNPACK *vpack) + WT_CELL_UNPACK_KV *kpack, WT_CELL_UNPACK_KV *vpack) { WT_CELL *kcell, *vcell; - WT_CELL_UNPACK unpack; + WT_CELL_UNPACK_KV unpack; size_t size; void *copy, *key; @@ -1045,12 +1045,12 @@ __wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, if (__wt_row_leaf_key_info(page, copy, NULL, &kcell, &key, &size) && kcell == NULL) vcell = (WT_CELL *)((uint8_t *)key + size); else { - __wt_cell_unpack(session, page, kcell, &unpack); + __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack); vcell = (WT_CELL *)((uint8_t *)unpack.cell + __wt_cell_total_len(&unpack)); } } - __wt_cell_unpack(session, page, __wt_cell_leaf_value_parse(page, vcell), vpack); + __wt_cell_unpack_kv(session, page->dsk, __wt_cell_leaf_value_parse(page, vcell), vpack); } /* @@ -1094,7 +1094,7 @@ static inline bool __wt_ref_addr_copy(WT_SESSION_IMPL *session, WT_REF *ref, WT_ADDR_COPY 
*copy) { WT_ADDR *addr; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_ADDR *unpack, _unpack; WT_PAGE *page; unpack = &_unpack; @@ -1122,7 +1122,7 @@ __wt_ref_addr_copy(WT_SESSION_IMPL *session, WT_REF *ref, WT_ADDR_COPY *copy) } /* If on-page, the pointer references a cell. */ - __wt_cell_unpack(session, page, (WT_CELL *)addr, unpack); + __wt_cell_unpack_addr(session, page->dsk, (WT_CELL *)addr, unpack); __wt_time_aggregate_copy(&copy->ta, &unpack->ta); copy->type = 0; /* Avoid static analyzer uninitialized value complaints. */ switch (unpack->raw) { @@ -1696,7 +1696,7 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32 */ static inline int __wt_bt_col_var_cursor_walk_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_PAGE *page, - WT_CELL_UNPACK *unpack, WT_COL *cip) + WT_CELL_UNPACK_KV *unpack, WT_COL *cip) { cbt->slot = WT_COL_SLOT(page, cip); WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, NULL, unpack)); diff --git a/src/third_party/wiredtiger/src/include/cell.h b/src/third_party/wiredtiger/src/include/cell.h index 760811e5acf..d7a5452dbe1 100644 --- a/src/third_party/wiredtiger/src/include/cell.h +++ b/src/third_party/wiredtiger/src/include/cell.h @@ -144,37 +144,64 @@ struct __wt_cell { uint8_t __chunk[1 + 1 + 1 + 7 * WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE]; }; +/* AUTOMATIC FLAG VALUE GENERATION START */ +#define WT_CELL_UNPACK_OVERFLOW 0x1u /* cell is an overflow */ +#define WT_CELL_UNPACK_PREPARE 0x2u /* cell is part of a prepared transaction */ +#define WT_CELL_UNPACK_TIME_WINDOW_CLEARED 0x4u /* time window cleared because of restart */ + /* AUTOMATIC FLAG VALUE GENERATION STOP */ + /* - * WT_CELL_UNPACK -- - * Unpacked cell. + * We have two "unpacked cell" structures: one holding unpacked cells from internal nodes + * (address pages), and one holding unpacked cells from leaf nodes (key/value pages).
They share a + * common set of initial fields: in a few places where a function has to handle both types of + * unpacked cells, the unpacked cell structures are cast to an "unpack-common" structure that can + * only reference shared fields. */ -struct __wt_cell_unpack { - WT_CELL *cell; /* Cell's disk image address */ +#define WT_CELL_COMMON_FIELDS \ + WT_CELL *cell; /* Cell's disk image address */ \ + \ + uint64_t v; /* RLE count or recno */ \ + \ + /* \ + * The size and __len fields are reasonably type size_t; don't change the type, performance \ + * drops significantly if they're type size_t. \ + */ \ + const void *data; /* Data */ \ + uint32_t size; /* Data size */ \ + \ + uint32_t __len; /* Cell + data length (usually) */ \ + \ + uint8_t prefix; /* Cell prefix length */ \ + \ + uint8_t raw; /* Raw cell type (include "shorts") */ \ + uint8_t type; /* Cell type */ \ + \ + uint8_t flags - WT_TIME_AGGREGATE ta; /* Address validity window */ - WT_TIME_WINDOW tw; /* Value validity window */ - - uint64_t v; /* RLE count or recno */ - - /* - * !!! - * The size and __len fields are reasonably type size_t; don't change - * the type, performance drops significantly if they're type size_t. - */ - const void *data; /* Data */ - uint32_t size; /* Data size */ +/* + * WT_CELL_UNPACK_COMMON -- + * Unpacked address cell, the common fields. + */ +struct __wt_cell_unpack_common { + WT_CELL_COMMON_FIELDS; +}; - uint32_t __len; /* Cell + data length (usually) */ +/* + * WT_CELL_UNPACK_ADDR -- + * Unpacked address cell. + */ +struct __wt_cell_unpack_addr { + WT_CELL_COMMON_FIELDS; - uint8_t prefix; /* Cell prefix length */ + WT_TIME_AGGREGATE ta; /* Address validity window */ +}; - uint8_t raw; /* Raw cell type (include "shorts") */ - uint8_t type; /* Cell type */ +/* + * WT_CELL_UNPACK_KV -- + * Unpacked value cell. 
+ */ +struct __wt_cell_unpack_kv { + WT_CELL_COMMON_FIELDS; -/* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_CELL_UNPACK_OVERFLOW 0x1u /* cell is an overflow */ -#define WT_CELL_UNPACK_PREPARE 0x2u /* cell is part of a prepared transaction */ -#define WT_CELL_UNPACK_TIME_WINDOW_CLEARED 0x4u /* time window cleared because of restart */ - /* AUTOMATIC FLAG VALUE GENERATION STOP */ - uint8_t flags; + WT_TIME_WINDOW tw; /* Value validity window */ }; diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i index f14eb7f8d15..427ca83b124 100644 --- a/src/third_party/wiredtiger/src/include/cell.i +++ b/src/third_party/wiredtiger/src/include/cell.i @@ -570,7 +570,7 @@ __wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type, WT_TI * Return the cell's RLE value. */ static inline uint64_t -__wt_cell_rle(WT_CELL_UNPACK *unpack) +__wt_cell_rle(WT_CELL_UNPACK_KV *unpack) { /* * Any item with only 1 occurrence is stored with an RLE of 0, that is, without any RLE at all. @@ -584,8 +584,12 @@ __wt_cell_rle(WT_CELL_UNPACK *unpack) * Return the cell's total length, including data. */ static inline size_t -__wt_cell_total_len(WT_CELL_UNPACK *unpack) +__wt_cell_total_len(void *unpack_arg) { + WT_CELL_UNPACK_COMMON *unpack; + + unpack = unpack_arg; + /* * The length field is specially named because it's dangerous to use it: it represents the * length of the current cell (normally used for the loop that walks through cells on the page), @@ -696,18 +700,31 @@ __wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell) } /* + * The verification code specifies an end argument, a pointer to 1B past the end-of-page. In which + * case, make sure all reads are inside the page image. If an error occurs, return an error code but + * don't output messages, our caller handles that. 
+ */ +#define WT_CELL_LEN_CHK(t, len) \ + do { \ + if (end != NULL && \ + ((uint8_t *)(t) < (uint8_t *)dsk || (((uint8_t *)(t)) + (len)) > (uint8_t *)end)) \ + return (WT_ERROR); \ + } while (0) + +/* * __wt_cell_unpack_safe -- * Unpack a WT_CELL into a structure, with optional boundary checks. */ static inline int __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell, - WT_CELL_UNPACK *unpack, const void *end) + WT_CELL_UNPACK_ADDR *unpack_addr, WT_CELL_UNPACK_KV *unpack_value, const void *end) { struct { uint64_t v; - WT_TIME_WINDOW tw; uint32_t len; + WT_TIME_WINDOW tw; } copy; + WT_CELL_UNPACK_COMMON *unpack; WT_TIME_AGGREGATE *ta; WT_TIME_WINDOW *tw; uint64_t v; @@ -719,20 +736,19 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CE copy.len = 0; /* [-Wconditional-uninitialized] */ copy.v = 0; /* [-Wconditional-uninitialized] */ - tw = &unpack->tw; - ta = &unpack->ta; + if (unpack_addr == NULL) { + unpack = (WT_CELL_UNPACK_COMMON *)unpack_value; + tw = &unpack_value->tw; + __wt_time_window_init(tw); + ta = NULL; + } else { + WT_ASSERT(session, unpack_value == NULL); -/* - * The verification code specifies an end argument, a pointer to 1B past the end-of-page. In which - * case, make sure all reads are inside the page image. If an error occurs, return an error code but - * don't output messages, our caller handles that. 
- */ -#define WT_CELL_LEN_CHK(t, len) \ - do { \ - if (end != NULL && \ - ((uint8_t *)(t) < (uint8_t *)dsk || (((uint8_t *)(t)) + (len)) > (uint8_t *)end)) \ - return (WT_ERROR); \ - } while (0) + unpack = (WT_CELL_UNPACK_COMMON *)unpack_addr; + ta = &unpack_addr->ta; + __wt_time_aggregate_init(ta); + tw = NULL; + } /* * NB: when unpacking a WT_CELL_VALUE_COPY cell, unpack.cell is returned as the original cell, @@ -741,7 +757,7 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CE */ unpack->cell = cell; -restart: +copy_cell_restart: WT_CELL_LEN_CHK(cell, 0); /* @@ -751,15 +767,14 @@ restart: * following switch. All validity windows default to durability. */ unpack->v = 0; - __wt_time_window_init(&unpack->tw); - __wt_time_aggregate_init(&unpack->ta); unpack->raw = (uint8_t)__wt_cell_type_raw(cell); unpack->type = (uint8_t)__wt_cell_type(cell); unpack->flags = 0; /* - * Handle cells with none of RLE counts, validity window or data length: short key/data cells - * have 6 bits of data length in the descriptor byte and nothing else. + * Handle cells with none of RLE counts, validity window or data length: WT_CELL_KEY_SHORT_PFX, + * WT_CELL_KEY_SHORT and WT_CELL_VALUE_SHORT. Short key/data cells have 6 bits of data length in + * the descriptor byte and nothing else */ switch (unpack->raw) { case WT_CELL_KEY_SHORT_PFX: @@ -768,14 +783,14 @@ restart: unpack->data = cell->__chunk + 2; unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT; unpack->__len = 2 + unpack->size; - goto done; + goto done; /* Handle copy cells. */ case WT_CELL_KEY_SHORT: case WT_CELL_VALUE_SHORT: unpack->prefix = 0; unpack->data = cell->__chunk + 1; unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT; unpack->__len = 1 + unpack->size; - goto done; + goto done; /* Handle copy cells. 
*/ } unpack->prefix = 0; @@ -800,6 +815,14 @@ restart: case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: + /* + * Skip if we know we're not unpacking a cell of this type. This is all inlined code, and + * ideally checking allows the compiler to discard big chunks of it. + */ + WT_ASSERT(session, unpack_value == NULL); + if (unpack_value != NULL) + break; + if ((cell->__chunk[0] & WT_CELL_SECOND_DESC) == 0) break; flags = *p++; /* skip second descriptor byte */ @@ -842,6 +865,14 @@ restart: case WT_CELL_VALUE_COPY: case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_OVFL_RM: + /* + * Skip if we know we're not unpacking a cell of this type. This is all inlined code, and + * ideally checking allows the compiler to discard big chunks of it. + */ + WT_ASSERT(session, unpack_addr == NULL); + if (unpack_addr != NULL) + break; + if ((cell->__chunk[0] & WT_CELL_SECOND_DESC) == 0) break; flags = *p++; /* skip second descriptor byte */ @@ -895,6 +926,14 @@ restart: */ switch (unpack->raw) { case WT_CELL_VALUE_COPY: + /* + * Skip if we know we're not unpacking a cell of this type. This is all inlined code, and + * ideally checking allows the compiler to discard big chunks of it. + */ + WT_ASSERT(session, unpack_addr == NULL); + if (unpack_addr != NULL) + break; + copy_cell = true; /* @@ -902,12 +941,13 @@ restart: * visibility window, length and RLE of this cell, we need the length to step through the * set of cells on the page and the RLE and timestamp information are specific to this cell. */ - __wt_time_window_copy(&copy.tw, tw); WT_RET(__wt_vunpack_uint(&p, end == NULL ?
0 : WT_PTRDIFF(end, p), &v)); copy.v = unpack->v; copy.len = WT_PTRDIFF32(p, cell); + tw = &copy.tw; + __wt_time_window_init(tw); cell = (WT_CELL *)((uint8_t *)cell - v); - goto restart; + goto copy_cell_restart; case WT_CELL_KEY_OVFL: case WT_CELL_KEY_OVFL_RM: @@ -954,57 +994,34 @@ restart: done: /* - * Check the original cell against the full cell length (this is a diagnostic as well, we may be - * copying the cell from the page and we need the right length). + * Skip if we know we're not unpacking a cell of this type. This is all inlined code, and + * ideally checking allows the compiler to discard big chunks of it. */ - WT_CELL_LEN_CHK(cell, unpack->__len); - if (copy_cell) { - __wt_time_window_copy(tw, &copy.tw); + if (unpack_addr == NULL && copy_cell) { unpack->v = copy.v; unpack->__len = copy.len; unpack->raw = WT_CELL_VALUE_COPY; } + /* + * Check the original cell against the full cell length (this is a diagnostic as well, we may be + * copying the cell from the page and we need the right length). + */ + WT_CELL_LEN_CHK(cell, unpack->__len); return (0); } /* - * __wt_cell_unpack_dsk -- - * Unpack a WT_CELL into a structure. + * __cell_unpack_window_cleanup -- + * Clean up cells loaded from a previous run. */ static inline void -__wt_cell_unpack_dsk( - WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell, WT_CELL_UNPACK *unpack) +__cell_unpack_window_cleanup(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, + WT_CELL_UNPACK_ADDR *unpack_addr, WT_CELL_UNPACK_KV *unpack_kv) { WT_TIME_AGGREGATE *ta; WT_TIME_WINDOW *tw; - ta = &unpack->ta; - tw = &unpack->tw; - - /* - * Row-store doesn't store zero-length values on pages, but this allows us to pretend. - */ - if (cell == NULL) { - unpack->cell = NULL; - unpack->v = 0; - /* - * If there isn't any value validity window (which is what it will take to get to a - * zero-length item), the value must be stable.
- */ - __wt_time_window_init(tw); - __wt_time_aggregate_init(ta); - unpack->data = ""; - unpack->size = 0; - unpack->__len = 0; - unpack->prefix = 0; - unpack->raw = unpack->type = WT_CELL_VALUE; - unpack->flags = 0; - return; - } - - WT_IGNORE_RET(__wt_cell_unpack_safe(session, dsk, cell, unpack, NULL)); - /* * If the page came from a previous run, reset the transaction ids to "none" and timestamps to 0 * as appropriate. Transaction ids shouldn't persist between runs so these are always set to @@ -1020,42 +1037,85 @@ __wt_cell_unpack_dsk( * Current startup txnid=x, ts=y txnid=x, ts=WT_TS_NONE txnid=MAX, ts=MAX * Previous startup txnid=0, ts=y txnid=0, ts=WT_TS_NONE txnid=MAX, ts=MAX */ - if (dsk->write_gen > 0 && dsk->write_gen <= S2C(session)->base_write_gen) { - /* FIXME-WT-6124: deal with durable timestamps. */ - /* Tell reconciliation we cleared the transaction ids and the cell needs to be rebuilt. */ - if (tw->start_txn != WT_TXN_NONE) { - tw->start_txn = WT_TXN_NONE; - F_SET(unpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); - } - if (tw->stop_txn != WT_TXN_MAX) { - tw->stop_txn = WT_TXN_NONE; - F_SET(unpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); - if (tw->stop_ts == WT_TS_MAX) - tw->stop_ts = WT_TS_NONE; - } else - WT_ASSERT(session, tw->stop_ts == WT_TS_MAX); + if (dsk->write_gen == 0 || dsk->write_gen > S2C(session)->base_write_gen) + return; + + /* Tell reconciliation we cleared the transaction ids and the cell needs to be rebuilt. */ + /* FIXME-WT-6124: deal with durable timestamps. 
*/ + if (unpack_addr != NULL) { + ta = &unpack_addr->ta; if (ta->oldest_start_txn != WT_TXN_NONE) { ta->oldest_start_txn = WT_TXN_NONE; - F_SET(unpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); + F_SET(unpack_addr, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); } if (ta->newest_stop_txn != WT_TXN_MAX) { ta->newest_stop_txn = WT_TXN_NONE; - F_SET(unpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); + F_SET(unpack_addr, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); if (ta->newest_stop_ts == WT_TS_MAX) ta->newest_stop_ts = WT_TS_NONE; } else WT_ASSERT(session, ta->newest_stop_ts == WT_TS_MAX); } + if (unpack_kv != NULL) { + tw = &unpack_kv->tw; + if (tw->start_txn != WT_TXN_NONE) { + tw->start_txn = WT_TXN_NONE; + F_SET(unpack_kv, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); + } + if (tw->stop_txn != WT_TXN_MAX) { + tw->stop_txn = WT_TXN_NONE; + F_SET(unpack_kv, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); + if (tw->stop_ts == WT_TS_MAX) + tw->stop_ts = WT_TS_NONE; + } else + WT_ASSERT(session, tw->stop_ts == WT_TS_MAX); + } } /* - * __wt_cell_unpack -- - * Unpack a WT_CELL into a structure. + * __wt_cell_unpack_addr -- + * Unpack an address WT_CELL into a structure. */ static inline void -__wt_cell_unpack(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, WT_CELL_UNPACK *unpack) +__wt_cell_unpack_addr(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell, + WT_CELL_UNPACK_ADDR *unpack_addr) { - __wt_cell_unpack_dsk(session, page->dsk, cell, unpack); + WT_IGNORE_RET(__wt_cell_unpack_safe(session, dsk, cell, unpack_addr, NULL, NULL)); + __cell_unpack_window_cleanup(session, dsk, unpack_addr, NULL); +} + +/* + * __wt_cell_unpack_kv -- + * Unpack a value WT_CELL into a structure. + */ +static inline void +__wt_cell_unpack_kv(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell, + WT_CELL_UNPACK_KV *unpack_value) +{ + /* + * Row-store doesn't store zero-length values on pages, but this allows us to pretend. 
+ */ + if (cell == NULL) { + unpack_value->cell = NULL; + unpack_value->v = 0; + unpack_value->data = ""; + unpack_value->size = 0; + unpack_value->__len = 0; + unpack_value->prefix = 0; + unpack_value->raw = unpack_value->type = WT_CELL_VALUE; + unpack_value->flags = 0; + + /* + * If there isn't any value validity window (which is what it will take to get to a + * zero-length item), the value must be stable. + */ + __wt_time_window_init(&unpack_value->tw); + + return; + } + + WT_IGNORE_RET(__wt_cell_unpack_safe(session, dsk, cell, NULL, unpack_value, NULL)); + __cell_unpack_window_cleanup(session, dsk, NULL, unpack_value); } /* @@ -1063,8 +1123,8 @@ __wt_cell_unpack(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, WT_CELL * Set a buffer to reference the data from an unpacked cell. */ static inline int -__cell_data_ref( - WT_SESSION_IMPL *session, WT_PAGE *page, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store) +__cell_data_ref(WT_SESSION_IMPL *session, WT_PAGE *page, int page_type, + WT_CELL_UNPACK_COMMON *unpack, WT_ITEM *store) { WT_BTREE *btree; bool decoded; @@ -1119,9 +1179,12 @@ __cell_data_ref( * be of type WT_CELL_VALUE_OVFL_RM, and calling the "page" version means it might be. */ static inline int -__wt_dsk_cell_data_ref( - WT_SESSION_IMPL *session, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store) +__wt_dsk_cell_data_ref(WT_SESSION_IMPL *session, int page_type, void *unpack_arg, WT_ITEM *store) { + WT_CELL_UNPACK_COMMON *unpack; + + unpack = unpack_arg; + WT_ASSERT(session, __wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM); return (__cell_data_ref(session, NULL, page_type, unpack, store)); } @@ -1131,23 +1194,30 @@ __wt_dsk_cell_data_ref( * Set a buffer to reference the data from an unpacked cell. 
*/ static inline int -__wt_page_cell_data_ref( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store) +__wt_page_cell_data_ref(WT_SESSION_IMPL *session, WT_PAGE *page, void *unpack_arg, WT_ITEM *store) { - return (__cell_data_ref(session, page, page->type, unpack, store)); + return (__cell_data_ref(session, page, page->type, unpack_arg, store)); } /* * WT_CELL_FOREACH -- * Walk the cells on a page. */ -#define WT_CELL_FOREACH_BEGIN(session, btree, dsk, unpack) \ - do { \ - uint32_t __i; \ - uint8_t *__cell; \ - for (__cell = WT_PAGE_HEADER_BYTE(btree, dsk), __i = (dsk)->u.entries; __i > 0; \ - __cell += (unpack).__len, --__i) { \ - __wt_cell_unpack_dsk(session, dsk, (WT_CELL *)__cell, &(unpack)); +#define WT_CELL_FOREACH_ADDR(session, dsk, unpack) \ + do { \ + uint32_t __i; \ + uint8_t *__cell; \ + for (__cell = WT_PAGE_HEADER_BYTE(S2BT(session), dsk), __i = (dsk)->u.entries; __i > 0; \ + __cell += (unpack).__len, --__i) { \ + __wt_cell_unpack_addr(session, dsk, (WT_CELL *)__cell, &(unpack)); + +#define WT_CELL_FOREACH_KV(session, dsk, unpack) \ + do { \ + uint32_t __i; \ + uint8_t *__cell; \ + for (__cell = WT_PAGE_HEADER_BYTE(S2BT(session), dsk), __i = (dsk)->u.entries; __i > 0; \ + __cell += (unpack).__len, --__i) { \ + __wt_cell_unpack_kv(session, dsk, (WT_CELL *)__cell, &(unpack)); #define WT_CELL_FOREACH_END \ } \ diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 81cc28feb08..d3237ad91d2 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -409,7 +409,7 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) */ static inline int __cursor_row_slot_key_return( - WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_CELL_UNPACK *kpack, bool *kpack_used) + WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_CELL_UNPACK_KV *kpack, bool *kpack_used) { WT_BTREE *btree; WT_CELL *cell; @@ -454,7 +454,7 @@ __cursor_row_slot_key_return( * 
compiler complaining about uninitialized field use. */ memset(kpack, 0, sizeof(*kpack)); - __wt_cell_unpack(session, page, cell, kpack); + __wt_cell_unpack_kv(session, page->dsk, cell, kpack); *kpack_used = true; if (kpack->type == WT_CELL_KEY && cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) { WT_ASSERT(session, cbt->row_key->size >= kpack->prefix); diff --git a/src/third_party/wiredtiger/src/include/error.h b/src/third_party/wiredtiger/src/include/error.h index 76c644a2850..39cce1e5b34 100644 --- a/src/third_party/wiredtiger/src/include/error.h +++ b/src/third_party/wiredtiger/src/include/error.h @@ -166,6 +166,9 @@ } while (0) #endif +/* Verbose messages. */ +#define WT_VERBOSE_ISSET(session, flag) (FLD_ISSET(S2C(session)->verbose, flag)) + /* * __wt_verbose -- * Display a verbose message. Not an inlined function because you can't inline functions taking @@ -174,8 +177,8 @@ * additional argument, there's no portable way to remove the comma before an empty __VA_ARGS__ * value. */ -#define __wt_verbose(session, flag, fmt, ...) \ - do { \ - if (WT_VERBOSE_ISSET(session, flag)) \ - __wt_verbose_worker(session, fmt, __VA_ARGS__); \ +#define __wt_verbose(session, flag, fmt, ...) 
\ + do { \ + if (WT_VERBOSE_ISSET(session, flag)) \ + __wt_verbose_worker(session, "[" #flag "] " fmt, __VA_ARGS__); \ } while (0) diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index a5dfb85bf89..81e6a7c81aa 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -11,6 +11,8 @@ extern bool __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_btree_immediately_durable(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_cell_type_check(uint8_t cell_type, uint8_t dsk_type) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_checksum_alt_match(const void *chunk, size_t len, uint32_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) @@ -595,7 +597,7 @@ extern int __wt_debug_cursor_tree_hs(void *cursor_arg, const char *ofile) extern int __wt_debug_disk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_key_value(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, - uint64_t rle, WT_CELL_UNPACK *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + uint64_t rle, WT_CELL_UNPACK_KV *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_mode_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, @@ -1126,9 +1128,9 @@ extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *c WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, +extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK_COMMON *unpack, WT_ITEM *store, bool *decoded) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack) +extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK_KV *unpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size) @@ -1206,7 +1208,7 @@ extern int __wt_rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) extern int __wt_rec_split_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint64_t max) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, - void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE_SELECT *upd_select) + void *ripcip, WT_CELL_UNPACK_KV *vpack, WT_UPDATE_SELECT *upd_select) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -1845,7 +1847,7 @@ static inline bool __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id, static inline double __wt_eviction_dirty_target(WT_CACHE *cache) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_bt_col_var_cursor_walk_txn_read(WT_SESSION_IMPL *session, - WT_CURSOR_BTREE *cbt, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_COL *cip) + WT_CURSOR_BTREE *cbt, WT_PAGE *page, WT_CELL_UNPACK_KV *unpack, WT_COL *cip) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_btree_block_free(WT_SESSION_IMPL *session, const 
uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -1866,7 +1868,7 @@ static inline int __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, static inline int __wt_cell_pack_value_match(WT_CELL *page_cell, WT_CELL *val_cell, const uint8_t *val_data, bool *matchp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, - WT_CELL *cell, WT_CELL_UNPACK *unpack, const void *end) + WT_CELL *cell, WT_CELL_UNPACK_ADDR *unpack_addr, WT_CELL_UNPACK_KV *unpack_value, const void *end) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size, @@ -1882,8 +1884,8 @@ static inline int __wt_curindex_get_valuev(WT_CURSOR *cursor, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_curtable_get_valuev(WT_CURSOR *cursor, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline int __wt_dsk_cell_data_ref(WT_SESSION_IMPL *session, int page_type, - WT_CELL_UNPACK *unpack, WT_ITEM *store) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline int __wt_dsk_cell_data_ref(WT_SESSION_IMPL *session, int page_type, void *unpack_arg, + WT_ITEM *store) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_extlist_read_pair(const uint8_t **p, wt_off_t *offp, wt_off_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_extlist_write_pair(uint8_t **p, wt_off_t off, wt_off_t size) @@ -1934,8 +1936,8 @@ static inline int __wt_lex_compare_skip(const WT_ITEM *user_item, const WT_ITEM size_t *matchp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline int 
__wt_page_cell_data_ref(WT_SESSION_IMPL *session, WT_PAGE *page, - WT_CELL_UNPACK *unpack, WT_ITEM *store) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline int __wt_page_cell_data_ref(WT_SESSION_IMPL *session, WT_PAGE *page, void *unpack_arg, + WT_ITEM *store) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_page_parent_modify_set(WT_SESSION_IMPL *session, WT_REF *ref, bool page_only) @@ -2007,7 +2009,7 @@ static inline int __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF * static inline int __wt_txn_op_set_key(WT_SESSION_IMPL *session, const WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, - uint64_t recno, WT_UPDATE *upd, WT_CELL_UNPACK *vpack) + uint64_t recno, WT_UPDATE *upd, WT_CELL_UNPACK_KV *vpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -2061,7 +2063,7 @@ static inline size_t __wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell WT_TIME_WINDOW *tw, uint64_t rle, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_cell_pack_value(WT_SESSION_IMPL *session, WT_CELL *cell, WT_TIME_WINDOW *tw, uint64_t rle, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline size_t __wt_cell_total_len(WT_CELL_UNPACK *unpack) +static inline size_t __wt_cell_total_len(void *unpack_arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_strnlen(const char *s, size_t maxlen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -2104,7 +2106,7 @@ static inline uint64_t __wt_cache_pages_inuse(WT_CACHE *cache) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline uint64_t __wt_cache_read_gen(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline uint64_t __wt_cell_rle(WT_CELL_UNPACK *unpack) +static inline uint64_t __wt_cell_rle(WT_CELL_UNPACK_KV *unpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline uint64_t __wt_clock(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -2136,10 +2138,10 @@ static inline void __wt_cache_update_hs_score( WT_SESSION_IMPL *session, u_int updates_seen, u_int updates_unstable); static inline void __wt_cell_type_reset( WT_SESSION_IMPL *session, WT_CELL *cell, u_int old_type, u_int new_type); -static inline void __wt_cell_unpack( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, WT_CELL_UNPACK *unpack); -static inline void __wt_cell_unpack_dsk( - WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell, WT_CELL_UNPACK *unpack); +static inline void __wt_cell_unpack_addr(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, + WT_CELL *cell, WT_CELL_UNPACK_ADDR *unpack_addr); +static inline void __wt_cell_unpack_kv(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, + WT_CELL *cell, WT_CELL_UNPACK_KV *unpack_value); static inline void __wt_check_addr_validity(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta); static inline void __wt_cond_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *)); @@ -2155,18 +2157,18 @@ static inline void __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) static inline void __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page); static inline void __wt_rec_addr_ts_init(WT_RECONCILE *r, WT_TIME_AGGREGATE *ta); static inline void __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, - WT_ADDR *addr, WT_CELL_UNPACK *vpack, bool proxy_cell, uint64_t recno); + WT_ADDR *addr, WT_CELL_UNPACK_ADDR *vpack, bool proxy_cell, uint64_t 
recno); static inline void __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv); static inline void __wt_rec_incr( WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size); static inline void __wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep); static inline void __wt_ref_key_clear(WT_REF *ref); -static inline void __wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack); -static inline void __wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack); +static inline void __wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK_ADDR *unpack); +static inline void __wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack); static inline void __wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell); static inline void __wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, - WT_CELL_UNPACK *kpack, WT_CELL_UNPACK *vpack); -static inline void __wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack); + WT_CELL_UNPACK_KV *kpack, WT_CELL_UNPACK_KV *vpack); +static inline void __wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK_KV *unpack); static inline void __wt_scr_free(WT_SESSION_IMPL *session, WT_ITEM **bufp); static inline void __wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp); static inline void __wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp); diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index b2df8478dd7..45dd0b42960 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -235,9 +235,6 @@ } \ } while (0) -/* Verbose messages. */ -#define WT_VERBOSE_ISSET(session, f) (FLD_ISSET(S2C(session)->verbose, f)) - #define WT_CLEAR(s) memset(&(s), 0, sizeof(s)) /* Check if a string matches a prefix. 
*/ diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index 647c015e26e..a8d0c205aad 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -31,7 +31,6 @@ struct __wt_reconcile { /* Track the page's min/maximum transactions. */ uint64_t max_txn; wt_timestamp_t max_ts; - wt_timestamp_t max_ondisk_ts; wt_timestamp_t min_skipped_ts; u_int updates_seen; /* Count of updates seen. */ diff --git a/src/third_party/wiredtiger/src/include/reconcile.i b/src/third_party/wiredtiger/src/include/reconcile.i index 3f9339a81ab..8da14bc93ad 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.i +++ b/src/third_party/wiredtiger/src/include/reconcile.i @@ -122,7 +122,7 @@ __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv) */ static inline void __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *addr, - WT_CELL_UNPACK *vpack, bool proxy_cell, uint64_t recno) + WT_CELL_UNPACK_ADDR *vpack, bool proxy_cell, uint64_t recno) { WT_REC_KV *val; u_int cell_type; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 0cd8c89c9a7..90858eb6950 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -580,7 +580,11 @@ __wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd) if (upd->prepare_state == WT_PREPARE_LOCKED || upd->prepare_state == WT_PREPARE_INPROGRESS) return (false); - return (__wt_txn_visible_all(session, upd->txnid, upd->start_ts)); + /* + * This function is used to determine when an update is obsolete: that should take into account + * the durable timestamp which is greater than or equal to the start timestamp. 
+ */ + return (__wt_txn_visible_all(session, upd->txnid, upd->durable_ts)); } /* @@ -594,7 +598,7 @@ __wt_txn_upd_value_visible_all(WT_SESSION_IMPL *session, WT_UPDATE_VALUE *upd_va upd_value->prepare_state == WT_PREPARE_INPROGRESS) return (false); - return (__wt_txn_visible_all(session, upd_value->txnid, upd_value->start_ts)); + return (__wt_txn_visible_all(session, upd_value->txnid, upd_value->durable_ts)); } /* @@ -837,7 +841,7 @@ __wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE */ static inline int __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, - WT_UPDATE *upd, WT_CELL_UNPACK *vpack) + WT_UPDATE *upd, WT_CELL_UNPACK_KV *vpack) { WT_TIME_WINDOW tw; @@ -873,10 +877,10 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint __wt_txn_visible(session, tw.stop_txn, tw.stop_ts) && ((!F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) && (!WT_IS_HS(S2BT(session)) || !F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))) || - __wt_txn_visible_all(session, tw.stop_txn, tw.stop_ts))) { + __wt_txn_visible_all(session, tw.stop_txn, tw.durable_stop_ts))) { cbt->upd_value->buf.data = NULL; cbt->upd_value->buf.size = 0; - cbt->upd_value->start_ts = tw.stop_ts; + cbt->upd_value->durable_ts = tw.durable_stop_ts; cbt->upd_value->txnid = tw.stop_txn; cbt->upd_value->type = WT_UPDATE_TOMBSTONE; cbt->upd_value->prepare_state = WT_PREPARE_INIT; @@ -900,7 +904,7 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint cbt->upd_value->buf.data = NULL; cbt->upd_value->buf.size = 0; } - cbt->upd_value->start_ts = tw.start_ts; + cbt->upd_value->durable_ts = tw.durable_start_ts; cbt->upd_value->txnid = tw.start_txn; cbt->upd_value->type = WT_UPDATE_STANDARD; cbt->upd_value->prepare_state = WT_PREPARE_INIT; @@ -1276,7 +1280,7 @@ __wt_upd_value_assign(WT_UPDATE_VALUE *upd_value, WT_UPDATE *upd) upd_value->buf.data = upd->data; upd_value->buf.size = upd->size; } - 
upd_value->start_ts = upd->start_ts; + upd_value->durable_ts = upd->durable_ts; upd_value->txnid = upd->txnid; upd_value->type = upd->type; upd_value->prepare_state = upd->prepare_state; @@ -1295,7 +1299,7 @@ __wt_upd_value_clear(WT_UPDATE_VALUE *upd_value) */ upd_value->buf.data = NULL; upd_value->buf.size = 0; - upd_value->start_ts = WT_TS_NONE; + upd_value->durable_ts = WT_TS_NONE; upd_value->txnid = WT_TXN_NONE; upd_value->type = WT_UPDATE_INVALID; upd_value->prepare_state = WT_PREPARE_INIT; diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index bdf26c80663..113113f9e93 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -107,8 +107,12 @@ struct __wt_capacity; typedef struct __wt_capacity WT_CAPACITY; struct __wt_cell; typedef struct __wt_cell WT_CELL; -struct __wt_cell_unpack; -typedef struct __wt_cell_unpack WT_CELL_UNPACK; +struct __wt_cell_unpack_addr; +typedef struct __wt_cell_unpack_addr WT_CELL_UNPACK_ADDR; +struct __wt_cell_unpack_common; +typedef struct __wt_cell_unpack_common WT_CELL_UNPACK_COMMON; +struct __wt_cell_unpack_kv; +typedef struct __wt_cell_unpack_kv WT_CELL_UNPACK_KV; struct __wt_ckpt; typedef struct __wt_ckpt WT_CKPT; struct __wt_col; diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c index 18a4a16b556..c49a3f11bfa 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_col.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -192,7 +192,7 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) { WT_ADDR *addr; WT_BTREE *btree; - WT_CELL_UNPACK *vpack, _vpack; + WT_CELL_UNPACK_ADDR *vpack, _vpack; WT_CHILD_STATE state; WT_DECL_RET; WT_PAGE *child, *page; @@ -275,7 +275,7 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) if (addr == NULL && __wt_off_page(page, 
ref->addr)) addr = ref->addr; if (addr == NULL) { - __wt_cell_unpack(session, page, ref->addr, vpack); + __wt_cell_unpack_addr(session, page->dsk, ref->addr, vpack); val->buf.data = ref->addr; val->buf.size = __wt_cell_total_len(vpack); val->cell_len = 0; @@ -571,7 +571,7 @@ __wt_rec_col_var( } last; WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK *vpack, _vpack; + WT_CELL_UNPACK_KV *vpack, _vpack; WT_COL *cip; WT_CURSOR_BTREE *cbt; WT_DECL_ITEM(orig); @@ -654,7 +654,7 @@ __wt_rec_col_var( WT_COL_FOREACH (page, cip, i) { ovfl_state = OVFL_IGNORE; cell = WT_COL_PTR(page, cip); - __wt_cell_unpack(session, page, cell, vpack); + __wt_cell_unpack_kv(session, page->dsk, cell, vpack); nrepeat = __wt_cell_rle(vpack); ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip)); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c index 5365e077b65..66331a663b5 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_row.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c @@ -294,7 +294,7 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_ADDR *addr; WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack; + WT_CELL_UNPACK_ADDR *kpack, _kpack, *vpack, _vpack; WT_CHILD_STATE state; WT_DECL_RET; WT_IKEY *ikey; @@ -353,7 +353,7 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) ikey = __wt_ref_key_instantiated(ref); if (ikey != NULL && ikey->cell_offset != 0) { cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset); - __wt_cell_unpack(session, page, cell, kpack); + __wt_cell_unpack_addr(session, page->dsk, cell, kpack); key_onpage_ovfl = F_ISSET(kpack, WT_CELL_UNPACK_OVERFLOW) && kpack->raw != WT_CELL_KEY_OVFL_RM; } @@ -433,7 +433,7 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) __wt_rec_cell_build_addr(session, r, addr, NULL, state == WT_CHILD_PROXY, WT_RECNO_OOB); __wt_time_aggregate_copy(&ta, &addr->ta); } else 
{ - __wt_cell_unpack(session, page, ref->addr, vpack); + __wt_cell_unpack_addr(session, page->dsk, ref->addr, vpack); if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED)) { /* * The transaction ids are cleared after restart. Repack the cell with new validity @@ -535,7 +535,7 @@ __rec_row_zero_len(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw) */ return ((tw->stop_ts == WT_TS_MAX && tw->stop_txn == WT_TXN_MAX) && ((tw->start_ts == WT_TS_NONE && tw->start_txn == WT_TXN_NONE) || - __wt_txn_visible_all(session, tw->start_txn, tw->start_ts))); + __wt_txn_visible_all(session, tw->start_txn, tw->durable_start_ts))); } /* @@ -633,8 +633,8 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) * Repack a cell. */ static inline int -__rec_cell_repack(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, WT_CELL_UNPACK *vpack, - WT_TIME_WINDOW *tw) +__rec_cell_repack(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, + WT_CELL_UNPACK_KV *vpack, WT_TIME_WINDOW *tw) { WT_DECL_ITEM(tmpval); WT_DECL_RET; @@ -671,7 +671,7 @@ __wt_rec_row_leaf( static WT_UPDATE upd_tombstone = {.txnid = WT_TXN_NONE, .type = WT_UPDATE_TOMBSTONE}; WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack; + WT_CELL_UNPACK_KV *kpack, _kpack, *vpack, _vpack; WT_CURSOR_BTREE *cbt; WT_DECL_ITEM(tmpkey); WT_DECL_RET; @@ -740,7 +740,7 @@ __wt_rec_row_leaf( kpack = NULL; else { kpack = &_kpack; - __wt_cell_unpack(session, page, cell, kpack); + __wt_cell_unpack_kv(session, page->dsk, cell, kpack); } /* Unpack the on-page value cell. */ @@ -767,7 +767,7 @@ __wt_rec_row_leaf( * new updates for that key, skip writing that key. */ if (upd == NULL && (tw.stop_txn != WT_TXN_MAX || tw.stop_ts != WT_TS_MAX) && - __wt_txn_visible_all(session, tw.stop_txn, tw.stop_ts)) + __wt_txn_visible_all(session, tw.stop_txn, tw.durable_stop_ts)) upd = &upd_tombstone; /* Build value cell. 
*/ @@ -910,7 +910,7 @@ __wt_rec_row_leaf( goto build; kpack = &_kpack; - __wt_cell_unpack(session, page, cell, kpack); + __wt_cell_unpack_kv(session, page->dsk, cell, kpack); if (btree->huffman_key == NULL && kpack->type == WT_CELL_KEY && tmpkey->size >= kpack->prefix && tmpkey->size != 0) { /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_track.c b/src/third_party/wiredtiger/src/reconcile/rec_track.c index b1a3b93eee8..1fb6b072d0e 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_track.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_track.c @@ -31,13 +31,13 @@ __wt_ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page) static int __ovfl_discard_verbose(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, const char *tag) { - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_KV *unpack, _unpack; WT_DECL_ITEM(tmp); WT_RET(__wt_scr_alloc(session, 512, &tmp)); unpack = &_unpack; - __wt_cell_unpack(session, page, cell, unpack); + __wt_cell_unpack_kv(session, page->dsk, cell, unpack); __wt_verbose(session, WT_VERB_OVERFLOW, "discard: %s%s%p %s", tag == NULL ? "" : tag, tag == NULL ? "" : ": ", (void *)page, diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index 93991af5983..b428ab8a3e9 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -18,7 +18,7 @@ __rec_update_stable(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *upd) return (F_ISSET(r, WT_REC_VISIBLE_ALL) ? 
__wt_txn_upd_visible_all(session, upd) : __wt_txn_upd_visible_type(session, upd) == WT_VISIBLE_TRUE && - __wt_txn_visible(session, upd->txnid, upd->start_ts)); + __wt_txn_visible(session, upd->txnid, upd->durable_ts)); } /* @@ -54,17 +54,19 @@ __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, voi */ static int __rec_append_orig_value( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd, WT_CELL_UNPACK *unpack) + WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd, WT_CELL_UNPACK_KV *unpack) { WT_DECL_ITEM(tmp); WT_DECL_RET; WT_UPDATE *append, *oldest_upd, *tombstone; size_t size, total_size; + bool tombstone_globally_visible; WT_ASSERT(session, upd != NULL && unpack != NULL && unpack->type != WT_CELL_DEL); append = oldest_upd = tombstone = NULL; total_size = 0; + tombstone_globally_visible = false; /* Review the current update list, checking conditions that mean no work is needed. */ for (;; upd = upd->next) { @@ -110,20 +112,6 @@ __rec_append_orig_value( break; } - /* Done if the stop time pair of the onpage cell is globally visible. */ - if ((unpack->tw.stop_ts != WT_TS_MAX || unpack->tw.stop_txn != WT_TXN_MAX) && - __wt_txn_visible_all(session, unpack->tw.stop_txn, unpack->tw.stop_ts)) - return (0); - - /* We need the original on-page value for some reader: get a copy. */ - WT_ERR(__wt_scr_alloc(session, 0, &tmp)); - WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); - WT_ERR(__wt_upd_alloc(session, tmp, WT_UPDATE_STANDARD, &append, &size)); - total_size += size; - append->txnid = unpack->tw.start_txn; - append->start_ts = unpack->tw.start_ts; - append->durable_ts = unpack->tw.durable_start_ts; - /* * Additionally, we need to append a tombstone before the onpage value we're about to append to * the list, if the onpage value has a valid stop pair. Imagine a case where we insert and @@ -131,17 +119,30 @@ __rec_append_orig_value( * the tombstone to tell us there is no value between 10 and 20. 
*/ if (unpack->tw.stop_ts != WT_TS_MAX || unpack->tw.stop_txn != WT_TXN_MAX) { + tombstone_globally_visible = + __wt_txn_visible_all(session, unpack->tw.stop_txn, unpack->tw.durable_stop_ts); + /* No need to append the tombstone if it is already in the update chain. */ if (oldest_upd->type != WT_UPDATE_TOMBSTONE) { + /* + * We still need to append the globally visible tombstone if its timestamp is WT_TS_NONE + * as we may need it to clear the history store content of the key. We don't append a + * timestamped globally visible tombstone because even if its timestamp is smaller than + * the entries in the history store, we can't change the history store entries. This is + * not correct but we hope we can get away with it. + * + * FIXME-WT-6171: remove this once we get rid of out of order timestamps and mixed mode + * transactions. + */ + if (unpack->tw.durable_stop_ts != WT_TS_NONE && tombstone_globally_visible) + return (0); + WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, &size)); total_size += size; tombstone->txnid = unpack->tw.stop_txn; tombstone->start_ts = unpack->tw.stop_ts; tombstone->durable_ts = unpack->tw.durable_stop_ts; - - tombstone->next = append; - append = tombstone; - } else + } else { /* * Once the prepared update is resolved, the in-memory update and on-disk written copy * doesn't have same timestamp due to replacing of prepare timestamp with commit and @@ -150,6 +151,25 @@ __rec_append_orig_value( WT_ASSERT(session, F_ISSET(unpack, WT_CELL_UNPACK_PREPARE) || (unpack->tw.stop_ts == oldest_upd->start_ts && unpack->tw.stop_txn == oldest_upd->txnid)); + if (tombstone_globally_visible) + return (0); + } + } + + /* We need the original on-page value for some reader: get a copy. 
*/ + if (!tombstone_globally_visible) { + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); + WT_ERR(__wt_upd_alloc(session, tmp, WT_UPDATE_STANDARD, &append, &size)); + total_size += size; + append->txnid = unpack->tw.start_txn; + append->start_ts = unpack->tw.start_ts; + append->durable_ts = unpack->tw.durable_start_ts; + } + + if (tombstone != NULL) { + tombstone->next = append; + append = tombstone; } /* Append the new entry into the update list. */ @@ -192,8 +212,9 @@ __rec_need_save_upd( if (F_ISSET(r, WT_REC_CHECKPOINT) && upd_select->upd == NULL) return (false); - return (!__wt_txn_visible_all(session, upd_select->tw.stop_txn, upd_select->tw.stop_ts) && - !__wt_txn_visible_all(session, upd_select->tw.start_txn, upd_select->tw.start_ts)); + return ( + !__wt_txn_visible_all(session, upd_select->tw.stop_txn, upd_select->tw.durable_stop_ts) && + !__wt_txn_visible_all(session, upd_select->tw.start_txn, upd_select->tw.durable_start_ts)); } /* @@ -202,7 +223,7 @@ __rec_need_save_upd( */ int __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, - WT_CELL_UNPACK *vpack, WT_UPDATE_SELECT *upd_select) + WT_CELL_UNPACK_KV *vpack, WT_UPDATE_SELECT *upd_select) { WT_DECL_ITEM(tmp); WT_DECL_RET; @@ -343,9 +364,6 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v return (__wt_set_return(session, EBUSY)); } - if (upd != NULL && upd->start_ts > r->max_ondisk_ts) - r->max_ondisk_ts = upd->start_ts; - /* * The start timestamp is determined by the commit timestamp when the key is first inserted (or * last updated). 
The end timestamp is set when a key/value pair becomes invalid, either because diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 2a4358c585f..4dd14173c5c 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -519,7 +519,7 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO /* Track the page's min/maximum transaction */ r->max_txn = WT_TXN_NONE; - r->max_ondisk_ts = r->max_ts = WT_TS_NONE; + r->max_ts = WT_TS_NONE; r->min_skipped_ts = WT_TS_MAX; /* Track if updates were used and/or uncommitted. */ diff --git a/src/third_party/wiredtiger/src/support/modify.c b/src/third_party/wiredtiger/src/support/modify.c index 010ef9a80d1..25de3b700b7 100644 --- a/src/third_party/wiredtiger/src/support/modify.c +++ b/src/third_party/wiredtiger/src/support/modify.c @@ -535,7 +535,7 @@ __wt_modify_reconstruct_from_upd_list( cursor = &cbt->iface; /* While we have a pointer to our original modify, grab this information. 
*/ - upd_value->start_ts = upd->start_ts; + upd_value->durable_ts = upd->durable_ts; upd_value->txnid = upd->txnid; upd_value->prepare_state = upd->prepare_state; diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index e2489bc8563..00cd443398b 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -144,7 +144,7 @@ static int __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, wt_timestamp_t rollback_timestamp, bool replace) { - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_KV *unpack, _unpack; WT_CURSOR *hs_cursor; WT_CURSOR_BTREE *cbt; WT_DECL_ITEM(hs_key); @@ -351,7 +351,7 @@ static int __rollback_abort_row_ondisk_kv( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, wt_timestamp_t rollback_timestamp) { - WT_CELL_UNPACK *vpack, _vpack; + WT_CELL_UNPACK_KV *vpack, _vpack; WT_DECL_RET; WT_ITEM buf; WT_UPDATE *upd; @@ -635,7 +635,7 @@ __rollback_page_needs_abort( WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) { WT_ADDR *addr; - WT_CELL_UNPACK vpack; + WT_CELL_UNPACK_ADDR vpack; WT_MULTI *multi; WT_PAGE_MODIFY *mod; wt_timestamp_t durable_ts; @@ -677,7 +677,7 @@ __rollback_page_needs_abort( } else if (!__wt_off_page(ref->home, addr)) { tag = "on page cell"; /* Check if the page is obsolete using the page disk address. 
*/ - __wt_cell_unpack(session, ref->home, (WT_CELL *)addr, &vpack); + __wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &vpack); durable_ts = WT_MAX(vpack.ta.newest_start_durable_ts, vpack.ta.newest_stop_durable_ts); prepared = F_ISSET(&vpack, WT_CELL_UNPACK_PREPARE); result = (durable_ts > rollback_timestamp) || prepared; @@ -704,7 +704,7 @@ static void __rollback_verify_ondisk_page( WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp) { - WT_CELL_UNPACK *vpack, _vpack; + WT_CELL_UNPACK_KV *vpack, _vpack; WT_ROW *rip; uint32_t i; diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index 1e306fe4b2e..7d81b1c2b80 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -2217,10 +2217,10 @@ buildvariants: - name: syscall-linux - name: make-check-asan-test - name: configure-combinations - # - name: checkpoint-filetypes-test + - name: checkpoint-filetypes-test # - name: coverage-report - name: unit-test-long - # - name: spinlock-gcc-test + - name: spinlock-gcc-test - name: spinlock-pthread-adaptive-test - name: compile-wtperf - name: wtperf-test @@ -2321,9 +2321,9 @@ buildvariants: - name: syscall-linux - name: compile-asan - name: make-check-asan-test - # - name: checkpoint-filetypes-test + - name: checkpoint-filetypes-test - name: unit-test-long - # - name: spinlock-gcc-test + - name: spinlock-gcc-test - name: spinlock-pthread-adaptive-test - name: compile-wtperf - name: wtperf-test diff --git a/src/third_party/wiredtiger/test/format/backup.c b/src/third_party/wiredtiger/test/format/backup.c index 5ad1cfe65dc..9e959dcd823 100644 --- a/src/third_party/wiredtiger/test/format/backup.c +++ b/src/third_party/wiredtiger/test/format/backup.c @@ -217,54 +217,57 @@ static void copy_blocks(WT_SESSION *session, WT_CURSOR *bkup_c, const char *name) { WT_CURSOR *incr_cur; + WT_DECL_RET; size_t len, tmp_sz; ssize_t rdsize; - uint64_t 
offset, type; - u_int size; - int ret, rfd, wfd1, wfd2; - char buf[512], config[512], *first, *second, *tmp; + uint64_t offset, size, type; + int rfd, wfd1, wfd2; + char config[512], *tmp; bool first_pass; - /* - * We need to prepend the home directory name here because we are not using the WiredTiger - * internal functions that would prepend it for us. - */ - len = strlen(g.home) + strlen("BACKUP") + strlen(name) + 10; - first = dmalloc(len); - - /* - * Save another copy of the original file to make debugging recovery errors easier. - */ - len = strlen(g.home) + strlen("BACKUP.copy") + strlen(name) + 10; - second = dmalloc(len); - testutil_check(__wt_snprintf(config, sizeof(config), "incremental=(file=%s)", name)); - - /* Open the duplicate incremental backup cursor with the file name given. */ tmp_sz = 0; tmp = NULL; first_pass = true; rfd = wfd1 = wfd2 = -1; + + /* Open the duplicate incremental backup cursor with the file name given. */ + testutil_check(__wt_snprintf(config, sizeof(config), "incremental=(file=%s)", name)); testutil_check(session->open_cursor(session, NULL, bkup_c, config, &incr_cur)); while ((ret = incr_cur->next(incr_cur)) == 0) { - testutil_check(incr_cur->get_key(incr_cur, &offset, (uint64_t *)&size, &type)); + testutil_check(incr_cur->get_key(incr_cur, &offset, &size, &type)); if (type == WT_BACKUP_RANGE) { /* * Since we are using system calls below instead of a WiredTiger function, we have to * prepend the home directory to the file names ourselves. 
*/ - testutil_check(__wt_snprintf(first, len, "%s/BACKUP/%s", g.home, name)); - testutil_check(__wt_snprintf(second, len, "%s/BACKUP.copy/%s", g.home, name)); + if (first_pass) { + len = strlen(g.home) + strlen(name) + 10; + tmp = dmalloc(len); + testutil_check(__wt_snprintf(tmp, len, "%s/%s", g.home, name)); + error_sys_check(rfd = open(tmp, O_RDONLY, 0)); + free(tmp); + tmp = NULL; + + len = strlen(g.home) + strlen("BACKUP") + strlen(name) + 10; + tmp = dmalloc(len); + testutil_check(__wt_snprintf(tmp, len, "%s/BACKUP/%s", g.home, name)); + error_sys_check(wfd1 = open(tmp, O_WRONLY | O_CREAT, 0)); + free(tmp); + tmp = NULL; + + len = strlen(g.home) + strlen("BACKUP.copy") + strlen(name) + 10; + tmp = dmalloc(len); + testutil_check(__wt_snprintf(tmp, len, "%s/BACKUP.copy/%s", g.home, name)); + error_sys_check(wfd2 = open(tmp, O_WRONLY | O_CREAT, 0)); + free(tmp); + tmp = NULL; + + first_pass = false; + } if (tmp_sz < size) { tmp = drealloc(tmp, size); tmp_sz = size; } - if (first_pass) { - testutil_check(__wt_snprintf(buf, sizeof(buf), "%s/%s", g.home, name)); - error_sys_check(rfd = open(buf, O_RDONLY, 0)); - error_sys_check(wfd1 = open(first, O_WRONLY | O_CREAT, 0)); - error_sys_check(wfd2 = open(second, O_WRONLY | O_CREAT, 0)); - first_pass = false; - } error_sys_check(lseek(rfd, (wt_off_t)offset, SEEK_SET)); error_sys_check(rdsize = read(rfd, tmp, size)); error_sys_check(lseek(wfd1, (wt_off_t)offset, SEEK_SET)); @@ -273,17 +276,27 @@ copy_blocks(WT_SESSION *session, WT_CURSOR *bkup_c, const char *name) error_sys_check(write(wfd1, tmp, (size_t)rdsize)); error_sys_check(write(wfd2, tmp, (size_t)rdsize)); } else { + testutil_assert(type == WT_BACKUP_FILE); + testutil_assert(first_pass == true); + testutil_assert(rfd == -1); + /* * These operations are using a WiredTiger function so it will prepend the home * directory to the name for us. 
*/ - testutil_check(__wt_snprintf(first, len, "BACKUP/%s", name)); - testutil_check(__wt_snprintf(second, len, "BACKUP.copy/%s", name)); - testutil_assert(type == WT_BACKUP_FILE); - testutil_assert(rfd == -1); - testutil_assert(first_pass == true); - testutil_check(__wt_copy_and_sync(session, name, first)); - testutil_check(__wt_copy_and_sync(session, first, second)); + len = strlen("BACKUP") + strlen(name) + 10; + tmp = dmalloc(len); + testutil_check(__wt_snprintf(tmp, len, "BACKUP/%s", name)); + testutil_check(__wt_copy_and_sync(session, name, tmp)); + free(tmp); + tmp = NULL; + + len = strlen("BACKUP.copy") + strlen(name) + 10; + tmp = dmalloc(len); + testutil_check(__wt_snprintf(tmp, len, "BACKUP.copy/%s", name)); + testutil_check(__wt_copy_and_sync(session, name, tmp)); + free(tmp); + tmp = NULL; } } testutil_check(incr_cur->close(incr_cur)); @@ -292,8 +305,6 @@ copy_blocks(WT_SESSION *session, WT_CURSOR *bkup_c, const char *name) error_sys_check(close(wfd1)); error_sys_check(close(wfd2)); } - free(first); - free(second); free(tmp); } /* diff --git a/src/third_party/wiredtiger/test/format/checkpoint.c b/src/third_party/wiredtiger/test/format/checkpoint.c index 36e70ae3125..9131e920231 100644 --- a/src/third_party/wiredtiger/test/format/checkpoint.c +++ b/src/third_party/wiredtiger/test/format/checkpoint.c @@ -37,6 +37,17 @@ wts_checkpoints(void) { char config[1024]; + /* + * Configuring WiredTiger library checkpoints is done separately, rather than as part of the + * original database open because format tests small caches and you can get into cache stuck + * trouble during the initial load (where bulk load isn't configured). There's a single thread + * doing lots of inserts and creating huge leaf pages. Those pages can't be evicted if there's a + * checkpoint running in the tree, and the cache can get stuck. That workload is unlikely enough + * we're not going to fix it in the library, so configure it away by delaying checkpoint start. 
+ */ + if (g.c_checkpoint_flag != CHECKPOINT_WIREDTIGER) + return; + testutil_check( __wt_snprintf(config, sizeof(config), ",checkpoint=(wait=%" PRIu32 ",log_size=%" PRIu32 ")", g.c_checkpoint_wait, MEGABYTE(g.c_checkpoint_log_size))); diff --git a/src/third_party/wiredtiger/test/format/format.sh b/src/third_party/wiredtiger/test/format/format.sh index 19f5df8ede4..b02a58abfb0 100755 --- a/src/third_party/wiredtiger/test/format/format.sh +++ b/src/third_party/wiredtiger/test/format/format.sh @@ -214,14 +214,11 @@ skip_known_errors() log=$1 - # Define each array with multi-signature matching for a single known error - # and append it to the skip_error_list - err_1=("heap-buffer-overflow" "__split_parent") # Delete this error line post WT-5518 fix - err_2=("heap-use-after-free" "__wt_btcur_next_random") # Delete this error line post WT-5552 fix - - # skip_error_list is the list of errors to skip, and each error could - # have multiple signatures to be able to reach a finer match - skip_error_list=( err_1[@] err_2[@] ) + # skip_error_list is a list of errors to skip. Each array entry can have multiple signatures + # for finer-grained matching. For example: + # + # err_1=("heap-buffer-overflow" "__split_parent") + skip_error_list=( err_1[@] ) # Loop through the skip list and search in the log file. err_count=${#skip_error_list[@]} @@ -249,12 +246,11 @@ report_failure() log="$dir.log" # DO NOT CURRENTLY SKIP ANY ERRORS. - skip_ret=0 #skip_known_errors $log #skip_ret=$? echo "$name: failure status reported" > $dir/$status - [[ $skip_ret -ne 0 ]] && failure=$(($failure + 1)) + failure=$(($failure + 1)) # Forcibly quit if first-failure configured. 
[[ $first_failure -ne 0 ]] && force_quit=1 diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index ef3a79e7b53..b38c7b721bc 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -204,16 +204,6 @@ operations(u_int ops_seconds, bool lastrun) if (g.c_txn_timestamps) testutil_check(__wt_thread_create(NULL, &timestamp_tid, timestamp, tinfo_list)); - /* - * Configuring WiredTiger library checkpoints is done separately, rather than as part of the - * original database open because format tests small caches and you can get into cache stuck - * trouble during the initial load (where bulk load isn't configured). There's a single thread - * doing lots of inserts and creating huge leaf pages. Those pages can't be evicted if there's a - * checkpoint running in the tree, and the cache can get stuck. That workload is unlikely enough - * we're not going to fix it in the library, so configure it away by delaying checkpoint start. - */ - if (g.c_checkpoint_flag == CHECKPOINT_WIREDTIGER) - wts_checkpoints(); if (g.c_checkpoint_flag == CHECKPOINT_ON) testutil_check(__wt_thread_create(NULL, &checkpoint_tid, checkpoint, NULL)); diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index b596124087b..5e68d4c524b 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -287,6 +287,8 @@ main(int argc, char *argv[]) TIMED_MAJOR_OP(wts_read_scan()); + wts_checkpoints(); + /* Operations. */ for (reps = 1; reps <= FORMAT_OPERATION_REPS; ++reps) operations(ops_seconds, reps == FORMAT_OPERATION_REPS); |