diff options
author | Luke Chen <luke.chen@mongodb.com> | 2020-05-12 16:27:34 +1000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-05-12 06:42:41 +0000 |
commit | cf68adaa79f789fd0da77ea0c4eb554af6beab08 (patch) | |
tree | 4e11b2b8eed62d4e0e6b0f7e670bb3be655458fa /src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c | |
parent | 6eb64770517b467ac903d39560c95fb470a77ad0 (diff) | |
download | mongo-cf68adaa79f789fd0da77ea0c4eb554af6beab08.tar.gz |
Import wiredtiger: bdff12c2331ab0478a22309a6d35519d2e2ca441 from branch mongodb-4.4
ref: 404b4a70af..bdff12c233
for: 4.4.0-rc6
WT-5864 Append globally visible tombstone with WT_TS_NONE to the update chain
WT-6063 Re-enable checkpoint-filetypes-test in Evergreen
WT-6065 Re-enable spinlock-gcc-test in Evergreen
WT-6092 Use durable timestamp for global visibility check instead of commit timestamp
WT-6111 Rework cell structures and unpacking
WT-6157 Disable table logging in workgen stress test while running prepare transactions
WT-6159 Tag verbose messages to make them easier to distinguish
WT-6160 Fix format failure caused by stack overwrite
WT-6161 Fix format hang when WiredTiger internal checkpoints are configured
WT-6162 Fix incorrect counting of failures in format.sh
WT-6166 KEY/VALUE short cells have to handle copy cells
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c')
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c | 654 |
1 files changed, 416 insertions, 238 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c index a1e96d41dc9..0b29c3ee526 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -16,7 +16,9 @@ static int __verify_dsk_col_fix(WT_SESSION_IMPL *, const char *, const WT_PAGE_H static int __verify_dsk_col_int(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_ADDR *); static int __verify_dsk_col_var(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_ADDR *); static int __verify_dsk_memsize(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_CELL *); -static int __verify_dsk_row(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_ADDR *); +static int __verify_dsk_row_int(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_ADDR *); +static int __verify_dsk_row_leaf( + WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_ADDR *); #define WT_ERR_VRFY(session, ...) \ do { \ @@ -44,8 +46,8 @@ static int __verify_dsk_row(WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADE * WT_CELL_FOREACH macro, created because the loop can't simply unpack cells, * verify has to do additional work to ensure that unpack is safe. 
*/ -#define WT_CELL_FOREACH_VRFY(btree, dsk, cell, unpack, i) \ - for ((cell) = WT_PAGE_HEADER_BYTE(btree, dsk), (i) = (dsk)->u.entries; (i) > 0; \ +#define WT_CELL_FOREACH_VRFY(session, dsk, cell, unpack, i) \ + for ((cell) = WT_PAGE_HEADER_BYTE(S2BT(session), dsk), (i) = (dsk)->u.entries; (i) > 0; \ (cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->__len), --(i)) /* @@ -90,8 +92,7 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_H if (dsk->recno == WT_RECNO_OOB) break; WT_RET_VRFY(session, - "%s page at %s has a record number, which is illegal for " - "this page type", + "%s page at %s has a record number, which is illegal for this page type", __wt_page_type_string(dsk->type), tag); } @@ -167,8 +168,9 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_H case WT_PAGE_COL_VAR: return (__verify_dsk_col_var(session, tag, dsk, addr)); case WT_PAGE_ROW_INT: + return (__verify_dsk_row_int(session, tag, dsk, addr)); case WT_PAGE_ROW_LEAF: - return (__verify_dsk_row(session, tag, dsk, addr)); + return (__verify_dsk_row_leaf(session, tag, dsk, addr)); case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_OVFL: return (__verify_dsk_chunk(session, tag, dsk, dsk->u.datalen)); @@ -227,8 +229,8 @@ __verify_dsk_ts_addr_cmp(WT_SESSION_IMPL *session, uint32_t cell_num, const char break; } WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s failed verification with %s " - "timestamp of %s, %s the parent's %s timestamp of %s", + " on page at %s failed verification with %s timestamp of %s, %s " + "the parent's %s timestamp of %s", cell_num, tag, ts1_name, ts1_bp, gt ? 
"less than" : "greater than", ts2_name, ts2_bp); } @@ -252,24 +254,25 @@ __verify_dsk_txn_addr_cmp(WT_SESSION_IMPL *session, uint32_t cell_num, const cha if (dsk->write_gen <= S2C(session)->base_write_gen) return (0); - WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s failed verification with %s " - "transaction of %" PRIu64 - ", %s the parent's %s transaction of " - "%" PRIu64, + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 " on page at %s failed verification with %s transaction of %" PRIu64 + ", %s the parent's %s transaction of %" PRIu64, cell_num, tag, txn1_name, txn1, gt ? "less than" : "greater than", txn2_name, txn2); } /* - * __verify_dsk_validity -- - * Verify a cell's validity window. + * __verify_dsk_addr_validity -- + * Verify an address cell's validity window. */ static int -__verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t cell_num, +__verify_dsk_addr_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK_ADDR *unpack, uint32_t cell_num, WT_ADDR *addr, const char *tag, const WT_PAGE_HEADER *dsk) { + WT_TIME_AGGREGATE *ta; char time_string[WT_TIME_STRING_SIZE]; + ta = &unpack->ta; + /* * Check timestamp and transaction order, and optionally against parent values. Timestamps and * transactions in the parent address aren't necessarily an exact match, but should be within @@ -279,169 +282,192 @@ __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t * cell-unpacking code hides it by always returning durable values if they don't appear on the * page. 
*/ - switch (unpack->type) { - case WT_CELL_ADDR_DEL: - case WT_CELL_ADDR_INT: - case WT_CELL_ADDR_LEAF: - case WT_CELL_ADDR_LEAF_NO: - if (unpack->ta.oldest_start_ts != WT_TS_NONE && unpack->ta.newest_stop_ts == WT_TS_NONE) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a newest stop " - "timestamp of 0; time aggregate %s", - cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); - if (unpack->ta.oldest_start_ts > unpack->ta.newest_stop_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has an oldest " - "start timestamp newer than its newest stop " - "timestamp; time aggregate %s", - cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); - if (unpack->ta.oldest_start_txn > unpack->ta.newest_stop_txn) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has an oldest " - "start transaction newer than its " - "newest stop transaction; time aggregate %s", - cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); - if (unpack->ta.oldest_start_ts > unpack->ta.newest_start_durable_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has an oldest " - "start timestamp newer than its newest start durable " - "timestamp; time aggregate %s", - cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); - if (unpack->ta.newest_stop_ts != WT_TS_MAX && - unpack->ta.newest_stop_ts > unpack->ta.newest_stop_durable_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a newest " - "stop timestamp newer than its newest stop durable " - "timestamp; time aggregate %s", - cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); - - if (addr == NULL) - break; + if (ta->oldest_start_ts != WT_TS_NONE && ta->newest_stop_ts == WT_TS_NONE) + WT_RET_VRFY(session, + "cell %" PRIu32 " on page at %s has a newest stop timestamp of 0; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(ta, time_string)); + if (ta->oldest_start_ts 
> ta->newest_stop_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has an oldest start timestamp newer than its newest " + "stop timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(ta, time_string)); + if (ta->oldest_start_txn > ta->newest_stop_txn) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has an oldest start transaction newer than its newest " + "stop transaction; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(ta, time_string)); + if (ta->oldest_start_ts > ta->newest_start_durable_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has an oldest start timestamp newer than its newest " + "start durable timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(ta, time_string)); + if (ta->newest_stop_ts != WT_TS_MAX && ta->newest_stop_ts > ta->newest_stop_durable_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a newest stop timestamp newer than its newest " + "stop durable timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(ta, time_string)); + + if (addr == NULL) + return (0); - if (addr->ta.newest_start_durable_ts != WT_TS_NONE) - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", - unpack->ta.newest_start_durable_ts, "start durable", addr->ta.newest_start_durable_ts, - false, tag)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "oldest start", - unpack->ta.oldest_start_ts, "oldest start", addr->ta.oldest_start_ts, true, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "oldest start", - unpack->ta.oldest_start_txn, "oldest start", addr->ta.oldest_start_txn, true, tag, dsk)); - - if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", - unpack->ta.newest_stop_durable_ts, "stop durable", addr->ta.newest_stop_durable_ts, - false, tag)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, 
"newest stop", - unpack->ta.newest_stop_ts, "newest stop", addr->ta.newest_stop_ts, false, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "newest stop", - unpack->ta.newest_stop_txn, "newest stop", addr->ta.newest_stop_txn, false, tag, dsk)); - break; - case WT_CELL_DEL: - case WT_CELL_VALUE: - case WT_CELL_VALUE_COPY: - case WT_CELL_VALUE_OVFL: - case WT_CELL_VALUE_OVFL_RM: - case WT_CELL_VALUE_SHORT: - if (unpack->tw.start_ts != WT_TS_NONE && unpack->tw.stop_ts == WT_TS_NONE) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a stop " - "timestamp of 0; time window %s", - cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); - if (unpack->tw.start_ts > unpack->tw.stop_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a start " - "timestamp newer than its stop timestamp; time window %s", - cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); - if (unpack->tw.start_txn > unpack->tw.stop_txn) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a start " - "transaction newer than its stop " - "transaction; time window %s", - cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); - if (unpack->tw.start_ts > unpack->tw.durable_start_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a start " - "timestamp newer than its durable start timestamp; time window %s", - cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); - if (unpack->tw.stop_ts != WT_TS_MAX && unpack->tw.stop_ts > unpack->tw.durable_stop_ts) - WT_RET_VRFY(session, "cell %" PRIu32 - " on page at %s has a stop " - "timestamp newer than its durable stop timestamp; time window %s", - cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); - - if (addr == NULL) - break; + if (addr->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", + ta->newest_start_durable_ts, "start durable", 
addr->ta.newest_start_durable_ts, false, + tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "oldest start", ta->oldest_start_ts, + "oldest start", addr->ta.oldest_start_ts, true, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "oldest start", ta->oldest_start_txn, + "oldest start", addr->ta.oldest_start_txn, true, tag, dsk)); + + if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", + ta->newest_stop_durable_ts, "stop durable", addr->ta.newest_stop_durable_ts, false, tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "newest stop", ta->newest_stop_ts, + "newest stop", addr->ta.newest_stop_ts, false, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "newest stop", ta->newest_stop_txn, + "newest stop", addr->ta.newest_stop_txn, false, tag, dsk)); - if (addr->ta.newest_start_durable_ts != WT_TS_NONE) - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", - unpack->tw.durable_start_ts, "newest start durable", addr->ta.newest_start_durable_ts, - false, tag)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start", unpack->tw.start_ts, - "oldest start", addr->ta.oldest_start_ts, true, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "start", unpack->tw.start_txn, - "oldest start", addr->ta.oldest_start_txn, true, tag, dsk)); - if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", - unpack->tw.durable_stop_ts, "newest stop durable", addr->ta.newest_stop_durable_ts, - false, tag)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop", unpack->tw.stop_ts, - "newest stop", addr->ta.newest_stop_ts, false, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "stop", unpack->tw.stop_txn, - "newest stop", addr->ta.newest_stop_txn, false, tag, dsk)); - break; - } + return (0); +} + +/* + * __verify_dsk_value_validity 
-- + * Verify a value cell's validity window. + */ +static int +__verify_dsk_value_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK_KV *unpack, uint32_t cell_num, + WT_ADDR *addr, const char *tag, const WT_PAGE_HEADER *dsk) +{ + WT_TIME_WINDOW *tw; + char time_string[WT_TIME_STRING_SIZE]; + + tw = &unpack->tw; + + /* + * Check timestamp and transaction order, and optionally against parent values. Timestamps and + * transactions in the parent address aren't necessarily an exact match, but should be within + * the boundaries of the parent's information. + * + * There's no checking if validity information should appear on a page because the + * cell-unpacking code hides it by always returning durable values if they don't appear on the + * page. + */ + if (unpack->tw.start_ts != WT_TS_NONE && unpack->tw.stop_ts == WT_TS_NONE) + WT_RET_VRFY(session, + "cell %" PRIu32 " on page at %s has a stop timestamp of 0; time window %s", cell_num - 1, + tag, __wt_time_window_to_string(tw, time_string)); + if (tw->start_ts > tw->stop_ts) + WT_RET_VRFY(session, + "cell %" PRIu32 + " on page at %s has a start timestamp newer than its stop timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(tw, time_string)); + if (tw->start_txn > tw->stop_txn) + WT_RET_VRFY(session, + "cell %" PRIu32 + " on page at %s has a start transaction newer than its stop transaction; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(tw, time_string)); + if (tw->start_ts > tw->durable_start_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a start timestamp newer than its durable start " + "timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(tw, time_string)); + if (tw->stop_ts != WT_TS_MAX && tw->stop_ts > tw->durable_stop_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a stop timestamp newer than its durable stop " + "timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(tw, time_string)); + + 
if (addr == NULL) + return (0); + + if (addr->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET( + __verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", tw->durable_start_ts, + "newest start durable", addr->ta.newest_start_durable_ts, false, tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start", tw->start_ts, "oldest start", + addr->ta.oldest_start_ts, true, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "start", tw->start_txn, "oldest start", + addr->ta.oldest_start_txn, true, tag, dsk)); + if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", tw->durable_stop_ts, + "newest stop durable", addr->ta.newest_stop_durable_ts, false, tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop", tw->stop_ts, "newest stop", + addr->ta.newest_stop_ts, false, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "stop", tw->stop_txn, "newest stop", + addr->ta.newest_stop_txn, false, tag, dsk)); return (0); } /* - * __verify_dsk_row -- - * Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it. + * __verify_row_key_order_check -- + * Check key ordering for row-store pages. 
*/ static int -__verify_dsk_row( +__verify_row_key_order_check(WT_SESSION_IMPL *session, WT_ITEM *last, uint32_t last_cell_num, + WT_ITEM *current, uint32_t cell_num, const char *tag) +{ + WT_DECL_ITEM(tmp1); + WT_DECL_ITEM(tmp2); + WT_DECL_RET; + int cmp; + + WT_RET(__wt_compare(session, S2BT(session)->collator, last, current, &cmp)); + if (cmp < 0) + return (0); + + WT_ERR(__wt_scr_alloc(session, 0, &tmp1)); + WT_ERR(__wt_scr_alloc(session, 0, &tmp2)); + + ret = WT_ERROR; + WT_ERR_VRFY(session, + "the %" PRIu32 " and %" PRIu32 " keys on page at %s are incorrectly sorted: %s, %s", + last_cell_num, cell_num, tag, __wt_buf_set_printable(session, last->data, last->size, tmp1), + __wt_buf_set_printable(session, current->data, current->size, tmp2)); + +err: + __wt_scr_free(session, &tmp1); + __wt_scr_free(session, &tmp2); + return (ret); +} + +/* + * __verify_dsk_row_int -- + * Walk a WT_PAGE_ROW_INT disk page and verify it. + */ +static int +__verify_dsk_row_int( WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, WT_ADDR *addr) { WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_ADDR *unpack, _unpack; WT_DECL_ITEM(current); - WT_DECL_ITEM(last_ovfl); - WT_DECL_ITEM(last_pfx); - WT_DECL_ITEM(tmp1); - WT_DECL_ITEM(tmp2); + WT_DECL_ITEM(last); + WT_DECL_ITEM(tmp); WT_DECL_RET; - WT_ITEM *last; enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; - void *huffman; - size_t prefix; uint32_t cell_num, cell_type, i, key_cnt; uint8_t *end; - int cmp; btree = S2BT(session); bm = btree->bm; unpack = &_unpack; - huffman = dsk->type == WT_PAGE_ROW_INT ? 
NULL : btree->huffman_key; WT_ERR(__wt_scr_alloc(session, 0, ¤t)); - WT_ERR(__wt_scr_alloc(session, 0, &last_pfx)); - WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl)); - WT_ERR(__wt_scr_alloc(session, 0, &tmp1)); - WT_ERR(__wt_scr_alloc(session, 0, &tmp2)); - last = last_ovfl; + WT_ERR(__wt_scr_alloc(session, 0, &last)); end = (uint8_t *)dsk + dsk->mem_size; last_cell_type = FIRST; cell_num = 0; key_cnt = 0; - WT_CELL_FOREACH_VRFY (btree, dsk, cell, unpack, i) { + WT_CELL_FOREACH_VRFY (session, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. */ - ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, end); + ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, NULL, end); if (ret != 0) { (void)__err_cell_corrupt(session, ret, cell_num, tag); goto err; @@ -452,15 +478,20 @@ __verify_dsk_row( WT_ERR(__err_cell_type(session, cell_num, tag, unpack->type, dsk->type)); cell_type = unpack->type; + /* Internal row-store cells should not have prefix compression or recno/rle fields. */ + if (unpack->prefix != 0) + WT_ERR_VRFY( + session, "the %" PRIu32 " cell on page at %s has a non-zero prefix", cell_num, tag); + if (unpack->v != 0) + WT_ERR_VRFY(session, + "the %" PRIu32 " cell on page at %s has a non-zero rle/recno field", cell_num, tag); + /* - * Check ordering relationships between the WT_CELL entries. - * For row-store internal pages, check for: - * two values in a row, - * two keys in a row, - * a value as the first cell on a page. - * For row-store leaf pages, check for: - * two values in a row, - * a value as the first cell on a page. + * Check ordering relationships between the WT_CELL entries. For row-store internal pages, + * check for: + * - two values in a row, + * - two keys in a row, + * - a value as the first cell on a page. 
*/ switch (cell_type) { case WT_CELL_KEY: @@ -471,12 +502,9 @@ __verify_dsk_row( case WAS_VALUE: break; case WAS_KEY: - if (dsk->type == WT_PAGE_ROW_LEAF) - break; - WT_ERR_VRFY(session, "cell %" PRIu32 - " on page at %s is the " - "first of two adjacent keys", - cell_num - 1, tag); + WT_ERR_VRFY(session, + "cell %" PRIu32 " on page at %s is the first of two adjacent keys", cell_num - 1, + tag); } last_cell_type = WAS_KEY; break; @@ -484,17 +512,14 @@ __verify_dsk_row( case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: - case WT_CELL_VALUE: - case WT_CELL_VALUE_OVFL: switch (last_cell_type) { case FIRST: WT_ERR_VRFY(session, "page at %s begins with a value", tag); case WAS_KEY: break; case WAS_VALUE: - WT_ERR_VRFY(session, "cell %" PRIu32 - " on page at %s is the " - "first of two adjacent values", + WT_ERR_VRFY(session, + "cell %" PRIu32 " on page at %s is the first of two adjacent values", cell_num - 1, tag); } last_cell_type = WAS_VALUE; @@ -502,7 +527,14 @@ __verify_dsk_row( } /* Check the validity window. */ - WT_ERR(__verify_dsk_validity(session, unpack, cell_num, addr, tag, dsk)); + switch (cell_type) { + case WT_CELL_ADDR_DEL: + case WT_CELL_ADDR_INT: + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + WT_ERR(__verify_dsk_addr_validity(session, unpack, cell_num, addr, tag, dsk)); + break; + } /* Check if any referenced item has an invalid address. */ switch (cell_type) { @@ -511,6 +543,167 @@ __verify_dsk_row( case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: case WT_CELL_KEY_OVFL: + if ((ret = bm->addr_invalid(bm, session, unpack->data, unpack->size)) == EINVAL) + (void)__err_cell_corrupt_or_eof(session, ret, cell_num, tag); + WT_ERR(ret); + break; + } + + /* + * Remaining checks are for key order. If this cell isn't a key, we're done, move to the + * next cell. If this cell is an overflow item, instantiate the key and compare it with the + * last key. 
+ */ + switch (cell_type) { + case WT_CELL_KEY: + /* Get the cell's data/length and make sure we have enough buffer space. */ + WT_ERR(__wt_buf_init(session, current, unpack->size)); + + /* Copy the data into place. */ + memcpy((uint8_t *)current->mem, unpack->data, unpack->size); + current->size = unpack->size; + break; + case WT_CELL_KEY_OVFL: + WT_ERR(__wt_dsk_cell_data_ref(session, dsk->type, unpack, current)); + break; + default: + /* Not a key -- continue with the next cell. */ + continue; + } + + /* + * Compare the current key against the last key. + * + * Be careful about the 0th key on internal pages: we only store the first byte and custom + * collators may not be able to handle truncated keys. + */ + if (cell_num > 3) + WT_ERR( + __verify_row_key_order_check(session, last, cell_num - 2, current, cell_num, tag)); + + /* Swap the buffers. */ + tmp = last; + last = current; + current = tmp; + } + WT_ERR(__verify_dsk_memsize(session, tag, dsk, cell)); + + /* + * On row-store internal pages, the key count should be equal to half the number of physical + * entries. + */ + if (key_cnt * 2 != dsk->u.entries) + WT_ERR_VRFY(session, + "%s page at %s has a key count of %" PRIu32 " and a physical entry count of %" PRIu32, + __wt_page_type_string(dsk->type), tag, key_cnt, dsk->u.entries); + + if (0) { +err: + if (ret == 0) + ret = WT_ERROR; + } + __wt_scr_free(session, ¤t); + __wt_scr_free(session, &last); + return (ret); +} + +/* + * __verify_dsk_row_leaf -- + * Walk a WT_PAGE_ROW_LEAF disk page and verify it. 
+ */ +static int +__verify_dsk_row_leaf( + WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, WT_ADDR *addr) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK_KV *unpack, _unpack; + WT_DECL_ITEM(current); + WT_DECL_ITEM(last_ovfl); + WT_DECL_ITEM(last_pfx); + WT_DECL_RET; + WT_ITEM *last; + enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; + void *huffman; + size_t prefix; + uint32_t cell_num, cell_type, i, key_cnt, last_cell_num; + uint8_t *end; + + btree = S2BT(session); + bm = btree->bm; + unpack = &_unpack; + huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key; + + WT_ERR(__wt_scr_alloc(session, 0, ¤t)); + WT_ERR(__wt_scr_alloc(session, 0, &last_pfx)); + WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl)); + last = last_ovfl; + + end = (uint8_t *)dsk + dsk->mem_size; + + last_cell_type = FIRST; + cell_num = last_cell_num = 0; + key_cnt = 0; + WT_CELL_FOREACH_VRFY (session, dsk, cell, unpack, i) { + ++cell_num; + + /* Carefully unpack the cell. */ + ret = __wt_cell_unpack_safe(session, dsk, cell, NULL, unpack, end); + if (ret != 0) { + (void)__err_cell_corrupt(session, ret, cell_num, tag); + goto err; + } + + /* Check the raw and collapsed cell types. */ + WT_ERR(__err_cell_type(session, cell_num, tag, unpack->raw, dsk->type)); + WT_ERR(__err_cell_type(session, cell_num, tag, unpack->type, dsk->type)); + cell_type = unpack->type; + + /* Leaf row-store cells should not have recno/rle fields. */ + if (unpack->v != 0) + WT_ERR_VRFY(session, + "the %" PRIu32 " cell on page at %s has a non-zero rle/recno field", cell_num, tag); + + /* + * Check ordering relationships between the WT_CELL entries. For row-store leaf pages, check + * for: + * - two values in a row, + * - a value as the first cell on a page. 
+ */ + switch (cell_type) { + case WT_CELL_KEY: + case WT_CELL_KEY_OVFL: + ++key_cnt; + last_cell_type = WAS_KEY; + break; + case WT_CELL_VALUE: + case WT_CELL_VALUE_OVFL: + switch (last_cell_type) { + case FIRST: + WT_ERR_VRFY(session, "page at %s begins with a value", tag); + case WAS_KEY: + break; + case WAS_VALUE: + WT_ERR_VRFY(session, + "cell %" PRIu32 " on page at %s is the first of two adjacent values", + cell_num - 1, tag); + } + last_cell_type = WAS_VALUE; + break; + } + + /* Check the validity window. */ + switch (cell_type) { + case WT_CELL_VALUE: + case WT_CELL_VALUE_OVFL: + WT_ERR(__verify_dsk_value_validity(session, unpack, cell_num, addr, tag, dsk)); + break; + } + + /* Check if any referenced item has an invalid address. */ + switch (cell_type) { + case WT_CELL_KEY_OVFL: case WT_CELL_VALUE_OVFL: if ((ret = bm->addr_invalid(bm, session, unpack->data, unpack->size)) == EINVAL) (void)__err_cell_corrupt_or_eof(session, ret, cell_num, tag); @@ -542,17 +735,15 @@ __verify_dsk_row( prefix = unpack->prefix; if (last_pfx->size == 0 && prefix != 0) WT_ERR_VRFY(session, "the %" PRIu32 - " key on page at %s is the first " - "non-overflow key on the page and has a non-zero " - "prefix compression value", + " key on page at %s is the first non-overflow key on the page and " + "has a non-zero prefix compression value", cell_num, tag); /* Confirm the prefix compression count is possible. */ if (cell_num > 1 && prefix > last->size) - WT_ERR_VRFY(session, "key %" PRIu32 - " on page at %s has a prefix " - "compression count of %" WT_SIZET_FMT - ", larger than the length of the previous key, %" WT_SIZET_FMT, + WT_ERR_VRFY(session, + "key %" PRIu32 " on page at %s has a prefix compression count of %" WT_SIZET_FMT + ", larger than the length of the previous key, %" WT_SIZET_FMT, cell_num, tag, prefix, last->size); /* @@ -591,21 +782,11 @@ __verify_dsk_row( key_compare: /* * Compare the current key against the last key. 
- * - * Be careful about the 0th key on internal pages: we only store the first byte and custom - * collators may not be able to handle truncated keys. */ - if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) || - (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) { - WT_ERR(__wt_compare(session, btree->collator, last, current, &cmp)); - if (cmp >= 0) - WT_ERR_VRFY(session, "the %" PRIu32 " and %" PRIu32 - " keys on " - "page at %s are incorrectly sorted: %s, %s", - cell_num - 2, cell_num, tag, - __wt_buf_set_printable(session, last->data, last->size, tmp1), - __wt_buf_set_printable(session, current->data, current->size, tmp2)); - } + if (cell_num > 1) + WT_ERR( + __verify_row_key_order_check(session, last, last_cell_num, current, cell_num, tag)); + last_cell_num = cell_num; /* * Swap the buffers: last always references the last key entry, last_pfx and last_ovfl @@ -625,28 +806,21 @@ key_compare: WT_ERR(__verify_dsk_memsize(session, tag, dsk, cell)); /* - * On row-store internal pages, and on row-store leaf pages, where the - * "no empty values" flag is set, the key count should be equal to half - * the number of physical entries. On row-store leaf pages where the - * "all empty values" flag is set, the key count should be equal to the - * number of physical entries. + * On standard row-store leaf pages there's no check to make, there may be more keys than values + * as zero-length values aren't physically stored on the page. On row-store leaf pages, where + * the "no empty values" flag is set, the key count should be equal to half the number of + * physical entries. On row-store leaf pages where the "all empty values" flag is set, the key + * count should be equal to the number of physical entries. 
*/ - if (dsk->type == WT_PAGE_ROW_INT && key_cnt * 2 != dsk->u.entries) - WT_ERR_VRFY(session, "%s page at %s has a key count of %" PRIu32 - " and a " - "physical entry count of %" PRIu32, - __wt_page_type_string(dsk->type), tag, key_cnt, dsk->u.entries); - if (dsk->type == WT_PAGE_ROW_LEAF && F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) && - key_cnt != dsk->u.entries) + if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) && key_cnt != dsk->u.entries) WT_ERR_VRFY(session, - "%s page at %s with the 'all empty values' flag set has a " - "key count of %" PRIu32 " and a physical entry count of %" PRIu32, + "%s page at %s with the 'all empty values' flag set has a key count of %" PRIu32 + " and a physical entry count of %" PRIu32, __wt_page_type_string(dsk->type), tag, key_cnt, dsk->u.entries); - if (dsk->type == WT_PAGE_ROW_LEAF && F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) && - key_cnt * 2 != dsk->u.entries) + if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) && key_cnt * 2 != dsk->u.entries) WT_ERR_VRFY(session, - "%s page at %s with the 'no empty values' flag set has a " - "key count of %" PRIu32 " and a physical entry count of %" PRIu32, + "%s page at %s with the 'no empty values' flag set has a key count of %" PRIu32 + " and a physical entry count of %" PRIu32, __wt_page_type_string(dsk->type), tag, key_cnt, dsk->u.entries); if (0) { @@ -657,8 +831,6 @@ err: __wt_scr_free(session, ¤t); __wt_scr_free(session, &last_pfx); __wt_scr_free(session, &last_ovfl); - __wt_scr_free(session, &tmp1); - __wt_scr_free(session, &tmp2); return (ret); } @@ -673,7 +845,7 @@ __verify_dsk_col_int( WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_ADDR *unpack, _unpack; WT_DECL_RET; uint32_t cell_num, i; uint8_t *end; @@ -684,11 +856,11 @@ __verify_dsk_col_int( end = (uint8_t *)dsk + dsk->mem_size; cell_num = 0; - WT_CELL_FOREACH_VRFY (btree, dsk, cell, unpack, i) { + WT_CELL_FOREACH_VRFY (session, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. 
*/ - ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, end); + ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, NULL, end); if (ret != 0) return (__err_cell_corrupt(session, ret, cell_num, tag)); @@ -697,7 +869,7 @@ __verify_dsk_col_int( WT_RET(__err_cell_type(session, cell_num, tag, unpack->type, dsk->type)); /* Check the validity window. */ - WT_RET(__verify_dsk_validity(session, unpack, cell_num, addr, tag, dsk)); + WT_RET(__verify_dsk_addr_validity(session, unpack, cell_num, addr, tag, dsk)); /* Check if any referenced item is entirely in the file. */ ret = bm->addr_invalid(bm, session, unpack->data, unpack->size); @@ -743,7 +915,7 @@ __verify_dsk_col_var( WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; + WT_CELL_UNPACK_KV *unpack, _unpack; WT_DECL_RET; uint32_t cell_num, cell_type, i; uint8_t *end; @@ -759,11 +931,11 @@ __verify_dsk_col_var( last.deleted = false; cell_num = 0; - WT_CELL_FOREACH_VRFY (btree, dsk, cell, unpack, i) { + WT_CELL_FOREACH_VRFY (session, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. */ - ret = __wt_cell_unpack_safe(session, dsk, cell, unpack, end); + ret = __wt_cell_unpack_safe(session, dsk, cell, NULL, unpack, end); if (ret != 0) return (__err_cell_corrupt(session, ret, cell_num, tag)); @@ -773,7 +945,7 @@ __verify_dsk_col_var( cell_type = unpack->type; /* Check the validity window. */ - WT_RET(__verify_dsk_validity(session, unpack, cell_num, addr, tag, dsk)); + WT_RET(__verify_dsk_value_validity(session, unpack, cell_num, addr, tag, dsk)); /* Check if any referenced item is entirely in the file. 
*/ if (cell_type == WT_CELL_VALUE_OVFL) { @@ -797,8 +969,7 @@ __verify_dsk_col_var( memcmp(last.data, unpack->data, last.size) == 0) match_err: WT_RET_VRFY(session, "data entries %" PRIu32 " and %" PRIu32 - " on page at %s are identical and should " - "have been run-length encoded", + " on page at %s are identical and should have been run-length encoded", cell_num - 1, cell_num, tag); __wt_time_window_copy(&last.tw, &unpack->tw); @@ -841,9 +1012,8 @@ __verify_dsk_memsize( len = WT_PTRDIFF((uint8_t *)dsk + dsk->mem_size, cell); if (len == 0) return (0); - WT_RET_VRFY(session, "%s page at %s has %" WT_SIZET_FMT - " unexpected bytes of data " - "after the last cell", + WT_RET_VRFY(session, + "%s page at %s has %" WT_SIZET_FMT " unexpected bytes of data after the last cell", __wt_page_type_string(dsk->type), tag, len); } @@ -862,7 +1032,7 @@ __verify_dsk_chunk( end = (uint8_t *)dsk + dsk->mem_size; /* - * Fixed-length column-store and overflow pages are simple chunks of data. Verify the data + * Fixed-length column-store and overflow pages are simple chunks of data-> Verify the data * doesn't overflow the end of the page. */ p = WT_PAGE_HEADER_BYTE(btree, dsk); @@ -896,19 +1066,17 @@ __err_cell_corrupt(WT_SESSION_IMPL *session, int retval, uint32_t entry_num, con static int __err_cell_corrupt_or_eof(WT_SESSION_IMPL *session, int retval, uint32_t entry_num, const char *tag) { - WT_RET_VRFY_RETVAL(session, retval, "item %" PRIu32 - " on page at %s is a corrupted cell or references " - "non-existent file pages", + WT_RET_VRFY_RETVAL(session, retval, + "item %" PRIu32 " on page at %s is a corrupted cell or references non-existent file pages", entry_num, tag); } /* - * __err_cell_type -- - * Generic illegal cell type for a particular page type error. + * __wt_cell_type_check -- + * Check the cell type against the page type. 
*/ -static int -__err_cell_type(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag, uint8_t cell_type, - uint8_t dsk_type) +bool +__wt_cell_type_check(uint8_t cell_type, uint8_t dsk_type) { switch (cell_type) { case WT_CELL_ADDR_DEL: @@ -916,22 +1084,22 @@ __err_cell_type(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag, u case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: if (dsk_type == WT_PAGE_COL_INT || dsk_type == WT_PAGE_ROW_INT) - return (0); + return (true); break; case WT_CELL_DEL: if (dsk_type == WT_PAGE_COL_VAR) - return (0); + return (true); break; case WT_CELL_KEY: case WT_CELL_KEY_OVFL: case WT_CELL_KEY_SHORT: if (dsk_type == WT_PAGE_ROW_INT || dsk_type == WT_PAGE_ROW_LEAF) - return (0); + return (true); break; case WT_CELL_KEY_PFX: case WT_CELL_KEY_SHORT_PFX: if (dsk_type == WT_PAGE_ROW_LEAF) - return (0); + return (true); break; case WT_CELL_KEY_OVFL_RM: case WT_CELL_VALUE_OVFL_RM: @@ -944,13 +1112,23 @@ __err_cell_type(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag, u case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_SHORT: if (dsk_type == WT_PAGE_COL_VAR || dsk_type == WT_PAGE_ROW_LEAF) - return (0); - break; - default: + return (true); break; } + return (false); +} - WT_RET_VRFY(session, "illegal cell and page type combination: cell %" PRIu32 - " on page at %s is a %s cell on a %s page", - entry_num, tag, __wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type)); +/* + * __err_cell_type -- + * Generic illegal cell type for a particular page type error. + */ +static int +__err_cell_type(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag, uint8_t cell_type, + uint8_t dsk_type) +{ + if (!__wt_cell_type_check(cell_type, dsk_type)) + WT_RET_VRFY(session, "illegal cell and page type combination: cell %" PRIu32 + " on page at %s is a %s cell on a %s page", + entry_num, tag, __wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type)); + return (0); } |