diff options
author | Luke Chen <luke.chen@mongodb.com> | 2020-05-06 15:45:01 +1000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-05-06 06:00:50 +0000 |
commit | d2274bb6e1f8b21d73121a2fcb20b6628f652bbe (patch) | |
tree | 72c771934dab7adff1bbffdcb1af1ac6e1c36a0d /src/third_party/wiredtiger/src/btree | |
parent | e500238a9ea3d5498ebffeb74a1aceac42eb2c1f (diff) | |
download | mongo-d2274bb6e1f8b21d73121a2fcb20b6628f652bbe.tar.gz |
Import wiredtiger: 18dfb9e58e39927696affcd8e362364e23e1aa59 from branch mongodb-4.4 (tag: r4.4.0-rc4)
ref: a707df12a2..18dfb9e58e
for: 4.4.0-rc4
WT-5242 Minimize checkpoints pinned during backup
WT-5470 Reduce copies and allocations in read path
WT-5673 Prepare support with durable history: modify verify and salvage as needed
WT-5677 Prepare support with durable history: add test/format stress tests
WT-5710 Review WT_PANIC usage
WT-5716 Create the history store file at the same time as creating the metadata file in wiredtiger open
WT-5839 Ignore non-globally visible tombstones for both data store and hs store in hs verification
WT-5841 Return WT_TRY_SALVAGE when the history file is removed or truncated
WT-5928 Cleanup stale FIXMEs from durable history
WT-5977 WT_SESSION_NO_RECONCILE flag set by history cursor prevents eviction
WT-5984 Allow prepared updates to be evicted in durable history
WT-6009 Prepare support with durable history: add statistic for prepared updates evicted
WT-6032 Turn on mongodb-4.4 branch upgrade/downgrade testing
WT-6051 Fix reconstructing full value from modifies for string format
WT-6068 Re-enable tests temporarily disabled during durable history development
WT-6069 Remove WT_UPDATE_RESTORED_FROM_DISK flag
WT-6070 Coverity : Copy paste error
WT-6071 Coverity : Change format specifier
WT-6086 Move time windows and aggregated time windows into structures
WT-6087 Add a C2S(cursor) macro to simplify translation from a cursor to a session
WT-6095 Verify on-disk page only for row store as part of rollback to stable
WT-6109 Cleanup usage of cursor->session
WT-6110 Cleanup cast from cbt to cursor
WT-6120 Remove use-after-free in __verify_history_store_id
WT-6130 Disable test_random_abort
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
21 files changed, 621 insertions, 780 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index d6c89dacd33..9ea91c6f421 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -16,9 +16,8 @@ static inline int __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* If restarting after a prepare conflict, jump to the right spot. */ if (restart) @@ -58,28 +57,14 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) cbt->iface.value.data = &cbt->v; } else { restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd == NULL) { + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) { cbt->v = 0; cbt->iface.value.data = &cbt->v; - } else { - /* - * If this update has been restored from the disk, it needs to be freed after copying it - * to the user cursor. - */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) { - switch (upd->type) { - case WT_UPDATE_TOMBSTONE: - cbt->iface.value.data = upd->data; - __wt_free_update_list(session, &upd); - break; - default: - return (__wt_value_return(cbt, upd)); - } - } - if (upd != NULL) - cbt->iface.value.data = upd->data; - } + } else if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) + cbt->iface.value.data = cbt->upd_value->buf.data; + else + WT_RET(__wt_value_return(cbt, cbt->upd_value)); } cbt->iface.value.size = 1; return (0); @@ -95,12 +80,10 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_BTREE *btree; WT_PAGE *page; WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); btree = S2BT(session); page = cbt->ref->page; - upd = NULL; /* If restarting after a prepare conflict, jump to the right spot. 
*/ if (restart) @@ -127,33 +110,20 @@ new_page: if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) cbt->ins = NULL; /* - * FIXME-PM-1523: Now we only do transaction read if we have an update chain and it doesn't work + * FIXME-WT-6127: Now we only do transaction read if we have an update chain and it doesn't work * in durable history. Review this when we have a plan for fixed-length column store. */ + __wt_upd_value_clear(cbt->upd_value); if (cbt->ins != NULL) restart_read: - WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, cbt->ins->upd, NULL, &upd)); - if (upd == NULL) { + WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, cbt->ins->upd, NULL)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) { cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); cbt->iface.value.data = &cbt->v; - } else { - /* - * If this update has been restored from the disk, it needs to be freed after copying it to - * the user cursor. - */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) { - switch (upd->type) { - case WT_UPDATE_TOMBSTONE: - cbt->iface.value.data = upd->data; - __wt_free_update_list(session, &upd); - break; - default: - return (__wt_value_return(cbt, upd)); - } - } - if (upd != NULL) - cbt->iface.value.data = upd->data; - } + } else if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) + cbt->iface.value.data = cbt->upd_value->buf.data; + else + WT_RET(__wt_value_return(cbt, cbt->upd_value)); cbt->iface.value.size = 1; return (0); } @@ -166,9 +136,8 @@ static inline int __cursor_var_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* If restarting after a prepare conflict, jump to the right spot. 
*/ if (restart) @@ -187,18 +156,17 @@ new_page: __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); - if (upd == NULL) + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* NOTREACHED */ } @@ -216,10 +184,9 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_INSERT *ins; WT_PAGE *page; WT_SESSION_IMPL *session; - WT_UPDATE *upd; uint64_t rle, rle_start; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; rle_start = 0; /* -Werror=maybe-uninitialized */ @@ -258,18 +225,17 @@ restart_read: /* Check any insert list for a matching record. 
*/ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); - upd = NULL; + __wt_upd_value_clear(cbt->upd_value); if (cbt->ins != NULL) - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type != WT_UPDATE_INVALID) { + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* @@ -309,8 +275,9 @@ restart_read: continue; } - WT_RET(__wt_bt_col_var_cursor_walk_txn_read(session, cbt, page, &unpack, cip, &upd)); - if (upd == NULL) + WT_RET(__wt_bt_col_var_cursor_walk_txn_read(session, cbt, page, &unpack, cip)); + if (cbt->upd_value->type == WT_UPDATE_INVALID || + cbt->upd_value->type == WT_UPDATE_TOMBSTONE) continue; return (0); } @@ -334,10 +301,9 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; - WT_UPDATE *upd; bool kpack_used; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; key = &cbt->iface.key; @@ -386,17 +352,16 @@ restart_read_insert: if ((ins = cbt->ins) != NULL) { key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - WT_RET(__wt_txn_read_upd_list(session, ins->upd, &upd)); - if (upd == NULL) + WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && 
__wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* Check for the end of the page. */ @@ -422,17 +387,16 @@ restart_read_page: rip = &page->pg_row[cbt->slot]; WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used)); WT_RET(__wt_txn_read( - session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL, &upd)); - if (upd == NULL) + session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* NOTREACHED */ } @@ -461,7 +425,7 @@ __cursor_key_order_check_col(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, boo return (0); } - WT_PANIC_RET(session, EINVAL, "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 + WT_RET_PANIC(session, EINVAL, "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 " then " "key %" PRIu64, next ? 
"next" : "prev", cbt->lastrecno, cbt->recno); @@ -494,7 +458,7 @@ __cursor_key_order_check_row(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, boo WT_ERR(__wt_scr_alloc(session, 512, &a)); WT_ERR(__wt_scr_alloc(session, 512, &b)); - WT_PANIC_ERR(session, EINVAL, + WT_ERR_PANIC(session, EINVAL, "WT_CURSOR.%s out-of-order returns: returned key %.1024s then " "key %.1024s", next ? "next" : "prev", __wt_buf_set_printable_format(session, cbt->lastkey->data, @@ -536,7 +500,7 @@ __wt_cursor_key_order_init(WT_CURSOR_BTREE *cbt) { WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* * Cursor searches set the position for cursor movements, set the last-key value for diagnostic @@ -648,7 +612,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) bool newpage, restart; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_next); WT_STAT_DATA_INCR(session, cursor_next); diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 0099d1ae594..f8db9cd6233 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -40,7 +40,7 @@ __cursor_skip_prev(WT_CURSOR_BTREE *cbt) uint64_t recno; int i; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); restart: /* @@ -123,9 +123,8 @@ static inline int __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* If restarting after a prepare conflict, jump to the right spot. 
*/ if (restart) @@ -198,28 +197,14 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) cbt->iface.value.data = &cbt->v; } else { restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd == NULL) { + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) { cbt->v = 0; cbt->iface.value.data = &cbt->v; - } else { - /* - * If this update has been restored from the disk, it needs to be freed after copying it - * to the user cursor. - */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) { - switch (upd->type) { - case WT_UPDATE_TOMBSTONE: - cbt->iface.value.data = upd->data; - __wt_free_update_list(session, &upd); - break; - default: - return (__wt_value_return(cbt, upd)); - } - } - if (upd != NULL) - cbt->iface.value.data = upd->data; - } + } else if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) + cbt->iface.value.data = cbt->upd_value->buf.data; + else + WT_RET(__wt_value_return(cbt, cbt->upd_value)); } cbt->iface.value.size = 1; return (0); @@ -235,9 +220,8 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_BTREE *btree; WT_PAGE *page; WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; btree = S2BT(session); @@ -265,35 +249,21 @@ new_page: cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno); if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) cbt->ins = NULL; - upd = NULL; /* - * FIXME-PM-1523: Now we only do transaction read if we have an update chain and it doesn't work + * FIXME-WT-6127: Now we only do transaction read if we have an update chain and it doesn't work * in durable history. Review this when we have a plan for fixed-length column store. 
*/ + __wt_upd_value_clear(cbt->upd_value); if (cbt->ins != NULL) restart_read: - WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, cbt->ins->upd, NULL, &upd)); - if (upd == NULL) { + WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, cbt->ins->upd, NULL)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) { cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); cbt->iface.value.data = &cbt->v; - } else { - /* - * If this update has been restored from the disk, it needs to be freed after copying it to - * the user cursor. - */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) { - switch (upd->type) { - case WT_UPDATE_TOMBSTONE: - cbt->iface.value.data = upd->data; - __wt_free_update_list(session, &upd); - break; - default: - return (__wt_value_return(cbt, upd)); - } - } - if (upd != NULL) - cbt->iface.value.data = upd->data; - } + } else if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) + cbt->iface.value.data = cbt->upd_value->buf.data; + else + WT_RET(__wt_value_return(cbt, cbt->upd_value)); cbt->iface.value.size = 1; return (0); } @@ -306,9 +276,8 @@ static inline int __cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* If restarting after a prepare conflict, jump to the right spot. 
*/ if (restart) @@ -327,17 +296,16 @@ new_page: __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd == NULL) + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK) && upd->type != WT_UPDATE_TOMBSTONE) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* NOTREACHED */ } @@ -355,10 +323,9 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_INSERT *ins; WT_PAGE *page; WT_SESSION_IMPL *session; - WT_UPDATE *upd; uint64_t rle_start; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; rle_start = 0; /* -Werror=maybe-uninitialized */ @@ -398,18 +365,17 @@ restart_read: /* Check any insert list for a matching record. 
*/ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); - upd = NULL; + __wt_upd_value_clear(cbt->upd_value); if (cbt->ins != NULL) - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type != WT_UPDATE_INVALID) { + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* @@ -449,8 +415,9 @@ restart_read: continue; } - WT_RET(__wt_bt_col_var_cursor_walk_txn_read(session, cbt, page, &unpack, cip, &upd)); - if (upd == NULL) + WT_RET(__wt_bt_col_var_cursor_walk_txn_read(session, cbt, page, &unpack, cip)); + if (cbt->upd_value->type == WT_UPDATE_INVALID || + cbt->upd_value->type == WT_UPDATE_TOMBSTONE) continue; return (0); } @@ -474,10 +441,9 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; - WT_UPDATE *upd; bool kpack_used; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; key = &cbt->iface.key; @@ -536,17 +502,16 @@ restart_read_insert: if ((ins = cbt->ins) != NULL) { key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - WT_RET(__wt_txn_read_upd_list(session, ins->upd, &upd)); - if (upd == NULL) + WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && 
__wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* Check for the beginning of the page. */ @@ -574,17 +539,16 @@ restart_read_page: rip = &page->pg_row[cbt->slot]; WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used)); WT_RET(__wt_txn_read( - session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL, &upd)); - if (upd == NULL) + session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* NOTREACHED */ } @@ -604,7 +568,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) bool newpage, restart; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_prev); WT_STAT_DATA_INCR(session, cursor_prev); diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 61a0a2653f6..ccec03700d0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -60,7 +60,7 @@ __cursor_page_pinned(WT_CURSOR_BTREE *cbt, bool 
search_operation) WT_SESSION_IMPL *session; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); /* * Check the page active flag, asserting the page reference with any external key. @@ -171,21 +171,18 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) * Return if the cursor references an valid key/value pair. */ int -__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, WT_UPDATE **updp, bool *valid) +__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, bool *valid) { WT_BTREE *btree; WT_CELL *cell; WT_COL *cip; WT_PAGE *page; WT_SESSION_IMPL *session; - WT_UPDATE *upd; - if (updp != NULL) - *updp = NULL; *valid = false; btree = cbt->btree; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* * We may be pointing to an insert object, and we may have a page with @@ -232,22 +229,22 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, WT_UPDATE * update that's been deleted is not a valid key/value pair). */ if (cbt->ins != NULL) { - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) { - WT_ASSERT(session, !F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type != WT_UPDATE_INVALID) { + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) return (0); - } - if (updp != NULL) - *updp = upd; - else if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); *valid = true; return (0); } } /* + * Clean out any stale value here. Calling a transaction read helper automatically clears this + * but we have some code paths that don't do this (fixed length column store is one example). 
+ */ + __wt_upd_value_clear(cbt->upd_value); + + /* * If we don't have an insert object, or in the case of column-store, there's an insert object * but no update was visible to us and the key on the page is the same as the insert object's * key, and the slot as set by the search function is valid, we can use the original page @@ -299,17 +296,10 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, WT_UPDATE * Check for an update ondisk or in the history store. For column store, an insert object * can have the same key as an on-page or history store object. */ - WT_RET(__wt_txn_read(session, cbt, key, recno, NULL, NULL, &upd)); - if (upd != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); + WT_RET(__wt_txn_read(session, cbt, key, recno, NULL, NULL)); + if (cbt->upd_value->type != WT_UPDATE_INVALID) { + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) return (0); - } - if (updp != NULL) - *updp = upd; - else if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); *valid = true; } break; @@ -335,17 +325,10 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, WT_UPDATE (page->modify != NULL && page->modify->mod_row_update != NULL) ? 
page->modify->mod_row_update[cbt->slot] : NULL, - NULL, &upd)); - if (upd != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); + NULL)); + if (cbt->upd_value->type != WT_UPDATE_INVALID) { + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) return (0); - } - if (updp != NULL) - *updp = upd; - else if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); *valid = true; } break; @@ -363,7 +346,7 @@ __cursor_col_search(WT_CURSOR_BTREE *cbt, WT_REF *leaf, bool *leaf_foundp) WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); WT_WITH_PAGE_INDEX( session, ret = __wt_col_search(cbt, cbt->iface.recno, leaf, false, leaf_foundp)); return (ret); @@ -379,7 +362,7 @@ __cursor_row_search(WT_CURSOR_BTREE *cbt, bool insert, WT_REF *leaf, bool *leaf_ WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); WT_WITH_PAGE_INDEX( session, ret = __wt_row_search(cbt, &cbt->iface.key, insert, leaf, false, leaf_foundp)); return (ret); @@ -429,7 +412,7 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_SESSION_IMPL *session; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_reset); WT_STAT_DATA_INCR(session, cursor_reset); @@ -440,11 +423,11 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt) } /* - * __wt_btcur_search_uncommitted -- - * Search and return exact matching records only, including uncommitted ones. + * __wt_btcur_search_prepared -- + * Search and return exact matching records only. 
*/ int -__wt_btcur_search_uncommitted(WT_CURSOR *cursor, WT_UPDATE **updp) +__wt_btcur_search_prepared(WT_CURSOR *cursor, WT_UPDATE **updp) { WT_BTREE *btree; WT_CURSOR_BTREE *cbt; @@ -500,12 +483,6 @@ __wt_btcur_search_uncommitted(WT_CURSOR *cursor, WT_UPDATE **updp) break; } - /* - * Like regular uncommitted updates, pages with prepared updates are pinned to the cache and can - * never be written to the history store. Therefore, there is no need to do a search here for - * uncommitted updates. - */ - *updp = upd; return (0); } @@ -522,13 +499,11 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - WT_UPDATE *upd; bool leaf_found, valid; btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; - upd = NULL; /* -Wuninitialized */ + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_search); WT_STAT_DATA_INCR(session, cursor_search); @@ -557,11 +532,11 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(cbt, false, cbt->ref, &leaf_found)); if (leaf_found && cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); } else { WT_ERR(__cursor_col_search(cbt, cbt->ref, &leaf_found)); if (leaf_found && cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, &valid)); } } if (!valid) { @@ -570,16 +545,16 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(cbt, false, NULL, NULL)); if (cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); } else { WT_ERR(__cursor_col_search(cbt, NULL, NULL)); if (cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, 
NULL, cbt->recno, &valid)); } } if (valid) - ret = __cursor_kv_return(cbt, upd); + ret = __cursor_kv_return(cbt, cbt->upd_value); else if (__cursor_fix_implicit(btree, cbt)) { /* * Creating a record past the end of the tree in a fixed-length column-store implicitly @@ -619,14 +594,12 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - WT_UPDATE *upd; int exact; bool leaf_found, valid; btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; - upd = NULL; /* -Wuninitialized */ + session = CUR2S(cbt); exact = 0; WT_STAT_CONN_INCR(session, cursor_search_near); @@ -671,7 +644,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) */ if (leaf_found && (cbt->compare == 0 || (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1))) - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); @@ -682,10 +655,10 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) */ if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(cbt, true, NULL, NULL)); - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); } else { WT_ERR(__cursor_col_search(cbt, NULL, NULL)); - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, &valid)); } } @@ -706,7 +679,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) */ if (valid) { exact = cbt->compare; - ret = __cursor_kv_return(cbt, upd); + ret = __cursor_kv_return(cbt, cbt->upd_value); } else if (__cursor_fix_implicit(btree, cbt)) { cbt->recno = cursor->recno; cbt->v = 0; @@ -781,9 +754,12 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) btree = cbt->btree; cursor = &cbt->iface; insert_bytes = cursor->key.size + cursor->value.size; - session = 
(WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); yield_count = sleep_usecs = 0; + WT_RET_PANIC_ASSERT( + session, S2BT(session) == btree, WT_PANIC, "btree differs unexpectedly from session's btree"); + WT_STAT_CONN_INCR(session, cursor_insert); WT_STAT_DATA_INCR(session, cursor_insert); WT_STAT_CONN_INCRV(session, cursor_insert_bytes, insert_bytes); @@ -793,9 +769,6 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); - WT_RET_ASSERT( - session, S2BT(session) == btree, WT_PANIC, "btree differs unexpectedly from session's btree"); - /* It's no longer possible to bulk-load into the tree. */ __wt_cursor_disable_bulk(session); @@ -859,7 +832,9 @@ retry: * If not overwriting, fail if the key exists, else insert the key/value pair. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && cbt->compare == 0) { - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, NULL, &valid)); + WT_WITH_UPDATE_VALUE_SKIP_BUF( + ret = __wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); + WT_ERR(ret); if (valid) WT_ERR(WT_DUPLICATE_KEY); } @@ -885,7 +860,9 @@ retry: */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { if (cbt->compare == 0) { - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, NULL, &valid)); + WT_WITH_UPDATE_VALUE_SKIP_BUF( + ret = __wt_cursor_valid(cbt, NULL, cbt->recno, &valid)); + WT_ERR(ret); if (valid) WT_ERR(WT_DUPLICATE_KEY); } else if (__cursor_fix_implicit(btree, cbt)) @@ -932,7 +909,7 @@ __curfile_update_check(WT_CURSOR_BTREE *cbt) btree = cbt->btree; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); upd = NULL; if (cbt->compare != 0) @@ -964,7 +941,7 @@ __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt) uint64_t yield_count, sleep_usecs; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); yield_count = sleep_usecs = 0; WT_ASSERT(session, cbt->btree->type == BTREE_ROW); @@ -1015,7 
+992,7 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt, bool positioned) btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); yield_count = sleep_usecs = 0; iterating = F_ISSET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV); searched = false; @@ -1085,7 +1062,8 @@ retry: if (cbt->compare != 0) goto search_notfound; - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, NULL, &valid)); + WT_WITH_UPDATE_VALUE_SKIP_BUF(ret = __wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); + WT_ERR(ret); if (!valid) goto search_notfound; @@ -1103,8 +1081,10 @@ retry: /* Remove the record if it exists. */ valid = false; - if (cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, NULL, &valid)); + if (cbt->compare == 0) { + WT_WITH_UPDATE_VALUE_SKIP_BUF(ret = __wt_cursor_valid(cbt, NULL, cbt->recno, &valid)); + WT_ERR(ret); + } if (cbt->compare != 0 || !valid) { if (!__cursor_fix_implicit(btree, cbt)) goto search_notfound; @@ -1203,10 +1183,10 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); yield_count = sleep_usecs = 0; - WT_RET_ASSERT( + WT_RET_PANIC_ASSERT( session, S2BT(session) == btree, WT_PANIC, "btree differs unexpectedly from session's btree"); /* It's no longer possible to bulk-load into the tree. 
*/ @@ -1287,7 +1267,9 @@ update_local: WT_ERR(__curfile_update_check(cbt)); if (cbt->compare != 0) WT_ERR(WT_NOTFOUND); - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, NULL, &valid)); + WT_WITH_UPDATE_VALUE_SKIP_BUF( + ret = __wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); + WT_ERR(ret); if (!valid) WT_ERR(WT_NOTFOUND); } @@ -1302,8 +1284,11 @@ update_local: if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); valid = false; - if (cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, NULL, &valid)); + if (cbt->compare == 0) { + WT_WITH_UPDATE_VALUE_SKIP_BUF( + ret = __wt_cursor_valid(cbt, NULL, cbt->recno, &valid)); + WT_ERR(ret); + } if ((cbt->compare != 0 || !valid) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } @@ -1375,7 +1360,7 @@ __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt) cursor = &cbt->iface; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); upd = NULL; if (cbt->ins != NULL) @@ -1429,7 +1414,7 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) bool overwrite; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); /* Save the cursor state. 
*/ __cursor_state_save(cursor, &state); @@ -1465,7 +1450,7 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) WT_ERR(__wt_modify_pack(cursor, entries, nentries, &modify)); orig = cursor->value.size; - WT_ERR(__wt_modify_apply(cursor, modify->data)); + WT_ERR(__wt_modify_apply_item(session, cursor->value_format, &cursor->value, modify->data)); new = cursor->value.size; WT_ERR(__cursor_size_chk(session, &cursor->value)); @@ -1515,7 +1500,7 @@ __wt_btcur_reserve(WT_CURSOR_BTREE *cbt) bool overwrite; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_reserve); WT_STAT_DATA_INCR(session, cursor_reserve); @@ -1542,7 +1527,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_update); WT_STAT_DATA_INCR(session, cursor_update); @@ -1568,7 +1553,7 @@ __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) a = (WT_CURSOR *)a_arg; b = (WT_CURSOR *)b_arg; - session = (WT_SESSION_IMPL *)a->session; + session = CUR2S(a_arg); /* Confirm both cursors reference the same object. */ if (a_arg->btree != b_arg->btree) @@ -1640,8 +1625,8 @@ __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp) a = (WT_CURSOR *)a_arg; b = (WT_CURSOR *)b_arg; + session = CUR2S(a_arg); cmp = 0; - session = (WT_SESSION_IMPL *)a->session; /* Confirm both cursors reference the same object. 
*/ if (a_arg->btree != b_arg->btree) @@ -1673,7 +1658,7 @@ __cursor_truncate( WT_SESSION_IMPL *session; uint64_t yield_count, sleep_usecs; - session = (WT_SESSION_IMPL *)start->iface.session; + session = CUR2S(start); yield_count = sleep_usecs = 0; /* @@ -1729,7 +1714,7 @@ __cursor_truncate_fix( uint64_t yield_count, sleep_usecs; const uint8_t *value; - session = (WT_SESSION_IMPL *)start->iface.session; + session = CUR2S(start); yield_count = sleep_usecs = 0; /* @@ -1786,8 +1771,8 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)start->iface.session; btree = start->btree; + session = CUR2S(start); WT_STAT_DATA_INCR(session, cursor_truncate); WT_RET(__wt_txn_autocommit_check(session)); @@ -1852,6 +1837,8 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) { cbt->row_key = &cbt->_row_key; cbt->tmp = &cbt->_tmp; + cbt->modify_update = &cbt->_modify_update; + cbt->upd_value = &cbt->_upd_value; #ifdef HAVE_DIAGNOSTIC cbt->lastkey = &cbt->_lastkey; @@ -1869,7 +1856,7 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel) WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* * The in-memory split and history store table code creates low-level btree cursors to @@ -1879,6 +1866,8 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel) if (!lowlevel) ret = __cursor_reset(cbt); + __wt_buf_free(session, &cbt->_modify_update.buf); + __wt_buf_free(session, &cbt->_upd_value.buf); __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); #ifdef HAVE_DIAGNOSTIC diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index e3e89620fd5..a835e593022 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -156,18 +156,16 @@ __debug_item_value(WT_DBG *ds, const char *tag, const void *data_arg, size_t siz 
} /* - * __debug_time_pairs -- + * __debug_time_window -- * Dump a set of start and stop time pairs, with an optional tag. */ static inline int -__debug_time_pairs(WT_DBG *ds, const char *tag, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t stop_ts, uint64_t stop_txn) +__debug_time_window(WT_DBG *ds, const char *tag, WT_TIME_WINDOW *tw) { - char tp_string[2][WT_TP_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; - return (ds->f(ds, "\t%s%s%s,%s\n", tag == NULL ? "" : tag, tag == NULL ? "" : " ", - __wt_time_pair_to_string(start_ts, start_txn, tp_string[0]), - __wt_time_pair_to_string(stop_ts, stop_txn, tp_string[1]))); + return (ds->f(ds, "\t%s%s%s\n", tag == NULL ? "" : tag, tag == NULL ? "" : " ", + __wt_time_window_to_string(tw, time_string))); } /* @@ -711,15 +709,13 @@ int __wt_debug_cursor_tree_hs(void *cursor_arg, const char *ofile) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { - WT_CURSOR *cursor; WT_CURSOR_BTREE *cbt; WT_DECL_RET; WT_SESSION_IMPL *session; uint32_t session_flags; bool is_owner; - cursor = cursor_arg; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor_arg); session_flags = 0; /* [-Werror=maybe-uninitialized] */ WT_RET(__wt_hs_cursor(session, &session_flags, &is_owner)); @@ -741,26 +737,24 @@ __wt_debug_cursor_hs(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor) WT_DECL_ITEM(hs_key); WT_DECL_ITEM(hs_value); WT_DECL_RET; - WT_TIME_PAIR start, stop; + WT_TIME_WINDOW tw; WT_UPDATE *upd; - wt_timestamp_t hs_durable_ts; - uint64_t hs_upd_type_full; + uint64_t hs_counter, hs_upd_type_full; uint32_t hs_btree_id; - uint8_t hs_prep_state, hs_upd_type; + uint8_t hs_upd_type; ds = &_ds; + __wt_time_window_init(&tw); WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); WT_ERR(__wt_scr_alloc(session, 0, &hs_value)); WT_ERR(__debug_config(session, ds, NULL)); - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &start.timestamp, &start.txnid, - &stop.timestamp, &stop.txnid)); + 
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &tw.start_ts, &hs_counter)); + WT_ERR(hs_cursor->get_value( + hs_cursor, &tw.stop_ts, &tw.durable_start_ts, &hs_upd_type_full, hs_value)); + WT_ERR(__debug_time_window(ds, "T", &tw)); - WT_ERR(__debug_time_pairs(ds, "T", start.timestamp, start.txnid, stop.timestamp, stop.txnid)); - - WT_ERR( - hs_cursor->get_value(hs_cursor, &hs_durable_ts, &hs_prep_state, &hs_upd_type_full, hs_value)); hs_upd_type = (uint8_t)hs_upd_type_full; switch (hs_upd_type) { case WT_UPDATE_MODIFY: @@ -806,8 +800,7 @@ __wt_debug_key_value( WT_ERR(ds->f(ds, "\tK {%" PRIu64 " %" PRIu64 "}", recno, rle)); else WT_ERR(__debug_item_key(ds, "K", key->data, key->size)); - WT_ERR(__debug_time_pairs( - ds, "T", value->start_ts, value->start_txn, value->stop_ts, value->stop_txn)); + WT_ERR(__debug_time_window(ds, "T", &value->tw)); WT_ERR(__debug_cell_data(ds, NULL, value != NULL ? value->type : 0, "V", value)); err: @@ -1350,8 +1343,7 @@ __debug_ref(WT_DBG *ds, WT_REF *ref) { WT_ADDR_COPY addr; WT_SESSION_IMPL *session; - char tp_string[2][WT_TP_STRING_SIZE]; - char ts_string[2][WT_TS_INT_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; session = ds->session; @@ -1365,13 +1357,7 @@ __debug_ref(WT_DBG *ds, WT_REF *ref) WT_RET(ds->f(ds, ", %s", "reading")); if (__wt_ref_addr_copy(session, ref, &addr)) - WT_RET(ds->f(ds, - ", start/stop durable ts %s,%s, start/stop ts/txn %s,%s, prepared updates: %s, %s", - __wt_timestamp_to_string(addr.newest_start_durable_ts, ts_string[0]), - __wt_timestamp_to_string(addr.newest_stop_durable_ts, ts_string[1]), - __wt_time_pair_to_string(addr.oldest_start_ts, addr.oldest_start_txn, tp_string[0]), - __wt_time_pair_to_string(addr.newest_stop_ts, addr.newest_stop_txn, tp_string[1]), - addr.prepare ? 
"true" : "false", + WT_RET(ds->f(ds, "%s, %s", __wt_time_aggregate_to_string(&addr.ta, time_string), __wt_addr_string(session, addr.addr, addr.size, ds->t1))); return (ds->f(ds, "\n")); } @@ -1386,8 +1372,7 @@ __debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack) WT_DECL_ITEM(buf); WT_DECL_RET; WT_SESSION_IMPL *session; - char tp_string[2][WT_TP_STRING_SIZE]; - char ts_string[2][WT_TS_INT_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; session = ds->session; @@ -1429,11 +1414,7 @@ __debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack) case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: - WT_RET(ds->f(ds, ", ts/txn %s,%s,%s,%s", - __wt_timestamp_to_string(unpack->newest_start_durable_ts, ts_string[0]), - __wt_timestamp_to_string(unpack->newest_stop_durable_ts, ts_string[1]), - __wt_time_pair_to_string(unpack->oldest_start_ts, unpack->oldest_start_txn, tp_string[0]), - __wt_time_pair_to_string(unpack->newest_stop_ts, unpack->newest_stop_txn, tp_string[1]))); + WT_RET(ds->f(ds, ", %s", __wt_time_aggregate_to_string(&unpack->ta, time_string))); break; case WT_CELL_DEL: case WT_CELL_VALUE: @@ -1441,9 +1422,7 @@ __debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack) case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_OVFL_RM: case WT_CELL_VALUE_SHORT: - WT_RET(ds->f(ds, ", ts/txn %s,%s", - __wt_time_pair_to_string(unpack->start_ts, unpack->start_txn, tp_string[0]), - __wt_time_pair_to_string(unpack->stop_ts, unpack->stop_txn, tp_string[1]))); + WT_RET(ds->f(ds, ", %s", __wt_time_window_to_string(&unpack->tw, time_string))); break; } diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 94b544f6bc2..b9a3eed1c93 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -115,7 +115,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) goto err; 
if (addr.type != WT_ADDR_LEAF_NO) goto err; - if (!__wt_txn_visible(session, addr.oldest_start_txn, addr.oldest_start_ts)) + if (!__wt_txn_visible(session, addr.ta.oldest_start_txn, addr.ta.oldest_start_ts)) goto err; /* @@ -292,7 +292,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *page; WT_PAGE_DELETED *page_del; WT_ROW *rip; - WT_TIME_PAIR start, stop; + WT_TIME_WINDOW tw; WT_UPDATE **upd_array, *upd; size_t size; uint32_t count, i; @@ -382,8 +382,8 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * Retrieve the stop time pair from the page's row. If we find an existing stop time pair we * don't need to append a tombstone. */ - __wt_read_row_time_pairs(session, page, rip, &start, &stop); - if (stop.timestamp == WT_TS_MAX && stop.txnid == WT_TXN_MAX) { + __wt_read_row_time_window(session, page, rip, &tw); + if (tw.stop_ts == WT_TS_MAX && tw.stop_txn == WT_TXN_MAX) { WT_ERR(__tombstone_update_alloc(session, page_del, &upd, &size)); upd->next = upd_array[WT_ROW_SLOT(page, rip)]; upd_array[WT_ROW_SLOT(page, rip)] = upd; diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c index d4d83783a1b..7fac3deabd5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_io.c +++ b/src/third_party/wiredtiger/src/btree/bt_io.c @@ -140,7 +140,7 @@ corrupt: F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); if (!F_ISSET(btree, WT_BTREE_VERIFY) && !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) { WT_TRET(bm->corrupt(bm, session, addr, addr_size)); - WT_PANIC_ERR(session, ret, "%s: fatal read error: %s", btree->dhandle->name, fail_msg); + WT_ERR_PANIC(session, ret, "%s: fatal read error: %s", btree->dhandle->name, fail_msg); } } diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index 72523b695de..cccd2c628a3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ 
-65,7 +65,7 @@ __wt_ovfl_read( */ __wt_readlock(session, &S2BT(session)->ovfl_lock); if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) { - WT_ASSERT(session, __wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts)); + WT_ASSERT(session, __wt_txn_visible_all(session, unpack->tw.stop_txn, unpack->tw.stop_ts)); ret = __wt_buf_setstr(session, store, "WT_CELL_VALUE_OVFL_RM"); *decoded = true; } else diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index ac588bf901d..1a690b24804 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -544,13 +544,23 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_CELL_UNPACK unpack; + WT_ITEM buf; WT_ROW *rip; + WT_UPDATE **upd_array, *upd; + size_t size, total_size; + uint32_t i; + bool instantiate_prepared, prepare; btree = S2BT(session); + prepare = false; + + instantiate_prepared = F_ISSET_ATOMIC(page, WT_PAGE_INSTANTIATE_PREPARE_UPDATE); /* Walk the page, building indices. */ rip = page->pg_row; WT_CELL_FOREACH_BEGIN (session, btree, page->dsk, unpack) { + if (instantiate_prepared && !prepare && F_ISSET(&unpack, WT_CELL_UNPACK_PREPARE)) + prepare = true; switch (unpack.type) { case WT_CELL_KEY_OVFL: __wt_row_leaf_key_set_cell(page, rip, unpack.cell); @@ -575,9 +585,9 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) * The visibility information is not referenced on the page so we need to ensure that * the value is globally visible at the point in time where we read the page into cache. 
*/ - if (!btree->huffman_value && unpack.stop_txn == WT_TXN_MAX && - unpack.stop_ts == WT_TS_MAX && - __wt_txn_visible_all(session, unpack.start_txn, unpack.start_ts)) + if (!btree->huffman_value && unpack.tw.stop_txn == WT_TXN_MAX && + unpack.tw.stop_ts == WT_TS_MAX && !F_ISSET(&unpack, WT_CELL_UNPACK_PREPARE) && + __wt_txn_visible_all(session, unpack.tw.start_txn, unpack.tw.start_ts)) __wt_row_leaf_value_set(page, rip - 1, &unpack); break; case WT_CELL_VALUE_OVFL: @@ -589,8 +599,47 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) WT_CELL_FOREACH_END; /* - * We do not currently instantiate keys on leaf pages when the page is loaded, they're - * instantiated on demand. + * Instantiate prepared updates on leaf pages when the page is loaded. For in-memory databases, + * all non obsolete updates will retain on the page as part of __split_multi_inmem function. */ + if (prepare && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) { + WT_RET(__wt_page_modify_init(session, page)); + if (!F_ISSET(btree, WT_BTREE_READONLY)) + __wt_page_modify_set(session, page); + + /* Allocate the per-page update array if one doesn't already exist. */ + if (page->entries != 0 && page->modify->mod_row_update == NULL) + WT_RET(__wt_calloc_def(session, page->entries, &page->modify->mod_row_update)); + + /* For each entry in the page */ + size = total_size = 0; + upd_array = page->modify->mod_row_update; + WT_ROW_FOREACH (page, rip, i) { + /* Unpack the on-page value cell. */ + __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); + if (F_ISSET(&unpack, WT_CELL_UNPACK_PREPARE)) { + if (unpack.tw.stop_ts == WT_TS_MAX && unpack.tw.stop_txn == WT_TXN_MAX) { + /* Take the value from the original page cell. 
*/ + WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &buf)); + + WT_RET(__wt_upd_alloc(session, &buf, WT_UPDATE_STANDARD, &upd, &size)); + upd->durable_ts = WT_TS_NONE; + upd->start_ts = unpack.tw.start_ts; + upd->txnid = unpack.tw.start_txn; + } else { + WT_RET(__wt_upd_alloc_tombstone(session, &upd, &size)); + upd->durable_ts = WT_TS_NONE; + upd->start_ts = unpack.tw.stop_ts; + upd->txnid = unpack.tw.stop_txn; + } + upd->prepare_state = WT_PREPARE_INPROGRESS; + upd_array[WT_ROW_SLOT(page, rip)] = upd; + total_size += size; + } + } + + __wt_cache_page_inmem_incr(session, page, total_size); + } + return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index b3a8985fbe4..3f113e4b2dc 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -13,10 +13,8 @@ * Check if the inserted key/value pair is valid. */ static int -__random_insert_valid( - WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_INSERT *ins, WT_UPDATE **updp, bool *validp) +__random_insert_valid(WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_INSERT *ins, bool *validp) { - *updp = NULL; *validp = false; __cursor_pos_clear(cbt); @@ -27,7 +25,7 @@ __random_insert_valid( cbt->tmp->data = WT_INSERT_KEY(ins); cbt->tmp->size = WT_INSERT_KEY_SIZE(ins); - return (__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, updp, validp)); + return (__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, validp)); } /* @@ -35,16 +33,15 @@ __random_insert_valid( * Check if the slot key/value pair is valid. 
*/ static int -__random_slot_valid(WT_CURSOR_BTREE *cbt, uint32_t slot, WT_UPDATE **updp, bool *validp) +__random_slot_valid(WT_CURSOR_BTREE *cbt, uint32_t slot, bool *validp) { - *updp = NULL; *validp = false; __cursor_pos_clear(cbt); cbt->slot = slot; cbt->compare = 0; - return (__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, updp, validp)); + return (__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, validp)); } /* Magic constant: 5000 entries in a skip list is enough to forcibly evict. */ @@ -64,7 +61,7 @@ __random_skip_entries(WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head) uint32_t entries; int level; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); entries = 0; /* [-Wconditional-uninitialized] */ if (ins_head == NULL) @@ -106,18 +103,16 @@ __random_skip_entries(WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head) * Return a random key/value from a skip list. */ static int -__random_leaf_skip( - WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, uint32_t entries, WT_UPDATE **updp, bool *validp) +__random_leaf_skip(WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, uint32_t entries, bool *validp) { WT_INSERT *ins, *saved_ins; WT_SESSION_IMPL *session; uint32_t i; int retry; - *updp = NULL; *validp = false; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* This is a relatively expensive test, try a few times then quit. */ for (retry = 0; retry < WT_RANDOM_SKIP_RETRY; ++retry) { @@ -136,7 +131,7 @@ __random_leaf_skip( /* Try and return our selected record. 
*/ if (ins != NULL) { - WT_RET(__random_insert_valid(cbt, ins_head, ins, updp, validp)); + WT_RET(__random_insert_valid(cbt, ins_head, ins, validp)); if (*validp) return (0); } @@ -148,7 +143,7 @@ __random_leaf_skip( ins = saved_ins; } for (; --i > 0 && ins != NULL; ins = WT_SKIP_NEXT(ins)) { - WT_RET(__random_insert_valid(cbt, ins_head, ins, updp, validp)); + WT_RET(__random_insert_valid(cbt, ins_head, ins, validp)); if (*validp) return (0); } @@ -166,24 +161,23 @@ __random_leaf_skip( * Look for a large insert list from which we can select a random item. */ static int -__random_leaf_insert(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) +__random_leaf_insert(WT_CURSOR_BTREE *cbt, bool *validp) { WT_INSERT_HEAD *ins_head; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t entries, slot, start; - *updp = NULL; *validp = false; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* Check for a large insert list with no items, that's common when tables are newly created. 
*/ ins_head = WT_ROW_INSERT_SMALLEST(page); entries = __random_skip_entries(cbt, ins_head); if (entries >= WT_RANDOM_SKIP_INSERT_SMALLEST_ENOUGH) { - WT_RET(__random_leaf_skip(cbt, ins_head, entries, updp, validp)); + WT_RET(__random_leaf_skip(cbt, ins_head, entries, validp)); if (*validp) return (0); } @@ -199,7 +193,7 @@ __random_leaf_insert(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) ins_head = WT_ROW_INSERT(page, &page->pg_row[slot]); entries = __random_skip_entries(cbt, ins_head); if (entries >= WT_RANDOM_SKIP_INSERT_ENOUGH) { - WT_RET(__random_leaf_skip(cbt, ins_head, entries, updp, validp)); + WT_RET(__random_leaf_skip(cbt, ins_head, entries, validp)); if (*validp) return (0); } @@ -208,7 +202,7 @@ __random_leaf_insert(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) ins_head = WT_ROW_INSERT(page, &page->pg_row[slot]); entries = __random_skip_entries(cbt, ins_head); if (entries >= WT_RANDOM_SKIP_INSERT_ENOUGH) { - WT_RET(__random_leaf_skip(cbt, ins_head, entries, updp, validp)); + WT_RET(__random_leaf_skip(cbt, ins_head, entries, validp)); if (*validp) return (0); } @@ -219,7 +213,7 @@ __random_leaf_insert(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) ins_head = WT_ROW_INSERT_SMALLEST(page); entries = __random_skip_entries(cbt, ins_head); if (entries >= WT_RANDOM_SKIP_INSERT_ENOUGH) { - WT_RET(__random_leaf_skip(cbt, ins_head, entries, updp, validp)); + WT_RET(__random_leaf_skip(cbt, ins_head, entries, validp)); if (*validp) return (0); } @@ -234,25 +228,24 @@ __random_leaf_insert(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) * Return a random key/value from a page's on-disk entries. 
*/ static int -__random_leaf_disk(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) +__random_leaf_disk(WT_CURSOR_BTREE *cbt, bool *validp) { WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t entries, slot; int retry; - *updp = NULL; *validp = false; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); entries = cbt->ref->page->entries; /* This is a relatively cheap test, so try several times. */ for (retry = 0; retry < WT_RANDOM_DISK_RETRY; ++retry) { slot = __wt_random(&session->rnd) % entries; WT_RET(__wt_row_leaf_key(session, page, page->pg_row + slot, cbt->tmp, false)); - WT_RET(__random_slot_valid(cbt, slot, updp, validp)); + WT_RET(__random_slot_valid(cbt, slot, validp)); if (*validp) break; } @@ -274,12 +267,11 @@ __random_leaf(WT_CURSOR_BTREE *cbt) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - WT_UPDATE *upd; uint32_t i; bool next, valid; - cursor = (WT_CURSOR *)cbt; - session = (WT_SESSION_IMPL *)cbt->iface.session; + cursor = &cbt->iface; + session = CUR2S(cbt); /* * If the page has a sufficiently large number of disk-based entries, randomly select from them. @@ -287,24 +279,24 @@ __random_leaf(WT_CURSOR_BTREE *cbt) * a reasonable chunk of the name space. */ if (cbt->ref->page->entries > WT_RANDOM_DISK_ENOUGH) { - WT_RET(__random_leaf_disk(cbt, &upd, &valid)); + WT_RET(__random_leaf_disk(cbt, &valid)); if (valid) - return (__cursor_kv_return(cbt, upd)); + return (__cursor_kv_return(cbt, cbt->upd_value)); } /* Look for any large insert list and select from it. */ - WT_RET(__random_leaf_insert(cbt, &upd, &valid)); + WT_RET(__random_leaf_insert(cbt, &valid)); if (valid) - return (__cursor_kv_return(cbt, upd)); + return (__cursor_kv_return(cbt, cbt->upd_value)); /* * Try again if there are at least a few hundred disk-based entries: this may be a normal leaf * page with big items. 
*/ if (cbt->ref->page->entries > WT_RANDOM_DISK_ENOUGH / 2) { - WT_RET(__random_leaf_disk(cbt, &upd, &valid)); + WT_RET(__random_leaf_disk(cbt, &valid)); if (valid) - return (__cursor_kv_return(cbt, upd)); + return (__cursor_kv_return(cbt, cbt->upd_value)); } /* @@ -484,7 +476,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); read_flags = WT_READ_RESTART_OK; if (F_ISSET(cbt, WT_CBT_READ_ONCE)) diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 5c8c0ea871a..4d83914e1a3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -157,6 +157,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * evicting that page and deciding that is a sign that eviction is unstuck. */ page_flags = WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED; + FLD_SET(page_flags, WT_PAGE_INSTANTIATE_PREPARE_UPDATE); if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) FLD_SET(page_flags, WT_PAGE_EVICT_NO_PROGRESS); WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &notused)); diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c index 353f159f6bb..5f29cf08691 100644 --- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c +++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c @@ -76,13 +76,7 @@ __rebalance_leaf_append(WT_SESSION_IMPL *session, const uint8_t *key, size_t key WT_RET(__wt_calloc_one(session, &copy_addr)); copy->addr = copy_addr; - copy_addr->newest_start_durable_ts = unpack->newest_start_durable_ts; - copy_addr->oldest_start_ts = unpack->oldest_start_ts; - copy_addr->oldest_start_txn = unpack->oldest_start_txn; - copy_addr->newest_stop_durable_ts = unpack->newest_stop_durable_ts; - copy_addr->newest_stop_ts = unpack->newest_stop_ts; - 
copy_addr->newest_stop_txn = unpack->newest_stop_txn; - copy_addr->prepare = F_ISSET(unpack, WT_CELL_UNPACK_PREPARE); + __wt_time_aggregate_copy(&copy_addr->ta, &unpack->ta); WT_RET(__wt_memdup(session, unpack->data, unpack->size, &copy_addr->addr)); copy_addr->size = (uint8_t)unpack->size; copy_addr->type = unpack->type == WT_CELL_ADDR_LEAF ? WT_ADDR_LEAF : WT_ADDR_LEAF_NO; diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index 2061d561a7a..1a2360f6d09 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -23,7 +23,7 @@ __key_return(WT_CURSOR_BTREE *cbt) page = cbt->ref->page; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); if (page->type == WT_PAGE_ROW_LEAF) { rip = &page->pg_row[cbt->slot]; @@ -70,102 +70,71 @@ __key_return(WT_CURSOR_BTREE *cbt) } /* - * __time_pairs_init -- - * Initialize the time pairs to globally visible. + * __read_col_time_window -- + * Retrieve the time window from a column store cell. */ -static inline void -__time_pairs_init(WT_TIME_PAIR *start, WT_TIME_PAIR *stop) +static void +__read_col_time_window(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, WT_TIME_WINDOW *tw) { - start->txnid = WT_TXN_NONE; - start->timestamp = WT_TS_NONE; - stop->txnid = WT_TXN_MAX; - stop->timestamp = WT_TS_MAX; + WT_CELL_UNPACK unpack; + + __wt_cell_unpack(session, page, cell, &unpack); + __wt_time_window_copy(tw, &unpack.tw); } /* - * __time_pairs_set -- - * Set the time pairs. + * __wt_read_row_time_window -- + * Retrieve the time window from a row. 
*/ -static inline void -__time_pairs_set(WT_TIME_PAIR *start, WT_TIME_PAIR *stop, WT_CELL_UNPACK *unpack) +void +__wt_read_row_time_window(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_TIME_WINDOW *tw) { - start->timestamp = unpack->start_ts; - start->txnid = unpack->start_txn; - stop->timestamp = unpack->stop_ts; - stop->txnid = unpack->stop_txn; + WT_CELL_UNPACK unpack; + + __wt_time_window_init(tw); + /* + * If a value is simple and is globally visible at the time of reading a page into cache, we set + * the time pairs as globally visible. + */ + if (__wt_row_leaf_value_exists(rip)) + return; + + __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); + __wt_time_window_copy(tw, &unpack.tw); } /* - * __wt_read_cell_time_pairs -- + * __wt_read_cell_time_window -- * Read the time pairs from the cell. */ void -__wt_read_cell_time_pairs( - WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_TIME_PAIR *start, WT_TIME_PAIR *stop) +__wt_read_cell_time_window(WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_TIME_WINDOW *tw) { WT_PAGE *page; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = ref->page; - WT_ASSERT(session, start != NULL && stop != NULL); + WT_ASSERT(session, tw != NULL); /* Take the value from the original page cell. */ if (page->type == WT_PAGE_ROW_LEAF) { - __wt_read_row_time_pairs(session, page, &page->pg_row[cbt->slot], start, stop); + __wt_read_row_time_window(session, page, &page->pg_row[cbt->slot], tw); } else if (page->type == WT_PAGE_COL_VAR) { - __wt_read_col_time_pairs( - session, page, WT_COL_PTR(page, &page->pg_var[cbt->slot]), start, stop); + __read_col_time_window(session, page, WT_COL_PTR(page, &page->pg_var[cbt->slot]), tw); } else { /* WT_PAGE_COL_FIX: return the default time pairs. */ - __time_pairs_init(start, stop); + __wt_time_window_init(tw); } } /* - * __wt_read_col_time_pairs -- - * Retrieve the time pairs from a column store cell. 
- */ -void -__wt_read_col_time_pairs( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, WT_TIME_PAIR *start, WT_TIME_PAIR *stop) -{ - WT_CELL_UNPACK unpack; - - __wt_cell_unpack(session, page, cell, &unpack); - __time_pairs_set(start, stop, &unpack); -} - -/* - * __wt_read_row_time_pairs -- - * Retrieve the time pairs from a row. - */ -void -__wt_read_row_time_pairs( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_TIME_PAIR *start, WT_TIME_PAIR *stop) -{ - WT_CELL_UNPACK unpack; - - __time_pairs_init(start, stop); - /* - * If a value is simple and is globally visible at the time of reading a page into cache, we set - * the time pairs as globally visible. - */ - if (__wt_row_leaf_value_exists(rip)) - return; - - __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); - __time_pairs_set(start, stop, &unpack); -} - -/* * __wt_value_return_buf -- * Change a buffer to reference an internal original-page return value. */ int -__wt_value_return_buf( - WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_ITEM *buf, WT_TIME_PAIR *start, WT_TIME_PAIR *stop) +__wt_value_return_buf(WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_ITEM *buf, WT_TIME_WINDOW *tw) { WT_BTREE *btree; WT_CELL *cell; @@ -176,18 +145,12 @@ __wt_value_return_buf( WT_SESSION_IMPL *session; uint8_t v; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); btree = S2BT(session); page = ref->page; cursor = &cbt->iface; - if (start != NULL && stop != NULL) - __time_pairs_init(start, stop); - - /* Must provide either both start and stop as output parameters or neither. */ - WT_ASSERT(session, (start != NULL && stop != NULL) || (start == NULL && stop == NULL)); - if (page->type == WT_PAGE_ROW_LEAF) { rip = &page->pg_row[cbt->slot]; @@ -195,14 +158,16 @@ __wt_value_return_buf( * If a value is simple and is globally visible at the time of reading a page into cache, we * encode its location into the WT_ROW. 
*/ - if (__wt_row_leaf_value(page, rip, buf)) + if (__wt_row_leaf_value(page, rip, buf)) { + if (tw != NULL) + __wt_time_window_init(tw); return (0); + } /* Take the value from the original page cell. */ __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); - if (start != NULL && stop != NULL) - __time_pairs_set(start, stop, &unpack); - + if (tw != NULL) + __wt_time_window_copy(tw, &unpack.tw); return (__wt_page_cell_data_ref(session, page, &unpack, buf)); } @@ -210,17 +175,18 @@ __wt_value_return_buf( /* Take the value from the original page cell. */ cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]); __wt_cell_unpack(session, page, cell, &unpack); - if (start != NULL && stop != NULL) - __time_pairs_set(start, stop, &unpack); - + if (tw != NULL) + __wt_time_window_copy(tw, &unpack.tw); return (__wt_page_cell_data_ref(session, page, &unpack, buf)); } /* * WT_PAGE_COL_FIX: Take the value from the original page. * - * FIXME-PM-1523: Should also check visibility here + * FIXME-WT-6126: Should also check visibility here */ + if (tw != NULL) + __wt_time_window_init(tw); v = __bit_getv_recno(ref, cursor->recno, btree->bitcnt); return (__wt_buf_set(session, buf, &v, 1)); } @@ -232,95 +198,7 @@ __wt_value_return_buf( static inline int __value_return(WT_CURSOR_BTREE *cbt) { - return (__wt_value_return_buf(cbt, cbt->ref, &cbt->iface.value, NULL, NULL)); -} - -/* - * __wt_value_return_upd -- - * Change the cursor to reference an internal update structure return value. - */ -int -__wt_value_return_upd(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) -{ - WT_CURSOR *cursor; - WT_DECL_RET; - WT_MODIFY_VECTOR modifies; - WT_SESSION_IMPL *session; - WT_TIME_PAIR start, stop; - - cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; - __wt_modify_vector_init(session, &modifies); - - /* - * We're passed a "standard" or "modified" update that's visible to us. 
Our caller should have - * already checked for deleted items (we're too far down the call stack to return not-found). - * - * Fast path if it's a standard item, assert our caller's behavior. - */ - if (upd->type == WT_UPDATE_STANDARD) { - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) { - /* Copy an external update, and delete after using it */ - WT_RET(__wt_buf_set(session, &cursor->value, upd->data, upd->size)); - __wt_free_update_list(session, &upd); - } else { - cursor->value.data = upd->data; - cursor->value.size = upd->size; - } - return (0); - } - WT_ASSERT(session, upd->type == WT_UPDATE_MODIFY); - - /* - * Find a complete update. - */ - for (; upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - if (WT_UPDATE_DATA_VALUE(upd)) - break; - - if (upd->type == WT_UPDATE_MODIFY) - WT_ERR(__wt_modify_vector_push(&modifies, upd)); - } - - /* - * If there's no full update, the base item is the on-page item. If the update is a tombstone, - * the base item is an empty item. - */ - if (upd == NULL) { - /* - * Callers of this function set the cursor slot to an impossible value to check we don't try - * and return on-page values when the update list should have been sufficient (which - * happens, for example, if an update list was truncated, deleting some standard update - * required by a previous modify update). Assert the case. - */ - WT_ASSERT(session, cbt->slot != UINT32_MAX); - - WT_ERR(__wt_value_return_buf(cbt, cbt->ref, &cbt->iface.value, &start, &stop)); - /* - * Applying modifies on top of a tombstone is invalid. So if we're using the onpage value, - * the stop time pair should be unset. - */ - WT_ASSERT(session, stop.txnid == WT_TXN_MAX && stop.timestamp == WT_TS_MAX); - } else { - /* The base update must not be a tombstone. 
*/ - WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD); - WT_ERR(__wt_buf_set(session, &cursor->value, upd->data, upd->size)); - } - - /* - * Once we have a base item, roll forward through any visible modify updates. - */ - while (modifies.size > 0) { - __wt_modify_vector_pop(&modifies, &upd); - WT_ERR(__wt_modify_apply(cursor, upd->data)); - } - -err: - __wt_modify_vector_free(&modifies); - return (ret); + return (__wt_value_return_buf(cbt, cbt->ref, &cbt->iface.value, NULL)); } /* @@ -352,20 +230,37 @@ __wt_key_return(WT_CURSOR_BTREE *cbt) /* * __wt_value_return -- - * Change the cursor to reference an internal return value. + * Change the cursor to reference an update return value. */ int -__wt_value_return(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +__wt_value_return(WT_CURSOR_BTREE *cbt, WT_UPDATE_VALUE *upd_value) { WT_CURSOR *cursor; + WT_SESSION_IMPL *session; cursor = &cbt->iface; + session = CUR2S(cbt); F_CLR(cursor, WT_CURSTD_VALUE_EXT); - if (upd == NULL) + if (upd_value->type == WT_UPDATE_INVALID) { + /* + * FIXME-WT-6127: This is a holdover from the pre-durable history read logic where we used + * to fallback to the on-page value if we didn't find a visible update elsewhere. This is + * still required for fixed length column store as we have issues with this table type in + * durable history which we're planning to address in PM-1814. + */ + WT_ASSERT(session, cbt->btree->type == BTREE_COL_FIX); WT_RET(__value_return(cbt)); - else - WT_RET(__wt_value_return_upd(cbt, upd)); + } else { + /* + * We're passed a "standard" update that's visible to us. Our caller should have already + * checked for deleted items (we're too far down the call stack to return not-found) and any + * modify updates should be have been reconstructed into a full standard update. 
+ */ + WT_ASSERT(session, upd_value->type == WT_UPDATE_STANDARD); + cursor->value.data = upd_value->buf.data; + cursor->value.size = upd_value->buf.size; + } F_SET(cursor, WT_CURSTD_VALUE_INT); return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index a42e11e1d8f..344c6a573d7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -186,12 +186,7 @@ __slvg_checkpoint(WT_SESSION_IMPL *session, WT_REF *root) __wt_seconds(session, &ckptbase->sec); WT_ERR(__wt_metadata_search(session, dhandle->name, &config)); WT_ERR(__wt_meta_block_metadata(session, config, ckptbase)); - ckptbase->start_durable_ts = WT_TS_NONE; - ckptbase->oldest_start_ts = WT_TS_NONE; - ckptbase->oldest_start_txn = WT_TXN_NONE; - ckptbase->stop_durable_ts = WT_TS_NONE; - ckptbase->newest_stop_ts = WT_TS_MAX; - ckptbase->newest_stop_txn = WT_TXN_MAX; + __wt_time_aggregate_init(&ckptbase->ta); ckptbase->write_gen = btree->write_gen; F_SET(ckptbase, WT_CKPT_ADD); @@ -917,7 +912,7 @@ __slvg_col_range_overlap(WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_s */ /* Case #2/8, #10, #11 */ if (a_trk->col_start > b_trk->col_start) - WT_PANIC_RET(session, EINVAL, "unexpected merge array sort order"); + WT_RET_PANIC(session, EINVAL, "unexpected merge array sort order"); if (a_trk->col_start == b_trk->col_start) { /* Case #1, #4 and #9 */ /* @@ -1174,12 +1169,7 @@ __slvg_col_build_internal(WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF * regardless of a value's timestamps or transaction IDs. 
*/ WT_ERR(__wt_calloc_one(session, &addr)); - addr->newest_start_durable_ts = addr->newest_stop_durable_ts = addr->oldest_start_ts = - WT_TS_NONE; - addr->oldest_start_txn = WT_TXN_NONE; - addr->newest_stop_ts = WT_TS_MAX; - addr->newest_stop_txn = WT_TXN_MAX; - addr->prepare = false; + __wt_time_aggregate_init(&addr->ta); WT_ERR(__wt_memdup(session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); addr->size = trk->trk_addr_size; addr->type = trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF; @@ -1323,7 +1313,7 @@ __slvg_col_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK * return (__slvg_ovfl_ref(session, ovfl, false)); } - WT_PANIC_RET(session, EINVAL, "overflow record at column-store page merge not found"); + WT_RET_PANIC(session, EINVAL, "overflow record at column-store page merge not found"); } /* @@ -1512,7 +1502,7 @@ __slvg_row_range_overlap(WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_s WT_RET(__wt_compare(session, btree->collator, A_TRK_STOP, B_TRK_STOP, &stop_cmp)); if (start_cmp > 0) /* Case #2/8, #10, #11 */ - WT_PANIC_RET(session, EINVAL, "unexpected merge array sort order"); + WT_RET_PANIC(session, EINVAL, "unexpected merge array sort order"); if (start_cmp == 0) { /* Case #1, #4, #9 */ /* @@ -1782,12 +1772,7 @@ __slvg_row_build_internal(WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF * regardless of a value's timestamps or transaction IDs. */ WT_ERR(__wt_calloc_one(session, &addr)); - addr->newest_start_durable_ts = addr->newest_stop_durable_ts = addr->oldest_start_ts = - WT_TS_NONE; - addr->oldest_start_txn = WT_TXN_NONE; - addr->newest_stop_ts = WT_TS_MAX; - addr->newest_stop_txn = WT_TXN_MAX; - addr->prepare = false; + __wt_time_aggregate_init(&addr->ta); WT_ERR(__wt_memdup(session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); addr->size = trk->trk_addr_size; addr->type = trk->trk_ovfl_cnt == 0 ? 
WT_ADDR_LEAF_NO : WT_ADDR_LEAF; @@ -1992,7 +1977,7 @@ __slvg_row_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK * return (__slvg_ovfl_ref(session, ovfl, true)); } - WT_PANIC_RET(session, EINVAL, "overflow record at row-store page merge not found"); + WT_RET_PANIC(session, EINVAL, "overflow record at row-store page merge not found"); } /* @@ -2270,7 +2255,7 @@ __slvg_ovfl_ref(WT_SESSION_IMPL *session, WT_TRACK *trk, bool multi_panic) if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) { if (!multi_panic) return (__wt_set_return(session, EBUSY)); - WT_PANIC_RET(session, EINVAL, + WT_RET_PANIC(session, EINVAL, "overflow record unexpectedly referenced multiple times " "during leaf page merge"); } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index a2d85f79db8..2a016d6d725 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -147,7 +147,7 @@ __split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) err: /* Something really bad just happened. 
*/ - WT_PANIC_RET(session, ret, "fatal error during page split"); + WT_RET_PANIC(session, ret, "fatal error during page split"); } #endif @@ -249,13 +249,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, WT_REF **from_ref if (ref_addr != NULL && !__wt_off_page(from_home, ref_addr)) { __wt_cell_unpack(session, from_home, (WT_CELL *)ref_addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); - addr->oldest_start_ts = unpack.oldest_start_ts; - addr->oldest_start_txn = unpack.oldest_start_txn; - addr->newest_start_durable_ts = unpack.newest_start_durable_ts; - addr->newest_stop_ts = unpack.newest_stop_ts; - addr->newest_stop_txn = unpack.newest_stop_txn; - addr->newest_stop_durable_ts = unpack.newest_stop_durable_ts; - addr->prepare = F_ISSET(&unpack, WT_CELL_UNPACK_PREPARE); + __wt_time_aggregate_copy(&addr->ta, &unpack.ta); WT_ERR(__wt_memdup(session, unpack.data, unpack.size, &addr->addr)); addr->size = (uint8_t)unpack.size; switch (unpack.raw) { @@ -574,17 +568,17 @@ err: case WT_ERR_RETURN: __wt_free_ref_index(session, root, alloc_index, true); break; - case WT_ERR_PANIC: - __wt_err(session, ret, "fatal error during root page split to deepen the tree"); - ret = WT_PANIC; - break; case WT_ERR_IGNORE: - if (ret != 0 && ret != WT_PANIC) { - __wt_err(session, ret, - "ignoring not-fatal error during root page split " - "to deepen the tree"); + if (ret != WT_PANIC) { + if (ret != 0) + __wt_err(session, ret, + "ignoring not-fatal error during root page split to deepen the tree"); ret = 0; + break; } + /* FALLTHROUGH */ + case WT_ERR_PANIC: + ret = __wt_panic(session, ret, "fatal error during root page split to deepen the tree"); break; } return (ret); @@ -877,17 +871,16 @@ err: if (empty_parent) ret = __wt_set_return(session, EBUSY); break; - case WT_ERR_PANIC: - __wt_err(session, ret, "fatal error during parent page split"); - ret = WT_PANIC; - break; case WT_ERR_IGNORE: - if (ret != 0 && ret != WT_PANIC) { - __wt_err(session, ret, - "ignoring 
not-fatal error during parent page " - "split"); + if (ret != WT_PANIC) { + if (ret != 0) + __wt_err(session, ret, "ignoring not-fatal error during parent page split"); ret = 0; + break; } + /* FALLTHROUGH */ + case WT_ERR_PANIC: + ret = __wt_panic(session, ret, "fatal error during parent page split"); break; } __wt_scr_free(session, &scr); @@ -1154,17 +1147,16 @@ err: } __wt_free_ref_index(session, page, alloc_index, true); break; - case WT_ERR_PANIC: - __wt_err(session, ret, "fatal error during internal page split"); - ret = WT_PANIC; - break; case WT_ERR_IGNORE: - if (ret != 0 && ret != WT_PANIC) { - __wt_err(session, ret, - "ignoring not-fatal error during internal page " - "split"); + if (ret != WT_PANIC) { + if (ret != 0) + __wt_err(session, ret, "ignoring not-fatal error during internal page split"); ret = 0; + break; } + /* FALLTHROUGH */ + case WT_ERR_PANIC: + ret = __wt_panic(session, ret, "fatal error during internal page split"); break; } return (ret); @@ -1391,7 +1383,7 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT WT_SAVE_UPD *supd; WT_UPDATE *prev_onpage, *upd; uint64_t recno; - uint32_t i, slot; + uint32_t i, page_flags, slot; /* * In 04/2016, we removed column-store record numbers from the WT_PAGE structure, leading to @@ -1413,7 +1405,8 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT * our caller will not discard the disk image when discarding the original page, and our caller * will discard the allocated page on error, when discarding the allocated WT_REF. 
*/ - WT_RET(__wt_page_inmem(session, ref, multi->disk_image, WT_PAGE_DISK_ALLOC, &page)); + page_flags = WT_PAGE_DISK_ALLOC | WT_PAGE_INSTANTIATE_PREPARE_UPDATE; + WT_RET(__wt_page_inmem(session, ref, multi->disk_image, page_flags, &page)); multi->disk_image = NULL; /* @@ -1704,13 +1697,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_R if (multi->addr.addr != NULL) { WT_RET(__wt_calloc_one(session, &addr)); ref->addr = addr; - addr->oldest_start_ts = multi->addr.oldest_start_ts; - addr->oldest_start_txn = multi->addr.oldest_start_txn; - addr->newest_start_durable_ts = multi->addr.newest_start_durable_ts; - addr->newest_stop_ts = multi->addr.newest_stop_ts; - addr->newest_stop_txn = multi->addr.newest_stop_txn; - addr->newest_stop_durable_ts = multi->addr.newest_stop_durable_ts; - addr->prepare = multi->addr.prepare; + __wt_time_aggregate_copy(&addr->ta, &multi->addr.ta); WT_RET(__wt_memdup(session, multi->addr.addr, multi->addr.size, &addr->addr)); addr->size = multi->addr.size; addr->type = multi->addr.type; diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 851a407f165..fd36f6b24f9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -220,8 +220,8 @@ __sync_ref_obsolete_check(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF_LIST *rl if (previous_state == WT_REF_DISK) { /* There should be an address, but simply skip any page where we don't find one. 
*/ if (__wt_ref_addr_copy(session, ref, &addr)) { - newest_stop_ts = addr.newest_stop_ts; - newest_stop_txn = addr.newest_stop_txn; + newest_stop_ts = addr.ta.newest_stop_ts; + newest_stop_txn = addr.ta.newest_stop_txn; obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); } @@ -274,21 +274,21 @@ __sync_ref_obsolete_check(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF_LIST *rl /* Calculate the max stop time pair by traversing all multi addresses. */ for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { - newest_stop_txn = WT_MAX(newest_stop_txn, multi->addr.newest_stop_txn); - newest_stop_ts = WT_MAX(newest_stop_ts, multi->addr.newest_stop_ts); + newest_stop_txn = WT_MAX(newest_stop_txn, multi->addr.ta.newest_stop_txn); + newest_stop_ts = WT_MAX(newest_stop_ts, multi->addr.ta.newest_stop_ts); } obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); } else if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) { tag = "reconciled replacement block"; - newest_stop_txn = mod->mod_replace.newest_stop_txn; - newest_stop_ts = mod->mod_replace.newest_stop_ts; + newest_stop_txn = mod->mod_replace.ta.newest_stop_txn; + newest_stop_ts = mod->mod_replace.ta.newest_stop_ts; obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); } else if (__wt_ref_addr_copy(session, ref, &addr)) { tag = "WT_REF address"; - newest_stop_txn = addr.newest_stop_txn; - newest_stop_ts = addr.newest_stop_ts; + newest_stop_txn = addr.ta.newest_stop_txn; + newest_stop_ts = addr.ta.newest_stop_ts; obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); } else tag = "unexpected page state"; @@ -469,12 +469,7 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) btree->syncing = WT_BTREE_SYNC_RUNNING; is_hs = WT_IS_HS(btree); - /* - * Add in history store reconciliation for standard files. 
- * - * FIXME-PM-1521: Remove the history store check, and assert that no updates from the - * history store are copied to the history store recursively. - */ + /* Add in history store reconciliation for standard files. */ rec_flags = WT_REC_CHECKPOINT; if (!is_hs && !WT_IS_METADATA(btree->dhandle)) rec_flags |= WT_REC_HS; diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index c9708e9511b..0b3d4da2459 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -255,17 +255,13 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) * Create a fake, unpacked parent cell for the tree based on the checkpoint information. */ memset(&addr_unpack, 0, sizeof(addr_unpack)); - addr_unpack.newest_start_durable_ts = ckpt->start_durable_ts; - addr_unpack.newest_stop_durable_ts = ckpt->stop_durable_ts; - addr_unpack.oldest_start_ts = ckpt->oldest_start_ts; - addr_unpack.newest_stop_ts = ckpt->newest_stop_ts; - if (ckpt->write_gen > S2C(session)->base_write_gen) { - addr_unpack.oldest_start_txn = ckpt->oldest_start_txn; - addr_unpack.newest_stop_txn = ckpt->newest_stop_txn; - } else { - addr_unpack.oldest_start_txn = WT_TXN_NONE; - addr_unpack.newest_stop_txn = WT_TXN_MAX; + __wt_time_aggregate_copy(&addr_unpack.ta, &ckpt->ta); + if (ckpt->write_gen <= S2C(session)->base_write_gen) { + addr_unpack.ta.oldest_start_txn = WT_TXN_NONE; + addr_unpack.ta.newest_stop_txn = WT_TXN_MAX; } + if (ckpt->ta.prepare) + F_SET(&addr_unpack, WT_CELL_UNPACK_PREPARE); addr_unpack.raw = WT_CELL_ADDR_INT; /* Verify the tree. 
*/ @@ -367,15 +363,14 @@ __verify_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf) WT_ADDR_COPY addr; WT_DECL_ITEM(tmp); WT_DECL_RET; - char tp_string[2][WT_TP_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; WT_ERR(__wt_scr_alloc(session, 0, &tmp)); if (__wt_ref_addr_copy(session, ref, &addr)) { - WT_ERR(__wt_buf_fmt(session, buf, "%s %s,%s", - __wt_addr_string(session, addr.addr, addr.size, tmp), - __wt_time_pair_to_string(addr.oldest_start_ts, addr.oldest_start_txn, tp_string[0]), - __wt_time_pair_to_string(addr.newest_stop_ts, addr.newest_stop_txn, tp_string[1]))); + WT_ERR( + __wt_buf_fmt(session, buf, "%s %s", __wt_addr_string(session, addr.addr, addr.size, tmp), + __wt_time_aggregate_to_string(&addr.ta, time_string))); } else WT_ERR(__wt_buf_fmt(session, buf, "%s -/-,-/-", __wt_addr_string(session, NULL, 0, tmp))); @@ -391,28 +386,41 @@ err: static int __verify_addr_ts(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *unpack, WT_VSTUFF *vs) { - char ts_string[2][WT_TS_INT_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; - if (unpack->oldest_start_ts != WT_TS_NONE && unpack->newest_stop_ts == WT_TS_NONE) + if (unpack->ta.oldest_start_ts != WT_TS_NONE && unpack->ta.newest_stop_ts == WT_TS_NONE) WT_RET_MSG(session, WT_ERROR, "internal page reference at %s has a newest stop " "timestamp of 0", __verify_addr_string(session, ref, vs->tmp1)); - if (unpack->oldest_start_ts > unpack->newest_stop_ts) + if (unpack->ta.oldest_start_ts > unpack->ta.newest_stop_ts) WT_RET_MSG(session, WT_ERROR, "internal page reference at %s has an oldest start " - "timestamp %s newer than its newest stop timestamp %s", + "timestamp newer than its newest stop timestamp; time window %s", __verify_addr_string(session, ref, vs->tmp1), - __wt_timestamp_to_string(unpack->oldest_start_ts, ts_string[0]), - __wt_timestamp_to_string(unpack->newest_stop_ts, ts_string[1])); - if (unpack->oldest_start_txn > unpack->newest_stop_txn) + 
__wt_time_window_to_string(&unpack->tw, time_string)); + if (unpack->ta.oldest_start_txn > unpack->ta.newest_stop_txn) WT_RET_MSG(session, WT_ERROR, "internal page reference at %s has an oldest start " - "transaction (%" PRIu64 - ") newer than its newest stop " - "transaction (%" PRIu64 ")", - __verify_addr_string(session, ref, vs->tmp1), unpack->oldest_start_txn, - unpack->newest_stop_txn); + "transaction newer than its newest stop " + "transaction; time aggregate %s", + __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.oldest_start_ts > unpack->ta.newest_start_durable_ts) + WT_RET_MSG(session, WT_ERROR, + "internal page reference at %s has an oldest start " + "timestamp newer than its newest start durable " + "timestamp; time aggregate %s", + __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.newest_stop_ts != WT_TS_MAX && + unpack->ta.newest_stop_ts > unpack->ta.newest_stop_durable_ts) + WT_RET_MSG(session, WT_ERROR, + "internal page reference at %s has a newest stop " + "timestamp newer than its newest stop durable " + "timestamp; time aggregate %s", + __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack->ta, time_string)); return (0); } @@ -781,7 +789,7 @@ __verify_ts_stable_cmp(WT_SESSION_IMPL *session, WT_ITEM *key, WT_REF *ref, uint { WT_BTREE *btree; WT_DECL_RET; - char tp_string[2][WT_TP_STRING_SIZE]; + char tp_string[2][WT_TS_INT_STRING_SIZE]; bool start; btree = S2BT(session); @@ -949,7 +957,7 @@ __verify_page_content( uint64_t recno, rle; uint32_t cell_num; uint8_t *p; - char ts_string[2][WT_TS_INT_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; bool found_ovfl; btree = S2BT(session); @@ -992,108 +1000,126 @@ __verify_page_content( case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: - if (unpack.oldest_start_ts != WT_TS_NONE && unpack.newest_stop_ts 
== WT_TS_NONE) + if (unpack.ta.oldest_start_ts != WT_TS_NONE && unpack.ta.newest_stop_ts == WT_TS_NONE) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has a " - "newest stop timestamp of 0", - cell_num - 1, __verify_addr_string(session, ref, vs->tmp1)); - if (unpack.oldest_start_ts > unpack.newest_stop_ts) + "newest stop timestamp of 0; time window %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack.ta, time_string)); + if (unpack.ta.oldest_start_ts > unpack.ta.newest_stop_ts) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has an " - "oldest start timestamp %s newer than " - "its newest stop timestamp %s", + "oldest start timestamp newer than " + "its newest stop timestamp; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_timestamp_to_string(unpack.oldest_start_ts, ts_string[0]), - __wt_timestamp_to_string(unpack.newest_stop_ts, ts_string[1])); - if (unpack.oldest_start_txn > unpack.newest_stop_txn) { + __wt_time_aggregate_to_string(&unpack.ta, time_string)); + if (unpack.ta.oldest_start_txn > unpack.ta.newest_stop_txn) { WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has an " - "oldest start transaction (%" PRIu64 - ") " - "newer than its newest stop transaction " - "(%" PRIu64 ")", + " on page " + "at %s has an oldest start transaction newer than " + "its newest stop transaction; time aggregate %s ", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - unpack.oldest_start_txn, unpack.newest_stop_txn); + __wt_time_aggregate_to_string(&unpack.ta, time_string)); } + if (unpack.ta.oldest_start_ts > unpack.ta.newest_start_durable_ts) + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 + " on page at %s has an " + "oldest start timestamp newer than " + "its newest start durable timestamp; time aggregate %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack.ta, time_string)); 
+ if (unpack.ta.newest_stop_ts != WT_TS_MAX && + unpack.ta.newest_stop_ts > unpack.ta.newest_stop_durable_ts) + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 + " on page at %s has a " + "newest stop timestamp newer than " + "its newest stop durable timestamp; time aggregate %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack.ta, time_string)); - /* - * FIXME-prepare-support: Enable verification once all durable is finished. - * - * WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start durable", - * unpack.newest_start_durable_ts, "start durable", - * addr_unpack->newest_start_durable_ts, false, vs)); - */ + if (addr_unpack->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start durable", + unpack.ta.newest_start_durable_ts, "start durable", + addr_unpack->ta.newest_start_durable_ts, false, vs)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "oldest start", - unpack.oldest_start_ts, "oldest start", addr_unpack->oldest_start_ts, true, vs)); - WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "oldest start", - unpack.oldest_start_txn, "oldest start", addr_unpack->oldest_start_txn, true, dsk, + unpack.ta.oldest_start_ts, "oldest start", addr_unpack->ta.oldest_start_ts, true, vs)); + WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "oldest start", + unpack.ta.oldest_start_txn, "oldest start", addr_unpack->ta.oldest_start_txn, true, + dsk, vs)); - /* - * FIXME-prepare-support: Enable verification once all durable is finished. 
- * - * WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop durable", - * unpack.newest_stop_durable_ts, "stop durable", addr_unpack->newest_stop_durable_ts, - * false, vs)); - */ + if (addr_unpack->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop durable", + unpack.ta.newest_stop_durable_ts, "stop durable", + addr_unpack->ta.newest_stop_durable_ts, false, vs)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "newest stop", - unpack.newest_stop_ts, "newest stop", addr_unpack->newest_stop_ts, false, vs)); + unpack.ta.newest_stop_ts, "newest stop", addr_unpack->ta.newest_stop_ts, false, vs)); WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "newest stop", - unpack.newest_stop_txn, "newest stop", addr_unpack->newest_stop_txn, false, dsk, vs)); - WT_RET(__verify_ts_stable_cmp( - session, NULL, ref, cell_num - 1, addr_unpack->start_ts, addr_unpack->stop_ts, vs)); + unpack.ta.newest_stop_txn, "newest stop", addr_unpack->ta.newest_stop_txn, false, dsk, + vs)); + WT_RET(__verify_ts_stable_cmp(session, NULL, ref, cell_num - 1, + addr_unpack->ta.oldest_start_ts, addr_unpack->ta.newest_stop_ts, vs)); break; case WT_CELL_DEL: case WT_CELL_VALUE: case WT_CELL_VALUE_COPY: case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_SHORT: - if (unpack.start_ts != WT_TS_NONE && unpack.stop_ts == WT_TS_NONE) + if (unpack.tw.start_ts != WT_TS_NONE && unpack.tw.stop_ts == WT_TS_NONE) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has a stop " - "timestamp of 0", - cell_num - 1, __verify_addr_string(session, ref, vs->tmp1)); - if (unpack.start_ts > unpack.stop_ts) + "timestamp of 0; time window %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_window_to_string(&unpack.tw, time_string)); + if (unpack.tw.start_ts > unpack.tw.stop_ts) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has a " - "start timestamp %s newer than its stop " - "timestamp %s", + "start 
timestamp newer than its stop " + "timestamp; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_timestamp_to_string(unpack.start_ts, ts_string[0]), - __wt_timestamp_to_string(unpack.stop_ts, ts_string[1])); - if (unpack.start_txn > unpack.stop_txn) + __wt_time_window_to_string(&unpack.tw, time_string)); + if (unpack.tw.start_txn > unpack.tw.stop_txn) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has a " - "start transaction %" PRIu64 - "newer than " - "its stop transaction %" PRIu64, - cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), unpack.start_txn, - unpack.stop_txn); - - /* - * FIXME-prepare-support: Enable verification once all durable is finished. - * - * WT_RET( - * __verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.durable_start_ts, - * "durable start", addr_unpack->newest_start_durable_ts, true, vs)); - */ - WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.start_ts, - "oldest start", addr_unpack->oldest_start_ts, true, vs)); - WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "start", unpack.start_txn, - "oldest start", addr_unpack->oldest_start_txn, true, dsk, vs)); - /* - * FIXME-prepare-support: Enable verification once all durable is finished. 
- * - * WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", - * unpack.durable_stop_ts, - * "durable stop", addr_unpack->newest_stop_durable_ts, true, vs)); - */ - WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop", unpack.stop_ts, - "newest stop", addr_unpack->newest_stop_ts, false, vs)); - WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "stop", unpack.stop_txn, - "newest stop", addr_unpack->newest_stop_txn, false, dsk, vs)); + "start transaction newer than " + "its stop transaction; time window %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_window_to_string(&unpack.tw, time_string)); + if (unpack.tw.start_ts > unpack.tw.durable_start_ts) + WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 + " on page at %s has a " + "start timestamp newer than its start durable " + "timestamp; time window %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_window_to_string(&unpack.tw, time_string)); + if (unpack.tw.stop_ts != WT_TS_MAX && unpack.tw.stop_ts > unpack.tw.durable_stop_ts) + WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 + " on page at %s has a " + "stop timestamp newer than its stop durable " + "timestamp; time window %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_window_to_string(&unpack.tw, time_string)); + + if (addr_unpack->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start durable", + unpack.tw.durable_start_ts, "newest durable start", + addr_unpack->ta.newest_start_durable_ts, false, vs)); + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.tw.start_ts, + "oldest start", addr_unpack->ta.oldest_start_ts, true, vs)); + WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "start", unpack.tw.start_txn, + "oldest start", addr_unpack->ta.oldest_start_txn, true, dsk, vs)); + if (addr_unpack->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_ts_addr_cmp(session, 
ref, cell_num - 1, "stop durable", + unpack.tw.durable_stop_ts, "newest durable stop", + addr_unpack->ta.newest_stop_durable_ts, false, vs)); + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop", unpack.tw.stop_ts, + "newest stop", addr_unpack->ta.newest_stop_ts, false, vs)); + WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "stop", unpack.tw.stop_txn, + "newest stop", addr_unpack->ta.newest_stop_txn, false, dsk, vs)); WT_RET(__verify_ts_stable_cmp( - session, NULL, ref, cell_num - 1, unpack.start_ts, unpack.stop_ts, vs)); + session, NULL, ref, cell_num - 1, unpack.tw.start_ts, unpack.tw.stop_ts, vs)); break; } @@ -1106,7 +1132,7 @@ __verify_page_content( continue; WT_RET(__wt_row_leaf_key(session, page, rip++, vs->tmp1, false)); - WT_RET(__verify_key_hs(session, vs->tmp1, unpack.start_ts, vs)); + WT_RET(__verify_key_hs(session, vs->tmp1, unpack.tw.start_ts, vs)); #ifdef HAVE_DIAGNOSTIC if (vs->dump_history) @@ -1117,7 +1143,7 @@ __verify_page_content( p = vs->tmp1->mem; WT_RET(__wt_vpack_uint(&p, 0, recno)); vs->tmp1->size = WT_PTRDIFF(p, vs->tmp1->mem); - WT_RET(__verify_key_hs(session, vs->tmp1, unpack.start_ts, vs)); + WT_RET(__verify_key_hs(session, vs->tmp1, unpack.tw.start_ts, vs)); #ifdef HAVE_DIAGNOSTIC if (vs->dump_history) diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c index 7b80327a22c..a1e96d41dc9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -268,7 +268,7 @@ static int __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t cell_num, WT_ADDR *addr, const char *tag, const WT_PAGE_HEADER *dsk) { - char ts_string[2][WT_TS_INT_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; /* * Check timestamp and transaction order, and optionally against parent values. 
Timestamps and @@ -284,43 +284,57 @@ __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: - if (unpack->oldest_start_ts != WT_TS_NONE && unpack->newest_stop_ts == WT_TS_NONE) + if (unpack->ta.oldest_start_ts != WT_TS_NONE && unpack->ta.newest_stop_ts == WT_TS_NONE) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has a newest stop " - "timestamp of 0", - cell_num - 1, tag); - if (unpack->oldest_start_ts > unpack->newest_stop_ts) + "timestamp of 0; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.oldest_start_ts > unpack->ta.newest_stop_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has an oldest " + "start timestamp newer than its newest stop " + "timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.oldest_start_txn > unpack->ta.newest_stop_txn) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has an oldest " - "start timestamp %s newer than its newest stop " - "timestamp %s", - cell_num - 1, tag, __wt_timestamp_to_string(unpack->oldest_start_ts, ts_string[0]), - __wt_timestamp_to_string(unpack->newest_stop_ts, ts_string[1])); - if (unpack->oldest_start_txn > unpack->newest_stop_txn) + "start transaction newer than its " + "newest stop transaction; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.oldest_start_ts > unpack->ta.newest_start_durable_ts) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has an oldest " - "start transaction %" PRIu64 - " newer than its " - "newest stop transaction %" PRIu64, - cell_num - 1, tag, unpack->oldest_start_txn, unpack->newest_stop_txn); + "start timestamp newer than its newest start durable " + "timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, 
time_string)); + if (unpack->ta.newest_stop_ts != WT_TS_MAX && + unpack->ta.newest_stop_ts > unpack->ta.newest_stop_durable_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a newest " + "stop timestamp newer than its newest stop durable " + "timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); if (addr == NULL) break; - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", - unpack->newest_start_durable_ts, "start durable", addr->newest_start_durable_ts, false, - tag)); + if (addr->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", + unpack->ta.newest_start_durable_ts, "start durable", addr->ta.newest_start_durable_ts, + false, tag)); WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "oldest start", - unpack->oldest_start_ts, "oldest start", addr->oldest_start_ts, true, tag)); + unpack->ta.oldest_start_ts, "oldest start", addr->ta.oldest_start_ts, true, tag)); WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "oldest start", - unpack->oldest_start_txn, "oldest start", addr->oldest_start_txn, true, tag, dsk)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", - unpack->newest_stop_durable_ts, "stop durable", addr->newest_stop_durable_ts, false, - tag)); + unpack->ta.oldest_start_txn, "oldest start", addr->ta.oldest_start_txn, true, tag, dsk)); + + if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", + unpack->ta.newest_stop_durable_ts, "stop durable", addr->ta.newest_stop_durable_ts, + false, tag)); WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "newest stop", - unpack->newest_stop_ts, "newest stop", addr->newest_stop_ts, false, tag)); + unpack->ta.newest_stop_ts, "newest stop", addr->ta.newest_stop_ts, false, tag)); WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "newest stop", - 
unpack->newest_stop_txn, "newest stop", addr->newest_stop_txn, false, tag, dsk)); + unpack->ta.newest_stop_txn, "newest stop", addr->ta.newest_stop_txn, false, tag, dsk)); break; case WT_CELL_DEL: case WT_CELL_VALUE: @@ -328,36 +342,52 @@ __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_OVFL_RM: case WT_CELL_VALUE_SHORT: - if (unpack->start_ts != WT_TS_NONE && unpack->stop_ts == WT_TS_NONE) + if (unpack->tw.start_ts != WT_TS_NONE && unpack->tw.stop_ts == WT_TS_NONE) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has a stop " - "timestamp of 0", - cell_num - 1, tag); - if (unpack->start_ts > unpack->stop_ts) + "timestamp of 0; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); + if (unpack->tw.start_ts > unpack->tw.stop_ts) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has a start " - "timestamp %s newer than its stop timestamp %s", - cell_num - 1, tag, __wt_timestamp_to_string(unpack->start_ts, ts_string[0]), - __wt_timestamp_to_string(unpack->stop_ts, ts_string[1])); - if (unpack->start_txn > unpack->stop_txn) + "timestamp newer than its stop timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); + if (unpack->tw.start_txn > unpack->tw.stop_txn) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has a start " - "transaction %" PRIu64 - " newer than its stop " - "transaction %" PRIu64, - cell_num - 1, tag, unpack->start_txn, unpack->stop_txn); + "transaction newer than its stop " + "transaction; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); + if (unpack->tw.start_ts > unpack->tw.durable_start_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a start " + "timestamp newer than its durable start timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); + if (unpack->tw.stop_ts != 
WT_TS_MAX && unpack->tw.stop_ts > unpack->tw.durable_stop_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a stop " + "timestamp newer than its durable stop timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); if (addr == NULL) break; - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start", unpack->start_ts, - "oldest start", addr->oldest_start_ts, true, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "start", unpack->start_txn, - "oldest start", addr->oldest_start_txn, true, tag, dsk)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop", unpack->stop_ts, - "newest stop", addr->newest_stop_ts, false, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "stop", unpack->stop_txn, - "newest stop", addr->newest_stop_txn, false, tag, dsk)); + if (addr->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", + unpack->tw.durable_start_ts, "newest start durable", addr->ta.newest_start_durable_ts, + false, tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start", unpack->tw.start_ts, + "oldest start", addr->ta.oldest_start_ts, true, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "start", unpack->tw.start_txn, + "oldest start", addr->ta.oldest_start_txn, true, tag, dsk)); + if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", + unpack->tw.durable_stop_ts, "newest stop durable", addr->ta.newest_stop_durable_ts, + false, tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop", unpack->tw.stop_ts, + "newest stop", addr->ta.newest_stop_ts, false, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "stop", unpack->tw.stop_txn, + "newest stop", addr->ta.newest_stop_txn, false, tag, dsk)); break; } @@ -707,10 +737,7 @@ __verify_dsk_col_var( struct { const void *data; size_t 
size; - wt_timestamp_t start_ts; - uint64_t start_txn; - wt_timestamp_t stop_ts; - uint64_t stop_txn; + WT_TIME_WINDOW tw; bool deleted; } last; WT_BM *bm; @@ -728,10 +755,7 @@ __verify_dsk_col_var( last.data = NULL; last.size = 0; - last.start_ts = WT_TS_NONE; - last.start_txn = WT_TXN_NONE; - last.stop_ts = WT_TS_NONE; - last.stop_txn = WT_TXN_NONE; + __wt_time_window_init(&last.tw); last.deleted = false; cell_num = 0; @@ -760,11 +784,11 @@ __verify_dsk_col_var( } /* - * Compare the last two items and see if reconciliation missed a chance for RLE encoding. We - * don't have to care about data encoding or anything else, a byte comparison is enough. + * Compare the last two items and see if reconciliation missed a chance for RLE encoding. + * The time windows must match and we otherwise don't have to care about data encoding, a + * byte comparison is enough. */ - if (unpack->start_ts != last.start_ts || unpack->start_txn != last.start_txn || - unpack->stop_ts != last.stop_ts || unpack->stop_txn != last.stop_txn) + if (!__wt_time_windows_equal(&unpack->tw, &last.tw)) ; else if (last.deleted) { if (cell_type == WT_CELL_DEL) @@ -777,10 +801,7 @@ match_err: "have been run-length encoded", cell_num - 1, cell_num, tag); - last.start_ts = unpack->start_ts; - last.start_txn = unpack->start_txn; - last.stop_ts = unpack->stop_ts; - last.stop_txn = unpack->stop_txn; + __wt_time_window_copy(&last.tw, &unpack->tw); switch (cell_type) { case WT_CELL_DEL: last.data = NULL; diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index bfd3ecb9f5c..a4a4f8b662d 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -34,7 +34,7 @@ __wt_col_modify(WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_U btree = cbt->btree; ins = NULL; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); upd = upd_arg; append 
= logged = false; @@ -137,7 +137,7 @@ __wt_col_modify(WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_U } /* Avoid a data copy in WT_CURSOR.update. */ - cbt->modify_update = upd; + __wt_upd_value_assign(cbt->modify_update, upd); /* * Point the new WT_UPDATE item to the next element in the list. If we get it right, the @@ -188,7 +188,7 @@ __wt_col_modify(WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_U logged = true; /* Avoid a data copy in WT_CURSOR.update. */ - cbt->modify_update = upd; + __wt_upd_value_assign(cbt->modify_update, upd); } else upd_size = __wt_update_list_memsize(upd); ins->upd = upd; diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index a6d56c9499d..e98cf094421 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -74,7 +74,7 @@ __wt_col_search( uint32_t base, indx, limit, read_flags; int depth; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); btree = S2BT(session); current = NULL; diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index b7b1c5edff8..6aa44046cb8 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -58,7 +58,7 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, ins = NULL; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); upd = upd_arg; logged = false; @@ -109,7 +109,7 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, logged = true; /* Avoid WT_CURSOR.update data copy. 
*/ - cbt->modify_update = upd; + __wt_upd_value_assign(cbt->modify_update, upd); } else { upd_size = __wt_update_list_memsize(upd); @@ -169,7 +169,7 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, logged = true; /* Avoid WT_CURSOR.update data copy. */ - cbt->modify_update = upd; + __wt_upd_value_assign(cbt->modify_update, upd); } else upd_size = __wt_update_list_memsize(upd); diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 98ae6f66daf..917705f6f9c 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -224,7 +224,7 @@ __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *le int cmp, depth; bool append_check, descend_right, done; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); btree = S2BT(session); collator = btree->collator; item = cbt->tmp; |