diff options
author | Luke Chen <luke.chen@mongodb.com> | 2020-05-06 15:45:01 +1000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-05-06 06:00:50 +0000 |
commit | d2274bb6e1f8b21d73121a2fcb20b6628f652bbe (patch) | |
tree | 72c771934dab7adff1bbffdcb1af1ac6e1c36a0d /src/third_party | |
parent | e500238a9ea3d5498ebffeb74a1aceac42eb2c1f (diff) | |
download | mongo-d2274bb6e1f8b21d73121a2fcb20b6628f652bbe.tar.gz |
Import wiredtiger: 18dfb9e58e39927696affcd8e362364e23e1aa59 from branch mongodb-4.4 (tag: r4.4.0-rc4)
ref: a707df12a2..18dfb9e58e
for: 4.4.0-rc4
WT-5242 Minimize checkpoints pinned during backup
WT-5470 Reduce copies and allocations in read path
WT-5673 Prepare support with durable history: modify verify and salvage as needed
WT-5677 Prepare support with durable history: add test/format stress tests
WT-5710 Review WT_PANIC usage
WT-5716 Create the history store file at the same time as creating the metadata file in wiredtiger open
WT-5839 Ignore non-globally visible tombstones for both data store and hs store in hs verification
WT-5841 Return WT_TRY_SALVAGE when the history file is removed or truncated
WT-5928 Cleanup stale FIXMEs from durable history
WT-5977 WT_SESSION_NO_RECONCILE flag set by history cursor prevents eviction
WT-5984 Allow prepared updates to be evicted in durable history
WT-6009 Prepare support with durable history: add statistic for prepared updates evicted
WT-6032 Turn on mongodb-4.4 branch upgrade/downgrade testing
WT-6051 Fix reconstructing full value from modifies for string format
WT-6068 Re-enable tests temporarily disabled during durable history development
WT-6069 Remove WT_UPDATE_RESTORED_FROM_DISK flag
WT-6070 Coverity : Copy paste error
WT-6071 Coverity : Change format specifier
WT-6086 Move time windows and aggregated time windows into structures
WT-6087 Add a C2S(cursor) macro to simplify translation from a cursor to a session
WT-6095 Verify on-disk page only for row store as part of rollback to stable
WT-6109 Cleanup usage of cursor->session
WT-6110 Cleanup cast from cbt to cursor
WT-6120 Remove use-after-free in __verify_history_store_id
WT-6130 Disable test_random_abort
Diffstat (limited to 'src/third_party')
138 files changed, 3280 insertions, 2925 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 30a47fa3201..6bef5f96b93 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -1523,8 +1523,9 @@ methods = { including the named checkpoint, or \c "to=<checkpoint>" to drop all checkpoints before and including the named checkpoint. Checkpoints cannot be - dropped while a hot backup is in progress or if open in - a cursor''', type='list'), + dropped if open in a cursor. While a hot backup is in + progress, checkpoints created prior to the start of the + backup cannot be dropped''', type='list'), Config('force', 'false', r''' if false (the default), checkpoints may be skipped if the underlying object has not been modified, if true, this option forces the checkpoint''', diff --git a/src/third_party/wiredtiger/dist/s_clang-scan.diff b/src/third_party/wiredtiger/dist/s_clang-scan.diff index d7177e94279..3c0bd823a4c 100644 --- a/src/third_party/wiredtiger/dist/s_clang-scan.diff +++ b/src/third_party/wiredtiger/dist/s_clang-scan.diff @@ -1,11 +1,11 @@ In file included from src/block/block_write.c:9: -In file included from ./src/include/wt_internal.h:418: +In file included from ./src/include/wt_internal.h:420: ./src/include/intpack.i:193:7: warning: Assigned value is garbage or undefined p = *pp; ^ ~~~ -1 warning generated. +1 warning generated In file included from src/btree/col_modify.c:9: -In file included from ./src/include/wt_internal.h:423: +In file included from ./src/include/wt_internal.h:425: ./src/include/mutex.i:158:13: warning: Null pointer passed as an argument to a 'nonnull' parameter return (pthread_mutex_trylock(&t->lock)); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -14,12 +14,8 @@ src/conn/conn_capacity.c:291:5: warning: Value stored to 'capacity' is never rea capacity = steal_capacity = 0; ^ ~~~~~~~~~~~~~~~~~~ 1 warning generated. 
-src/reconcile/rec_col.c:1111:25: warning: Null pointer passed as an argument to a 'nonnull' parameter - memcmp(last.value->data, data, size) == 0))) { - ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -1 warning generated. In file included from src/reconcile/rec_write.c:9: -In file included from ./src/include/wt_internal.h:423: +In file included from ./src/include/wt_internal.h:425: ./src/include/mutex.i:184:16: warning: Null pointer passed as an argument to a 'nonnull' parameter if ((ret = pthread_mutex_unlock(&t->lock)) != 0) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 29f43d94ef7..e9422174821 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -503,6 +503,7 @@ autoconf automake bInheritHandle backoff +backport bal basecfg basho diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index d519fa820b9..181a3c29847 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -823,6 +823,7 @@ dsrc_stats = [ RecStat('rec_overflow_key_internal', 'internal-page overflow keys'), RecStat('rec_overflow_key_leaf', 'leaf-page overflow keys'), RecStat('rec_overflow_value', 'overflow values written'), + RecStat('rec_prepare_value', 'prepared values written'), RecStat('rec_page_delete', 'pages deleted'), RecStat('rec_page_delete_fast', 'fast-path pages deleted'), RecStat('rec_page_match', 'page checksum matches'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 477909db5c1..4fcc4a17180 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "a707df12a2503ad39ccdd82a84062faa6a07e082" + "commit": 
"18dfb9e58e39927696affcd8e362364e23e1aa59" } diff --git a/src/third_party/wiredtiger/lang/java/wiredtiger.i b/src/third_party/wiredtiger/lang/java/wiredtiger.i index cd1174e6e48..e544951909e 100644 --- a/src/third_party/wiredtiger/lang/java/wiredtiger.i +++ b/src/third_party/wiredtiger/lang/java/wiredtiger.i @@ -2264,8 +2264,7 @@ err: if (ret != 0) if ((ret = $self->open_cursor($self, uri, to_dup, config, &cursor)) != 0) goto err; - if ((ret = __wt_calloc_def((WT_SESSION_IMPL *)cursor->session, - 1, &jcb)) != 0) + if ((ret = __wt_calloc_def(CUR2S(cursor), 1, &jcb)) != 0) goto err; if ((cursor->flags & WT_CURSTD_RAW) != 0) @@ -2274,7 +2273,7 @@ err: if (ret != 0) cursor->flags |= WT_CURSTD_RAW; jcb->jnienv = jenv; - jcb->session = (WT_SESSION_IMPL *)cursor->session; + jcb->session = CUR2S(cursor); cursor->lang_private = jcb; err: if (ret != 0) diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger.i b/src/third_party/wiredtiger/lang/python/wiredtiger.i index 7f5f6d17e96..249cde15837 100644 --- a/src/third_party/wiredtiger/lang/python/wiredtiger.i +++ b/src/third_party/wiredtiger/lang/python/wiredtiger.i @@ -1347,7 +1347,7 @@ cursorCloseHandler(WT_CURSOR *cursor) cursor->lang_private = NULL; if (pcb != NULL) ret = pythonClose(pcb); - __wt_free((WT_SESSION_IMPL *)cursor->session, pcb); + __wt_free(CUR2S(cursor), pcb); return (ret); } @@ -1374,7 +1374,7 @@ cursorFreeHandler(WT_CURSOR *cursor) pcb = (PY_CALLBACK *)cursor->lang_private; cursor->lang_private = NULL; - __wt_free((WT_SESSION_IMPL *)cursor->session, pcb); + __wt_free(CUR2S(cursor), pcb); return (0); } diff --git a/src/third_party/wiredtiger/src/async/async_op.c b/src/third_party/wiredtiger/src/async/async_op.c index b696d2cc366..3faa53808c8 100644 --- a/src/third_party/wiredtiger/src/async/async_op.c +++ b/src/third_party/wiredtiger/src/async/async_op.c @@ -291,7 +291,7 @@ __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op) #ifdef HAVE_DIAGNOSTIC WT_ORDERED_READ(my_op, 
async->async_queue[my_slot]); if (my_op != NULL) - return (__wt_panic(session)); + return (__wt_panic(session, WT_PANIC, "async failure")); #endif WT_PUBLISH(async->async_queue[my_slot], op); op->state = WT_ASYNCOP_ENQUEUED; diff --git a/src/third_party/wiredtiger/src/async/async_worker.c b/src/third_party/wiredtiger/src/async/async_worker.c index 23098c5b8c5..eb301b2cdfa 100644 --- a/src/third_party/wiredtiger/src/async/async_worker.c +++ b/src/third_party/wiredtiger/src/async/async_worker.c @@ -309,7 +309,7 @@ __wt_async_worker(void *arg) if (0) { err: - WT_PANIC_MSG(session, ret, "async worker error"); + WT_IGNORE_RET(__wt_panic(session, ret, "async worker error")); } /* * Worker thread cleanup, close our cached cursors and free all the WT_ASYNC_CURSOR structures. diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c index 6cbe80a9317..292917be5d6 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt.c @@ -196,11 +196,11 @@ __wt_block_checkpoint_start(WT_SESSION_IMPL *session, WT_BLOCK *block) case WT_CKPT_INPROGRESS: case WT_CKPT_PANIC_ON_FAILURE: case WT_CKPT_SALVAGE: - __wt_err(session, EINVAL, + ret = __wt_panic(session, EINVAL, "%s: an unexpected checkpoint start: the checkpoint " "has already started or was configured for salvage", block->name); - ret = __wt_block_panic(session); + __wt_block_set_readonly(session); break; case WT_CKPT_NONE: block->ckpt_state = WT_CKPT_INPROGRESS; @@ -389,11 +389,11 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) break; case WT_CKPT_NONE: case WT_CKPT_PANIC_ON_FAILURE: - __wt_err(session, EINVAL, + ret = __wt_panic(session, EINVAL, "%s: an unexpected checkpoint attempt: the checkpoint " "was never started or has already completed", block->name); - ret = __wt_block_panic(session); + __wt_block_set_readonly(session); break; case WT_CKPT_SALVAGE: /* Salvage doesn't use the 
standard checkpoint APIs. */ @@ -638,8 +638,8 @@ live_update: err: if (ret != 0 && fatal) { - __wt_err(session, ret, "%s: fatal checkpoint failure", block->name); - ret = __wt_block_panic(session); + ret = __wt_panic(session, ret, "%s: fatal checkpoint failure", block->name); + __wt_block_set_readonly(session); } if (locked) @@ -860,26 +860,26 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block, bool fa goto done; case WT_CKPT_NONE: case WT_CKPT_SALVAGE: - __wt_err(session, EINVAL, + ret = __wt_panic(session, EINVAL, "%s: an unexpected checkpoint resolution: the checkpoint " "was never started or completed, or configured for salvage", block->name); - ret = __wt_block_panic(session); + __wt_block_set_readonly(session); break; case WT_CKPT_PANIC_ON_FAILURE: if (!failed) break; - __wt_err( + ret = __wt_panic( session, EINVAL, "%s: the checkpoint failed, the system must restart", block->name); - ret = __wt_block_panic(session); + __wt_block_set_readonly(session); break; } WT_ERR(ret); if ((ret = __wt_block_extlist_merge(session, block, &ci->ckpt_avail, &ci->avail)) != 0) { - __wt_err( + ret = __wt_panic( session, ret, "%s: fatal checkpoint failure during extent list merge", block->name); - ret = __wt_block_panic(session); + __wt_block_set_readonly(session); } __wt_spin_unlock(session, &block->live_lock); diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c index 632908d6b0a..8e854da15c1 100644 --- a/src/third_party/wiredtiger/src/block/block_ext.c +++ b/src/third_party/wiredtiger/src/block/block_ext.c @@ -13,11 +13,12 @@ * Handle extension list errors that would normally panic the system but * which should fail gracefully when verifying. */ -#define WT_BLOCK_RET(session, block, v, ...) \ - do { \ - int __ret = (v); \ - __wt_err(session, __ret, __VA_ARGS__); \ - return ((block)->verify ? __ret : __wt_panic(session)); \ +#define WT_BLOCK_RET(session, block, v, ...) 
\ + do { \ + int __ret = (v); \ + __wt_err(session, __ret, __VA_ARGS__); \ + return ((block)->verify ? __ret : __wt_panic(session, WT_PANIC, \ + "block manager extension list failure")); \ } while (0) static int __block_append(WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t); @@ -297,13 +298,10 @@ __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *list else if (live && __block_off_match(&block->live.discard, offset, size)) name = "discard"; __wt_spin_unlock(session, &block->live_lock); - if (name != NULL) { - __wt_errx(session, "%s failed: %" PRIuMAX "/%" PRIu32 - " is on the %s list " - "(%s, %d)", - list, (uintmax_t)offset, size, name, func, line); - return (__wt_panic(session)); - } + if (name != NULL) + return (__wt_panic(session, WT_PANIC, + "%s failed: %" PRIuMAX "/%" PRIu32 " is on the %s list (%s, %d)", list, (uintmax_t)offset, + size, name, func, line)); return (0); } #endif @@ -336,7 +334,7 @@ __block_off_remove( __block_size_srch(el->sz, ext->size, sstack); szp = *sstack[0]; if (szp == NULL || szp->size != ext->size) - WT_PANIC_RET(session, EINVAL, "extent not found in by-size list during remove"); + WT_RET_PANIC(session, EINVAL, "extent not found in by-size list during remove"); __block_off_srch(szp->off, off, astack, true); ext = *astack[0]; if (ext == NULL || ext->off != off) @@ -643,7 +641,7 @@ __wt_block_extlist_check(WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *b b = b->next[0]; continue; } - WT_PANIC_RET(session, EINVAL, "checkpoint merge check: %s list overlaps the %s list", + WT_RET_PANIC(session, EINVAL, "checkpoint merge check: %s list overlaps the %s list", al->name, bl->name); } return (0); diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c index c689e3848bf..07a368b0350 100644 --- a/src/third_party/wiredtiger/src/block/block_mgr.c +++ b/src/third_party/wiredtiger/src/block/block_mgr.c @@ -630,14 +630,12 @@ err: } /* - * 
__wt_block_panic -- - * Report an error, then panic the handle and the system. + * __wt_block_set_readonly -- + * Set the block API to read-only. */ -int -__wt_block_panic(WT_SESSION_IMPL *session) WT_GCC_FUNC_ATTRIBUTE((cold)) +void +__wt_block_set_readonly(WT_SESSION_IMPL *session) WT_GCC_FUNC_ATTRIBUTE((cold)) { /* Switch the handle into read-only mode. */ __bm_method_set(S2BT(session)->bm, true); - - return (__wt_panic(session)); } diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index 638796e4459..e6b59847fcf 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -326,9 +326,9 @@ __desc_read(WT_SESSION_IMPL *session, uint32_t allocsize, WT_BLOCK *block) * In the general case, we should return a generic error and signal that we've detected data * corruption. * - * FIXME: MongoDB relies heavily on the error codes reported when opening cursors (which hits - * this logic if the relevant data handle isn't already open). However this code gets run in - * rollback to stable as part of recovery where we want to skip any corrupted data files + * FIXME-WT-5832: MongoDB relies heavily on the error codes reported when opening cursors (which + * hits this logic if the relevant data handle isn't already open). However this code gets run + * in rollback to stable as part of recovery where we want to skip any corrupted data files * temporarily to allow MongoDB to initiate salvage. This is why we've been forced into this * situation. We should address this as part of WT-5832 and clarify what error codes we expect * to be returning across the API boundary. 
diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c index 9c0d17fe28f..0226ede60e1 100644 --- a/src/third_party/wiredtiger/src/block/block_read.c +++ b/src/third_party/wiredtiger/src/block/block_read.c @@ -289,5 +289,5 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); if (block->verify || F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) return (WT_ERROR); - WT_PANIC_RET(session, WT_ERROR, "%s: fatal read error", block->name); + WT_RET_PANIC(session, WT_ERROR, "%s: fatal read error", block->name); } diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c index d69371ee533..a8a0091d854 100644 --- a/src/third_party/wiredtiger/src/block/block_write.c +++ b/src/third_party/wiredtiger/src/block/block_write.c @@ -39,7 +39,7 @@ __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len) * backups, which only copies log files, or targeted backups, stops all block truncation * unnecessarily). We may want a more targeted solution at some point. */ - if (!conn->hot_backup) { + if (conn->hot_backup_start == 0) { WT_WITH_HOTBACKUP_READ_LOCK(session, ret = __wt_ftruncate(session, block->fh, len), NULL); } diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index d6c89dacd33..9ea91c6f421 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -16,9 +16,8 @@ static inline int __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* If restarting after a prepare conflict, jump to the right spot. 
*/ if (restart) @@ -58,28 +57,14 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) cbt->iface.value.data = &cbt->v; } else { restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd == NULL) { + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) { cbt->v = 0; cbt->iface.value.data = &cbt->v; - } else { - /* - * If this update has been restored from the disk, it needs to be freed after copying it - * to the user cursor. - */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) { - switch (upd->type) { - case WT_UPDATE_TOMBSTONE: - cbt->iface.value.data = upd->data; - __wt_free_update_list(session, &upd); - break; - default: - return (__wt_value_return(cbt, upd)); - } - } - if (upd != NULL) - cbt->iface.value.data = upd->data; - } + } else if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) + cbt->iface.value.data = cbt->upd_value->buf.data; + else + WT_RET(__wt_value_return(cbt, cbt->upd_value)); } cbt->iface.value.size = 1; return (0); @@ -95,12 +80,10 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_BTREE *btree; WT_PAGE *page; WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); btree = S2BT(session); page = cbt->ref->page; - upd = NULL; /* If restarting after a prepare conflict, jump to the right spot. */ if (restart) @@ -127,33 +110,20 @@ new_page: if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) cbt->ins = NULL; /* - * FIXME-PM-1523: Now we only do transaction read if we have an update chain and it doesn't work + * FIXME-WT-6127: Now we only do transaction read if we have an update chain and it doesn't work * in durable history. Review this when we have a plan for fixed-length column store. 
*/ + __wt_upd_value_clear(cbt->upd_value); if (cbt->ins != NULL) restart_read: - WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, cbt->ins->upd, NULL, &upd)); - if (upd == NULL) { + WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, cbt->ins->upd, NULL)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) { cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); cbt->iface.value.data = &cbt->v; - } else { - /* - * If this update has been restored from the disk, it needs to be freed after copying it to - * the user cursor. - */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) { - switch (upd->type) { - case WT_UPDATE_TOMBSTONE: - cbt->iface.value.data = upd->data; - __wt_free_update_list(session, &upd); - break; - default: - return (__wt_value_return(cbt, upd)); - } - } - if (upd != NULL) - cbt->iface.value.data = upd->data; - } + } else if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) + cbt->iface.value.data = cbt->upd_value->buf.data; + else + WT_RET(__wt_value_return(cbt, cbt->upd_value)); cbt->iface.value.size = 1; return (0); } @@ -166,9 +136,8 @@ static inline int __cursor_var_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* If restarting after a prepare conflict, jump to the right spot. 
*/ if (restart) @@ -187,18 +156,17 @@ new_page: __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); - if (upd == NULL) + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* NOTREACHED */ } @@ -216,10 +184,9 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_INSERT *ins; WT_PAGE *page; WT_SESSION_IMPL *session; - WT_UPDATE *upd; uint64_t rle, rle_start; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; rle_start = 0; /* -Werror=maybe-uninitialized */ @@ -258,18 +225,17 @@ restart_read: /* Check any insert list for a matching record. 
*/ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); - upd = NULL; + __wt_upd_value_clear(cbt->upd_value); if (cbt->ins != NULL) - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type != WT_UPDATE_INVALID) { + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* @@ -309,8 +275,9 @@ restart_read: continue; } - WT_RET(__wt_bt_col_var_cursor_walk_txn_read(session, cbt, page, &unpack, cip, &upd)); - if (upd == NULL) + WT_RET(__wt_bt_col_var_cursor_walk_txn_read(session, cbt, page, &unpack, cip)); + if (cbt->upd_value->type == WT_UPDATE_INVALID || + cbt->upd_value->type == WT_UPDATE_TOMBSTONE) continue; return (0); } @@ -334,10 +301,9 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; - WT_UPDATE *upd; bool kpack_used; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; key = &cbt->iface.key; @@ -386,17 +352,16 @@ restart_read_insert: if ((ins = cbt->ins) != NULL) { key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - WT_RET(__wt_txn_read_upd_list(session, ins->upd, &upd)); - if (upd == NULL) + WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && 
__wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* Check for the end of the page. */ @@ -422,17 +387,16 @@ restart_read_page: rip = &page->pg_row[cbt->slot]; WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used)); WT_RET(__wt_txn_read( - session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL, &upd)); - if (upd == NULL) + session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* NOTREACHED */ } @@ -461,7 +425,7 @@ __cursor_key_order_check_col(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, boo return (0); } - WT_PANIC_RET(session, EINVAL, "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 + WT_RET_PANIC(session, EINVAL, "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 " then " "key %" PRIu64, next ? 
"next" : "prev", cbt->lastrecno, cbt->recno); @@ -494,7 +458,7 @@ __cursor_key_order_check_row(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, boo WT_ERR(__wt_scr_alloc(session, 512, &a)); WT_ERR(__wt_scr_alloc(session, 512, &b)); - WT_PANIC_ERR(session, EINVAL, + WT_ERR_PANIC(session, EINVAL, "WT_CURSOR.%s out-of-order returns: returned key %.1024s then " "key %.1024s", next ? "next" : "prev", __wt_buf_set_printable_format(session, cbt->lastkey->data, @@ -536,7 +500,7 @@ __wt_cursor_key_order_init(WT_CURSOR_BTREE *cbt) { WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* * Cursor searches set the position for cursor movements, set the last-key value for diagnostic @@ -648,7 +612,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) bool newpage, restart; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_next); WT_STAT_DATA_INCR(session, cursor_next); diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 0099d1ae594..f8db9cd6233 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -40,7 +40,7 @@ __cursor_skip_prev(WT_CURSOR_BTREE *cbt) uint64_t recno; int i; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); restart: /* @@ -123,9 +123,8 @@ static inline int __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* If restarting after a prepare conflict, jump to the right spot. 
*/ if (restart) @@ -198,28 +197,14 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) cbt->iface.value.data = &cbt->v; } else { restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd == NULL) { + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) { cbt->v = 0; cbt->iface.value.data = &cbt->v; - } else { - /* - * If this update has been restored from the disk, it needs to be freed after copying it - * to the user cursor. - */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) { - switch (upd->type) { - case WT_UPDATE_TOMBSTONE: - cbt->iface.value.data = upd->data; - __wt_free_update_list(session, &upd); - break; - default: - return (__wt_value_return(cbt, upd)); - } - } - if (upd != NULL) - cbt->iface.value.data = upd->data; - } + } else if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) + cbt->iface.value.data = cbt->upd_value->buf.data; + else + WT_RET(__wt_value_return(cbt, cbt->upd_value)); } cbt->iface.value.size = 1; return (0); @@ -235,9 +220,8 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_BTREE *btree; WT_PAGE *page; WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; btree = S2BT(session); @@ -265,35 +249,21 @@ new_page: cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno); if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) cbt->ins = NULL; - upd = NULL; /* - * FIXME-PM-1523: Now we only do transaction read if we have an update chain and it doesn't work + * FIXME-WT-6127: Now we only do transaction read if we have an update chain and it doesn't work * in durable history. Review this when we have a plan for fixed-length column store. 
*/ + __wt_upd_value_clear(cbt->upd_value); if (cbt->ins != NULL) restart_read: - WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, cbt->ins->upd, NULL, &upd)); - if (upd == NULL) { + WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, cbt->ins->upd, NULL)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) { cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); cbt->iface.value.data = &cbt->v; - } else { - /* - * If this update has been restored from the disk, it needs to be freed after copying it to - * the user cursor. - */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) { - switch (upd->type) { - case WT_UPDATE_TOMBSTONE: - cbt->iface.value.data = upd->data; - __wt_free_update_list(session, &upd); - break; - default: - return (__wt_value_return(cbt, upd)); - } - } - if (upd != NULL) - cbt->iface.value.data = upd->data; - } + } else if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) + cbt->iface.value.data = cbt->upd_value->buf.data; + else + WT_RET(__wt_value_return(cbt, cbt->upd_value)); cbt->iface.value.size = 1; return (0); } @@ -306,9 +276,8 @@ static inline int __cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) { WT_SESSION_IMPL *session; - WT_UPDATE *upd; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* If restarting after a prepare conflict, jump to the right spot. 
*/ if (restart) @@ -327,17 +296,16 @@ new_page: __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd == NULL) + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK) && upd->type != WT_UPDATE_TOMBSTONE) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* NOTREACHED */ } @@ -355,10 +323,9 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_INSERT *ins; WT_PAGE *page; WT_SESSION_IMPL *session; - WT_UPDATE *upd; uint64_t rle_start; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; rle_start = 0; /* -Werror=maybe-uninitialized */ @@ -398,18 +365,17 @@ restart_read: /* Check any insert list for a matching record. 
*/ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); - upd = NULL; + __wt_upd_value_clear(cbt->upd_value); if (cbt->ins != NULL) - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type != WT_UPDATE_INVALID) { + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* @@ -449,8 +415,9 @@ restart_read: continue; } - WT_RET(__wt_bt_col_var_cursor_walk_txn_read(session, cbt, page, &unpack, cip, &upd)); - if (upd == NULL) + WT_RET(__wt_bt_col_var_cursor_walk_txn_read(session, cbt, page, &unpack, cip)); + if (cbt->upd_value->type == WT_UPDATE_INVALID || + cbt->upd_value->type == WT_UPDATE_TOMBSTONE) continue; return (0); } @@ -474,10 +441,9 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; - WT_UPDATE *upd; bool kpack_used; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; key = &cbt->iface.key; @@ -536,17 +502,16 @@ restart_read_insert: if ((ins = cbt->ins) != NULL) { key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - WT_RET(__wt_txn_read_upd_list(session, ins->upd, &upd)); - if (upd == NULL) + WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && 
__wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* Check for the beginning of the page. */ @@ -574,17 +539,16 @@ restart_read_page: rip = &page->pg_row[cbt->slot]; WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack, &kpack_used)); WT_RET(__wt_txn_read( - session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL, &upd)); - if (upd == NULL) + session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip), NULL)); + if (cbt->upd_value->type == WT_UPDATE_INVALID) continue; - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { + if (cbt->upd_value->txnid != WT_TXN_NONE && + __wt_txn_upd_value_visible_all(session, cbt->upd_value)) ++cbt->page_deleted_count; - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); continue; } - return (__wt_value_return(cbt, upd)); + return (__wt_value_return(cbt, cbt->upd_value)); } /* NOTREACHED */ } @@ -604,7 +568,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) bool newpage, restart; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_prev); WT_STAT_DATA_INCR(session, cursor_prev); diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 61a0a2653f6..ccec03700d0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -60,7 +60,7 @@ __cursor_page_pinned(WT_CURSOR_BTREE *cbt, bool 
search_operation) WT_SESSION_IMPL *session; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); /* * Check the page active flag, asserting the page reference with any external key. @@ -171,21 +171,18 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) * Return if the cursor references an valid key/value pair. */ int -__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, WT_UPDATE **updp, bool *valid) +__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, bool *valid) { WT_BTREE *btree; WT_CELL *cell; WT_COL *cip; WT_PAGE *page; WT_SESSION_IMPL *session; - WT_UPDATE *upd; - if (updp != NULL) - *updp = NULL; *valid = false; btree = cbt->btree; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* * We may be pointing to an insert object, and we may have a page with @@ -232,22 +229,22 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, WT_UPDATE * update that's been deleted is not a valid key/value pair). */ if (cbt->ins != NULL) { - WT_RET(__wt_txn_read_upd_list(session, cbt->ins->upd, &upd)); - if (upd != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) { - WT_ASSERT(session, !F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); + if (cbt->upd_value->type != WT_UPDATE_INVALID) { + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) return (0); - } - if (updp != NULL) - *updp = upd; - else if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); *valid = true; return (0); } } /* + * Clean out any stale value here. Calling a transaction read helper automatically clears this + * but we have some code paths that don't do this (fixed length column store is one example). 
+ */ + __wt_upd_value_clear(cbt->upd_value); + + /* * If we don't have an insert object, or in the case of column-store, there's an insert object * but no update was visible to us and the key on the page is the same as the insert object's * key, and the slot as set by the search function is valid, we can use the original page @@ -299,17 +296,10 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, WT_UPDATE * Check for an update ondisk or in the history store. For column store, an insert object * can have the same key as an on-page or history store object. */ - WT_RET(__wt_txn_read(session, cbt, key, recno, NULL, NULL, &upd)); - if (upd != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); + WT_RET(__wt_txn_read(session, cbt, key, recno, NULL, NULL)); + if (cbt->upd_value->type != WT_UPDATE_INVALID) { + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) return (0); - } - if (updp != NULL) - *updp = upd; - else if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); *valid = true; } break; @@ -335,17 +325,10 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, WT_UPDATE (page->modify != NULL && page->modify->mod_row_update != NULL) ? 
page->modify->mod_row_update[cbt->slot] : NULL, - NULL, &upd)); - if (upd != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); + NULL)); + if (cbt->upd_value->type != WT_UPDATE_INVALID) { + if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) return (0); - } - if (updp != NULL) - *updp = upd; - else if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); *valid = true; } break; @@ -363,7 +346,7 @@ __cursor_col_search(WT_CURSOR_BTREE *cbt, WT_REF *leaf, bool *leaf_foundp) WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); WT_WITH_PAGE_INDEX( session, ret = __wt_col_search(cbt, cbt->iface.recno, leaf, false, leaf_foundp)); return (ret); @@ -379,7 +362,7 @@ __cursor_row_search(WT_CURSOR_BTREE *cbt, bool insert, WT_REF *leaf, bool *leaf_ WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); WT_WITH_PAGE_INDEX( session, ret = __wt_row_search(cbt, &cbt->iface.key, insert, leaf, false, leaf_foundp)); return (ret); @@ -429,7 +412,7 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_SESSION_IMPL *session; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_reset); WT_STAT_DATA_INCR(session, cursor_reset); @@ -440,11 +423,11 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt) } /* - * __wt_btcur_search_uncommitted -- - * Search and return exact matching records only, including uncommitted ones. + * __wt_btcur_search_prepared -- + * Search and return exact matching records only. 
*/ int -__wt_btcur_search_uncommitted(WT_CURSOR *cursor, WT_UPDATE **updp) +__wt_btcur_search_prepared(WT_CURSOR *cursor, WT_UPDATE **updp) { WT_BTREE *btree; WT_CURSOR_BTREE *cbt; @@ -500,12 +483,6 @@ __wt_btcur_search_uncommitted(WT_CURSOR *cursor, WT_UPDATE **updp) break; } - /* - * Like regular uncommitted updates, pages with prepared updates are pinned to the cache and can - * never be written to the history store. Therefore, there is no need to do a search here for - * uncommitted updates. - */ - *updp = upd; return (0); } @@ -522,13 +499,11 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - WT_UPDATE *upd; bool leaf_found, valid; btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; - upd = NULL; /* -Wuninitialized */ + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_search); WT_STAT_DATA_INCR(session, cursor_search); @@ -557,11 +532,11 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(cbt, false, cbt->ref, &leaf_found)); if (leaf_found && cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); } else { WT_ERR(__cursor_col_search(cbt, cbt->ref, &leaf_found)); if (leaf_found && cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, &valid)); } } if (!valid) { @@ -570,16 +545,16 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(cbt, false, NULL, NULL)); if (cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); } else { WT_ERR(__cursor_col_search(cbt, NULL, NULL)); if (cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, 
NULL, cbt->recno, &valid)); } } if (valid) - ret = __cursor_kv_return(cbt, upd); + ret = __cursor_kv_return(cbt, cbt->upd_value); else if (__cursor_fix_implicit(btree, cbt)) { /* * Creating a record past the end of the tree in a fixed-length column-store implicitly @@ -619,14 +594,12 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - WT_UPDATE *upd; int exact; bool leaf_found, valid; btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; - upd = NULL; /* -Wuninitialized */ + session = CUR2S(cbt); exact = 0; WT_STAT_CONN_INCR(session, cursor_search_near); @@ -671,7 +644,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) */ if (leaf_found && (cbt->compare == 0 || (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1))) - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); @@ -682,10 +655,10 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) */ if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(cbt, true, NULL, NULL)); - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); } else { WT_ERR(__cursor_col_search(cbt, NULL, NULL)); - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, &upd, &valid)); + WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, &valid)); } } @@ -706,7 +679,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) */ if (valid) { exact = cbt->compare; - ret = __cursor_kv_return(cbt, upd); + ret = __cursor_kv_return(cbt, cbt->upd_value); } else if (__cursor_fix_implicit(btree, cbt)) { cbt->recno = cursor->recno; cbt->v = 0; @@ -781,9 +754,12 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) btree = cbt->btree; cursor = &cbt->iface; insert_bytes = cursor->key.size + cursor->value.size; - session = 
(WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); yield_count = sleep_usecs = 0; + WT_RET_PANIC_ASSERT( + session, S2BT(session) == btree, WT_PANIC, "btree differs unexpectedly from session's btree"); + WT_STAT_CONN_INCR(session, cursor_insert); WT_STAT_DATA_INCR(session, cursor_insert); WT_STAT_CONN_INCRV(session, cursor_insert_bytes, insert_bytes); @@ -793,9 +769,6 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); - WT_RET_ASSERT( - session, S2BT(session) == btree, WT_PANIC, "btree differs unexpectedly from session's btree"); - /* It's no longer possible to bulk-load into the tree. */ __wt_cursor_disable_bulk(session); @@ -859,7 +832,9 @@ retry: * If not overwriting, fail if the key exists, else insert the key/value pair. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && cbt->compare == 0) { - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, NULL, &valid)); + WT_WITH_UPDATE_VALUE_SKIP_BUF( + ret = __wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); + WT_ERR(ret); if (valid) WT_ERR(WT_DUPLICATE_KEY); } @@ -885,7 +860,9 @@ retry: */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { if (cbt->compare == 0) { - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, NULL, &valid)); + WT_WITH_UPDATE_VALUE_SKIP_BUF( + ret = __wt_cursor_valid(cbt, NULL, cbt->recno, &valid)); + WT_ERR(ret); if (valid) WT_ERR(WT_DUPLICATE_KEY); } else if (__cursor_fix_implicit(btree, cbt)) @@ -932,7 +909,7 @@ __curfile_update_check(WT_CURSOR_BTREE *cbt) btree = cbt->btree; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); upd = NULL; if (cbt->compare != 0) @@ -964,7 +941,7 @@ __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt) uint64_t yield_count, sleep_usecs; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); yield_count = sleep_usecs = 0; WT_ASSERT(session, cbt->btree->type == BTREE_ROW); @@ -1015,7 
+992,7 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt, bool positioned) btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); yield_count = sleep_usecs = 0; iterating = F_ISSET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV); searched = false; @@ -1085,7 +1062,8 @@ retry: if (cbt->compare != 0) goto search_notfound; - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, NULL, &valid)); + WT_WITH_UPDATE_VALUE_SKIP_BUF(ret = __wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); + WT_ERR(ret); if (!valid) goto search_notfound; @@ -1103,8 +1081,10 @@ retry: /* Remove the record if it exists. */ valid = false; - if (cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, NULL, &valid)); + if (cbt->compare == 0) { + WT_WITH_UPDATE_VALUE_SKIP_BUF(ret = __wt_cursor_valid(cbt, NULL, cbt->recno, &valid)); + WT_ERR(ret); + } if (cbt->compare != 0 || !valid) { if (!__cursor_fix_implicit(btree, cbt)) goto search_notfound; @@ -1203,10 +1183,10 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); yield_count = sleep_usecs = 0; - WT_RET_ASSERT( + WT_RET_PANIC_ASSERT( session, S2BT(session) == btree, WT_PANIC, "btree differs unexpectedly from session's btree"); /* It's no longer possible to bulk-load into the tree. 
*/ @@ -1287,7 +1267,9 @@ update_local: WT_ERR(__curfile_update_check(cbt)); if (cbt->compare != 0) WT_ERR(WT_NOTFOUND); - WT_ERR(__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, NULL, &valid)); + WT_WITH_UPDATE_VALUE_SKIP_BUF( + ret = __wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, &valid)); + WT_ERR(ret); if (!valid) WT_ERR(WT_NOTFOUND); } @@ -1302,8 +1284,11 @@ update_local: if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); valid = false; - if (cbt->compare == 0) - WT_ERR(__wt_cursor_valid(cbt, NULL, cbt->recno, NULL, &valid)); + if (cbt->compare == 0) { + WT_WITH_UPDATE_VALUE_SKIP_BUF( + ret = __wt_cursor_valid(cbt, NULL, cbt->recno, &valid)); + WT_ERR(ret); + } if ((cbt->compare != 0 || !valid) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } @@ -1375,7 +1360,7 @@ __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt) cursor = &cbt->iface; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); upd = NULL; if (cbt->ins != NULL) @@ -1429,7 +1414,7 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) bool overwrite; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); /* Save the cursor state. 
*/ __cursor_state_save(cursor, &state); @@ -1465,7 +1450,7 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) WT_ERR(__wt_modify_pack(cursor, entries, nentries, &modify)); orig = cursor->value.size; - WT_ERR(__wt_modify_apply(cursor, modify->data)); + WT_ERR(__wt_modify_apply_item(session, cursor->value_format, &cursor->value, modify->data)); new = cursor->value.size; WT_ERR(__cursor_size_chk(session, &cursor->value)); @@ -1515,7 +1500,7 @@ __wt_btcur_reserve(WT_CURSOR_BTREE *cbt) bool overwrite; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_reserve); WT_STAT_DATA_INCR(session, cursor_reserve); @@ -1542,7 +1527,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cbt); WT_STAT_CONN_INCR(session, cursor_update); WT_STAT_DATA_INCR(session, cursor_update); @@ -1568,7 +1553,7 @@ __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) a = (WT_CURSOR *)a_arg; b = (WT_CURSOR *)b_arg; - session = (WT_SESSION_IMPL *)a->session; + session = CUR2S(a_arg); /* Confirm both cursors reference the same object. */ if (a_arg->btree != b_arg->btree) @@ -1640,8 +1625,8 @@ __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp) a = (WT_CURSOR *)a_arg; b = (WT_CURSOR *)b_arg; + session = CUR2S(a_arg); cmp = 0; - session = (WT_SESSION_IMPL *)a->session; /* Confirm both cursors reference the same object. 
*/ if (a_arg->btree != b_arg->btree) @@ -1673,7 +1658,7 @@ __cursor_truncate( WT_SESSION_IMPL *session; uint64_t yield_count, sleep_usecs; - session = (WT_SESSION_IMPL *)start->iface.session; + session = CUR2S(start); yield_count = sleep_usecs = 0; /* @@ -1729,7 +1714,7 @@ __cursor_truncate_fix( uint64_t yield_count, sleep_usecs; const uint8_t *value; - session = (WT_SESSION_IMPL *)start->iface.session; + session = CUR2S(start); yield_count = sleep_usecs = 0; /* @@ -1786,8 +1771,8 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)start->iface.session; btree = start->btree; + session = CUR2S(start); WT_STAT_DATA_INCR(session, cursor_truncate); WT_RET(__wt_txn_autocommit_check(session)); @@ -1852,6 +1837,8 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) { cbt->row_key = &cbt->_row_key; cbt->tmp = &cbt->_tmp; + cbt->modify_update = &cbt->_modify_update; + cbt->upd_value = &cbt->_upd_value; #ifdef HAVE_DIAGNOSTIC cbt->lastkey = &cbt->_lastkey; @@ -1869,7 +1856,7 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel) WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* * The in-memory split and history store table code creates low-level btree cursors to @@ -1879,6 +1866,8 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel) if (!lowlevel) ret = __cursor_reset(cbt); + __wt_buf_free(session, &cbt->_modify_update.buf); + __wt_buf_free(session, &cbt->_upd_value.buf); __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); #ifdef HAVE_DIAGNOSTIC diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index e3e89620fd5..a835e593022 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -156,18 +156,16 @@ __debug_item_value(WT_DBG *ds, const char *tag, const void *data_arg, size_t siz 
} /* - * __debug_time_pairs -- + * __debug_time_window -- * Dump a set of start and stop time pairs, with an optional tag. */ static inline int -__debug_time_pairs(WT_DBG *ds, const char *tag, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t stop_ts, uint64_t stop_txn) +__debug_time_window(WT_DBG *ds, const char *tag, WT_TIME_WINDOW *tw) { - char tp_string[2][WT_TP_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; - return (ds->f(ds, "\t%s%s%s,%s\n", tag == NULL ? "" : tag, tag == NULL ? "" : " ", - __wt_time_pair_to_string(start_ts, start_txn, tp_string[0]), - __wt_time_pair_to_string(stop_ts, stop_txn, tp_string[1]))); + return (ds->f(ds, "\t%s%s%s\n", tag == NULL ? "" : tag, tag == NULL ? "" : " ", + __wt_time_window_to_string(tw, time_string))); } /* @@ -711,15 +709,13 @@ int __wt_debug_cursor_tree_hs(void *cursor_arg, const char *ofile) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { - WT_CURSOR *cursor; WT_CURSOR_BTREE *cbt; WT_DECL_RET; WT_SESSION_IMPL *session; uint32_t session_flags; bool is_owner; - cursor = cursor_arg; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor_arg); session_flags = 0; /* [-Werror=maybe-uninitialized] */ WT_RET(__wt_hs_cursor(session, &session_flags, &is_owner)); @@ -741,26 +737,24 @@ __wt_debug_cursor_hs(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor) WT_DECL_ITEM(hs_key); WT_DECL_ITEM(hs_value); WT_DECL_RET; - WT_TIME_PAIR start, stop; + WT_TIME_WINDOW tw; WT_UPDATE *upd; - wt_timestamp_t hs_durable_ts; - uint64_t hs_upd_type_full; + uint64_t hs_counter, hs_upd_type_full; uint32_t hs_btree_id; - uint8_t hs_prep_state, hs_upd_type; + uint8_t hs_upd_type; ds = &_ds; + __wt_time_window_init(&tw); WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); WT_ERR(__wt_scr_alloc(session, 0, &hs_value)); WT_ERR(__debug_config(session, ds, NULL)); - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &start.timestamp, &start.txnid, - &stop.timestamp, &stop.txnid)); + 
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &tw.start_ts, &hs_counter)); + WT_ERR(hs_cursor->get_value( + hs_cursor, &tw.stop_ts, &tw.durable_start_ts, &hs_upd_type_full, hs_value)); + WT_ERR(__debug_time_window(ds, "T", &tw)); - WT_ERR(__debug_time_pairs(ds, "T", start.timestamp, start.txnid, stop.timestamp, stop.txnid)); - - WT_ERR( - hs_cursor->get_value(hs_cursor, &hs_durable_ts, &hs_prep_state, &hs_upd_type_full, hs_value)); hs_upd_type = (uint8_t)hs_upd_type_full; switch (hs_upd_type) { case WT_UPDATE_MODIFY: @@ -806,8 +800,7 @@ __wt_debug_key_value( WT_ERR(ds->f(ds, "\tK {%" PRIu64 " %" PRIu64 "}", recno, rle)); else WT_ERR(__debug_item_key(ds, "K", key->data, key->size)); - WT_ERR(__debug_time_pairs( - ds, "T", value->start_ts, value->start_txn, value->stop_ts, value->stop_txn)); + WT_ERR(__debug_time_window(ds, "T", &value->tw)); WT_ERR(__debug_cell_data(ds, NULL, value != NULL ? value->type : 0, "V", value)); err: @@ -1350,8 +1343,7 @@ __debug_ref(WT_DBG *ds, WT_REF *ref) { WT_ADDR_COPY addr; WT_SESSION_IMPL *session; - char tp_string[2][WT_TP_STRING_SIZE]; - char ts_string[2][WT_TS_INT_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; session = ds->session; @@ -1365,13 +1357,7 @@ __debug_ref(WT_DBG *ds, WT_REF *ref) WT_RET(ds->f(ds, ", %s", "reading")); if (__wt_ref_addr_copy(session, ref, &addr)) - WT_RET(ds->f(ds, - ", start/stop durable ts %s,%s, start/stop ts/txn %s,%s, prepared updates: %s, %s", - __wt_timestamp_to_string(addr.newest_start_durable_ts, ts_string[0]), - __wt_timestamp_to_string(addr.newest_stop_durable_ts, ts_string[1]), - __wt_time_pair_to_string(addr.oldest_start_ts, addr.oldest_start_txn, tp_string[0]), - __wt_time_pair_to_string(addr.newest_stop_ts, addr.newest_stop_txn, tp_string[1]), - addr.prepare ? 
"true" : "false", + WT_RET(ds->f(ds, "%s, %s", __wt_time_aggregate_to_string(&addr.ta, time_string), __wt_addr_string(session, addr.addr, addr.size, ds->t1))); return (ds->f(ds, "\n")); } @@ -1386,8 +1372,7 @@ __debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack) WT_DECL_ITEM(buf); WT_DECL_RET; WT_SESSION_IMPL *session; - char tp_string[2][WT_TP_STRING_SIZE]; - char ts_string[2][WT_TS_INT_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; session = ds->session; @@ -1429,11 +1414,7 @@ __debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack) case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: - WT_RET(ds->f(ds, ", ts/txn %s,%s,%s,%s", - __wt_timestamp_to_string(unpack->newest_start_durable_ts, ts_string[0]), - __wt_timestamp_to_string(unpack->newest_stop_durable_ts, ts_string[1]), - __wt_time_pair_to_string(unpack->oldest_start_ts, unpack->oldest_start_txn, tp_string[0]), - __wt_time_pair_to_string(unpack->newest_stop_ts, unpack->newest_stop_txn, tp_string[1]))); + WT_RET(ds->f(ds, ", %s", __wt_time_aggregate_to_string(&unpack->ta, time_string))); break; case WT_CELL_DEL: case WT_CELL_VALUE: @@ -1441,9 +1422,7 @@ __debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack) case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_OVFL_RM: case WT_CELL_VALUE_SHORT: - WT_RET(ds->f(ds, ", ts/txn %s,%s", - __wt_time_pair_to_string(unpack->start_ts, unpack->start_txn, tp_string[0]), - __wt_time_pair_to_string(unpack->stop_ts, unpack->stop_txn, tp_string[1]))); + WT_RET(ds->f(ds, ", %s", __wt_time_window_to_string(&unpack->tw, time_string))); break; } diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 94b544f6bc2..b9a3eed1c93 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -115,7 +115,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) goto err; 
if (addr.type != WT_ADDR_LEAF_NO) goto err; - if (!__wt_txn_visible(session, addr.oldest_start_txn, addr.oldest_start_ts)) + if (!__wt_txn_visible(session, addr.ta.oldest_start_txn, addr.ta.oldest_start_ts)) goto err; /* @@ -292,7 +292,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *page; WT_PAGE_DELETED *page_del; WT_ROW *rip; - WT_TIME_PAIR start, stop; + WT_TIME_WINDOW tw; WT_UPDATE **upd_array, *upd; size_t size; uint32_t count, i; @@ -382,8 +382,8 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * Retrieve the stop time pair from the page's row. If we find an existing stop time pair we * don't need to append a tombstone. */ - __wt_read_row_time_pairs(session, page, rip, &start, &stop); - if (stop.timestamp == WT_TS_MAX && stop.txnid == WT_TXN_MAX) { + __wt_read_row_time_window(session, page, rip, &tw); + if (tw.stop_ts == WT_TS_MAX && tw.stop_txn == WT_TXN_MAX) { WT_ERR(__tombstone_update_alloc(session, page_del, &upd, &size)); upd->next = upd_array[WT_ROW_SLOT(page, rip)]; upd_array[WT_ROW_SLOT(page, rip)] = upd; diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c index d4d83783a1b..7fac3deabd5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_io.c +++ b/src/third_party/wiredtiger/src/btree/bt_io.c @@ -140,7 +140,7 @@ corrupt: F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); if (!F_ISSET(btree, WT_BTREE_VERIFY) && !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) { WT_TRET(bm->corrupt(bm, session, addr, addr_size)); - WT_PANIC_ERR(session, ret, "%s: fatal read error: %s", btree->dhandle->name, fail_msg); + WT_ERR_PANIC(session, ret, "%s: fatal read error: %s", btree->dhandle->name, fail_msg); } } diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index 72523b695de..cccd2c628a3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ 
-65,7 +65,7 @@ __wt_ovfl_read( */ __wt_readlock(session, &S2BT(session)->ovfl_lock); if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) { - WT_ASSERT(session, __wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts)); + WT_ASSERT(session, __wt_txn_visible_all(session, unpack->tw.stop_txn, unpack->tw.stop_ts)); ret = __wt_buf_setstr(session, store, "WT_CELL_VALUE_OVFL_RM"); *decoded = true; } else diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index ac588bf901d..1a690b24804 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -544,13 +544,23 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_CELL_UNPACK unpack; + WT_ITEM buf; WT_ROW *rip; + WT_UPDATE **upd_array, *upd; + size_t size, total_size; + uint32_t i; + bool instantiate_prepared, prepare; btree = S2BT(session); + prepare = false; + + instantiate_prepared = F_ISSET_ATOMIC(page, WT_PAGE_INSTANTIATE_PREPARE_UPDATE); /* Walk the page, building indices. */ rip = page->pg_row; WT_CELL_FOREACH_BEGIN (session, btree, page->dsk, unpack) { + if (instantiate_prepared && !prepare && F_ISSET(&unpack, WT_CELL_UNPACK_PREPARE)) + prepare = true; switch (unpack.type) { case WT_CELL_KEY_OVFL: __wt_row_leaf_key_set_cell(page, rip, unpack.cell); @@ -575,9 +585,9 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) * The visibility information is not referenced on the page so we need to ensure that * the value is globally visible at the point in time where we read the page into cache. 
*/ - if (!btree->huffman_value && unpack.stop_txn == WT_TXN_MAX && - unpack.stop_ts == WT_TS_MAX && - __wt_txn_visible_all(session, unpack.start_txn, unpack.start_ts)) + if (!btree->huffman_value && unpack.tw.stop_txn == WT_TXN_MAX && + unpack.tw.stop_ts == WT_TS_MAX && !F_ISSET(&unpack, WT_CELL_UNPACK_PREPARE) && + __wt_txn_visible_all(session, unpack.tw.start_txn, unpack.tw.start_ts)) __wt_row_leaf_value_set(page, rip - 1, &unpack); break; case WT_CELL_VALUE_OVFL: @@ -589,8 +599,47 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) WT_CELL_FOREACH_END; /* - * We do not currently instantiate keys on leaf pages when the page is loaded, they're - * instantiated on demand. + * Instantiate prepared updates on leaf pages when the page is loaded. For in-memory databases, + * all non obsolete updates will retain on the page as part of __split_multi_inmem function. */ + if (prepare && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) { + WT_RET(__wt_page_modify_init(session, page)); + if (!F_ISSET(btree, WT_BTREE_READONLY)) + __wt_page_modify_set(session, page); + + /* Allocate the per-page update array if one doesn't already exist. */ + if (page->entries != 0 && page->modify->mod_row_update == NULL) + WT_RET(__wt_calloc_def(session, page->entries, &page->modify->mod_row_update)); + + /* For each entry in the page */ + size = total_size = 0; + upd_array = page->modify->mod_row_update; + WT_ROW_FOREACH (page, rip, i) { + /* Unpack the on-page value cell. */ + __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); + if (F_ISSET(&unpack, WT_CELL_UNPACK_PREPARE)) { + if (unpack.tw.stop_ts == WT_TS_MAX && unpack.tw.stop_txn == WT_TXN_MAX) { + /* Take the value from the original page cell. 
*/ + WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &buf)); + + WT_RET(__wt_upd_alloc(session, &buf, WT_UPDATE_STANDARD, &upd, &size)); + upd->durable_ts = WT_TS_NONE; + upd->start_ts = unpack.tw.start_ts; + upd->txnid = unpack.tw.start_txn; + } else { + WT_RET(__wt_upd_alloc_tombstone(session, &upd, &size)); + upd->durable_ts = WT_TS_NONE; + upd->start_ts = unpack.tw.stop_ts; + upd->txnid = unpack.tw.stop_txn; + } + upd->prepare_state = WT_PREPARE_INPROGRESS; + upd_array[WT_ROW_SLOT(page, rip)] = upd; + total_size += size; + } + } + + __wt_cache_page_inmem_incr(session, page, total_size); + } + return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index b3a8985fbe4..3f113e4b2dc 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -13,10 +13,8 @@ * Check if the inserted key/value pair is valid. */ static int -__random_insert_valid( - WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_INSERT *ins, WT_UPDATE **updp, bool *validp) +__random_insert_valid(WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_INSERT *ins, bool *validp) { - *updp = NULL; *validp = false; __cursor_pos_clear(cbt); @@ -27,7 +25,7 @@ __random_insert_valid( cbt->tmp->data = WT_INSERT_KEY(ins); cbt->tmp->size = WT_INSERT_KEY_SIZE(ins); - return (__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, updp, validp)); + return (__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, validp)); } /* @@ -35,16 +33,15 @@ __random_insert_valid( * Check if the slot key/value pair is valid. 
*/ static int -__random_slot_valid(WT_CURSOR_BTREE *cbt, uint32_t slot, WT_UPDATE **updp, bool *validp) +__random_slot_valid(WT_CURSOR_BTREE *cbt, uint32_t slot, bool *validp) { - *updp = NULL; *validp = false; __cursor_pos_clear(cbt); cbt->slot = slot; cbt->compare = 0; - return (__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, updp, validp)); + return (__wt_cursor_valid(cbt, cbt->tmp, WT_RECNO_OOB, validp)); } /* Magic constant: 5000 entries in a skip list is enough to forcibly evict. */ @@ -64,7 +61,7 @@ __random_skip_entries(WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head) uint32_t entries; int level; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); entries = 0; /* [-Wconditional-uninitialized] */ if (ins_head == NULL) @@ -106,18 +103,16 @@ __random_skip_entries(WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head) * Return a random key/value from a skip list. */ static int -__random_leaf_skip( - WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, uint32_t entries, WT_UPDATE **updp, bool *validp) +__random_leaf_skip(WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, uint32_t entries, bool *validp) { WT_INSERT *ins, *saved_ins; WT_SESSION_IMPL *session; uint32_t i; int retry; - *updp = NULL; *validp = false; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* This is a relatively expensive test, try a few times then quit. */ for (retry = 0; retry < WT_RANDOM_SKIP_RETRY; ++retry) { @@ -136,7 +131,7 @@ __random_leaf_skip( /* Try and return our selected record. 
*/ if (ins != NULL) { - WT_RET(__random_insert_valid(cbt, ins_head, ins, updp, validp)); + WT_RET(__random_insert_valid(cbt, ins_head, ins, validp)); if (*validp) return (0); } @@ -148,7 +143,7 @@ __random_leaf_skip( ins = saved_ins; } for (; --i > 0 && ins != NULL; ins = WT_SKIP_NEXT(ins)) { - WT_RET(__random_insert_valid(cbt, ins_head, ins, updp, validp)); + WT_RET(__random_insert_valid(cbt, ins_head, ins, validp)); if (*validp) return (0); } @@ -166,24 +161,23 @@ __random_leaf_skip( * Look for a large insert list from which we can select a random item. */ static int -__random_leaf_insert(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) +__random_leaf_insert(WT_CURSOR_BTREE *cbt, bool *validp) { WT_INSERT_HEAD *ins_head; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t entries, slot, start; - *updp = NULL; *validp = false; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); /* Check for a large insert list with no items, that's common when tables are newly created. 
*/ ins_head = WT_ROW_INSERT_SMALLEST(page); entries = __random_skip_entries(cbt, ins_head); if (entries >= WT_RANDOM_SKIP_INSERT_SMALLEST_ENOUGH) { - WT_RET(__random_leaf_skip(cbt, ins_head, entries, updp, validp)); + WT_RET(__random_leaf_skip(cbt, ins_head, entries, validp)); if (*validp) return (0); } @@ -199,7 +193,7 @@ __random_leaf_insert(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) ins_head = WT_ROW_INSERT(page, &page->pg_row[slot]); entries = __random_skip_entries(cbt, ins_head); if (entries >= WT_RANDOM_SKIP_INSERT_ENOUGH) { - WT_RET(__random_leaf_skip(cbt, ins_head, entries, updp, validp)); + WT_RET(__random_leaf_skip(cbt, ins_head, entries, validp)); if (*validp) return (0); } @@ -208,7 +202,7 @@ __random_leaf_insert(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) ins_head = WT_ROW_INSERT(page, &page->pg_row[slot]); entries = __random_skip_entries(cbt, ins_head); if (entries >= WT_RANDOM_SKIP_INSERT_ENOUGH) { - WT_RET(__random_leaf_skip(cbt, ins_head, entries, updp, validp)); + WT_RET(__random_leaf_skip(cbt, ins_head, entries, validp)); if (*validp) return (0); } @@ -219,7 +213,7 @@ __random_leaf_insert(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) ins_head = WT_ROW_INSERT_SMALLEST(page); entries = __random_skip_entries(cbt, ins_head); if (entries >= WT_RANDOM_SKIP_INSERT_ENOUGH) { - WT_RET(__random_leaf_skip(cbt, ins_head, entries, updp, validp)); + WT_RET(__random_leaf_skip(cbt, ins_head, entries, validp)); if (*validp) return (0); } @@ -234,25 +228,24 @@ __random_leaf_insert(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) * Return a random key/value from a page's on-disk entries. 
*/ static int -__random_leaf_disk(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *validp) +__random_leaf_disk(WT_CURSOR_BTREE *cbt, bool *validp) { WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t entries, slot; int retry; - *updp = NULL; *validp = false; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); entries = cbt->ref->page->entries; /* This is a relatively cheap test, so try several times. */ for (retry = 0; retry < WT_RANDOM_DISK_RETRY; ++retry) { slot = __wt_random(&session->rnd) % entries; WT_RET(__wt_row_leaf_key(session, page, page->pg_row + slot, cbt->tmp, false)); - WT_RET(__random_slot_valid(cbt, slot, updp, validp)); + WT_RET(__random_slot_valid(cbt, slot, validp)); if (*validp) break; } @@ -274,12 +267,11 @@ __random_leaf(WT_CURSOR_BTREE *cbt) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - WT_UPDATE *upd; uint32_t i; bool next, valid; - cursor = (WT_CURSOR *)cbt; - session = (WT_SESSION_IMPL *)cbt->iface.session; + cursor = &cbt->iface; + session = CUR2S(cbt); /* * If the page has a sufficiently large number of disk-based entries, randomly select from them. @@ -287,24 +279,24 @@ __random_leaf(WT_CURSOR_BTREE *cbt) * a reasonable chunk of the name space. */ if (cbt->ref->page->entries > WT_RANDOM_DISK_ENOUGH) { - WT_RET(__random_leaf_disk(cbt, &upd, &valid)); + WT_RET(__random_leaf_disk(cbt, &valid)); if (valid) - return (__cursor_kv_return(cbt, upd)); + return (__cursor_kv_return(cbt, cbt->upd_value)); } /* Look for any large insert list and select from it. */ - WT_RET(__random_leaf_insert(cbt, &upd, &valid)); + WT_RET(__random_leaf_insert(cbt, &valid)); if (valid) - return (__cursor_kv_return(cbt, upd)); + return (__cursor_kv_return(cbt, cbt->upd_value)); /* * Try again if there are at least a few hundred disk-based entries: this may be a normal leaf * page with big items. 
*/ if (cbt->ref->page->entries > WT_RANDOM_DISK_ENOUGH / 2) { - WT_RET(__random_leaf_disk(cbt, &upd, &valid)); + WT_RET(__random_leaf_disk(cbt, &valid)); if (valid) - return (__cursor_kv_return(cbt, upd)); + return (__cursor_kv_return(cbt, cbt->upd_value)); } /* @@ -484,7 +476,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) btree = cbt->btree; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); read_flags = WT_READ_RESTART_OK; if (F_ISSET(cbt, WT_CBT_READ_ONCE)) diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 5c8c0ea871a..4d83914e1a3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -157,6 +157,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * evicting that page and deciding that is a sign that eviction is unstuck. */ page_flags = WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED; + FLD_SET(page_flags, WT_PAGE_INSTANTIATE_PREPARE_UPDATE); if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) FLD_SET(page_flags, WT_PAGE_EVICT_NO_PROGRESS); WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &notused)); diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c index 353f159f6bb..5f29cf08691 100644 --- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c +++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c @@ -76,13 +76,7 @@ __rebalance_leaf_append(WT_SESSION_IMPL *session, const uint8_t *key, size_t key WT_RET(__wt_calloc_one(session, &copy_addr)); copy->addr = copy_addr; - copy_addr->newest_start_durable_ts = unpack->newest_start_durable_ts; - copy_addr->oldest_start_ts = unpack->oldest_start_ts; - copy_addr->oldest_start_txn = unpack->oldest_start_txn; - copy_addr->newest_stop_durable_ts = unpack->newest_stop_durable_ts; - copy_addr->newest_stop_ts = unpack->newest_stop_ts; - 
copy_addr->newest_stop_txn = unpack->newest_stop_txn; - copy_addr->prepare = F_ISSET(unpack, WT_CELL_UNPACK_PREPARE); + __wt_time_aggregate_copy(&copy_addr->ta, &unpack->ta); WT_RET(__wt_memdup(session, unpack->data, unpack->size, &copy_addr->addr)); copy_addr->size = (uint8_t)unpack->size; copy_addr->type = unpack->type == WT_CELL_ADDR_LEAF ? WT_ADDR_LEAF : WT_ADDR_LEAF_NO; diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index 2061d561a7a..1a2360f6d09 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -23,7 +23,7 @@ __key_return(WT_CURSOR_BTREE *cbt) page = cbt->ref->page; cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); if (page->type == WT_PAGE_ROW_LEAF) { rip = &page->pg_row[cbt->slot]; @@ -70,102 +70,71 @@ __key_return(WT_CURSOR_BTREE *cbt) } /* - * __time_pairs_init -- - * Initialize the time pairs to globally visible. + * __read_col_time_window -- + * Retrieve the time window from a column store cell. */ -static inline void -__time_pairs_init(WT_TIME_PAIR *start, WT_TIME_PAIR *stop) +static void +__read_col_time_window(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, WT_TIME_WINDOW *tw) { - start->txnid = WT_TXN_NONE; - start->timestamp = WT_TS_NONE; - stop->txnid = WT_TXN_MAX; - stop->timestamp = WT_TS_MAX; + WT_CELL_UNPACK unpack; + + __wt_cell_unpack(session, page, cell, &unpack); + __wt_time_window_copy(tw, &unpack.tw); } /* - * __time_pairs_set -- - * Set the time pairs. + * __wt_read_row_time_window -- + * Retrieve the time window from a row. 
*/ -static inline void -__time_pairs_set(WT_TIME_PAIR *start, WT_TIME_PAIR *stop, WT_CELL_UNPACK *unpack) +void +__wt_read_row_time_window(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_TIME_WINDOW *tw) { - start->timestamp = unpack->start_ts; - start->txnid = unpack->start_txn; - stop->timestamp = unpack->stop_ts; - stop->txnid = unpack->stop_txn; + WT_CELL_UNPACK unpack; + + __wt_time_window_init(tw); + /* + * If a value is simple and is globally visible at the time of reading a page into cache, we set + * the time pairs as globally visible. + */ + if (__wt_row_leaf_value_exists(rip)) + return; + + __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); + __wt_time_window_copy(tw, &unpack.tw); } /* - * __wt_read_cell_time_pairs -- + * __wt_read_cell_time_window -- * Read the time pairs from the cell. */ void -__wt_read_cell_time_pairs( - WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_TIME_PAIR *start, WT_TIME_PAIR *stop) +__wt_read_cell_time_window(WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_TIME_WINDOW *tw) { WT_PAGE *page; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = ref->page; - WT_ASSERT(session, start != NULL && stop != NULL); + WT_ASSERT(session, tw != NULL); /* Take the value from the original page cell. */ if (page->type == WT_PAGE_ROW_LEAF) { - __wt_read_row_time_pairs(session, page, &page->pg_row[cbt->slot], start, stop); + __wt_read_row_time_window(session, page, &page->pg_row[cbt->slot], tw); } else if (page->type == WT_PAGE_COL_VAR) { - __wt_read_col_time_pairs( - session, page, WT_COL_PTR(page, &page->pg_var[cbt->slot]), start, stop); + __read_col_time_window(session, page, WT_COL_PTR(page, &page->pg_var[cbt->slot]), tw); } else { /* WT_PAGE_COL_FIX: return the default time pairs. */ - __time_pairs_init(start, stop); + __wt_time_window_init(tw); } } /* - * __wt_read_col_time_pairs -- - * Retrieve the time pairs from a column store cell. 
- */ -void -__wt_read_col_time_pairs( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, WT_TIME_PAIR *start, WT_TIME_PAIR *stop) -{ - WT_CELL_UNPACK unpack; - - __wt_cell_unpack(session, page, cell, &unpack); - __time_pairs_set(start, stop, &unpack); -} - -/* - * __wt_read_row_time_pairs -- - * Retrieve the time pairs from a row. - */ -void -__wt_read_row_time_pairs( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_TIME_PAIR *start, WT_TIME_PAIR *stop) -{ - WT_CELL_UNPACK unpack; - - __time_pairs_init(start, stop); - /* - * If a value is simple and is globally visible at the time of reading a page into cache, we set - * the time pairs as globally visible. - */ - if (__wt_row_leaf_value_exists(rip)) - return; - - __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); - __time_pairs_set(start, stop, &unpack); -} - -/* * __wt_value_return_buf -- * Change a buffer to reference an internal original-page return value. */ int -__wt_value_return_buf( - WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_ITEM *buf, WT_TIME_PAIR *start, WT_TIME_PAIR *stop) +__wt_value_return_buf(WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_ITEM *buf, WT_TIME_WINDOW *tw) { WT_BTREE *btree; WT_CELL *cell; @@ -176,18 +145,12 @@ __wt_value_return_buf( WT_SESSION_IMPL *session; uint8_t v; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); btree = S2BT(session); page = ref->page; cursor = &cbt->iface; - if (start != NULL && stop != NULL) - __time_pairs_init(start, stop); - - /* Must provide either both start and stop as output parameters or neither. */ - WT_ASSERT(session, (start != NULL && stop != NULL) || (start == NULL && stop == NULL)); - if (page->type == WT_PAGE_ROW_LEAF) { rip = &page->pg_row[cbt->slot]; @@ -195,14 +158,16 @@ __wt_value_return_buf( * If a value is simple and is globally visible at the time of reading a page into cache, we * encode its location into the WT_ROW. 
*/ - if (__wt_row_leaf_value(page, rip, buf)) + if (__wt_row_leaf_value(page, rip, buf)) { + if (tw != NULL) + __wt_time_window_init(tw); return (0); + } /* Take the value from the original page cell. */ __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); - if (start != NULL && stop != NULL) - __time_pairs_set(start, stop, &unpack); - + if (tw != NULL) + __wt_time_window_copy(tw, &unpack.tw); return (__wt_page_cell_data_ref(session, page, &unpack, buf)); } @@ -210,17 +175,18 @@ __wt_value_return_buf( /* Take the value from the original page cell. */ cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]); __wt_cell_unpack(session, page, cell, &unpack); - if (start != NULL && stop != NULL) - __time_pairs_set(start, stop, &unpack); - + if (tw != NULL) + __wt_time_window_copy(tw, &unpack.tw); return (__wt_page_cell_data_ref(session, page, &unpack, buf)); } /* * WT_PAGE_COL_FIX: Take the value from the original page. * - * FIXME-PM-1523: Should also check visibility here + * FIXME-WT-6126: Should also check visibility here */ + if (tw != NULL) + __wt_time_window_init(tw); v = __bit_getv_recno(ref, cursor->recno, btree->bitcnt); return (__wt_buf_set(session, buf, &v, 1)); } @@ -232,95 +198,7 @@ __wt_value_return_buf( static inline int __value_return(WT_CURSOR_BTREE *cbt) { - return (__wt_value_return_buf(cbt, cbt->ref, &cbt->iface.value, NULL, NULL)); -} - -/* - * __wt_value_return_upd -- - * Change the cursor to reference an internal update structure return value. - */ -int -__wt_value_return_upd(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) -{ - WT_CURSOR *cursor; - WT_DECL_RET; - WT_MODIFY_VECTOR modifies; - WT_SESSION_IMPL *session; - WT_TIME_PAIR start, stop; - - cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cbt->iface.session; - __wt_modify_vector_init(session, &modifies); - - /* - * We're passed a "standard" or "modified" update that's visible to us. 
Our caller should have - * already checked for deleted items (we're too far down the call stack to return not-found). - * - * Fast path if it's a standard item, assert our caller's behavior. - */ - if (upd->type == WT_UPDATE_STANDARD) { - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) { - /* Copy an external update, and delete after using it */ - WT_RET(__wt_buf_set(session, &cursor->value, upd->data, upd->size)); - __wt_free_update_list(session, &upd); - } else { - cursor->value.data = upd->data; - cursor->value.size = upd->size; - } - return (0); - } - WT_ASSERT(session, upd->type == WT_UPDATE_MODIFY); - - /* - * Find a complete update. - */ - for (; upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - if (WT_UPDATE_DATA_VALUE(upd)) - break; - - if (upd->type == WT_UPDATE_MODIFY) - WT_ERR(__wt_modify_vector_push(&modifies, upd)); - } - - /* - * If there's no full update, the base item is the on-page item. If the update is a tombstone, - * the base item is an empty item. - */ - if (upd == NULL) { - /* - * Callers of this function set the cursor slot to an impossible value to check we don't try - * and return on-page values when the update list should have been sufficient (which - * happens, for example, if an update list was truncated, deleting some standard update - * required by a previous modify update). Assert the case. - */ - WT_ASSERT(session, cbt->slot != UINT32_MAX); - - WT_ERR(__wt_value_return_buf(cbt, cbt->ref, &cbt->iface.value, &start, &stop)); - /* - * Applying modifies on top of a tombstone is invalid. So if we're using the onpage value, - * the stop time pair should be unset. - */ - WT_ASSERT(session, stop.txnid == WT_TXN_MAX && stop.timestamp == WT_TS_MAX); - } else { - /* The base update must not be a tombstone. 
*/ - WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD); - WT_ERR(__wt_buf_set(session, &cursor->value, upd->data, upd->size)); - } - - /* - * Once we have a base item, roll forward through any visible modify updates. - */ - while (modifies.size > 0) { - __wt_modify_vector_pop(&modifies, &upd); - WT_ERR(__wt_modify_apply(cursor, upd->data)); - } - -err: - __wt_modify_vector_free(&modifies); - return (ret); + return (__wt_value_return_buf(cbt, cbt->ref, &cbt->iface.value, NULL)); } /* @@ -352,20 +230,37 @@ __wt_key_return(WT_CURSOR_BTREE *cbt) /* * __wt_value_return -- - * Change the cursor to reference an internal return value. + * Change the cursor to reference an update return value. */ int -__wt_value_return(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +__wt_value_return(WT_CURSOR_BTREE *cbt, WT_UPDATE_VALUE *upd_value) { WT_CURSOR *cursor; + WT_SESSION_IMPL *session; cursor = &cbt->iface; + session = CUR2S(cbt); F_CLR(cursor, WT_CURSTD_VALUE_EXT); - if (upd == NULL) + if (upd_value->type == WT_UPDATE_INVALID) { + /* + * FIXME-WT-6127: This is a holdover from the pre-durable history read logic where we used + * to fallback to the on-page value if we didn't find a visible update elsewhere. This is + * still required for fixed length column store as we have issues with this table type in + * durable history which we're planning to address in PM-1814. + */ + WT_ASSERT(session, cbt->btree->type == BTREE_COL_FIX); WT_RET(__value_return(cbt)); - else - WT_RET(__wt_value_return_upd(cbt, upd)); + } else { + /* + * We're passed a "standard" update that's visible to us. Our caller should have already + * checked for deleted items (we're too far down the call stack to return not-found) and any + * modify updates should be have been reconstructed into a full standard update. 
+ */ + WT_ASSERT(session, upd_value->type == WT_UPDATE_STANDARD); + cursor->value.data = upd_value->buf.data; + cursor->value.size = upd_value->buf.size; + } F_SET(cursor, WT_CURSTD_VALUE_INT); return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index a42e11e1d8f..344c6a573d7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -186,12 +186,7 @@ __slvg_checkpoint(WT_SESSION_IMPL *session, WT_REF *root) __wt_seconds(session, &ckptbase->sec); WT_ERR(__wt_metadata_search(session, dhandle->name, &config)); WT_ERR(__wt_meta_block_metadata(session, config, ckptbase)); - ckptbase->start_durable_ts = WT_TS_NONE; - ckptbase->oldest_start_ts = WT_TS_NONE; - ckptbase->oldest_start_txn = WT_TXN_NONE; - ckptbase->stop_durable_ts = WT_TS_NONE; - ckptbase->newest_stop_ts = WT_TS_MAX; - ckptbase->newest_stop_txn = WT_TXN_MAX; + __wt_time_aggregate_init(&ckptbase->ta); ckptbase->write_gen = btree->write_gen; F_SET(ckptbase, WT_CKPT_ADD); @@ -917,7 +912,7 @@ __slvg_col_range_overlap(WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_s */ /* Case #2/8, #10, #11 */ if (a_trk->col_start > b_trk->col_start) - WT_PANIC_RET(session, EINVAL, "unexpected merge array sort order"); + WT_RET_PANIC(session, EINVAL, "unexpected merge array sort order"); if (a_trk->col_start == b_trk->col_start) { /* Case #1, #4 and #9 */ /* @@ -1174,12 +1169,7 @@ __slvg_col_build_internal(WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF * regardless of a value's timestamps or transaction IDs. 
*/ WT_ERR(__wt_calloc_one(session, &addr)); - addr->newest_start_durable_ts = addr->newest_stop_durable_ts = addr->oldest_start_ts = - WT_TS_NONE; - addr->oldest_start_txn = WT_TXN_NONE; - addr->newest_stop_ts = WT_TS_MAX; - addr->newest_stop_txn = WT_TXN_MAX; - addr->prepare = false; + __wt_time_aggregate_init(&addr->ta); WT_ERR(__wt_memdup(session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); addr->size = trk->trk_addr_size; addr->type = trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF; @@ -1323,7 +1313,7 @@ __slvg_col_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK * return (__slvg_ovfl_ref(session, ovfl, false)); } - WT_PANIC_RET(session, EINVAL, "overflow record at column-store page merge not found"); + WT_RET_PANIC(session, EINVAL, "overflow record at column-store page merge not found"); } /* @@ -1512,7 +1502,7 @@ __slvg_row_range_overlap(WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_s WT_RET(__wt_compare(session, btree->collator, A_TRK_STOP, B_TRK_STOP, &stop_cmp)); if (start_cmp > 0) /* Case #2/8, #10, #11 */ - WT_PANIC_RET(session, EINVAL, "unexpected merge array sort order"); + WT_RET_PANIC(session, EINVAL, "unexpected merge array sort order"); if (start_cmp == 0) { /* Case #1, #4, #9 */ /* @@ -1782,12 +1772,7 @@ __slvg_row_build_internal(WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF * regardless of a value's timestamps or transaction IDs. */ WT_ERR(__wt_calloc_one(session, &addr)); - addr->newest_start_durable_ts = addr->newest_stop_durable_ts = addr->oldest_start_ts = - WT_TS_NONE; - addr->oldest_start_txn = WT_TXN_NONE; - addr->newest_stop_ts = WT_TS_MAX; - addr->newest_stop_txn = WT_TXN_MAX; - addr->prepare = false; + __wt_time_aggregate_init(&addr->ta); WT_ERR(__wt_memdup(session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); addr->size = trk->trk_addr_size; addr->type = trk->trk_ovfl_cnt == 0 ? 
WT_ADDR_LEAF_NO : WT_ADDR_LEAF; @@ -1992,7 +1977,7 @@ __slvg_row_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK * return (__slvg_ovfl_ref(session, ovfl, true)); } - WT_PANIC_RET(session, EINVAL, "overflow record at row-store page merge not found"); + WT_RET_PANIC(session, EINVAL, "overflow record at row-store page merge not found"); } /* @@ -2270,7 +2255,7 @@ __slvg_ovfl_ref(WT_SESSION_IMPL *session, WT_TRACK *trk, bool multi_panic) if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) { if (!multi_panic) return (__wt_set_return(session, EBUSY)); - WT_PANIC_RET(session, EINVAL, + WT_RET_PANIC(session, EINVAL, "overflow record unexpectedly referenced multiple times " "during leaf page merge"); } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index a2d85f79db8..2a016d6d725 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -147,7 +147,7 @@ __split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) err: /* Something really bad just happened. 
*/ - WT_PANIC_RET(session, ret, "fatal error during page split"); + WT_RET_PANIC(session, ret, "fatal error during page split"); } #endif @@ -249,13 +249,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, WT_REF **from_ref if (ref_addr != NULL && !__wt_off_page(from_home, ref_addr)) { __wt_cell_unpack(session, from_home, (WT_CELL *)ref_addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); - addr->oldest_start_ts = unpack.oldest_start_ts; - addr->oldest_start_txn = unpack.oldest_start_txn; - addr->newest_start_durable_ts = unpack.newest_start_durable_ts; - addr->newest_stop_ts = unpack.newest_stop_ts; - addr->newest_stop_txn = unpack.newest_stop_txn; - addr->newest_stop_durable_ts = unpack.newest_stop_durable_ts; - addr->prepare = F_ISSET(&unpack, WT_CELL_UNPACK_PREPARE); + __wt_time_aggregate_copy(&addr->ta, &unpack.ta); WT_ERR(__wt_memdup(session, unpack.data, unpack.size, &addr->addr)); addr->size = (uint8_t)unpack.size; switch (unpack.raw) { @@ -574,17 +568,17 @@ err: case WT_ERR_RETURN: __wt_free_ref_index(session, root, alloc_index, true); break; - case WT_ERR_PANIC: - __wt_err(session, ret, "fatal error during root page split to deepen the tree"); - ret = WT_PANIC; - break; case WT_ERR_IGNORE: - if (ret != 0 && ret != WT_PANIC) { - __wt_err(session, ret, - "ignoring not-fatal error during root page split " - "to deepen the tree"); + if (ret != WT_PANIC) { + if (ret != 0) + __wt_err(session, ret, + "ignoring not-fatal error during root page split to deepen the tree"); ret = 0; + break; } + /* FALLTHROUGH */ + case WT_ERR_PANIC: + ret = __wt_panic(session, ret, "fatal error during root page split to deepen the tree"); break; } return (ret); @@ -877,17 +871,16 @@ err: if (empty_parent) ret = __wt_set_return(session, EBUSY); break; - case WT_ERR_PANIC: - __wt_err(session, ret, "fatal error during parent page split"); - ret = WT_PANIC; - break; case WT_ERR_IGNORE: - if (ret != 0 && ret != WT_PANIC) { - __wt_err(session, ret, - "ignoring 
not-fatal error during parent page " - "split"); + if (ret != WT_PANIC) { + if (ret != 0) + __wt_err(session, ret, "ignoring not-fatal error during parent page split"); ret = 0; + break; } + /* FALLTHROUGH */ + case WT_ERR_PANIC: + ret = __wt_panic(session, ret, "fatal error during parent page split"); break; } __wt_scr_free(session, &scr); @@ -1154,17 +1147,16 @@ err: } __wt_free_ref_index(session, page, alloc_index, true); break; - case WT_ERR_PANIC: - __wt_err(session, ret, "fatal error during internal page split"); - ret = WT_PANIC; - break; case WT_ERR_IGNORE: - if (ret != 0 && ret != WT_PANIC) { - __wt_err(session, ret, - "ignoring not-fatal error during internal page " - "split"); + if (ret != WT_PANIC) { + if (ret != 0) + __wt_err(session, ret, "ignoring not-fatal error during internal page split"); ret = 0; + break; } + /* FALLTHROUGH */ + case WT_ERR_PANIC: + ret = __wt_panic(session, ret, "fatal error during internal page split"); break; } return (ret); @@ -1391,7 +1383,7 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT WT_SAVE_UPD *supd; WT_UPDATE *prev_onpage, *upd; uint64_t recno; - uint32_t i, slot; + uint32_t i, page_flags, slot; /* * In 04/2016, we removed column-store record numbers from the WT_PAGE structure, leading to @@ -1413,7 +1405,8 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT * our caller will not discard the disk image when discarding the original page, and our caller * will discard the allocated page on error, when discarding the allocated WT_REF. 
*/ - WT_RET(__wt_page_inmem(session, ref, multi->disk_image, WT_PAGE_DISK_ALLOC, &page)); + page_flags = WT_PAGE_DISK_ALLOC | WT_PAGE_INSTANTIATE_PREPARE_UPDATE; + WT_RET(__wt_page_inmem(session, ref, multi->disk_image, page_flags, &page)); multi->disk_image = NULL; /* @@ -1704,13 +1697,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_R if (multi->addr.addr != NULL) { WT_RET(__wt_calloc_one(session, &addr)); ref->addr = addr; - addr->oldest_start_ts = multi->addr.oldest_start_ts; - addr->oldest_start_txn = multi->addr.oldest_start_txn; - addr->newest_start_durable_ts = multi->addr.newest_start_durable_ts; - addr->newest_stop_ts = multi->addr.newest_stop_ts; - addr->newest_stop_txn = multi->addr.newest_stop_txn; - addr->newest_stop_durable_ts = multi->addr.newest_stop_durable_ts; - addr->prepare = multi->addr.prepare; + __wt_time_aggregate_copy(&addr->ta, &multi->addr.ta); WT_RET(__wt_memdup(session, multi->addr.addr, multi->addr.size, &addr->addr)); addr->size = multi->addr.size; addr->type = multi->addr.type; diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 851a407f165..fd36f6b24f9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -220,8 +220,8 @@ __sync_ref_obsolete_check(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF_LIST *rl if (previous_state == WT_REF_DISK) { /* There should be an address, but simply skip any page where we don't find one. 
*/ if (__wt_ref_addr_copy(session, ref, &addr)) { - newest_stop_ts = addr.newest_stop_ts; - newest_stop_txn = addr.newest_stop_txn; + newest_stop_ts = addr.ta.newest_stop_ts; + newest_stop_txn = addr.ta.newest_stop_txn; obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); } @@ -274,21 +274,21 @@ __sync_ref_obsolete_check(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF_LIST *rl /* Calculate the max stop time pair by traversing all multi addresses. */ for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { - newest_stop_txn = WT_MAX(newest_stop_txn, multi->addr.newest_stop_txn); - newest_stop_ts = WT_MAX(newest_stop_ts, multi->addr.newest_stop_ts); + newest_stop_txn = WT_MAX(newest_stop_txn, multi->addr.ta.newest_stop_txn); + newest_stop_ts = WT_MAX(newest_stop_ts, multi->addr.ta.newest_stop_ts); } obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); } else if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) { tag = "reconciled replacement block"; - newest_stop_txn = mod->mod_replace.newest_stop_txn; - newest_stop_ts = mod->mod_replace.newest_stop_ts; + newest_stop_txn = mod->mod_replace.ta.newest_stop_txn; + newest_stop_ts = mod->mod_replace.ta.newest_stop_ts; obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); } else if (__wt_ref_addr_copy(session, ref, &addr)) { tag = "WT_REF address"; - newest_stop_txn = addr.newest_stop_txn; - newest_stop_ts = addr.newest_stop_ts; + newest_stop_txn = addr.ta.newest_stop_txn; + newest_stop_ts = addr.ta.newest_stop_ts; obsolete = __wt_txn_visible_all(session, newest_stop_txn, newest_stop_ts); } else tag = "unexpected page state"; @@ -469,12 +469,7 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) btree->syncing = WT_BTREE_SYNC_RUNNING; is_hs = WT_IS_HS(btree); - /* - * Add in history store reconciliation for standard files. 
- * - * FIXME-PM-1521: Remove the history store check, and assert that no updates from the - * history store are copied to the history store recursively. - */ + /* Add in history store reconciliation for standard files. */ rec_flags = WT_REC_CHECKPOINT; if (!is_hs && !WT_IS_METADATA(btree->dhandle)) rec_flags |= WT_REC_HS; diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index c9708e9511b..0b3d4da2459 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -255,17 +255,13 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) * Create a fake, unpacked parent cell for the tree based on the checkpoint information. */ memset(&addr_unpack, 0, sizeof(addr_unpack)); - addr_unpack.newest_start_durable_ts = ckpt->start_durable_ts; - addr_unpack.newest_stop_durable_ts = ckpt->stop_durable_ts; - addr_unpack.oldest_start_ts = ckpt->oldest_start_ts; - addr_unpack.newest_stop_ts = ckpt->newest_stop_ts; - if (ckpt->write_gen > S2C(session)->base_write_gen) { - addr_unpack.oldest_start_txn = ckpt->oldest_start_txn; - addr_unpack.newest_stop_txn = ckpt->newest_stop_txn; - } else { - addr_unpack.oldest_start_txn = WT_TXN_NONE; - addr_unpack.newest_stop_txn = WT_TXN_MAX; + __wt_time_aggregate_copy(&addr_unpack.ta, &ckpt->ta); + if (ckpt->write_gen <= S2C(session)->base_write_gen) { + addr_unpack.ta.oldest_start_txn = WT_TXN_NONE; + addr_unpack.ta.newest_stop_txn = WT_TXN_MAX; } + if (ckpt->ta.prepare) + F_SET(&addr_unpack, WT_CELL_UNPACK_PREPARE); addr_unpack.raw = WT_CELL_ADDR_INT; /* Verify the tree. 
*/ @@ -367,15 +363,14 @@ __verify_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf) WT_ADDR_COPY addr; WT_DECL_ITEM(tmp); WT_DECL_RET; - char tp_string[2][WT_TP_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; WT_ERR(__wt_scr_alloc(session, 0, &tmp)); if (__wt_ref_addr_copy(session, ref, &addr)) { - WT_ERR(__wt_buf_fmt(session, buf, "%s %s,%s", - __wt_addr_string(session, addr.addr, addr.size, tmp), - __wt_time_pair_to_string(addr.oldest_start_ts, addr.oldest_start_txn, tp_string[0]), - __wt_time_pair_to_string(addr.newest_stop_ts, addr.newest_stop_txn, tp_string[1]))); + WT_ERR( + __wt_buf_fmt(session, buf, "%s %s", __wt_addr_string(session, addr.addr, addr.size, tmp), + __wt_time_aggregate_to_string(&addr.ta, time_string))); } else WT_ERR(__wt_buf_fmt(session, buf, "%s -/-,-/-", __wt_addr_string(session, NULL, 0, tmp))); @@ -391,28 +386,41 @@ err: static int __verify_addr_ts(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *unpack, WT_VSTUFF *vs) { - char ts_string[2][WT_TS_INT_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; - if (unpack->oldest_start_ts != WT_TS_NONE && unpack->newest_stop_ts == WT_TS_NONE) + if (unpack->ta.oldest_start_ts != WT_TS_NONE && unpack->ta.newest_stop_ts == WT_TS_NONE) WT_RET_MSG(session, WT_ERROR, "internal page reference at %s has a newest stop " "timestamp of 0", __verify_addr_string(session, ref, vs->tmp1)); - if (unpack->oldest_start_ts > unpack->newest_stop_ts) + if (unpack->ta.oldest_start_ts > unpack->ta.newest_stop_ts) WT_RET_MSG(session, WT_ERROR, "internal page reference at %s has an oldest start " - "timestamp %s newer than its newest stop timestamp %s", + "timestamp newer than its newest stop timestamp; time window %s", __verify_addr_string(session, ref, vs->tmp1), - __wt_timestamp_to_string(unpack->oldest_start_ts, ts_string[0]), - __wt_timestamp_to_string(unpack->newest_stop_ts, ts_string[1])); - if (unpack->oldest_start_txn > unpack->newest_stop_txn) + 
__wt_time_window_to_string(&unpack->tw, time_string)); + if (unpack->ta.oldest_start_txn > unpack->ta.newest_stop_txn) WT_RET_MSG(session, WT_ERROR, "internal page reference at %s has an oldest start " - "transaction (%" PRIu64 - ") newer than its newest stop " - "transaction (%" PRIu64 ")", - __verify_addr_string(session, ref, vs->tmp1), unpack->oldest_start_txn, - unpack->newest_stop_txn); + "transaction newer than its newest stop " + "transaction; time aggregate %s", + __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.oldest_start_ts > unpack->ta.newest_start_durable_ts) + WT_RET_MSG(session, WT_ERROR, + "internal page reference at %s has an oldest start " + "timestamp newer than its newest start durable " + "timestamp; time aggregate %s", + __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.newest_stop_ts != WT_TS_MAX && + unpack->ta.newest_stop_ts > unpack->ta.newest_stop_durable_ts) + WT_RET_MSG(session, WT_ERROR, + "internal page reference at %s has a newest stop " + "timestamp newer than its newest stop durable " + "timestamp; time aggregate %s", + __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack->ta, time_string)); return (0); } @@ -781,7 +789,7 @@ __verify_ts_stable_cmp(WT_SESSION_IMPL *session, WT_ITEM *key, WT_REF *ref, uint { WT_BTREE *btree; WT_DECL_RET; - char tp_string[2][WT_TP_STRING_SIZE]; + char tp_string[2][WT_TS_INT_STRING_SIZE]; bool start; btree = S2BT(session); @@ -949,7 +957,7 @@ __verify_page_content( uint64_t recno, rle; uint32_t cell_num; uint8_t *p; - char ts_string[2][WT_TS_INT_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; bool found_ovfl; btree = S2BT(session); @@ -992,108 +1000,126 @@ __verify_page_content( case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: - if (unpack.oldest_start_ts != WT_TS_NONE && unpack.newest_stop_ts 
== WT_TS_NONE) + if (unpack.ta.oldest_start_ts != WT_TS_NONE && unpack.ta.newest_stop_ts == WT_TS_NONE) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has a " - "newest stop timestamp of 0", - cell_num - 1, __verify_addr_string(session, ref, vs->tmp1)); - if (unpack.oldest_start_ts > unpack.newest_stop_ts) + "newest stop timestamp of 0; time window %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack.ta, time_string)); + if (unpack.ta.oldest_start_ts > unpack.ta.newest_stop_ts) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has an " - "oldest start timestamp %s newer than " - "its newest stop timestamp %s", + "oldest start timestamp newer than " + "its newest stop timestamp; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_timestamp_to_string(unpack.oldest_start_ts, ts_string[0]), - __wt_timestamp_to_string(unpack.newest_stop_ts, ts_string[1])); - if (unpack.oldest_start_txn > unpack.newest_stop_txn) { + __wt_time_aggregate_to_string(&unpack.ta, time_string)); + if (unpack.ta.oldest_start_txn > unpack.ta.newest_stop_txn) { WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 - " on page at %s has an " - "oldest start transaction (%" PRIu64 - ") " - "newer than its newest stop transaction " - "(%" PRIu64 ")", + " on page " + "at %s has an oldest start transaction newer than " + "its newest stop transaction; time aggregate %s ", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - unpack.oldest_start_txn, unpack.newest_stop_txn); + __wt_time_aggregate_to_string(&unpack.ta, time_string)); } + if (unpack.ta.oldest_start_ts > unpack.ta.newest_start_durable_ts) + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 + " on page at %s has an " + "oldest start timestamp newer than " + "its newest start durable timestamp; time aggregate %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack.ta, time_string)); 
+ if (unpack.ta.newest_stop_ts != WT_TS_MAX && + unpack.ta.newest_stop_ts > unpack.ta.newest_stop_durable_ts) + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 + " on page at %s has a " + "newest stop timestamp newer than " + "its newest stop durable timestamp; time aggregate %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_aggregate_to_string(&unpack.ta, time_string)); - /* - * FIXME-prepare-support: Enable verification once all durable is finished. - * - * WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start durable", - * unpack.newest_start_durable_ts, "start durable", - * addr_unpack->newest_start_durable_ts, false, vs)); - */ + if (addr_unpack->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start durable", + unpack.ta.newest_start_durable_ts, "start durable", + addr_unpack->ta.newest_start_durable_ts, false, vs)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "oldest start", - unpack.oldest_start_ts, "oldest start", addr_unpack->oldest_start_ts, true, vs)); - WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "oldest start", - unpack.oldest_start_txn, "oldest start", addr_unpack->oldest_start_txn, true, dsk, + unpack.ta.oldest_start_ts, "oldest start", addr_unpack->ta.oldest_start_ts, true, vs)); + WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "oldest start", + unpack.ta.oldest_start_txn, "oldest start", addr_unpack->ta.oldest_start_txn, true, + dsk, vs)); - /* - * FIXME-prepare-support: Enable verification once all durable is finished. 
- * - * WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop durable", - * unpack.newest_stop_durable_ts, "stop durable", addr_unpack->newest_stop_durable_ts, - * false, vs)); - */ + if (addr_unpack->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop durable", + unpack.ta.newest_stop_durable_ts, "stop durable", + addr_unpack->ta.newest_stop_durable_ts, false, vs)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "newest stop", - unpack.newest_stop_ts, "newest stop", addr_unpack->newest_stop_ts, false, vs)); + unpack.ta.newest_stop_ts, "newest stop", addr_unpack->ta.newest_stop_ts, false, vs)); WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "newest stop", - unpack.newest_stop_txn, "newest stop", addr_unpack->newest_stop_txn, false, dsk, vs)); - WT_RET(__verify_ts_stable_cmp( - session, NULL, ref, cell_num - 1, addr_unpack->start_ts, addr_unpack->stop_ts, vs)); + unpack.ta.newest_stop_txn, "newest stop", addr_unpack->ta.newest_stop_txn, false, dsk, + vs)); + WT_RET(__verify_ts_stable_cmp(session, NULL, ref, cell_num - 1, + addr_unpack->ta.oldest_start_ts, addr_unpack->ta.newest_stop_ts, vs)); break; case WT_CELL_DEL: case WT_CELL_VALUE: case WT_CELL_VALUE_COPY: case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_SHORT: - if (unpack.start_ts != WT_TS_NONE && unpack.stop_ts == WT_TS_NONE) + if (unpack.tw.start_ts != WT_TS_NONE && unpack.tw.stop_ts == WT_TS_NONE) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has a stop " - "timestamp of 0", - cell_num - 1, __verify_addr_string(session, ref, vs->tmp1)); - if (unpack.start_ts > unpack.stop_ts) + "timestamp of 0; time window %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_window_to_string(&unpack.tw, time_string)); + if (unpack.tw.start_ts > unpack.tw.stop_ts) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has a " - "start timestamp %s newer than its stop " - "timestamp %s", + "start 
timestamp newer than its stop " + "timestamp; time window %s", cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), - __wt_timestamp_to_string(unpack.start_ts, ts_string[0]), - __wt_timestamp_to_string(unpack.stop_ts, ts_string[1])); - if (unpack.start_txn > unpack.stop_txn) + __wt_time_window_to_string(&unpack.tw, time_string)); + if (unpack.tw.start_txn > unpack.tw.stop_txn) WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has a " - "start transaction %" PRIu64 - "newer than " - "its stop transaction %" PRIu64, - cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), unpack.start_txn, - unpack.stop_txn); - - /* - * FIXME-prepare-support: Enable verification once all durable is finished. - * - * WT_RET( - * __verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.durable_start_ts, - * "durable start", addr_unpack->newest_start_durable_ts, true, vs)); - */ - WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.start_ts, - "oldest start", addr_unpack->oldest_start_ts, true, vs)); - WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "start", unpack.start_txn, - "oldest start", addr_unpack->oldest_start_txn, true, dsk, vs)); - /* - * FIXME-prepare-support: Enable verification once all durable is finished. 
- * - * WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", - * unpack.durable_stop_ts, - * "durable stop", addr_unpack->newest_stop_durable_ts, true, vs)); - */ - WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop", unpack.stop_ts, - "newest stop", addr_unpack->newest_stop_ts, false, vs)); - WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "stop", unpack.stop_txn, - "newest stop", addr_unpack->newest_stop_txn, false, dsk, vs)); + "start transaction newer than " + "its stop transaction; time window %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_window_to_string(&unpack.tw, time_string)); + if (unpack.tw.start_ts > unpack.tw.durable_start_ts) + WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 + " on page at %s has a " + "start timestamp newer than its start durable " + "timestamp; time window %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_window_to_string(&unpack.tw, time_string)); + if (unpack.tw.stop_ts != WT_TS_MAX && unpack.tw.stop_ts > unpack.tw.durable_stop_ts) + WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 + " on page at %s has a " + "stop timestamp newer than its stop durable " + "timestamp; time window %s", + cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), + __wt_time_window_to_string(&unpack.tw, time_string)); + + if (addr_unpack->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start durable", + unpack.tw.durable_start_ts, "newest durable start", + addr_unpack->ta.newest_start_durable_ts, false, vs)); + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.tw.start_ts, + "oldest start", addr_unpack->ta.oldest_start_ts, true, vs)); + WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "start", unpack.tw.start_txn, + "oldest start", addr_unpack->ta.oldest_start_txn, true, dsk, vs)); + if (addr_unpack->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_ts_addr_cmp(session, 
ref, cell_num - 1, "stop durable", + unpack.tw.durable_stop_ts, "newest durable stop", + addr_unpack->ta.newest_stop_durable_ts, false, vs)); + WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop", unpack.tw.stop_ts, + "newest stop", addr_unpack->ta.newest_stop_ts, false, vs)); + WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "stop", unpack.tw.stop_txn, + "newest stop", addr_unpack->ta.newest_stop_txn, false, dsk, vs)); WT_RET(__verify_ts_stable_cmp( - session, NULL, ref, cell_num - 1, unpack.start_ts, unpack.stop_ts, vs)); + session, NULL, ref, cell_num - 1, unpack.tw.start_ts, unpack.tw.stop_ts, vs)); break; } @@ -1106,7 +1132,7 @@ __verify_page_content( continue; WT_RET(__wt_row_leaf_key(session, page, rip++, vs->tmp1, false)); - WT_RET(__verify_key_hs(session, vs->tmp1, unpack.start_ts, vs)); + WT_RET(__verify_key_hs(session, vs->tmp1, unpack.tw.start_ts, vs)); #ifdef HAVE_DIAGNOSTIC if (vs->dump_history) @@ -1117,7 +1143,7 @@ __verify_page_content( p = vs->tmp1->mem; WT_RET(__wt_vpack_uint(&p, 0, recno)); vs->tmp1->size = WT_PTRDIFF(p, vs->tmp1->mem); - WT_RET(__verify_key_hs(session, vs->tmp1, unpack.start_ts, vs)); + WT_RET(__verify_key_hs(session, vs->tmp1, unpack.tw.start_ts, vs)); #ifdef HAVE_DIAGNOSTIC if (vs->dump_history) diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c index 7b80327a22c..a1e96d41dc9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -268,7 +268,7 @@ static int __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t cell_num, WT_ADDR *addr, const char *tag, const WT_PAGE_HEADER *dsk) { - char ts_string[2][WT_TS_INT_STRING_SIZE]; + char time_string[WT_TIME_STRING_SIZE]; /* * Check timestamp and transaction order, and optionally against parent values. 
Timestamps and @@ -284,43 +284,57 @@ __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: - if (unpack->oldest_start_ts != WT_TS_NONE && unpack->newest_stop_ts == WT_TS_NONE) + if (unpack->ta.oldest_start_ts != WT_TS_NONE && unpack->ta.newest_stop_ts == WT_TS_NONE) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has a newest stop " - "timestamp of 0", - cell_num - 1, tag); - if (unpack->oldest_start_ts > unpack->newest_stop_ts) + "timestamp of 0; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.oldest_start_ts > unpack->ta.newest_stop_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has an oldest " + "start timestamp newer than its newest stop " + "timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.oldest_start_txn > unpack->ta.newest_stop_txn) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has an oldest " - "start timestamp %s newer than its newest stop " - "timestamp %s", - cell_num - 1, tag, __wt_timestamp_to_string(unpack->oldest_start_ts, ts_string[0]), - __wt_timestamp_to_string(unpack->newest_stop_ts, ts_string[1])); - if (unpack->oldest_start_txn > unpack->newest_stop_txn) + "start transaction newer than its " + "newest stop transaction; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); + if (unpack->ta.oldest_start_ts > unpack->ta.newest_start_durable_ts) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has an oldest " - "start transaction %" PRIu64 - " newer than its " - "newest stop transaction %" PRIu64, - cell_num - 1, tag, unpack->oldest_start_txn, unpack->newest_stop_txn); + "start timestamp newer than its newest start durable " + "timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, 
time_string)); + if (unpack->ta.newest_stop_ts != WT_TS_MAX && + unpack->ta.newest_stop_ts > unpack->ta.newest_stop_durable_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a newest " + "stop timestamp newer than its newest stop durable " + "timestamp; time aggregate %s", + cell_num - 1, tag, __wt_time_aggregate_to_string(&unpack->ta, time_string)); if (addr == NULL) break; - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", - unpack->newest_start_durable_ts, "start durable", addr->newest_start_durable_ts, false, - tag)); + if (addr->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", + unpack->ta.newest_start_durable_ts, "start durable", addr->ta.newest_start_durable_ts, + false, tag)); WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "oldest start", - unpack->oldest_start_ts, "oldest start", addr->oldest_start_ts, true, tag)); + unpack->ta.oldest_start_ts, "oldest start", addr->ta.oldest_start_ts, true, tag)); WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "oldest start", - unpack->oldest_start_txn, "oldest start", addr->oldest_start_txn, true, tag, dsk)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", - unpack->newest_stop_durable_ts, "stop durable", addr->newest_stop_durable_ts, false, - tag)); + unpack->ta.oldest_start_txn, "oldest start", addr->ta.oldest_start_txn, true, tag, dsk)); + + if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", + unpack->ta.newest_stop_durable_ts, "stop durable", addr->ta.newest_stop_durable_ts, + false, tag)); WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "newest stop", - unpack->newest_stop_ts, "newest stop", addr->newest_stop_ts, false, tag)); + unpack->ta.newest_stop_ts, "newest stop", addr->ta.newest_stop_ts, false, tag)); WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "newest stop", - 
unpack->newest_stop_txn, "newest stop", addr->newest_stop_txn, false, tag, dsk)); + unpack->ta.newest_stop_txn, "newest stop", addr->ta.newest_stop_txn, false, tag, dsk)); break; case WT_CELL_DEL: case WT_CELL_VALUE: @@ -328,36 +342,52 @@ __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_OVFL_RM: case WT_CELL_VALUE_SHORT: - if (unpack->start_ts != WT_TS_NONE && unpack->stop_ts == WT_TS_NONE) + if (unpack->tw.start_ts != WT_TS_NONE && unpack->tw.stop_ts == WT_TS_NONE) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has a stop " - "timestamp of 0", - cell_num - 1, tag); - if (unpack->start_ts > unpack->stop_ts) + "timestamp of 0; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); + if (unpack->tw.start_ts > unpack->tw.stop_ts) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has a start " - "timestamp %s newer than its stop timestamp %s", - cell_num - 1, tag, __wt_timestamp_to_string(unpack->start_ts, ts_string[0]), - __wt_timestamp_to_string(unpack->stop_ts, ts_string[1])); - if (unpack->start_txn > unpack->stop_txn) + "timestamp newer than its stop timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); + if (unpack->tw.start_txn > unpack->tw.stop_txn) WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has a start " - "transaction %" PRIu64 - " newer than its stop " - "transaction %" PRIu64, - cell_num - 1, tag, unpack->start_txn, unpack->stop_txn); + "transaction newer than its stop " + "transaction; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); + if (unpack->tw.start_ts > unpack->tw.durable_start_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a start " + "timestamp newer than its durable start timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); + if (unpack->tw.stop_ts != 
WT_TS_MAX && unpack->tw.stop_ts > unpack->tw.durable_stop_ts) + WT_RET_VRFY(session, "cell %" PRIu32 + " on page at %s has a stop " + "timestamp newer than its durable stop timestamp; time window %s", + cell_num - 1, tag, __wt_time_window_to_string(&unpack->tw, time_string)); if (addr == NULL) break; - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start", unpack->start_ts, - "oldest start", addr->oldest_start_ts, true, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "start", unpack->start_txn, - "oldest start", addr->oldest_start_txn, true, tag, dsk)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop", unpack->stop_ts, - "newest stop", addr->newest_stop_ts, false, tag)); - WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "stop", unpack->stop_txn, - "newest stop", addr->newest_stop_txn, false, tag, dsk)); + if (addr->ta.newest_start_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable", + unpack->tw.durable_start_ts, "newest start durable", addr->ta.newest_start_durable_ts, + false, tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start", unpack->tw.start_ts, + "oldest start", addr->ta.oldest_start_ts, true, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "start", unpack->tw.start_txn, + "oldest start", addr->ta.oldest_start_txn, true, tag, dsk)); + if (addr->ta.newest_stop_durable_ts != WT_TS_NONE) + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable", + unpack->tw.durable_stop_ts, "newest stop durable", addr->ta.newest_stop_durable_ts, + false, tag)); + WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop", unpack->tw.stop_ts, + "newest stop", addr->ta.newest_stop_ts, false, tag)); + WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "stop", unpack->tw.stop_txn, + "newest stop", addr->ta.newest_stop_txn, false, tag, dsk)); break; } @@ -707,10 +737,7 @@ __verify_dsk_col_var( struct { const void *data; size_t 
size; - wt_timestamp_t start_ts; - uint64_t start_txn; - wt_timestamp_t stop_ts; - uint64_t stop_txn; + WT_TIME_WINDOW tw; bool deleted; } last; WT_BM *bm; @@ -728,10 +755,7 @@ __verify_dsk_col_var( last.data = NULL; last.size = 0; - last.start_ts = WT_TS_NONE; - last.start_txn = WT_TXN_NONE; - last.stop_ts = WT_TS_NONE; - last.stop_txn = WT_TXN_NONE; + __wt_time_window_init(&last.tw); last.deleted = false; cell_num = 0; @@ -760,11 +784,11 @@ __verify_dsk_col_var( } /* - * Compare the last two items and see if reconciliation missed a chance for RLE encoding. We - * don't have to care about data encoding or anything else, a byte comparison is enough. + * Compare the last two items and see if reconciliation missed a chance for RLE encoding. + * The time windows must match and we otherwise don't have to care about data encoding, a + * byte comparison is enough. */ - if (unpack->start_ts != last.start_ts || unpack->start_txn != last.start_txn || - unpack->stop_ts != last.stop_ts || unpack->stop_txn != last.stop_txn) + if (!__wt_time_windows_equal(&unpack->tw, &last.tw)) ; else if (last.deleted) { if (cell_type == WT_CELL_DEL) @@ -777,10 +801,7 @@ match_err: "have been run-length encoded", cell_num - 1, cell_num, tag); - last.start_ts = unpack->start_ts; - last.start_txn = unpack->start_txn; - last.stop_ts = unpack->stop_ts; - last.stop_txn = unpack->stop_txn; + __wt_time_window_copy(&last.tw, &unpack->tw); switch (cell_type) { case WT_CELL_DEL: last.data = NULL; diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index bfd3ecb9f5c..a4a4f8b662d 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -34,7 +34,7 @@ __wt_col_modify(WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_U btree = cbt->btree; ins = NULL; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); upd = upd_arg; append 
= logged = false; @@ -137,7 +137,7 @@ __wt_col_modify(WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_U } /* Avoid a data copy in WT_CURSOR.update. */ - cbt->modify_update = upd; + __wt_upd_value_assign(cbt->modify_update, upd); /* * Point the new WT_UPDATE item to the next element in the list. If we get it right, the @@ -188,7 +188,7 @@ __wt_col_modify(WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_U logged = true; /* Avoid a data copy in WT_CURSOR.update. */ - cbt->modify_update = upd; + __wt_upd_value_assign(cbt->modify_update, upd); } else upd_size = __wt_update_list_memsize(upd); ins->upd = upd; diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index a6d56c9499d..e98cf094421 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -74,7 +74,7 @@ __wt_col_search( uint32_t base, indx, limit, read_flags; int depth; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); btree = S2BT(session); current = NULL; diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index b7b1c5edff8..6aa44046cb8 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -58,7 +58,7 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, ins = NULL; page = cbt->ref->page; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); upd = upd_arg; logged = false; @@ -109,7 +109,7 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, logged = true; /* Avoid WT_CURSOR.update data copy. 
*/ - cbt->modify_update = upd; + __wt_upd_value_assign(cbt->modify_update, upd); } else { upd_size = __wt_update_list_memsize(upd); @@ -169,7 +169,7 @@ __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, logged = true; /* Avoid WT_CURSOR.update data copy. */ - cbt->modify_update = upd; + __wt_upd_value_assign(cbt->modify_update, upd); } else upd_size = __wt_update_list_memsize(upd); diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 98ae6f66daf..917705f6f9c 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -224,7 +224,7 @@ __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *le int cmp, depth; bool append_check, descend_right, done; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); btree = S2BT(session); collator = btree->collator; item = cbt->tmp; diff --git a/src/third_party/wiredtiger/src/conn/conn_capacity.c b/src/third_party/wiredtiger/src/conn/conn_capacity.c index 31e5ab78478..8b5f7299fd4 100644 --- a/src/third_party/wiredtiger/src/conn/conn_capacity.c +++ b/src/third_party/wiredtiger/src/conn/conn_capacity.c @@ -115,7 +115,7 @@ __capacity_server(void *arg) if (0) { err: - WT_PANIC_MSG(session, ret, "capacity server error"); + WT_IGNORE_RET(__wt_panic(session, ret, "capacity server error")); } return (WT_THREAD_RET_VALUE); } diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c index 122d310934d..5c8ad02b01e 100644 --- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c +++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c @@ -116,7 +116,7 @@ __ckpt_server(void *arg) if (0) { err: - WT_PANIC_MSG(session, ret, "checkpoint server error"); + WT_IGNORE_RET(__wt_panic(session, ret, "checkpoint server error")); } return (WT_THREAD_RET_VALUE); } diff --git 
a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 3d4edc58167..beb222d08bb 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -166,7 +166,7 @@ __wt_conn_dhandle_alloc(WT_SESSION_IMPL *session, const char *uri, const char *c dhandle = (WT_DATA_HANDLE *)table; dhandle->type = WT_DHANDLE_TYPE_TABLE; } else - WT_PANIC_RET(session, EINVAL, "illegal handle allocation URI %s", uri); + WT_RET_PANIC(session, EINVAL, "illegal handle allocation URI %s", uri); /* Btree handles keep their data separate from the interface. */ if (dhandle->type == WT_DHANDLE_TYPE_BTREE) { diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 2bc89996afd..1770505d566 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -589,7 +589,7 @@ __log_file_server(void *arg) * file system may not support truncate: both are OK, it's just more work during * cursor traversal. 
*/ - if (!conn->hot_backup && conn->log_cursors == 0) { + if (conn->hot_backup_start == 0 && conn->log_cursors == 0) { WT_WITH_HOTBACKUP_READ_LOCK(session, WT_ERR_ERROR_OK(__wt_ftruncate(session, close_fh, close_end_lsn.l.offset), ENOTSUP, false), @@ -661,7 +661,7 @@ __log_file_server(void *arg) if (0) { err: - WT_PANIC_MSG(session, ret, "log close server error"); + WT_IGNORE_RET(__wt_panic(session, ret, "log close server error")); } WT_STAT_CONN_INCRV(session, log_server_sync_blocked, yield_count); if (locked) @@ -856,7 +856,7 @@ __log_wrlsn_server(void *arg) __wt_log_wrlsn(session, NULL); if (0) { err: - WT_PANIC_MSG(session, ret, "log wrlsn server error"); + WT_IGNORE_RET(__wt_panic(session, ret, "log wrlsn server error")); } return (WT_THREAD_RET_VALUE); } @@ -947,7 +947,7 @@ __log_server(void *arg) if (0) { err: - WT_PANIC_MSG(session, ret, "log server error"); + WT_IGNORE_RET(__wt_panic(session, ret, "log server error")); } return (WT_THREAD_RET_VALUE); } diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index 3c28ac121ad..076e64c73ce 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -39,6 +39,8 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) */ conn->default_session = session; + __wt_seconds(session, &conn->ckpt_finish_secs); + /* * Publish: there must be a barrier to ensure the connection structure fields are set before * other threads read from the pointer. @@ -208,9 +210,18 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) /* * Run recovery. NOTE: This call will start (and stop) eviction if recovery is required. * Recovery must run before the history store table is created (because recovery will update the - * metadata), and before eviction is started for real. + * metadata, and set the maximum file id seen), and before eviction is started for real. 
*/ - WT_RET(__wt_txn_recover(session)); + WT_RET(__wt_txn_recover(session, cfg)); + + /* Initialize metadata tracking, required before creating tables. */ + WT_RET(__wt_meta_track_init(session)); + + /* + * Create the history store file. This will only actually create it on upgrade or when creating + * a new database. + */ + WT_RET(__wt_hs_create(session, cfg)); /* * Start the optional logging/archive threads. NOTE: The log manager must be started before @@ -219,12 +230,6 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_RET(__wt_logmgr_open(session)); - /* Initialize metadata tracking, required before creating tables. */ - WT_RET(__wt_meta_track_init(session)); - - /* Create the history store table. */ - WT_RET(__wt_hs_create(session, cfg)); - /* * Start eviction threads. NOTE: Eviction must be started after the history store table is * created. diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c index 455f10ea905..b3a1b3e979a 100644 --- a/src/third_party/wiredtiger/src/conn/conn_stat.c +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -596,7 +596,7 @@ __statlog_server(void *arg) if (0) { err: - WT_PANIC_MSG(session, ret, "statistics log server error"); + WT_IGNORE_RET(__wt_panic(session, ret, "statistics log server error")); } __wt_buf_free(session, &path); __wt_buf_free(session, &tmp); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index eeb7ffa514c..934acd228e8 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -325,7 +325,7 @@ __sweep_server(void *arg) if (0) { err: - WT_PANIC_MSG(session, ret, "handle sweep server error"); + WT_IGNORE_RET(__wt_panic(session, ret, "handle sweep server error")); } return (WT_THREAD_RET_VALUE); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c 
b/src/third_party/wiredtiger/src/cursor/cur_backup.c index 05dc7e2ff9b..4011a62b0d5 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -319,7 +319,7 @@ __backup_add_id(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval) * We didn't find an entry. This should not happen. */ if (i == WT_BLKINCR_MAX) - WT_PANIC_RET(session, WT_NOTFOUND, "Could not find an incremental backup slot to use"); + WT_RET_PANIC(session, WT_NOTFOUND, "Could not find an incremental backup slot to use"); /* Use the slot. */ if (blk->id_str != NULL) @@ -610,7 +610,7 @@ __backup_start( * Single thread hot backups: we're holding the schema lock, so we know we'll serialize with * other attempts to start a hot backup. */ - if (conn->hot_backup && !is_dup) + if (conn->hot_backup_start != 0 && !is_dup) WT_RET_MSG(session, EINVAL, "there is already a backup cursor open"); if (F_ISSET(session, WT_SESSION_BACKUP_DUP) && is_dup) @@ -766,7 +766,7 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) WT_TRET(__wt_backup_file_remove(session)); /* Checkpoint deletion and next hot backup can proceed. 
*/ - WT_WITH_HOTBACKUP_WRITE_LOCK(session, conn->hot_backup = false); + WT_WITH_HOTBACKUP_WRITE_LOCK(session, conn->hot_backup_start = 0); F_CLR(session, WT_SESSION_BACKUP_CURSOR); return (ret); diff --git a/src/third_party/wiredtiger/src/cursor/cur_bulk.c b/src/third_party/wiredtiger/src/cursor/cur_bulk.c index 09a9057355d..6eb4351276b 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_bulk.c +++ b/src/third_party/wiredtiger/src/cursor/cur_bulk.c @@ -18,8 +18,8 @@ __bulk_col_keycmp_err(WT_CURSOR_BULK *cbulk) WT_CURSOR *cursor; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; cursor = &cbulk->cbt.iface; + session = CUR2S(cbulk); WT_RET_MSG(session, EINVAL, "bulk-load presented with out-of-order keys: %" PRIu64 " is less " @@ -196,8 +196,8 @@ __bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk) WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; cursor = &cbulk->cbt.iface; + session = CUR2S(cbulk); WT_ERR(__wt_scr_alloc(session, 512, &a)); WT_ERR(__wt_scr_alloc(session, 512, &b)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index 7dfb3bca218..b2d75494110 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -542,7 +542,7 @@ __curfile_cache(WT_CURSOR *cursor) WT_SESSION_IMPL *session; cbt = (WT_CURSOR_BTREE *)cursor; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); cbt->dhandle = cbt->btree->dhandle; WT_TRET(__wt_cursor_cache(cursor, cbt->dhandle)); @@ -565,7 +565,7 @@ __curfile_reopen(WT_CURSOR *cursor, bool check_only) cbt = (WT_CURSOR_BTREE *)cursor; dhandle = cbt->dhandle; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); if (check_only) return (WT_DHANDLE_CAN_REOPEN(dhandle) ? 
0 : WT_NOTFOUND); @@ -655,7 +655,7 @@ __curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], cacheable = F_ISSET(session, WT_SESSION_CACHE_CURSORS) && !bulk; WT_RET(__wt_calloc(session, 1, csize, &cbt)); - cursor = (WT_CURSOR *)cbt; + cursor = &cbt->iface; *cursor = iface; cursor->session = (WT_SESSION *)session; cursor->internal_uri = btree->dhandle->name; diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c index 0f1fab36bf8..9cc1ba83a4f 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_index.c +++ b/src/third_party/wiredtiger/src/cursor/cur_index.c @@ -103,7 +103,7 @@ __curindex_move(WT_CURSOR_INDEX *cindex) WT_SESSION_IMPL *session; u_int i; - session = (WT_SESSION_IMPL *)cindex->iface.session; + session = CUR2S(cindex); first = NULL; /* Point the public cursor to the key in the child. */ diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c index bb2497f3d19..06159cb54bd 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_join.c +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -35,7 +35,7 @@ __wt_curjoin_joined(WT_CURSOR *cursor) WT_GCC_FUNC_ATTRIBUTE((cold)) { WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); WT_RET_MSG(session, ENOTSUP, "cursor is being used in a join"); } @@ -770,7 +770,7 @@ __curjoin_init_bloom( goto done; WT_ERR(ret); } else - WT_PANIC_ERR(session, EINVAL, "fatal error in join cursor position state"); + WT_ERR_PANIC(session, EINVAL, "fatal error in join cursor position state"); } collator = (entry->index == NULL) ? 
NULL : entry->index->collator; while (ret == 0) { diff --git a/src/third_party/wiredtiger/src/cursor/cur_json.c b/src/third_party/wiredtiger/src/cursor/cur_json.c index 692ab34d210..7e9ac93eea6 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_json.c +++ b/src/third_party/wiredtiger/src/cursor/cur_json.c @@ -373,7 +373,7 @@ __wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, const char *beginkey, *end, *lparen, *p; json = (WT_CURSOR_JSON *)cursor->json_private; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); beginkey = colconf->str; end = beginkey + colconf->len; diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c index 6743845cdba..f3e2b2930dc 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_stat.c +++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c @@ -487,7 +487,7 @@ __curstat_join_desc(WT_CURSOR_STAT *cst, int slot, const char **resultp) const char *static_desc; sgrp = &cst->u.join_stats_group; - session = (WT_SESSION_IMPL *)sgrp->join_cursor->iface.session; + session = CUR2S(sgrp->join_cursor); WT_RET(__wt_stat_join_desc(cst, slot, &static_desc)); len = strlen("join: ") + strlen(sgrp->desc_prefix) + strlen(static_desc) + 1; WT_RET(__wt_realloc(session, NULL, len, &cst->desc_buf)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index a65bb55a8ba..fd81465eb76 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -29,7 +29,7 @@ __wt_cursor_cached(WT_CURSOR *cursor) { WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); WT_RET_MSG(session, ENOTSUP, "Cursor has been closed"); } @@ -42,7 +42,7 @@ __wt_cursor_notsup(WT_CURSOR *cursor) { WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); WT_RET_MSG(session, ENOTSUP, 
"Unsupported cursor operation"); } @@ -142,7 +142,7 @@ __wt_cursor_modify_value_format_notsup(WT_CURSOR *cursor, WT_MODIFY *entries, in WT_UNUSED(nentries); if (cursor->value_format != NULL && strlen(cursor->value_format) != 0) { - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); WT_RET_MSG(session, ENOTSUP, "WT_CURSOR.modify only supported for 'S' and 'u' value " "formats"); @@ -221,7 +221,7 @@ __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) WT_GCC_FUNC_ATTRIBUTE((cold) { WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); WT_RET_MSG(session, cursor->saved_err == 0 ? EINVAL : cursor->saved_err, "requires %s be set", key ? "key" : "value"); @@ -238,7 +238,7 @@ __wt_cursor_copy_release_item(WT_CURSOR *cursor, WT_ITEM *item) WT_GCC_FUNC_ATTR WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); /* Bail out if the item has been cleared. */ if (item->data == NULL) @@ -646,7 +646,7 @@ __wt_cursor_cache(WT_CURSOR *cursor, WT_DATA_HANDLE *dhandle) WT_SESSION_IMPL *session; uint64_t bucket; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); WT_ASSERT(session, !F_ISSET(cursor, WT_CURSTD_CACHED) && dhandle != NULL); WT_TRET(cursor->reset(cursor)); @@ -687,7 +687,7 @@ __wt_cursor_reopen(WT_CURSOR *cursor, WT_DATA_HANDLE *dhandle) WT_SESSION_IMPL *session; uint64_t bucket; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_CACHED)); if (dhandle != NULL) { @@ -892,7 +892,7 @@ __wt_cursor_close(WT_CURSOR *cursor) { WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); if (F_ISSET(cursor, WT_CURSTD_OPEN)) { TAILQ_REMOVE(&session->cursors, cursor, q); @@ -1066,7 +1066,7 @@ __wt_cursor_init( WT_SESSION_IMPL *session; bool readonly; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); if 
(cursor->internal_uri == NULL) WT_RET(__wt_strdup(session, uri, &cursor->internal_uri)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index 4fd78188c39..b5edf359059 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -150,7 +150,7 @@ __apply_idx(WT_CURSOR_TABLE *ctable, size_t func_off, bool skip_immutable) int (*f)(WT_CURSOR *); cp = ctable->idx_cursors; - session = (WT_SESSION_IMPL *)ctable->iface.session; + session = CUR2S(ctable); for (i = 0; i < ctable->table->nindices; i++, cp++) { idx = ctable->table->indices[i]; @@ -729,7 +729,7 @@ __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop) int cmp; ctable = (start != NULL) ? start : stop; - session = (WT_SESSION_IMPL *)ctable->iface.session; + session = CUR2S(ctable); wt_start = &start->iface; wt_stop = &stop->iface; @@ -877,7 +877,7 @@ __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[]) const char *cfg[] = {cfg_arg[0], cfg_arg[1], "dump=\"\",readonly=0", NULL, NULL}; u_int i; - session = (WT_SESSION_IMPL *)ctable->iface.session; + session = CUR2S(ctable); table = ctable->table; WT_RET(__curtable_complete(session, table)); /* completeness check */ @@ -904,7 +904,7 @@ __curtable_open_indices(WT_CURSOR_TABLE *ctable) WT_TABLE *table; u_int i; - session = (WT_SESSION_IMPL *)ctable->iface.session; + session = CUR2S(ctable); table = ctable->table; WT_RET(__wt_schema_open_indices(session, table)); diff --git a/src/third_party/wiredtiger/src/docs/backup.dox b/src/third_party/wiredtiger/src/docs/backup.dox index 610033d05cf..ac18263eff0 100644 --- a/src/third_party/wiredtiger/src/docs/backup.dox +++ b/src/third_party/wiredtiger/src/docs/backup.dox @@ -56,10 +56,9 @@ aggregate the file names from the cursor and then list the file names as arguments to a file archiver such as the system tar utility. 
During the period the backup cursor is open, database checkpoints can -be created, but no checkpoints can be deleted. This may result in -significant file growth. Additionally while the backup cursor is open -automatic log file archiving, even if enabled, will not reclaim any -log files. +be created, but checkpoints created prior to the backup cursor cannot +be deleted. Additionally while the backup cursor is open automatic log +file archiving, even if enabled, will not reclaim any log files. Additionally, if a crash occurs during the period the backup cursor is open and logging is disabled (in other words, when depending on diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 455e8c15bef..bbc6f3af565 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -325,7 +325,7 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) if (0) { err: - WT_PANIC_RET(session, ret, "cache eviction thread error"); + WT_RET_PANIC(session, ret, "cache eviction thread error"); } return (ret); } @@ -362,7 +362,7 @@ __wt_evict_thread_stop(WT_SESSION_IMPL *session, WT_THREAD *thread) if (0) { err: - WT_PANIC_RET(session, ret, "cache eviction thread error"); + WT_RET_PANIC(session, ret, "cache eviction thread error"); } return (ret); } diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index ec93cf88a75..6d0d3be7fc8 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -122,11 +122,12 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t previous_state, uint32 * affect those already-running history store operations by changing the cursor state. When * doing history store operations, we set the no-reconciliation flag, use it as short-hand to * avoid that problem. 
This doesn't open up the window for the deadlock because setting the - * no-reconciliation flag limits eviction to in-memory splits. FIXME: This isn't reasonable and - * needs a better fix. + * no-reconciliation flag limits eviction to in-memory splits. * * The test for the connection's default session is because there are known problems with using - * cached cursors from the default session. FIXME: This isn't reasonable and needs a better fix. + * cached cursors from the default session. + * + * FIXME-WT-6037: This isn't reasonable and needs a better fix. */ if (!WT_IS_METADATA(S2BT(session)->dhandle) && !F_ISSET(conn, WT_CONN_IN_MEMORY) && session->hs_cursor == NULL && !F_ISSET(session, WT_SESSION_NO_RECONCILE) && diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c index 2770c48ad53..ed1db846793 100644 --- a/src/third_party/wiredtiger/src/history/hs.c +++ b/src/third_party/wiredtiger/src/history/hs.c @@ -9,10 +9,19 @@ #include "wt_internal.h" /* + * WT_HS_TIME_PAIR -- + * A pair containing a timestamp and transaction id. + */ +typedef struct { + wt_timestamp_t timestamp; + uint64_t txnid; +} WT_HS_TIME_PAIR; + +/* * When an operation is accessing the history store table, it should ignore the cache size (since - * the cache is already full), and the operation can't reenter reconciliation. + * the cache is already full). */ -#define WT_HS_SESSION_FLAGS (WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_RECONCILE) +#define WT_HS_SESSION_FLAGS WT_SESSION_IGNORE_CACHE_SIZE static int __hs_delete_key_from_pos( WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id, const WT_ITEM *key); @@ -282,7 +291,7 @@ __wt_hs_modify(WT_CURSOR_BTREE *hs_cbt, WT_UPDATE *hs_upd) WT_SESSION_IMPL *session; WT_UPDATE *last_upd; - session = (WT_SESSION_IMPL *)hs_cbt->iface.session; + session = CUR2S(hs_cbt); /* If there are existing updates, append them after the new updates. 
*/ if (hs_cbt->compare == 0) { @@ -355,15 +364,14 @@ __hs_insert_updates_verbose(WT_SESSION_IMPL *session, WT_BTREE *btree) static int __hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key, const WT_UPDATE *upd, const uint8_t type, const WT_ITEM *hs_value, - WT_TIME_PAIR stop_ts_pair) + WT_HS_TIME_PAIR stop_ts_pair) { WT_CURSOR_BTREE *cbt; WT_DECL_RET; - WT_UPDATE *hs_upd; - uint32_t session_flags; + WT_UPDATE *hs_upd, *upd_local; cbt = (WT_CURSOR_BTREE *)cursor; - hs_upd = NULL; + hs_upd = upd_local = NULL; /* * Use WT_CURSOR.set_key and WT_CURSOR.set_value to create key and value items, then use them to @@ -373,23 +381,32 @@ __hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, W cursor, btree->id, key, upd->start_ts, __wt_atomic_add64(&btree->hs_counter, 1)); cursor->set_value(cursor, stop_ts_pair.timestamp, upd->durable_ts, (uint64_t)type, hs_value); - /* - * Insert a delete record to represent stop time pair for the actual record to be inserted. Set - * the stop time pair as the commit time pair of the history store delete record. - */ - WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); - hs_upd->start_ts = stop_ts_pair.timestamp; - hs_upd->durable_ts = stop_ts_pair.timestamp; - hs_upd->txnid = stop_ts_pair.txnid; + /* Allocate a tombstone only when there is a valid stop time pair. */ + if (stop_ts_pair.timestamp != WT_TS_MAX || stop_ts_pair.txnid != WT_TXN_MAX) { + /* + * Insert a delete record to represent stop time pair for the actual record to be inserted. + * Set the stop time pair as the commit time pair of the history store delete record. + */ + WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); + hs_upd->start_ts = stop_ts_pair.timestamp; + hs_upd->durable_ts = stop_ts_pair.timestamp; + hs_upd->txnid = stop_ts_pair.txnid; + } /* * Append to the delete record, the actual record to be inserted into the history store. 
Set the * current update start time pair as the commit time pair to the history store record. */ - WT_ERR(__wt_upd_alloc(session, &cursor->value, WT_UPDATE_STANDARD, &hs_upd->next, NULL)); - hs_upd->next->start_ts = upd->start_ts; - hs_upd->next->durable_ts = upd->durable_ts; - hs_upd->next->txnid = upd->txnid; + WT_ERR(__wt_upd_alloc(session, &cursor->value, WT_UPDATE_STANDARD, &upd_local, NULL)); + upd_local->start_ts = upd->start_ts; + upd_local->durable_ts = upd->durable_ts; + upd_local->txnid = upd->txnid; + + /* Insert the standard update as next update if there is a tombstone. */ + if (hs_upd != NULL) + hs_upd->next = upd_local; + else + hs_upd = upd_local; /* * Search the page and insert the updates. We expect there will be no existing data: assert that @@ -425,8 +442,7 @@ err: */ WT_TRET(__wt_cursor_key_order_init(cbt)); #endif - session_flags = session->flags; - F_SET(session, WT_SESSION_IGNORE_HS_TOMBSTONE); + F_SET(cursor, WT_CURSTD_IGNORE_TOMBSTONE); /* We're pointing at the newly inserted update. Iterate once more to avoid deleting it. */ ret = cursor->next(cursor); if (ret == WT_NOTFOUND) @@ -435,8 +451,7 @@ err: WT_TRET(__hs_delete_key_from_pos(session, cursor, btree->id, key)); WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); } - if (!FLD_ISSET(session_flags, WT_SESSION_IGNORE_HS_TOMBSTONE)) - F_CLR(session, WT_SESSION_IGNORE_HS_TOMBSTONE); + F_CLR(cursor, WT_CURSTD_IGNORE_TOMBSTONE); } /* We did a row search, release the cursor so that the page doesn't continue being held. 
*/ cursor->reset(cursor); @@ -452,7 +467,7 @@ err: static int __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key, const WT_UPDATE *upd, const uint8_t type, const WT_ITEM *hs_value, - WT_TIME_PAIR stop_ts_pair) + WT_HS_TIME_PAIR stop_ts_pair) { WT_DECL_RET; @@ -505,7 +520,7 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT */ static int __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key, - const WT_UPDATE *upd, const uint8_t type, const WT_ITEM *hs_value, WT_TIME_PAIR stop_ts_pair) + const WT_UPDATE *upd, const uint8_t type, const WT_ITEM *hs_value, WT_HS_TIME_PAIR stop_ts_pair) { WT_CURSOR_BTREE *cbt; WT_DECL_RET; @@ -526,7 +541,7 @@ __hs_calculate_full_value(WT_SESSION_IMPL *session, WT_ITEM *full_value, WT_UPDA { if (upd->type == WT_UPDATE_MODIFY) { WT_RET(__wt_buf_set(session, full_value, base_full_value, size)); - WT_RET(__wt_modify_apply_item(session, full_value, upd->data, false)); + WT_RET(__wt_modify_apply_item(session, S2BT(session)->value_format, full_value, upd->data)); } else { WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD); full_value->data = upd->data; @@ -541,8 +556,10 @@ __hs_calculate_full_value(WT_SESSION_IMPL *session, WT_ITEM *full_value, WT_UPDA * Copy one set of saved updates into the database's history store table. 
*/ int -__wt_hs_insert_updates(WT_CURSOR *cursor, WT_BTREE *btree, WT_PAGE *page, WT_MULTI *multi) +__wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) { + WT_BTREE *btree; + WT_CURSOR *cursor; WT_DECL_ITEM(full_value); WT_DECL_ITEM(key); WT_DECL_ITEM(modify_value); @@ -554,9 +571,8 @@ __wt_hs_insert_updates(WT_CURSOR *cursor, WT_BTREE *btree, WT_PAGE *page, WT_MUL WT_MODIFY entries[MAX_REVERSE_MODIFY_NUM]; WT_MODIFY_VECTOR modifies; WT_SAVE_UPD *list; - WT_SESSION_IMPL *session; WT_UPDATE *prev_upd, *upd; - WT_TIME_PAIR stop_ts_pair; + WT_HS_TIME_PAIR stop_ts_pair; wt_off_t hs_size; uint64_t insert_cnt, max_hs_size; uint32_t i; @@ -564,8 +580,9 @@ __wt_hs_insert_updates(WT_CURSOR *cursor, WT_BTREE *btree, WT_PAGE *page, WT_MUL int nentries; bool squashed; + btree = S2BT(session); + cursor = session->hs_cursor; prev_upd = NULL; - session = (WT_SESSION_IMPL *)cursor->session; insert_cnt = 0; __wt_modify_vector_init(session, &modifies); @@ -585,9 +602,6 @@ __wt_hs_insert_updates(WT_CURSOR *cursor, WT_BTREE *btree, WT_PAGE *page, WT_MUL if (list->onpage_upd == NULL) continue; - /* onpage_upd now is always from the update chain */ - WT_ASSERT(session, !F_ISSET(list->onpage_upd, WT_UPDATE_RESTORED_FROM_DISK)); - /* History store table key component: source key. */ switch (page->type) { case WT_PAGE_COL_FIX: @@ -696,13 +710,25 @@ __wt_hs_insert_updates(WT_CURSOR *cursor, WT_BTREE *btree, WT_PAGE *page, WT_MUL __wt_modify_vector_pop(&modifies, &prev_upd); /* - * Set the stop timestamp from durable timestamp instead of commit timestamp. The - * Garbage collection of history store removes the history values once the stop - * timestamp is globally visible. i.e. durable timestamp of data store version. 
+ * For any uncommitted prepared updates written to disk, the stop timestamp of the last + * update moved into the history store should be with max visibility to protect its + * removal by checkpoint garbage collection until the data store update is committed. */ - WT_ASSERT(session, prev_upd->start_ts <= prev_upd->durable_ts); - stop_ts_pair.timestamp = prev_upd->durable_ts; - stop_ts_pair.txnid = prev_upd->txnid; + if (prev_upd->prepare_state == WT_PREPARE_INPROGRESS) { + WT_ASSERT(session, + list->onpage_upd == prev_upd || list->onpage_upd->txnid == prev_upd->txnid); + stop_ts_pair.timestamp = WT_TS_MAX; + stop_ts_pair.txnid = WT_TXN_MAX; + } else { + /* + * Set the stop timestamp from durable timestamp instead of commit timestamp. The + * garbage collection of history store removes the history values once the stop + * timestamp is globally visible. i.e. durable timestamp of data store version. + */ + WT_ASSERT(session, prev_upd->start_ts <= prev_upd->durable_ts); + stop_ts_pair.timestamp = prev_upd->durable_ts; + stop_ts_pair.txnid = prev_upd->txnid; + } if (prev_upd->type == WT_UPDATE_TOMBSTONE) { WT_ASSERT(session, modifies.size > 0); @@ -764,10 +790,9 @@ __wt_hs_insert_updates(WT_CURSOR *cursor, WT_BTREE *btree, WT_PAGE *page, WT_MUL WT_STAT_CONN_SET(session, cache_hs_ondisk, hs_size); max_hs_size = ((WT_CURSOR_BTREE *)cursor)->btree->file_max; if (max_hs_size != 0 && (uint64_t)hs_size > max_hs_size) - WT_PANIC_ERR(session, WT_PANIC, "WiredTigerHS: file size of %" PRIu64 - " exceeds maximum " - "size %" PRIu64, - (uint64_t)hs_size, max_hs_size); + WT_ERR_PANIC(session, WT_PANIC, + "WiredTigerHS: file size of %" PRIu64 " exceeds maximum size %" PRIu64, (uint64_t)hs_size, + max_hs_size); err: if (ret == 0 && insert_cnt > 0) @@ -810,9 +835,6 @@ __wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t bt * Note that we need to compare the raw key off the cursor to determine where we are in the * history store as opposed to comparing the 
embedded data store key since the ordering is not * guaranteed to be the same. - * - * FIXME: We should be repeatedly moving the cursor backwards within the loop instead of doing a - * search near operation each time as it is cheaper. */ cursor->set_key( cursor, btree_id, key, timestamp != WT_TS_NONE ? timestamp : WT_TS_MAX, UINT64_MAX); @@ -863,15 +885,14 @@ __hs_restore_read_timestamp(WT_SESSION_IMPL *session) * prepare conflict will be returned upon reading a prepared update. */ int -__wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDATE **updp, - bool allow_prepare, WT_ITEM *on_disk_buf) +__wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format, uint64_t recno, + WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf) { WT_CURSOR *hs_cursor; - WT_DECL_ITEM(hs_key); WT_DECL_ITEM(hs_value); WT_DECL_ITEM(orig_hs_value_buf); WT_DECL_RET; - WT_ITEM recno_key; + WT_ITEM hs_key, recno_key; WT_MODIFY_VECTOR modifies; WT_TXN *txn; WT_UPDATE *mod_upd, *upd; @@ -883,11 +904,10 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA int cmp; bool is_owner, modify; - *updp = NULL; - hs_cursor = NULL; mod_upd = upd = NULL; orig_hs_value_buf = NULL; + WT_CLEAR(hs_key); __wt_modify_vector_init(session, &modifies); txn = session->txn; hs_btree_id = S2BT(session)->id; @@ -914,8 +934,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA key->size = WT_PTRDIFF(p, recno_key_buf); } - /* Allocate buffers for the history store key/value. */ - WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); + /* Allocate buffer for the history store value. */ WT_ERR(__wt_scr_alloc(session, 0, &hs_value)); /* Open a history store table cursor. 
*/ @@ -934,7 +953,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA ret = 0; goto done; } - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); /* Stop before crossing over to the next btree */ if (hs_btree_id != S2BT(session)->id) @@ -944,7 +963,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA * Keys are sorted in an order, skip the ones before the desired key, and bail out if we have * crossed over the desired key and not found the record we are looking for. */ - WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp)); + WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); if (cmp != 0) goto done; @@ -956,6 +975,13 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA WT_ASSERT(session, upd_type != WT_UPDATE_TOMBSTONE); /* + * If the caller has signalled they don't need the value buffer, don't bother reconstructing a + * modify update or copying the contents into the value buffer. + */ + if (upd_value->skip_buf) + goto skip_buf; + + /* * Keep walking until we get a non-modify update. Once we get to that point, squash the updates * together. */ @@ -1008,9 +1034,9 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA * reverse deltas on top of. */ WT_ERR(hs_cursor->get_key( - hs_cursor, &hs_btree_id, hs_key, &hs_start_ts_tmp, &hs_counter_tmp)); + hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts_tmp, &hs_counter_tmp)); - WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp)); + WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); if (cmp != 0) { /* Fallback to the onpage value as the base value. 
*/ @@ -1028,7 +1054,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA WT_ASSERT(session, upd_type == WT_UPDATE_STANDARD); while (modifies.size > 0) { __wt_modify_vector_pop(&modifies, &mod_upd); - WT_ERR(__wt_modify_apply_item(session, hs_value, mod_upd->data, false)); + WT_ERR(__wt_modify_apply_item(session, value_format, hs_value, mod_upd->data)); __wt_free_update_list(session, &mod_upd); mod_upd = NULL; } @@ -1037,19 +1063,18 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA WT_STAT_CONN_INCR(session, cache_hs_read_squash); } - /* Allocate an update structure for the record found. */ - WT_ERR(__wt_upd_alloc(session, hs_value, upd_type, &upd, NULL)); - upd->txnid = WT_TXN_NONE; - upd->durable_ts = durable_timestamp; - upd->start_ts = hs_start_ts; - upd->prepare_state = upd->start_ts == upd->durable_ts ? WT_PREPARE_INIT : WT_PREPARE_RESOLVED; - /* - * We're not keeping this in our update list as we want to get rid of it after the read has been - * dealt with. Mark this update as external and to be discarded when not needed. + * Potential optimization: We can likely get rid of this copy and the update allocation above. + * We already have buffers containing the modify values so there's no good reason to allocate an + * update other than to work with our modify vector implementation. */ - F_SET(upd, WT_UPDATE_RESTORED_FROM_DISK); - *updp = upd; + WT_ERR(__wt_buf_set(session, &upd_value->buf, hs_value->data, hs_value->size)); +skip_buf: + upd_value->start_ts = hs_start_ts; + upd_value->txnid = WT_TXN_NONE; + upd_value->type = upd_type; + upd_value->prepare_state = + (hs_start_ts == durable_timestamp) ? 
WT_PREPARE_INIT : WT_PREPARE_RESOLVED; done: err: @@ -1059,7 +1084,7 @@ err: __wt_scr_free(session, &orig_hs_value_buf); else __wt_scr_free(session, &hs_value); - __wt_scr_free(session, &hs_key); + WT_ASSERT(session, hs_key.mem == NULL && hs_key.memsize == 0); /* * Restore the read timestamp if we encountered an error while processing a modify. There's no @@ -1172,17 +1197,19 @@ __wt_hs_delete_key(WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *k return (0); WT_RET(__wt_hs_cursor(session, &session_flags, &is_owner)); + /* * In order to delete a key range, we need to be able to inspect all history store records * regardless of their stop time pairs. */ - F_SET(session, WT_SESSION_IGNORE_HS_TOMBSTONE); + F_SET(session->hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); + /* The tree structure can change while we try to insert the mod list, retry if that happens. */ while ((ret = __hs_delete_key_int(session, btree_id, key)) == WT_RESTART) ; - if (!FLD_ISSET(session_flags, WT_SESSION_IGNORE_HS_TOMBSTONE)) - F_CLR(session, WT_SESSION_IGNORE_HS_TOMBSTONE); + F_CLR(session->hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); + WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner)); return (ret); } @@ -1252,29 +1279,38 @@ err: static int __verify_history_store_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint32_t this_btree_id) { - WT_CURSOR *cursor; - WT_DECL_ITEM(hs_key); + WT_CURSOR *hs_cursor; WT_DECL_ITEM(prev_hs_key); - WT_DECL_ITEM(tmp); WT_DECL_RET; + WT_ITEM hs_key; wt_timestamp_t hs_start_ts; uint64_t hs_counter; uint32_t btree_id; int cmp; bool found; - cursor = session->hs_cursor; + hs_cursor = session->hs_cursor; + WT_CLEAR(hs_key); - WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); WT_ERR(__wt_scr_alloc(session, 0, &prev_hs_key)); /* + * We need to be able to iterate over the history store content for another table. 
In order to + * do this, we must ignore non-globally visible tombstones in the history store since every + * history store record is succeeded with a tombstone. We also need to skip the non-globally + * visible tombstones in the data table to verify the corresponding entries in the history store + * are too present in the data store. + */ + F_SET(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); + F_SET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE); + + /* * The caller is responsible for positioning the history store cursor at the first record to * verify. When we return after moving to a new key the caller is responsible for keeping the * cursor there or deciding they're done. */ - for (; ret == 0; ret = cursor->next(cursor)) { - WT_ERR(cursor->get_key(cursor, &btree_id, hs_key, &hs_start_ts, &hs_counter)); + for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { + WT_ERR(hs_cursor->get_key(hs_cursor, &btree_id, &hs_key, &hs_start_ts, &hs_counter)); /* * If the btree id does not match the preview one, we're done. It is up to the caller to set @@ -1290,34 +1326,34 @@ __verify_history_store_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint32 * If we have already checked against this key, keep going to the next key. We only need to * check the key once. */ - WT_ERR(__wt_compare(session, NULL, hs_key, prev_hs_key, &cmp)); + WT_ERR(__wt_compare(session, NULL, &hs_key, prev_hs_key, &cmp)); if (cmp == 0) continue; - WT_WITH_PAGE_INDEX(session, ret = __wt_row_search(cbt, hs_key, false, NULL, false, NULL)); + WT_WITH_PAGE_INDEX(session, ret = __wt_row_search(cbt, &hs_key, false, NULL, false, NULL)); WT_ERR(ret); -/* FIXME: temporarily disable hs verification. 
*/ -#if 0 found = cbt->compare == 0; -#else - found = true; -#endif WT_ERR(__cursor_reset(cbt)); - if (!found) - WT_ERR_MSG(session, WT_PANIC, + if (!found) { + F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); + WT_ERR_PANIC(session, WT_PANIC, "the associated history store key %s was not found in the data store %s", - __wt_buf_set_printable(session, hs_key->data, hs_key->size, prev_hs_key), + __wt_buf_set_printable(session, hs_key.data, hs_key.size, prev_hs_key), session->dhandle->name); + } - /* Swap current/previous buffers. */ - tmp = hs_key; - hs_key = prev_hs_key; - prev_hs_key = tmp; + /* + * Copy the key memory into our scratch buffer. The key will get invalidated on our next + * cursor iteration. + */ + WT_ERR(__wt_buf_set(session, prev_hs_key, hs_key.data, hs_key.size)); } WT_ERR_NOTFOUND_OK(ret, true); err: - __wt_scr_free(session, &hs_key); + F_CLR(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE); + F_CLR(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); + WT_ASSERT(session, hs_key.mem == NULL && hs_key.memsize == 0); __wt_scr_free(session, &prev_hs_key); return (ret); } @@ -1370,8 +1406,8 @@ __wt_history_store_verify(WT_SESSION_IMPL *session) { WT_CURSOR *cursor, *data_cursor; WT_DECL_ITEM(buf); - WT_DECL_ITEM(hs_key); WT_DECL_RET; + WT_ITEM hs_key; wt_timestamp_t hs_start_ts; uint64_t hs_counter; uint32_t btree_id, session_flags; @@ -1382,13 +1418,13 @@ __wt_history_store_verify(WT_SESSION_IMPL *session) WT_ASSERT(session, S2C(session)->default_session != session); cursor = data_cursor = NULL; + WT_CLEAR(hs_key); btree_id = WT_BTREE_ID_INVALID; session_flags = 0; /* [-Wconditional-uninitialized] */ uri_data = NULL; is_owner = false; /* [-Wconditional-uninitialized] */ WT_ERR(__wt_scr_alloc(session, 0, &buf)); - WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner)); cursor = session->hs_cursor; ret = cursor->next(cursor); @@ -1405,12 +1441,14 @@ __wt_history_store_verify(WT_SESSION_IMPL *session) * The cursor is positioned 
either from above or left over from the internal call on the * first key of a new btree id. */ - WT_ERR(cursor->get_key(cursor, &btree_id, hs_key, &hs_start_ts, &hs_counter)); - if ((ret = __wt_metadata_btree_id_to_uri(session, btree_id, &uri_data)) != 0) - WT_ERR_MSG(session, WT_PANIC, + WT_ERR(cursor->get_key(cursor, &btree_id, &hs_key, &hs_start_ts, &hs_counter)); + if ((ret = __wt_metadata_btree_id_to_uri(session, btree_id, &uri_data)) != 0) { + F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); + WT_ERR_PANIC(session, WT_PANIC, "Unable to find btree id %" PRIu32 " in the metadata file for the associated history store key %s", - btree_id, __wt_buf_set_printable(session, hs_key->data, hs_key->size, buf)); + btree_id, __wt_buf_set_printable(session, hs_key.data, hs_key.size, buf)); + } WT_ERR(__wt_open_cursor(session, uri_data, NULL, NULL, &data_cursor)); F_SET(data_cursor, WT_CURSOR_RAW_OK); ret = __verify_history_store_id(session, (WT_CURSOR_BTREE *)data_cursor, btree_id); @@ -1423,7 +1461,7 @@ err: WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner)); __wt_scr_free(session, &buf); - __wt_scr_free(session, &hs_key); + WT_ASSERT(session, hs_key.mem == NULL && hs_key.memsize == 0); __wt_free(session, uri_data); return (ret); } diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 513e0106e53..1bcca8dc686 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -126,14 +126,7 @@ __wt_page_header_byteswap(WT_PAGE_HEADER *dsk) * An in-memory structure to hold a block's location. 
*/ struct __wt_addr { - /* Validity window */ - wt_timestamp_t newest_start_durable_ts; - wt_timestamp_t oldest_start_ts; - uint64_t oldest_start_txn; - wt_timestamp_t newest_stop_durable_ts; - wt_timestamp_t newest_stop_ts; - uint64_t newest_stop_txn; - bool prepare; + WT_TIME_AGGREGATE ta; uint8_t *addr; /* Block-manager's cookie */ uint8_t size; /* Block-manager's cookie length */ @@ -159,14 +152,7 @@ struct __wt_addr { * copy of the WT_REF address information. */ struct __wt_addr_copy { - /* Validity window */ - wt_timestamp_t newest_start_durable_ts; - wt_timestamp_t oldest_start_ts; - uint64_t oldest_start_txn; - wt_timestamp_t newest_stop_durable_ts; - wt_timestamp_t newest_stop_ts; - uint64_t newest_stop_txn; - bool prepare; + WT_TIME_AGGREGATE ta; uint8_t type; @@ -640,16 +626,17 @@ struct __wt_page { uint8_t type; /* Page type */ /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_PAGE_BUILD_KEYS 0x01u /* Keys have been built in memory */ -#define WT_PAGE_DISK_ALLOC 0x02u /* Disk image in allocated memory */ -#define WT_PAGE_DISK_MAPPED 0x04u /* Disk image in mapped memory */ -#define WT_PAGE_EVICT_LRU 0x08u /* Page is on the LRU queue */ -#define WT_PAGE_EVICT_NO_PROGRESS 0x10u /* Eviction doesn't count as progress */ -#define WT_PAGE_OVERFLOW_KEYS 0x20u /* Page has overflow keys */ -#define WT_PAGE_SPLIT_INSERT 0x40u /* A leaf page was split for append */ -#define WT_PAGE_UPDATE_IGNORE 0x80u /* Ignore updates on page discard */ - /* AUTOMATIC FLAG VALUE GENERATION STOP */ - uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ +#define WT_PAGE_BUILD_KEYS 0x001u /* Keys have been built in memory */ +#define WT_PAGE_DISK_ALLOC 0x002u /* Disk image in allocated memory */ +#define WT_PAGE_DISK_MAPPED 0x004u /* Disk image in mapped memory */ +#define WT_PAGE_EVICT_LRU 0x008u /* Page is on the LRU queue */ +#define WT_PAGE_EVICT_NO_PROGRESS 0x010u /* Eviction doesn't count as progress */ +#define WT_PAGE_INSTANTIATE_PREPARE_UPDATE 0x020u /* Instantiate 
prepared updates */ +#define WT_PAGE_OVERFLOW_KEYS 0x040u /* Page has overflow keys */ +#define WT_PAGE_SPLIT_INSERT 0x080u /* A leaf page was split for append */ +#define WT_PAGE_UPDATE_IGNORE 0x100u /* Ignore updates on page discard */ + /* AUTOMATIC FLAG VALUE GENERATION STOP */ + uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ uint8_t unused[2]; /* Unused padding */ @@ -817,15 +804,6 @@ struct __wt_page_deleted { }; /* - * WT_TIME_PAIR -- - * A pair containing a timestamp and transaction id. - */ -struct __wt_time_pair { - wt_timestamp_t timestamp; - uint64_t txnid; -}; - -/* * WT_REF -- * A single in-memory page and state information. */ @@ -1098,7 +1076,6 @@ struct __wt_update { /* AUTOMATIC FLAG VALUE GENERATION START */ #define WT_UPDATE_HS 0x1u /* Update has been written to history store. */ #define WT_UPDATE_RESTORED_FOR_ROLLBACK 0x2u /* Update restored for rollback to stable. */ -#define WT_UPDATE_RESTORED_FROM_DISK 0x4u /* Update is temporary retrieved from disk. */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint8_t flags; @@ -1122,6 +1099,39 @@ struct __wt_update { #define WT_UPDATE_MEMSIZE(upd) WT_ALIGN(WT_UPDATE_SIZE + (upd)->size, 32) /* + * WT_UPDATE_VALUE -- + * + * A generic representation of an update's value regardless of where it exists. This structure is + * used to represent both in-memory updates and updates that don't exist in an update list such as + * reconstructed modify updates, updates in the history store and onpage values. + * + * The skip buffer flag is an optimization for callers of various read functions to communicate that + * they just want to check that an update exists and not read its underlying value. This means that + * the read functions can avoid the performance penalty of reconstructing modifies. 
+ */ +struct __wt_update_value { + WT_ITEM buf; + wt_timestamp_t start_ts; + uint64_t txnid; + uint8_t type; + uint8_t prepare_state; + bool skip_buf; +}; + +/* + * WT_WITH_UPDATE_VALUE_SKIP_BUF -- + * + * A helper macro to use for calling read functions when we're checking for the existence of a given + * key. This means that read functions can avoid the performance penalty of reconstructing modifies. + */ +#define WT_WITH_UPDATE_VALUE_SKIP_BUF(op) \ + do { \ + cbt->upd_value->skip_buf = true; \ + op; \ + cbt->upd_value->skip_buf = false; \ + } while (0) + +/* * WT_MAX_MODIFY_UPDATE, WT_MODIFY_VECTOR_STACK_SIZE * Limit update chains value to avoid penalizing reads and permit truncation. Having a smaller * value will penalize the cases when history has to be maintained, resulting in multiplying cache diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index cd1952b00d1..7b3ff5b8f3d 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1115,13 +1115,7 @@ __wt_ref_addr_copy(WT_SESSION_IMPL *session, WT_REF *ref, WT_ADDR_COPY *copy) /* If off-page, the pointer references a WT_ADDR structure. */ if (__wt_off_page(page, addr)) { - copy->oldest_start_ts = addr->oldest_start_ts; - copy->oldest_start_txn = addr->oldest_start_txn; - copy->newest_start_durable_ts = addr->newest_start_durable_ts; - copy->newest_stop_ts = addr->newest_stop_ts; - copy->newest_stop_txn = addr->newest_stop_txn; - copy->newest_stop_durable_ts = addr->newest_stop_durable_ts; - copy->prepare = addr->prepare; + __wt_time_aggregate_copy(&copy->ta, &addr->ta); copy->type = addr->type; memcpy(copy->addr, addr->addr, copy->size = addr->size); return (true); @@ -1129,13 +1123,7 @@ __wt_ref_addr_copy(WT_SESSION_IMPL *session, WT_REF *ref, WT_ADDR_COPY *copy) /* If on-page, the pointer references a cell. 
*/ __wt_cell_unpack(session, page, (WT_CELL *)addr, unpack); - copy->oldest_start_ts = unpack->oldest_start_ts; - copy->oldest_start_txn = unpack->oldest_start_txn; - copy->newest_start_durable_ts = unpack->newest_start_durable_ts; - copy->newest_stop_ts = unpack->newest_stop_ts; - copy->newest_stop_txn = unpack->newest_stop_txn; - copy->newest_stop_durable_ts = unpack->newest_stop_durable_ts; - copy->prepare = F_ISSET(unpack, WT_CELL_UNPACK_PREPARE); + __wt_time_aggregate_copy(&copy->ta, &unpack->ta); copy->type = 0; /* Avoid static analyzer uninitialized value complaints. */ switch (unpack->raw) { case WT_CELL_ADDR_INT: @@ -1708,25 +1696,14 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32 */ static inline int __wt_bt_col_var_cursor_walk_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_PAGE *page, - WT_CELL_UNPACK *unpack, WT_COL *cip, WT_UPDATE **updp) + WT_CELL_UNPACK *unpack, WT_COL *cip) { - WT_UPDATE *upd; - - *updp = NULL; - cbt->slot = WT_COL_SLOT(page, cip); - WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, NULL, unpack, &upd)); - if (upd == NULL) + WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, NULL, unpack)); + if (cbt->upd_value->type == WT_UPDATE_INVALID || cbt->upd_value->type == WT_UPDATE_TOMBSTONE) return (0); - if (upd->type == WT_UPDATE_TOMBSTONE) { - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); - return (0); - } - - WT_RET(__wt_value_return(cbt, upd)); - *updp = upd; + WT_RET(__wt_value_return(cbt, cbt->upd_value)); cbt->tmp->data = cbt->iface.value.data; cbt->tmp->size = cbt->iface.value.size; diff --git a/src/third_party/wiredtiger/src/include/cell.h b/src/third_party/wiredtiger/src/include/cell.h index b80449a8c18..760811e5acf 100644 --- a/src/third_party/wiredtiger/src/include/cell.h +++ b/src/third_party/wiredtiger/src/include/cell.h @@ -151,23 +151,10 @@ struct __wt_cell { struct __wt_cell_unpack { WT_CELL *cell; /* Cell's disk image address */ - 
uint64_t v; /* RLE count or recno */ + WT_TIME_AGGREGATE ta; /* Address validity window */ + WT_TIME_WINDOW tw; /* Value validity window */ - /* Value validity window */ - wt_timestamp_t start_ts; /* default value: WT_TS_NONE */ - uint64_t start_txn; /* default value: WT_TXN_NONE */ - wt_timestamp_t durable_start_ts; /* default value: WT_TS_NONE */ - wt_timestamp_t stop_ts; /* default value: WT_TS_MAX */ - uint64_t stop_txn; /* default value: WT_TXN_MAX */ - wt_timestamp_t durable_stop_ts; /* default value: WT_TS_NONE */ - - /* Address validity window */ - wt_timestamp_t oldest_start_ts; /* default value: WT_TS_NONE */ - uint64_t oldest_start_txn; /* default value: WT_TXN_NONE */ - wt_timestamp_t newest_start_durable_ts; /* default value: WT_TS_NONE */ - wt_timestamp_t newest_stop_ts; /* default value: WT_TS_MAX */ - uint64_t newest_stop_txn; /* default value: WT_TXN_MAX */ - wt_timestamp_t newest_stop_durable_ts; /* default value: WT_TS_NONE */ + uint64_t v; /* RLE count or recno */ /* * !!! 
@@ -185,9 +172,9 @@ struct __wt_cell_unpack { uint8_t type; /* Cell type */ /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_CELL_UNPACK_OVERFLOW 0x1u /* cell is an overflow */ -#define WT_CELL_UNPACK_PREPARE 0x2u /* cell is part of a prepared transaction */ -#define WT_CELL_UNPACK_TIME_PAIRS_CLEARED 0x4u /* time pairs are cleared because of restart */ - /* AUTOMATIC FLAG VALUE GENERATION STOP */ +#define WT_CELL_UNPACK_OVERFLOW 0x1u /* cell is an overflow */ +#define WT_CELL_UNPACK_PREPARE 0x2u /* cell is part of a prepared transaction */ +#define WT_CELL_UNPACK_TIME_WINDOW_CLEARED 0x4u /* time window cleared because of restart */ + /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint8_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i index bcd23946883..f14eb7f8d15 100644 --- a/src/third_party/wiredtiger/src/include/cell.i +++ b/src/third_party/wiredtiger/src/include/cell.i @@ -11,47 +11,47 @@ * Check the value's validity window for sanity. */ static inline void -__cell_check_value_validity(WT_SESSION_IMPL *session, wt_timestamp_t durable_start_ts, - wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t durable_stop_ts, - wt_timestamp_t stop_ts, uint64_t stop_txn) +__cell_check_value_validity(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw) { #ifdef HAVE_DIAGNOSTIC + /* + * We're using WT_ERR_ASSERT rather than WT_ASSERT because we want to push out a message string. + * This usage of WT_ERR_ASSERT isn't "correct", because it jumps to a non-existent error label + * in non-diagnostic builds and returns WT_PANIC without calling the underlying panic routine. + * That's OK, we have to be in a diagnostic build to get here, and fixing it would require new + * macros that aren't needed anywhere else, so we're leaving it alone. 
+ */ char ts_string[2][WT_TS_INT_STRING_SIZE]; - if (start_ts > durable_start_ts) - WT_ERR_ASSERT(session, start_ts <= durable_start_ts, WT_PANIC, + if (tw->start_ts > tw->durable_start_ts) + WT_ERR_ASSERT(session, tw->start_ts <= tw->durable_start_ts, WT_PANIC, "a start timestamp %s newer than its durable start timestamp %s", - __wt_timestamp_to_string(start_ts, ts_string[0]), - __wt_timestamp_to_string(durable_start_ts, ts_string[1])); + __wt_timestamp_to_string(tw->start_ts, ts_string[0]), + __wt_timestamp_to_string(tw->durable_start_ts, ts_string[1])); - if (start_ts != WT_TS_NONE && stop_ts == WT_TS_NONE) - WT_ERR_ASSERT(session, stop_ts != WT_TS_NONE, WT_PANIC, "stop timestamp of 0"); + if (tw->start_ts != WT_TS_NONE && tw->stop_ts == WT_TS_NONE) + WT_ERR_ASSERT(session, tw->stop_ts != WT_TS_NONE, WT_PANIC, "stop timestamp of 0"); - if (start_ts > stop_ts) - WT_ERR_ASSERT(session, start_ts <= stop_ts, WT_PANIC, + if (tw->start_ts > tw->stop_ts) + WT_ERR_ASSERT(session, tw->start_ts <= tw->stop_ts, WT_PANIC, "a start timestamp %s newer than its stop timestamp %s", - __wt_timestamp_to_string(start_ts, ts_string[0]), - __wt_timestamp_to_string(stop_ts, ts_string[1])); + __wt_timestamp_to_string(tw->start_ts, ts_string[0]), + __wt_timestamp_to_string(tw->stop_ts, ts_string[1])); - if (start_txn > stop_txn) - WT_ERR_ASSERT(session, start_txn <= stop_txn, WT_PANIC, + if (tw->start_txn > tw->stop_txn) + WT_ERR_ASSERT(session, tw->start_txn <= tw->stop_txn, WT_PANIC, "a start transaction ID %" PRIu64 " newer than its stop transaction ID %" PRIu64, - start_txn, stop_txn); + tw->start_txn, tw->stop_txn); - if (stop_ts != WT_TS_MAX && stop_ts > durable_stop_ts) - WT_ERR_ASSERT(session, stop_ts <= durable_stop_ts, WT_PANIC, + if (tw->stop_ts != WT_TS_MAX && tw->stop_ts > tw->durable_stop_ts) + WT_ERR_ASSERT(session, tw->stop_ts <= tw->durable_stop_ts, WT_PANIC, "a stop timestamp %s newer than its durable stop timestamp %s", - __wt_timestamp_to_string(stop_ts, 
ts_string[0]), - __wt_timestamp_to_string(durable_stop_ts, ts_string[1])); + __wt_timestamp_to_string(tw->stop_ts, ts_string[0]), + __wt_timestamp_to_string(tw->durable_stop_ts, ts_string[1])); #else WT_UNUSED(session); - WT_UNUSED(durable_start_ts); - WT_UNUSED(durable_stop_ts); - WT_UNUSED(start_ts); - WT_UNUSED(start_txn); - WT_UNUSED(stop_ts); - WT_UNUSED(stop_txn); + WT_UNUSED(tw); #endif } @@ -60,21 +60,17 @@ __cell_check_value_validity(WT_SESSION_IMPL *session, wt_timestamp_t durable_sta * Pack the validity window for a value. */ static inline void -__cell_pack_value_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_t durable_start_ts, - wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t durable_stop_ts, - wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare) +__cell_pack_value_validity(WT_SESSION_IMPL *session, uint8_t **pp, WT_TIME_WINDOW *tw) { uint8_t flags, *flagsp; /* Globally visible values have no associated validity window. */ - if (durable_start_ts == WT_TS_NONE && start_ts == WT_TS_NONE && start_txn == WT_TXN_NONE && - durable_stop_ts == WT_TS_NONE && stop_ts == WT_TS_MAX && stop_txn == WT_TXN_MAX) { + if (__wt_time_window_is_empty(tw)) { ++*pp; return; } - __cell_check_value_validity( - session, durable_start_ts, start_ts, start_txn, durable_stop_ts, stop_ts, stop_txn); + __cell_check_value_validity(session, tw); **pp |= WT_CELL_SECOND_DESC; ++*pp; @@ -82,46 +78,41 @@ __cell_pack_value_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_ ++*pp; flags = 0; - if (start_ts != WT_TS_NONE) { - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, start_ts)); + if (tw->start_ts != WT_TS_NONE) { + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, tw->start_ts)); LF_SET(WT_CELL_TS_START); } - if (start_txn != WT_TXN_NONE) { - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, start_txn)); + if (tw->start_txn != WT_TXN_NONE) { + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, tw->start_txn)); LF_SET(WT_CELL_TXN_START); } - if (durable_start_ts != WT_TS_NONE) { - 
WT_ASSERT(session, start_ts != WT_TS_NONE && start_ts <= durable_start_ts); + if (tw->durable_start_ts != WT_TS_NONE) { + WT_ASSERT(session, tw->start_ts <= tw->durable_start_ts); /* Store differences if any, not absolutes. */ - if (durable_start_ts - start_ts > 0) { - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, durable_start_ts - start_ts)); + if (tw->durable_start_ts - tw->start_ts > 0) { + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, tw->durable_start_ts - tw->start_ts)); LF_SET(WT_CELL_TS_DURABLE_START); } } - if (stop_ts != WT_TS_MAX) { + if (tw->stop_ts != WT_TS_MAX) { /* Store differences, not absolutes. */ - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, stop_ts - start_ts)); + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, tw->stop_ts - tw->start_ts)); LF_SET(WT_CELL_TS_STOP); } - if (stop_txn != WT_TXN_MAX) { + if (tw->stop_txn != WT_TXN_MAX) { /* Store differences, not absolutes. */ - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, stop_txn - start_txn)); + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, tw->stop_txn - tw->start_txn)); LF_SET(WT_CELL_TXN_STOP); } - if (durable_stop_ts != WT_TS_NONE) { - WT_ASSERT(session, stop_ts != WT_TS_MAX && stop_ts <= durable_stop_ts); + if (tw->durable_stop_ts != WT_TS_NONE) { + WT_ASSERT(session, tw->stop_ts <= tw->durable_stop_ts); /* Store differences if any, not absolutes. */ - if (durable_stop_ts - stop_ts > 0) { - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, durable_stop_ts - stop_ts)); + if (tw->durable_stop_ts - tw->stop_ts > 0) { + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, tw->durable_stop_ts - tw->stop_ts)); LF_SET(WT_CELL_TS_DURABLE_STOP); } } - /* - * Currently, no uncommitted prepared updates are written to the data store, so this flag must - * be false until we allow writing them in WT-5984. In that ticket this assert must be removed. 
- */ - WT_ASSERT(session, prepare == false); - if (prepare) + if (tw->prepare) LF_SET(WT_CELL_PREPARE); *flagsp = flags; } @@ -131,47 +122,47 @@ __cell_pack_value_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_ * Check the address' validity window for sanity. */ static inline void -__wt_check_addr_validity(WT_SESSION_IMPL *session, wt_timestamp_t start_durable_ts, - wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts, - wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn) +__wt_check_addr_validity(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta) { #ifdef HAVE_DIAGNOSTIC + /* + * We're using WT_ERR_ASSERT rather than WT_ASSERT because we want to push out a message string. + * This usage of WT_ERR_ASSERT isn't "correct", because it jumps to a non-existent error label + * in non-diagnostic builds and returns WT_PANIC without calling the underlying panic routine. + * That's OK, we have to be in a diagnostic build to get here, and fixing it would require new + * macros that aren't needed anywhere else, so we're leaving it alone. 
+ */ char ts_string[2][WT_TS_INT_STRING_SIZE]; - if (oldest_start_ts != WT_TS_NONE && newest_stop_ts == WT_TS_NONE) + if (ta->oldest_start_ts != WT_TS_NONE && ta->newest_stop_ts == WT_TS_NONE) WT_ERR_ASSERT( - session, newest_stop_ts != WT_TS_NONE, WT_PANIC, "newest stop timestamp of 0"); + session, ta->newest_stop_ts != WT_TS_NONE, WT_PANIC, "newest stop timestamp of 0"); - if (oldest_start_ts > newest_stop_ts) - WT_ERR_ASSERT(session, oldest_start_ts <= newest_stop_ts, WT_PANIC, + if (ta->oldest_start_ts > ta->newest_stop_ts) + WT_ERR_ASSERT(session, ta->oldest_start_ts <= ta->newest_stop_ts, WT_PANIC, "an oldest start timestamp %s newer than its newest stop timestamp %s", - __wt_timestamp_to_string(oldest_start_ts, ts_string[0]), - __wt_timestamp_to_string(newest_stop_ts, ts_string[1])); + __wt_timestamp_to_string(ta->oldest_start_ts, ts_string[0]), + __wt_timestamp_to_string(ta->newest_stop_ts, ts_string[1])); - if (oldest_start_txn > newest_stop_txn) - WT_ERR_ASSERT(session, oldest_start_txn <= newest_stop_txn, WT_PANIC, + if (ta->oldest_start_txn > ta->newest_stop_txn) + WT_ERR_ASSERT(session, ta->oldest_start_txn <= ta->newest_stop_txn, WT_PANIC, "an oldest start transaction %" PRIu64 " newer than its newest stop transaction %" PRIu64, - oldest_start_txn, newest_stop_txn); + ta->oldest_start_txn, ta->newest_stop_txn); - if (oldest_start_ts > start_durable_ts) - WT_ERR_ASSERT(session, oldest_start_ts <= start_durable_ts, WT_PANIC, + if (ta->oldest_start_ts > ta->newest_start_durable_ts) + WT_ERR_ASSERT(session, ta->oldest_start_ts <= ta->newest_start_durable_ts, WT_PANIC, "an oldest start timestamp %s newer than its durable start timestamp %s", - __wt_timestamp_to_string(oldest_start_ts, ts_string[0]), - __wt_timestamp_to_string(start_durable_ts, ts_string[1])); + __wt_timestamp_to_string(ta->oldest_start_ts, ts_string[0]), + __wt_timestamp_to_string(ta->newest_start_durable_ts, ts_string[1])); - if (newest_stop_ts != WT_TS_MAX && newest_stop_ts > 
stop_durable_ts) - WT_ERR_ASSERT(session, newest_stop_ts <= stop_durable_ts, WT_PANIC, + if (ta->newest_stop_ts != WT_TS_MAX && ta->newest_stop_ts > ta->newest_stop_durable_ts) + WT_ERR_ASSERT(session, ta->newest_stop_ts <= ta->newest_stop_durable_ts, WT_PANIC, "a newest stop timestamp %s newer than its durable stop timestamp %s", - __wt_timestamp_to_string(newest_stop_ts, ts_string[0]), - __wt_timestamp_to_string(stop_durable_ts, ts_string[1])); + __wt_timestamp_to_string(ta->newest_stop_ts, ts_string[0]), + __wt_timestamp_to_string(ta->newest_stop_durable_ts, ts_string[1])); #else WT_UNUSED(session); - WT_UNUSED(start_durable_ts); - WT_UNUSED(oldest_start_ts); - WT_UNUSED(oldest_start_txn); - WT_UNUSED(stop_durable_ts); - WT_UNUSED(newest_stop_ts); - WT_UNUSED(newest_stop_txn); + WT_UNUSED(ta); #endif } @@ -180,22 +171,17 @@ __wt_check_addr_validity(WT_SESSION_IMPL *session, wt_timestamp_t start_durable_ * Pack the validity window for an address. */ static inline void -__cell_pack_addr_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_t start_durable_ts, - wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts, - wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn, bool prepare) +__cell_pack_addr_validity(WT_SESSION_IMPL *session, uint8_t **pp, WT_TIME_AGGREGATE *ta) { uint8_t flags, *flagsp; /* Globally visible values have no associated validity window. 
*/ - if (start_durable_ts == WT_TS_NONE && stop_durable_ts == WT_TS_NONE && - oldest_start_ts == WT_TS_NONE && oldest_start_txn == WT_TXN_NONE && - newest_stop_ts == WT_TS_MAX && newest_stop_txn == WT_TXN_MAX) { + if (__wt_time_aggregate_is_empty(ta)) { ++*pp; return; } - __wt_check_addr_validity(session, start_durable_ts, oldest_start_ts, oldest_start_txn, - stop_durable_ts, newest_stop_ts, newest_stop_txn); + __wt_check_addr_validity(session, ta); **pp |= WT_CELL_SECOND_DESC; ++*pp; @@ -203,21 +189,18 @@ __cell_pack_addr_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_t ++*pp; flags = 0; - if (oldest_start_ts != WT_TS_NONE) { - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, oldest_start_ts)); + if (ta->oldest_start_ts != WT_TS_NONE) { + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, ta->oldest_start_ts)); LF_SET(WT_CELL_TS_START); } - if (oldest_start_txn != WT_TXN_NONE) { - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, oldest_start_txn)); + if (ta->oldest_start_txn != WT_TXN_NONE) { + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, ta->oldest_start_txn)); LF_SET(WT_CELL_TXN_START); } - if (start_durable_ts != WT_TS_NONE) { + if (ta->newest_start_durable_ts != WT_TS_NONE) { /* Store differences, not absolutes. */ - /* - * FIXME-prepare-support: - * WT_ASSERT( - * session, oldest_start_ts != WT_TS_NONE && oldest_start_ts <= start_durable_ts); - */ + WT_ASSERT(session, ta->oldest_start_ts <= ta->newest_start_durable_ts); + /* * Unlike value cell, we store the durable start timestamp even the difference is zero * compared to oldest commit timestamp. The difference can only be zero when the page @@ -225,43 +208,38 @@ __cell_pack_addr_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_t * having that check to find out whether it is zero or not will unnecessarily add overhead * than benefit. 
*/ - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, start_durable_ts - oldest_start_ts)); + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, ta->newest_start_durable_ts - ta->oldest_start_ts)); LF_SET(WT_CELL_TS_DURABLE_START); } - if (newest_stop_ts != WT_TS_MAX) { + if (ta->newest_stop_ts != WT_TS_MAX) { /* Store differences, not absolutes. */ - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, newest_stop_ts - oldest_start_ts)); + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, ta->newest_stop_ts - ta->oldest_start_ts)); LF_SET(WT_CELL_TS_STOP); } - if (newest_stop_txn != WT_TXN_MAX) { + if (ta->newest_stop_txn != WT_TXN_MAX) { /* Store differences, not absolutes. */ - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, newest_stop_txn - oldest_start_txn)); + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, ta->newest_stop_txn - ta->oldest_start_txn)); LF_SET(WT_CELL_TXN_STOP); } - if (stop_durable_ts != WT_TS_NONE) { - /* Store differences, not absolutes. */ - /* - * FIXME-prepare-support: - * WT_ASSERT(session, - * newest_stop_ts != WT_TS_MAX && newest_stop_ts <= stop_durable__ts); - */ + if (ta->newest_stop_durable_ts != WT_TS_NONE) { + WT_ASSERT(session, + ta->newest_stop_ts == WT_TS_MAX || ta->newest_stop_ts <= ta->newest_stop_durable_ts); + /* + * Store differences, not absolutes. + * * Unlike value cell, we store the durable stop timestamp even the difference is zero * compared to newest commit timestamp. The difference can only be zero when the page * contains all the key/value pairs with the same timestamp. But this scenario is rare and * having that check to find out whether it is zero or not will unnecessarily add overhead * than benefit. */ - WT_IGNORE_RET(__wt_vpack_uint(pp, 0, stop_durable_ts - newest_stop_ts)); + WT_IGNORE_RET(__wt_vpack_uint(pp, 0, ta->newest_stop_durable_ts - ta->newest_stop_ts)); LF_SET(WT_CELL_TS_DURABLE_STOP); } - /* - * Currently, no uncommitted prepared updates are written to the data store, so this flag must - * be false until we allow writing them in WT-5984. 
In that ticket this assert must be removed. - */ - WT_ASSERT(session, prepare == false); - if (prepare) + if (ta->prepare) LF_SET(WT_CELL_PREPARE); + *flagsp = flags; } @@ -271,9 +249,7 @@ __cell_pack_addr_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_t */ static inline size_t __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, uint64_t recno, - wt_timestamp_t start_durable_ts, wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, - wt_timestamp_t stop_durable_ts, wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn, - bool prepare, size_t size) + WT_TIME_AGGREGATE *ta, size_t size) { uint8_t *p; @@ -281,8 +257,7 @@ __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, ui p = cell->__chunk; *p = '\0'; - __cell_pack_addr_validity(session, &p, start_durable_ts, oldest_start_ts, oldest_start_txn, - stop_durable_ts, newest_stop_ts, newest_stop_txn, prepare); + __cell_pack_addr_validity(session, &p, ta); if (recno == WT_RECNO_OOB) cell->__chunk[0] |= (uint8_t)cell_type; /* Type */ @@ -301,9 +276,8 @@ __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, ui * Set a value item's WT_CELL contents. 
*/ static inline size_t -__wt_cell_pack_value(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t durable_start_ts, - wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t durable_stop_ts, - wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, uint64_t rle, size_t size) +__wt_cell_pack_value( + WT_SESSION_IMPL *session, WT_CELL *cell, WT_TIME_WINDOW *tw, uint64_t rle, size_t size) { uint8_t byte, *p; bool validity; @@ -312,8 +286,7 @@ __wt_cell_pack_value(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t dur p = cell->__chunk; *p = '\0'; - __cell_pack_value_validity(session, &p, durable_start_ts, start_ts, start_txn, durable_stop_ts, - stop_ts, stop_txn, prepare); + __cell_pack_value_validity(session, &p, tw); /* * Short data cells without a validity window or run-length encoding have 6 bits of data length @@ -435,9 +408,8 @@ __wt_cell_pack_value_match( * Write a copy value cell. */ static inline size_t -__wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start_durable_ts, - wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_durable_ts, - wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, uint64_t rle, uint64_t v) +__wt_cell_pack_copy( + WT_SESSION_IMPL *session, WT_CELL *cell, WT_TIME_WINDOW *tw, uint64_t rle, uint64_t v) { uint8_t *p; @@ -445,8 +417,7 @@ __wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t star p = cell->__chunk; *p = '\0'; - __cell_pack_value_validity(session, &p, start_durable_ts, start_ts, start_txn, stop_durable_ts, - stop_ts, stop_txn, prepare); + __cell_pack_value_validity(session, &p, tw); if (rle < 2) cell->__chunk[0] |= WT_CELL_VALUE_COPY; /* Type */ @@ -466,9 +437,7 @@ __wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t star * Write a deleted value cell. 
*/ static inline size_t -__wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start_durable_ts, - wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_durable_ts, - wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle) +__wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell, WT_TIME_WINDOW *tw, uint64_t rle) { uint8_t *p; @@ -476,9 +445,8 @@ __wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start p = cell->__chunk; *p = '\0'; - /* FIXME-prepare-support: we should pass prepare value. */ - __cell_pack_value_validity(session, &p, start_durable_ts, start_ts, start_txn, stop_durable_ts, - stop_ts, stop_txn, false); + /* FIXME-WT-6124: we should set the time window prepare value. */ + __cell_pack_value_validity(session, &p, tw); if (rle < 2) cell->__chunk[0] |= WT_CELL_DEL; /* Type */ @@ -564,9 +532,7 @@ __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size) * Pack an overflow cell. */ static inline size_t -__wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type, - wt_timestamp_t durable_start_ts, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t durable_stop_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, +__wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type, WT_TIME_WINDOW *tw, uint64_t rle, size_t size) { uint8_t *p; @@ -578,12 +544,12 @@ __wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type, switch (type) { case WT_CELL_KEY_OVFL: case WT_CELL_KEY_OVFL_RM: + WT_ASSERT(session, tw == NULL); ++p; break; case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_OVFL_RM: - __cell_pack_value_validity(session, &p, durable_start_ts, start_ts, start_txn, - durable_stop_ts, stop_ts, stop_txn, prepare); + __cell_pack_value_validity(session, &p, tw); break; } @@ -739,26 +705,22 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CE { struct { uint64_t v; - wt_timestamp_t start_ts; - wt_timestamp_t 
durable_start_ts; - uint64_t start_txn; - wt_timestamp_t stop_ts; - wt_timestamp_t durable_stop_ts; - uint64_t stop_txn; + WT_TIME_WINDOW tw; uint32_t len; } copy; + WT_TIME_AGGREGATE *ta; + WT_TIME_WINDOW *tw; uint64_t v; const uint8_t *p; uint8_t flags; + bool copy_cell; + + copy_cell = false; + copy.len = 0; /* [-Wconditional-uninitialized] */ + copy.v = 0; /* [-Wconditional-uninitialized] */ - copy.v = 0; /* -Werror=maybe-uninitialized */ - copy.start_ts = WT_TS_NONE; - copy.durable_start_ts = WT_TS_NONE; - copy.start_txn = WT_TXN_NONE; - copy.stop_ts = WT_TS_MAX; - copy.durable_stop_ts = WT_TS_NONE; - copy.stop_txn = WT_TXN_MAX; - copy.len = 0; + tw = &unpack->tw; + ta = &unpack->ta; /* * The verification code specifies an end argument, a pointer to 1B past the end-of-page. In which @@ -789,18 +751,8 @@ restart: * following switch. All validity windows default to durability. */ unpack->v = 0; - unpack->durable_start_ts = WT_TS_NONE; - unpack->durable_stop_ts = WT_TS_NONE; - unpack->start_ts = WT_TS_NONE; - unpack->start_txn = WT_TXN_NONE; - unpack->stop_ts = WT_TS_MAX; - unpack->stop_txn = WT_TXN_MAX; - unpack->newest_start_durable_ts = WT_TS_NONE; - unpack->newest_stop_durable_ts = WT_TS_NONE; - unpack->oldest_start_ts = WT_TS_NONE; - unpack->oldest_start_txn = WT_TXN_NONE; - unpack->newest_stop_ts = WT_TS_MAX; - unpack->newest_stop_txn = WT_TXN_MAX; + __wt_time_window_init(&unpack->tw); + __wt_time_aggregate_init(&unpack->ta); unpack->raw = (uint8_t)__wt_cell_type_raw(cell); unpack->type = (uint8_t)__wt_cell_type(cell); unpack->flags = 0; @@ -852,39 +804,38 @@ restart: break; flags = *p++; /* skip second descriptor byte */ - if (LF_ISSET(WT_CELL_PREPARE)) + if (LF_ISSET(WT_CELL_PREPARE)) { F_SET(unpack, WT_CELL_UNPACK_PREPARE); + ta->prepare = 1; + } if (LF_ISSET(WT_CELL_TS_START)) - WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->oldest_start_ts)); + WT_RET( + __wt_vunpack_uint(&p, end == NULL ? 
0 : WT_PTRDIFF(end, p), &ta->oldest_start_ts)); if (LF_ISSET(WT_CELL_TXN_START)) - WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->oldest_start_txn)); + WT_RET( + __wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &ta->oldest_start_txn)); if (LF_ISSET(WT_CELL_TS_DURABLE_START)) { WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->newest_start_durable_ts)); - unpack->newest_start_durable_ts += unpack->oldest_start_ts; + &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &ta->newest_start_durable_ts)); + ta->newest_start_durable_ts += ta->oldest_start_ts; } if (LF_ISSET(WT_CELL_TS_STOP)) { WT_RET( - __wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->newest_stop_ts)); - unpack->newest_stop_ts += unpack->oldest_start_ts; + __wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &ta->newest_stop_ts)); + ta->newest_stop_ts += ta->oldest_start_ts; } if (LF_ISSET(WT_CELL_TXN_STOP)) { - WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->newest_stop_txn)); - unpack->newest_stop_txn += unpack->oldest_start_txn; + WT_RET( + __wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &ta->newest_stop_txn)); + ta->newest_stop_txn += ta->oldest_start_txn; } if (LF_ISSET(WT_CELL_TS_DURABLE_STOP)) { WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->newest_stop_durable_ts)); - unpack->newest_stop_durable_ts += unpack->newest_stop_ts; + &p, end == NULL ? 
0 : WT_PTRDIFF(end, p), &ta->newest_stop_durable_ts)); + ta->newest_stop_durable_ts += ta->newest_stop_ts; } - - __wt_check_addr_validity(session, unpack->newest_start_durable_ts, unpack->oldest_start_ts, - unpack->oldest_start_txn, unpack->newest_stop_durable_ts, unpack->newest_stop_ts, - unpack->newest_stop_txn); + __wt_check_addr_validity(session, ta); break; case WT_CELL_DEL: case WT_CELL_VALUE: @@ -895,38 +846,39 @@ restart: break; flags = *p++; /* skip second descriptor byte */ - if (LF_ISSET(WT_CELL_PREPARE)) + if (LF_ISSET(WT_CELL_PREPARE)) { F_SET(unpack, WT_CELL_UNPACK_PREPARE); + tw->prepare = 1; + } if (LF_ISSET(WT_CELL_TS_START)) - WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->start_ts)); + WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &tw->start_ts)); if (LF_ISSET(WT_CELL_TXN_START)) - WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->start_txn)); + WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &tw->start_txn)); if (LF_ISSET(WT_CELL_TS_DURABLE_START)) { - WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->durable_start_ts)); - unpack->durable_start_ts += unpack->start_ts; + WT_RET( + __wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &tw->durable_start_ts)); + tw->durable_start_ts += tw->start_ts; } else - unpack->durable_start_ts = unpack->start_ts; + tw->durable_start_ts = tw->start_ts; if (LF_ISSET(WT_CELL_TS_STOP)) { - WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->stop_ts)); - unpack->stop_ts += unpack->start_ts; + WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &tw->stop_ts)); + tw->stop_ts += tw->start_ts; } if (LF_ISSET(WT_CELL_TXN_STOP)) { - WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->stop_txn)); - unpack->stop_txn += unpack->start_txn; + WT_RET(__wt_vunpack_uint(&p, end == NULL ? 
0 : WT_PTRDIFF(end, p), &tw->stop_txn)); + tw->stop_txn += tw->start_txn; } if (LF_ISSET(WT_CELL_TS_DURABLE_STOP)) { - WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->durable_stop_ts)); - unpack->durable_stop_ts += unpack->stop_ts; - } else if (unpack->stop_ts != WT_TS_MAX) - unpack->durable_stop_ts = unpack->stop_ts; + WT_RET( + __wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &tw->durable_stop_ts)); + tw->durable_stop_ts += tw->stop_ts; + } else if (tw->stop_ts != WT_TS_MAX) + tw->durable_stop_ts = tw->stop_ts; else - unpack->durable_stop_ts = WT_TS_NONE; + tw->durable_stop_ts = WT_TS_NONE; - __cell_check_value_validity(session, unpack->durable_start_ts, unpack->start_ts, - unpack->start_txn, unpack->durable_stop_ts, unpack->stop_ts, unpack->stop_txn); + __cell_check_value_validity(session, tw); break; } @@ -943,19 +895,16 @@ restart: */ switch (unpack->raw) { case WT_CELL_VALUE_COPY: + copy_cell = true; + /* * The cell is followed by an offset to a cell written earlier in the page. Save/restore the - * length and RLE of this cell, we need the length to step through the set of cells on the - * page and this RLE is probably different from the RLE of the earlier cell. + * visibility window, length and RLE of this cell, we need the length to step through the + * set of cells on the page and the RLE and timestamp information are specific to this cell. */ + __wt_time_window_copy(&copy.tw, tw); WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v)); copy.v = unpack->v; - copy.start_ts = unpack->start_ts; - copy.durable_start_ts = unpack->durable_start_ts; - copy.start_txn = unpack->start_txn; - copy.stop_ts = unpack->stop_ts; - copy.durable_stop_ts = unpack->durable_stop_ts; - copy.stop_txn = unpack->stop_txn; copy.len = WT_PTRDIFF32(p, cell); cell = (WT_CELL *)((uint8_t *)cell - v); goto restart; @@ -1003,22 +952,17 @@ restart: return (WT_ERROR); /* Unknown cell type. 
*/ } -/* - * Check the original cell against the full cell length (this is a diagnostic as well, we may be - * copying the cell from the page and we need the right length). - */ done: + /* + * Check the original cell against the full cell length (this is a diagnostic as well, we may be + * copying the cell from the page and we need the right length). + */ WT_CELL_LEN_CHK(cell, unpack->__len); - if (copy.len != 0) { - unpack->raw = WT_CELL_VALUE_COPY; + if (copy_cell) { + __wt_time_window_copy(tw, &copy.tw); unpack->v = copy.v; - unpack->start_ts = copy.start_ts; - unpack->durable_start_ts = copy.durable_start_ts; - unpack->start_txn = copy.start_txn; - unpack->stop_ts = copy.stop_ts; - unpack->durable_stop_ts = copy.durable_stop_ts; - unpack->stop_txn = copy.stop_txn; unpack->__len = copy.len; + unpack->raw = WT_CELL_VALUE_COPY; } return (0); @@ -1032,6 +976,12 @@ static inline void __wt_cell_unpack_dsk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell, WT_CELL_UNPACK *unpack) { + WT_TIME_AGGREGATE *ta; + WT_TIME_WINDOW *tw; + + ta = &unpack->ta; + tw = &unpack->tw; + /* * Row-store doesn't store zero-length values on pages, but this allows us to pretend. */ @@ -1042,18 +992,8 @@ __wt_cell_unpack_dsk( * If there isn't any value validity window (which is what it will take to get to a * zero-length item), the value must be stable. 
*/ - unpack->durable_start_ts = WT_TS_NONE; - unpack->durable_stop_ts = WT_TS_NONE; - unpack->start_ts = WT_TS_NONE; - unpack->start_txn = WT_TXN_NONE; - unpack->stop_ts = WT_TS_MAX; - unpack->stop_txn = WT_TXN_MAX; - unpack->newest_start_durable_ts = WT_TS_NONE; - unpack->newest_stop_durable_ts = WT_TS_NONE; - unpack->oldest_start_ts = WT_TS_NONE; - unpack->oldest_start_txn = WT_TXN_NONE; - unpack->newest_stop_ts = WT_TS_MAX; - unpack->newest_stop_txn = WT_TXN_MAX; + __wt_time_window_init(tw); + __wt_time_aggregate_init(ta); unpack->data = ""; unpack->size = 0; unpack->__len = 0; @@ -1081,30 +1021,30 @@ __wt_cell_unpack_dsk( * Previous startup txnid=0, ts=y txnid=0, ts=WT_TS_NONE txnid=MAX, ts=MAX */ if (dsk->write_gen > 0 && dsk->write_gen <= S2C(session)->base_write_gen) { - /* FIXME-prepare-support: deal with durable timestamps. */ + /* FIXME-WT-6124: deal with durable timestamps. */ /* Tell reconciliation we cleared the transaction ids and the cell needs to be rebuilt. */ - if (unpack->start_txn != WT_TXN_NONE) { - unpack->start_txn = WT_TXN_NONE; - F_SET(unpack, WT_CELL_UNPACK_TIME_PAIRS_CLEARED); + if (tw->start_txn != WT_TXN_NONE) { + tw->start_txn = WT_TXN_NONE; + F_SET(unpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); } - if (unpack->stop_txn != WT_TXN_MAX) { - unpack->stop_txn = WT_TXN_NONE; - F_SET(unpack, WT_CELL_UNPACK_TIME_PAIRS_CLEARED); - if (unpack->stop_ts == WT_TS_MAX) - unpack->stop_ts = WT_TS_NONE; + if (tw->stop_txn != WT_TXN_MAX) { + tw->stop_txn = WT_TXN_NONE; + F_SET(unpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); + if (tw->stop_ts == WT_TS_MAX) + tw->stop_ts = WT_TS_NONE; } else - WT_ASSERT(session, unpack->stop_ts == WT_TS_MAX); - if (unpack->oldest_start_txn != WT_TXN_NONE) { - unpack->oldest_start_txn = WT_TXN_NONE; - F_SET(unpack, WT_CELL_UNPACK_TIME_PAIRS_CLEARED); + WT_ASSERT(session, tw->stop_ts == WT_TS_MAX); + if (ta->oldest_start_txn != WT_TXN_NONE) { + ta->oldest_start_txn = WT_TXN_NONE; + F_SET(unpack, 
WT_CELL_UNPACK_TIME_WINDOW_CLEARED); } - if (unpack->newest_stop_txn != WT_TXN_MAX) { - unpack->newest_stop_txn = WT_TXN_NONE; - F_SET(unpack, WT_CELL_UNPACK_TIME_PAIRS_CLEARED); - if (unpack->newest_stop_ts == WT_TS_MAX) - unpack->newest_stop_ts = WT_TS_NONE; + if (ta->newest_stop_txn != WT_TXN_MAX) { + ta->newest_stop_txn = WT_TXN_NONE; + F_SET(unpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED); + if (ta->newest_stop_ts == WT_TS_MAX) + ta->newest_stop_ts = WT_TS_NONE; } else - WT_ASSERT(session, unpack->newest_stop_ts == WT_TS_MAX); + WT_ASSERT(session, ta->newest_stop_ts == WT_TS_MAX); } } diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index f6bd37c5124..547feaa54a3 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -150,10 +150,10 @@ struct __wt_named_extractor { * WT_CONN_HOTBACKUP_START -- * Macro to set connection data appropriately for when we commence hot backup. 
*/ -#define WT_CONN_HOTBACKUP_START(conn) \ - do { \ - (conn)->hot_backup = true; \ - (conn)->hot_backup_list = NULL; \ +#define WT_CONN_HOTBACKUP_START(conn) \ + do { \ + (conn)->hot_backup_start = (conn)->ckpt_finish_secs; \ + (conn)->hot_backup_list = NULL; \ } while (0) /* @@ -269,13 +269,14 @@ struct __wt_connection_impl { WT_TXN_GLOBAL txn_global; /* Global transaction state */ WT_RWLOCK hot_backup_lock; /* Hot backup serialization */ - bool hot_backup; /* Hot backup in progress */ + uint64_t hot_backup_start; /* Clock value of most recent checkpoint needed by hot backup */ char **hot_backup_list; /* Hot backup file list */ WT_SESSION_IMPL *ckpt_session; /* Checkpoint thread session */ wt_thread_t ckpt_tid; /* Checkpoint thread */ bool ckpt_tid_set; /* Checkpoint thread set */ WT_CONDVAR *ckpt_cond; /* Checkpoint wait mutex */ + uint64_t ckpt_finish_secs; /* Clock value of last completed checkpoint */ #define WT_CKPT_LOGSIZE(conn) ((conn)->ckpt_logsize != 0) wt_off_t ckpt_logsize; /* Checkpoint log size period */ bool ckpt_signalled; /* Checkpoint signalled */ diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index dfe6d209357..522a031630e 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -6,6 +6,9 @@ * See the file LICENSE for redistribution information. */ +/* Get the session from any cursor. */ +#define CUR2S(c) ((WT_SESSION_IMPL *)((WT_CURSOR *)c)->session) + /* * Initialize a static WT_CURSOR structure. */ @@ -178,7 +181,10 @@ struct __wt_cursor_btree { * The update structure allocated by the row- and column-store modify functions, used to avoid a * data copy in the WT_CURSOR.update call. */ - WT_UPDATE *modify_update; + WT_UPDATE_VALUE *modify_update, _modify_update; + + /* An intermediate structure to hold the update value to be assigned to the cursor buffer. 
*/ + WT_UPDATE_VALUE *upd_value, _upd_value; /* * Fixed-length column-store items are a single byte, and it's simpler and cheaper to allocate diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 14de00f80c3..81cc28feb08 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -24,7 +24,7 @@ __cursor_set_recno(WT_CURSOR_BTREE *cbt, uint64_t v) static inline int __cursor_copy_release(WT_CURSOR *cursor) { - if (F_ISSET(S2C((WT_SESSION_IMPL *)cursor->session), WT_CONN_DEBUG_CURSOR_COPY)) { + if (F_ISSET(S2C(CUR2S(cursor)), WT_CONN_DEBUG_CURSOR_COPY)) { if (F_ISSET(cursor, WT_CURSTD_DEBUG_COPY_KEY)) { WT_RET(__wt_cursor_copy_release_item(cursor, &cursor->key)); F_CLR(cursor, WT_CURSTD_DEBUG_COPY_KEY); @@ -77,8 +77,7 @@ __cursor_localkey(WT_CURSOR *cursor) { if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) { if (!WT_DATA_IN_ITEM(&cursor->key)) - WT_RET(__wt_buf_set((WT_SESSION_IMPL *)cursor->session, &cursor->key, cursor->key.data, - cursor->key.size)); + WT_RET(__wt_buf_set(CUR2S(cursor), &cursor->key, cursor->key.data, cursor->key.size)); F_CLR(cursor, WT_CURSTD_KEY_INT); F_SET(cursor, WT_CURSTD_KEY_EXT); } @@ -94,8 +93,8 @@ __cursor_localvalue(WT_CURSOR *cursor) { if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) { if (!WT_DATA_IN_ITEM(&cursor->value)) - WT_RET(__wt_buf_set((WT_SESSION_IMPL *)cursor->session, &cursor->value, - cursor->value.data, cursor->value.size)); + WT_RET( + __wt_buf_set(CUR2S(cursor), &cursor->value, cursor->value.data, cursor->value.size)); F_CLR(cursor, WT_CURSTD_VALUE_INT); F_SET(cursor, WT_CURSTD_VALUE_EXT); } @@ -193,7 +192,7 @@ __cursor_reset(WT_CURSOR_BTREE *cbt) WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); __cursor_pos_clear(cbt); @@ -242,7 +241,7 @@ __wt_curindex_get_valuev(WT_CURSOR *cursor, va_list ap) WT_SESSION_IMPL *session; cindex = (WT_CURSOR_INDEX 
*)cursor; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); WT_RET(__cursor_checkvalue(cursor)); if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { @@ -269,7 +268,7 @@ __wt_curtable_get_valuev(WT_CURSOR *cursor, va_list ap) WT_SESSION_IMPL *session; ctable = (WT_CURSOR_TABLE *)cursor; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); primary = *ctable->cg_cursors; WT_RET(__cursor_checkvalue(primary)); @@ -354,10 +353,10 @@ __wt_cursor_disable_bulk(WT_SESSION_IMPL *session) * Return a page referenced key/value pair to the application. */ static inline int -__cursor_kv_return(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +__cursor_kv_return(WT_CURSOR_BTREE *cbt, WT_UPDATE_VALUE *upd_value) { WT_RET(__wt_key_return(cbt)); - WT_RET(__wt_value_return(cbt, upd)); + WT_RET(__wt_value_return(cbt, upd_value)); return (0); } @@ -371,7 +370,7 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) { WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); if (reenter) { #ifdef HAVE_DIAGNOSTIC @@ -421,7 +420,7 @@ __cursor_row_slot_key_return( *kpack_used = false; - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); btree = S2BT(session); page = cbt->ref->page; diff --git a/src/third_party/wiredtiger/src/include/error.h b/src/third_party/wiredtiger/src/include/error.h index 04a1fd3c5ae..76c644a2850 100644 --- a/src/third_party/wiredtiger/src/include/error.h +++ b/src/third_party/wiredtiger/src/include/error.h @@ -22,6 +22,8 @@ #define __wt_err(session, error, ...) __wt_err_func(session, error, __func__, __LINE__, __VA_ARGS__) #define __wt_errx(session, ...) __wt_errx_func(session, __func__, __LINE__, __VA_ARGS__) +#define __wt_panic(session, error, ...) \ + __wt_panic_func(session, error, __func__, __LINE__, __VA_ARGS__) #define __wt_set_return(session, error) __wt_set_return_func(session, __func__, __LINE__, error) /* Set "ret" and branch-to-err-label tests. 
*/ @@ -47,6 +49,9 @@ #define WT_ERR_ERROR_OK(a, e, keep) WT_ERR_TEST((ret = (a)) != 0 && ret != (e), ret, keep) #define WT_ERR_NOTFOUND_OK(a, keep) WT_ERR_ERROR_OK(a, WT_NOTFOUND, keep) +/* Return WT_PANIC regardless of earlier return codes. */ +#define WT_ERR_PANIC(session, v, ...) WT_ERR(__wt_panic(session, v, __VA_ARGS__)) + /* Return tests. */ #define WT_RET(a) \ do { \ @@ -99,27 +104,13 @@ #define WT_TRET_BUSY_OK(a) WT_TRET_ERROR_OK(a, EBUSY) #define WT_TRET_NOTFOUND_OK(a) WT_TRET_ERROR_OK(a, WT_NOTFOUND) -/* Called on unexpected code path: locate the failure. */ -#define __wt_illegal_value(session, v) \ - __wt_illegal_value_func(session, (uintmax_t)(v), __func__, __LINE__) +/* Return WT_PANIC regardless of earlier return codes. */ +#define WT_RET_PANIC(session, v, ...) return (__wt_panic(session, v, __VA_ARGS__)) -#define WT_PANIC_MSG(session, v, ...) \ - do { \ - __wt_err(session, v, __VA_ARGS__); \ - WT_IGNORE_RET(__wt_panic(session)); \ - } while (0) -#define WT_PANIC_ERR(session, v, ...) \ - do { \ - WT_PANIC_MSG(session, v, __VA_ARGS__); \ - /* Return WT_PANIC regardless of earlier return codes. */ \ - WT_ERR(WT_PANIC); \ - } while (0) -#define WT_PANIC_RET(session, v, ...) \ - do { \ - WT_PANIC_MSG(session, v, __VA_ARGS__); \ - /* Return WT_PANIC regardless of earlier return codes. */ \ - return (WT_PANIC); \ - } while (0) +/* Called on unexpected code path: locate the failure. */ +#define __wt_illegal_value(session, v) \ + __wt_panic(session, EINVAL, "%s: 0x%" PRIxMAX, \ + "encountered an illegal file format or internal value", (uintmax_t)(v)) /* * WT_ERR_ASSERT, WT_RET_ASSERT, WT_ASSERT @@ -149,6 +140,13 @@ __wt_abort(session); \ } \ } while (0) +#define WT_RET_PANIC_ASSERT(session, exp, v, ...) \ + do { \ + if (!(exp)) { \ + __wt_err(session, v, __VA_ARGS__); \ + __wt_abort(session); \ + } \ + } while (0) #else #define WT_ASSERT(session, exp) WT_UNUSED(session) #define WT_ERR_ASSERT(session, exp, v, ...) 
\ @@ -161,6 +159,11 @@ if (!(exp)) \ WT_RET_MSG(session, v, __VA_ARGS__); \ } while (0) +#define WT_RET_PANIC_ASSERT(session, exp, v, ...) \ + do { \ + if (!(exp)) \ + WT_RET_PANIC(session, v, __VA_ARGS__); \ + } while (0) #endif /* diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index a3e71435037..a5dfb85bf89 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -34,8 +34,12 @@ extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern char *__wt_time_aggregate_to_string(WT_TIME_AGGREGATE *ta, char *ta_string) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern char *__wt_time_pair_to_string(wt_timestamp_t timestamp, uint64_t txn_id, char *tp_string) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern char *__wt_time_window_to_string(WT_TIME_WINDOW *tw, char *tw_string) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern char *__wt_timestamp_to_string(wt_timestamp_t ts, char *ts_string) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern const WT_CONFIG_ENTRY *__wt_conn_config_match(const char *method) @@ -184,8 +188,6 @@ extern int __wt_block_off_remove_overlap(WT_SESSION_IMPL *session, WT_BLOCK *blo extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BLOCK **blockp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_block_panic(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t checksum) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -280,7 +282,7 @@ extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((wa extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_btcur_search_uncommitted(WT_CURSOR *cursor, WT_UPDATE **updp) +extern int __wt_btcur_search_prepared(WT_CURSOR *cursor, WT_UPDATE **updp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btree_close(WT_SESSION_IMPL *session) @@ -559,8 +561,8 @@ extern int __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, WT_UPDATE **updp, - bool *valid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, bool *valid) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curstat_colgroup_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curstat_index_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], @@ -720,8 +722,8 @@ extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, char **path extern int __wt_filename_construct(WT_SESSION_IMPL *session, const char *path, const char *file_prefix, uintmax_t id_1, uint32_t id_2, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, - WT_UPDATE **updp, bool allow_prepare, 
WT_ITEM *on_disk_buf) +extern int __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format, + uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_fopen(WT_SESSION_IMPL *session, const char *name, uint32_t open_flags, uint32_t flags, WT_FSTREAM **fstrp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -762,8 +764,8 @@ extern int __wt_hs_delete_key(WT_SESSION_IMPL *session, uint32_t btree_id, const WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_hs_insert_updates(WT_CURSOR *cursor, WT_BTREE *btree, WT_PAGE *page, - WT_MULTI *multi) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_modify(WT_CURSOR_BTREE *hs_cbt, WT_UPDATE *hs_upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, @@ -772,9 +774,6 @@ extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, cons size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_huffman_open(WT_SESSION_IMPL *session, void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_illegal_value_func(WT_SESSION_IMPL *session, uintmax_t v, const char *func, - int line) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_import(WT_SESSION_IMPL *session, const char *uri) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int 
__wt_inmem_unsupported_op(WT_SESSION_IMPL *session, const char *tag) @@ -1087,15 +1086,15 @@ extern int __wt_metadata_update(WT_SESSION_IMPL *session, const char *key, const WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_metadata_update_base_write_gen(WT_SESSION_IMPL *session, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_modify_apply(WT_CURSOR *cursor, const void *modify) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_modify_apply_api(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_modify_apply_item(WT_SESSION_IMPL *session, WT_ITEM *value, const void *modify, - bool sformat) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_modify_apply_item(WT_SESSION_IMPL *session, const char *value_format, + WT_ITEM *value, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_modify_pack(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries, WT_ITEM **modifyp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_modify_reconstruct_from_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, + WT_UPDATE *upd, WT_UPDATE_VALUE *upd_value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_modify_vector_push(WT_MODIFY_VECTOR *modifies, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) 
@@ -1157,9 +1156,11 @@ extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_panic(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) - WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_panic_func( + WT_SESSION_IMPL *session, int error, const char *func, int line, const char *fmt, ...) + WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format(printf, 5, 6))) + WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) @@ -1177,9 +1178,7 @@ extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocate extern int __wt_realloc_noclear(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv, - uint8_t type, wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, - uint64_t rle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + uint8_t type, WT_TIME_WINDOW *tw, uint64_t rle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rec_child_modify(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, bool *hazardp, WT_CHILD_STATE *statep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) @@ 
-1484,7 +1483,7 @@ extern int __wt_txn_query_timestamp(WT_SESSION_IMPL *session, char *hex_timestam const char *cfg[], bool global_txn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_reconfigure(WT_SESSION_IMPL *session, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_txn_recover(WT_SESSION_IMPL *session) +extern int __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -1513,12 +1512,10 @@ extern int __wt_unexpected_object_type( WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_value_return(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +extern int __wt_value_return(WT_CURSOR_BTREE *cbt, WT_UPDATE_VALUE *upd_value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_value_return_buf(WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_ITEM *buf, - WT_TIME_PAIR *start, WT_TIME_PAIR *stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_value_return_upd(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + WT_TIME_WINDOW *tw) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) @@ -1591,6 +1588,7 @@ extern void __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci) extern void __wt_block_configure_first_fit(WT_BLOCK *block, bool on); extern void __wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext); extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el); +extern void __wt_block_set_readonly(WT_SESSION_IMPL *session) 
WT_GCC_FUNC_DECL_ATTRIBUTE((cold)); extern void __wt_block_size_free(WT_SESSION_IMPL *session, WT_SIZE *sz); extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats); extern void __wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash); @@ -1604,9 +1602,7 @@ extern void __wt_cache_stats_update(WT_SESSION_IMPL *session); extern void __wt_capacity_throttle(WT_SESSION_IMPL *session, uint64_t bytes, WT_THROTTLE_TYPE type); extern void __wt_checkpoint_progress(WT_SESSION_IMPL *session, bool closing); extern void __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize); -extern void __wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, - wt_timestamp_t start_durable_ts, wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, - wt_timestamp_t stop_durable_ts, wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn); +extern void __wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta); extern void __wt_ckpt_verbose(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *tag, const char *ckpt_name, const uint8_t *ckpt_string); extern void __wt_cond_auto_wait( @@ -1707,12 +1703,9 @@ extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_random_init_seed(WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); -extern void __wt_read_cell_time_pairs( - WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_TIME_PAIR *start, WT_TIME_PAIR *stop); -extern void __wt_read_col_time_pairs( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, WT_TIME_PAIR *start, WT_TIME_PAIR *stop); -extern void __wt_read_row_time_pairs( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_TIME_PAIR *start, WT_TIME_PAIR *stop); +extern void __wt_read_cell_time_window(WT_CURSOR_BTREE *cbt, WT_REF *ref, WT_TIME_WINDOW *tw); +extern void __wt_read_row_time_window( + 
WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_TIME_WINDOW *tw); extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r); @@ -1833,6 +1826,14 @@ static inline bool __wt_session_can_wait(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_split_descent_race(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX *saved_pindex) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline bool __wt_time_aggregate_is_empty(WT_TIME_AGGREGATE *ta) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline bool __wt_time_window_is_empty(WT_TIME_WINDOW *tw) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline bool __wt_time_windows_equal(WT_TIME_WINDOW *tw1, WT_TIME_WINDOW *tw2) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline bool __wt_txn_upd_value_visible_all(WT_SESSION_IMPL *session, + WT_UPDATE_VALUE *upd_value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_txn_upd_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd) @@ -1844,7 +1845,7 @@ static inline bool __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id, static inline double __wt_eviction_dirty_target(WT_CACHE *cache) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_bt_col_var_cursor_walk_txn_read(WT_SESSION_IMPL *session, - WT_CURSOR_BTREE *cbt, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_COL *cip, WT_UPDATE **updp) + WT_CURSOR_BTREE *cbt, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_COL *cip) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_btree_block_free(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -1951,13 +1952,11 @@ static inline int __wt_page_swap_func( static inline int __wt_read(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r, - const void *data, size_t size, wt_timestamp_t durable_start_ts, wt_timestamp_t start_ts, - uint64_t start_txn, wt_timestamp_t durable_stop_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, - bool prepare, uint64_t rle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline int __wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, - wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, - uint64_t rle, WT_REC_KV *val) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + const void *data, size_t size, WT_TIME_WINDOW *tw, uint64_t rle) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline int __wt_rec_dict_replace( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_TIME_WINDOW *tw, uint64_t rle, WT_REC_KV *val) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_row_leaf_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, @@ -2008,10 +2007,10 @@ static inline int __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF * static inline int __wt_txn_op_set_key(WT_SESSION_IMPL *session, const WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, - uint64_t recno, WT_UPDATE *upd, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline int 
__wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **updp) + uint64_t recno, WT_UPDATE *upd, WT_CELL_UNPACK *vpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline int __wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, + WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_txn_search_check(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_txn_update_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, @@ -2048,30 +2047,20 @@ static inline int __wt_vunpack_uint(const uint8_t **pp, size_t maxlen, uint64_t static inline int __wt_write(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, const void *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, - uint64_t recno, wt_timestamp_t start_durable_ts, wt_timestamp_t oldest_start_ts, - uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts, wt_timestamp_t newest_stop_ts, - uint64_t newest_stop_txn, bool prepare, size_t size) + uint64_t recno, WT_TIME_AGGREGATE *ta, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell, - wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, - uint64_t rle, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline size_t __wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell, - wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + WT_TIME_WINDOW *tw, uint64_t rle, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline 
size_t __wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell, WT_TIME_WINDOW *tw, + uint64_t rle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_cell_pack_int_key(WT_CELL *cell, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type, - wt_timestamp_t durable_start_ts, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t durable_stop_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, - uint64_t rle, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + WT_TIME_WINDOW *tw, uint64_t rle, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_cell_pack_value(WT_SESSION_IMPL *session, WT_CELL *cell, - wt_timestamp_t durable_start_ts, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t durable_stop_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, - uint64_t rle, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + WT_TIME_WINDOW *tw, uint64_t rle, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_cell_total_len(WT_CELL_UNPACK *unpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline size_t __wt_strnlen(const char *s, size_t maxlen) @@ -2151,9 +2140,7 @@ static inline void __wt_cell_unpack( WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, WT_CELL_UNPACK *unpack); static inline void __wt_cell_unpack_dsk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell, WT_CELL_UNPACK *unpack); -static inline void __wt_check_addr_validity(WT_SESSION_IMPL *session, - wt_timestamp_t start_durable_ts, wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, - wt_timestamp_t stop_durable_ts, wt_timestamp_t newest_stop_ts, uint64_t 
newest_stop_txn); +static inline void __wt_check_addr_validity(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta); static inline void __wt_cond_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *)); static inline void __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session); @@ -2166,12 +2153,7 @@ static inline void __wt_page_evict_soon(WT_SESSION_IMPL *session, WT_REF *ref); static inline void __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page); static inline void __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page); static inline void __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page); -static inline void __wt_rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *start_durable_tsp, - wt_timestamp_t *oldest_start_tsp, uint64_t *oldest_start_txnp, wt_timestamp_t *stop_durable_tsp, - wt_timestamp_t *newest_stop_tsp, uint64_t *newest_stop_txnp, bool *preparep); -static inline void __wt_rec_addr_ts_update(WT_RECONCILE *r, wt_timestamp_t start_durable_ts, - wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts, - wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn, bool prepare); +static inline void __wt_rec_addr_ts_init(WT_RECONCILE *r, WT_TIME_AGGREGATE *ta); static inline void __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *addr, WT_CELL_UNPACK *vpack, bool proxy_cell, uint64_t recno); static inline void __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv); @@ -2194,6 +2176,16 @@ static inline void __wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t); static inline void __wt_spin_lock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t); static inline void __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t); static inline void __wt_struct_size_adjust(WT_SESSION_IMPL *session, size_t *sizep); +static inline void __wt_time_aggregate_copy(WT_TIME_AGGREGATE *dest, WT_TIME_AGGREGATE *source); 
+static inline void __wt_time_aggregate_init(WT_TIME_AGGREGATE *ta); +static inline void __wt_time_aggregate_init_max(WT_TIME_AGGREGATE *ta); +static inline void __wt_time_aggregate_merge(WT_TIME_AGGREGATE *dest, WT_TIME_AGGREGATE *source); +static inline void __wt_time_aggregate_update(WT_TIME_AGGREGATE *ta, WT_TIME_WINDOW *tw); +static inline void __wt_time_window_copy(WT_TIME_WINDOW *dest, WT_TIME_WINDOW *source); +static inline void __wt_time_window_init(WT_TIME_WINDOW *tw); +static inline void __wt_time_window_init_max(WT_TIME_WINDOW *tw); +static inline void __wt_time_window_set_start(WT_TIME_WINDOW *tw, WT_UPDATE *upd); +static inline void __wt_time_window_set_stop(WT_TIME_WINDOW *tw, WT_UPDATE *upd); static inline void __wt_timing_stress(WT_SESSION_IMPL *session, u_int flag); static inline void __wt_tree_modify_set(WT_SESSION_IMPL *session); static inline void __wt_txn_cursor_op(WT_SESSION_IMPL *session); @@ -2208,3 +2200,5 @@ static inline void __wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timest static inline void __wt_txn_read_last(WT_SESSION_IMPL *session); static inline void __wt_txn_timestamp_flags(WT_SESSION_IMPL *session); static inline void __wt_txn_unmodify(WT_SESSION_IMPL *session); +static inline void __wt_upd_value_assign(WT_UPDATE_VALUE *upd_value, WT_UPDATE *upd); +static inline void __wt_upd_value_clear(WT_UPDATE_VALUE *upd_value); diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h index 9b0ae3c4a72..f9160c6b28c 100644 --- a/src/third_party/wiredtiger/src/include/meta.h +++ b/src/third_party/wiredtiger/src/include/meta.h @@ -133,13 +133,7 @@ struct __wt_ckpt { WT_BLOCK_MODS backup_blocks[WT_BLKINCR_MAX]; - /* Validity window */ - wt_timestamp_t start_durable_ts; - wt_timestamp_t oldest_start_ts; - uint64_t oldest_start_txn; - wt_timestamp_t stop_durable_ts; - wt_timestamp_t newest_stop_ts; - uint64_t newest_stop_txn; + WT_TIME_AGGREGATE ta; /* Validity window */ WT_ITEM addr; 
/* Checkpoint cookie string */ WT_ITEM raw; /* Checkpoint cookie raw */ diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i index e2b08cd088b..a75c421f882 100644 --- a/src/third_party/wiredtiger/src/include/mutex.i +++ b/src/third_party/wiredtiger/src/include/mutex.i @@ -168,7 +168,7 @@ __wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) WT_DECL_RET; if ((ret = pthread_mutex_lock(&t->lock)) != 0) - WT_PANIC_MSG(session, ret, "pthread_mutex_lock: %s", t->name); + WT_IGNORE_RET(__wt_panic(session, ret, "pthread_mutex_lock: %s", t->name)); } #endif @@ -182,7 +182,7 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) WT_DECL_RET; if ((ret = pthread_mutex_unlock(&t->lock)) != 0) - WT_PANIC_MSG(session, ret, "pthread_mutex_unlock: %s", t->name); + WT_IGNORE_RET(__wt_panic(session, ret, "pthread_mutex_unlock: %s", t->name)); } #elif SPINLOCK_TYPE == SPINLOCK_MSVC diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index 43581c7cc1f..647c015e26e 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -111,24 +111,13 @@ struct __wt_reconcile { uint32_t entries; uint64_t recno; WT_ITEM key; - wt_timestamp_t newest_start_durable_ts; - wt_timestamp_t oldest_start_ts; - uint64_t oldest_start_txn; - wt_timestamp_t newest_stop_durable_ts; - wt_timestamp_t newest_stop_ts; - uint64_t newest_stop_txn; - bool prepare; + WT_TIME_AGGREGATE ta; /* Saved minimum split-size boundary information. 
*/ uint32_t min_entries; uint64_t min_recno; WT_ITEM min_key; - wt_timestamp_t min_newest_start_durable_ts; - wt_timestamp_t min_oldest_start_ts; - uint64_t min_oldest_start_txn; - wt_timestamp_t min_newest_stop_durable_ts; - wt_timestamp_t min_newest_stop_ts; - uint64_t min_newest_stop_txn; + WT_TIME_AGGREGATE ta_min; size_t min_offset; /* byte offset */ @@ -241,13 +230,7 @@ struct __wt_reconcile { typedef struct { WT_UPDATE *upd; /* Update to write (or NULL) */ - wt_timestamp_t start_durable_ts; /* Transaction IDs, timestamps */ - wt_timestamp_t start_ts; - uint64_t start_txn; - wt_timestamp_t stop_durable_ts; - wt_timestamp_t stop_ts; - uint64_t stop_txn; - bool prepare; + WT_TIME_WINDOW tw; } WT_UPDATE_SELECT; /* diff --git a/src/third_party/wiredtiger/src/include/reconcile.i b/src/third_party/wiredtiger/src/include/reconcile.i index a65daaf9715..3f9339a81ab 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.i +++ b/src/third_party/wiredtiger/src/include/reconcile.i @@ -43,9 +43,7 @@ __wt_rec_need_split(WT_RECONCILE *r, size_t len) * Initialize an address timestamp triplet. */ static inline void -__wt_rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *start_durable_tsp, - wt_timestamp_t *oldest_start_tsp, uint64_t *oldest_start_txnp, wt_timestamp_t *stop_durable_tsp, - wt_timestamp_t *newest_stop_tsp, uint64_t *newest_stop_txnp, bool *preparep) +__wt_rec_addr_ts_init(WT_RECONCILE *r, WT_TIME_AGGREGATE *ta) { /* * If the page is not fixed-length column-store, where we don't maintain timestamps at all, set @@ -53,40 +51,10 @@ __wt_rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *start_durable_tsp, * corrected as we process key/value items. Otherwise, set the oldest/newest timestamps to * simple durability. 
*/ - *start_durable_tsp = WT_TS_NONE; - *oldest_start_tsp = WT_TS_MAX; - *oldest_start_txnp = WT_TXN_MAX; - *stop_durable_tsp = WT_TS_NONE; - *newest_stop_tsp = WT_TS_NONE; - *newest_stop_txnp = WT_TXN_NONE; - *preparep = false; - if (r->page->type == WT_PAGE_COL_FIX) { - *oldest_start_tsp = WT_TS_NONE; - *oldest_start_txnp = WT_TXN_NONE; - *newest_stop_tsp = WT_TS_MAX; - *newest_stop_txnp = WT_TXN_MAX; - } -} - -/* - * __wt_rec_addr_ts_update -- - * Update the chunk's timestamp information. - */ -static inline void -__wt_rec_addr_ts_update(WT_RECONCILE *r, wt_timestamp_t start_durable_ts, - wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts, - wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn, bool prepare) -{ - r->cur_ptr->newest_start_durable_ts = - WT_MAX(start_durable_ts, r->cur_ptr->newest_start_durable_ts); - r->cur_ptr->oldest_start_ts = WT_MIN(oldest_start_ts, r->cur_ptr->oldest_start_ts); - r->cur_ptr->oldest_start_txn = WT_MIN(oldest_start_txn, r->cur_ptr->oldest_start_txn); - r->cur_ptr->newest_stop_durable_ts = - WT_MAX(stop_durable_ts, r->cur_ptr->newest_stop_durable_ts); - r->cur_ptr->newest_stop_ts = WT_MAX(newest_stop_ts, r->cur_ptr->newest_stop_ts); - r->cur_ptr->newest_stop_txn = WT_MAX(newest_stop_txn, r->cur_ptr->newest_stop_txn); - if (prepare) - r->cur_ptr->prepare = true; + if (r->page->type == WT_PAGE_COL_FIX) + __wt_time_aggregate_init(ta); + else + __wt_time_aggregate_init_max(ta); } /* @@ -201,17 +169,13 @@ __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *add val->buf.data = addr->addr; val->buf.size = addr->size; val->cell_len = - __wt_cell_pack_addr(session, &val->cell, cell_type, recno, addr->newest_start_durable_ts, - addr->oldest_start_ts, addr->oldest_start_txn, addr->newest_stop_durable_ts, - addr->newest_stop_ts, addr->newest_stop_txn, addr->prepare, val->buf.size); + __wt_cell_pack_addr(session, &val->cell, cell_type, recno, &addr->ta, val->buf.size); } 
else { WT_ASSERT(session, addr == NULL); val->buf.data = vpack->data; val->buf.size = vpack->size; - val->cell_len = __wt_cell_pack_addr(session, &val->cell, cell_type, recno, - vpack->newest_start_durable_ts, vpack->oldest_start_ts, vpack->oldest_start_txn, - vpack->newest_stop_durable_ts, vpack->newest_stop_ts, vpack->newest_stop_txn, - F_ISSET(vpack, WT_CELL_UNPACK_PREPARE), val->buf.size); + val->cell_len = + __wt_cell_pack_addr(session, &val->cell, cell_type, recno, &vpack->ta, val->buf.size); } val->len = val->cell_len + val->buf.size; @@ -223,9 +187,7 @@ __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *add */ static inline int __wt_rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r, const void *data, size_t size, - wt_timestamp_t durable_start_ts, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t durable_stop_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, - uint64_t rle) + WT_TIME_WINDOW *tw, uint64_t rle) { WT_BTREE *btree; WT_REC_KV *val; @@ -251,13 +213,13 @@ __wt_rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r, const void *d if (val->buf.size > btree->maxleafvalue) { WT_STAT_DATA_INCR(session, rec_overflow_value); - return (__wt_rec_cell_build_ovfl(session, r, val, WT_CELL_VALUE_OVFL, durable_start_ts, - start_ts, start_txn, durable_stop_ts, stop_ts, stop_txn, prepare, rle)); + return (__wt_rec_cell_build_ovfl(session, r, val, WT_CELL_VALUE_OVFL, tw, rle)); } } + if (tw->prepare) + WT_STAT_DATA_INCR(session, rec_prepare_value); - val->cell_len = __wt_cell_pack_value(session, &val->cell, durable_start_ts, start_ts, start_txn, - durable_stop_ts, stop_ts, stop_txn, prepare, rle, val->buf.size); + val->cell_len = __wt_cell_pack_value(session, &val->cell, tw, rle, val->buf.size); val->len = val->cell_len + val->buf.size; return (0); @@ -268,9 +230,8 @@ __wt_rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r, const void *d * Check for a dictionary match. 
*/ static inline int -__wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t start_durable_ts, - wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_durable_ts, - wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, uint64_t rle, WT_REC_KV *val) +__wt_rec_dict_replace( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_TIME_WINDOW *tw, uint64_t rle, WT_REC_KV *val) { WT_REC_DICTIONARY *dp; uint64_t offset; @@ -306,8 +267,7 @@ __wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t * offset from the beginning of the page. */ offset = (uint64_t)WT_PTRDIFF(r->first_free, (uint8_t *)r->cur_ptr->image.mem + dp->offset); - val->len = val->cell_len = __wt_cell_pack_copy(session, &val->cell, start_durable_ts, - start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, prepare, rle, offset); + val->len = val->cell_len = __wt_cell_pack_copy(session, &val->cell, tw, rle, offset); val->buf.data = NULL; val->buf.size = 0; } diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h index 69d948e70fd..83827896f43 100644 --- a/src/third_party/wiredtiger/src/include/schema.h +++ b/src/third_party/wiredtiger/src/include/schema.h @@ -264,7 +264,7 @@ struct __wt_table { if ((skipp) != (bool *)NULL) \ *(bool *)(skipp) = true; \ if (F_ISSET(session, WT_SESSION_LOCKED_HOTBACKUP)) { \ - if (!__conn->hot_backup) { \ + if (__conn->hot_backup_start == 0) { \ if ((skipp) != (bool *)NULL) \ *(bool *)(skipp) = false; \ op; \ @@ -272,7 +272,7 @@ struct __wt_table { } else { \ __wt_readlock(session, &__conn->hot_backup_lock); \ F_SET(session, WT_SESSION_LOCKED_HOTBACKUP_READ); \ - if (!__conn->hot_backup) { \ + if (__conn->hot_backup_start == 0) { \ if ((skipp) != (bool *)NULL) \ *(bool *)(skipp) = false; \ op; \ diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 25447813d55..e2cd24d0e95 100644 --- 
a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -167,32 +167,31 @@ struct __wt_session_impl { #define WT_SESSION_CAN_WAIT 0x00000008u #define WT_SESSION_HS_CURSOR 0x00000010u #define WT_SESSION_IGNORE_CACHE_SIZE 0x00000020u -#define WT_SESSION_IGNORE_HS_TOMBSTONE 0x00000040u -#define WT_SESSION_INTERNAL 0x00000080u -#define WT_SESSION_LOCKED_CHECKPOINT 0x00000100u -#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000200u -#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000400u -#define WT_SESSION_LOCKED_HOTBACKUP_READ 0x00000800u -#define WT_SESSION_LOCKED_HOTBACKUP_WRITE 0x00001000u -#define WT_SESSION_LOCKED_METADATA 0x00002000u -#define WT_SESSION_LOCKED_PASS 0x00004000u -#define WT_SESSION_LOCKED_SCHEMA 0x00008000u -#define WT_SESSION_LOCKED_SLOT 0x00010000u -#define WT_SESSION_LOCKED_TABLE_READ 0x00020000u -#define WT_SESSION_LOCKED_TABLE_WRITE 0x00040000u -#define WT_SESSION_LOCKED_TURTLE 0x00080000u -#define WT_SESSION_LOGGING_INMEM 0x00100000u -#define WT_SESSION_NO_DATA_HANDLES 0x00200000u -#define WT_SESSION_NO_LOGGING 0x00400000u -#define WT_SESSION_NO_RECONCILE 0x00800000u -#define WT_SESSION_NO_SCHEMA_LOCK 0x01000000u -#define WT_SESSION_QUIET_CORRUPT_FILE 0x02000000u -#define WT_SESSION_READ_WONT_NEED 0x04000000u -#define WT_SESSION_RESOLVING_MODIFY 0x08000000u -#define WT_SESSION_RESOLVING_TXN 0x10000000u -#define WT_SESSION_ROLLBACK_TO_STABLE 0x20000000u -#define WT_SESSION_SCHEMA_TXN 0x40000000u -#define WT_SESSION_SERVER_ASYNC 0x80000000u +#define WT_SESSION_INTERNAL 0x00000040u +#define WT_SESSION_LOCKED_CHECKPOINT 0x00000080u +#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000100u +#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000200u +#define WT_SESSION_LOCKED_HOTBACKUP_READ 0x00000400u +#define WT_SESSION_LOCKED_HOTBACKUP_WRITE 0x00000800u +#define WT_SESSION_LOCKED_METADATA 0x00001000u +#define WT_SESSION_LOCKED_PASS 0x00002000u +#define WT_SESSION_LOCKED_SCHEMA 0x00004000u +#define 
WT_SESSION_LOCKED_SLOT 0x00008000u +#define WT_SESSION_LOCKED_TABLE_READ 0x00010000u +#define WT_SESSION_LOCKED_TABLE_WRITE 0x00020000u +#define WT_SESSION_LOCKED_TURTLE 0x00040000u +#define WT_SESSION_LOGGING_INMEM 0x00080000u +#define WT_SESSION_NO_DATA_HANDLES 0x00100000u +#define WT_SESSION_NO_LOGGING 0x00200000u +#define WT_SESSION_NO_RECONCILE 0x00400000u +#define WT_SESSION_NO_SCHEMA_LOCK 0x00800000u +#define WT_SESSION_QUIET_CORRUPT_FILE 0x01000000u +#define WT_SESSION_READ_WONT_NEED 0x02000000u +#define WT_SESSION_RESOLVING_MODIFY 0x04000000u +#define WT_SESSION_RESOLVING_TXN 0x08000000u +#define WT_SESSION_ROLLBACK_TO_STABLE 0x10000000u +#define WT_SESSION_SCHEMA_TXN 0x20000000u +#define WT_SESSION_SERVER_ASYNC 0x40000000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index d1c7888e54f..59936321cbc 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -868,6 +868,7 @@ struct __wt_dsrc_stats { int64_t rec_pages; int64_t rec_pages_eviction; int64_t rec_page_delete; + int64_t rec_prepare_value; int64_t session_compact; int64_t txn_update_conflict; }; diff --git a/src/third_party/wiredtiger/src/include/timestamp.h b/src/third_party/wiredtiger/src/include/timestamp.h new file mode 100644 index 00000000000..a014cc8f624 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/timestamp.h @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2014-2020 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * We format timestamps in a couple of ways, declare appropriate sized buffers. Hexadecimal is 2x + * the size of the value. 
MongoDB format (high/low pairs of 4B unsigned integers, with surrounding + * parenthesis and separating comma and space), is 2x the maximum digits from a 4B unsigned integer + * plus 4. Both sizes include a trailing null byte as well. + */ +#define WT_TS_HEX_STRING_SIZE (2 * sizeof(wt_timestamp_t) + 1) +#define WT_TS_INT_STRING_SIZE (2 * 10 + 4 + 1) + +/* + * We need an appropriately sized buffer for formatted time pairs, aggregates and windows. This is + * for time windows with 4 timestamps, 2 transaction IDs, prepare state and formatting. The + * formatting is currently about 32 characters - enough space that we don't need to think about it. + */ +#define WT_TP_STRING_SIZE (WT_TS_INT_STRING_SIZE + 1 + 20 + 1) +#define WT_TIME_STRING_SIZE (WT_TS_INT_STRING_SIZE * 4 + 20 * 2 + 64) + +/* The time pairs that define a value's time window and associated prepare information. */ +struct __wt_time_window { + wt_timestamp_t durable_start_ts; /* default value: WT_TS_NONE */ + wt_timestamp_t start_ts; /* default value: WT_TS_NONE */ + uint64_t start_txn; /* default value: WT_TXN_NONE */ + + wt_timestamp_t durable_stop_ts; /* default value: WT_TS_NONE */ + wt_timestamp_t stop_ts; /* default value: WT_TS_MAX */ + uint64_t stop_txn; /* default value: WT_TXN_MAX */ + + /* + * Prepare information isn't really part of a time window, but we need to aggregate it to the + * internal page information in reconciliation, and this is the simplest place to put it. + */ + uint8_t prepare; +}; + +/* The time pairs that define an aggregated time window and associated prepare information. 
*/ +struct __wt_time_aggregate { + wt_timestamp_t newest_start_durable_ts; /* default value: WT_TS_NONE */ + wt_timestamp_t newest_stop_durable_ts; /* default value: WT_TS_NONE */ + + wt_timestamp_t oldest_start_ts; /* default value: WT_TS_NONE */ + uint64_t oldest_start_txn; /* default value: WT_TXN_NONE */ + wt_timestamp_t newest_stop_ts; /* default value: WT_TS_MAX */ + uint64_t newest_stop_txn; /* default value: WT_TXN_MAX */ + + uint8_t prepare; +}; diff --git a/src/third_party/wiredtiger/src/include/timestamp.i b/src/third_party/wiredtiger/src/include/timestamp.i new file mode 100644 index 00000000000..c6e7d66ffef --- /dev/null +++ b/src/third_party/wiredtiger/src/include/timestamp.i @@ -0,0 +1,225 @@ +/*- + * Copyright (c) 2014-2020 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * __wt_time_window_init -- + * Initialize the fields in a time window to their defaults. + */ +static inline void +__wt_time_window_init(WT_TIME_WINDOW *tw) +{ + tw->durable_start_ts = WT_TS_NONE; + tw->start_ts = WT_TS_NONE; + tw->start_txn = WT_TXN_NONE; + + tw->durable_stop_ts = WT_TS_NONE; + tw->stop_ts = WT_TS_MAX; + tw->stop_txn = WT_TXN_MAX; + + tw->prepare = 0; +} + +/* + * __wt_time_window_init_max -- + * Initialize the fields in a time window to values that force an override. + */ +static inline void +__wt_time_window_init_max(WT_TIME_WINDOW *tw) +{ + tw->durable_start_ts = WT_TS_MAX; + tw->start_ts = WT_TS_MAX; + tw->start_txn = WT_TXN_MAX; + + tw->durable_stop_ts = WT_TS_MAX; + tw->stop_ts = WT_TS_NONE; + tw->stop_txn = WT_TXN_NONE; + + tw->prepare = 0; +} + +/* + * __wt_time_window_copy -- + * Copy the values from one time window structure to another. 
+ */ +static inline void +__wt_time_window_copy(WT_TIME_WINDOW *dest, WT_TIME_WINDOW *source) +{ + *dest = *source; +} + +/* + * __wt_time_window_is_empty -- + * Return true if the time window is equivalent to the default time window. + */ +static inline bool +__wt_time_window_is_empty(WT_TIME_WINDOW *tw) +{ + return (tw->durable_start_ts == WT_TS_NONE && tw->start_ts == WT_TS_NONE && + tw->start_txn == WT_TXN_NONE && tw->durable_stop_ts == WT_TS_NONE && + tw->stop_ts == WT_TS_MAX && tw->stop_txn == WT_TXN_MAX && tw->prepare == 0); +} + +/* + * __wt_time_windows_equal -- + * Return true if the time windows are the same. + */ +static inline bool +__wt_time_windows_equal(WT_TIME_WINDOW *tw1, WT_TIME_WINDOW *tw2) +{ + return (tw1->durable_start_ts == tw2->durable_start_ts && tw1->start_ts == tw2->start_ts && + tw1->start_txn == tw2->start_txn && tw1->durable_stop_ts == tw2->durable_stop_ts && + tw1->stop_ts == tw2->stop_ts && tw1->stop_txn == tw2->stop_txn && + tw1->prepare == tw2->prepare); +} + +/* + * __wt_time_window_set_start -- + * Set the start values of a time window from those in an update structure. + */ +static inline void +__wt_time_window_set_start(WT_TIME_WINDOW *tw, WT_UPDATE *upd) +{ + /* + * Durable timestamp can be 0 for prepared updates, in those cases use the prepared timestamp as + * durable timestamp. + */ + tw->durable_start_ts = tw->start_ts = upd->start_ts; + if (upd->durable_ts != WT_TS_NONE) + tw->durable_start_ts = upd->durable_ts; + tw->start_txn = upd->txnid; +} + +/* + * __wt_time_window_set_stop -- + * Set the stop values of a time window from those in an update structure. + */ +static inline void +__wt_time_window_set_stop(WT_TIME_WINDOW *tw, WT_UPDATE *upd) +{ + /* + * Durable timestamp can be 0 for prepared updates, in those cases use the prepared timestamp as + * durable timestamp. 
+ */ + tw->durable_stop_ts = tw->stop_ts = upd->start_ts; + if (upd->durable_ts != WT_TS_NONE) + tw->durable_stop_ts = upd->durable_ts; + tw->stop_txn = upd->txnid; +} + +/* + * __wt_time_aggregate_init -- + * Initialize the fields in an aggregated time window to their defaults. + */ +static inline void +__wt_time_aggregate_init(WT_TIME_AGGREGATE *ta) +{ + /* + * The aggregated durable timestamp values represent the maximum durable timestamp over a set of + * timestamps. These aggregated max values are used by the rollback to stable operation to find out + * whether the page has any updates with timestamps newer than the stable timestamp. + */ + ta->newest_start_durable_ts = WT_TS_NONE; + ta->newest_stop_durable_ts = WT_TS_NONE; + + ta->oldest_start_ts = WT_TS_NONE; + ta->oldest_start_txn = WT_TXN_NONE; + + ta->newest_stop_ts = WT_TS_MAX; + ta->newest_stop_txn = WT_TXN_MAX; + + ta->prepare = 0; +} + +/* + * __wt_time_aggregate_init_max -- + * Initialize the fields in an aggregated time window to maximum values, since this structure is + * generally populated by iterating over a set of timestamps and calculating max/min seen for + * each value, it's useful to be able to start with a negatively initialized structure. + */ +static inline void +__wt_time_aggregate_init_max(WT_TIME_AGGREGATE *ta) +{ + /* + * The aggregated durable timestamp values represent the maximum durable timestamp over a set of + * timestamps. These aggregated max values are used by the rollback to stable operation to find out + * whether the page has any updates with timestamps newer than the stable timestamp. + */ + ta->newest_start_durable_ts = WT_TS_NONE; + ta->newest_stop_durable_ts = WT_TS_NONE; + + ta->oldest_start_ts = WT_TS_MAX; + ta->oldest_start_txn = WT_TXN_MAX; + + ta->newest_stop_ts = WT_TS_NONE; + ta->newest_stop_txn = WT_TXN_NONE; + + ta->prepare = 0; +} + +/* + * __wt_time_aggregate_is_empty -- + * Return true if the time aggregate matches the values set by __wt_time_aggregate_init_max. 
+ */ +static inline bool +__wt_time_aggregate_is_empty(WT_TIME_AGGREGATE *ta) +{ + return (ta->newest_start_durable_ts == WT_TS_NONE && ta->newest_stop_durable_ts == WT_TS_NONE && + ta->oldest_start_ts == WT_TS_MAX && ta->oldest_start_txn == WT_TXN_MAX && + ta->newest_stop_ts == WT_TS_NONE && ta->newest_stop_txn == WT_TXN_NONE && ta->prepare == 0); +} + +/* + * __wt_time_aggregate_copy -- + * Copy the values from one time aggregate structure to another. + */ +static inline void +__wt_time_aggregate_copy(WT_TIME_AGGREGATE *dest, WT_TIME_AGGREGATE *source) +{ + *dest = *source; +} + +/* + * __wt_time_aggregate_update -- + * Update the aggregated window to reflect a new time window. + */ +static inline void +__wt_time_aggregate_update(WT_TIME_AGGREGATE *ta, WT_TIME_WINDOW *tw) +{ + ta->newest_start_durable_ts = WT_MAX(tw->durable_start_ts, ta->newest_start_durable_ts); + ta->newest_stop_durable_ts = WT_MAX(tw->durable_stop_ts, ta->newest_stop_durable_ts); + + ta->oldest_start_ts = WT_MIN(tw->start_ts, ta->oldest_start_ts); + ta->oldest_start_txn = WT_MIN(tw->start_txn, ta->oldest_start_txn); + ta->newest_stop_ts = WT_MAX(tw->stop_ts, ta->newest_stop_ts); + ta->newest_stop_txn = WT_MAX(tw->stop_txn, ta->newest_stop_txn); + + if (tw->prepare != 0) + ta->prepare = 1; +} + +/* + * __wt_time_aggregate_merge -- + * Merge an aggregated time window into another - choosing the most conservative value from + * each. 
+ */ +static inline void +__wt_time_aggregate_merge(WT_TIME_AGGREGATE *dest, WT_TIME_AGGREGATE *source) +{ + dest->newest_start_durable_ts = + WT_MAX(dest->newest_start_durable_ts, source->newest_start_durable_ts); + dest->newest_stop_durable_ts = + WT_MAX(dest->newest_stop_durable_ts, source->newest_stop_durable_ts); + + dest->oldest_start_ts = WT_MIN(dest->oldest_start_ts, source->oldest_start_ts); + dest->oldest_start_txn = WT_MIN(dest->oldest_start_txn, source->oldest_start_txn); + dest->newest_stop_ts = WT_MAX(dest->newest_stop_ts, source->newest_stop_ts); + dest->newest_stop_txn = WT_MAX(dest->newest_stop_txn, source->newest_stop_txn); + + if (source->prepare != 0) + dest->prepare = 1; +} diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 483fd429496..a06b1405651 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -11,6 +11,9 @@ #define WT_TXN_MAX (UINT64_MAX - 10) /* End of time */ #define WT_TXN_ABORTED UINT64_MAX /* Update rolled back */ +#define WT_TS_NONE 0 /* Beginning of time */ +#define WT_TS_MAX UINT64_MAX /* End of time */ + /* AUTOMATIC FLAG VALUE GENERATION START */ #define WT_TXN_LOG_CKPT_CLEANUP 0x01u #define WT_TXN_LOG_CKPT_PREPARE 0x02u @@ -52,25 +55,6 @@ typedef enum { #define WT_SESSION_IS_CHECKPOINT(s) ((s)->id != 0 && (s)->id == S2C(s)->txn_global.checkpoint_id) -#define WT_TS_NONE 0 /* Beginning of time */ -#define WT_TS_MAX UINT64_MAX /* End of time */ - -/* - * We format timestamps in a couple of ways, declare appropriate sized buffers. Hexadecimal is 2x - * the size of the value. MongoDB format (high/low pairs of 4B unsigned integers, with surrounding - * parenthesis and separating comma and space), is 2x the maximum digits from a 4B unsigned integer - * plus 4. Both sizes include a trailing null byte as well. 
- */ -#define WT_TS_HEX_STRING_SIZE (2 * sizeof(wt_timestamp_t) + 1) -#define WT_TS_INT_STRING_SIZE (2 * 10 + 4 + 1) - -/* - * We need an appropriately sized buffer for formatted time pairs. This is for time pairs of the - * form (time_stamp, slash and transaction_id), which gives the max digits of a timestamp plus slash - * plus max digits of a 8 byte integer with a trailing null byte. - */ -#define WT_TP_STRING_SIZE (WT_TS_INT_STRING_SIZE + 1 + 20 + 1) - /* * Perform an operation at the specified isolation level. * diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 42d9233676c..0cd8c89c9a7 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -59,8 +59,8 @@ __wt_txn_err_set(WT_SESSION_IMPL *session, int ret) * a prepared transaction. */ if (F_ISSET(txn, WT_TXN_PREPARE)) - WT_PANIC_MSG(session, ret, - "transactional error logged after transaction was prepared, failing the system"); + WT_IGNORE_RET(__wt_panic(session, ret, + "transactional error logged after transaction was prepared, failing the system")); } /* @@ -584,6 +584,20 @@ __wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd) } /* + * __wt_txn_upd_value_visible_all -- + * Is the given update value visible to all (possible) readers? + */ +static inline bool +__wt_txn_upd_value_visible_all(WT_SESSION_IMPL *session, WT_UPDATE_VALUE *upd_value) +{ + if (upd_value->prepare_state == WT_PREPARE_LOCKED || + upd_value->prepare_state == WT_PREPARE_INPROGRESS) + return (false); + + return (__wt_txn_visible_all(session, upd_value->txnid, upd_value->start_ts)); +} + +/* * __txn_visible_id -- * Can the current transaction see the given ID? */ @@ -769,12 +783,12 @@ __wt_upd_alloc_tombstone(WT_SESSION_IMPL *session, WT_UPDATE **updp, size_t *siz * Get the first visible update in a list (or NULL if none are visible). 
*/ static inline int -__wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **updp) +__wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) { WT_VISIBLE_TYPE upd_visible; uint8_t type; - *updp = NULL; + __wt_upd_value_clear(cbt->upd_value); for (; upd != NULL; upd = upd->next) { WT_ORDERED_READ(type, upd->type); @@ -784,19 +798,33 @@ __wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **upd upd_visible = __wt_txn_upd_visible_type(session, upd); if (upd_visible == WT_VISIBLE_TRUE) { /* - * A tombstone representing a stop time pair will have either a valid txn id or a valid - * timestamp. Ignore such tombstones in history store based on session settings. + * Ignore non-globally visible tombstones when we are doing history store scans in + * rollback to stable or when we are told to. */ - if (type == WT_UPDATE_TOMBSTONE && WT_IS_HS(S2BT(session)) && - F_ISSET(session, WT_SESSION_IGNORE_HS_TOMBSTONE) && - (upd->start_ts != WT_TS_NONE || upd->txnid != WT_TXN_NONE)) + if (type == WT_UPDATE_TOMBSTONE && + (F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) || + (WT_IS_HS(S2BT(session)) && F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))) && + !__wt_txn_upd_visible_all(session, upd)) continue; - *updp = upd; - return (0); + break; } if (upd_visible == WT_VISIBLE_PREPARE) return (WT_PREPARE_CONFLICT); } + if (upd == NULL) + return (0); + /* + * Now assign to the update value. If it's not a modify, we're free to simply point the value at + * the update's memory without owning it. If it is a modify, we need to reconstruct the full + * update now and make the value own the buffer. + * + * If the caller has specifically asked us to skip assigning the buffer, we shouldn't bother + * reconstructing the modify. 
+ */ + if (upd->type != WT_UPDATE_MODIFY || cbt->upd_value->skip_buf) + __wt_upd_value_assign(cbt->upd_value, upd); + else + WT_RET(__wt_modify_reconstruct_from_upd_list(session, cbt, upd, cbt->upd_value)); return (0); } @@ -809,101 +837,83 @@ __wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **upd */ static inline int __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, - WT_UPDATE *upd, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) + WT_UPDATE *upd, WT_CELL_UNPACK *vpack) { - WT_DECL_RET; - WT_ITEM buf; - WT_TIME_PAIR start, stop; + WT_TIME_WINDOW tw; - *updp = NULL; - WT_RET(__wt_txn_read_upd_list(session, upd, updp)); - if (*updp != NULL) + WT_RET(__wt_txn_read_upd_list(session, cbt, upd)); + if (WT_UPDATE_DATA_VALUE(cbt->upd_value) || + (cbt->upd_value->type == WT_UPDATE_MODIFY && cbt->upd_value->skip_buf)) return (0); + WT_ASSERT(session, cbt->upd_value->type == WT_UPDATE_INVALID); /* If there is no ondisk value, there can't be anything in the history store either. */ - if (cbt->ref->page->dsk == NULL || cbt->slot == UINT32_MAX) - return (__wt_upd_alloc_tombstone(session, updp, NULL)); - - buf.data = NULL; - buf.size = 0; - buf.mem = NULL; - buf.memsize = 0; - buf.flags = 0; + if (cbt->ref->page->dsk == NULL || cbt->slot == UINT32_MAX) { + cbt->upd_value->type = WT_UPDATE_TOMBSTONE; + return (0); + } /* Check the ondisk value. 
*/ if (vpack == NULL) { - ret = __wt_value_return_buf(cbt, cbt->ref, &buf, &start, &stop); - if (ret != 0) { - __wt_buf_free(session, &buf); - return (ret); - } + __wt_time_window_init(&tw); + WT_RET(__wt_value_return_buf(cbt, cbt->ref, &cbt->upd_value->buf, &tw)); } else { - start.timestamp = vpack->start_ts; - start.txnid = vpack->start_txn; - stop.timestamp = vpack->stop_ts; - stop.txnid = vpack->start_txn; - buf.data = vpack->data; - buf.size = vpack->size; + __wt_time_window_copy(&tw, &vpack->tw); + cbt->upd_value->buf.data = vpack->data; + cbt->upd_value->buf.size = vpack->size; } /* - * If the stop pair is set, that means that there is a tombstone at that time. If the stop time - * pair is visible to our txn then that means we've just spotted a tombstone and should return - * "not found", except for history store scan during rollback to stable. + * If the stop pair is set, that means that there is a tombstone at that time. If it is not + * prepared and the stop time pair is visible to our txn then that means we've just spotted a + * tombstone and should return "not found", except for history store scan during rollback to + * stable and when we are told to ignore non-globally visible tombstones. */ - if (stop.txnid != WT_TXN_MAX && stop.timestamp != WT_TS_MAX && - (!WT_IS_HS(S2BT(session)) || !F_ISSET(session, WT_SESSION_IGNORE_HS_TOMBSTONE)) && - __wt_txn_visible(session, stop.txnid, stop.timestamp)) { - __wt_buf_free(session, &buf); - WT_RET(__wt_upd_alloc_tombstone(session, updp, NULL)); - (*updp)->txnid = stop.txnid; - /* FIXME: Reevaluate this as part of PM-1524. 
*/ - (*updp)->durable_ts = (*updp)->start_ts = stop.timestamp; - F_SET(*updp, WT_UPDATE_RESTORED_FROM_DISK); + if (tw.stop_txn != WT_TXN_MAX && tw.stop_ts != WT_TS_MAX && !tw.prepare && + __wt_txn_visible(session, tw.stop_txn, tw.stop_ts) && + ((!F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) && + (!WT_IS_HS(S2BT(session)) || !F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))) || + __wt_txn_visible_all(session, tw.stop_txn, tw.stop_ts))) { + cbt->upd_value->buf.data = NULL; + cbt->upd_value->buf.size = 0; + cbt->upd_value->start_ts = tw.stop_ts; + cbt->upd_value->txnid = tw.stop_txn; + cbt->upd_value->type = WT_UPDATE_TOMBSTONE; + cbt->upd_value->prepare_state = WT_PREPARE_INIT; return (0); } /* - * If the start time pair is visible then we need to return the ondisk value. - * - * FIXME-PM-1521: This should be probably be re-factored to return a buffer of bytes rather than - * an update. This allocation is expensive and doesn't serve a purpose other than to work within - * the current system. + * If the start time pair is visible and it is not a prepared value then we need to return the + * ondisk value. */ - if (__wt_txn_visible(session, start.txnid, start.timestamp) || - F_ISSET(session, WT_SESSION_RESOLVING_MODIFY)) { + if ((!tw.prepare || (tw.stop_txn != WT_TXN_MAX && tw.stop_ts != WT_TS_MAX)) && + (__wt_txn_visible(session, tw.start_txn, tw.start_ts) || + F_ISSET(session, WT_SESSION_RESOLVING_MODIFY))) { /* If we are resolving a modify then the btree must be the history store. 
*/ WT_ASSERT( session, (F_ISSET(session, WT_SESSION_RESOLVING_MODIFY) && WT_IS_HS(S2BT(session))) || !F_ISSET(session, WT_SESSION_RESOLVING_MODIFY)); - ret = __wt_upd_alloc(session, &buf, WT_UPDATE_STANDARD, updp, NULL); - __wt_buf_free(session, &buf); - WT_RET(ret); - (*updp)->txnid = start.txnid; - (*updp)->start_ts = start.timestamp; - F_SET((*updp), WT_UPDATE_RESTORED_FROM_DISK); + if (cbt->upd_value->skip_buf) { + cbt->upd_value->buf.data = NULL; + cbt->upd_value->buf.size = 0; + } + cbt->upd_value->start_ts = tw.start_ts; + cbt->upd_value->txnid = tw.start_txn; + cbt->upd_value->type = WT_UPDATE_STANDARD; + cbt->upd_value->prepare_state = WT_PREPARE_INIT; return (0); } /* If there's no visible update in the update chain or ondisk, check the history store file. */ - if (F_ISSET(S2C(session), WT_CONN_HS_OPEN) && !F_ISSET(S2BT(session), WT_BTREE_HS)) { - ret = __wt_find_hs_upd(session, key, recno, updp, false, &buf); - __wt_buf_free(session, &buf); - WT_RET_NOTFOUND_OK(ret); - } - - __wt_buf_free(session, &buf); - /* - * Return null not tombstone if nothing is found in history store. - */ - WT_ASSERT(session, (*updp) == NULL || (*updp)->type != WT_UPDATE_TOMBSTONE); + if (F_ISSET(S2C(session), WT_CONN_HS_OPEN) && !F_ISSET(S2BT(session), WT_BTREE_HS)) + WT_RET_NOTFOUND_OK(__wt_find_hs_upd(session, key, cbt->iface.value_format, recno, + cbt->upd_value, false, &cbt->upd_value->buf)); - /* - * FIXME-PM-1521: We call transaction read in a lot of places so we can't do this yet. When we - * re-factor this function to return a byte array, we should tackle this at the same time. - */ + /* Return invalid not tombstone if nothing is found in history store. 
*/ + WT_ASSERT(session, cbt->upd_value->type != WT_UPDATE_TOMBSTONE); return (0); } @@ -1107,7 +1117,7 @@ static inline int __wt_txn_update_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) { WT_DECL_RET; - WT_TIME_PAIR start, stop; + WT_TIME_WINDOW tw; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; bool ignore_prepare_set, rollback; @@ -1144,11 +1154,11 @@ __wt_txn_update_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE */ if (!rollback && upd == NULL && cbt != NULL && cbt->btree->type != BTREE_COL_FIX && cbt->ins == NULL) { - __wt_read_cell_time_pairs(cbt, cbt->ref, &start, &stop); - if (stop.txnid != WT_TXN_MAX && stop.timestamp != WT_TS_MAX) - rollback = !__wt_txn_visible(session, stop.txnid, stop.timestamp); + __wt_read_cell_time_window(cbt, cbt->ref, &tw); + if (tw.stop_txn != WT_TXN_MAX && tw.stop_ts != WT_TS_MAX) + rollback = !__wt_txn_visible(session, tw.stop_txn, tw.stop_ts); else - rollback = !__wt_txn_visible(session, start.txnid, start.timestamp); + rollback = !__wt_txn_visible(session, tw.start_txn, tw.start_ts); } if (rollback) { @@ -1253,3 +1263,40 @@ __wt_txn_activity_check(WT_SESSION_IMPL *session, bool *txn_active) return (0); } + +/* + * __wt_upd_value_assign -- + * Point an update value at a given update. We're specifically not getting the value to own the + * memory since this exists in an update list somewhere. + */ +static inline void +__wt_upd_value_assign(WT_UPDATE_VALUE *upd_value, WT_UPDATE *upd) +{ + if (!upd_value->skip_buf) { + upd_value->buf.data = upd->data; + upd_value->buf.size = upd->size; + } + upd_value->start_ts = upd->start_ts; + upd_value->txnid = upd->txnid; + upd_value->type = upd->type; + upd_value->prepare_state = upd->prepare_state; +} + +/* + * __wt_upd_value_clear -- + * Clear an update value to its defaults. + */ +static inline void +__wt_upd_value_clear(WT_UPDATE_VALUE *upd_value) +{ + /* + * Make sure we don't touch the memory pointers here. 
If we have some allocated memory, that + * could come in handy next time we need to write to the buffer. + */ + upd_value->buf.data = NULL; + upd_value->buf.size = 0; + upd_value->start_ts = WT_TS_NONE; + upd_value->txnid = WT_TXN_NONE; + upd_value->type = WT_UPDATE_INVALID; + upd_value->prepare_state = WT_PREPARE_INIT; +} diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 1a9bc7b7519..74fbe62d15b 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -712,17 +712,18 @@ struct __wt_cursor { #define WT_CURSTD_DUMP_HEX 0x000080u #define WT_CURSTD_DUMP_JSON 0x000100u #define WT_CURSTD_DUMP_PRINT 0x000200u -#define WT_CURSTD_JOINED 0x000400u -#define WT_CURSTD_KEY_EXT 0x000800u /* Key points out of tree. */ -#define WT_CURSTD_KEY_INT 0x001000u /* Key points into tree. */ -#define WT_CURSTD_META_INUSE 0x002000u -#define WT_CURSTD_OPEN 0x004000u -#define WT_CURSTD_OVERWRITE 0x008000u -#define WT_CURSTD_RAW 0x010000u -#define WT_CURSTD_RAW_SEARCH 0x020000u -#define WT_CURSTD_UPDATE_LOCAL 0x040000u -#define WT_CURSTD_VALUE_EXT 0x080000u /* Value points out of tree. */ -#define WT_CURSTD_VALUE_INT 0x100000u /* Value points into tree. */ +#define WT_CURSTD_IGNORE_TOMBSTONE 0x000400u +#define WT_CURSTD_JOINED 0x000800u +#define WT_CURSTD_KEY_EXT 0x001000u /* Key points out of tree. */ +#define WT_CURSTD_KEY_INT 0x002000u /* Key points into tree. */ +#define WT_CURSTD_META_INUSE 0x004000u +#define WT_CURSTD_OPEN 0x008000u +#define WT_CURSTD_OVERWRITE 0x010000u +#define WT_CURSTD_RAW 0x020000u +#define WT_CURSTD_RAW_SEARCH 0x040000u +#define WT_CURSTD_UPDATE_LOCAL 0x080000u +#define WT_CURSTD_VALUE_EXT 0x100000u /* Value points out of tree. */ +#define WT_CURSTD_VALUE_INT 0x200000u /* Value points into tree. 
*/ /* AUTOMATIC FLAG VALUE GENERATION STOP */ #define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT) #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) @@ -1950,8 +1951,9 @@ struct __wt_session { * one of the following keys: \c "from=all" to drop all checkpoints\, \c "from=<checkpoint>" * to drop all checkpoints after and including the named checkpoint\, or \c * "to=<checkpoint>" to drop all checkpoints before and including the named checkpoint. - * Checkpoints cannot be dropped while a hot backup is in progress or if open in a cursor., - * a list of strings; default empty.} + * Checkpoints cannot be dropped if open in a cursor. While a hot backup is in progress\, + * checkpoints created prior to the start of the backup cannot be dropped., a list of + * strings; default empty.} * @config{force, if false (the default)\, checkpoints may be skipped if the underlying * object has not been modified\, if true\, this option forces the checkpoint., a boolean * flag; default \c false.} @@ -6243,10 +6245,12 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_REC_PAGES_EVICTION 2142 /*! reconciliation: pages deleted */ #define WT_STAT_DSRC_REC_PAGE_DELETE 2143 +/*! reconciliation: prepared values written */ +#define WT_STAT_DSRC_REC_PREPARE_VALUE 2144 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2144 +#define WT_STAT_DSRC_SESSION_COMPACT 2145 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2145 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2146 /*! 
* @} diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 204e6fd0eb9..bdf26c80663 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -315,8 +315,10 @@ struct __wt_thread; typedef struct __wt_thread WT_THREAD; struct __wt_thread_group; typedef struct __wt_thread_group WT_THREAD_GROUP; -struct __wt_time_pair; -typedef struct __wt_time_pair WT_TIME_PAIR; +struct __wt_time_aggregate; +typedef struct __wt_time_aggregate WT_TIME_AGGREGATE; +struct __wt_time_window; +typedef struct __wt_time_window WT_TIME_WINDOW; struct __wt_txn; typedef struct __wt_txn WT_TXN; struct __wt_txn_global; @@ -329,6 +331,8 @@ struct __wt_txn_shared; typedef struct __wt_txn_shared WT_TXN_SHARED; struct __wt_update; typedef struct __wt_update WT_UPDATE; +struct __wt_update_value; +typedef struct __wt_update_value WT_UPDATE_VALUE; union __wt_lsn; typedef union __wt_lsn WT_LSN; union __wt_rand_state; @@ -375,8 +379,9 @@ typedef uint64_t wt_timestamp_t; #include "misc.h" #include "mutex.h" -#include "stat.h" /* required by dhandle.h */ -#include "dhandle.h" /* required by btree.h */ +#include "stat.h" /* required by dhandle.h */ +#include "dhandle.h" /* required by btree.h */ +#include "timestamp.h" /* required by reconcile.h */ #include "api.h" #include "async.h" @@ -418,10 +423,11 @@ typedef uint64_t wt_timestamp_t; #include "intpack.i" /* required by cell.i, packing.i */ #include "misc.i" /* required by mutex.i */ -#include "buf.i" /* required by cell.i */ -#include "cell.i" /* required by btree.i */ -#include "mutex.i" /* required by btree.i */ -#include "txn.i" /* required by btree.i */ +#include "buf.i" /* required by cell.i */ +#include "timestamp.i" /* required by btree.i */ +#include "cell.i" /* required by btree.i */ +#include "mutex.i" /* required by btree.i */ +#include "txn.i" /* required by btree.i */ #include "bitstring.i" #include 
"block.i" diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 87e4bda2a8a..e8348fb9933 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -215,7 +215,7 @@ __log_fs_write( } __wt_capacity_throttle(session, len, WT_THROTTLE_LOG); if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0) - WT_PANIC_RET(session, ret, "%s: fatal log failure", slot->slot_fh->name); + WT_RET_PANIC(session, ret, "%s: fatal log failure", slot->slot_fh->name); return (ret); } @@ -1166,7 +1166,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) * can copy the files in any way they choose, and a log file rename might confuse things. */ create_log = true; - if (conn->log_prealloc > 0 && !conn->hot_backup) { + if (conn->log_prealloc > 0 && conn->hot_backup_start == 0) { WT_WITH_HOTBACKUP_READ_LOCK( session, ret = __log_alloc_prealloc(session, log->fileid), &skipp); @@ -1194,7 +1194,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) * Increment the missed pre-allocated file counter only if a hot backup is not in progress. * We are deliberately not using pre-allocated log files during backup (see comment above). 
*/ - if (!conn->hot_backup) + if (conn->hot_backup_start == 0) log->prep_missed++; WT_RET(__wt_log_allocfile(session, log->fileid, WT_LOG_FILENAME)); } @@ -1383,7 +1383,7 @@ __log_truncate_file(WT_SESSION_IMPL *session, WT_FH *log_fh, wt_off_t offset) conn = S2C(session); log = conn->log; - if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && !conn->hot_backup) { + if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && conn->hot_backup_start == 0) { WT_WITH_HOTBACKUP_READ_LOCK(session, ret = __wt_ftruncate(session, log_fh, offset), &skipp); if (!skipp) { if (ret != ENOTSUP) diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 6052d20025f..7017cf74fd5 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -32,7 +32,7 @@ __wt_clsm_request_switch(WT_CURSOR_LSM *clsm) WT_SESSION_IMPL *session; lsm_tree = clsm->lsm_tree; - session = (WT_SESSION_IMPL *)clsm->iface.session; + session = CUR2S(clsm); if (!lsm_tree->need_switch) { /* @@ -64,7 +64,7 @@ __wt_clsm_await_switch(WT_CURSOR_LSM *clsm) int waited; lsm_tree = clsm->lsm_tree; - session = (WT_SESSION_IMPL *)clsm->iface.session; + session = CUR2S(clsm); /* * If there is no primary chunk, or a chunk has overflowed the hard limit, which either means a @@ -96,7 +96,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) bool hard_limit, have_primary, ovfl; lsm_tree = clsm->lsm_tree; - session = (WT_SESSION_IMPL *)clsm->iface.session; + session = CUR2S(clsm); if (clsm->nchunks == 0) { primary = NULL; @@ -159,7 +159,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) uint64_t i, pinned_id, switch_txn; lsm_tree = clsm->lsm_tree; - session = (WT_SESSION_IMPL *)clsm->iface.session; + session = CUR2S(clsm); txn = session->txn; /* Merge cursors never update. 
*/ @@ -259,7 +259,7 @@ __clsm_leave(WT_CURSOR_LSM *clsm) { WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)clsm->iface.session; + session = CUR2S(clsm); if (F_ISSET(clsm, WT_CLSM_ACTIVE)) { --session->ncursors; @@ -428,7 +428,7 @@ __clsm_open_cursors(WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_ c = &clsm->iface; cursor = NULL; - session = (WT_SESSION_IMPL *)c->session; + session = CUR2S(clsm); txn = session->txn; chunk = NULL; locked = false; @@ -712,7 +712,7 @@ __wt_clsm_init_merge(WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_ WT_SESSION_IMPL *session; clsm = (WT_CURSOR_LSM *)cursor; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); F_SET(clsm, WT_CLSM_MERGE); if (start_chunk != 0) @@ -816,7 +816,7 @@ __clsm_position_chunk(WT_CURSOR_LSM *clsm, WT_CURSOR *c, bool forward, int *cmpp WT_SESSION_IMPL *session; cursor = &clsm->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); c->set_key(c, &cursor->key); WT_RET(c->search_near(c, cmpp)); @@ -1149,7 +1149,7 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value) c = NULL; cursor = &clsm->iface; have_hash = false; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); WT_FORALL_CURSORS(clsm, c, i) { diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c index 5861bf4c899..50dbcf5726c 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c @@ -25,7 +25,7 @@ __clsm_close_bulk(WT_CURSOR *cursor) clsm = (WT_CURSOR_LSM *)cursor; lsm_tree = clsm->lsm_tree; chunk = lsm_tree->chunk[0]; - session = (WT_SESSION_IMPL *)clsm->iface.session; + session = CUR2S(clsm); /* Close the bulk cursor to ensure the chunk is written to disk. 
*/ bulk_cursor = clsm->chunks[0]->cursor; @@ -67,7 +67,7 @@ __clsm_insert_bulk(WT_CURSOR *cursor) clsm = (WT_CURSOR_LSM *)cursor; lsm_tree = clsm->lsm_tree; chunk = lsm_tree->chunk[0]; - session = (WT_SESSION_IMPL *)clsm->iface.session; + session = CUR2S(clsm); WT_ASSERT(session, lsm_tree->nchunks == 1 && clsm->nchunks == 1); ++chunk->count; @@ -95,7 +95,7 @@ __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]) bulk_cursor = NULL; cursor = &clsm->iface; lsm_tree = clsm->lsm_tree; - session = (WT_SESSION_IMPL *)clsm->iface.session; + session = CUR2S(clsm); F_SET(clsm, WT_CLSM_BULK); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index 2d5c017909f..13246d2c6a4 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -442,7 +442,7 @@ __lsm_worker_manager(void *arg) if (ret != 0) { err: - WT_PANIC_MSG(session, ret, "LSM worker manager thread error"); + WT_IGNORE_RET(__wt_panic(session, ret, "LSM worker manager thread error")); } /* Connection close waits on us to shutdown, let it know we're done. */ diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c index 75d97ab898b..ac586eb9e05 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_merge.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c @@ -544,7 +544,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) * of the tree. 
*/ if ((ret = __wt_lsm_meta_write(session, lsm_tree, NULL)) != 0) - WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge"); + WT_ERR_PANIC(session, ret, "Failed finalizing LSM merge"); lsm_tree->dsk_gen++; diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index 64202688de3..6e71c0434c5 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -781,7 +781,7 @@ err: * progress. Error out of WiredTiger. */ if (ret != 0) - WT_PANIC_RET(session, ret, "Failed doing LSM switch"); + WT_RET_PANIC(session, ret, "Failed doing LSM switch"); else if (!first_switch) WT_RET(__wt_lsm_manager_push_entry(session, WT_LSM_WORK_FLUSH, 0, lsm_tree)); return (ret); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_worker.c b/src/third_party/wiredtiger/src/lsm/lsm_worker.c index bee2f58c5a6..de06c1cc5ff 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_worker.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_worker.c @@ -164,7 +164,7 @@ __lsm_worker(void *arg) if (ret != 0) { err: __wt_lsm_manager_free_work_unit(session, entry); - WT_PANIC_MSG(session, ret, "Error in LSM worker thread %u", cookie->id); + WT_IGNORE_RET(__wt_panic(session, ret, "Error in LSM worker thread %u", cookie->id)); } return (WT_THREAD_RET_VALUE); } diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c index 66415b2cd62..21acec991c6 100644 --- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c +++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c @@ -463,7 +463,6 @@ __wt_meta_ckptlist_get( WT_CONFIG_ITEM k, v; WT_DECL_RET; size_t allocated, slot; - int64_t maxorder; char *config; *ckptbasep = NULL; @@ -508,11 +507,8 @@ __wt_meta_ckptlist_get( WT_ERR(__wt_realloc_def(session, &allocated, slot + 2, &ckptbase)); /* The caller may be adding a value, initialize it. 
*/ - maxorder = 0; - WT_CKPT_FOREACH (ckptbase, ckpt) - if (ckpt->order > maxorder) - maxorder = ckpt->order; - ckpt->order = maxorder + 1; + ckpt = &ckptbase[slot]; + ckpt->order = (slot == 0) ? 1 : ckptbase[slot - 1].order + 1; __wt_seconds(session, &ckpt->sec); /* * Load most recent checkpoint backup blocks to this checkpoint. @@ -586,26 +582,44 @@ __ckpt_load(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v, WT_C ckpt->size = (uint64_t)a.val; /* Default to durability. */ - ret = __wt_config_subgets(session, v, "start_durable_ts", &a); - WT_RET_NOTFOUND_OK(ret); - ckpt->start_durable_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_NONE : (uint64_t)a.val; + __wt_time_aggregate_init(&ckpt->ta); + ret = __wt_config_subgets(session, v, "oldest_start_ts", &a); WT_RET_NOTFOUND_OK(ret); - ckpt->oldest_start_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_NONE : (uint64_t)a.val; + if (ret != WT_NOTFOUND && a.len != 0) + ckpt->ta.oldest_start_ts = (uint64_t)a.val; + ret = __wt_config_subgets(session, v, "oldest_start_txn", &a); WT_RET_NOTFOUND_OK(ret); - ckpt->oldest_start_txn = ret == WT_NOTFOUND || a.len == 0 ? WT_TXN_NONE : (uint64_t)a.val; - ret = __wt_config_subgets(session, v, "stop_durable_ts", &a); + if (ret != WT_NOTFOUND && a.len != 0) + ckpt->ta.oldest_start_txn = (uint64_t)a.val; + + ret = __wt_config_subgets(session, v, "newest_start_durable_ts", &a); WT_RET_NOTFOUND_OK(ret); - ckpt->stop_durable_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_NONE : (uint64_t)a.val; + if (ret != WT_NOTFOUND && a.len != 0) + ckpt->ta.newest_start_durable_ts = (uint64_t)a.val; + ret = __wt_config_subgets(session, v, "newest_stop_ts", &a); WT_RET_NOTFOUND_OK(ret); - ckpt->newest_stop_ts = ret == WT_NOTFOUND || a.len == 0 ? 
WT_TS_MAX : (uint64_t)a.val; + if (ret != WT_NOTFOUND && a.len != 0) + ckpt->ta.newest_stop_ts = (uint64_t)a.val; + ret = __wt_config_subgets(session, v, "newest_stop_txn", &a); WT_RET_NOTFOUND_OK(ret); - ckpt->newest_stop_txn = ret == WT_NOTFOUND || a.len == 0 ? WT_TXN_MAX : (uint64_t)a.val; - __wt_check_addr_validity(session, ckpt->start_durable_ts, ckpt->oldest_start_ts, - ckpt->oldest_start_txn, ckpt->stop_durable_ts, ckpt->newest_stop_ts, ckpt->newest_stop_txn); + if (ret != WT_NOTFOUND && a.len != 0) + ckpt->ta.newest_stop_txn = (uint64_t)a.val; + + ret = __wt_config_subgets(session, v, "newest_stop_durable_ts", &a); + WT_RET_NOTFOUND_OK(ret); + if (ret != WT_NOTFOUND && a.len != 0) + ckpt->ta.newest_stop_durable_ts = (uint64_t)a.val; + + ret = __wt_config_subgets(session, v, "prepare", &a); + WT_RET_NOTFOUND_OK(ret); + if (ret != WT_NOTFOUND && a.len != 0) + ckpt->ta.prepare = (uint8_t)a.val; + + __wt_check_addr_validity(session, &ckpt->ta); WT_RET(__wt_config_subgets(session, v, "write_gen", &a)); if (a.len == 0) @@ -691,9 +705,7 @@ __wt_meta_ckptlist_to_meta(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_ITEM WT_RET(__wt_raw_to_hex(session, ckpt->raw.data, ckpt->raw.size, &ckpt->addr)); } - __wt_check_addr_validity(session, ckpt->start_durable_ts, ckpt->oldest_start_ts, - ckpt->oldest_start_txn, ckpt->stop_durable_ts, ckpt->newest_stop_ts, - ckpt->newest_stop_txn); + __wt_check_addr_validity(session, &ckpt->ta); WT_RET(__wt_buf_catfmt(session, buf, "%s%s", sep, ckpt->name)); sep = ","; @@ -701,18 +713,17 @@ __wt_meta_ckptlist_to_meta(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_ITEM if (strcmp(ckpt->name, WT_CHECKPOINT) == 0) WT_RET(__wt_buf_catfmt(session, buf, ".%" PRId64, ckpt->order)); - /* - * Use PRId64 formats: WiredTiger's configuration code handles signed 8B values. - */ + /* Use PRId64 formats: WiredTiger's configuration code handles signed 8B values. 
*/ WT_RET(__wt_buf_catfmt(session, buf, "=(addr=\"%.*s\",order=%" PRId64 ",time=%" PRIu64 ",size=%" PRId64 - ",start_durable_ts=%" PRId64 ",oldest_start_ts=%" PRId64 ",oldest_start_txn=%" PRId64 - ",stop_durable_ts=%" PRId64 ",newest_stop_ts=%" PRId64 ",newest_stop_txn=%" PRId64 - ",write_gen=%" PRId64 ")", + ",newest_start_durable_ts=%" PRId64 ",oldest_start_ts=%" PRId64 + ",oldest_start_txn=%" PRId64 ",newest_stop_durable_ts=%" PRId64 ",newest_stop_ts=%" PRId64 + ",newest_stop_txn=%" PRId64 ",prepare:%d,write_gen=%" PRId64 ")", (int)ckpt->addr.size, (char *)ckpt->addr.data, ckpt->order, ckpt->sec, - (int64_t)ckpt->size, (int64_t)ckpt->start_durable_ts, (int64_t)ckpt->oldest_start_ts, - (int64_t)ckpt->oldest_start_txn, (int64_t)ckpt->stop_durable_ts, - (int64_t)ckpt->newest_stop_ts, (int64_t)ckpt->newest_stop_txn, (int64_t)ckpt->write_gen)); + (int64_t)ckpt->size, (int64_t)ckpt->ta.newest_start_durable_ts, + (int64_t)ckpt->ta.oldest_start_ts, (int64_t)ckpt->ta.oldest_start_txn, + (int64_t)ckpt->ta.newest_stop_durable_ts, (int64_t)ckpt->ta.newest_stop_ts, + (int64_t)ckpt->ta.newest_stop_txn, (int)ckpt->ta.prepare, (int64_t)ckpt->write_gen)); } WT_RET(__wt_buf_catfmt(session, buf, ")")); diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index 5b2710e8aba..a569e132d75 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -331,7 +331,7 @@ err: __wt_cond_signal(session, S2C(session)->sweep_cond); if (ret != 0) - WT_PANIC_RET(session, ret, "failed to apply or unroll all tracked operations"); + WT_RET_PANIC(session, ret, "failed to apply or unroll all tracked operations"); return (saved_ret == 0 ? 
0 : saved_ret); } diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c index 9bedba65310..119438fefcd 100644 --- a/src/third_party/wiredtiger/src/meta/meta_turtle.c +++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c @@ -80,7 +80,7 @@ __metadata_load_hot_backup(WT_SESSION_IMPL *session) break; WT_ERR(__wt_getline(session, fs, value)); if (value->size == 0) - WT_PANIC_ERR(session, EINVAL, "%s: zero-length value", WT_METADATA_BACKUP); + WT_ERR_PANIC(session, EINVAL, "%s: zero-length value", WT_METADATA_BACKUP); WT_ERR(__wt_metadata_update(session, key->data, value->data)); } @@ -381,7 +381,7 @@ err: */ if (ret == 0 || strcmp(key, WT_METADATA_COMPAT) == 0 || F_ISSET(S2C(session), WT_CONN_SALVAGE)) return (ret); - WT_PANIC_RET(session, WT_TRY_SALVAGE, "%s: fatal turtle file read error", WT_METADATA_TURTLE); + WT_RET_PANIC(session, WT_TRY_SALVAGE, "%s: fatal turtle file read error", WT_METADATA_TURTLE); } /* @@ -437,5 +437,5 @@ err: */ if (ret == 0) return (ret); - WT_PANIC_RET(session, ret, "%s: fatal turtle file update error", WT_METADATA_TURTLE); + WT_RET_PANIC(session, ret, "%s: fatal turtle file update error", WT_METADATA_TURTLE); } diff --git a/src/third_party/wiredtiger/src/optrack/optrack.c b/src/third_party/wiredtiger/src/optrack/optrack.c index 6b77c534e4f..916c6ab74b6 100644 --- a/src/third_party/wiredtiger/src/optrack/optrack.c +++ b/src/third_party/wiredtiger/src/optrack/optrack.c @@ -39,7 +39,7 @@ __wt_optrack_record_funcid(WT_SESSION_IMPL *session, const char *func, uint16_t if (0) { err: - WT_PANIC_MSG(session, ret, "operation tracking initialization failure"); + WT_IGNORE_RET(__wt_panic(session, ret, "operation tracking initialization failure")); } if (locked) diff --git a/src/third_party/wiredtiger/src/os_common/os_errno.c b/src/third_party/wiredtiger/src/os_common/os_errno.c index 5c77449729a..c7c8a6dfd4d 100644 --- a/src/third_party/wiredtiger/src/os_common/os_errno.c +++ 
b/src/third_party/wiredtiger/src/os_common/os_errno.c @@ -78,6 +78,7 @@ __wt_ext_map_windows_error(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uin return (__wt_map_windows_error(windows_error)); #else WT_UNUSED(windows_error); - return (WT_PANIC); + WT_RET_PANIC( + (WT_SESSION_IMPL *)wt_session, WT_PANIC, "unexpected attempt to map Windows error"); #endif } diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c index d8f695fa1c8..c0fc74ee8f8 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fs.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c @@ -88,7 +88,7 @@ __posix_sync(WT_SESSION_IMPL *session, int fd, const char *name, const char *fun WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret); if (ret == 0) return (0); - WT_PANIC_RET(session, ret, "%s: %s: fcntl(F_FULLFSYNC)", name, func); + WT_RET_PANIC(session, ret, "%s: %s: fcntl(F_FULLFSYNC)", name, func); } #endif #if defined(HAVE_FDATASYNC) @@ -96,13 +96,13 @@ __posix_sync(WT_SESSION_IMPL *session, int fd, const char *name, const char *fun WT_SYSCALL(fdatasync(fd), ret); if (ret == 0) return (0); - WT_PANIC_RET(session, ret, "%s: %s: fdatasync", name, func); + WT_RET_PANIC(session, ret, "%s: %s: fdatasync", name, func); #else /* See comment in __posix_sync(): sync cannot be retried or fail. */ WT_SYSCALL(fsync(fd), ret); if (ret == 0) return (0); - WT_PANIC_RET(session, ret, "%s: %s: fsync", name, func); + WT_RET_PANIC(session, ret, "%s: %s: fsync", name, func); #endif } @@ -148,7 +148,7 @@ err: return (ret); /* See comment in __posix_sync(): sync cannot be retried or fail. 
*/ - WT_PANIC_RET(session, ret, "%s: directory-sync", path); + WT_RET_PANIC(session, ret, "%s: directory-sync", path); } #endif @@ -541,7 +541,7 @@ __posix_file_sync_nowait(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session) if (ret == 0) return (0); - WT_PANIC_RET(session, ret, "%s: handle-sync-nowait: sync_file_range", file_handle->name); + WT_RET_PANIC(session, ret, "%s: handle-sync-nowait: sync_file_range", file_handle->name); } #endif diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c index ac9a676b9b0..61896fa1b83 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c @@ -106,7 +106,7 @@ __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs #ifdef HAVE_PTHREAD_COND_MONOTONIC WT_SYSCALL_RETRY(clock_gettime(CLOCK_MONOTONIC, &ts), ret); if (ret != 0) - WT_PANIC_MSG(session, ret, "clock_gettime"); + WT_IGNORE_RET(__wt_panic(session, ret, "clock_gettime")); #else __wt_epoch_raw(session, &ts); #endif @@ -140,7 +140,7 @@ err: if (ret == 0) return; - WT_PANIC_MSG(session, ret, "pthread_cond_wait: %s", cond->name); + WT_IGNORE_RET(__wt_panic(session, ret, "pthread_cond_wait: %s", cond->name)); } /* @@ -175,7 +175,7 @@ __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) return; err: - WT_PANIC_MSG(session, ret, "pthread_cond_broadcast: %s", cond->name); + WT_IGNORE_RET(__wt_panic(session, ret, "pthread_cond_broadcast: %s", cond->name)); } /* @@ -193,10 +193,10 @@ __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) return; if ((ret = pthread_cond_destroy(&cond->cond)) != 0) - WT_PANIC_MSG(session, ret, "pthread_cond_destroy: %s", cond->name); + WT_IGNORE_RET(__wt_panic(session, ret, "pthread_cond_destroy: %s", cond->name)); if ((ret = pthread_mutex_destroy(&cond->mtx)) != 0) - WT_PANIC_MSG(session, ret, "pthread_mutex_destroy: %s", cond->name); + WT_IGNORE_RET(__wt_panic(session, 
ret, "pthread_mutex_destroy: %s", cond->name)); __wt_free(session, *condp); } diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c index 57e567e7828..38a3be97c92 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_time.c +++ b/src/third_party/wiredtiger/src/os_posix/os_time.c @@ -30,7 +30,7 @@ __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp) WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret); if (ret == 0) return; - WT_PANIC_MSG(session, ret, "clock_gettime"); + WT_IGNORE_RET(__wt_panic(session, ret, "clock_gettime")); #elif defined(HAVE_GETTIMEOFDAY) { struct timeval v; @@ -41,7 +41,7 @@ __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp) tsp->tv_nsec = v.tv_usec * WT_THOUSAND; return; } - WT_PANIC_MSG(session, ret, "gettimeofday"); + WT_IGNORE_RET(__wt_panic(session, ret, "gettimeofday")); } #else NO TIME - OF - DAY IMPLEMENTATION : see src / os_posix / os_time.c diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c index 32a5e42c193..687796efe33 100644 --- a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c @@ -116,8 +116,8 @@ skipping: __wt_err(session, __wt_map_windows_error(windows_error), "SleepConditionVariableCS: %s: %s", cond->name, __wt_formatmessage(session, windows_error)); - WT_PANIC_MSG( - session, __wt_map_windows_error(windows_error), "SleepConditionVariableCS: %s", cond->name); + WT_IGNORE_RET(__wt_panic( + session, __wt_map_windows_error(windows_error), "SleepConditionVariableCS: %s", cond->name)); } /* diff --git a/src/third_party/wiredtiger/src/os_win/os_thread.c b/src/third_party/wiredtiger/src/os_win/os_thread.c index 118635ab734..6e95f17a94e 100644 --- a/src/third_party/wiredtiger/src/os_win/os_thread.c +++ b/src/third_party/wiredtiger/src/os_win/os_thread.c @@ -56,11 +56,10 @@ __wt_thread_join(WT_SESSION_IMPL *session, 
wt_thread_t *tid) if ((windows_error = WaitForSingleObject(tid->id, INFINITE)) != WAIT_OBJECT_0) { if (windows_error == WAIT_FAILED) windows_error = __wt_getlasterror(); - __wt_err(session, __wt_map_windows_error(windows_error), - "thread join: WaitForSingleObject: %s", __wt_formatmessage(session, windows_error)); /* If we fail to wait, we will leak handles, do not continue. */ - return (WT_PANIC); + return (__wt_panic(session, __wt_map_windows_error(windows_error), + "thread join: WaitForSingleObject: %s", __wt_formatmessage(session, windows_error))); } if (CloseHandle(tid->id) == 0) { diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c index 2d3f17a22af..d235f926fa0 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_child.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c @@ -36,7 +36,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C if (F_ISSET(r, WT_REC_CLEAN_AFTER_REC | WT_REC_VISIBILITY_ERR) && page_del != NULL && __wt_page_del_active(session, ref, false)) { if (F_ISSET(r, WT_REC_VISIBILITY_ERR)) - WT_PANIC_RET(session, EINVAL, "reconciliation illegally skipped an update"); + WT_RET_PANIC(session, EINVAL, "reconciliation illegally skipped an update"); return (__wt_set_return(session, EBUSY)); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c index ffa4c94f1b2..18a4a16b556 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_col.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -19,7 +19,7 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) WT_RECONCILE *r; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; + session = CUR2S(cbulk); r = cbulk->reconcile; btree = S2BT(session); @@ -110,14 +110,15 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool delet WT_BTREE *btree; WT_RECONCILE *r; 
WT_REC_KV *val; + WT_TIME_WINDOW tw; r = cbulk->reconcile; btree = S2BT(session); + __wt_time_window_init(&tw); val = &r->v; if (deleted) { - val->cell_len = __wt_cell_pack_del(session, &val->cell, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, - WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, cbulk->rle); + val->cell_len = __wt_cell_pack_del(session, &val->cell, &tw, cbulk->rle); val->buf.data = NULL; val->buf.size = 0; val->len = val->cell_len; @@ -126,8 +127,8 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool delet * Store the bulk cursor's last buffer, not the current value, we're tracking duplicates, * which means we want the previous value seen, not the current value. */ - WT_RET(__wt_rec_cell_build_val(session, r, cbulk->last.data, cbulk->last.size, WT_TS_NONE, - WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, false, cbulk->rle)); + WT_RET( + __wt_rec_cell_build_val(session, r, cbulk->last.data, cbulk->last.size, &tw, cbulk->rle)); /* Boundary: split or write the page. */ if (WT_CROSSING_SPLIT_BND(r, val->len)) @@ -135,11 +136,9 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool delet /* Copy the value onto the page. */ if (btree->dictionary) - WT_RET(__wt_rec_dict_replace(session, r, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, - WT_TS_MAX, WT_TXN_MAX, false, cbulk->rle, val)); + WT_RET(__wt_rec_dict_replace(session, r, &tw, cbulk->rle, val)); __wt_rec_image_copy(session, r, val); - __wt_rec_addr_ts_update( - r, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, false); + __wt_time_aggregate_update(&r->cur_ptr->ta, &tw); /* Update the starting record number in case we split. */ r->recno += cbulk->rle; @@ -179,9 +178,7 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Copy the value onto the page. 
*/ __wt_rec_image_copy(session, r, val); - __wt_rec_addr_ts_update(r, addr->newest_start_durable_ts, addr->oldest_start_ts, - addr->oldest_start_txn, addr->newest_stop_durable_ts, addr->newest_stop_ts, - addr->newest_stop_txn, addr->prepare); + __wt_time_aggregate_merge(&r->cur_ptr->ta, &addr->ta); } return (0); } @@ -201,14 +198,14 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_PAGE *child, *page; WT_REC_KV *val; WT_REF *ref; - wt_timestamp_t newest_start_durable_ts, newest_stop_durable_ts, newest_stop_ts, oldest_start_ts; - uint64_t newest_stop_txn, oldest_start_txn; - bool hazard, prepare; + WT_TIME_AGGREGATE ta; + bool hazard; btree = S2BT(session); page = pageref->page; child = NULL; hazard = false; + __wt_time_aggregate_init(&ta); val = &r->v; vpack = &_vpack; @@ -283,22 +280,10 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) val->buf.size = __wt_cell_total_len(vpack); val->cell_len = 0; val->len = val->buf.size; - newest_start_durable_ts = vpack->newest_start_durable_ts; - oldest_start_ts = vpack->oldest_start_ts; - oldest_start_txn = vpack->oldest_start_txn; - newest_stop_durable_ts = vpack->newest_stop_durable_ts; - newest_stop_ts = vpack->newest_stop_ts; - newest_stop_txn = vpack->newest_stop_txn; - prepare = F_ISSET(vpack, WT_CELL_UNPACK_PREPARE); + __wt_time_aggregate_copy(&ta, &vpack->ta); } else { __wt_rec_cell_build_addr(session, r, addr, NULL, false, ref->ref_recno); - newest_start_durable_ts = addr->newest_start_durable_ts; - oldest_start_ts = addr->oldest_start_ts; - oldest_start_txn = addr->oldest_start_txn; - newest_stop_durable_ts = addr->newest_stop_durable_ts; - newest_stop_ts = addr->newest_stop_ts; - newest_stop_txn = addr->newest_stop_txn; - prepare = addr->prepare; + __wt_time_aggregate_copy(&ta, &addr->ta); } WT_CHILD_RELEASE_ERR(session, hazard, ref); @@ -308,8 +293,7 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) /* Copy the value onto 
the page. */ __wt_rec_image_copy(session, r, val); - __wt_rec_addr_ts_update(r, newest_start_durable_ts, oldest_start_ts, oldest_start_txn, - newest_stop_durable_ts, newest_stop_ts, newest_stop_txn, prepare); + __wt_time_aggregate_merge(&r->cur_ptr->ta, &ta); } WT_INTL_FOREACH_END; @@ -329,7 +313,6 @@ int __wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) { WT_BTREE *btree; - WT_DECL_RET; WT_INSERT *ins; WT_PAGE *page; WT_UPDATE *upd; @@ -350,13 +333,9 @@ __wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_SKIP_FOREACH (ins, WT_COL_UPDATE_SINGLE(page)) { WT_RET(__wt_rec_upd_select(session, r, ins, NULL, NULL, &upd_select)); upd = upd_select.upd; - if (upd != NULL) { + if (upd != NULL) __bit_setv( r->first_free, WT_INSERT_RECNO(ins) - pageref->ref_recno, btree->bitcnt, *upd->data); - /* Free the update if it is external. */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); - } } /* Calculate the number of entries per page remainder. */ @@ -422,17 +401,13 @@ __wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) * last, allowing it to grow in the future. */ __wt_rec_incr(session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); - WT_ERR(__wt_rec_split(session, r, 0, false)); + WT_RET(__wt_rec_split(session, r, 0, false)); /* Calculate the number of entries per page. */ entry = 0; nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); } - /* Free the update if it is external. */ - if (upd != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); - /* * Execute this loop once without an insert item to catch any missing records due to a * split, then quit. @@ -445,14 +420,9 @@ __wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) __wt_rec_incr(session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); /* Write the remnant page. 
*/ - ret = __wt_rec_split_finish(session, r); + WT_RET(__wt_rec_split_finish(session, r)); -err: - /* Free the update if it is external. */ - if (upd != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); - - return (ret); + return (0); } /* @@ -504,7 +474,7 @@ __wt_rec_col_fix_slvg( * We can't split during salvage -- if everything didn't fit, it's all gone wrong. */ if (salvage->missing != 0 || page_take != 0) - WT_PANIC_RET(session, WT_PANIC, "%s page too large, attempted split during salvage", + WT_RET_PANIC(session, WT_PANIC, "%s page too large, attempted split during salvage", __wt_page_type_string(page->type)); /* Write the page. */ @@ -517,9 +487,7 @@ __wt_rec_col_fix_slvg( */ static int __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_SALVAGE_COOKIE *salvage, - WT_ITEM *value, wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, - uint64_t rle, bool deleted, bool overflow_type) + WT_ITEM *value, WT_TIME_WINDOW *tw, uint64_t rle, bool deleted, bool overflow_type) { WT_BTREE *btree; WT_REC_KV *val; @@ -558,21 +526,18 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_SALVAGE_COOKI } if (deleted) { - val->cell_len = __wt_cell_pack_del(session, &val->cell, start_durable_ts, start_ts, - start_txn, stop_durable_ts, stop_ts, stop_txn, rle); + val->cell_len = __wt_cell_pack_del(session, &val->cell, tw, rle); val->buf.data = NULL; val->buf.size = 0; val->len = val->cell_len; } else if (overflow_type) { val->cell_len = - __wt_cell_pack_ovfl(session, &val->cell, WT_CELL_VALUE_OVFL, start_durable_ts, start_ts, - start_txn, stop_durable_ts, stop_ts, stop_txn, prepare, rle, value->size); + __wt_cell_pack_ovfl(session, &val->cell, WT_CELL_VALUE_OVFL, tw, rle, value->size); val->buf.data = value->data; val->buf.size = value->size; val->len = val->cell_len + value->size; } else - 
WT_RET(__wt_rec_cell_build_val(session, r, value->data, value->size, start_durable_ts, - start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, prepare, rle)); + WT_RET(__wt_rec_cell_build_val(session, r, value->data, value->size, tw, rle)); /* Boundary: split or write the page. */ if (__wt_rec_need_split(r, val->len)) @@ -580,11 +545,9 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_SALVAGE_COOKI /* Copy the value onto the page. */ if (!deleted && !overflow_type && btree->dictionary) - WT_RET(__wt_rec_dict_replace(session, r, start_durable_ts, start_ts, start_txn, - stop_durable_ts, stop_ts, stop_txn, prepare, rle, val)); + WT_RET(__wt_rec_dict_replace(session, r, tw, rle, val)); __wt_rec_image_copy(session, r, val); - __wt_rec_addr_ts_update( - r, start_durable_ts, start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, prepare); + __wt_time_aggregate_update(&r->cur_ptr->ta, tw); /* Update the starting record number in case we split. */ r->recno += rle; @@ -602,15 +565,9 @@ __wt_rec_col_var( { enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state; struct { - WT_ITEM *value; /* Value */ - wt_timestamp_t start_durable_ts; /* Timestamps/TxnID */ - wt_timestamp_t start_ts; - uint64_t start_txn; - wt_timestamp_t stop_durable_ts; - wt_timestamp_t stop_ts; - uint64_t stop_txn; + WT_ITEM *value; /* Value */ + WT_TIME_WINDOW tw; bool deleted; /* If deleted */ - bool prepare; } last; WT_BTREE *btree; WT_CELL *cell; @@ -621,13 +578,12 @@ __wt_rec_col_var( WT_DECL_RET; WT_INSERT *ins; WT_PAGE *page; + WT_TIME_WINDOW tw, default_tw; WT_UPDATE *upd; WT_UPDATE_SELECT upd_select; - wt_timestamp_t start_durable_ts, start_ts, stop_durable_ts, stop_ts; uint64_t n, nrepeat, repeat_count, rle, skip, src_recno; - uint64_t start_txn, stop_txn; uint32_t i, size; - bool deleted, orig_deleted, prepare, update_no_copy; + bool deleted, orig_deleted, update_no_copy; const void *data; btree = S2BT(session); @@ -636,33 +592,22 @@ __wt_rec_col_var( upd = NULL; size = 
0; data = NULL; + __wt_time_window_init(&default_tw); cbt = &r->update_modify_cbt; cbt->iface.session = (WT_SESSION *)session; /* Set the "last" values to cause failure if they're not set. */ last.value = r->last; - last.start_durable_ts = WT_TS_MAX; - last.start_ts = WT_TS_MAX; - last.start_txn = WT_TXN_MAX; - last.stop_durable_ts = WT_TS_MAX; - last.stop_ts = WT_TS_NONE; - last.stop_txn = WT_TXN_NONE; + __wt_time_window_init_max(&last.tw); last.deleted = false; - last.prepare = false; /* * Set the start/stop values to cause failure if they're not set. * [-Werror=maybe-uninitialized] */ /* NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) */ - start_durable_ts = WT_TS_NONE; - start_ts = WT_TS_MAX; - start_txn = WT_TXN_MAX; - stop_durable_ts = WT_TS_NONE; - stop_ts = WT_TS_NONE; - stop_txn = WT_TS_NONE; - prepare = false; + __wt_time_window_init_max(&tw); WT_RET(__wt_rec_split_init(session, r, page, pageref->ref_recno, btree->maxleafpage_precomp)); @@ -681,14 +626,8 @@ __wt_rec_col_var( if (salvage != NULL && salvage->missing != 0) { if (salvage->skip == 0) { rle = salvage->missing; - last.start_durable_ts = WT_TS_NONE; - last.start_ts = WT_TS_NONE; - last.start_txn = WT_TXN_NONE; - last.stop_durable_ts = WT_TS_NONE; - last.stop_ts = WT_TS_MAX; - last.stop_txn = WT_TXN_MAX; + __wt_time_window_init(&last.tw); last.deleted = true; - last.prepare = false; /* * Correct the number of records we're going to "take", pretending the missing records @@ -696,8 +635,8 @@ __wt_rec_col_var( */ salvage->take += salvage->missing; } else - WT_ERR(__rec_col_var_helper(session, r, NULL, NULL, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, - WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, prepare, salvage->missing, true, false)); + WT_ERR(__rec_col_var_helper( + session, r, NULL, NULL, &default_tw, salvage->missing, true, false)); } /* @@ -765,9 +704,8 @@ record_loop: ins = WT_SKIP_NEXT(ins); } - update_no_copy = - upd == NULL || !F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK); /* No data copy */ - 
repeat_count = 1; /* Single record */ + update_no_copy = true; /* No data copy */ + repeat_count = 1; /* Single record */ deleted = false; if (upd == NULL) { @@ -788,26 +726,12 @@ record_loop: */ deleted = orig_deleted; if (deleted || salvage) { - /* Set time pairs for the deleted key. */ - start_durable_ts = WT_TS_NONE; - start_ts = WT_TS_NONE; - start_txn = WT_TXN_NONE; - stop_durable_ts = WT_TS_NONE; - stop_ts = WT_TS_MAX; - stop_txn = WT_TXN_MAX; - prepare = false; + __wt_time_window_init(&tw); if (deleted) goto compare; - } else { - start_durable_ts = vpack->durable_start_ts; - start_ts = vpack->start_ts; - start_txn = vpack->start_txn; - stop_durable_ts = vpack->durable_stop_ts; - stop_ts = vpack->stop_ts; - stop_txn = vpack->stop_txn; - prepare = F_ISSET(vpack, WT_CELL_UNPACK_PREPARE); - } + } else + __wt_time_window_copy(&tw, &vpack->tw); /* * If we are handling overflow items, use the overflow item itself exactly once, @@ -822,18 +746,15 @@ record_loop: * We're going to copy the on-page cell, write out any record we're tracking. */ if (rle != 0) { - WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, - last.start_durable_ts, last.start_ts, last.start_txn, - last.stop_durable_ts, last.stop_ts, last.stop_txn, last.prepare, rle, - last.deleted, false)); + WT_ERR(__rec_col_var_helper( + session, r, salvage, last.value, &last.tw, rle, last.deleted, false)); rle = 0; } last.value->data = vpack->data; last.value->size = vpack->size; - WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, start_durable_ts, - start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, prepare, - repeat_count, false, true)); + WT_ERR(__rec_col_var_helper( + session, r, salvage, last.value, &tw, repeat_count, false, true)); /* Track if page has overflow items. 
*/ r->ovfl_items = true; @@ -859,18 +780,14 @@ record_loop: break; } } else { - start_durable_ts = upd_select.start_durable_ts; - start_ts = upd_select.start_ts; - start_txn = upd_select.start_txn; - stop_durable_ts = upd_select.stop_durable_ts; - stop_ts = upd_select.stop_ts; - stop_txn = upd_select.stop_txn; - prepare = upd_select.prepare; + __wt_time_window_copy(&tw, &upd_select.tw); switch (upd->type) { case WT_UPDATE_MODIFY: cbt->slot = WT_COL_SLOT(page, cip); - WT_ERR(__wt_value_return_upd(cbt, upd)); + WT_ERR( + __wt_modify_reconstruct_from_upd_list(session, cbt, upd, cbt->upd_value)); + WT_ERR(__wt_value_return(cbt, cbt->upd_value)); data = cbt->iface.value.data; size = (uint32_t)cbt->iface.value.size; update_no_copy = false; @@ -880,13 +797,7 @@ record_loop: size = upd->size; break; case WT_UPDATE_TOMBSTONE: - start_durable_ts = WT_TS_NONE; - start_ts = WT_TS_NONE; - start_txn = WT_TXN_NONE; - stop_durable_ts = WT_TS_NONE; - stop_ts = WT_TS_MAX; - stop_txn = WT_TXN_MAX; - prepare = false; + __wt_time_window_init(&tw); deleted = true; break; default: @@ -902,9 +813,7 @@ compare: * record number, we've been doing that all along. */ if (rle != 0) { - if ((last.start_durable_ts == start_durable_ts && last.start_ts == start_ts && - last.start_txn == start_txn && last.stop_durable_ts == stop_durable_ts && - last.stop_ts == stop_ts && last.stop_txn == stop_txn) && + if (__wt_time_windows_equal(&tw, &last.tw) && ((deleted && last.deleted) || (!deleted && !last.deleted && last.value->size == size && memcmp(last.value->data, data, size) == 0))) { @@ -914,17 +823,13 @@ compare: * tombstone to write to disk and the deletion of the keys must be globally * visible. 
*/ - WT_ASSERT(session, - (!deleted && !last.deleted) || - (last.start_durable_ts == WT_TS_NONE && last.start_ts == WT_TS_NONE && - last.start_txn == WT_TXN_NONE && last.stop_durable_ts == WT_TS_NONE && - last.stop_ts == WT_TS_MAX && last.stop_txn == WT_TXN_MAX)); + WT_ASSERT( + session, (!deleted && !last.deleted) || __wt_time_window_is_empty(&last.tw)); rle += repeat_count; continue; } - WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, last.start_durable_ts, - last.start_ts, last.start_txn, last.stop_durable_ts, last.stop_ts, last.stop_txn, - last.prepare, rle, last.deleted, false)); + WT_ERR(__rec_col_var_helper( + session, r, salvage, last.value, &last.tw, rle, last.deleted, false)); } /* @@ -948,17 +853,7 @@ compare: WT_ERR(__wt_buf_set(session, last.value, data, size)); } - /* Free the update if it is external. */ - if (upd != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); - - last.start_durable_ts = start_durable_ts; - last.start_ts = start_ts; - last.start_txn = start_txn; - last.stop_durable_ts = stop_durable_ts; - last.stop_ts = stop_ts; - last.stop_txn = stop_txn; - last.prepare = prepare; + __wt_time_window_copy(&last.tw, &tw); last.deleted = deleted; rle = repeat_count; } @@ -1005,8 +900,7 @@ compare: } while (src_recno <= n) { - update_no_copy = - upd == NULL || !F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK); /* No data copy */ + update_no_copy = true; /* No data copy */ deleted = false; /* @@ -1022,10 +916,7 @@ compare: * tombstone to write to disk and the deletion of the keys must be globally * visible. 
*/ - WT_ASSERT(session, last.start_durable_ts == WT_TS_NONE && - last.start_ts == WT_TS_NONE && last.start_txn == WT_TXN_NONE && - last.stop_durable_ts == WT_TS_NONE && last.stop_ts == WT_TS_MAX && - last.stop_txn == WT_TXN_MAX); + WT_ASSERT(session, __wt_time_window_is_empty(&last.tw)); /* * The record adjustment is decremented by one so we can naturally fall into the * RLE accounting below, where we increment rle by one, then continue in the @@ -1034,36 +925,16 @@ compare: skip = (n - src_recno) - 1; rle += skip; src_recno += skip; - } else { + } else /* Set time pairs for the first deleted key in a deleted range. */ - start_durable_ts = WT_TS_NONE; - start_ts = WT_TS_NONE; - start_txn = WT_TXN_NONE; - stop_durable_ts = WT_TS_NONE; - stop_ts = WT_TS_MAX; - stop_txn = WT_TXN_MAX; - prepare = false; - } + __wt_time_window_init(&tw); } else if (upd == NULL) { /* The updates on the key are all uncommitted so we write a deleted key to disk. */ - start_durable_ts = WT_TS_NONE; - start_ts = WT_TS_NONE; - start_txn = WT_TXN_NONE; - stop_durable_ts = WT_TS_NONE; - stop_ts = WT_TS_MAX; - stop_txn = WT_TXN_MAX; - prepare = false; - + __wt_time_window_init(&tw); deleted = true; } else { /* Set time pairs for a key. */ - start_durable_ts = upd_select.start_durable_ts; - start_ts = upd_select.start_ts; - start_txn = upd_select.start_txn; - stop_durable_ts = upd_select.stop_durable_ts; - stop_ts = upd_select.stop_ts; - stop_txn = upd_select.stop_txn; - prepare = upd_select.prepare; + __wt_time_window_copy(&tw, &upd_select.tw); switch (upd->type) { case WT_UPDATE_MODIFY: @@ -1071,7 +942,9 @@ compare: * Impossible slot, there's no backing on-page item. 
*/ cbt->slot = UINT32_MAX; - WT_ERR(__wt_value_return_upd(cbt, upd)); + WT_ERR( + __wt_modify_reconstruct_from_upd_list(session, cbt, upd, cbt->upd_value)); + WT_ERR(__wt_value_return(cbt, cbt->upd_value)); data = cbt->iface.value.data; size = (uint32_t)cbt->iface.value.size; update_no_copy = false; @@ -1081,13 +954,7 @@ compare: size = upd->size; break; case WT_UPDATE_TOMBSTONE: - start_durable_ts = WT_TS_NONE; - start_ts = WT_TS_NONE; - start_txn = WT_TXN_NONE; - stop_durable_ts = WT_TS_NONE; - stop_ts = WT_TS_MAX; - stop_txn = WT_TXN_MAX; - prepare = false; + __wt_time_window_init(&tw); deleted = true; break; default: @@ -1100,12 +967,7 @@ compare: * the same thing. */ if (rle != 0) { - /* - * FIXME-PM-1521: Follow up issue with clang in WT-5341. - */ - if ((last.start_durable_ts == start_durable_ts && last.start_ts == start_ts && - last.start_txn == start_txn && last.stop_durable_ts == stop_durable_ts && - last.stop_ts == stop_ts && last.stop_txn == stop_txn) && + if (__wt_time_windows_equal(&last.tw, &tw) && ((deleted && last.deleted) || (!deleted && !last.deleted && last.value->size == size && memcmp(last.value->data, data, size) == 0))) { @@ -1115,17 +977,17 @@ compare: * tombstone to write to disk and the deletion of the keys must be globally * visible. 
*/ - WT_ASSERT(session, (!deleted && !last.deleted) || - (last.start_durable_ts == start_durable_ts && last.start_ts == WT_TS_NONE && - last.start_txn == WT_TXN_NONE && - last.stop_durable_ts == stop_durable_ts && - last.stop_ts == WT_TS_MAX && last.stop_txn == WT_TXN_MAX)); + WT_ASSERT(session, + (!deleted && !last.deleted) || + (last.tw.durable_start_ts == tw.durable_start_ts && + last.tw.start_ts == WT_TS_NONE && last.tw.start_txn == WT_TXN_NONE && + last.tw.durable_stop_ts == tw.durable_stop_ts && + last.tw.stop_ts == WT_TS_MAX && last.tw.stop_txn == WT_TXN_MAX)); ++rle; goto next; } - WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, last.start_durable_ts, - last.start_ts, last.start_txn, last.stop_durable_ts, last.stop_ts, last.stop_txn, - last.prepare, rle, last.deleted, false)); + WT_ERR(__rec_col_var_helper( + session, r, salvage, last.value, &last.tw, rle, last.deleted, false)); } /* @@ -1143,18 +1005,8 @@ compare: WT_ERR(__wt_buf_set(session, last.value, data, size)); } - /* Free the update if it is external. */ - if (upd != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); - /* Ready for the next loop, reset the RLE counter. */ - last.start_durable_ts = start_durable_ts; - last.start_ts = start_ts; - last.start_txn = start_txn; - last.stop_durable_ts = stop_durable_ts; - last.stop_ts = stop_ts; - last.stop_txn = stop_txn; - last.prepare = prepare; + __wt_time_window_copy(&last.tw, &tw); last.deleted = deleted; rle = 1; @@ -1178,18 +1030,13 @@ next: /* If we were tracking a record, write it. */ if (rle != 0) - WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, last.start_durable_ts, - last.start_ts, last.start_txn, last.stop_durable_ts, last.stop_ts, last.stop_txn, - last.prepare, rle, last.deleted, false)); + WT_ERR(__rec_col_var_helper( + session, r, salvage, last.value, &last.tw, rle, last.deleted, false)); /* Write the remnant page. 
*/ ret = __wt_rec_split_finish(session, r); err: - /* Free the update if it is external. */ - if (upd != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); - __wt_scr_free(session, &orig); return (ret); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c index d65768aba49..5365e077b65 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_row.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c @@ -80,8 +80,7 @@ __rec_cell_build_int_key( WT_STAT_DATA_INCR(session, rec_overflow_key_internal); *is_ovflp = true; - return (__wt_rec_cell_build_ovfl(session, r, key, WT_CELL_KEY_OVFL, WT_TS_NONE, WT_TS_NONE, - WT_TXN_NONE, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, false, 0)); + return (__wt_rec_cell_build_ovfl(session, r, key, WT_CELL_KEY_OVFL, NULL, 0)); } key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size); @@ -172,8 +171,7 @@ __rec_cell_build_leaf_key( WT_STAT_DATA_INCR(session, rec_overflow_key_leaf); *is_ovflp = true; - return (__wt_rec_cell_build_ovfl(session, r, key, WT_CELL_KEY_OVFL, WT_TS_NONE, - WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, false, 0)); + return (__wt_rec_cell_build_ovfl(session, r, key, WT_CELL_KEY_OVFL, NULL, 0)); } return (__rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp)); } @@ -195,19 +193,20 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_CURSOR *cursor; WT_RECONCILE *r; WT_REC_KV *key, *val; + WT_TIME_WINDOW tw; bool ovfl_key; r = cbulk->reconcile; btree = S2BT(session); cursor = &cbulk->cbt.iface; + __wt_time_window_init(&tw); key = &r->k; val = &r->v; WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */ cursor->key.data, cursor->key.size, &ovfl_key)); WT_RET(__wt_rec_cell_build_val(session, r, cursor->value.data, /* Build value cell */ - cursor->value.size, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, - false, 0)); + 
cursor->value.size, &tw, 0)); /* Boundary: split or write the page. */ if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) { @@ -230,12 +229,10 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) else { r->all_empty_value = false; if (btree->dictionary) - WT_RET(__wt_rec_dict_replace(session, r, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, - WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, false, 0, val)); + WT_RET(__wt_rec_dict_replace(session, r, &tw, 0, val)); __wt_rec_image_copy(session, r, val); } - __wt_rec_addr_ts_update( - r, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, false); + __wt_time_aggregate_update(&r->cur_ptr->ta, &tw); /* Update compression state. */ __rec_key_state_update(r, ovfl_key); @@ -279,9 +276,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Copy the key and value onto the page. */ __wt_rec_image_copy(session, r, key); __wt_rec_image_copy(session, r, val); - __wt_rec_addr_ts_update(r, addr->newest_start_durable_ts, addr->oldest_start_ts, - addr->oldest_start_txn, addr->newest_stop_durable_ts, addr->newest_stop_ts, - addr->newest_stop_txn, addr->prepare); + __wt_time_aggregate_merge(&r->cur_ptr->ta, &addr->ta); /* Update compression state. 
*/ __rec_key_state_update(r, ovfl_key); @@ -306,10 +301,9 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_PAGE *child; WT_REC_KV *key, *val; WT_REF *ref; - wt_timestamp_t newest_start_durable_ts, newest_stop_durable_ts, newest_stop_ts, oldest_start_ts; + WT_TIME_AGGREGATE ta; size_t key_overflow_size, size; - uint64_t newest_stop_txn, oldest_start_txn; - bool force, hazard, key_onpage_ovfl, ovfl_key, prepare; + bool force, hazard, key_onpage_ovfl, ovfl_key; const void *p; btree = S2BT(session); @@ -437,16 +431,10 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ if (__wt_off_page(page, addr)) { __wt_rec_cell_build_addr(session, r, addr, NULL, state == WT_CHILD_PROXY, WT_RECNO_OOB); - newest_start_durable_ts = addr->newest_start_durable_ts; - oldest_start_ts = addr->oldest_start_ts; - oldest_start_txn = addr->oldest_start_txn; - newest_stop_durable_ts = addr->newest_stop_durable_ts; - newest_stop_ts = addr->newest_stop_ts; - newest_stop_txn = addr->newest_stop_txn; - prepare = addr->prepare; + __wt_time_aggregate_copy(&ta, &addr->ta); } else { __wt_cell_unpack(session, page, ref->addr, vpack); - if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_PAIRS_CLEARED)) { + if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED)) { /* * The transaction ids are cleared after restart. Repack the cell with new validity * to flush the cleared transaction ids. 
@@ -464,13 +452,7 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) val->cell_len = 0; val->len = val->buf.size; } - newest_start_durable_ts = vpack->newest_start_durable_ts; - oldest_start_ts = vpack->oldest_start_ts; - oldest_start_txn = vpack->oldest_start_txn; - newest_stop_durable_ts = vpack->newest_stop_durable_ts; - newest_stop_ts = vpack->newest_stop_ts; - newest_stop_txn = vpack->newest_stop_txn; - prepare = F_ISSET(vpack, WT_CELL_UNPACK_PREPARE); + __wt_time_aggregate_copy(&ta, &vpack->ta); } WT_CHILD_RELEASE_ERR(session, hazard, ref); @@ -524,8 +506,7 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Copy the key and value onto the page. */ __wt_rec_image_copy(session, r, key); __wt_rec_image_copy(session, r, val); - __wt_rec_addr_ts_update(r, newest_start_durable_ts, oldest_start_ts, oldest_start_txn, - newest_stop_durable_ts, newest_stop_ts, newest_stop_txn, prepare); + __wt_time_aggregate_merge(&r->cur_ptr->ta, &ta); /* Update compression state. */ __rec_key_state_update(r, ovfl_key); @@ -545,15 +526,16 @@ err: * Return if a zero-length item can be written. */ static bool -__rec_row_zero_len(WT_SESSION_IMPL *session, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t stop_ts, uint64_t stop_txn) +__rec_row_zero_len(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw) { /* - * The item must be globally visible because we're not writing anything on the page. + * The item must be globally visible because we're not writing anything on the page. Don't be + * tempted to check the time window against the default here - the check is subtly different due + * to the grouping. 
*/ - return ((stop_ts == WT_TS_MAX && stop_txn == WT_TXN_MAX) && - ((start_ts == WT_TS_NONE && start_txn == WT_TXN_NONE) || - __wt_txn_visible_all(session, start_txn, start_ts))); + return ((tw->stop_ts == WT_TS_MAX && tw->stop_txn == WT_TXN_MAX) && + ((tw->start_ts == WT_TS_NONE && tw->start_txn == WT_TXN_NONE) || + __wt_txn_visible_all(session, tw->start_txn, tw->start_ts))); } /* @@ -565,13 +547,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) { WT_BTREE *btree; WT_CURSOR_BTREE *cbt; - WT_DECL_RET; WT_REC_KV *key, *val; + WT_TIME_WINDOW tw; WT_UPDATE *upd; WT_UPDATE_SELECT upd_select; - wt_timestamp_t start_durable_ts, start_ts, stop_durable_ts, stop_ts; - uint64_t start_txn, stop_txn; - bool ovfl_key, prepare; + bool ovfl_key; btree = S2BT(session); @@ -588,13 +568,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) if ((upd = upd_select.upd) == NULL) continue; - start_durable_ts = upd_select.start_durable_ts; - start_ts = upd_select.start_ts; - start_txn = upd_select.start_txn; - stop_durable_ts = upd_select.stop_durable_ts; - stop_ts = upd_select.stop_ts; - stop_txn = upd_select.stop_txn; - prepare = upd_select.prepare; + __wt_time_window_copy(&tw, &upd_select.tw); switch (upd->type) { case WT_UPDATE_MODIFY: @@ -602,28 +576,22 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) * Impossible slot, there's no backing on-page item. 
*/ cbt->slot = UINT32_MAX; - WT_RET(__wt_value_return_upd(cbt, upd)); - WT_RET(__wt_rec_cell_build_val(session, r, cbt->iface.value.data, cbt->iface.value.size, - start_durable_ts, start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, prepare, - 0)); + WT_RET(__wt_modify_reconstruct_from_upd_list(session, cbt, upd, cbt->upd_value)); + WT_RET(__wt_value_return(cbt, cbt->upd_value)); + WT_RET(__wt_rec_cell_build_val( + session, r, cbt->iface.value.data, cbt->iface.value.size, &tw, 0)); break; case WT_UPDATE_STANDARD: /* Take the value from the update. */ - WT_ERR(__wt_rec_cell_build_val(session, r, upd->data, upd->size, start_durable_ts, - start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, prepare, 0)); + WT_RET(__wt_rec_cell_build_val(session, r, upd->data, upd->size, &tw, 0)); break; case WT_UPDATE_TOMBSTONE: continue; default: - ret = __wt_illegal_value(session, upd->type); - WT_ERR(ret); + WT_RET(__wt_illegal_value(session, upd->type)); } - /* Free the update if it is external. */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); - /* Build key cell. */ - WT_ERR(__rec_cell_build_leaf_key( + WT_RET(__rec_cell_build_leaf_key( session, r, WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); /* Boundary: split or write the page. */ @@ -635,36 +603,29 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) if (r->key_pfx_compress_conf) { r->key_pfx_compress = false; if (!ovfl_key) - WT_ERR(__rec_cell_build_leaf_key(session, r, NULL, 0, &ovfl_key)); + WT_RET(__rec_cell_build_leaf_key(session, r, NULL, 0, &ovfl_key)); } - WT_ERR(__wt_rec_split_crossing_bnd(session, r, key->len + val->len, false)); + WT_RET(__wt_rec_split_crossing_bnd(session, r, key->len + val->len, false)); } /* Copy the key/value pair onto the page. 
*/ __wt_rec_image_copy(session, r, key); - if (val->len == 0 && __rec_row_zero_len(session, start_ts, start_txn, stop_ts, stop_txn)) + if (val->len == 0 && __rec_row_zero_len(session, &tw)) r->any_empty_value = true; else { r->all_empty_value = false; if (btree->dictionary) - WT_ERR(__wt_rec_dict_replace(session, r, start_durable_ts, start_ts, start_txn, - stop_durable_ts, stop_ts, stop_txn, prepare, 0, val)); + WT_RET(__wt_rec_dict_replace(session, r, &tw, 0, val)); __wt_rec_image_copy(session, r, val); } - __wt_rec_addr_ts_update( - r, start_durable_ts, start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, prepare); + __wt_time_aggregate_update(&r->cur_ptr->ta, &tw); /* Update compression state. */ __rec_key_state_update(r, ovfl_key); } -err: - /* Free the update if it is external. */ - if (upd != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); - - return (ret); + return (0); } /* @@ -673,8 +634,7 @@ err: */ static inline int __rec_cell_repack(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, WT_CELL_UNPACK *vpack, - wt_timestamp_t start_durable_ts, uint64_t start_txn, wt_timestamp_t start_ts, - wt_timestamp_t stop_durable_ts, uint64_t stop_txn, wt_timestamp_t stop_ts, bool prepare) + WT_TIME_WINDOW *tw) { WT_DECL_ITEM(tmpval); WT_DECL_RET; @@ -693,8 +653,7 @@ __rec_cell_repack(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, WT p = tmpval->data; size = tmpval->size; } - WT_ERR(__wt_rec_cell_build_val(session, r, p, size, start_durable_ts, start_ts, start_txn, - stop_durable_ts, stop_ts, stop_txn, prepare, 0)); + WT_ERR(__wt_rec_cell_build_val(session, r, p, size, tw, 0)); err: __wt_scr_free(session, &tmpval); @@ -721,18 +680,18 @@ __wt_rec_row_leaf( WT_PAGE *page; WT_REC_KV *key, *val; WT_ROW *rip; + WT_TIME_WINDOW tw; WT_UPDATE *upd; WT_UPDATE_SELECT upd_select; - wt_timestamp_t start_durable_ts, start_ts, stop_durable_ts, stop_ts; - uint64_t slvg_skip, start_txn, stop_txn; + uint64_t 
slvg_skip; uint32_t i; - bool dictionary, key_onpage_ovfl, ovfl_key, prepare; + bool dictionary, key_onpage_ovfl, ovfl_key; void *copy; btree = S2BT(session); page = pageref->page; - prepare = false; slvg_skip = salvage == NULL ? 0 : salvage->skip; + __wt_time_window_init(&tw); cbt = &r->update_modify_cbt; cbt->iface.session = (WT_SESSION *)session; @@ -796,37 +755,19 @@ __wt_rec_row_leaf( * pair information, else take the time pairs from the cell. */ if (upd == NULL) { - if (!salvage) { - start_durable_ts = vpack->durable_start_ts; - start_ts = vpack->start_ts; - start_txn = vpack->start_txn; - stop_durable_ts = vpack->durable_stop_ts; - stop_ts = vpack->stop_ts; - stop_txn = vpack->stop_txn; - } else { - start_durable_ts = WT_TS_NONE; - start_ts = WT_TS_NONE; - start_txn = WT_TXN_NONE; - stop_durable_ts = WT_TS_NONE; - stop_ts = WT_TS_MAX; - stop_txn = WT_TXN_MAX; - } - } else { - start_durable_ts = upd_select.start_durable_ts; - start_ts = upd_select.start_ts; - start_txn = upd_select.start_txn; - stop_durable_ts = upd_select.stop_durable_ts; - stop_ts = upd_select.stop_ts; - stop_txn = upd_select.stop_txn; - prepare = upd_select.prepare; - } + if (!salvage) + __wt_time_window_copy(&tw, &vpack->tw); + else + __wt_time_window_init(&tw); + } else + __wt_time_window_copy(&tw, &upd_select.tw); /* * If we reconcile an on disk key with a globally visible stop time pair and there are no * new updates for that key, skip writing that key. */ - if (upd == NULL && (stop_txn != WT_TXN_MAX || stop_ts != WT_TS_MAX) && - __wt_txn_visible_all(session, stop_txn, stop_ts)) + if (upd == NULL && (tw.stop_txn != WT_TXN_MAX || tw.stop_ts != WT_TS_MAX) && + __wt_txn_visible_all(session, tw.stop_txn, tw.stop_ts)) upd = &upd_tombstone; /* Build value cell. */ @@ -841,11 +782,10 @@ __wt_rec_row_leaf( * Repack the cell if we clear the transaction ids in the cell. 
*/ if (vpack->raw == WT_CELL_VALUE_COPY) { - WT_ERR(__rec_cell_repack(session, btree, r, vpack, start_durable_ts, start_txn, - start_ts, stop_durable_ts, stop_txn, stop_ts, prepare)); + WT_ERR(__rec_cell_repack(session, btree, r, vpack, &tw)); dictionary = true; - } else if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_PAIRS_CLEARED)) { + } else if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_WINDOW_CLEARED)) { /* * The transaction ids are cleared after restart. Repack the cell to flush the * cleared transaction ids. @@ -857,13 +797,11 @@ __wt_rec_row_leaf( val->buf.size = vpack->size; /* Rebuild the cell. */ - val->cell_len = __wt_cell_pack_ovfl(session, &val->cell, vpack->raw, - start_durable_ts, start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, - prepare, 0, val->buf.size); + val->cell_len = + __wt_cell_pack_ovfl(session, &val->cell, vpack->raw, &tw, 0, val->buf.size); val->len = val->cell_len + val->buf.size; } else - WT_ERR(__rec_cell_repack(session, btree, r, vpack, start_durable_ts, start_txn, - start_ts, stop_durable_ts, stop_txn, stop_ts, prepare)); + WT_ERR(__rec_cell_repack(session, btree, r, vpack, &tw)); dictionary = true; } else { @@ -884,16 +822,15 @@ __wt_rec_row_leaf( switch (upd->type) { case WT_UPDATE_MODIFY: cbt->slot = WT_ROW_SLOT(page, rip); - WT_ERR(__wt_value_return_upd(cbt, upd)); - WT_ERR(__wt_rec_cell_build_val(session, r, cbt->iface.value.data, - cbt->iface.value.size, start_durable_ts, start_ts, start_txn, stop_durable_ts, - stop_ts, stop_txn, prepare, 0)); + WT_ERR(__wt_modify_reconstruct_from_upd_list(session, cbt, upd, cbt->upd_value)); + WT_ERR(__wt_value_return(cbt, cbt->upd_value)); + WT_ERR(__wt_rec_cell_build_val( + session, r, cbt->iface.value.data, cbt->iface.value.size, &tw, 0)); dictionary = true; break; case WT_UPDATE_STANDARD: /* Take the value from the update. 
*/ - WT_ERR(__wt_rec_cell_build_val(session, r, upd->data, upd->size, start_durable_ts, - start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, prepare, 0)); + WT_ERR(__wt_rec_cell_build_val(session, r, upd->data, upd->size, &tw, 0)); dictionary = true; break; case WT_UPDATE_TOMBSTONE: @@ -940,9 +877,6 @@ __wt_rec_row_leaf( default: WT_ERR(__wt_illegal_value(session, upd->type)); } - /* Free the update if it is external. */ - if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); } /* @@ -1022,17 +956,15 @@ build: /* Copy the key/value pair onto the page. */ __wt_rec_image_copy(session, r, key); - if (val->len == 0 && __rec_row_zero_len(session, start_ts, start_txn, stop_ts, stop_txn)) + if (val->len == 0 && __rec_row_zero_len(session, &tw)) r->any_empty_value = true; else { r->all_empty_value = false; if (dictionary && btree->dictionary) - WT_ERR(__wt_rec_dict_replace(session, r, start_durable_ts, start_ts, start_txn, - stop_durable_ts, stop_ts, stop_txn, prepare, 0, val)); + WT_ERR(__wt_rec_dict_replace(session, r, &tw, 0, val)); __wt_rec_image_copy(session, r, val); } - __wt_rec_addr_ts_update( - r, start_durable_ts, start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, prepare); + __wt_time_aggregate_update(&r->cur_ptr->ta, &tw); /* Update compression state. */ __rec_key_state_update(r, ovfl_key); @@ -1047,10 +979,6 @@ leaf_insert: ret = __wt_rec_split_finish(session, r); err: - /* Free the update if it is external. 
*/ - if (upd != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) - __wt_free_update_list(session, &upd); - __wt_scr_free(session, &tmpkey); return (ret); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index 36e2de9ccc5..aa44301a21d 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -76,11 +76,18 @@ __rec_append_orig_value( return (0); /* + * Prepared updates should already be in the update list, add the original update to the + * list only when the prepared update is a tombstone. + */ + if (F_ISSET(unpack, WT_CELL_UNPACK_PREPARE) && upd->type != WT_UPDATE_TOMBSTONE) + return (0); + + /* * Done if the on page value already appears on the update list. We can't do the same check * for stop time pair because we may still need to append the onpage value if only the * tombstone is on the update chain. */ - if (unpack->start_ts == upd->start_ts && unpack->start_txn == upd->txnid && + if (unpack->tw.start_ts == upd->start_ts && unpack->tw.start_txn == upd->txnid && upd->type != WT_UPDATE_TOMBSTONE) return (0); @@ -104,8 +111,8 @@ __rec_append_orig_value( } /* Done if the stop time pair of the onpage cell is globally visible. */ - if ((unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) && - __wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts)) + if ((unpack->tw.stop_ts != WT_TS_MAX || unpack->tw.stop_txn != WT_TXN_MAX) && + __wt_txn_visible_all(session, unpack->tw.stop_txn, unpack->tw.stop_ts)) return (0); /* We need the original on-page value for some reader: get a copy. 
*/ @@ -113,9 +120,9 @@ __rec_append_orig_value( WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); WT_ERR(__wt_upd_alloc(session, tmp, WT_UPDATE_STANDARD, &append, &size)); total_size += size; - append->txnid = unpack->start_txn; - append->start_ts = unpack->start_ts; - append->durable_ts = unpack->durable_start_ts; + append->txnid = unpack->tw.start_txn; + append->start_ts = unpack->tw.start_ts; + append->durable_ts = unpack->tw.durable_start_ts; /* * Additionally, we need to append a tombstone before the onpage value we're about to append to @@ -123,20 +130,26 @@ __rec_append_orig_value( * delete a value respectively at timestamp 0 and 10, and later insert it again at 20. We need * the tombstone to tell us there is no value between 10 and 20. */ - if (unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) { + if (unpack->tw.stop_ts != WT_TS_MAX || unpack->tw.stop_txn != WT_TXN_MAX) { /* No need to append the tombstone if it is already in the update chain. */ if (oldest_upd->type != WT_UPDATE_TOMBSTONE) { WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, &size)); total_size += size; - tombstone->txnid = unpack->stop_txn; - tombstone->start_ts = unpack->stop_ts; - tombstone->durable_ts = unpack->durable_stop_ts; + tombstone->txnid = unpack->tw.stop_txn; + tombstone->start_ts = unpack->tw.stop_ts; + tombstone->durable_ts = unpack->tw.durable_stop_ts; tombstone->next = append; append = tombstone; } else - WT_ASSERT(session, - unpack->stop_ts == oldest_upd->start_ts && unpack->stop_txn == oldest_upd->txnid); + /* + * Once the prepared update is resolved, the in-memory update and on-disk written copy + * doesn't have same timestamp due to replacing of prepare timestamp with commit and + * durable timestamps. Don't compare them when the on-disk version is a prepare. 
+ */ + WT_ASSERT(session, F_ISSET(unpack, WT_CELL_UNPACK_PREPARE) || + (unpack->tw.stop_ts == oldest_upd->start_ts && + unpack->tw.stop_txn == oldest_upd->txnid)); } /* Append the new entry into the update list. */ @@ -161,6 +174,9 @@ static inline bool __rec_need_save_upd( WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE_SELECT *upd_select, bool has_newer_updates) { + if (upd_select->tw.prepare) + return (true); + if (F_ISSET(r, WT_REC_EVICT) && has_newer_updates) return (true); @@ -176,8 +192,8 @@ __rec_need_save_upd( if (F_ISSET(r, WT_REC_CHECKPOINT) && upd_select->upd == NULL) return (false); - return (!__wt_txn_visible_all(session, upd_select->stop_txn, upd_select->stop_ts) && - !__wt_txn_visible_all(session, upd_select->start_txn, upd_select->start_ts)); + return (!__wt_txn_visible_all(session, upd_select->tw.stop_txn, upd_select->tw.stop_ts) && + !__wt_txn_visible_all(session, upd_select->tw.start_txn, upd_select->tw.start_ts)); } /* @@ -191,10 +207,12 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v WT_DECL_ITEM(tmp); WT_DECL_RET; WT_PAGE *page; + WT_TIME_WINDOW *select_tw; WT_UPDATE *first_txn_upd, *first_upd, *upd, *last_upd, *tombstone; wt_timestamp_t max_ts; size_t upd_memsize; uint64_t max_txn, txnid; + char time_string[WT_TIME_STRING_SIZE]; bool has_newer_updates, is_hs_page, supd_restore, upd_saved; /* @@ -202,13 +220,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * both must be initialized. 
*/ upd_select->upd = NULL; - upd_select->start_durable_ts = WT_TS_NONE; - upd_select->start_ts = WT_TS_NONE; - upd_select->start_txn = WT_TXN_NONE; - upd_select->stop_durable_ts = WT_TS_NONE; - upd_select->stop_ts = WT_TS_MAX; - upd_select->stop_txn = WT_TXN_MAX; - upd_select->prepare = false; + select_tw = &upd_select->tw; + __wt_time_window_init(select_tw); page = r->page; first_txn_upd = upd = last_upd = tombstone = NULL; @@ -264,20 +277,24 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v continue; } + /* Ignore prepared updates if it is not eviction. */ if (upd->prepare_state == WT_PREPARE_LOCKED || upd->prepare_state == WT_PREPARE_INPROGRESS) { - WT_ASSERT(session, upd_select->upd == NULL); - has_newer_updates = true; - if (upd->start_ts > max_ts) - max_ts = upd->start_ts; - - /* - * Track the oldest update not on the page, used to decide whether reads can use the - * page image, hence using the start rather than the durable timestamp. - */ - if (upd->start_ts < r->min_skipped_ts) - r->min_skipped_ts = upd->start_ts; - continue; + WT_ASSERT(session, upd_select->upd == NULL || upd_select->upd->txnid == upd->txnid); + if (!F_ISSET(r, WT_REC_EVICT)) { + has_newer_updates = true; + if (upd->start_ts > max_ts) + max_ts = upd->start_ts; + + /* + * Track the oldest update not on the page, used to decide whether reads can use the + * page image, hence using the start rather than the durable timestamp. + */ + if (upd->start_ts < r->min_skipped_ts) + r->min_skipped_ts = upd->start_ts; + continue; + } else + WT_ASSERT(session, upd->prepare_state == WT_PREPARE_INPROGRESS); } /* Track the first update with non-zero timestamp. 
*/ @@ -322,7 +339,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v */ if (has_newer_updates && F_ISSET(r, WT_REC_CLEAN_AFTER_REC | WT_REC_VISIBILITY_ERR)) { if (F_ISSET(r, WT_REC_VISIBILITY_ERR)) - WT_PANIC_RET(session, EINVAL, "reconciliation error, update not visible"); + WT_RET_PANIC(session, EINVAL, "reconciliation error, update not visible"); return (__wt_set_return(session, EBUSY)); } @@ -341,6 +358,14 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v */ if (upd != NULL) { /* + * Mark the prepare flag if the selected update is an uncommitted prepare. As tombstone + * updates are never returned to write, set this flag before we move into the previous + * update to write. + */ + if (upd->prepare_state == WT_PREPARE_INPROGRESS) + select_tw->prepare = 1; + + /* * If the newest is a tombstone then select the update before it and set the end of the * visibility window to its time pair as appropriate to indicate that we should return "not * found" for reads after this point. @@ -349,9 +374,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * indicate that the value is visible to any timestamp/transaction id ahead of it. */ if (upd->type == WT_UPDATE_TOMBSTONE) { - upd_select->stop_ts = upd->start_ts; - upd_select->stop_txn = upd->txnid; - upd_select->stop_durable_ts = upd->durable_ts; + __wt_time_window_set_stop(select_tw, upd); tombstone = upd; /* Find the update this tombstone applies to. */ @@ -366,10 +389,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v } if (upd != NULL) { /* The beginning of the validity window is the selected update's time pair. 
*/ - upd_select->start_ts = upd->start_ts; - upd_select->start_durable_ts = upd->durable_ts; - upd_select->start_txn = upd->txnid; - } else if (upd_select->stop_ts != WT_TS_NONE || upd_select->stop_txn != WT_TXN_NONE) { + __wt_time_window_set_start(select_tw, upd); + } else if (select_tw->stop_ts != WT_TS_NONE || select_tw->stop_txn != WT_TXN_NONE) { /* If we only have a tombstone in the update list, we must have an ondisk value. */ WT_ASSERT(session, vpack != NULL && tombstone != NULL); /* @@ -384,13 +405,11 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v */ WT_ERR(__rec_append_orig_value(session, page, tombstone, vpack)); WT_ASSERT(session, last_upd->next != NULL && - last_upd->next->txnid == vpack->start_txn && - last_upd->next->start_ts == vpack->start_ts && + last_upd->next->txnid == vpack->tw.start_txn && + last_upd->next->start_ts == vpack->tw.start_ts && last_upd->next->type == WT_UPDATE_STANDARD && last_upd->next->next == NULL); upd_select->upd = last_upd->next; - upd_select->start_ts = last_upd->next->start_ts; - upd_select->start_durable_ts = last_upd->next->durable_ts; - upd_select->start_txn = last_upd->next->txnid; + __wt_time_window_set_start(select_tw, last_upd->next); } } @@ -405,17 +424,15 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * time pair. While unusual, it is permitted for a single transaction to insert and then remove * a record. We don't want to generate a warning in that case. 
*/ - if (upd_select->stop_ts < upd_select->start_ts || - (upd_select->stop_ts == upd_select->start_ts && - upd_select->stop_txn < upd_select->start_txn)) { - char ts_string[2][WT_TS_INT_STRING_SIZE]; + if (select_tw->stop_ts < select_tw->start_ts || + (select_tw->stop_ts == select_tw->start_ts && select_tw->stop_txn < select_tw->start_txn)) { __wt_verbose(session, WT_VERB_TIMESTAMP, - "Warning: fixing out-of-order timestamps remove at %s earlier than value at %s", - __wt_timestamp_to_string(upd_select->stop_ts, ts_string[0]), - __wt_timestamp_to_string(upd_select->start_ts, ts_string[1])); - upd_select->start_durable_ts = upd_select->stop_durable_ts; - upd_select->start_ts = upd_select->stop_ts; - upd_select->start_txn = upd_select->stop_txn; + "Warning: fixing out-of-order timestamps remove earlier than value; time window %s", + __wt_time_window_to_string(select_tw, time_string)); + + select_tw->durable_start_ts = select_tw->durable_stop_ts; + select_tw->start_ts = select_tw->stop_ts; + select_tw->start_txn = select_tw->stop_txn; } /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 365e5841347..2a4358c585f 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -594,6 +594,7 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO */ r->update_modify_cbt.ref = ref; r->update_modify_cbt.iface.value_format = btree->value_format; + r->update_modify_cbt.upd_value = &r->update_modify_cbt._upd_value; /* * If we allocated the reconciliation structure and there was an error, clean up. 
If our caller @@ -669,6 +670,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) __wt_buf_free(session, &r->_last); __wt_buf_free(session, &r->update_modify_cbt.iface.value); + __wt_buf_free(session, &r->update_modify_cbt._upd_value.buf); __wt_free(session, r); } @@ -788,17 +790,13 @@ __rec_split_chunk_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK * /* Don't touch the key item memory, that memory is reused. */ chunk->key.size = 0; chunk->entries = 0; - __wt_rec_addr_ts_init(r, &chunk->newest_start_durable_ts, &chunk->oldest_start_ts, - &chunk->oldest_start_txn, &chunk->newest_stop_durable_ts, &chunk->newest_stop_ts, - &chunk->newest_stop_txn, &chunk->prepare); + __wt_rec_addr_ts_init(r, &chunk->ta); chunk->min_recno = WT_RECNO_OOB; /* Don't touch the key item memory, that memory is reused. */ chunk->min_key.size = 0; chunk->min_entries = 0; - __wt_rec_addr_ts_init(r, &chunk->min_newest_start_durable_ts, &chunk->min_oldest_start_ts, - &chunk->min_oldest_start_txn, &chunk->min_newest_stop_durable_ts, &chunk->min_newest_stop_ts, - &chunk->min_newest_stop_txn, &chunk->prepare); + __wt_rec_addr_ts_init(r, &chunk->ta_min); chunk->min_offset = 0; /* @@ -1111,7 +1109,7 @@ __wt_rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len, bool * page. 
*/ if (r->salvage != NULL) - WT_PANIC_RET(session, WT_PANIC, "%s page too large, attempted split during salvage", + WT_RET_PANIC(session, WT_PANIC, "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); /* @@ -1215,12 +1213,7 @@ __wt_rec_split_crossing_bnd(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t ne r->cur_ptr->min_recno = r->recno; if (S2BT(session)->type == BTREE_ROW) WT_RET(__rec_split_row_promote(session, r, &r->cur_ptr->min_key, r->page->type)); - r->cur_ptr->min_newest_start_durable_ts = r->cur_ptr->newest_start_durable_ts; - r->cur_ptr->min_oldest_start_ts = r->cur_ptr->oldest_start_ts; - r->cur_ptr->min_oldest_start_txn = r->cur_ptr->oldest_start_txn; - r->cur_ptr->min_newest_stop_durable_ts = r->cur_ptr->newest_stop_durable_ts; - r->cur_ptr->min_newest_stop_ts = r->cur_ptr->newest_stop_ts; - r->cur_ptr->min_newest_stop_txn = r->cur_ptr->newest_stop_txn; + __wt_time_aggregate_copy(&r->cur_ptr->ta_min, &r->cur_ptr->ta); /* Assert we're not re-entering this code. */ WT_ASSERT(session, r->cur_ptr->min_offset == 0); @@ -1271,16 +1264,7 @@ __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r) * boundaries and create a single chunk. 
*/ prev_ptr->entries += cur_ptr->entries; - prev_ptr->newest_start_durable_ts = - WT_MAX(prev_ptr->newest_start_durable_ts, cur_ptr->newest_start_durable_ts); - prev_ptr->oldest_start_ts = WT_MIN(prev_ptr->oldest_start_ts, cur_ptr->oldest_start_ts); - prev_ptr->oldest_start_txn = WT_MIN(prev_ptr->oldest_start_txn, cur_ptr->oldest_start_txn); - prev_ptr->newest_stop_durable_ts = - WT_MAX(prev_ptr->newest_stop_durable_ts, cur_ptr->newest_stop_durable_ts); - prev_ptr->newest_stop_ts = WT_MAX(prev_ptr->newest_stop_ts, cur_ptr->newest_stop_ts); - prev_ptr->newest_stop_txn = WT_MAX(prev_ptr->newest_stop_txn, cur_ptr->newest_stop_txn); - if (cur_ptr->prepare) - prev_ptr->prepare = true; + __wt_time_aggregate_merge(&prev_ptr->ta, &cur_ptr->ta); dsk = r->cur_ptr->image.mem; memcpy((uint8_t *)r->prev_ptr->image.mem + prev_ptr->image.size, WT_PAGE_HEADER_BYTE(btree, dsk), cur_ptr->image.size - WT_PAGE_HEADER_BYTE_SIZE(btree)); @@ -1323,25 +1307,11 @@ __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r) cur_ptr->recno = prev_ptr->min_recno; WT_RET( __wt_buf_set(session, &cur_ptr->key, prev_ptr->min_key.data, prev_ptr->min_key.size)); - cur_ptr->newest_start_durable_ts = - WT_MAX(prev_ptr->newest_start_durable_ts, cur_ptr->newest_start_durable_ts); - cur_ptr->oldest_start_ts = WT_MIN(prev_ptr->oldest_start_ts, cur_ptr->oldest_start_ts); - cur_ptr->oldest_start_txn = WT_MIN(prev_ptr->oldest_start_txn, cur_ptr->oldest_start_txn); - cur_ptr->newest_stop_durable_ts = - WT_MAX(prev_ptr->newest_stop_durable_ts, cur_ptr->newest_stop_durable_ts); - cur_ptr->newest_stop_ts = WT_MAX(prev_ptr->newest_stop_ts, cur_ptr->newest_stop_ts); - cur_ptr->newest_stop_txn = WT_MAX(prev_ptr->newest_stop_txn, cur_ptr->newest_stop_txn); - if (prev_ptr->prepare) - cur_ptr->prepare = true; + __wt_time_aggregate_merge(&cur_ptr->ta, &prev_ptr->ta); cur_ptr->image.size += len_to_move; prev_ptr->entries = prev_ptr->min_entries; - prev_ptr->newest_start_durable_ts = 
prev_ptr->min_newest_start_durable_ts; - prev_ptr->oldest_start_ts = prev_ptr->min_oldest_start_ts; - prev_ptr->oldest_start_txn = prev_ptr->min_oldest_start_txn; - prev_ptr->newest_stop_durable_ts = prev_ptr->min_newest_stop_durable_ts; - prev_ptr->newest_stop_ts = prev_ptr->min_newest_stop_ts; - prev_ptr->newest_stop_txn = prev_ptr->min_newest_stop_txn; + __wt_time_aggregate_copy(&prev_ptr->ta, &prev_ptr->ta_min); prev_ptr->image.size -= len_to_move; } @@ -1722,12 +1692,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK *chunk multi = &r->multi[r->multi_next++]; /* Initialize the address (set the addr type for the parent). */ - multi->addr.newest_start_durable_ts = chunk->newest_start_durable_ts; - multi->addr.oldest_start_ts = chunk->oldest_start_ts; - multi->addr.oldest_start_txn = chunk->oldest_start_txn; - multi->addr.newest_stop_durable_ts = chunk->newest_stop_durable_ts; - multi->addr.newest_stop_ts = chunk->newest_stop_ts; - multi->addr.newest_stop_txn = chunk->newest_stop_txn; + __wt_time_aggregate_copy(&multi->addr.ta, &chunk->ta); switch (page->type) { case WT_PAGE_COL_FIX: @@ -2035,12 +2000,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_MULTI *multi; WT_PAGE_MODIFY *mod; WT_REF *ref; + WT_TIME_AGGREGATE ta; uint32_t i; btree = S2BT(session); bm = btree->bm; mod = page->modify; ref = r->ref; + __wt_time_aggregate_init(&ta); /* * This page may have previously been reconciled, and that information is now about to be @@ -2121,8 +2088,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ ref = r->ref; if (__wt_ref_is_root(ref)) { - __wt_checkpoint_tree_reconcile_update( - session, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX); + __wt_checkpoint_tree_reconcile_update(session, &ta); WT_RET(bm->checkpoint(bm, session, NULL, btree->ckpt, false)); } @@ -2162,10 +2128,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, 
WT_PAGE *page) mod->mod_disk_image = r->multi->disk_image; r->multi->disk_image = NULL; } else { - __wt_checkpoint_tree_reconcile_update(session, r->multi->addr.newest_start_durable_ts, - r->multi->addr.oldest_start_ts, r->multi->addr.oldest_start_txn, - r->multi->addr.newest_stop_durable_ts, r->multi->addr.newest_stop_ts, - r->multi->addr.newest_stop_txn); + __wt_checkpoint_tree_reconcile_update(session, &r->multi->addr.ta); WT_RET(__wt_bt_write(session, r->wrapup_checkpoint, NULL, NULL, NULL, true, F_ISSET(r, WT_REC_CHECKPOINT), r->wrapup_checkpoint_compressed)); } @@ -2267,7 +2230,7 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r) for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) if (multi->supd != NULL) { - WT_ERR(__wt_hs_insert_updates(session->hs_cursor, S2BT(session), r->page, multi)); + WT_ERR(__wt_hs_insert_updates(session, r->page, multi)); r->cache_write_hs = true; if (!multi->supd_restore) { __wt_free(session, multi->supd); @@ -2286,9 +2249,7 @@ err: */ int __wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv, uint8_t type, - wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn, - wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare, - uint64_t rle) + WT_TIME_WINDOW *tw, uint64_t rle) { WT_BM *bm; WT_BTREE *btree; @@ -2343,8 +2304,7 @@ __wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *k WT_ERR(__wt_buf_set(session, &kv->buf, addr, size)); /* Build the cell and return. 
*/ - kv->cell_len = __wt_cell_pack_ovfl(session, &kv->cell, type, start_durable_ts, start_ts, - start_txn, stop_durable_ts, stop_ts, stop_txn, prepare, rle, kv->buf.size); + kv->cell_len = __wt_cell_pack_ovfl(session, &kv->cell, type, tw, rle, kv->buf.size); kv->len = kv->cell_len + kv->buf.size; err: diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c index 16365ba94c5..96a7537e980 100644 --- a/src/third_party/wiredtiger/src/schema/schema_util.c +++ b/src/third_party/wiredtiger/src/schema/schema_util.c @@ -26,7 +26,7 @@ __schema_backup_check_int(WT_SESSION_IMPL *session, const char *name) * There is a window at the end of a backup where the list has been cleared from the connection * but the flag is still set. It is safe to drop at that point. */ - if (!conn->hot_backup || (backup_list = conn->hot_backup_list) == NULL) { + if (conn->hot_backup_start == 0 || (backup_list = conn->hot_backup_list) == NULL) { return (0); } for (i = 0; backup_list[i] != NULL; ++i) { @@ -50,7 +50,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) WT_DECL_RET; conn = S2C(session); - if (!conn->hot_backup) + if (conn->hot_backup_start == 0) return (0); WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session, ret = __schema_backup_check_int(session, name)); return (ret); diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index c843d592160..459a0757032 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -1570,20 +1570,12 @@ __session_verify(WT_SESSION *wt_session, const char *uri, const char *config) SESSION_API_CALL(session, verify, config, cfg); WT_ERR(__wt_inmem_unsupported_op(session, NULL)); - /* - * Even if we're not verifying the history store, we need to be able to iterate over the history - * store content for another table. 
In order to do this, we must ignore tombstones in the - * history store since every history store record is succeeded with a tombstone. - */ - F_SET(session, WT_SESSION_IGNORE_HS_TOMBSTONE); - /* Block out checkpoints to avoid spurious EBUSY errors. */ WT_WITH_CHECKPOINT_LOCK( session, WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, uri, __wt_verify, NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY))); WT_ERR(ret); err: - F_CLR(session, WT_SESSION_IGNORE_HS_TOMBSTONE); if (ret != 0) WT_STAT_CONN_INCR(session, session_table_verify_fail); else @@ -1658,7 +1650,7 @@ err: F_CLR(session, WT_SESSION_RESOLVING_TXN); } else if (F_ISSET(txn, WT_TXN_RUNNING)) { if (F_ISSET(txn, WT_TXN_PREPARE)) - WT_PANIC_RET(session, ret, "failed to commit prepared transaction, failing the system"); + WT_RET_PANIC(session, ret, "failed to commit prepared transaction, failing the system"); WT_TRET(__wt_session_reset_cursors(session, false)); F_SET(session, WT_SESSION_RESOLVING_TXN); diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c index 18351ad4062..b8e43ae5a7b 100644 --- a/src/third_party/wiredtiger/src/support/err.c +++ b/src/third_party/wiredtiger/src/support/err.c @@ -332,6 +332,73 @@ __wt_errx_func(WT_SESSION_IMPL *session, const char *func, int line, const char } /* + * __wt_panic_func -- + * A standard error message when we panic. + */ +int +__wt_panic_func(WT_SESSION_IMPL *session, int error, const char *func, int line, const char *fmt, + ...) WT_GCC_FUNC_ATTRIBUTE((cold)) WT_GCC_FUNC_ATTRIBUTE((format(printf, 5, 6))) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + va_list ap; + + /* + * Ignore error returns from underlying event handlers, we already have an error value to + * return. + */ + va_start(ap, fmt); + WT_IGNORE_RET(__eventv(session, false, error, func, line, fmt, ap)); + va_end(ap); + + /* + * !!! + * This function MUST handle a NULL WT_SESSION_IMPL handle. 
+ * + * If the connection has already panicked, just return the error. + */ + if (session != NULL && F_ISSET(S2C(session), WT_CONN_PANIC)) + return (WT_PANIC); + + /* + * Call the error callback function before setting the connection's panic flag, so applications + * can trace the failing thread before being flooded with panic returns from API calls. Using + * the variable-arguments list from the current call even thought the format doesn't need it as + * I'm not confident of underlying support for a NULL. + */ + va_start(ap, fmt); + WT_IGNORE_RET( + __eventv(session, false, WT_PANIC, func, line, "the process must exit and restart", ap)); + va_end(ap); + +/* + * Confusing #ifdef structure because gcc/clang knows the abort call won't return, and Visual Studio + * doesn't. + */ +#if defined(HAVE_DIAGNOSTIC) + __wt_abort(session); /* Drop core if testing. */ + /* NOTREACHED */ +#endif +#if !defined(HAVE_DIAGNOSTIC) || defined(_WIN32) + /* + * !!! + * This function MUST handle a NULL WT_SESSION_IMPL handle. + * + * Panic the connection; + */ + if (session != NULL) + F_SET(S2C(session), WT_CONN_PANIC); + + /* + * !!! + * Chaos reigns within. + * Reflect, repent, and reboot. + * Order shall return. + */ + return (WT_PANIC); +#endif +} + +/* * __wt_set_return_func -- * Conditionally log the source of an error code and return the error. */ @@ -467,67 +534,6 @@ __wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v) } /* - * __wt_panic -- - * A standard error message when we panic. - */ -int -__wt_panic(WT_SESSION_IMPL *session) WT_GCC_FUNC_ATTRIBUTE((cold)) - WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) -{ - /* - * !!! - * This function MUST handle a NULL WT_SESSION_IMPL handle. - * - * If the connection has already panicked, just return the error. 
- */ - if (session != NULL && F_ISSET(S2C(session), WT_CONN_PANIC)) - return (WT_PANIC); - - /* - * Call the error callback function before setting the connection's panic flag, so applications - * can trace the failing thread before being flooded with panic returns from API calls. - */ - __wt_err(session, WT_PANIC, "the process must exit and restart"); - -/* - * Confusing #ifdef structure because gcc/clang knows the abort call won't return, and Visual Studio - * doesn't. - */ -#if defined(HAVE_DIAGNOSTIC) - __wt_abort(session); /* Drop core if testing. */ - /* NOTREACHED */ -#endif -#if !defined(HAVE_DIAGNOSTIC) || defined(_WIN32) - /* - * !!! - * This function MUST handle a NULL WT_SESSION_IMPL handle. - * - * Panic the connection; - */ - if (session != NULL) - F_SET(S2C(session), WT_CONN_PANIC); - - /* - * Chaos reigns within. Reflect, repent, and reboot. Order shall return. - */ - return (WT_PANIC); -#endif -} - -/* - * __wt_illegal_value_func -- - * A standard error message when we detect an illegal value. - */ -int -__wt_illegal_value_func(WT_SESSION_IMPL *session, uintmax_t v, const char *func, int line) - WT_GCC_FUNC_ATTRIBUTE((cold)) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) -{ - __wt_err_func(session, EINVAL, func, line, "%s: 0x%" PRIxMAX, - "encountered an illegal file format or internal value", v); - return (__wt_panic(session)); -} - -/* * __wt_inmem_unsupported_op -- * Print a standard error message for an operation that's not supported for in-memory * configurations. diff --git a/src/third_party/wiredtiger/src/support/generation.c b/src/third_party/wiredtiger/src/support/generation.c index bd4a6410f52..e3a60621b5c 100644 --- a/src/third_party/wiredtiger/src/support/generation.c +++ b/src/third_party/wiredtiger/src/support/generation.c @@ -136,8 +136,8 @@ __wt_gen_drain(WT_SESSION_IMPL *session, int which, uint64_t generation) /* If we're waiting on ourselves, we're deadlocked. 
*/ if (session == s) { - WT_ASSERT(session, session != s); - WT_IGNORE_RET(__wt_panic(session)); + WT_IGNORE_RET(__wt_panic(session, WT_PANIC, "self-deadlock")); + return; } /* diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c index 45580b7d0cc..336ee99c27c 100644 --- a/src/third_party/wiredtiger/src/support/hazard.c +++ b/src/third_party/wiredtiger/src/support/hazard.c @@ -210,7 +210,7 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_REF *ref) * A serious error, we should always find the hazard pointer. Panic, because using a page we * didn't have pinned down implies corruption. */ - WT_PANIC_RET(session, EINVAL, "session %p: clear hazard pointer: %p: not found", + WT_RET_PANIC(session, EINVAL, "session %p: clear hazard pointer: %p: not found", (void *)session, (void *)ref); } diff --git a/src/third_party/wiredtiger/src/support/modify.c b/src/third_party/wiredtiger/src/support/modify.c index caaaa7abfbb..010ef9a80d1 100644 --- a/src/third_party/wiredtiger/src/support/modify.c +++ b/src/third_party/wiredtiger/src/support/modify.c @@ -81,7 +81,7 @@ __wt_modify_pack(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries, WT_ITEM ** uint8_t *data; int i; - session = (WT_SESSION_IMPL *)cursor->session; + session = CUR2S(cursor); *modifyp = NULL; /* @@ -346,33 +346,18 @@ __modify_apply_no_overlap(WT_SESSION_IMPL *session, WT_ITEM *value, const size_t } /* - * __wt_modify_apply -- - * Apply a single set of WT_MODIFY changes to a cursor buffer. - */ -int -__wt_modify_apply(WT_CURSOR *cursor, const void *modify) -{ - WT_SESSION_IMPL *session; - bool sformat; - - session = (WT_SESSION_IMPL *)cursor->session; - sformat = cursor->value_format[0] == 'S'; - - return (__wt_modify_apply_item(session, &cursor->value, modify, sformat)); -} - -/* * __wt_modify_apply_item -- * Apply a single set of WT_MODIFY changes to a WT_ITEM buffer. 
*/ int -__wt_modify_apply_item(WT_SESSION_IMPL *session, WT_ITEM *value, const void *modify, bool sformat) +__wt_modify_apply_item( + WT_SESSION_IMPL *session, const char *value_format, WT_ITEM *value, const void *modify) { WT_MODIFY mod; size_t datasz, destsz, item_offset, tmp; const size_t *p; int napplied, nentries; - bool overlap; + bool overlap, sformat; /* * Get the number of modify entries and set a second pointer to reference the replacement data. @@ -382,6 +367,13 @@ __wt_modify_apply_item(WT_SESSION_IMPL *session, WT_ITEM *value, const void *mod nentries = (int)tmp; /* + * Modifies can only be applied on a single value field. Make sure we are not applying modifies + * to schema with multiple value fields. + */ + WT_ASSERT(session, value_format[1] == '\0'); + sformat = value_format[0] == 'S'; + + /* * Grow the buffer first. This function is often called using a cursor buffer referencing * on-page memory and it's easy to overwrite a page. A side-effect of growing the buffer is to * ensure the buffer's value is in buffer-local memory. @@ -437,10 +429,11 @@ __wt_modify_apply_api(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_DECL_RET; WT_ERR(__wt_modify_pack(cursor, entries, nentries, &modify)); - WT_ERR(__wt_modify_apply(cursor, modify->data)); + WT_ERR( + __wt_modify_apply_item(CUR2S(cursor), cursor->value_format, &cursor->value, modify->data)); err: - __wt_scr_free((WT_SESSION_IMPL *)cursor->session, &modify); + __wt_scr_free(CUR2S(cursor), &modify); return (ret); } @@ -523,3 +516,73 @@ __wt_modify_vector_free(WT_MODIFY_VECTOR *modifies) __wt_free(modifies->session, modifies->listp); __wt_modify_vector_init(modifies->session, modifies); } + +/* + * __wt_modify_reconstruct_from_upd_list -- + * Takes an in-memory modify and populates an update value with the reconstructed full value. 
+ */ +int +__wt_modify_reconstruct_from_upd_list( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, WT_UPDATE_VALUE *upd_value) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + WT_MODIFY_VECTOR modifies; + WT_TIME_WINDOW tw; + + WT_ASSERT(session, upd->type == WT_UPDATE_MODIFY); + + cursor = &cbt->iface; + + /* While we have a pointer to our original modify, grab this information. */ + upd_value->start_ts = upd->start_ts; + upd_value->txnid = upd->txnid; + upd_value->prepare_state = upd->prepare_state; + + /* Construct full update */ + __wt_modify_vector_init(session, &modifies); + /* Find a complete update. */ + for (; upd != NULL; upd = upd->next) { + if (upd->txnid == WT_TXN_ABORTED) + continue; + + if (WT_UPDATE_DATA_VALUE(upd)) + break; + + if (upd->type == WT_UPDATE_MODIFY) + WT_ERR(__wt_modify_vector_push(&modifies, upd)); + } + /* + * If there's no full update, the base item is the on-page item. If the update is a tombstone, + * the base item is an empty item. + */ + if (upd == NULL) { + /* + * Callers of this function set the cursor slot to an impossible value to check we don't try + * and return on-page values when the update list should have been sufficient (which + * happens, for example, if an update list was truncated, deleting some standard update + * required by a previous modify update). Assert the case. + */ + WT_ASSERT(session, cbt->slot != UINT32_MAX); + + WT_ERR(__wt_value_return_buf(cbt, cbt->ref, &upd_value->buf, &tw)); + /* + * Applying modifies on top of a tombstone is invalid. So if we're using the onpage value, + * the stop time pair should be unset. + */ + WT_ASSERT(session, tw.stop_txn == WT_TXN_MAX && tw.stop_ts == WT_TS_MAX); + } else { + /* The base update must not be a tombstone. */ + WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD); + WT_ERR(__wt_buf_set(session, &upd_value->buf, upd->data, upd->size)); + } + /* Once we have a base item, roll forward through any visible modify updates. 
*/ + while (modifies.size > 0) { + __wt_modify_vector_pop(&modifies, &upd); + WT_ERR(__wt_modify_apply_item(session, cursor->value_format, &upd_value->buf, upd->data)); + } + upd_value->type = WT_UPDATE_STANDARD; +err: + __wt_modify_vector_free(&modifies); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 6bd6b74de24..751a86deb53 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -89,7 +89,8 @@ static const char *const __stats_dsrc_desc[] = { "reconciliation: maximum blocks required for a page", "reconciliation: overflow values written", "reconciliation: page checksum matches", "reconciliation: page reconciliation calls", "reconciliation: page reconciliation calls for eviction", "reconciliation: pages deleted", - "session: object compaction", "transaction: update conflicts", + "reconciliation: prepared values written", "session: object compaction", + "transaction: update conflicts", }; int @@ -274,6 +275,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->rec_pages = 0; stats->rec_pages_eviction = 0; stats->rec_page_delete = 0; + stats->rec_prepare_value = 0; stats->session_compact = 0; stats->txn_update_conflict = 0; } @@ -447,6 +449,7 @@ __wt_stat_dsrc_aggregate_single(WT_DSRC_STATS *from, WT_DSRC_STATS *to) to->rec_pages += from->rec_pages; to->rec_pages_eviction += from->rec_pages_eviction; to->rec_page_delete += from->rec_page_delete; + to->rec_prepare_value += from->rec_prepare_value; to->session_compact += from->session_compact; to->txn_update_conflict += from->txn_update_conflict; } @@ -617,6 +620,7 @@ __wt_stat_dsrc_aggregate(WT_DSRC_STATS **from, WT_DSRC_STATS *to) to->rec_pages += WT_STAT_READ(from, rec_pages); to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction); to->rec_page_delete += WT_STAT_READ(from, rec_page_delete); + to->rec_prepare_value += WT_STAT_READ(from, rec_prepare_value); 
to->session_compact += WT_STAT_READ(from, session_compact); to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict); } diff --git a/src/third_party/wiredtiger/src/support/thread_group.c b/src/third_party/wiredtiger/src/support/thread_group.c index aca2a3d9aa7..d968ee8c3b8 100644 --- a/src/third_party/wiredtiger/src/support/thread_group.c +++ b/src/third_party/wiredtiger/src/support/thread_group.c @@ -39,7 +39,7 @@ err: ret = thread->stop_func(session, thread); if (ret != 0 && F_ISSET(thread, WT_THREAD_PANIC_FAIL)) - WT_PANIC_MSG(session, ret, "Unrecoverable utility thread error"); + WT_IGNORE_RET(__wt_panic(session, ret, "Unrecoverable utility thread error")); /* * The three cases when threads are expected to stop are: @@ -232,7 +232,7 @@ err: group->min = new_min; WT_TRET(__wt_thread_group_destroy(session, group)); - WT_PANIC_RET(session, ret, "Error while resizing thread group"); + WT_RET_PANIC(session, ret, "Error while resizing thread group"); } /* diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 58607c7cf2c..0fdf022bd51 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -598,6 +598,165 @@ __wt_txn_release(WT_SESSION_IMPL *session) } /* + * __txn_fixup_prepared_update -- + * Fix/restore the history store update of a prepared datastore update based on transaction + * status. 
+ */ +static int +__txn_fixup_prepared_update(WT_SESSION_IMPL *session, WT_TXN_OP *op, WT_CURSOR *cursor, bool commit) +{ + WT_CURSOR *hs_cursor; + WT_CURSOR_BTREE *cbt, *hs_cbt; + WT_DECL_ITEM(hs_key); + WT_DECL_ITEM(hs_value); + WT_DECL_RET; + WT_TXN *txn; + WT_UPDATE *hs_upd, *tombstone, *upd; + wt_timestamp_t durable_ts, hs_start_ts, hs_stop_ts; + uint64_t hs_counter, type_full; + uint32_t hs_btree_id, session_flags, txn_flags; + int cmp; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + bool is_owner; + + hs_cursor = NULL; + cbt = (WT_CURSOR_BTREE *)cursor; + txn = session->txn; + hs_upd = tombstone = upd = NULL; + durable_ts = hs_start_ts = WT_TS_NONE; + hs_btree_id = S2BT(session)->id; + session_flags = 0; + is_owner = false; + + /* + * Transaction error and prepare are cleared temporarily as cursor functions are not allowed + * after an error or a prepared transaction. + */ + txn_flags = FLD_MASK(txn->flags, WT_TXN_ERROR | WT_TXN_PREPARE); + F_CLR(txn, txn_flags); + + /* Allocate buffers for the data store and history store key. */ + WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); + WT_ERR(__wt_scr_alloc(session, 0, &hs_value)); + + /* Open a history store table cursor. */ + WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner)); + hs_cursor = session->hs_cursor; + hs_cbt = (WT_CURSOR_BTREE *)hs_cursor; + + /* + * Scan the history store for the given btree and key with maximum start and stop time pair to + * let the search point to the last version of the key and start traversing backwards to find + * out the satisfying record according the given timestamp. Any satisfying history store record + * is moved into data store and removed from history store. If none of the history store records + * satisfy the given timestamp, the key is removed from data store. 
+ */ + WT_ERR(__wt_hs_cursor_position(session, hs_cursor, hs_btree_id, &op->u.op_row.key, WT_TS_MAX)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter)); + + /* Stop before crossing over to the next btree */ + if (hs_btree_id != S2BT(session)->id) + goto err; + + /* + * Keys are sorted in an order, skip the ones before the desired key, and bail out if we have + * crossed over the desired key and not found the record we are looking for. + */ + WT_ERR(__wt_compare(session, NULL, hs_key, &op->u.op_row.key, &cmp)); + if (cmp != 0) + goto err; + + /* + * As part of the history store search, we never get an exact match based on our search criteria + * as we always search for a maximum record for that key. Make sure that we set the comparison + * result as an exact match to remove this key as part of rollback to stable. In case if we + * don't mark the comparison result as same, later the __wt_row_modify function will not + * properly remove the update from history store. + */ + hs_cbt->compare = 0; + + /* Get current value and convert to full update if it is a modify. */ + WT_ERR(hs_cursor->get_value(hs_cursor, &hs_stop_ts, &durable_ts, &type_full, hs_value)); + + /* + * If we found a history value that satisfied the given timestamp, add it to the update list. + * Otherwise remove the key by adding a tombstone. + */ + if (commit) { + /* + * It is possible that the update in the history store may already been removed by an older + * transaction but retained it due to an history window. 
+ */ + if (hs_stop_ts != WT_TS_MAX) + goto err; + + WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); + hs_upd->durable_ts = hs_upd->start_ts = txn->durable_timestamp; + hs_upd->txnid = txn->id; + } else { + WT_ERR(__wt_upd_alloc(session, hs_value, WT_UPDATE_STANDARD, &upd, NULL)); + + upd->txnid = WT_TXN_NONE; + upd->durable_ts = durable_ts; + upd->start_ts = hs_start_ts; + __wt_verbose(session, WT_VERB_TRANSACTION, + "update restored from history store (txnid: %" PRIu64 ", start_ts: %s, durable_ts: %s", + upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]), + __wt_timestamp_to_string(upd->durable_ts, ts_string[1])); + + /* + * Set the flag to indicate that this update has been restored from history store for the + * rollback of a prepared transaction. + */ + F_SET(upd, WT_UPDATE_RESTORED_FOR_ROLLBACK); + + /* + * There should be only one aborted prepared update in the list, append it after the new + * update. + */ + if (cbt->ins != NULL) + upd->next = cbt->ins->upd; + else if (cbt->ref->page->modify != NULL && cbt->ref->page->modify->mod_row_update != NULL) + upd->next = cbt->ref->page->modify->mod_row_update[cbt->slot]; + WT_ASSERT(session, + upd->next != NULL && upd->next->next == NULL && upd->next->txnid == WT_TXN_ABORTED); + + /* Append a tombstone if the stop timestamp exists. */ + if (hs_stop_ts != WT_TS_MAX) { + WT_ERR(__wt_upd_alloc(session, NULL, WT_UPDATE_TOMBSTONE, &tombstone, NULL)); + tombstone->durable_ts = hs_stop_ts; + tombstone->start_ts = hs_stop_ts; + tombstone->txnid = WT_TXN_NONE; + tombstone->next = upd; + } else + tombstone = upd; + + WT_WITH_BTREE(session, cbt->btree, + ret = __wt_row_modify(cbt, &cbt->iface.key, NULL, tombstone, WT_UPDATE_INVALID, true)); + WT_ERR(ret); + tombstone = NULL; + upd = NULL; + + /* Remove the restored update from history store. 
*/ + WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL)); + } + + WT_ERR(__wt_hs_modify(hs_cbt, hs_upd)); + hs_upd = NULL; + +err: + __wt_scr_free(session, &hs_key); + __wt_scr_free(session, &hs_value); + __wt_free(session, hs_upd); + __wt_free(session, upd); + __wt_free(session, tombstone); + WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner)); + F_SET(txn, txn_flags); + + return (ret); +} + +/* * __txn_search_prepared_op -- * Search for an operation's prepared update. */ @@ -645,12 +804,12 @@ __txn_search_prepared_op( case WT_TXN_OP_REF_DELETE: case WT_TXN_OP_TRUNCATE_COL: case WT_TXN_OP_TRUNCATE_ROW: - WT_RET_ASSERT(session, false, WT_PANIC, "invalid prepared operation update type"); + WT_RET_PANIC_ASSERT(session, false, WT_PANIC, "invalid prepared operation update type"); break; } F_CLR(txn, txn_flags); - WT_WITH_BTREE(session, op->btree, ret = __wt_btcur_search_uncommitted(cursor, updp)); + WT_WITH_BTREE(session, op->btree, ret = __wt_btcur_search_prepared(cursor, updp)); F_SET(txn, txn_flags); WT_RET(ret); WT_RET_ASSERT(session, *updp != NULL, WT_NOTFOUND, @@ -668,8 +827,10 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, { WT_TXN *txn; WT_UPDATE *upd; + bool resolved; txn = session->txn; + resolved = false; WT_RET(__txn_search_prepared_op(session, op, cursorp, &upd)); @@ -686,9 +847,14 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, if (!commit) { upd->txnid = WT_TXN_ABORTED; + resolved = true; continue; } + /* Ignore the already resolved updates. */ + if (upd->prepare_state == WT_PREPARE_RESOLVED) + continue; + /* * Newer updates are inserted at head of update chain, and transaction operations are added * at the tail of the transaction modify chain. @@ -714,8 +880,19 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, * Resolve the prepared update to be committed update. 
*/ __txn_resolve_prepared_update(session, upd); + resolved = true; } + /* + * Fix the history store contents if they exist, when there are no more updates in the update + * list. Only in eviction, it is possible to write an unfinished history store update when the + * prepared updates are written to the data store. When the page is read back into memory, there + * will be only one uncommitted prepared update. There can be a false positive of fixing history + * store when handling prepared inserts, but it doesn't cost much. + */ + if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && resolved && upd == NULL) + WT_RET_NOTFOUND_OK(__txn_fixup_prepared_update(session, op, *cursorp, commit)); + return (0); } @@ -1144,7 +1321,7 @@ err: * a prepared transaction. */ if (prepare) - WT_PANIC_RET(session, ret, "failed to commit prepared transaction, failing the system"); + WT_RET_PANIC(session, ret, "failed to commit prepared transaction, failing the system"); WT_TRET(__wt_txn_rollback(session, cfg)); return (ret); @@ -1202,6 +1379,15 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) if (WT_IS_METADATA(op->btree->dhandle)) continue; + /* + * Logged table updates should never be prepared. As these updates are immediately durable, + * it is not possible to roll them back if the prepared transaction is rolled back. 
+ */ + if (!F_ISSET(op->btree, WT_BTREE_NO_LOGGING) && + (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) || + F_ISSET(S2C(session), WT_CONN_IN_MEMORY))) + WT_RET_MSG(session, EINVAL, "transaction prepare is not supported with logged tables"); + switch (op->type) { case WT_TXN_OP_NONE: break; diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index af97d01a0fb..577281a2a47 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -745,8 +745,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN_GLOBAL *txn_global; WT_TXN_ISOLATION saved_isolation; wt_timestamp_t ckpt_tmp_ts; + uint64_t finish_secs, hs_ckpt_duration_usecs, time_start_hs, time_stop_hs; uint64_t fsync_duration_usecs, generation, time_start_fsync, time_stop_fsync; - uint64_t time_start_hs, time_stop_hs, hs_ckpt_duration_usecs; u_int i; bool can_skip, failed, full, idle, logging, tracking, use_timestamp; void *saved_meta_next; @@ -985,6 +985,16 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) conn->txn_global.last_ckpt_timestamp = conn->txn_global.recovery_timestamp; } else conn->txn_global.last_ckpt_timestamp = WT_TS_NONE; + + /* + * Save clock value marking end of checkpoint processing. If a hot backup starts before the + * next checkpoint, we will need to keep all checkpoints up to this clock value until the + * backup completes. + */ + __wt_seconds(session, &finish_secs); + /* Be defensive: time is only monotonic per session */ + if (finish_secs > conn->ckpt_finish_secs) + conn->ckpt_finish_secs = finish_secs; } err: @@ -1146,7 +1156,6 @@ static void __drop(WT_CKPT *ckptbase, const char *name, size_t len) { WT_CKPT *ckpt; - u_int max_ckpt_drop; /* * If we're dropping internal checkpoints, match to the '.' separating the checkpoint name from @@ -1155,20 +1164,9 @@ __drop(WT_CKPT *ckptbase, const char *name, size_t len) * it's one we want to drop. 
*/ if (strncmp(WT_CHECKPOINT, name, len) == 0) { - /* - * Currently, hot backup cursors block checkpoint drop, which means releasing a hot backup - * cursor can result in immediately attempting to drop lots of checkpoints, which involves a - * fair amount of work while holding locks. Limit the number of standard checkpoints dropped - * per checkpoint. - */ - max_ckpt_drop = 0; WT_CKPT_FOREACH (ckptbase, ckpt) - if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { + if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) F_SET(ckpt, WT_CKPT_DELETE); -#define WT_MAX_CHECKPOINT_DROP 4 - if (++max_ckpt_drop >= WT_MAX_CHECKPOINT_DROP) - break; - } } else WT_CKPT_FOREACH (ckptbase, ckpt) if (WT_STRING_MATCH(ckpt->name, name, len)) @@ -1248,30 +1246,44 @@ __checkpoint_lock_dirty_tree_int(WT_SESSION_IMPL *session, bool is_checkpoint, b { WT_CONNECTION_IMPL *conn; WT_DECL_RET; + u_int max_ckpt_drop; + bool is_wt_ckpt; WT_UNUSED(is_checkpoint); conn = S2C(session); - /* - * We can't delete checkpoints if a backup cursor is open. WiredTiger checkpoints are uniquely - * named and it's OK to have multiple of them in the system: clear the delete flag for them, and - * otherwise fail. Hold the lock until we're done (blocking hot backups from starting), we don't - * want to race with a future hot backup. - */ - if (conn->hot_backup) - WT_CKPT_FOREACH (ckptbase, ckpt) { - if (!F_ISSET(ckpt, WT_CKPT_DELETE)) - continue; - if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { + /* Check that it is OK to remove all the checkpoints marked for deletion. */ + max_ckpt_drop = 0; + WT_CKPT_FOREACH (ckptbase, ckpt) { + if (!F_ISSET(ckpt, WT_CKPT_DELETE)) + continue; + is_wt_ckpt = WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT); + + /* + * If there is a hot backup, don't delete any WiredTiger checkpoint that could possibly have + * been created before the backup started. Fail if trying to delete any other named + * checkpoint. 
+ */ + if (conn->hot_backup_start != 0 && ckpt->sec <= conn->hot_backup_start) { + if (is_wt_ckpt) { F_CLR(ckpt, WT_CKPT_DELETE); continue; } WT_RET_MSG(session, EBUSY, - "checkpoint %s blocked by hot backup: it would" + "checkpoint %s blocked by hot backup: it would " "delete an existing checkpoint, and checkpoints " "cannot be deleted during a hot backup", ckpt->name); } + /* + * Dropping checkpoints involves a fair amount of work while holding locks. Limit the number + * of WiredTiger checkpoints dropped per checkpoint. + */ + if (is_wt_ckpt) +#define WT_MAX_CHECKPOINT_DROP 4 + if (++max_ckpt_drop >= WT_MAX_CHECKPOINT_DROP) + F_CLR(ckpt, WT_CKPT_DELETE); + } /* * Mark old checkpoints that are being deleted and figure out which trees we can skip in this @@ -1291,6 +1303,8 @@ __checkpoint_lock_dirty_tree_int(WT_SESSION_IMPL *session, bool is_checkpoint, b WT_CKPT_FOREACH (ckptbase, ckpt) { if (!F_ISSET(ckpt, WT_CKPT_DELETE)) continue; + WT_ASSERT(session, !WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT) || + conn->hot_backup_start == 0 || ckpt->sec > conn->hot_backup_start); /* * We can't delete checkpoints referenced by a cursor. WiredTiger checkpoints are * uniquely named and it's OK to have multiple in the system: clear the delete flag for @@ -1496,9 +1510,7 @@ __checkpoint_mark_skip(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force) * Update a checkpoint based on reconciliation results. 
*/ void -__wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, wt_timestamp_t start_durable_ts, - wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts, - wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn) +__wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta) { WT_BTREE *btree; WT_CKPT *ckpt, *ckptbase; @@ -1514,12 +1526,7 @@ __wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, wt_timestamp_t s WT_CKPT_FOREACH (ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) { ckpt->write_gen = btree->write_gen; - ckpt->start_durable_ts = start_durable_ts; - ckpt->oldest_start_ts = oldest_start_ts; - ckpt->oldest_start_txn = oldest_start_txn; - ckpt->stop_durable_ts = stop_durable_ts; - ckpt->newest_stop_ts = newest_stop_ts; - ckpt->newest_stop_txn = newest_stop_txn; + __wt_time_aggregate_copy(&ckpt->ta, ta); } } @@ -1536,6 +1543,7 @@ __checkpoint_tree(WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[ WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_LSN ckptlsn; + WT_TIME_AGGREGATE ta; bool fake_ckpt, resolve_bm; WT_UNUSED(cfg); @@ -1545,6 +1553,7 @@ __checkpoint_tree(WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[ conn = S2C(session); dhandle = session->dhandle; fake_ckpt = resolve_bm = false; + __wt_time_aggregate_init(&ta); /* * Set the checkpoint LSN to the maximum LSN so that if logging is disabled, recovery will never @@ -1565,8 +1574,7 @@ __checkpoint_tree(WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[ * tears. 
*/ if (is_checkpoint && btree->original) { - __wt_checkpoint_tree_reconcile_update( - session, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX); + __wt_checkpoint_tree_reconcile_update(session, &ta); fake_ckpt = true; goto fake; diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index b5f5dab0077..90aa7ccae0a 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -35,7 +35,7 @@ __txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) * the original page. */ if (cbt->ins == NULL) { - session = (WT_SESSION_IMPL *)cbt->iface.session; + session = CUR2S(cbt); page = cbt->ref->page; WT_ASSERT(session, cbt->slot < page->entries); rip = &page->pg_row[cbt->slot]; @@ -554,8 +554,9 @@ __wt_txn_checkpoint_log(WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_ * connection close, only during a full checkpoint. A clean close may not update any * metadata LSN and we do not want to archive in that case. */ - if (!conn->hot_backup && (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) || - FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) && + if (conn->hot_backup_start == 0 && + (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) || + FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) && txn->full_ckpt) __wt_log_ckpt(session, ckpt_lsn); diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index ed493f7765f..56a1f238c05 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -144,7 +144,8 @@ __txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t * * Build/insert a complete value during recovery rather than using cursor modify to * create a partial update (for no particular reason than simplicity). 
*/ - WT_ERR(__wt_modify_apply(cursor, value.data)); + WT_ERR(__wt_modify_apply_item( + CUR2S(cursor), cursor->value_format, &cursor->value, value.data)); WT_ERR(cursor->insert(cursor)); } break; @@ -205,7 +206,8 @@ __txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t * * Build/insert a complete value during recovery rather than using cursor modify to * create a partial update (for no particular reason than simplicity). */ - WT_ERR(__wt_modify_apply(cursor, value.data)); + WT_ERR(__wt_modify_apply_item( + CUR2S(cursor), cursor->value_format, &cursor->value, value.data)); WT_ERR(cursor->insert(cursor)); } break; @@ -424,7 +426,7 @@ __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) } if (r->files[fileid].uri != NULL) - WT_PANIC_RET(r->session, WT_PANIC, + WT_RET_PANIC(r->session, WT_PANIC, "metadata corruption: files %s and %s have the same " "file ID %u", uri, r->files[fileid].uri, fileid); @@ -514,13 +516,14 @@ __recovery_file_scan(WT_RECOVERY *r) * Run recovery. */ int -__wt_txn_recover(WT_SESSION_IMPL *session) +__wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CONNECTION_IMPL *conn; WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; WT_RECOVERY_FILE *metafile; + WT_SESSION *wt_session; char *config; char ts_string[2][WT_TS_INT_STRING_SIZE]; bool do_checkpoint, eviction_started, hs_exists, needs_rec, was_backup; @@ -548,18 +551,6 @@ __wt_txn_recover(WT_SESSION_IMPL *session) metafile->c = metac; /* - * We should check whether the history store file exists or not. or not. If it does not, then we - * should not apply rollback to stable to each table. This might happen if we're upgrading from - * an older version. - */ - metac->set_key(metac, WT_HS_URI); - WT_ERR_NOTFOUND_OK(metac->search(metac), true); - if (ret == WT_NOTFOUND) - hs_exists = false; - /* Unpin the page from cache. 
*/ - WT_ERR(metac->reset(metac)); - - /* * If no log was found (including if logging is disabled), or if the last checkpoint was done * with logging disabled, recovery should not run. Scan the metadata to figure out the largest * file ID. @@ -633,6 +624,53 @@ __wt_txn_recover(WT_SESSION_IMPL *session) WT_ERR(ret); } + /* + * We should check whether the history store file exists in the metadata or not. If it does not, + * then we should skip rollback to stable for each table. This might happen if we're upgrading + * from an older version. If it does exist in the metadata we should check that it exists on + * disk to confirm that it wasn't deleted between runs. + * + * This needs to happen after we apply the logs as they may contain the metadata changes which + * include the history store creation. As such the on disk metadata file won't contain the + * history store but will after log application. + */ + metac->set_key(metac, WT_HS_URI); + WT_ERR_NOTFOUND_OK(metac->search(metac), true); + if (ret == WT_NOTFOUND) { + hs_exists = false; + } else { + /* Given the history store exists in the metadata validate whether it exists on disk. */ + WT_ERR(__wt_fs_exist(session, WT_HS_FILE, &hs_exists)); + if (hs_exists) { + /* + * Attempt to configure the history store, this will detect corruption if it fails. + */ + ret = __wt_hs_config(session, cfg); + if (ret != 0) { + if (F_ISSET(conn, WT_CONN_SALVAGE)) { + wt_session = &session->iface; + WT_ERR(wt_session->salvage(wt_session, WT_HS_URI, NULL)); + } else + WT_ERR(ret); + } + } else { + /* + * We're attempting to salvage the database with a missing history store, remove it from + * the metadata and pretend it never existed. As such we won't run rollback to stable + * later. + */ + if (F_ISSET(conn, WT_CONN_SALVAGE)) { + hs_exists = false; + metac->remove(metac); + } else + /* The history store file has likely been deleted, we cannot recover from this. 
*/ + WT_ERR_MSG(session, WT_TRY_SALVAGE, "%s file is corrupted or missing", WT_HS_FILE); + } + } + + /* Unpin the page from cache. */ + WT_ERR(metac->reset(metac)); + /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index b27342c3f93..e2489bc8563 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -26,7 +26,8 @@ __rollback_abort_newer_update(WT_SESSION_IMPL *session, WT_UPDATE *first_upd, if (upd->txnid == WT_TXN_ABORTED) { if (upd == first_upd) first_upd = upd->next; - } else if (rollback_timestamp < upd->durable_ts) { + } else if (rollback_timestamp < upd->durable_ts || + upd->prepare_state == WT_PREPARE_INPROGRESS) { /* * If any updates are aborted, all newer updates better be aborted as well. * @@ -40,9 +41,10 @@ __rollback_abort_newer_update(WT_SESSION_IMPL *session, WT_UPDATE *first_upd, __wt_verbose(session, WT_VERB_RTS, "rollback to stable update aborted with txnid: %" PRIu64 - " durable timestamp: %s and stable timestamp: %s", + " durable timestamp: %s and stable timestamp: %s, prepared: %s", upd->txnid, __wt_timestamp_to_string(upd->durable_ts, ts_string[0]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[1])); + __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), + rollback_timestamp < upd->durable_ts ? 
"false" : "true"); upd->txnid = WT_TXN_ABORTED; WT_STAT_CONN_INCR(session, txn_rts_upd_aborted); @@ -229,7 +231,8 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW WT_ERR(hs_cursor->get_value(hs_cursor, &hs_stop_ts, &durable_ts, &type_full, hs_value)); type = (uint8_t)type_full; if (type == WT_UPDATE_MODIFY) - WT_ERR(__wt_modify_apply_item(session, &full_value, hs_value->data, false)); + WT_ERR(__wt_modify_apply_item( + session, S2BT(session)->value_format, &full_value, hs_value->data)); else { WT_ASSERT(session, type == WT_UPDATE_STANDARD); WT_ERR(__wt_buf_set(session, &full_value, hs_value->data, hs_value->size)); @@ -352,17 +355,20 @@ __rollback_abort_row_ondisk_kv( WT_DECL_RET; WT_ITEM buf; WT_UPDATE *upd; - char ts_string[3][WT_TS_INT_STRING_SIZE]; + char ts_string[4][WT_TS_INT_STRING_SIZE]; + bool prepared; vpack = &_vpack; upd = NULL; __wt_row_leaf_value_cell(session, page, rip, NULL, vpack); - if (vpack->durable_start_ts > rollback_timestamp) { + prepared = F_ISSET(vpack, WT_CELL_UNPACK_PREPARE); + if (vpack->tw.durable_start_ts > rollback_timestamp || + (vpack->tw.durable_stop_ts == WT_TS_NONE && prepared)) { __wt_verbose(session, WT_VERB_RTS, - "on-disk update aborted with start durable timestamp: %s, commit timestamp: %s and " - "stable timestamp: %s", - __wt_timestamp_to_string(vpack->durable_start_ts, ts_string[0]), - __wt_timestamp_to_string(vpack->start_ts, ts_string[1]), + "on-disk update aborted with start durable timestamp: %s, commit timestamp: %s, " + "prepared: %s and stable timestamp: %s", + __wt_timestamp_to_string(vpack->tw.durable_start_ts, ts_string[0]), + __wt_timestamp_to_string(vpack->tw.start_ts, ts_string[1]), prepared ? 
"true" : "false", __wt_timestamp_to_string(rollback_timestamp, ts_string[2])); if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) return (__rollback_row_ondisk_fixup_key(session, page, rip, rollback_timestamp, true)); @@ -374,8 +380,8 @@ __rollback_abort_row_ondisk_kv( WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL)); WT_STAT_CONN_INCR(session, txn_rts_keys_removed); } - } else if (vpack->durable_stop_ts != WT_TS_NONE && - vpack->durable_stop_ts > rollback_timestamp) { + } else if (vpack->tw.durable_stop_ts != WT_TS_NONE && + (vpack->tw.durable_stop_ts > rollback_timestamp || prepared)) { /* * Clear the remove operation from the key by inserting the original on-disk value as a * standard update. @@ -391,14 +397,19 @@ __rollback_abort_row_ondisk_kv( WT_RET(__wt_page_cell_data_ref(session, page, vpack, &buf)); WT_RET(__wt_upd_alloc(session, &buf, WT_UPDATE_STANDARD, &upd, NULL)); - upd->txnid = vpack->start_txn; - upd->durable_ts = vpack->durable_start_ts; - upd->start_ts = vpack->start_ts; + upd->txnid = vpack->tw.start_txn; + upd->durable_ts = vpack->tw.durable_start_ts; + upd->start_ts = vpack->tw.start_ts; WT_STAT_CONN_INCR(session, txn_rts_keys_restored); __wt_verbose(session, WT_VERB_RTS, - "key restored (txnid: %" PRIu64 ", start_ts: %s, durable_ts: %s", upd->txnid, + "key restored with commit timestamp: %s, durable timestamp: %s txnid: %" PRIu64 + "and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64 + ", prepared: %s", __wt_timestamp_to_string(upd->start_ts, ts_string[0]), - __wt_timestamp_to_string(upd->durable_ts, ts_string[1])); + __wt_timestamp_to_string(upd->durable_ts, ts_string[1]), upd->txnid, + __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[2]), + __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[3]), vpack->tw.stop_txn, + prepared ? "true" : "false"); } else /* Stable version according to the timestamp. 
*/ return (0); @@ -517,13 +528,14 @@ __rollback_abort_row_reconciled_page( return (0); if (mod->rec_result == WT_PM_REC_REPLACE && - (mod->mod_replace.newest_start_durable_ts > rollback_timestamp || - mod->mod_replace.newest_stop_durable_ts > rollback_timestamp)) { + (mod->mod_replace.ta.newest_start_durable_ts > rollback_timestamp || + mod->mod_replace.ta.newest_stop_durable_ts > rollback_timestamp || + mod->mod_replace.ta.prepare)) { __wt_verbose(session, WT_VERB_RTS, "reconciled replace block page history store update removal On-disk with start " "durable timestamp: %s, stop durable timestamp: %s and stable timestamp: %s", - __wt_timestamp_to_string(mod->mod_replace.newest_start_durable_ts, ts_string[0]), - __wt_timestamp_to_string(mod->mod_replace.newest_stop_durable_ts, ts_string[1]), + __wt_timestamp_to_string(mod->mod_replace.ta.newest_start_durable_ts, ts_string[0]), + __wt_timestamp_to_string(mod->mod_replace.ta.newest_stop_durable_ts, ts_string[1]), __wt_timestamp_to_string(rollback_timestamp, ts_string[2])); WT_RET(__rollback_abort_row_reconciled_page_internal(session, mod->u1.r.disk_image, @@ -538,14 +550,15 @@ __rollback_abort_row_reconciled_page( } else if (mod->rec_result == WT_PM_REC_MULTIBLOCK) { for (multi = mod->mod_multi, multi_entry = 0; multi_entry < mod->mod_multi_entries; ++multi, ++multi_entry) - if (multi->addr.newest_start_durable_ts > rollback_timestamp || - multi->addr.newest_stop_durable_ts > rollback_timestamp) { + if (multi->addr.ta.newest_start_durable_ts > rollback_timestamp || + multi->addr.ta.newest_stop_durable_ts > rollback_timestamp || + multi->addr.ta.prepare) { __wt_verbose(session, WT_VERB_RTS, - "reconciled multi block page history store update removal On-disk with " + "reconciled multi block page history store update removal on-disk with " "start durable timestamp: %s, stop durable timestamp: %s and stable " "timestamp: %s", - __wt_timestamp_to_string(multi->addr.newest_start_durable_ts, ts_string[0]), - 
__wt_timestamp_to_string(multi->addr.newest_stop_durable_ts, ts_string[1]), + __wt_timestamp_to_string(multi->addr.ta.newest_start_durable_ts, ts_string[0]), + __wt_timestamp_to_string(multi->addr.ta.newest_stop_durable_ts, ts_string[1]), __wt_timestamp_to_string(rollback_timestamp, ts_string[2])); WT_RET(__rollback_abort_row_reconciled_page_internal(session, multi->disk_image, @@ -629,13 +642,13 @@ __rollback_page_needs_abort( uint32_t i; char ts_string[WT_TS_INT_STRING_SIZE]; const char *tag; - bool result; + bool prepared, result; addr = ref->addr; mod = ref->page == NULL ? NULL : ref->page->modify; durable_ts = WT_TS_NONE; tag = "undefined state"; - result = false; + prepared = result = false; /* * The rollback operation should be performed on this page when any one of the following is @@ -647,31 +660,37 @@ __rollback_page_needs_abort( */ if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) { tag = "reconciled replace block"; - durable_ts = - WT_MAX(mod->mod_replace.newest_start_durable_ts, mod->mod_replace.newest_stop_durable_ts); - result = (durable_ts > rollback_timestamp); + durable_ts = WT_MAX( + mod->mod_replace.ta.newest_start_durable_ts, mod->mod_replace.ta.newest_stop_durable_ts); + prepared = mod->mod_replace.ta.prepare; + result = (durable_ts > rollback_timestamp) || prepared; } else if (mod != NULL && mod->rec_result == WT_PM_REC_MULTIBLOCK) { tag = "reconciled multi block"; /* Calculate the max durable timestamp by traversing all multi addresses. 
*/ for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { - durable_ts = WT_MAX(durable_ts, multi->addr.newest_start_durable_ts); - durable_ts = WT_MAX(durable_ts, multi->addr.newest_stop_durable_ts); + durable_ts = WT_MAX(durable_ts, multi->addr.ta.newest_start_durable_ts); + durable_ts = WT_MAX(durable_ts, multi->addr.ta.newest_stop_durable_ts); + if (multi->addr.ta.prepare) + prepared = true; } - result = (durable_ts > rollback_timestamp); + result = (durable_ts > rollback_timestamp) || prepared; } else if (!__wt_off_page(ref->home, addr)) { tag = "on page cell"; /* Check if the page is obsolete using the page disk address. */ __wt_cell_unpack(session, ref->home, (WT_CELL *)addr, &vpack); - durable_ts = WT_MAX(vpack.newest_start_durable_ts, vpack.newest_stop_durable_ts); - result = (durable_ts > rollback_timestamp); + durable_ts = WT_MAX(vpack.ta.newest_start_durable_ts, vpack.ta.newest_stop_durable_ts); + prepared = F_ISSET(&vpack, WT_CELL_UNPACK_PREPARE); + result = (durable_ts > rollback_timestamp) || prepared; } else if (addr != NULL) { tag = "address"; - durable_ts = WT_MAX(addr->newest_start_durable_ts, addr->newest_stop_durable_ts); - result = (durable_ts > rollback_timestamp); + durable_ts = WT_MAX(addr->ta.newest_start_durable_ts, addr->ta.newest_stop_durable_ts); + prepared = addr->ta.prepare; + result = (durable_ts > rollback_timestamp) || prepared; } - __wt_verbose(session, WT_VERB_RTS, "%p: page with %s durable timestamp: %s", (void *)ref, tag, - __wt_timestamp_to_string(durable_ts, ts_string)); + __wt_verbose(session, WT_VERB_RTS, + "%p: page with %s durable timestamp: %s and prepared updates: %s", (void *)ref, tag, + __wt_timestamp_to_string(durable_ts, ts_string), prepared ? "true" : "false"); return (result); } @@ -694,9 +713,9 @@ __rollback_verify_ondisk_page( /* Review updates that belong to keys that are on the disk image. 
*/ WT_ROW_FOREACH (page, rip, i) { __wt_row_leaf_value_cell(session, page, rip, NULL, vpack); - WT_ASSERT(session, vpack->start_ts <= rollback_timestamp); - if (vpack->stop_ts != WT_TS_MAX) - WT_ASSERT(session, vpack->stop_ts <= rollback_timestamp); + WT_ASSERT(session, vpack->tw.durable_start_ts <= rollback_timestamp); + WT_ASSERT(session, vpack->tw.durable_stop_ts == WT_TS_NONE || + vpack->tw.durable_stop_ts <= rollback_timestamp); } } #endif @@ -733,7 +752,8 @@ __rollback_abort_newer_updates( #ifdef HAVE_DIAGNOSTIC if (ref->page == NULL && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) { WT_RET(__wt_page_in(session, ref, 0)); - __rollback_verify_ondisk_page(session, ref->page, rollback_timestamp); + if (ref->page->type == WT_PAGE_ROW_LEAF) + __rollback_verify_ondisk_page(session, ref->page, rollback_timestamp); WT_TRET(__wt_page_release(session, ref, 0)); } #endif @@ -846,7 +866,7 @@ __rollback_to_stable_btree(WT_SESSION_IMPL *session, wt_timestamp_t rollback_tim */ if (__wt_btree_immediately_durable(session)) { if (btree->id >= conn->stable_rollback_maxfile) - WT_PANIC_RET(session, EINVAL, "btree file ID %" PRIu32 " larger than max %" PRIu32, + WT_RET_PANIC(session, EINVAL, "btree file ID %" PRIu32 " larger than max %" PRIu32, btree->id, conn->stable_rollback_maxfile); return (0); } @@ -991,10 +1011,11 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) WT_CURSOR *cursor; WT_DECL_RET; WT_TXN_GLOBAL *txn_global; - wt_timestamp_t max_durable_ts, start_durable_ts, stop_durable_ts, rollback_timestamp; + wt_timestamp_t max_durable_ts, newest_start_durable_ts, newest_stop_durable_ts, + rollback_timestamp; char ts_string[2][WT_TS_INT_STRING_SIZE]; const char *config, *uri; - bool durable_ts_found; + bool durable_ts_found, prepared_updates; txn_global = &S2C(session)->txn_global; @@ -1025,25 +1046,33 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) WT_ERR(cursor->get_value(cursor, &config)); /* Find out the max durable timestamp of the object from 
checkpoint. */ - start_durable_ts = stop_durable_ts = WT_TS_NONE; - durable_ts_found = false; + newest_start_durable_ts = newest_stop_durable_ts = WT_TS_NONE; + durable_ts_found = prepared_updates = false; WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval)); __wt_config_subinit(session, &ckptconf, &cval); for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) { - ret = __wt_config_subgets(session, &cval, "start_durable_ts", &durableval); + ret = __wt_config_subgets(session, &cval, "newest_start_durable_ts", &durableval); if (ret == 0) { - start_durable_ts = WT_MAX(start_durable_ts, (wt_timestamp_t)durableval.val); + newest_start_durable_ts = + WT_MAX(newest_start_durable_ts, (wt_timestamp_t)durableval.val); durable_ts_found = true; } WT_ERR_NOTFOUND_OK(ret, false); - ret = __wt_config_subgets(session, &cval, "stop_durable_ts", &durableval); + ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &durableval); if (ret == 0) { - stop_durable_ts = WT_MAX(stop_durable_ts, (wt_timestamp_t)durableval.val); + newest_stop_durable_ts = + WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)durableval.val); durable_ts_found = true; } WT_ERR_NOTFOUND_OK(ret, false); + ret = __wt_config_subgets(session, &cval, "prepare", &durableval); + if (ret == 0) { + if (durableval.val) + prepared_updates = true; + } + WT_ERR_NOTFOUND_OK(ret, false); } - max_durable_ts = WT_MAX(start_durable_ts, stop_durable_ts); + max_durable_ts = WT_MAX(newest_start_durable_ts, newest_stop_durable_ts); ret = __wt_session_get_dhandle(session, uri, NULL, NULL, 0); /* Ignore performing rollback to stable on files that don't exist. */ if (ret == ENOENT) @@ -1056,12 +1085,14 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session) * 2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp. * 3. There is no durable timestamp in any checkpoint. 
*/ - if (S2BT(session)->modified || max_durable_ts > rollback_timestamp || !durable_ts_found) { + if (S2BT(session)->modified || max_durable_ts > rollback_timestamp || prepared_updates || + !durable_ts_found) { __wt_verbose(session, WT_VERB_RTS, "tree rolled back with durable timestamp: %s, or when tree is modified: %s or " - "when durable time is not found: %s", + "prepared updates: %s or when durable time is not found: %s", __wt_timestamp_to_string(max_durable_ts, ts_string[0]), - S2BT(session)->modified ? "true" : "false", !durable_ts_found ? "true" : "false"); + S2BT(session)->modified ? "true" : "false", prepared_updates ? "true" : "false", + !durable_ts_found ? "true" : "false"); WT_TRET(__rollback_to_stable_btree(session, rollback_timestamp)); } else __wt_verbose(session, WT_VERB_RTS, @@ -1141,9 +1172,9 @@ __wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckp * Rollback to stable should ignore tombstones in the history store since it needs to scan the * entire table sequentially. */ - F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE | WT_SESSION_IGNORE_HS_TOMBSTONE); + F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE); ret = __rollback_to_stable(session, cfg); - F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE | WT_SESSION_IGNORE_HS_TOMBSTONE); + F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE); WT_RET(ret); /* diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index f81e7b54acc..4bd11d5456a 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -9,6 +9,18 @@ #include "wt_internal.h" /* + * __wt_timestamp_to_string -- + * Convert a timestamp to the MongoDB string representation. 
+ */ +char * +__wt_timestamp_to_string(wt_timestamp_t ts, char *ts_string) +{ + WT_IGNORE_RET(__wt_snprintf(ts_string, WT_TS_INT_STRING_SIZE, "(%" PRIu32 ", %" PRIu32 ")", + (uint32_t)((ts >> 32) & 0xffffffff), (uint32_t)(ts & 0xffffffff))); + return (ts_string); +} + +/* * __wt_time_pair_to_string -- * Converts a time pair to a standard string representation. */ @@ -23,15 +35,41 @@ __wt_time_pair_to_string(wt_timestamp_t timestamp, uint64_t txn_id, char *tp_str } /* - * __wt_timestamp_to_string -- - * Convert a timestamp to the MongoDB string representation. + * __wt_time_window_to_string -- + * Converts a time window to a standard string representation. */ char * -__wt_timestamp_to_string(wt_timestamp_t ts, char *ts_string) +__wt_time_window_to_string(WT_TIME_WINDOW *tw, char *tw_string) { - WT_IGNORE_RET(__wt_snprintf(ts_string, WT_TS_INT_STRING_SIZE, "(%" PRIu32 ", %" PRIu32 ")", - (uint32_t)((ts >> 32) & 0xffffffff), (uint32_t)(ts & 0xffffffff))); - return (ts_string); + char ts_string[4][WT_TS_INT_STRING_SIZE]; + + WT_IGNORE_RET(__wt_snprintf(tw_string, WT_TIME_STRING_SIZE, + "start: %s/%s/%" PRIu64 " stop: %s/%s/%" PRIu64 "%s", + __wt_timestamp_to_string(tw->durable_start_ts, ts_string[0]), + __wt_timestamp_to_string(tw->start_ts, ts_string[1]), tw->start_txn, + __wt_timestamp_to_string(tw->durable_stop_ts, ts_string[2]), + __wt_timestamp_to_string(tw->stop_ts, ts_string[3]), tw->stop_txn, + tw->prepare ? ", prepared" : "")); + return (tw_string); +} + +/* + * __wt_time_aggregate_to_string -- + * Converts a time aggregate to a standard string representation. 
+ */ +char * +__wt_time_aggregate_to_string(WT_TIME_AGGREGATE *ta, char *ta_string) +{ + char ts_string[4][WT_TS_INT_STRING_SIZE]; + + WT_IGNORE_RET(__wt_snprintf(ta_string, WT_TIME_STRING_SIZE, + "newest durable: %s/%s oldest start: %s/%" PRIu64 " newest stop %s/%" PRIu64 "%s", + __wt_timestamp_to_string(ta->newest_start_durable_ts, ts_string[0]), + __wt_timestamp_to_string(ta->newest_stop_durable_ts, ts_string[1]), + __wt_timestamp_to_string(ta->oldest_start_ts, ts_string[2]), ta->oldest_start_txn, + __wt_timestamp_to_string(ta->newest_stop_ts, ts_string[3]), ta->newest_stop_txn, + ta->prepare ? ", prepared" : "")); + return (ta_string); } /* @@ -700,8 +738,7 @@ __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t commit_ts __wt_timestamp_to_string(txn->first_commit_timestamp, ts_string[1])); /* - * FIXME: - * WT-4779 disabled to buy time to understand a test failure. + * FIXME-WT-4780: Disabled to buy time to understand a test failure. * WT_RET(__txn_assert_after_reads( * session, "commit", commit_ts, NULL)); */ diff --git a/src/third_party/wiredtiger/src/utilities/util_dump.c b/src/third_party/wiredtiger/src/utilities/util_dump.c index 47b6f99bbe9..9ea4e10e7aa 100755 --- a/src/third_party/wiredtiger/src/utilities/util_dump.c +++ b/src/third_party/wiredtiger/src/utilities/util_dump.c @@ -155,7 +155,7 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) * case, we're specifically interested in what is visible at a given read timestamp. 
*/ if (WT_STREQ(simpleuri, WT_HS_URI) && timestamp == NULL) - F_SET(session_impl, WT_SESSION_IGNORE_HS_TOMBSTONE); + F_SET(cursor, WT_CURSTD_IGNORE_TOMBSTONE); if (dump_config(session, simpleuri, cursor, hex, json) != 0) goto err; @@ -164,6 +164,7 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) if (json && dump_json_table_end(session) != 0) goto err; + F_CLR(cursor, WT_CURSTD_IGNORE_TOMBSTONE); ret = cursor->close(cursor); cursor = NULL; if (ret != 0) { @@ -179,9 +180,12 @@ err: ret = 1; } - F_CLR(session_impl, WT_SESSION_IGNORE_HS_TOMBSTONE); - if (cursor != NULL && (ret = cursor->close(cursor)) != 0) - ret = util_err(session, ret, NULL); + if (cursor != NULL) { + F_CLR(cursor, WT_CURSTD_IGNORE_TOMBSTONE); + if ((ret = cursor->close(cursor)) != 0) + ret = util_err(session, ret, NULL); + } + if (ofile != NULL && (ret = fclose(fp)) != 0) ret = util_err(session, errno, NULL); diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am index 8eaf1a83756..fdfb42ffc3f 100644 --- a/src/third_party/wiredtiger/test/csuite/Makefile.am +++ b/src/third_party/wiredtiger/test/csuite/Makefile.am @@ -8,8 +8,7 @@ all_TESTS= noinst_PROGRAMS= # The import test is only a shell script -# Temporarily disabled -# all_TESTS += import/smoke.sh +all_TESTS += import/smoke.sh test_incr_backup_SOURCES = incr_backup/main.c noinst_PROGRAMS += test_incr_backup diff --git a/src/third_party/wiredtiger/test/csuite/random_abort/main.c b/src/third_party/wiredtiger/test/csuite/random_abort/main.c index 906492c6c20..52161d089a8 100644 --- a/src/third_party/wiredtiger/test/csuite/random_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/random_abort/main.c @@ -160,7 +160,7 @@ thread_run(void *arg) /* * Make sure that alternative threads operate on column-store table * - * FIXME: temporarily turn off column store test. + * FIXME-WT-6125: temporarily turn off column store test. 
*/ if (td->id % 2 != 0) columnar_table = true; @@ -180,7 +180,7 @@ thread_run(void *arg) if (i == 0) i++; - /* FIXME: temporarily turn off tests for lower isolation levels. */ + /* FIXME-WT-6035: temporarily turn off tests for lower isolation levels. */ testutil_check(session->begin_transaction(session, "isolation=snapshot")); /* @@ -208,7 +208,7 @@ thread_run(void *arg) cursor->set_value(cursor, &data); testutil_check(cursor->insert(cursor)); - /* FIXME: temporarily turn off tests for lower isolation levels. */ + /* FIXME-WT-6035: temporarily turn off tests for lower isolation levels. */ testutil_check(session->commit_transaction(session, NULL)); /* @@ -221,7 +221,7 @@ thread_run(void *arg) * Decide what kind of operation can be performed on the already inserted data. */ if (i % MAX_NUM_OPS == OP_TYPE_DELETE) { - /* FIXME: temporarily turn off tests for lower isolation levels. */ + /* FIXME-WT-6035: temporarily turn off tests for lower isolation levels. */ testutil_check(session->begin_transaction(session, "isolation=snapshot")); if (columnar_table) @@ -231,7 +231,7 @@ thread_run(void *arg) testutil_check(cursor->remove(cursor)); - /* FIXME: temporarily turn off tests for lower isolation levels. */ + /* FIXME-WT-6035: temporarily turn off tests for lower isolation levels. */ testutil_check(session->commit_transaction(session, NULL)); /* Save the key separately for checking later.*/ @@ -391,7 +391,7 @@ recover_and_verify(uint32_t nthreads) cursor = row_cursor; } #else - /* FIXME: temporarily turn off column store test. */ + /* FIXME-WT-6125: temporarily turn off column store test. 
*/ columnar_table = false; cursor = row_cursor; #endif diff --git a/src/third_party/wiredtiger/test/csuite/random_abort/smoke.sh b/src/third_party/wiredtiger/test/csuite/random_abort/smoke.sh index 713b000b4f1..aea7fedf4b9 100755 --- a/src/third_party/wiredtiger/test/csuite/random_abort/smoke.sh +++ b/src/third_party/wiredtiger/test/csuite/random_abort/smoke.sh @@ -9,6 +9,10 @@ set -e top_builddir=${top_builddir:-../../build_posix} top_srcdir=${top_srcdir:-../..} +#FIXME-WT-6093: reenable calls to test_random_abort +echo "Warning: test_random_abort temporarily disabled" +exit 0 + $TEST_WRAPPER $top_builddir/test/csuite/test_random_abort -t 10 -T 5 $TEST_WRAPPER $top_builddir/test/csuite/test_random_abort -m -t 10 -T 5 $TEST_WRAPPER $top_builddir/test/csuite/test_random_abort -C -t 10 -T 5 diff --git a/src/third_party/wiredtiger/test/csuite/wt4105_large_doc_small_upd/main.c b/src/third_party/wiredtiger/test/csuite/wt4105_large_doc_small_upd/main.c index 3483b047fed..a63cfb2724a 100644 --- a/src/third_party/wiredtiger/test/csuite/wt4105_large_doc_small_upd/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt4105_large_doc_small_upd/main.c @@ -132,7 +132,7 @@ main(int argc, char *argv[]) modify_entry.data.size = strlen(modify_entry.data.data); modify_entry.offset = offset; modify_entry.size = modify_entry.data.size; - /* FIXME-PM-1521: extend timeout to pass the test */ + /* FIXME-WT-6113: extend timeout to pass the test */ (void)alarm(7); testutil_check(c->modify(c, &modify_entry, 1)); (void)alarm(0); diff --git a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c index 8f5394332ad..9a263eb9ef6 100644 --- a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c @@ -109,8 +109,8 @@ op(WT_SESSION *session, WT_RAND_STATE *rnd, WT_CURSOR **cpp) /* * Use a checkpoint handle for 50% of reads. 
* - * FIXME: Checkpoint cursors are known to have issues in durable history so we've removing - * the use of checkpoint handles in this test. As part of WT-5927, we should either + * FIXME-WT-5927: Checkpoint cursors are known to have issues in durable history so we've + * removing the use of checkpoint handles in this test. As part of WT-5927, we should either * re-enable the testing of checkpoint cursors or remove this comment. */ ret = session->open_cursor(session, uri_list[i], NULL, NULL, &cursor); diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index 2649f1ee703..62b5c205035 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -198,7 +198,9 @@ functions: script: | set -o errexit set -o verbose - ${test_env_vars|} ./test_random_abort ${random_abort_args|} 2>&1 + #FIXME-WT-6093: reenable all calls to test_random_abort + echo "Warning: test_random_abort temporarily disabled" + ##${test_env_vars|} ./test_random_abort ${random_abort_args|} 2>&1 "timestamp abort test": command: shell.exec params: @@ -226,19 +228,20 @@ functions: for i in $(seq ${times|1}); do # Run the various combinations of args. Let time and threads be random. # Run current version with write-no-sync txns. - ${test_env_vars|} ./test_random_abort 2>&1 + echo "Warning: test_random_abort temporarily disabled" + ##${test_env_vars|} ./test_random_abort 2>&1 ${test_env_vars|} ./test_timestamp_abort 2>&1 # Current version with memory-based txns (MongoDB usage). - ${test_env_vars|} ./test_random_abort -m 2>&1 + ##${test_env_vars|} ./test_random_abort -m 2>&1 ${test_env_vars|} ./test_timestamp_abort -m 2>&1 # V1 log compatibility mode with write-no-sync txns. - ${test_env_vars|} ./test_random_abort -C 2>&1 + ##${test_env_vars|} ./test_random_abort -C 2>&1 ${test_env_vars|} ./test_timestamp_abort -C 2>&1 # V1 log compatibility mode with memory-based txns. 
- ${test_env_vars|} ./test_random_abort -C -m 2>&1 + ##${test_env_vars|} ./test_random_abort -C -m 2>&1 ${test_env_vars|} ./test_timestamp_abort -C -m 2>&1 ${test_env_vars|} ./test_truncated_log ${truncated_log_args|} 2>&1 @@ -270,15 +273,14 @@ functions: rm -rf "wiredtiger" rm -rf "wiredtiger.tgz" - # Temporarily disabled - # "checkpoint test": - # command: shell.exec - # params: - # working_dir: "wiredtiger/build_posix/test/checkpoint" - # script: | - # set -o errexit - # set -o verbose - # ./t ${checkpoint_args} 2>&1 + "checkpoint test": + command: shell.exec + params: + working_dir: "wiredtiger/build_posix/test/checkpoint" + script: | + set -o errexit + set -o verbose + ./t ${checkpoint_args} 2>&1 "checkpoint stress test": command: shell.exec @@ -577,17 +579,16 @@ tasks: vars: directory: test/cursor_order - # Temporarily disabled - # - name: fops-test - # tags: ["pull_request"] - # depends_on: - # - name: compile - # commands: - # - func: "fetch artifacts" - # - func: "compile wiredtiger" - # - func: "make check directory" - # vars: - # directory: test/fops + - name: fops-test + tags: ["pull_request"] + depends_on: + - name: compile + commands: + - func: "fetch artifacts" + - func: "compile wiredtiger" + - func: "make check directory" + vars: + directory: test/fops # Temporarily disabled # - name: format-test @@ -699,21 +700,20 @@ tasks: # Start of csuite test tasks - # Temporarily disabled - # - name: csuite-import-test - # tags: ["pull_request"] - # depends_on: - # - name: compile - # commands: - # - func: "fetch artifacts" - # - command: shell.exec - # params: - # working_dir: "wiredtiger/build_posix" - # script: | - # set -o errexit - # set -o verbose + - name: csuite-import-test + tags: ["pull_request"] + depends_on: + - name: compile + commands: + - func: "fetch artifacts" + - command: shell.exec + params: + working_dir: "wiredtiger/build_posix" + script: | + set -o errexit + set -o verbose - # ${test_env_vars|} $(pwd)/../test/csuite/import/smoke.sh 2>&1 
+ ${test_env_vars|} $(pwd)/../test/csuite/import/smoke.sh 2>&1 - name: csuite-incr-backup-test tags: ["pull_request"] @@ -1494,40 +1494,23 @@ tasks: pip install scons==3.1.1 scons-3.1.1.bat ${smp_command|} check - # Temporarily disabled - # - name: fops - # tags: ["pull_request"] - # depends_on: - # - name: compile - # commands: - # - func: "fetch artifacts" - # - command: shell.exec - # params: - # working_dir: "wiredtiger" - # script: | - # set -o errexit - # set -o verbose - # if [ "Windows_NT" = "$OS" ]; then - # cmd.exe /c t_fops.exe - # else - # build_posix/test/fops/t - # fi - - # Temporarily disabled - # - name: format - # tags: ["windows_only"] - # depends_on: - # - name: compile - # commands: - # - func: "fetch artifacts" - # - command: shell.exec - # params: - # working_dir: "wiredtiger" - # script: | - # set -o errexit - # set -o verbose - # # format assumes we run it from the format directory - # cmd.exe /c "cd test\\format && ..\\..\\t_format.exe reverse=0 encryption=none logging_compression=none runs=20" + - name: fops + tags: ["pull_request"] + depends_on: + - name: compile + commands: + - func: "fetch artifacts" + - command: shell.exec + params: + working_dir: "wiredtiger" + script: | + set -o errexit + set -o verbose + if [ "Windows_NT" = "$OS" ]; then + cmd.exe /c t_fops.exe + else + build_posix/test/fops/t + fi - name: million-collection-test commands: @@ -1555,135 +1538,129 @@ tasks: set -o verbose test/evergreen/compatibility_test_for_releases.sh - # Temporarily disabled - # - name: generate-datafile-little-endian - # depends_on: - # - name: compile - # commands: - # - func: "fetch artifacts" - # - func: "compile wiredtiger" - # - func: "format test" - # vars: - # times: 10 - # config: ../../../test/format/CONFIG.endian - # extra_args: -h "WT_TEST.$i" - # - command: shell.exec - # params: - # working_dir: "wiredtiger/build_posix/test/format" - # shell: bash - # script: | - # set -o errexit - # set -o verbose - # # Archive the WT_TEST 
directories which include the generated wt data files - # tar -zcvf WT_TEST.tgz WT_TEST* - # - command: s3.put - # params: - # aws_secret: ${aws_secret} - # aws_key: ${aws_key} - # local_file: wiredtiger/build_posix/test/format/WT_TEST.tgz - # bucket: build_external - # permissions: public-read - # content_type: application/tar - # display_name: WT_TEST - # remote_file: wiredtiger/little-endian/${revision}/artifacts/WT_TEST.tgz + - name: generate-datafile-little-endian + depends_on: + - name: compile + commands: + - func: "fetch artifacts" + - func: "compile wiredtiger" + - func: "format test" + vars: + times: 10 + config: ../../../test/format/CONFIG.endian + extra_args: -h "WT_TEST.$i" + - command: shell.exec + params: + working_dir: "wiredtiger/build_posix/test/format" + shell: bash + script: | + set -o errexit + set -o verbose + # Archive the WT_TEST directories which include the generated wt data files + tar -zcvf WT_TEST.tgz WT_TEST* + - command: s3.put + params: + aws_secret: ${aws_secret} + aws_key: ${aws_key} + local_file: wiredtiger/build_posix/test/format/WT_TEST.tgz + bucket: build_external + permissions: public-read + content_type: application/tar + display_name: WT_TEST + remote_file: wiredtiger/little-endian/${revision}/artifacts/WT_TEST.tgz - # Temporarily disabled - # - name: verify-datafile-little-endian - # depends_on: - # - name: compile - # - name: generate-datafile-little-endian - # commands: - # - func: "fetch artifacts" - # - func: "fetch artifacts from little-endian" - # - command: shell.exec - # params: - # working_dir: "wiredtiger" - # script: | - # set -o errexit - # set -o verbose - # ./test/evergreen/verify_wt_datafiles.sh 2>&1 + - name: verify-datafile-little-endian + depends_on: + - name: compile + - name: generate-datafile-little-endian + commands: + - func: "fetch artifacts" + - func: "fetch artifacts from little-endian" + - command: shell.exec + params: + working_dir: "wiredtiger" + script: | + set -o errexit + set -o verbose + 
./test/evergreen/verify_wt_datafiles.sh 2>&1 - # Temporarily disabled - # - name: verify-datafile-from-little-endian - # depends_on: - # - name: compile - # - name: generate-datafile-little-endian - # variant: little-endian - # commands: - # - func: "fetch artifacts" - # - func: "fetch artifacts from little-endian" - # - command: shell.exec - # params: - # working_dir: "wiredtiger" - # script: | - # set -o errexit - # set -o verbose - # ./test/evergreen/verify_wt_datafiles.sh 2>&1 + - name: verify-datafile-from-little-endian + depends_on: + - name: compile + - name: generate-datafile-little-endian + variant: little-endian + commands: + - func: "fetch artifacts" + - func: "fetch artifacts from little-endian" + - command: shell.exec + params: + working_dir: "wiredtiger" + script: | + set -o errexit + set -o verbose + ./test/evergreen/verify_wt_datafiles.sh 2>&1 - # Temporarily disabled - # - name: generate-datafile-big-endian - # depends_on: - # - name: compile - # commands: - # - func: "fetch artifacts" - # - func: "compile wiredtiger" - # - func: "format test" - # vars: - # times: 10 - # config: ../../../test/format/CONFIG.endian - # extra_args: -h "WT_TEST.$i" - # - command: shell.exec - # params: - # working_dir: "wiredtiger/build_posix/test/format" - # shell: bash - # script: | - # set -o errexit - # set -o verbose - # # Archive the WT_TEST directories which include the generated wt data files - # tar -zcvf WT_TEST.tgz WT_TEST* - # - command: s3.put - # params: - # aws_secret: ${aws_secret} - # aws_key: ${aws_key} - # local_file: wiredtiger/build_posix/test/format/WT_TEST.tgz - # bucket: build_external - # permissions: public-read - # content_type: application/tar - # display_name: WT_TEST - # remote_file: wiredtiger/big-endian/${revision}/artifacts/WT_TEST.tgz + - name: generate-datafile-big-endian + depends_on: + - name: compile + commands: + - func: "fetch artifacts" + - func: "compile wiredtiger" + - func: "format test" + vars: + times: 10 + config: 
../../../test/format/CONFIG.endian + extra_args: -h "WT_TEST.$i" + - command: shell.exec + params: + working_dir: "wiredtiger/build_posix/test/format" + shell: bash + script: | + set -o errexit + set -o verbose + # Archive the WT_TEST directories which include the generated wt data files + tar -zcvf WT_TEST.tgz WT_TEST* + - command: s3.put + params: + aws_secret: ${aws_secret} + aws_key: ${aws_key} + local_file: wiredtiger/build_posix/test/format/WT_TEST.tgz + bucket: build_external + permissions: public-read + content_type: application/tar + display_name: WT_TEST + remote_file: wiredtiger/big-endian/${revision}/artifacts/WT_TEST.tgz - # Temporarily disabled - # - name: verify-datafile-big-endian - # depends_on: - # - name: compile - # - name: generate-datafile-big-endian - # commands: - # - func: "fetch artifacts" - # - func: "fetch artifacts from big-endian" - # - command: shell.exec - # params: - # working_dir: "wiredtiger" - # script: | - # set -o errexit - # set -o verbose - # ./test/evergreen/verify_wt_datafiles.sh 2>&1 + - name: verify-datafile-big-endian + depends_on: + - name: compile + - name: generate-datafile-big-endian + commands: + - func: "fetch artifacts" + - func: "fetch artifacts from big-endian" + - command: shell.exec + params: + working_dir: "wiredtiger" + script: | + set -o errexit + set -o verbose + ./test/evergreen/verify_wt_datafiles.sh 2>&1 - # Temporarily disabled - # - name: verify-datafile-from-big-endian - # depends_on: - # - name: compile - # - name: generate-datafile-big-endian - # variant: big-endian - # commands: - # - func: "fetch artifacts" - # - func: "fetch artifacts from big-endian" - # - command: shell.exec - # params: - # working_dir: "wiredtiger" - # script: | - # set -o errexit - # set -o verbose - # ./test/evergreen/verify_wt_datafiles.sh 2>&1 + - name: verify-datafile-from-big-endian + depends_on: + - name: compile + - name: generate-datafile-big-endian + variant: big-endian + commands: + - func: "fetch artifacts" + - 
func: "fetch artifacts from big-endian" + - command: shell.exec + params: + working_dir: "wiredtiger" + script: | + set -o errexit + set -o verbose + ./test/evergreen/verify_wt_datafiles.sh 2>&1 - name: clang-analyzer tags: ["pull_request"] @@ -1731,7 +1708,6 @@ tasks: vars: format_test_script_args: -t 110 -j 4 direct_io=1 - # Temporarily disabled # - name: linux-directio # depends_on: # - name: compile @@ -1744,18 +1720,17 @@ tasks: # config: ../../../test/format/CONFIG.stress # extra_args: -C "direct_io=[data]" - # Temporarily disabled - # - name: format-linux-no-ftruncate - # depends_on: - # - name: compile-linux-no-ftruncate - # commands: - # - func: "fetch artifacts" - # vars: - # dependent_task: compile-linux-no-ftruncate - # - func: "compile wiredtiger no linux ftruncate" - # - func: "format test" - # vars: - # times: 3 + - name: format-linux-no-ftruncate + depends_on: + - name: compile-linux-no-ftruncate + commands: + - func: "fetch artifacts" + vars: + dependent_task: compile-linux-no-ftruncate + - func: "compile wiredtiger no linux ftruncate" + - func: "format test" + vars: + times: 3 - name: package commands: @@ -1782,113 +1757,109 @@ tasks: set -o verbose ${python_binary|python} syscall.py --verbose --preserve - # Temporarily disabled - # - name: checkpoint-filetypes-test - # commands: - # - func: "get project" - # - func: "compile wiredtiger" - # vars: - # # Don't use diagnostic - this test looks for timing problems that are more likely to occur without it - # posix_configure_flags: --enable-strict - # - func: "checkpoint test" - # vars: - # checkpoint_args: -t m -n 1000000 -k 5000000 -C cache_size=100MB - # - func: "checkpoint test" - # vars: - # checkpoint_args: -t r -n 1000000 -k 5000000 -C cache_size=100MB - # - func: "checkpoint test" - # vars: - # checkpoint_args: -t c -n 1000000 -k 5000000 -C cache_size=100MB + - name: checkpoint-filetypes-test + commands: + - func: "get project" + - func: "compile wiredtiger" + vars: + # Don't use diagnostic - 
this test looks for timing problems that are more likely to occur without it + posix_configure_flags: --enable-strict + - func: "checkpoint test" + vars: + checkpoint_args: -t m -n 1000000 -k 5000000 -C cache_size=100MB + - func: "checkpoint test" + vars: + checkpoint_args: -t r -n 1000000 -k 5000000 -C cache_size=100MB + - func: "checkpoint test" + vars: + checkpoint_args: -t c -n 1000000 -k 5000000 -C cache_size=100MB - # Temporarily disabled - # - name: coverage-report - # commands: - # - func: "get project" - # - func: "compile wiredtiger" - # vars: - # configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/gcc CXX=/opt/mongodbtoolchain/v3/bin/g++ PATH=/opt/mongodbtoolchain/v3/bin:$PATH CFLAGS="--coverage -fPIC -ggdb" LDFLAGS=--coverage - # posix_configure_flags: --enable-silent-rules --enable-diagnostic --enable-strict --enable-python --with-builtins=lz4,snappy,zlib - # - func: "make check all" - # - func: "unit test" - # vars: - # unit_test_args: -v 2 --long - # - func: "format test" - # vars: - # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row compression=snappy logging=1 logging_compression=snappy logging_prealloc=1 - # - func: "format test" - # vars: - # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row alter=1 backups=1 compaction=1 data_extend=1 prepare=1 rebalance=1 salvage=1 statistics=1 statistics_server=1 verify=1 - # - func: "format test" - # vars: - # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row firstfit=1 internal_key_truncation=1 - # - func: "format test" - # vars: - # extra_args: leak_memory=0 mmap=1 file_type=row checkpoints=0 in_memory=1 reverse=1 truncate=1 - # - func: "format test" - # vars: - # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row compression=zlib huffman_key=1 huffman_value=1 - # - func: "format test" - # vars: - # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row isolation=random transaction_timestamps=0 - # - func: "format test" - # vars: - # extra_args: 
checkpoints=1 leak_memory=0 mmap=1 file_type=row data_source=lsm bloom=1 - # - func: "format test" - # vars: - # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=var compression=snappy checksum=uncompressed dictionary=1 repeat_data_pct=10 - # - func: "format test" - # vars: - # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row compression=lz4 prefix_compression=1 leaf_page_max=9 internal_page_max=9 key_min=256 value_min=256 - # - func: "format test" - # vars: - # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=var leaf_page_max=9 internal_page_max=9 value_min=256 - # - func: "format test" - # vars: - # extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=fix - # - command: shell.exec - # params: - # working_dir: "wiredtiger/build_posix" - # script: | - # set -o errexit - # set -o verbose + - name: coverage-report + commands: + - func: "get project" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/gcc CXX=/opt/mongodbtoolchain/v3/bin/g++ PATH=/opt/mongodbtoolchain/v3/bin:$PATH CFLAGS="--coverage -fPIC -ggdb" LDFLAGS=--coverage + posix_configure_flags: --enable-silent-rules --enable-diagnostic --enable-strict --enable-python --with-builtins=lz4,snappy,zlib + - func: "make check all" + - func: "unit test" + vars: + unit_test_args: -v 2 --long + - func: "format test" + vars: + extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row compression=snappy logging=1 logging_compression=snappy logging_prealloc=1 + - func: "format test" + vars: + extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row alter=1 backups=1 compaction=1 data_extend=1 prepare=1 rebalance=1 salvage=1 statistics=1 statistics_server=1 verify=1 + - func: "format test" + vars: + extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row firstfit=1 internal_key_truncation=1 + - func: "format test" + vars: + extra_args: leak_memory=0 mmap=1 file_type=row checkpoints=0 in_memory=1 reverse=1 truncate=1 + - func: "format 
test" + vars: + extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row compression=zlib huffman_key=1 huffman_value=1 + - func: "format test" + vars: + extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row isolation=random transaction_timestamps=0 + - func: "format test" + vars: + extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row data_source=lsm bloom=1 + - func: "format test" + vars: + extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=var compression=snappy checksum=uncompressed dictionary=1 repeat_data_pct=10 + - func: "format test" + vars: + extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=row compression=lz4 prefix_compression=1 leaf_page_max=9 internal_page_max=9 key_min=256 value_min=256 + - func: "format test" + vars: + extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=var leaf_page_max=9 internal_page_max=9 value_min=256 + - func: "format test" + vars: + extra_args: checkpoints=1 leak_memory=0 mmap=1 file_type=fix + - command: shell.exec + params: + working_dir: "wiredtiger/build_posix" + script: | + set -o errexit + set -o verbose - # GCOV=/opt/mongodbtoolchain/v3/bin/gcov gcovr -r .. -e '.*/bt_(debug|dump|misc|salvage|vrfy).*' -e '.*/(log|progress|verify_build|strerror|env_msg|err_file|cur_config|os_abort)\..*' -e '.*_stat\..*' --html -o ../coverage_report.html - # - command: s3.put - # params: - # aws_secret: ${aws_secret} - # aws_key: ${aws_key} - # local_file: wiredtiger/coverage_report.html - # bucket: build_external - # permissions: public-read - # content_type: text/html - # display_name: Coverage report - # remote_file: wiredtiger/${build_variant}/${revision}/coverage_report/coverage_report_${build_id}.html + GCOV=/opt/mongodbtoolchain/v3/bin/gcov gcovr -r .. 
-e '.*/bt_(debug|dump|misc|salvage|vrfy).*' -e '.*/(log|progress|verify_build|strerror|env_msg|err_file|cur_config|os_abort)\..*' -e '.*_stat\..*' --html -o ../coverage_report.html + - command: s3.put + params: + aws_secret: ${aws_secret} + aws_key: ${aws_key} + local_file: wiredtiger/coverage_report.html + bucket: build_external + permissions: public-read + content_type: text/html + display_name: Coverage report + remote_file: wiredtiger/${build_variant}/${revision}/coverage_report/coverage_report_${build_id}.html - # Temporarily disabled - # - name: spinlock-gcc-test - # commands: - # - func: "get project" - # - func: "compile wiredtiger" - # vars: - # posix_configure_flags: --enable-python --with-spinlock=gcc --enable-strict - # - func: "make check all" - # - func: "format test" - # vars: - # times: 3 - # - func: "unit test" + - name: spinlock-gcc-test + commands: + - func: "get project" + - func: "compile wiredtiger" + vars: + posix_configure_flags: --enable-python --with-spinlock=gcc --enable-strict + - func: "make check all" + - func: "format test" + vars: + times: 3 + - func: "unit test" - # Temporarily disabled - # - name: spinlock-pthread-adaptive-test - # commands: - # - func: "get project" - # - func: "compile wiredtiger" - # vars: - # posix_configure_flags: --enable-python --with-spinlock=pthread_adaptive --enable-strict - # - func: "make check all" - # - func: "format test" - # vars: - # times: 3 - # - func: "unit test" + - name: spinlock-pthread-adaptive-test + commands: + - func: "get project" + - func: "compile wiredtiger" + vars: + posix_configure_flags: --enable-python --with-spinlock=pthread_adaptive --enable-strict + - func: "make check all" + - func: "format test" + vars: + times: 3 + - func: "unit test" - name: wtperf-test depends_on: @@ -2000,21 +1971,20 @@ tasks: - func: "truncated log test" # format test - # Temporarily disabled - # - func: "format test" - # vars: - # extra_args: file_type=fix - # - func: "format test" - # vars: - # 
extra_args: file_type=row + - func: "format test" + vars: + extra_args: file_type=fix + - func: "format test" + vars: + extra_args: file_type=row - #FIXME: Add wtperf testing from Jenkin "wiredtiger-test-check-long" after fixing WT-5270 + #FIXME-WT-5270: Add wtperf testing from Jenkin "wiredtiger-test-check-long" after fixing WT-5270 - name: time-shift-sensitivity-test depends_on: - - name: compile - vars: - posix_configure_flags: --enable-strict + - name: compile + vars: + posix_configure_flags: --enable-strict commands: - func: "fetch artifacts" vars: @@ -2107,7 +2077,7 @@ tasks: set -o errexit set -o verbose for i in {1..10}; do ${python_binary|python} split_stress.py; done - # Temporarily disabled + - name: format-stress-test # Set 25 hours timeout exec_timeout_secs: 90000 @@ -2119,7 +2089,6 @@ tasks: #run for 24 hours ( 24 * 60 = 1440 minutes), use default config format_test_script_args: -b "SEGFAULT_SIGNALS=all catchsegv ./t" -t 1440 - # Temporarily disabled - name: format-stress-smoke-test # Set 7 hours timeout exec_timeout_secs: 25200 @@ -2260,19 +2229,16 @@ buildvariants: - name: make-check-msan-test - name: compile-ubsan - name: ubsan-test - # Temporarily disabled - # - name: linux-directio - # distros: ubuntu1804-build + - name: linux-directio + distros: ubuntu1804-build - name: syscall-linux - name: make-check-asan-test - name: configure-combinations - # Temporarily disabled # - name: checkpoint-filetypes-test # - name: coverage-report - name: unit-test-long - # Temporarily disabled # - name: spinlock-gcc-test - # - name: spinlock-pthread-adaptive-test + - name: spinlock-pthread-adaptive-test - name: compile-wtperf - name: wtperf-test - name: ftruncate-test @@ -2347,8 +2313,7 @@ buildvariants: - name: compile-linux-no-ftruncate - name: make-check-linux-no-ftruncate-test - name: unit-linux-no-ftruncate-test - # Temporarily disabled - # - name: format-linux-no-ftruncate + - name: format-linux-no-ftruncate - name: rhel80 display_name: RHEL 8.0 @@ -2362,31 
+2327,26 @@ buildvariants: - name: compile - name: make-check-test - name: unit-test - # Temporarily disabled - # - name: fops + - name: fops - name: time-shift-sensitivity-test - name: compile-msan - name: make-check-msan-test - name: compile-ubsan - name: ubsan-test - # Temporarily disabled - # - name: linux-directio - # distros: rhel80-build + - name: linux-directio + distros: rhel80-build - name: syscall-linux - name: compile-asan - name: make-check-asan-test - # Temporarily disabled # - name: checkpoint-filetypes-test - name: unit-test-long - # Temporarily disabled # - name: spinlock-gcc-test - # - name: spinlock-pthread-adaptive-test + - name: spinlock-pthread-adaptive-test - name: compile-wtperf - name: wtperf-test - name: ftruncate-test - name: long-test - name: configure-combinations - # Temporarily disabled # - name: coverage-report - name: large-scale-tests @@ -2413,8 +2373,7 @@ buildvariants: - name: compile - name: ".windows_only" - name: ".unit_test" - # Temporarily disabled - # - name: fops + - name: fops - name: macos-1012 display_name: OS X 10.12 @@ -2430,40 +2389,37 @@ buildvariants: - name: compile - name: make-check-test - name: unit-test - # Temporarily disabled - # - name: fops - -# Temporarily disabled -# - name: little-endian -# display_name: Little-endian (x86) -# run_on: -# - ubuntu1804-test -# batchtime: 10080 # 7 days -# expansions: -# smp_command: -j $(grep -c ^processor /proc/cpuinfo) -# test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.libs top_srcdir=$(pwd)/.. 
top_builddir=$(pwd) -# tasks: -# - name: compile -# - name: generate-datafile-little-endian -# - name: verify-datafile-little-endian -# - name: verify-datafile-from-big-endian - -# Temporarily disabled -# - name: big-endian -# display_name: Big-endian (s390x/zSeries) -# modules: -# - enterprise -# run_on: -# - ubuntu1804-zseries-build -# batchtime: 10080 # 7 days -# expansions: -# smp_command: -j $(grep -c ^processor /proc/cpuinfo) -# test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.lib top_srcdir=$(pwd)/.. top_builddir=$(pwd) -# tasks: -# - name: compile -# - name: generate-datafile-big-endian -# - name: verify-datafile-big-endian -# - name: verify-datafile-from-little-endian + - name: fops + +- name: little-endian + display_name: Little-endian (x86) + run_on: + - ubuntu1804-test + batchtime: 10080 # 7 days + expansions: + smp_command: -j $(grep -c ^processor /proc/cpuinfo) + test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.libs top_srcdir=$(pwd)/.. top_builddir=$(pwd) + tasks: + - name: compile + # - name: generate-datafile-little-endian + # - name: verify-datafile-little-endian + # - name: verify-datafile-from-big-endian + +- name: big-endian + display_name: Big-endian (s390x/zSeries) + modules: + - enterprise + run_on: + - ubuntu1804-zseries-build + batchtime: 10080 # 7 days + expansions: + smp_command: -j $(grep -c ^processor /proc/cpuinfo) + test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.lib top_srcdir=$(pwd)/.. 
top_builddir=$(pwd) + tasks: + - name: compile + # - name: generate-datafile-big-endian + # - name: verify-datafile-big-endian + # - name: verify-datafile-from-little-endian - name: ubuntu1804-ppc display_name: Ubuntu 18.04 PPC diff --git a/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_releases.sh b/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_releases.sh index a030f80c712..5dfd5554689 100755 --- a/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_releases.sh +++ b/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_releases.sh @@ -6,10 +6,10 @@ set -e ############################################################# -# format_b_flag: +# bflag: # arg1: branch name ############################################################# -format_b_flag() +bflag() { # Return if the branch's format command takes the -B flag for backward compatibility. test "$1" = "develop" && echo "-B " @@ -63,7 +63,7 @@ run_format() cd "$1/test/format" - flags="-1q $(format_b_flag $1)" + flags="-1q $(bflag $1)" args="" args+="cache=80 " # Medium cache so there's eviction @@ -115,7 +115,7 @@ verify_branches() for am in $3; do echo "$1/wt verifying $2 access method $am..." dir="$2/test/format/RUNDIR.$am" - WIREDTIGER_CONFIG="$EXT" ./wt -h "../$dir" verify table:wt + WIREDTIGER_CONFIG="$EXT" ./wt $(bflag $1) -h "../$dir" verify table:wt done } @@ -138,12 +138,12 @@ upgrade_downgrade() for reps in {1..2}; do echo "$1 format running on $2 access method $am..." cd "$top/$1/test/format" - flags="-1qR $(format_b_flag $1)" + flags="-1qR $(bflag $1)" ./t $flags -h "$top/$2/test/format/RUNDIR.$am" timer=2 echo "$2 format running on $2 access method $am..." cd "$top/$2/test/format" - flags="-1qR $(format_b_flag $2)" + flags="-1qR $(bflag $2)" ./t $flags -h "RUNDIR.$am" timer=2 done done @@ -184,20 +184,20 @@ cd develop; wt2=$(get_prev_version 2); cd .. 
(verify_branches mongodb-3.6 mongodb-3.4 "fix row var") (verify_branches mongodb-4.0 mongodb-3.6 "fix row var") (verify_branches mongodb-4.2 mongodb-4.0 "fix row var") -### (verify_branches mongodb-4.4 mongodb-4.2 "fix row var") -### (verify_branches develop mongodb-4.4 "row") +(verify_branches mongodb-4.4 mongodb-4.2 "fix row var") +(verify_branches develop mongodb-4.4 "row") (verify_branches develop mongodb-4.2 "row") (verify_branches "$wt1" "$wt2" "row") (verify_branches develop "$wt1" "row") # Verify forward compatibility for supported access methods. -### (verify_branches mongodb-4.2 mongodb-4.4 "row") +(verify_branches mongodb-4.2 mongodb-4.4 "row") (verify_branches mongodb-4.2 develop "row") -### (verify_branches mongodb-4.4 develop "row") +(verify_branches mongodb-4.4 develop "row") # Upgrade/downgrade testing for supported access methods. -### (upgrade_downgrade mongodb-4.2 mongodb-4.4 "row") +(upgrade_downgrade mongodb-4.2 mongodb-4.4 "row") (upgrade_downgrade mongodb-4.2 develop "row") -### (upgrade_downgrade mongodb-4.4 develop "row") +(upgrade_downgrade mongodb-4.4 develop "row") exit 0 diff --git a/src/third_party/wiredtiger/test/fops/Makefile.am b/src/third_party/wiredtiger/test/fops/Makefile.am index 7a5920221ae..519f6315445 100644 --- a/src/third_party/wiredtiger/test/fops/Makefile.am +++ b/src/third_party/wiredtiger/test/fops/Makefile.am @@ -11,8 +11,7 @@ t_LDADD +=$(top_builddir)/libwiredtiger.la t_LDFLAGS = -static # Run this during a "make check" smoke test. 
-# Temporarily disabled -# TESTS = $(noinst_PROGRAMS) +TESTS = $(noinst_PROGRAMS) LOG_COMPILER = $(TEST_WRAPPER) clean-local: diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c index 0f41a311e43..69f986aa79a 100644 --- a/src/third_party/wiredtiger/test/format/bulk.c +++ b/src/third_party/wiredtiger/test/format/bulk.c @@ -133,20 +133,20 @@ wts_load(void) if (!is_bulk) cursor->set_key(cursor, keyno); cursor->set_value(cursor, *(uint8_t *)value.data); - logop(session, "%-10s %" PRIu64 " {0x%02" PRIx8 "}", "bulk", keyno, + logop(session, "%-10s %" PRIu32 " {0x%02" PRIx8 "}", "bulk", keyno, ((uint8_t *)value.data)[0]); break; case VAR: if (!is_bulk) cursor->set_key(cursor, keyno); cursor->set_value(cursor, &value); - logop(session, "%-10s %" PRIu64 " {%.*s}", "bulk", keyno, (int)value.size, + logop(session, "%-10s %" PRIu32 " {%.*s}", "bulk", keyno, (int)value.size, (char *)value.data); break; case ROW: cursor->set_key(cursor, &key); cursor->set_value(cursor, &value); - logop(session, "%-10s %" PRIu64 " {%.*s}, {%.*s}", "bulk", keyno, (int)key.size, + logop(session, "%-10s %" PRIu32 " {%.*s}, {%.*s}", "bulk", keyno, (int)key.size, (char *)key.data, (int)value.size, (char *)value.data); break; } @@ -175,15 +175,8 @@ wts_load(void) g.c_delete_pct += g.c_insert_pct - 5; g.c_insert_pct = 5; } - if (g.c_delete_pct < 20) { - g.c_delete_pct += g.c_write_pct / 2; - g.c_write_pct = g.c_write_pct / 2; - } - if (g.c_delete_pct < 20) { - g.c_delete_pct += g.c_modify_pct / 2; - g.c_write_pct = g.c_modify_pct / 2; - } - break; + g.c_delete_pct += g.c_write_pct / 2; + g.c_write_pct = g.c_write_pct / 2; } } diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 6a668fa4f45..ef3a79e7b53 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -496,7 +496,7 @@ prepare_transaction(TINFO *tinfo) { WT_DECL_RET; WT_SESSION 
*session; - uint64_t ts; + uint64_t longwait, pause_ms, ts; char buf[64]; session = tinfo->session; @@ -523,6 +523,19 @@ prepare_transaction(TINFO *tinfo) lock_writeunlock(session, &g.ts_lock); + /* + * Sometimes add a delay after prepare to induce extra memory stress. For 80% of the threads, + * there is never a delay, so there is always a dedicated set of threads trying to do work. For + * the other 20%, we'll sometimes delay. For these threads, 99% of the time, proceed without + * delay. The rest of the time, pause up to 5 seconds, weighted toward the smaller delays. + */ + if (tinfo->id % 5 == 0) { + longwait = mmrand(&tinfo->rnd, 0, 999); + if (longwait < 10) { + pause_ms = mmrand(&tinfo->rnd, 1, 10) << longwait; + __wt_sleep(0, pause_ms * WT_THOUSAND); + } + } return (ret); } @@ -1240,11 +1253,11 @@ order_error_col: * less-than, row-store inserts new rows in-between rows by appending a new suffix * to the row's key.) */ - testutil_check(__wt_buf_fmt((WT_SESSION_IMPL *)cursor->session, tinfo->tbuf, "%.*s", + testutil_check(__wt_buf_fmt(CUR2S(cursor), tinfo->tbuf, "%.*s", (int)tinfo->key->size, (char *)tinfo->key->data)); keyno_prev = strtoul(tinfo->tbuf->data, NULL, 10); - testutil_check(__wt_buf_fmt((WT_SESSION_IMPL *)cursor->session, tinfo->tbuf, "%.*s", - (int)key.size, (char *)key.data)); + testutil_check(__wt_buf_fmt( + CUR2S(cursor), tinfo->tbuf, "%.*s", (int)key.size, (char *)key.data)); keyno = strtoul(tinfo->tbuf->data, NULL, 10); if (incrementing) { if (keyno_prev != keyno && keyno_prev + 1 != keyno) @@ -1258,8 +1271,7 @@ order_error_row: (char *)tinfo->key->data, (int)key.size, (char *)key.data); } - testutil_check( - __wt_buf_set((WT_SESSION_IMPL *)cursor->session, tinfo->key, key.data, key.size)); + testutil_check(__wt_buf_set(CUR2S(cursor), tinfo->key, key.data, key.size)); break; } break; diff --git a/src/third_party/wiredtiger/test/suite/test_backup01.py b/src/third_party/wiredtiger/test/suite/test_backup01.py index ff4dd439189..a39da2e3d18 
100644 --- a/src/third_party/wiredtiger/test/suite/test_backup01.py +++ b/src/third_party/wiredtiger/test/suite/test_backup01.py @@ -30,6 +30,7 @@ import glob import os import shutil import string +import time from suite_subprocess import suite_subprocess import wiredtiger, wttest from wtdataset import SimpleDataSet, ComplexDataSet, ComplexLSMDataSet @@ -163,8 +164,7 @@ class test_backup(wttest.WiredTigerTestCase, suite_subprocess): self.assertEqual(ret, wiredtiger.WT_NOTFOUND) self.assertEqual(i, total) - # Test that named checkpoints can't be deleted while backup cursors are - # open, but that normal checkpoints continue to work. + # Test interaction between checkpoints and a backup cursor. def test_checkpoint_delete(self): # You cannot name checkpoints including LSM tables, skip those. self.populate(1) @@ -177,7 +177,8 @@ class test_backup(wttest.WiredTigerTestCase, suite_subprocess): self.objs[0][0], None, "checkpoint=one")) # Confirm opening a backup cursor causes checkpoint to fail if dropping - # a named checkpoint, but does not stop a default checkpoint. + # a named checkpoint created before the backup cursor, but does not stop a + # default checkpoint. cursor = self.session.open_cursor('backup:', None, None) self.session.checkpoint() msg = '/checkpoints cannot be deleted during a hot backup/' @@ -187,7 +188,24 @@ class test_backup(wttest.WiredTigerTestCase, suite_subprocess): self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.checkpoint("name=three,drop=(two)"), msg) self.session.checkpoint() + + # Confirm that a named checkpoint created after a backup cursor can be dropped. + # Need to pause a couple seconds; checkpoints that are assigned the same timestamp as + # the backup will be pinned, even if they occur after the backup starts. 
+ time.sleep(2) + self.session.checkpoint("name=four") + self.session.checkpoint("drop=(four)") + self.assertRaises(wiredtiger.WiredTigerError, + lambda: self.session.open_cursor( + self.objs[0][0], None, "checkpoint=four")) + + # Confirm that after closing the backup cursor the original named checkpoint can + # be deleted. cursor.close() + self.session.checkpoint("drop=(two)") + self.assertRaises(wiredtiger.WiredTigerError, + lambda: self.session.open_cursor( + self.objs[0][0], None, "checkpoint=two")) if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint05.py b/src/third_party/wiredtiger/test/suite/test_checkpoint05.py new file mode 100644 index 00000000000..58af3003a60 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_checkpoint05.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_checkpoint05.py +# Verify that we don't accumulate a lot of checkpoints while a backup +# cursor is open. WiredTiger checkpoints created after the backup cursor +# should get deleted as usual. + +import time +import wiredtiger, wttest + +class test_checkpoint05(wttest.WiredTigerTestCase): + conn_config = 'create,cache_size=100MB,log=(archive=false,enabled=true,file_max=100K)' + + def count_checkpoints(self): + metadata_cursor = self.session.open_cursor('metadata:', None, None) + + nckpt = 0 + while metadata_cursor.next() == 0: + key = metadata_cursor.get_key() + value = metadata_cursor[key] + nckpt = nckpt + value.count("WiredTigerCheckpoint") + metadata_cursor.close() + return nckpt + + def test_checkpoints_during_backup(self): + self.uri = 'table:ckpt05' + self.session.create(self.uri, 'key_format=i,value_format=i') + + # Setup: Insert some data and checkpoint it + cursor = self.session.open_cursor(self.uri, None) + for i in range(16): + cursor[i] = i + self.session.checkpoint(None) + + # Create backup and check how many checkpoints we have. + backup_cursor = self.session.open_cursor('backup:', None, None) + initial_count = self.count_checkpoints() + + # Checkpoints created immediately after a backup cursor may get pinned. + # Pause to avoid this. + time.sleep(2) + + # Take a bunch of checkpoints. + for i in range (50): + self.session.checkpoint('force=true') + cursor.close() + + # There may be a few more checkpoints than when we opened the + # backup cursor, but not too many more. The factor of three + # is generous. But if WT isn't deleting checkpoints there would + # be about 30x more checkpoints here. 
+ final_count = self.count_checkpoints() + self.assertTrue (final_count < initial_count * 3) + + self.session.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode05.py b/src/third_party/wiredtiger/test/suite/test_debug_mode05.py index 9ecbb6aa196..089e77f604a 100644 --- a/src/third_party/wiredtiger/test/suite/test_debug_mode05.py +++ b/src/third_party/wiredtiger/test/suite/test_debug_mode05.py @@ -42,7 +42,7 @@ class test_debug_mode05(wttest.WiredTigerTestCase): uri = 'file:test_debug_mode05' def test_table_logging_rollback_to_stable(self): - self.session.create(self.uri, 'key_format=i,value_format=u') + self.session.create(self.uri, 'key_format=i,value_format=u,log=(enabled=false)') cursor = self.session.open_cursor(self.uri, None) diff --git a/src/third_party/wiredtiger/test/suite/test_durable_ts03.py b/src/third_party/wiredtiger/test/suite/test_durable_ts03.py index 8fdb1f615ae..43e03431709 100755 --- a/src/third_party/wiredtiger/test/suite/test_durable_ts03.py +++ b/src/third_party/wiredtiger/test/suite/test_durable_ts03.py @@ -27,7 +27,7 @@ # OTHER DEALINGS IN THE SOFTWARE. from helper import copy_wiredtiger_home -import unittest, wiredtiger, wttest +import wiredtiger, wttest def timestamp_str(t): return '%x' %t @@ -38,7 +38,6 @@ class test_durable_ts03(wttest.WiredTigerTestCase): conn_config = 'cache_size=10MB' session_config = 'isolation=snapshot' - @unittest.skip("Temporarily disabled") def test_durable_ts03(self): # Create a table. 
uri = 'table:test_durable_ts03' diff --git a/src/third_party/wiredtiger/test/suite/test_hs06.py b/src/third_party/wiredtiger/test/suite/test_hs06.py index 042d9c731cb..b00a196c809 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs06.py +++ b/src/third_party/wiredtiger/test/suite/test_hs06.py @@ -47,7 +47,8 @@ class test_hs06(wttest.WiredTigerTestCase): conn_config = 'cache_size=50MB,statistics=(fast)' session_config = 'isolation=snapshot' key_format_values = [ - ('column', dict(key_format='r')), + # The commented columnar tests needs to be enabled once columnar page instantiated is fixed in (WT-6061). + # ('column', dict(key_format='r')), ('integer', dict(key_format='i')), ('string', dict(key_format='S')) ] diff --git a/src/third_party/wiredtiger/test/suite/test_hs08.py b/src/third_party/wiredtiger/test/suite/test_hs08.py index b0d0d497dd0..905cc84db1a 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs08.py +++ b/src/third_party/wiredtiger/test/suite/test_hs08.py @@ -45,7 +45,7 @@ class test_hs08(wttest.WiredTigerTestCase): stat_cursor.close() return val - def test_modify_insert_to_las(self): + def test_modify_insert_to_hs(self): uri = "table:test_hs08" create_params = 'value_format=S,key_format=i' value1 = 'a' * 1000 diff --git a/src/third_party/wiredtiger/test/suite/test_hs09.py b/src/third_party/wiredtiger/test/suite/test_hs09.py index 43ced8ad589..ac34e3f7b17 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs09.py +++ b/src/third_party/wiredtiger/test/suite/test_hs09.py @@ -42,7 +42,8 @@ class test_hs09(wttest.WiredTigerTestCase): session_config = 'isolation=snapshot' uri = "table:test_hs09" key_format_values = [ - ('column', dict(key_format='r')), + # The commented columnar tests needs to be enabled once columnar page instantiated is fixed in (WT-6061). 
+ #('column', dict(key_format='r')), ('integer', dict(key_format='i')), ('string', dict(key_format='S')), ] @@ -63,7 +64,7 @@ class test_hs09(wttest.WiredTigerTestCase): cursor.close() # Check the history store file value cursor = session.open_cursor("file:WiredTigerHS.wt", None, 'checkpoint=WiredTigerCheckpoint') - for _, _, hs_start_ts, _, hs_stop_ts, _, _, _, type, value in cursor: + for _, _, hs_start_ts, _, hs_stop_ts, _, type, value in cursor: # No WT_UPDATE_TOMBSTONE in the history store self.assertNotEqual(type, 5) # No WT_UPDATE_BIRTHMARK in the history store diff --git a/src/third_party/wiredtiger/test/suite/test_hs10.py b/src/third_party/wiredtiger/test/suite/test_hs10.py index 4a33ced8125..f41f18bb999 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs10.py +++ b/src/third_party/wiredtiger/test/suite/test_hs10.py @@ -45,7 +45,7 @@ class test_hs10(wttest.WiredTigerTestCase): stat_cursor.close() return val - def test_modify_insert_to_las(self): + def test_modify_insert_to_hs(self): uri = "table:test_hs10" uri2 = "table:test_hs10_otherdata" create_params = 'value_format=S,key_format=i' diff --git a/src/third_party/wiredtiger/test/suite/test_hs12.py b/src/third_party/wiredtiger/test/suite/test_hs12.py new file mode 100644 index 00000000000..7403126fd5e --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_hs12.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. 
We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest, time + +def timestamp_str(t): + return '%x' % t + +# test_hs12.py +# Verify we can correctly append modifies to the end of string values +class test_hs12(wttest.WiredTigerTestCase): + conn_config = 'cache_size=2MB,statistics=(all),eviction=(threads_max=1)' + session_config = 'isolation=snapshot' + + def test_modify_append_to_string(self): + uri = "table:test_reverse_modify01_notimestamp" + create_params = 'value_format=S,key_format=i' + value1 = 'abcedfghijklmnopqrstuvwxyz' * 5 + value2 = 'b' * 100 + valuebig = 'e' * 1000 + self.session.create(uri, create_params) + cursor = self.session.open_cursor(uri) + + session2 = self.setUpSessionOpen(self.conn) + session2.create(uri, create_params) + cursor2 = session2.open_cursor(uri) + + # Insert a full value. + self.session.begin_transaction() + cursor[1] = value1 + cursor[2] = value1 + self.session.commit_transaction() + + # Insert a modify + self.session.begin_transaction() + cursor.set_key(1) + cursor.modify([wiredtiger.Modify('A', 130, 0)]) + cursor.set_key(2) + cursor.modify([wiredtiger.Modify('AB', 0, 0)]) + self.session.commit_transaction() + + # Validate that we do see the correct value. 
+ session2.begin_transaction() + cursor2.set_key(1) + cursor2.search() + self.assertEquals(cursor2.get_value(), value1 + 'A') + cursor2.set_key(2) + cursor2.search() + self.assertEquals(cursor2.get_value(), 'AB' + value1) + session2.commit_transaction() + + # Begin transaction on session 2 so it sees the current snap_min and snap_max + session2.begin_transaction() + + # reset the cursor + cursor2.reset() + + # Insert one more value + self.session.begin_transaction() + cursor.set_key(1) + cursor[1] = value2 + self.session.commit_transaction() + + # Insert a whole bunch of data into the table to force wiredtiger to evict data + # from the previous table. + self.session.begin_transaction() + for i in range(2, 10000): + cursor[i] = valuebig + self.session.commit_transaction() + + # Try to find the value we saw earlier + cursor2.set_key(1) + cursor2.search() + self.assertEquals(cursor2.get_value(), value1 + 'A') + cursor2.set_key(2) + cursor2.search() + self.assertEquals(cursor2.get_value(), 'AB' + value1) + +if __name__ == '__main__': + wttest.run()
\ No newline at end of file diff --git a/src/third_party/wiredtiger/test/suite/test_prepare_hs01.py b/src/third_party/wiredtiger/test/suite/test_prepare_hs01.py index aa1a8e875ee..494193e3da4 100644 --- a/src/third_party/wiredtiger/test/suite/test_prepare_hs01.py +++ b/src/third_party/wiredtiger/test/suite/test_prepare_hs01.py @@ -27,7 +27,7 @@ # OTHER DEALINGS IN THE SOFTWARE. from helper import copy_wiredtiger_home -import unittest, wiredtiger, wttest +import wiredtiger, wttest from wtdataset import SimpleDataSet def timestamp_str(t): @@ -39,6 +39,19 @@ class test_prepare_hs01(wttest.WiredTigerTestCase): # Force a small cache. conn_config = 'cache_size=50MB' + def check(self, uri, ds, nrows, nsessions, nkeys, read_ts, expected_value, not_expected_value): + cursor = self.session.open_cursor(uri) + self.session.begin_transaction('read_timestamp=' + timestamp_str(read_ts)) + for i in range(1, nsessions * nkeys): + cursor.set_key(ds.key(nrows + i)) + self.assertEquals(cursor.search(), 0) + # Correctness Test - commit_value should be visible + self.assertEquals(cursor.get_value(), expected_value) + # Correctness Test - prepare_value should NOT be visible + self.assertNotEquals(cursor.get_value(), not_expected_value) + cursor.close() + self.session.commit_transaction() + def prepare_updates(self, uri, ds, nrows, nsessions, nkeys): # Update a large number of records in their individual transactions. # This will force eviction and start history store eviction of committed @@ -84,16 +97,9 @@ class test_prepare_hs01(wttest.WiredTigerTestCase): self.assertEquals(cursors[j].insert(), 0) sessions[j].prepare_transaction('prepare_timestamp=' + timestamp_str(2)) - # Re-read the original versions of all the data. To do this, the pages - # that were just evicted need to be read back. 
This ensures reading - # prepared updates from the history store - cursor = self.session.open_cursor(uri) - self.session.begin_transaction('read_timestamp=' + timestamp_str(1)) - for i in range(1, nsessions * nkeys): - cursor.set_key(ds.key(nrows + i)) - self.assertEquals(cursor.search(), 0) - cursor.close() - self.session.commit_transaction() + # Re-read the original versions of all the data. This ensures reading + # original versions from the history store + self.check(uri, ds, nrows, nsessions, nkeys, 1, bigvalue1, bigvalue2) # Close all cursors and sessions, this will cause prepared updates to be # rollback-ed @@ -101,7 +107,11 @@ class test_prepare_hs01(wttest.WiredTigerTestCase): cursors[j].close() sessions[j].close() - @unittest.skip("Temporarily disabled") + # Re-read the original versions of all the data. This ensures reading + # original versions from the data store as the prepared updates are + # aborted + self.check(uri, ds, nrows, nsessions, nkeys, 2, bigvalue1, bigvalue2) + def test_prepare_hs(self): # Create a small table. uri = "table:test_prepare_hs01" diff --git a/src/third_party/wiredtiger/test/suite/test_prepare_hs03.py b/src/third_party/wiredtiger/test/suite/test_prepare_hs03.py index 847bc0977c8..946a6597447 100644 --- a/src/third_party/wiredtiger/test/suite/test_prepare_hs03.py +++ b/src/third_party/wiredtiger/test/suite/test_prepare_hs03.py @@ -27,7 +27,7 @@ # OTHER DEALINGS IN THE SOFTWARE. 
from helper import copy_wiredtiger_home -import unittest, wiredtiger, wttest +import wiredtiger, wttest from wtdataset import SimpleDataSet import os, shutil from wtscenario import make_scenarios @@ -183,7 +183,6 @@ class test_prepare_hs03(wttest.WiredTigerTestCase): # and call verify self.corrupt_salvage_verify() - @unittest.skip("Temporarily disabled") def test_prepare_hs(self): nrows = 100 ds = SimpleDataSet(self, self.uri, nrows, key_format="S", value_format='u') diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py index 36daf16573f..b16ee76a1f2 100755 --- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py @@ -150,7 +150,7 @@ class test_rollback_to_stable05(test_rollback_to_stable_base): else: self.assertEqual(pages_visited, 0) self.assertEqual(upd_aborted, 0) - self.assertEqual(hs_removed, nrows * 3 * 2) + self.assertEqual(hs_removed, 0) if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_txn19.py b/src/third_party/wiredtiger/test/suite/test_txn19.py index cd68c0c9931..7ba1e60f68d 100755 --- a/src/third_party/wiredtiger/test/suite/test_txn19.py +++ b/src/third_party/wiredtiger/test/suite/test_txn19.py @@ -423,7 +423,6 @@ class test_txn19_meta(wttest.WiredTigerTestCase, suite_subprocess): not_salvageable = [ "removal:WiredTiger.turtle", "removal:WiredTiger.wt", - "removal:WiredTigerHS.wt", "truncate:WiredTiger.wt", "truncate:WiredTigerHS.wt", "zero:WiredTiger.wt", @@ -482,13 +481,7 @@ class test_txn19_meta(wttest.WiredTigerTestCase, suite_subprocess): closeconn=False) if expect_fail: - errmsg = 'WT_TRY_SALVAGE: database corruption detected' - if self.filename == 'WiredTigerHS.wt': - if self.kind == 'removal': - errmsg = 'handle-open' - elif self.kind == 'truncate': - errmsg = 'file size=0, alloc size=4096' - 
self.check_file_contains_one_of(errfile, [errmsg]) + self.check_file_contains_one_of(errfile, ['WT_TRY_SALVAGE: database corruption detected']) def test_corrupt_meta(self): errfile = 'list.err' @@ -546,13 +539,10 @@ class test_txn19_meta(wttest.WiredTigerTestCase, suite_subprocess): # an error during the wiredtiger_open. But the nature of the # messages produced during the error is variable by which case # it is, and even variable from system to system. - if self.filename == "WiredTigerHS.wt": - self.run_wt_and_check(salvagedir, salvagedir + '_' + errfile, salvagedir + '_' + outfile, True) - else: - with self.expectedStdoutPattern('.'): - self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.reopen_conn(salvagedir, salvage_config), - '/.*/') + with self.expectedStdoutPattern('.'): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.reopen_conn(salvagedir, salvage_config), + '/.*/') if __name__ == '__main__': wttest.run() |