summary refs log tree commit diff
diff options
context:
space:
mode:
author Luke Chen <luke.chen@mongodb.com> 2020-03-24 16:34:23 +1100
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-03-24 06:16:46 +0000
commit b7d015ba4e8730ddcfcf7c5258b9ffa5cfb0949a (patch)
tree ae407d81714508df63eba1a1d536bf4e547c01e2
parent 81c6113198d2f5debf3da38a42bf61d7a079de2e (diff)
download mongo-b7d015ba4e8730ddcfcf7c5258b9ffa5cfb0949a.tar.gz
Import wiredtiger: 4500a8ed93af91b15901d00a9f9c2587f94275f9 from branch mongodb-4.4
ref: f6ab94b43b..4500a8ed93
for: 4.4.0-rc0

WT-5616 Coverity: Memory leak
WT-5669 Prepare support with durable history: backport data format changes to 4.2
WT-5672 Prepare support with durable history: implement rollback_to_stable
WT-5674 Prepare support with durable history: add/debug functional tests
WT-5754 Ensure the value is globally visible before encode it into WT_ROW
WT-5813 Fix heap-use-after-free in __split_parent
WT-5855 Clean up operation tracking documentation
WT-5880 test/format backup missing expected log file
WT-5885 Improve WT rollback to stable verbose output for better debugging
WT-5887 Turn off verbose output when test/format output is redirected to a file
WT-5890 Bump log/WT version to facilitate upgrade/downgrade floor for MongoDB
WT-5894 Don't persist durable timestamp if it is same as commit timestamp
WT-5896 Recovery sometimes attempts rollback to stable with an absent history store file
WT-5898 Fix memory leak in verifying history store
WT-5899 Restore WT_PAGE_LAS_UPDATE on-disk page flag value to avoid break backward compatibility
WT-5900 Display failed version numbers in error message for version mismatch failures
WT-5910 Fix start_durable_ts when handling out-of-order updates
-rw-r--r-- src/third_party/wiredtiger/dist/s_clang-scan.diff | 4
-rw-r--r-- src/third_party/wiredtiger/dist/s_define.list | 2
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_curnext.c | 38
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_curprev.c | 38
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_page.c | 6
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_rebalance.c | 26
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_slvg.c | 3
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_vrfy.c | 80
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c | 9
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_api.c | 5
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_log.c | 53
-rw-r--r-- src/third_party/wiredtiger/src/docs/devdoc-optrack.dox | 51
-rw-r--r-- src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_lgnd.png | bin 0 -> 69976 bytes
-rw-r--r-- src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_nav_bar.png | bin 0 -> 45102 bytes
-rw-r--r-- src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_threads_waiting.png | bin 0 -> 264187 bytes
-rw-r--r-- src/third_party/wiredtiger/src/docs/images/operation_tracking_images/optrack-t2-slow-lsm-worker-thread.png | bin 0 -> 133261 bytes
-rw-r--r-- src/third_party/wiredtiger/src/docs/images/operation_tracking_images/outlier_histograms.png | bin 0 -> 124071 bytes
-rw-r--r-- src/third_party/wiredtiger/src/docs/images/operation_tracking_images/session_1_dead_period.png | bin 0 -> 67242 bytes
-rw-r--r-- src/third_party/wiredtiger/src/docs/images/operation_tracking_images/wt_cache_eviction_worker_outliers.png | bin 0 -> 126937 bytes
-rw-r--r-- src/third_party/wiredtiger/src/docs/images/operation_tracking_images/wt_cache_eviction_worker_over_15ms.png | bin 0 -> 85736 bytes
-rw-r--r-- src/third_party/wiredtiger/src/docs/spell.ok | 2
-rw-r--r-- src/third_party/wiredtiger/src/history/hs.c | 1
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h | 1
-rw-r--r-- src/third_party/wiredtiger/src/include/cell.i | 384
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h | 62
-rw-r--r-- src/third_party/wiredtiger/src/include/log.h | 13
-rw-r--r-- src/third_party/wiredtiger/src/include/meta.h | 3
-rw-r--r-- src/third_party/wiredtiger/src/include/reconcile.h | 9
-rw-r--r-- src/third_party/wiredtiger/src/include/reconcile.i | 51
-rw-r--r-- src/third_party/wiredtiger/src/meta/meta_ckpt.c | 27
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_col.c | 174
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_row.c | 125
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_visibility.c | 20
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c | 46
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c | 7
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_ckpt.c | 11
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_recover.c | 30
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 162
-rw-r--r-- src/third_party/wiredtiger/test/format/backup.c | 25
-rw-r--r-- src/third_party/wiredtiger/test/format/t.c | 19
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_compat01.py | 15
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_compat02.py | 23
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_compat03.py | 18
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_compat04.py | 13
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_prepare_hs03.py | 209
47 files changed, 1122 insertions, 647 deletions
diff --git a/src/third_party/wiredtiger/dist/s_clang-scan.diff b/src/third_party/wiredtiger/dist/s_clang-scan.diff
index fc5ce463cd9..7530bd2fbef 100644
--- a/src/third_party/wiredtiger/dist/s_clang-scan.diff
+++ b/src/third_party/wiredtiger/dist/s_clang-scan.diff
@@ -14,10 +14,6 @@ src/conn/conn_capacity.c:291:5: warning: Value stored to 'capacity' is never rea
capacity = steal_capacity = 0;
^ ~~~~~~~~~~~~~~~~~~
1 warning generated.
-src/reconcile/rec_col.c:1079:25: warning: Null pointer passed as an argument to a 'nonnull' parameter
- memcmp(last.value->data, data, size) == 0))) {
- ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-1 warning generated.
In file included from src/reconcile/rec_write.c:9:
In file included from ./src/include/wt_internal.h:423:
./src/include/mutex.i:184:16: warning: Null pointer passed as an argument to a 'nonnull' parameter
diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list
index 20512c31929..6c67f53a7ee 100644
--- a/src/third_party/wiredtiger/dist/s_define.list
+++ b/src/third_party/wiredtiger/dist/s_define.list
@@ -47,6 +47,8 @@ WT_LOG_V3_MAJOR
WT_LOG_V3_MINOR
WT_LOG_V4_MAJOR
WT_LOG_V4_MINOR
+WT_LOG_V5_MAJOR
+WT_LOG_V5_MINOR
WT_OPTRACK_BUFSIZE
WT_OPTRACK_MAXRECS
WT_PACKED_STRUCT_BEGIN
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index cc68dee60e9..9e803b502df 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "f6ab94b43bc56ce16ba0192ed15d1b602e9f2017"
+ "commit": "4500a8ed93af91b15901d00a9f9c2587f94275f9"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index 6fd6cb2e89c..35c3425d7ee 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -63,9 +63,22 @@ restart_read:
cbt->v = 0;
cbt->iface.value.data = &cbt->v;
} else {
- if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK) && upd->type != WT_UPDATE_TOMBSTONE)
- return (__wt_value_return(cbt, upd));
- cbt->iface.value.data = upd->data;
+ /*
+ * If this update has been restored from the disk, it needs to be freed after copying it
+ * to the user cursor.
+ */
+ if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) {
+ switch (upd->type) {
+ case WT_UPDATE_TOMBSTONE:
+ cbt->iface.value.data = upd->data;
+ __wt_free_update_list(session, &upd);
+ break;
+ default:
+ return (__wt_value_return(cbt, upd));
+ }
+ }
+ if (upd != NULL)
+ cbt->iface.value.data = upd->data;
}
}
cbt->iface.value.size = 1;
@@ -124,9 +137,22 @@ restart_read:
cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
cbt->iface.value.data = &cbt->v;
} else {
- if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK) && upd->type != WT_UPDATE_TOMBSTONE)
- return (__wt_value_return(cbt, upd));
- cbt->iface.value.data = upd->data;
+ /*
+ * If this update has been restored from the disk, it needs to be freed after copying it to
+ * the user cursor.
+ */
+ if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) {
+ switch (upd->type) {
+ case WT_UPDATE_TOMBSTONE:
+ cbt->iface.value.data = upd->data;
+ __wt_free_update_list(session, &upd);
+ break;
+ default:
+ return (__wt_value_return(cbt, upd));
+ }
+ }
+ if (upd != NULL)
+ cbt->iface.value.data = upd->data;
}
cbt->iface.value.size = 1;
return (0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index c84fc94e686..f762d035a39 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -203,9 +203,22 @@ restart_read:
cbt->v = 0;
cbt->iface.value.data = &cbt->v;
} else {
- if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK) && upd->type != WT_UPDATE_TOMBSTONE)
- return (__wt_value_return(cbt, upd));
- cbt->iface.value.data = upd->data;
+ /*
+ * If this update has been restored from the disk, it needs to be freed after copying it
+ * to the user cursor.
+ */
+ if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) {
+ switch (upd->type) {
+ case WT_UPDATE_TOMBSTONE:
+ cbt->iface.value.data = upd->data;
+ __wt_free_update_list(session, &upd);
+ break;
+ default:
+ return (__wt_value_return(cbt, upd));
+ }
+ }
+ if (upd != NULL)
+ cbt->iface.value.data = upd->data;
}
}
cbt->iface.value.size = 1;
@@ -264,9 +277,22 @@ restart_read:
cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
cbt->iface.value.data = &cbt->v;
} else {
- if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK) && upd->type != WT_UPDATE_TOMBSTONE)
- return (__wt_value_return(cbt, upd));
- cbt->iface.value.data = upd->data;
+ /*
+ * If this update has been restored from the disk, it needs to be freed after copying it to
+ * the user cursor.
+ */
+ if (F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK)) {
+ switch (upd->type) {
+ case WT_UPDATE_TOMBSTONE:
+ cbt->iface.value.data = upd->data;
+ __wt_free_update_list(session, &upd);
+ break;
+ default:
+ return (__wt_value_return(cbt, upd));
+ }
+ }
+ if (upd != NULL)
+ cbt->iface.value.data = upd->data;
}
cbt->iface.value.size = 1;
return (0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index ca776d91918..f6e49e27557 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -545,10 +545,8 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_BTREE *btree;
WT_CELL_UNPACK unpack;
WT_ROW *rip;
- WT_TXN_GLOBAL *txn_global;
btree = S2BT(session);
- txn_global = &S2C(session)->txn_global;
/* Walk the page, building indices. */
rip = page->pg_row;
@@ -578,8 +576,8 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
* the value is globally visible at the point in time where we read the page into cache.
*/
if (!btree->huffman_value && unpack.stop_txn == WT_TXN_MAX &&
- unpack.stop_ts == WT_TS_MAX && txn_global->has_oldest_timestamp &&
- unpack.start_ts <= txn_global->oldest_timestamp)
+ unpack.stop_ts == WT_TS_MAX &&
+ __wt_txn_visible_all(session, unpack.start_txn, unpack.start_ts))
__wt_row_leaf_value_set(page, rip - 1, &unpack);
break;
case WT_CELL_VALUE_OVFL:
diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c
index eae6857dca3..f31754675f3 100644
--- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c
+++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c
@@ -56,8 +56,8 @@ __rebalance_discard(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs)
* Add a new entry to the list of leaf pages.
*/
static int
-__rebalance_leaf_append(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, const uint8_t *key,
- size_t key_len, WT_CELL_UNPACK *unpack, WT_REBALANCE_STUFF *rs)
+__rebalance_leaf_append(WT_SESSION_IMPL *session, const uint8_t *key, size_t key_len,
+ WT_CELL_UNPACK *unpack, WT_REBALANCE_STUFF *rs)
{
WT_ADDR *copy_addr;
WT_REF *copy;
@@ -76,10 +76,10 @@ __rebalance_leaf_append(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, con
WT_RET(__wt_calloc_one(session, &copy_addr));
copy->addr = copy_addr;
- /* FIXME-prepare-support: use durable timestamps from unpack struct */
- copy_addr->stop_durable_ts = durable_ts;
+ copy_addr->start_durable_ts = unpack->newest_start_durable_ts;
copy_addr->oldest_start_ts = unpack->oldest_start_ts;
copy_addr->oldest_start_txn = unpack->oldest_start_txn;
+ copy_addr->stop_durable_ts = unpack->newest_stop_durable_ts;
copy_addr->newest_stop_ts = unpack->newest_stop_ts;
copy_addr->newest_stop_txn = unpack->newest_stop_txn;
WT_RET(__wt_memdup(session, unpack->data, unpack->size, &copy_addr->addr));
@@ -188,8 +188,7 @@ __rebalance_free_original(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs)
* Walk a column-store page and its descendants.
*/
static int
-__rebalance_col_walk(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, const WT_PAGE_HEADER *dsk,
- WT_REBALANCE_STUFF *rs)
+__rebalance_col_walk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs)
{
WT_BTREE *btree;
WT_CELL_UNPACK unpack;
@@ -213,14 +212,14 @@ __rebalance_col_walk(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, const
case WT_CELL_ADDR_INT:
/* An internal page: read it and recursively walk it. */
WT_ERR(__wt_bt_read(session, buf, unpack.data, unpack.size));
- WT_ERR(__rebalance_col_walk(session, unpack.newest_stop_durable_ts, buf->data, rs));
+ WT_ERR(__rebalance_col_walk(session, buf->data, rs));
__wt_verbose(session, WT_VERB_REBALANCE, "free-list append internal page: %s",
__wt_addr_string(session, unpack.data, unpack.size, rs->tmp1));
WT_ERR(__rebalance_fl_append(session, unpack.data, unpack.size, rs));
break;
case WT_CELL_ADDR_LEAF:
case WT_CELL_ADDR_LEAF_NO:
- WT_ERR(__rebalance_leaf_append(session, durable_ts, NULL, 0, &unpack, rs));
+ WT_ERR(__rebalance_leaf_append(session, NULL, 0, &unpack, rs));
break;
default:
WT_ERR(__wt_illegal_value(session, unpack.type));
@@ -264,8 +263,7 @@ __rebalance_row_leaf_key(WT_SESSION_IMPL *session, const uint8_t *addr, size_t a
* Walk a row-store page and its descendants.
*/
static int
-__rebalance_row_walk(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, const WT_PAGE_HEADER *dsk,
- WT_REBALANCE_STUFF *rs)
+__rebalance_row_walk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs)
{
WT_BTREE *btree;
WT_CELL_UNPACK key, unpack;
@@ -327,7 +325,7 @@ __rebalance_row_walk(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, const
/* Read and recursively walk the page. */
WT_ERR(__wt_bt_read(session, buf, unpack.data, unpack.size));
- WT_ERR(__rebalance_row_walk(session, unpack.newest_stop_durable_ts, buf->data, rs));
+ WT_ERR(__rebalance_row_walk(session, buf->data, rs));
break;
case WT_CELL_ADDR_LEAF:
case WT_CELL_ADDR_LEAF_NO:
@@ -349,7 +347,7 @@ __rebalance_row_walk(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, const
p = key.data;
len = key.size;
}
- WT_ERR(__rebalance_leaf_append(session, durable_ts, p, len, &unpack, rs));
+ WT_ERR(__rebalance_leaf_append(session, p, len, &unpack, rs));
first_cell = false;
break;
@@ -407,10 +405,10 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[])
*/
switch (rs->type) {
case WT_PAGE_ROW_INT:
- WT_ERR(__rebalance_row_walk(session, WT_TS_MAX, ref->page->dsk, rs));
+ WT_ERR(__rebalance_row_walk(session, ref->page->dsk, rs));
break;
case WT_PAGE_COL_INT:
- WT_ERR(__rebalance_col_walk(session, WT_TS_MAX, ref->page->dsk, rs));
+ WT_ERR(__rebalance_col_walk(session, ref->page->dsk, rs));
break;
default:
WT_ERR(__wt_illegal_value(session, rs->type));
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
index cc934937681..a225eabc938 100644
--- a/src/third_party/wiredtiger/src/btree/bt_slvg.c
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -186,9 +186,10 @@ __slvg_checkpoint(WT_SESSION_IMPL *session, WT_REF *root)
__wt_seconds(session, &ckptbase->sec);
WT_ERR(__wt_metadata_search(session, dhandle->name, &config));
WT_ERR(__wt_meta_block_metadata(session, config, ckptbase));
- ckptbase->newest_durable_ts = WT_TS_NONE;
+ ckptbase->start_durable_ts = WT_TS_NONE;
ckptbase->oldest_start_ts = WT_TS_NONE;
ckptbase->oldest_start_txn = WT_TXN_NONE;
+ ckptbase->stop_durable_ts = WT_TS_NONE;
ckptbase->newest_stop_ts = WT_TS_MAX;
ckptbase->newest_stop_txn = WT_TXN_MAX;
ckptbase->write_gen = btree->write_gen;
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index fa58bc17caf..7e369fcaa0f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -854,7 +854,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t
parent_entries, result_entries, deleted_entries);
err:
- __wt_scr_free(session, &scr);
/*
* A note on error handling: if we completed the split, return success, nothing really bad can
* have happened, and our caller has to proceed with the split.
@@ -890,6 +889,7 @@ err:
}
break;
}
+ __wt_scr_free(session, &scr);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
index 4a7eb1a697b..229d7d51ce0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -257,7 +257,6 @@ __verify_key_hs(WT_SESSION_IMPL *session, WT_ITEM *key, WT_CELL_UNPACK *unpack,
{
WT_BTREE *btree;
WT_CURSOR *hs_cursor;
- WT_DECL_ITEM(hs_key);
WT_DECL_RET;
wt_timestamp_t newer_start_ts, older_start_ts, older_stop_ts;
uint64_t hs_counter;
@@ -279,8 +278,6 @@ __verify_key_hs(WT_SESSION_IMPL *session, WT_ITEM *key, WT_CELL_UNPACK *unpack,
older_stop_ts = 0;
is_owner = false;
- WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
-
/*
* Open a history store cursor positioned at the end of the data store key (the newest record)
* and iterate backwards until we reach a different key or btree.
@@ -295,12 +292,12 @@ __verify_key_hs(WT_SESSION_IMPL *session, WT_ITEM *key, WT_CELL_UNPACK *unpack,
WT_ERR(hs_cursor->prev(hs_cursor));
for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &older_start_ts, &hs_counter));
+ WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, vs->tmp1, &older_start_ts, &hs_counter));
if (hs_btree_id != btree->id)
break;
- WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp));
+ WT_ERR(__wt_compare(session, NULL, vs->tmp1, key, &cmp));
if (cmp != 0)
break;
@@ -319,7 +316,8 @@ __verify_key_hs(WT_SESSION_IMPL *session, WT_ITEM *key, WT_CELL_UNPACK *unpack,
", Key %s has a overlap of "
"timestamp ranges between history store stop timestamp %s being "
"newer than a more recent timestamp range having start timestamp %s",
- hs_btree_id, __wt_buf_set_printable(session, hs_key->data, hs_key->size, vs->tmp1),
+ hs_btree_id,
+ __wt_buf_set_printable(session, vs->tmp1->data, vs->tmp1->size, vs->tmp1),
__verify_timestamp_to_pretty_string(older_stop_ts, ts_string[0]),
__verify_timestamp_to_pretty_string(newer_start_ts, ts_string[1]));
}
@@ -338,7 +336,6 @@ err:
if (ret == WT_NOTFOUND)
ret = 0;
- __wt_scr_free(session, &hs_key);
WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner));
return (ret);
@@ -435,9 +432,10 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
* Create a fake, unpacked parent cell for the tree based on the checkpoint information.
*/
memset(&addr_unpack, 0, sizeof(addr_unpack));
- addr_unpack.newest_stop_durable_ts = ckpt->newest_durable_ts;
+ addr_unpack.newest_start_durable_ts = ckpt->start_durable_ts;
addr_unpack.oldest_start_ts = ckpt->oldest_start_ts;
addr_unpack.oldest_start_txn = ckpt->oldest_start_txn;
+ addr_unpack.newest_stop_durable_ts = ckpt->stop_durable_ts;
addr_unpack.newest_stop_ts = ckpt->newest_stop_ts;
addr_unpack.newest_stop_txn = ckpt->newest_stop_txn;
addr_unpack.raw = WT_CELL_ADDR_INT;
@@ -584,9 +582,10 @@ int
__wt_verify_history_store_tree(WT_SESSION_IMPL *session, const char *uri)
{
WT_CURSOR *cursor, *data_cursor;
+ WT_DECL_ITEM(hs_key);
+ WT_DECL_ITEM(prev_hs_key);
WT_DECL_ITEM(tmp);
WT_DECL_RET;
- WT_ITEM hs_key, prev_hs_key;
wt_timestamp_t hs_start_ts;
uint64_t hs_counter;
uint32_t btree_id, btree_id_given_uri, session_flags, prev_btree_id;
@@ -594,15 +593,17 @@ __wt_verify_history_store_tree(WT_SESSION_IMPL *session, const char *uri)
char *uri_itr;
bool is_owner;
+ cursor = data_cursor = NULL;
session_flags = 0;
- data_cursor = NULL;
- WT_CLEAR(prev_hs_key);
- WT_CLEAR(hs_key);
btree_id_given_uri = 0; /* [-Wconditional-uninitialized] */
prev_btree_id = 0; /* [-Wconditional-uninitialized] */
+ is_owner = false; /* [-Wconditional-uninitialized] */
uri_itr = NULL;
- WT_RET(__wt_hs_cursor(session, &session_flags, &is_owner));
+ WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &prev_hs_key));
+
+ WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner));
cursor = session->hs_cursor;
/*
@@ -618,7 +619,7 @@ __wt_verify_history_store_tree(WT_SESSION_IMPL *session, const char *uri)
* Position the cursor at the first record of the specified btree, or one after. It is
* possible there are no records in the history store for this btree.
*/
- cursor->set_key(cursor, btree_id_given_uri, &hs_key, 0, 0, 0, 0);
+ cursor->set_key(cursor, btree_id_given_uri, hs_key, 0, 0, 0, 0);
ret = cursor->search_near(cursor, &exact);
if (ret == 0 && exact < 0)
ret = cursor->next(cursor);
@@ -627,7 +628,7 @@ __wt_verify_history_store_tree(WT_SESSION_IMPL *session, const char *uri)
/* We have the history store cursor positioned at the first record that we want to verify. */
for (; ret == 0; ret = cursor->next(cursor)) {
- WT_ERR(cursor->get_key(cursor, &btree_id, &hs_key, &hs_start_ts, &hs_counter));
+ WT_ERR(cursor->get_key(cursor, &btree_id, hs_key, &hs_start_ts, &hs_counter));
/* When limiting our verification to a uri, bail out if the btree-id doesn't match. */
if (uri != NULL && btree_id != btree_id_given_uri)
@@ -659,27 +660,27 @@ __wt_verify_history_store_tree(WT_SESSION_IMPL *session, const char *uri)
WT_ERR_MSG(session, ret, "Unable to find btree-id %" PRIu32
" in the metadata file for the associated "
"history store key %s",
- btree_id, __wt_buf_set_printable(session, hs_key.data, hs_key.size, tmp));
+ btree_id, __wt_buf_set_printable(session, hs_key->data, hs_key->size, tmp));
}
WT_ERR(__wt_open_cursor(session, uri_itr, NULL, NULL, &data_cursor));
F_SET(data_cursor, WT_CURSOR_RAW_OK);
} else {
- WT_ERR(__wt_compare(session, NULL, &hs_key, &prev_hs_key, &cmp));
+ WT_ERR(__wt_compare(session, NULL, hs_key, prev_hs_key, &cmp));
if (cmp == 0)
continue;
}
- WT_ERR(__wt_buf_set(session, &prev_hs_key, hs_key.data, hs_key.size));
+ WT_ERR(__wt_buf_set(session, prev_hs_key, hs_key->data, hs_key->size));
prev_btree_id = btree_id;
- data_cursor->set_key(data_cursor, &hs_key);
+ data_cursor->set_key(data_cursor, hs_key);
ret = data_cursor->search(data_cursor);
if (ret == WT_NOTFOUND) {
WT_ERR(__wt_scr_alloc(session, 0, &tmp));
WT_ERR_MSG(session, WT_NOTFOUND,
"In the URI %s, the associated history store key %s cannot be found in the data "
"store",
- uri_itr, __wt_buf_set_printable(session, hs_key.data, hs_key.size, tmp));
+ uri_itr, __wt_buf_set_printable(session, hs_key->data, hs_key->size, tmp));
}
WT_ERR(ret);
}
@@ -687,7 +688,11 @@ __wt_verify_history_store_tree(WT_SESSION_IMPL *session, const char *uri)
err:
if (data_cursor != NULL)
WT_TRET(data_cursor->close(data_cursor));
- WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner));
+ if (cursor != NULL)
+ WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner));
+
+ __wt_scr_free(session, &hs_key);
+ __wt_scr_free(session, &prev_hs_key);
__wt_scr_free(session, &tmp);
__wt_free(session, uri_itr);
return (ret);
@@ -1262,15 +1267,26 @@ __verify_page_cell(
unpack.oldest_start_txn, unpack.newest_stop_txn);
}
- /* FIXME-prepare-support: check newest start durable timestamp as well. */
- WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "newest durable",
- unpack.newest_stop_durable_ts, "newest durable", addr_unpack->newest_stop_durable_ts,
- false, vs));
+ /*
+ * FIXME-prepare-support: Enable verification once all durable is finished.
+ *
+ * WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start durable",
+ * unpack.newest_start_durable_ts, "start durable",
+ * addr_unpack->newest_start_durable_ts, false, vs));
+ */
WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "oldest start",
unpack.oldest_start_ts, "oldest start", addr_unpack->oldest_start_ts, true, vs));
WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "oldest start",
unpack.oldest_start_txn, "oldest start", addr_unpack->oldest_start_txn, true, dsk,
vs));
+
+ /*
+ * FIXME-prepare-support: Enable verification once all durable is finished.
+ *
+ * WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop durable",
+ * unpack.newest_stop_durable_ts, "stop durable", addr_unpack->newest_stop_durable_ts,
+ * false, vs));
+ */
WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "newest stop",
unpack.newest_stop_ts, "newest stop", addr_unpack->newest_stop_ts, false, vs));
WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "newest stop",
@@ -1305,10 +1321,24 @@ __verify_page_cell(
cell_num - 1, __verify_addr_string(session, ref, vs->tmp1), unpack.start_txn,
unpack.stop_txn);
+ /*
+ * FIXME-prepare-support: Enable verification once all durable is finished.
+ *
+ * WT_RET(
+ * __verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.durable_start_ts,
+ * "durable start", addr_unpack->newest_start_durable_ts, true, vs));
+ */
WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.start_ts,
"oldest start", addr_unpack->oldest_start_ts, true, vs));
WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "start", unpack.start_txn,
"oldest start", addr_unpack->oldest_start_txn, true, dsk, vs));
+ /*
+ * FIXME-prepare-support: Enable verification once all durable is finished.
+ *
+ * WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start",
+ * unpack.durable_stop_ts,
+ * "durable stop", addr_unpack->newest_stop_durable_ts, true, vs));
+ */
WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop", unpack.stop_ts,
"newest stop", addr_unpack->newest_stop_ts, false, vs));
WT_RET(__verify_txn_addr_cmp(session, ref, cell_num - 1, "stop", unpack.stop_txn,
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
index 6313bfa45f5..fb8cff2d9a6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -110,6 +110,8 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_H
}
if (LF_ISSET(WT_PAGE_ENCRYPTED))
LF_CLR(WT_PAGE_ENCRYPTED);
+ if (LF_ISSET(WT_PAGE_UNUSED))
+ LF_CLR(WT_PAGE_UNUSED);
if (flags != 0)
WT_RET_VRFY(session, "page at %s has invalid flags set: 0x%" PRIx8, tag, flags);
@@ -296,13 +298,14 @@ __verify_dsk_validity(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t
if (addr == NULL)
break;
- /* FIXME-prepare-support: check newest start durable timestamp as well. */
- WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "newest durable",
- unpack->newest_stop_durable_ts, "newest durable", addr->stop_durable_ts, false, tag));
+ WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "start durable",
+ unpack->newest_start_durable_ts, "start durable", addr->start_durable_ts, false, tag));
WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "oldest start",
unpack->oldest_start_ts, "oldest start", addr->oldest_start_ts, true, tag));
WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "oldest start",
unpack->oldest_start_txn, "oldest start", addr->oldest_start_txn, true, tag, dsk));
+ WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop durable",
+ unpack->newest_stop_durable_ts, "stop durable", addr->stop_durable_ts, false, tag));
WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "newest stop",
unpack->newest_stop_ts, "newest stop", addr->newest_stop_ts, false, tag));
WT_RET(__verify_dsk_txn_addr_cmp(session, cell_num - 1, "newest stop",
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index b1cabb5e098..30c58f3571a 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1298,8 +1298,9 @@ __conn_config_check_version(WT_SESSION_IMPL *session, const char *config)
if (vmajor.val > WIREDTIGER_VERSION_MAJOR ||
(vmajor.val == WIREDTIGER_VERSION_MAJOR && vminor.val > WIREDTIGER_VERSION_MINOR))
WT_RET_MSG(session, ENOTSUP,
- "WiredTiger configuration is from an incompatible release "
- "of the WiredTiger engine");
+ "WiredTiger configuration is from an incompatible release of the WiredTiger engine, "
+ "configuration major, minor of (%" PRId64 ", %" PRId64 "), with build (%d, %d)",
+ vmajor.val, vminor.val, WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR);
return (0);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index 850c8393a5f..f09d5130aae 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -103,38 +103,53 @@ __logmgr_version(WT_SESSION_IMPL *session, bool reconfig)
* Note: downgrade in this context means the new version is not the latest possible version. It
* does not mean the direction of change from the release we may be running currently.
*/
- if (conn->compat_major < WT_LOG_V2_MAJOR) {
- new_version = 1;
- first_record = WT_LOG_END_HEADER;
- downgrade = true;
- } else if (conn->compat_major == WT_LOG_V2_MAJOR) {
- new_version = conn->compat_minor == WT_LOG_V2_MINOR ? 2 : 3;
- first_record = WT_LOG_END_HEADER + log->allocsize;
- downgrade = true;
- } else {
+ if (conn->compat_major == WT_LOG_V5_MAJOR) {
new_version = WT_LOG_VERSION;
first_record = WT_LOG_END_HEADER + log->allocsize;
downgrade = false;
+ } else if (conn->compat_major == WT_LOG_V4_MAJOR) {
+ if (conn->compat_minor == WT_LOG_V4_MINOR)
+ new_version = 4;
+ else if (conn->compat_minor > WT_LOG_V2_MINOR)
+ new_version = 3;
+ else
+ new_version = 2;
+ first_record = WT_LOG_END_HEADER + log->allocsize;
+ downgrade = true;
+ } else {
+ new_version = 1;
+ first_record = WT_LOG_END_HEADER;
+ downgrade = true;
}
/*
* Set up the maximum and minimum log version required if needed.
*/
if (conn->req_max_major != WT_CONN_COMPAT_NONE) {
- if (conn->req_max_major < WT_LOG_V2_MAJOR)
- conn->log_req_max = 1;
- else if (conn->req_max_major == WT_LOG_V2_MAJOR)
- conn->log_req_max = conn->req_max_minor == WT_LOG_V2_MINOR ? 2 : 3;
- else
+ if (conn->req_max_major == WT_LOG_V5_MAJOR)
conn->log_req_max = WT_LOG_VERSION;
+ else if (conn->req_max_major == WT_LOG_V4_MAJOR)
+ if (conn->req_max_minor == WT_LOG_V4_MINOR)
+ conn->log_req_max = 4;
+ else if (conn->req_max_minor > WT_LOG_V2_MINOR)
+ conn->log_req_max = 3;
+ else
+ conn->log_req_max = 2;
+ else
+ conn->log_req_max = 1;
}
if (conn->req_min_major != WT_CONN_COMPAT_NONE) {
- if (conn->req_min_major < WT_LOG_V2_MAJOR)
- conn->log_req_min = 1;
- else if (conn->req_min_major == WT_LOG_V2_MAJOR)
- conn->log_req_min = conn->req_min_minor == WT_LOG_V2_MINOR ? 2 : 3;
- else
+ if (conn->req_min_major == WT_LOG_V5_MAJOR)
conn->log_req_min = WT_LOG_VERSION;
+ else if (conn->req_min_major == WT_LOG_V4_MAJOR)
+ if (conn->req_min_minor == WT_LOG_V4_MINOR)
+ conn->log_req_min = 4;
+ else if (conn->req_min_minor > WT_LOG_V2_MINOR)
+ conn->log_req_min = 3;
+ else
+ conn->log_req_min = 2;
+ else
+ conn->log_req_min = 1;
}
/*
diff --git a/src/third_party/wiredtiger/src/docs/devdoc-optrack.dox b/src/third_party/wiredtiger/src/docs/devdoc-optrack.dox
index 360b3b6de47..f92238db424 100644
--- a/src/third_party/wiredtiger/src/docs/devdoc-optrack.dox
+++ b/src/third_party/wiredtiger/src/docs/devdoc-optrack.dox
@@ -2,9 +2,15 @@
# Overview
-This tutorial will walk you through using operation tracking on one of `wtperf`
-workloads: from preparing the workload for most effective data collection, to
-gathering, visualizing and interpreting the execution logs.
+Operation tracking tracks all API calls in WiredTiger as well as certain
+functions that are deemed important for performance, such as those in the
+eviction module. Tracking is performed by generating a log record when the
+execution enters and exits a tracked function. A log record contains a function
+name and its timestamp.
+
+This tutorial will walk you through using operation tracking in WiredTiger on
+one of the `wtperf` workloads: from preparing the workload for the most effective data
+collection, to gathering, visualizing and interpreting the execution logs.
## Why use operation tracking?
@@ -102,7 +108,7 @@ the path to your WiredTiger tree as WT. Suppose that the process ID that
generated the operation tracking files is 25660. Then you'd run the decode
script like so:
- % WT/tools/optrack/wt_optrack_decode.py -m optrack-map.0000025660 optrack.0000025660.00000000*
+ % python WT/tools/optrack/wt_optrack_decode.py -m optrack-map.0000025660 optrack.0000025660.00000000*
As the script runs you will see lots of output on the screen reporting the
progress through the parsing process. One kind of output you might see is
@@ -122,10 +128,25 @@ The "internal" files are the log files for WT internal sessions (such as
eviction threads). The "external" files are for the sessions created by the
client application.
-Now that the files have been decoded, it's time to visualize the data. This can
-be done using the script find-latency-spikes.py located in the `tools/optrack`
-directory of the WiredTiger tree. To process your text files from our example,
-you would run this script like so:
+## Preparing data for viewing
+
+There are two ways to view operation tracking data, besides manually plowing
+through the log files. The quickest way is to generate CSV files viewable with
+`t2` -- MongoDB's internal tool. With `t2`, you can view the frequency of
+tracked operations across time and across threads, like this:
+
+![Perf output visualized with t2](optrack-t2-slow-lsm-worker-thread.png)
+
+To produce a CSV file that can be loaded directly into `t2`, run the following
+command:
+
+ % WT/tools/optrack/optrack_to_t2.py optrack.0000025660.00000000*.txt
+
+The second option is to use a script that will help you locate latency spikes --
+invocations of operations that took an unusually long time -- and visually
+examine per-thread operation logs around those spikes. To obtain such a
+visualization, use the script `find-latency-spikes.py` located in the
+`tools/optrack` directory of the WiredTiger tree. To process the text files in our example, you run this script as follows:
% WT/tools/optrack/find-latency-spikes.py optrack.0000025660.00000000*.txt
@@ -179,7 +200,7 @@ x-axis shows the execution timeline (in nanoseconds). The y-axis shows how many
abnormally long executions of the function occurred during every period of the
execution.
-![](http://www.ece.ubc.ca/~sasha/TUTORIAL-DEMO-CONFIG/IMG/outlier_histograms.png)
+![](outlier_histograms.png)
You can click on outlier bars and look at the detailed visualization of the
period during which the abnormally long function invocations occurred. But
@@ -279,7 +300,7 @@ Now you can see the outlier charts complying with the configuration parameters
we supplied. For example, on the chart for the `__wt_cache_eviction_worker`, we
see only the intervals where that function took longer than 15 ms to complete.
-![](http://www.ece.ubc.ca/~sasha/TUTORIAL-DEMO-CONFIG/IMG/wt_cache_eviction_worker_outliers.png)
+![](wt_cache_eviction_worker_outliers.png)
Let's click on one of those intervals to examine what happened there. I am
going to click on the tallest bar in the chart, which will take me to a
@@ -294,12 +315,12 @@ within the bar you can navigate to other intervals. For example, if you wanted
to look at the execution interval located after the current one, you will just
click on the white bar following the current, red-highlighted, bar.
-![](http://www.ece.ubc.ca/~sasha/TUTORIAL-DEMO-CONFIG/IMG/interval_137_nav_bar.png)
+![](interval_137_nav_bar.png)
Next you see a legend that shows all functions that were called during this
execution interval and their corresponding colours.
-![](http://www.ece.ubc.ca/~sasha/TUTORIAL-DEMO-CONFIG/IMG/interval_137_lgnd.png)
+![](interval_137_lgnd.png)
Then you will see the most interesting information: function calls across
threads that occurred during this period. Durations and hierarchy of function
@@ -316,7 +337,7 @@ In our example visualization, if we scroll down to the function sequences for
external threads, we will quickly see a few instances where
__wt_cache_eviction_worker took longer than 15 ms, for example here:
-![](http://www.ece.ubc.ca/~sasha/TUTORIAL-DEMO-CONFIG/IMG/wt_cache_eviction_worker_over_15ms.png)
+![](wt_cache_eviction_worker_over_15ms.png)
As we can see, the threads are simply waiting on the condition variable inside
the eviction worker. To try and understand why, we might want to scroll up and
@@ -324,11 +345,11 @@ look at the activity of internal threads during this period. Interestingly
enough, most of the internal threads are also waiting on the condition variable
during this period.
-![](http://www.ece.ubc.ca/~sasha/TUTORIAL-DEMO-CONFIG/IMG/interval_137_threads_waiting.png)
+![](interval_137_threads_waiting.png)
Looking at the internal thread with session id 1 shows something suspicious.
-![](http://www.ece.ubc.ca/~sasha/TUTORIAL-DEMO-CONFIG/IMG/session_1_dead_period.png)
+![](session_1_dead_period.png)
During this period where all threads are waiting, this thread shows no activity
at all, whereas before and after that "dead" period it was making regular calls
diff --git a/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_lgnd.png b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_lgnd.png
new file mode 100644
index 00000000000..6244c77cf70
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_lgnd.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_nav_bar.png b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_nav_bar.png
new file mode 100644
index 00000000000..9d0f2705abf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_nav_bar.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_threads_waiting.png b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_threads_waiting.png
new file mode 100644
index 00000000000..0166bfb2c2d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/interval_137_threads_waiting.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/optrack-t2-slow-lsm-worker-thread.png b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/optrack-t2-slow-lsm-worker-thread.png
new file mode 100644
index 00000000000..fec38fe534c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/optrack-t2-slow-lsm-worker-thread.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/outlier_histograms.png b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/outlier_histograms.png
new file mode 100644
index 00000000000..bc835d52f1c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/outlier_histograms.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/session_1_dead_period.png b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/session_1_dead_period.png
new file mode 100644
index 00000000000..2bdc20c2fcd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/session_1_dead_period.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/wt_cache_eviction_worker_outliers.png b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/wt_cache_eviction_worker_outliers.png
new file mode 100644
index 00000000000..8d07ba82414
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/wt_cache_eviction_worker_outliers.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/wt_cache_eviction_worker_over_15ms.png b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/wt_cache_eviction_worker_over_15ms.png
new file mode 100644
index 00000000000..70dd4be97c2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/operation_tracking_images/wt_cache_eviction_worker_over_15ms.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok
index b34771df3f0..ed61b2321b2 100644
--- a/src/third_party/wiredtiger/src/docs/spell.ok
+++ b/src/third_party/wiredtiger/src/docs/spell.ok
@@ -9,6 +9,7 @@ CFLAGS
CPPFLAGS
CPUs
CRC
+CSV
Cheng
Christoph
Collet's
@@ -502,6 +503,7 @@ svg
sys
syscalls
sz
+t2
tRuE
tablename
tcl
diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c
index 492a2242018..2607e05b0bc 100644
--- a/src/third_party/wiredtiger/src/history/hs.c
+++ b/src/third_party/wiredtiger/src/history/hs.c
@@ -382,6 +382,7 @@ __hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, W
*/
WT_ERR(__wt_update_alloc(session, NULL, &hs_upd, &notused, WT_UPDATE_TOMBSTONE));
hs_upd->start_ts = stop_ts_pair.timestamp;
+ hs_upd->durable_ts = stop_ts_pair.timestamp;
hs_upd->txnid = stop_ts_pair.txnid;
/*
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index e9f728e3ef9..9cb06987879 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -74,6 +74,7 @@ struct __wt_page_header {
#define WT_PAGE_EMPTY_V_ALL 0x02u /* Page has all zero-length values */
#define WT_PAGE_EMPTY_V_NONE 0x04u /* Page has no zero-length values */
#define WT_PAGE_ENCRYPTED 0x08u /* Page is encrypted on disk */
+#define WT_PAGE_UNUSED 0x10u /* Historic lookaside store page updates, no longer used */
uint8_t flags; /* 25: flags */
/* A byte of padding, positioned to be added to the flags. */
diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i
index b3eb91efc78..2601d0653d6 100644
--- a/src/third_party/wiredtiger/src/include/cell.i
+++ b/src/third_party/wiredtiger/src/include/cell.i
@@ -12,37 +12,38 @@
*/
static inline void
__cell_check_value_validity(WT_SESSION_IMPL *session, wt_timestamp_t durable_start_ts,
- wt_timestamp_t durable_stop_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t durable_stop_ts,
wt_timestamp_t stop_ts, uint64_t stop_txn)
{
#ifdef HAVE_DIAGNOSTIC
char ts_string[2][WT_TS_INT_STRING_SIZE];
- if (durable_start_ts > durable_stop_ts) {
- __wt_errx(session, "a durable start timestamp %s newer than its durable stop timestamp %s",
- __wt_timestamp_to_string(durable_start_ts, ts_string[0]),
- __wt_timestamp_to_string(durable_stop_ts, ts_string[1]));
- WT_ASSERT(session, durable_start_ts <= durable_stop_ts);
- }
+ if (start_ts > durable_start_ts)
+ WT_ERR_ASSERT(session, start_ts <= durable_start_ts, WT_PANIC,
+ "a start timestamp %s newer than its durable start timestamp %s",
+ __wt_timestamp_to_string(start_ts, ts_string[0]),
+ __wt_timestamp_to_string(durable_start_ts, ts_string[1]));
- if (start_ts != WT_TS_NONE && stop_ts == WT_TS_NONE) {
- __wt_errx(session, "stop timestamp of 0");
- WT_ASSERT(session, stop_ts != WT_TS_NONE);
- }
- if (start_ts > stop_ts) {
- __wt_errx(session, "a start timestamp %s newer than its stop timestamp %s",
+ if (start_ts != WT_TS_NONE && stop_ts == WT_TS_NONE)
+ WT_ERR_ASSERT(session, stop_ts != WT_TS_NONE, WT_PANIC, "stop timestamp of 0");
+
+ if (start_ts > stop_ts)
+ WT_ERR_ASSERT(session, start_ts <= stop_ts, WT_PANIC,
+ "a start timestamp %s newer than its stop timestamp %s",
__wt_timestamp_to_string(start_ts, ts_string[0]),
__wt_timestamp_to_string(stop_ts, ts_string[1]));
- WT_ASSERT(session, start_ts <= stop_ts);
- }
- if (start_txn > stop_txn) {
- __wt_errx(session, "a start transaction ID %" PRIu64
- " newer than its stop "
- "transaction ID %" PRIu64,
+ if (start_txn > stop_txn)
+ WT_ERR_ASSERT(session, start_txn <= stop_txn, WT_PANIC,
+ "a start transaction ID %" PRIu64 " newer than its stop transaction ID %" PRIu64,
start_txn, stop_txn);
- WT_ASSERT(session, start_txn <= stop_txn);
- }
+
+ if (stop_ts != WT_TS_MAX && stop_ts > durable_stop_ts)
+ WT_ERR_ASSERT(session, stop_ts <= durable_stop_ts, WT_PANIC,
+ "a stop timestamp %s newer than its durable stop timestamp %s",
+ __wt_timestamp_to_string(stop_ts, ts_string[0]),
+ __wt_timestamp_to_string(durable_stop_ts, ts_string[1]));
+
#else
WT_UNUSED(session);
WT_UNUSED(durable_start_ts);
@@ -60,59 +61,64 @@ __cell_check_value_validity(WT_SESSION_IMPL *session, wt_timestamp_t durable_sta
*/
static inline void
__cell_pack_value_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_t durable_start_ts,
- wt_timestamp_t durable_stop_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t durable_stop_ts,
wt_timestamp_t stop_ts, uint64_t stop_txn, bool prepare)
{
uint8_t flags, *flagsp;
+ /* Globally visible values have no associated validity window. */
+ if (durable_start_ts == WT_TS_NONE && start_ts == WT_TS_NONE && start_txn == WT_TXN_NONE &&
+ durable_stop_ts == WT_TS_NONE && stop_ts == WT_TS_MAX && stop_txn == WT_TXN_MAX) {
+ ++*pp;
+ return;
+ }
+
__cell_check_value_validity(
- session, durable_start_ts, durable_stop_ts, start_ts, start_txn, stop_ts, stop_txn);
+ session, durable_start_ts, start_ts, start_txn, durable_stop_ts, stop_ts, stop_txn);
- /* Globally visible values have no associated validity window, set a flag bit and store them. */
- if (start_ts == WT_TS_NONE && start_txn == WT_TXN_NONE && stop_ts == WT_TS_MAX &&
- stop_txn == WT_TXN_MAX)
- ++*pp;
- else {
- **pp |= WT_CELL_SECOND_DESC;
- ++*pp;
- flagsp = *pp;
- ++*pp;
+ **pp |= WT_CELL_SECOND_DESC;
+ ++*pp;
+ flagsp = *pp;
+ ++*pp;
- flags = 0;
- if (start_ts != WT_TS_NONE) {
- WT_IGNORE_RET(__wt_vpack_uint(pp, 0, start_ts));
- LF_SET(WT_CELL_TS_START);
- }
- if (start_txn != WT_TXN_NONE) {
- WT_IGNORE_RET(__wt_vpack_uint(pp, 0, start_txn));
- LF_SET(WT_CELL_TXN_START);
- }
- if (durable_start_ts != WT_TS_NONE) {
- /* Store differences, not absolutes. */
- WT_ASSERT(session, start_ts != WT_TS_NONE && start_ts <= durable_start_ts);
+ flags = 0;
+ if (start_ts != WT_TS_NONE) {
+ WT_IGNORE_RET(__wt_vpack_uint(pp, 0, start_ts));
+ LF_SET(WT_CELL_TS_START);
+ }
+ if (start_txn != WT_TXN_NONE) {
+ WT_IGNORE_RET(__wt_vpack_uint(pp, 0, start_txn));
+ LF_SET(WT_CELL_TXN_START);
+ }
+ if (durable_start_ts != WT_TS_NONE) {
+ WT_ASSERT(session, start_ts != WT_TS_NONE && start_ts <= durable_start_ts);
+ /* Store differences if any, not absolutes. */
+ if (durable_start_ts - start_ts > 0) {
WT_IGNORE_RET(__wt_vpack_uint(pp, 0, durable_start_ts - start_ts));
LF_SET(WT_CELL_TS_DURABLE_START);
}
- if (stop_ts != WT_TS_MAX) {
- /* Store differences, not absolutes. */
- WT_IGNORE_RET(__wt_vpack_uint(pp, 0, stop_ts - start_ts));
- LF_SET(WT_CELL_TS_STOP);
- }
- if (stop_txn != WT_TXN_MAX) {
- /* Store differences, not absolutes. */
- WT_IGNORE_RET(__wt_vpack_uint(pp, 0, stop_txn - start_txn));
- LF_SET(WT_CELL_TXN_STOP);
- }
- if (durable_stop_ts != WT_TS_NONE) {
- /* Store differences, not absolutes. */
- WT_ASSERT(session, stop_ts != WT_TS_MAX && stop_ts <= durable_stop_ts);
+ }
+ if (stop_ts != WT_TS_MAX) {
+ /* Store differences, not absolutes. */
+ WT_IGNORE_RET(__wt_vpack_uint(pp, 0, stop_ts - start_ts));
+ LF_SET(WT_CELL_TS_STOP);
+ }
+ if (stop_txn != WT_TXN_MAX) {
+ /* Store differences, not absolutes. */
+ WT_IGNORE_RET(__wt_vpack_uint(pp, 0, stop_txn - start_txn));
+ LF_SET(WT_CELL_TXN_STOP);
+ }
+ if (durable_stop_ts != WT_TS_NONE) {
+ WT_ASSERT(session, stop_ts != WT_TS_MAX && stop_ts <= durable_stop_ts);
+ /* Store differences if any, not absolutes. */
+ if (durable_stop_ts - stop_ts > 0) {
WT_IGNORE_RET(__wt_vpack_uint(pp, 0, durable_stop_ts - stop_ts));
LF_SET(WT_CELL_TS_DURABLE_STOP);
}
- if (prepare)
- LF_SET(WT_CELL_PREPARE);
- *flagsp = flags;
}
+ if (prepare)
+ LF_SET(WT_CELL_PREPARE);
+ *flagsp = flags;
}
/*
@@ -120,36 +126,45 @@ __cell_pack_value_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_
* Check the address' validity window for sanity.
*/
static inline void
-__wt_check_addr_validity(WT_SESSION_IMPL *session, wt_timestamp_t oldest_start_ts,
- uint64_t oldest_start_txn, wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn)
+__wt_check_addr_validity(WT_SESSION_IMPL *session, wt_timestamp_t start_durable_ts,
+ wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts,
+ wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn)
{
-/* FIXME-prepare-support: accept durable timestamps as args, and do checks on them. */
#ifdef HAVE_DIAGNOSTIC
char ts_string[2][WT_TS_INT_STRING_SIZE];
- if (oldest_start_ts != WT_TS_NONE && newest_stop_ts == WT_TS_NONE) {
- __wt_errx(session, "newest stop timestamp of 0");
- WT_ASSERT(session, newest_stop_ts != WT_TS_NONE);
- }
- if (oldest_start_ts > newest_stop_ts) {
- __wt_errx(session,
- "an oldest start timestamp %s newer than its newest "
- "stop timestamp %s",
+ if (oldest_start_ts != WT_TS_NONE && newest_stop_ts == WT_TS_NONE)
+ WT_ERR_ASSERT(
+ session, newest_stop_ts != WT_TS_NONE, WT_PANIC, "newest stop timestamp of 0");
+
+ if (oldest_start_ts > newest_stop_ts)
+ WT_ERR_ASSERT(session, oldest_start_ts <= newest_stop_ts, WT_PANIC,
+ "an oldest start timestamp %s newer than its newest stop timestamp %s",
__wt_timestamp_to_string(oldest_start_ts, ts_string[0]),
__wt_timestamp_to_string(newest_stop_ts, ts_string[1]));
- WT_ASSERT(session, oldest_start_ts <= newest_stop_ts);
- }
- if (oldest_start_txn > newest_stop_txn) {
- __wt_errx(session, "an oldest start transaction %" PRIu64
- " newer than its "
- "newest stop transaction %" PRIu64,
+
+ if (oldest_start_txn > newest_stop_txn)
+ WT_ERR_ASSERT(session, oldest_start_txn <= newest_stop_txn, WT_PANIC,
+ "an oldest start transaction %" PRIu64 " newer than its newest stop transaction %" PRIu64,
oldest_start_txn, newest_stop_txn);
- WT_ASSERT(session, oldest_start_txn <= newest_stop_txn);
- }
+
+ if (oldest_start_ts > start_durable_ts)
+ WT_ERR_ASSERT(session, oldest_start_ts <= start_durable_ts, WT_PANIC,
+ "an oldest start timestamp %s newer than its durable start timestamp %s",
+ __wt_timestamp_to_string(oldest_start_ts, ts_string[0]),
+ __wt_timestamp_to_string(start_durable_ts, ts_string[1]));
+
+ if (newest_stop_ts != WT_TS_MAX && newest_stop_ts > stop_durable_ts)
+ WT_ERR_ASSERT(session, newest_stop_ts <= stop_durable_ts, WT_PANIC,
+ "a newest stop timestamp %s newer than its durable stop timestamp %s",
+ __wt_timestamp_to_string(newest_stop_ts, ts_string[0]),
+ __wt_timestamp_to_string(stop_durable_ts, ts_string[1]));
#else
WT_UNUSED(session);
+ WT_UNUSED(start_durable_ts);
WT_UNUSED(oldest_start_ts);
WT_UNUSED(oldest_start_txn);
+ WT_UNUSED(stop_durable_ts);
WT_UNUSED(newest_stop_ts);
WT_UNUSED(newest_stop_txn);
#endif
@@ -161,64 +176,81 @@ __wt_check_addr_validity(WT_SESSION_IMPL *session, wt_timestamp_t oldest_start_t
*/
static inline void
__cell_pack_addr_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_t start_durable_ts,
- wt_timestamp_t stop_durable_ts, wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn,
+ wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts,
wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn)
{
uint8_t flags, *flagsp;
- /* FIXME-prepare-support: Check validity of durable timestamps. */
- __wt_check_addr_validity(
- session, oldest_start_ts, oldest_start_txn, newest_stop_ts, newest_stop_txn);
-
- /* Globally visible values have no associated validity window, set a flag bit and store them. */
+ /* Globally visible values have no associated validity window. */
if (start_durable_ts == WT_TS_NONE && stop_durable_ts == WT_TS_NONE &&
oldest_start_ts == WT_TS_NONE && oldest_start_txn == WT_TXN_NONE &&
- newest_stop_ts == WT_TS_MAX && newest_stop_txn == WT_TXN_MAX)
- ++*pp;
- else {
- **pp |= WT_CELL_SECOND_DESC;
- ++*pp;
- flagsp = *pp;
+ newest_stop_ts == WT_TS_MAX && newest_stop_txn == WT_TXN_MAX) {
++*pp;
+ return;
+ }
- flags = 0;
- if (oldest_start_ts != WT_TS_NONE) {
- WT_IGNORE_RET(__wt_vpack_uint(pp, 0, oldest_start_ts));
- LF_SET(WT_CELL_TS_START);
- }
- if (oldest_start_txn != WT_TXN_NONE) {
- WT_IGNORE_RET(__wt_vpack_uint(pp, 0, oldest_start_txn));
- LF_SET(WT_CELL_TXN_START);
- }
- if (start_durable_ts != WT_TS_NONE) {
- /* Store differences, not absolutes. */
- WT_ASSERT(
- session, oldest_start_ts != WT_TS_NONE && oldest_start_ts <= start_durable_ts);
- WT_IGNORE_RET(__wt_vpack_uint(pp, 0, start_durable_ts - oldest_start_ts));
- LF_SET(WT_CELL_TS_DURABLE_START);
- }
- if (newest_stop_ts != WT_TS_MAX) {
- /* Store differences, not absolutes. */
- WT_IGNORE_RET(__wt_vpack_uint(pp, 0, newest_stop_ts - oldest_start_ts));
- LF_SET(WT_CELL_TS_STOP);
- }
- if (newest_stop_txn != WT_TXN_MAX) {
- /* Store differences, not absolutes. */
- WT_IGNORE_RET(__wt_vpack_uint(pp, 0, newest_stop_txn - oldest_start_txn));
- LF_SET(WT_CELL_TXN_STOP);
- }
- if (stop_durable_ts != WT_TS_NONE) {
- /* Store differences, not absolutes. */
- /*
- * FIXME-prepare-support:
- * WT_ASSERT(session,
- * newest_stop_ts != WT_TS_MAX && newest_stop_ts <= stop_durable__ts);
- */
- WT_IGNORE_RET(__wt_vpack_uint(pp, 0, stop_durable_ts - newest_stop_ts));
- LF_SET(WT_CELL_TS_DURABLE_STOP);
- }
- *flagsp = flags;
+ __wt_check_addr_validity(session, start_durable_ts, oldest_start_ts, oldest_start_txn,
+ stop_durable_ts, newest_stop_ts, newest_stop_txn);
+
+ **pp |= WT_CELL_SECOND_DESC;
+ ++*pp;
+ flagsp = *pp;
+ ++*pp;
+
+ flags = 0;
+ if (oldest_start_ts != WT_TS_NONE) {
+ WT_IGNORE_RET(__wt_vpack_uint(pp, 0, oldest_start_ts));
+ LF_SET(WT_CELL_TS_START);
+ }
+ if (oldest_start_txn != WT_TXN_NONE) {
+ WT_IGNORE_RET(__wt_vpack_uint(pp, 0, oldest_start_txn));
+ LF_SET(WT_CELL_TXN_START);
+ }
+ if (start_durable_ts != WT_TS_NONE) {
+ /* Store differences, not absolutes. */
+ /*
+ * FIXME-prepare-support:
+ * WT_ASSERT(
+ * session, oldest_start_ts != WT_TS_NONE && oldest_start_ts <= start_durable_ts);
+ */
+ /*
+ * Unlike a value cell, we store the durable start timestamp even when the difference is
+ * zero compared to the oldest commit timestamp. The difference can only be zero when all
+ * the key/value pairs on the page have the same timestamp. That scenario is rare, so
+ * checking whether the difference is zero would add more overhead than it would provide
+ * benefit.
+ */
+ WT_IGNORE_RET(__wt_vpack_uint(pp, 0, start_durable_ts - oldest_start_ts));
+ LF_SET(WT_CELL_TS_DURABLE_START);
+ }
+ if (newest_stop_ts != WT_TS_MAX) {
+ /* Store differences, not absolutes. */
+ WT_IGNORE_RET(__wt_vpack_uint(pp, 0, newest_stop_ts - oldest_start_ts));
+ LF_SET(WT_CELL_TS_STOP);
}
+ if (newest_stop_txn != WT_TXN_MAX) {
+ /* Store differences, not absolutes. */
+ WT_IGNORE_RET(__wt_vpack_uint(pp, 0, newest_stop_txn - oldest_start_txn));
+ LF_SET(WT_CELL_TXN_STOP);
+ }
+ if (stop_durable_ts != WT_TS_NONE) {
+ /* Store differences, not absolutes. */
+ /*
+ * FIXME-prepare-support:
+ * WT_ASSERT(session,
+ * newest_stop_ts != WT_TS_MAX && newest_stop_ts <= stop_durable_ts);
+ */
+ /*
+ * Unlike a value cell, we store the durable stop timestamp even when the difference is
+ * zero compared to the newest commit timestamp. The difference can only be zero when all
+ * the key/value pairs on the page have the same timestamp. That scenario is rare, so
+ * checking whether the difference is zero would add more overhead than it would provide
+ * benefit.
+ */
+ WT_IGNORE_RET(__wt_vpack_uint(pp, 0, stop_durable_ts - newest_stop_ts));
+ LF_SET(WT_CELL_TS_DURABLE_STOP);
+ }
+ *flagsp = flags;
}
/*
@@ -227,24 +259,18 @@ __cell_pack_addr_validity(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_t
*/
static inline size_t
__wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, uint64_t recno,
- wt_timestamp_t stop_durable_ts, wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn,
- wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn, size_t size)
+ wt_timestamp_t start_durable_ts, wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn,
+ wt_timestamp_t stop_durable_ts, wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn,
+ size_t size)
{
- wt_timestamp_t start_durable_ts;
uint8_t *p;
- /*
- * FIXME-prepare-support: This value should be passed in when support for prepared transactions
- * with durable history is fully implemented.
- */
- start_durable_ts = WT_TS_NONE;
-
/* Start building a cell: the descriptor byte starts zero. */
p = cell->__chunk;
*p = '\0';
- __cell_pack_addr_validity(session, &p, start_durable_ts, stop_durable_ts, oldest_start_ts,
- oldest_start_txn, newest_stop_ts, newest_stop_txn);
+ __cell_pack_addr_validity(session, &p, start_durable_ts, oldest_start_ts, oldest_start_txn,
+ stop_durable_ts, newest_stop_ts, newest_stop_txn);
if (recno == WT_RECNO_OOB)
cell->__chunk[0] |= (uint8_t)cell_type; /* Type */
@@ -263,26 +289,21 @@ __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, ui
* Set a value item's WT_CELL contents.
*/
static inline size_t
-__wt_cell_pack_value(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start_ts,
- uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle, size_t size)
+__wt_cell_pack_value(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t durable_start_ts,
+ wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t durable_stop_ts,
+ wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle, size_t size)
{
- wt_timestamp_t durable_start_ts, durable_stop_ts;
uint8_t byte, *p;
bool prepare, validity;
- /*
- * FIXME-prepare-support: These values should be passed in when support for prepared
- * transactions with durable history is fully implemented.
- */
- durable_start_ts = WT_TS_NONE;
- durable_stop_ts = WT_TS_NONE;
+ /* FIXME-prepare-support: The prepare flag should be passed in. */
prepare = false;
/* Start building a cell: the descriptor byte starts zero. */
p = cell->__chunk;
*p = '\0';
- __cell_pack_value_validity(session, &p, durable_start_ts, durable_stop_ts, start_ts, start_txn,
+ __cell_pack_value_validity(session, &p, durable_start_ts, start_ts, start_txn, durable_stop_ts,
stop_ts, stop_txn, prepare);
/*
@@ -405,10 +426,10 @@ __wt_cell_pack_value_match(
* Write a copy value cell.
*/
static inline size_t
-__wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start_ts,
- uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle, uint64_t v)
+__wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start_durable_ts,
+ wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_durable_ts,
+ wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle, uint64_t v)
{
- wt_timestamp_t durable_start_ts, durable_stop_ts;
uint8_t *p;
bool prepare;
@@ -416,15 +437,13 @@ __wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t star
* FIXME-prepare-support: These values should be passed in when support for prepared
* transactions with durable history is fully implemented.
*/
- durable_start_ts = WT_TS_NONE;
- durable_stop_ts = WT_TS_NONE;
prepare = false;
/* Start building a cell: the descriptor byte starts zero. */
p = cell->__chunk;
*p = '\0';
- __cell_pack_value_validity(session, &p, durable_start_ts, durable_stop_ts, start_ts, start_txn,
+ __cell_pack_value_validity(session, &p, start_durable_ts, start_ts, start_txn, stop_durable_ts,
stop_ts, stop_txn, prepare);
if (rle < 2)
@@ -445,8 +464,9 @@ __wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t star
* Write a deleted value cell.
*/
static inline size_t
-__wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start_ts,
- uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle)
+__wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start_durable_ts,
+ wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_durable_ts,
+ wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle)
{
uint8_t *p;
@@ -454,9 +474,9 @@ __wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start
p = cell->__chunk;
*p = '\0';
- /* FIXME-prepare-support: we should pass durable start and stop values. */
- __cell_pack_value_validity(
- session, &p, WT_TS_NONE, WT_TS_NONE, start_ts, start_txn, stop_ts, stop_txn, false);
+ /* FIXME-prepare-support: we should pass prepare value. */
+ __cell_pack_value_validity(session, &p, start_durable_ts, start_ts, start_txn, stop_durable_ts,
+ stop_ts, stop_txn, false);
if (rle < 2)
cell->__chunk[0] |= WT_CELL_DEL; /* Type */
@@ -542,16 +562,15 @@ __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size)
* Pack an overflow cell.
*/
static inline size_t
-__wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type, wt_timestamp_t start_ts,
- uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle, size_t size)
+__wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type,
+ wt_timestamp_t durable_start_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t durable_stop_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle,
+ size_t size)
{
- wt_timestamp_t durable_start_ts, durable_stop_ts;
uint8_t *p;
bool prepare;
- /* FIXME-prepare-support: The durable timestamps should be passed in. */
- durable_start_ts = WT_TS_NONE;
- durable_stop_ts = WT_TS_NONE;
+ /* FIXME-prepare-support: The prepare flag should be passed in. */
prepare = false;
/* Start building a cell: the descriptor byte starts zero. */
@@ -565,8 +584,8 @@ __wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type, wt_ti
break;
case WT_CELL_VALUE_OVFL:
case WT_CELL_VALUE_OVFL_RM:
- __cell_pack_value_validity(session, &p, durable_start_ts, durable_stop_ts, start_ts,
- start_txn, stop_ts, stop_txn, prepare);
+ __cell_pack_value_validity(session, &p, durable_start_ts, start_ts, start_txn,
+ durable_stop_ts, stop_ts, stop_txn, prepare);
break;
}
@@ -723,8 +742,10 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CE
struct {
uint64_t v;
wt_timestamp_t start_ts;
+ wt_timestamp_t durable_start_ts;
uint64_t start_txn;
wt_timestamp_t stop_ts;
+ wt_timestamp_t durable_stop_ts;
uint64_t stop_txn;
uint32_t len;
} copy;
@@ -734,8 +755,10 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CE
copy.v = 0; /* -Werror=maybe-uninitialized */
copy.start_ts = WT_TS_NONE;
+ copy.durable_start_ts = WT_TS_NONE;
copy.start_txn = WT_TXN_NONE;
copy.stop_ts = WT_TS_MAX;
+ copy.durable_stop_ts = WT_TS_NONE;
copy.stop_txn = WT_TXN_MAX;
copy.len = 0;
@@ -844,6 +867,7 @@ restart:
&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->newest_start_durable_ts));
unpack->newest_start_durable_ts += unpack->oldest_start_ts;
}
+
if (LF_ISSET(WT_CELL_TS_STOP)) {
WT_RET(
__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->newest_stop_ts));
@@ -860,9 +884,9 @@ restart:
unpack->newest_stop_durable_ts += unpack->newest_stop_ts;
}
- /* FIXME-prepare-support: Check validity of durable timestamps. */
- __wt_check_addr_validity(session, unpack->oldest_start_ts, unpack->oldest_start_txn,
- unpack->newest_stop_ts, unpack->newest_stop_txn);
+ __wt_check_addr_validity(session, unpack->newest_start_durable_ts, unpack->oldest_start_ts,
+ unpack->oldest_start_txn, unpack->newest_stop_durable_ts, unpack->newest_stop_ts,
+ unpack->newest_stop_txn);
break;
case WT_CELL_DEL:
case WT_CELL_VALUE:
@@ -883,7 +907,9 @@ restart:
WT_RET(__wt_vunpack_uint(
&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->durable_start_ts));
unpack->durable_start_ts += unpack->start_ts;
- }
+ } else
+ unpack->durable_start_ts = unpack->start_ts;
+
if (LF_ISSET(WT_CELL_TS_STOP)) {
WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->stop_ts));
unpack->stop_ts += unpack->start_ts;
@@ -896,9 +922,13 @@ restart:
WT_RET(__wt_vunpack_uint(
&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->durable_stop_ts));
unpack->durable_stop_ts += unpack->stop_ts;
- }
- __cell_check_value_validity(session, unpack->durable_start_ts, unpack->durable_stop_ts,
- unpack->start_ts, unpack->start_txn, unpack->stop_ts, unpack->stop_txn);
+ } else if (unpack->stop_ts != WT_TS_MAX)
+ unpack->durable_stop_ts = unpack->stop_ts;
+ else
+ unpack->durable_stop_ts = WT_TS_NONE;
+
+ __cell_check_value_validity(session, unpack->durable_start_ts, unpack->start_ts,
+ unpack->start_txn, unpack->durable_stop_ts, unpack->stop_ts, unpack->stop_txn);
break;
}
@@ -923,8 +953,10 @@ restart:
WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
copy.v = unpack->v;
copy.start_ts = unpack->start_ts;
+ copy.durable_start_ts = unpack->durable_start_ts;
copy.start_txn = unpack->start_txn;
copy.stop_ts = unpack->stop_ts;
+ copy.durable_stop_ts = unpack->durable_stop_ts;
copy.stop_txn = unpack->stop_txn;
copy.len = WT_PTRDIFF32(p, cell);
cell = (WT_CELL *)((uint8_t *)cell - v);
@@ -983,8 +1015,10 @@ done:
unpack->raw = WT_CELL_VALUE_COPY;
unpack->v = copy.v;
unpack->start_ts = copy.start_ts;
+ unpack->durable_start_ts = copy.durable_start_ts;
unpack->start_txn = copy.start_txn;
unpack->stop_ts = copy.stop_ts;
+ unpack->durable_stop_ts = copy.durable_stop_ts;
unpack->stop_txn = copy.stop_txn;
unpack->__len = copy.len;
}
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 940ab258eb8..9de6f4160d8 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1171,8 +1171,9 @@ extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocate
extern int __wt_realloc_noclear(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret,
size_t bytes_to_allocate, void *retp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv,
- uint8_t type, wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_ts,
- uint64_t stop_txn, uint64_t rle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ uint8_t type, wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_rec_child_modify(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref,
bool *hazardp, WT_CHILD_STATE *statep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
@@ -1602,8 +1603,8 @@ extern void __wt_capacity_throttle(WT_SESSION_IMPL *session, uint64_t bytes, WT_
extern void __wt_checkpoint_progress(WT_SESSION_IMPL *session, bool closing);
extern void __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize);
extern void __wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session,
- wt_timestamp_t newest_durable_ts, wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn,
- wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn);
+ wt_timestamp_t start_durable_ts, wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn,
+ wt_timestamp_t stop_durable_ts, wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn);
extern void __wt_ckpt_verbose(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *tag,
const char *ckpt_name, const uint8_t *ckpt_string);
extern void __wt_cond_auto_wait(
@@ -1948,12 +1949,13 @@ static inline int __wt_page_swap_func(
static inline int __wt_read(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len,
void *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- const void *data, size_t size, wt_timestamp_t start_ts, uint64_t start_txn,
- wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ const void *data, size_t size, wt_timestamp_t durable_start_ts, wt_timestamp_t start_ts,
+ uint64_t start_txn, wt_timestamp_t durable_stop_ts, wt_timestamp_t stop_ts, uint64_t stop_txn,
+ uint64_t rle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn,
- uint64_t rle, WT_REC_KV *val) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle,
+ WT_REC_KV *val) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_row_leaf_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip,
@@ -2041,25 +2043,29 @@ static inline int __wt_vunpack_uint(const uint8_t **pp, size_t maxlen, uint64_t
static inline int __wt_write(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len,
const void *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type,
- uint64_t recno, wt_timestamp_t stop_durable_ts, wt_timestamp_t oldest_start_ts,
- uint64_t oldest_start_txn, wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn, size_t size)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ uint64_t recno, wt_timestamp_t start_durable_ts, wt_timestamp_t oldest_start_ts,
+ uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts, wt_timestamp_t newest_stop_ts,
+ uint64_t newest_stop_txn, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell,
- wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn,
- uint64_t rle, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle,
+ uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell,
- wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn,
- uint64_t rle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_cell_pack_int_key(WT_CELL *cell, size_t size)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type,
- wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn,
- uint64_t rle, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ wt_timestamp_t durable_start_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t durable_stop_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle,
+ size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_cell_pack_value(WT_SESSION_IMPL *session, WT_CELL *cell,
- wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn,
- uint64_t rle, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ wt_timestamp_t durable_start_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t durable_stop_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle,
+ size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_cell_total_len(WT_CELL_UNPACK *unpack)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline size_t __wt_strnlen(const char *s, size_t maxlen)
@@ -2140,8 +2146,8 @@ static inline void __wt_cell_unpack(
static inline void __wt_cell_unpack_dsk(
WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell, WT_CELL_UNPACK *unpack);
static inline void __wt_check_addr_validity(WT_SESSION_IMPL *session,
- wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t newest_stop_ts,
- uint64_t newest_stop_txn);
+ wt_timestamp_t start_durable_ts, wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn,
+ wt_timestamp_t stop_durable_ts, wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn);
static inline void __wt_cond_wait(
WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *));
static inline void __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session);
@@ -2154,12 +2160,12 @@ static inline void __wt_page_evict_soon(WT_SESSION_IMPL *session, WT_REF *ref);
static inline void __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page);
static inline void __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page);
static inline void __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page);
-static inline void __wt_rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *newest_durable_ts,
- wt_timestamp_t *oldest_start_tsp, uint64_t *oldest_start_txnp, wt_timestamp_t *newest_stop_tsp,
- uint64_t *newest_stop_txnp);
-static inline void __wt_rec_addr_ts_update(WT_RECONCILE *r, wt_timestamp_t newest_durable_ts,
- wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t newest_stop_ts,
- uint64_t newest_stop_txn);
+static inline void __wt_rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *start_durable_ts,
+ wt_timestamp_t *oldest_start_tsp, uint64_t *oldest_start_txnp, wt_timestamp_t *stop_durable_ts,
+ wt_timestamp_t *newest_stop_tsp, uint64_t *newest_stop_txnp);
+static inline void __wt_rec_addr_ts_update(WT_RECONCILE *r, wt_timestamp_t start_durable_ts,
+ wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts,
+ wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn);
static inline void __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_ADDR *addr, WT_CELL_UNPACK *vpack, bool proxy_cell, uint64_t recno);
static inline void __wt_rec_image_copy(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv);
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
index 6c232221338..c3c108a833e 100644
--- a/src/third_party/wiredtiger/src/include/log.h
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -349,11 +349,10 @@ struct __wt_log_desc {
/*
* NOTE: We bumped the log version from 2 to 3 to make it convenient for
* MongoDB to detect users accidentally running old binaries on a newer
- * release. There are no actual log file format changes with version 2 and
- * 3.
- * NOTE: We bumped the log version from 3 to 4 for the same reason.
+ * release. There are no actual log file format changes in versions 2
+ * through 5.
*/
-#define WT_LOG_VERSION 4
+#define WT_LOG_VERSION 5
uint16_t version; /* 04-05: Log version */
uint16_t unused; /* 06-07: Unused */
uint64_t log_size; /* 08-15: Log file size */
@@ -370,8 +369,10 @@ struct __wt_log_desc {
#define WT_LOG_V2_MINOR 0
#define WT_LOG_V3_MAJOR 3
#define WT_LOG_V3_MINOR 1
-#define WT_LOG_V4_MAJOR 10
-#define WT_LOG_V4_MINOR 0
+#define WT_LOG_V4_MAJOR 3
+#define WT_LOG_V4_MINOR 3
+#define WT_LOG_V5_MAJOR 10
+#define WT_LOG_V5_MINOR 0
/*
* __wt_log_desc_byteswap --
diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h
index 10d8d923b5d..9b0ae3c4a72 100644
--- a/src/third_party/wiredtiger/src/include/meta.h
+++ b/src/third_party/wiredtiger/src/include/meta.h
@@ -134,9 +134,10 @@ struct __wt_ckpt {
WT_BLOCK_MODS backup_blocks[WT_BLKINCR_MAX];
/* Validity window */
- wt_timestamp_t newest_durable_ts;
+ wt_timestamp_t start_durable_ts;
wt_timestamp_t oldest_start_ts;
uint64_t oldest_start_txn;
+ wt_timestamp_t stop_durable_ts;
wt_timestamp_t newest_stop_ts;
uint64_t newest_stop_txn;
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
index 783345420c7..116fe0e35ab 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.h
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -114,9 +114,10 @@ struct __wt_reconcile {
uint32_t entries;
uint64_t recno;
WT_ITEM key;
- wt_timestamp_t newest_durable_ts;
+ wt_timestamp_t start_durable_ts;
wt_timestamp_t oldest_start_ts;
uint64_t oldest_start_txn;
+ wt_timestamp_t stop_durable_ts;
wt_timestamp_t newest_stop_ts;
uint64_t newest_stop_txn;
@@ -124,9 +125,10 @@ struct __wt_reconcile {
uint32_t min_entries;
uint64_t min_recno;
WT_ITEM min_key;
- wt_timestamp_t min_newest_durable_ts;
+ wt_timestamp_t min_start_durable_ts;
wt_timestamp_t min_oldest_start_ts;
uint64_t min_oldest_start_txn;
+ wt_timestamp_t min_stop_durable_ts;
wt_timestamp_t min_newest_stop_ts;
uint64_t min_newest_stop_txn;
@@ -241,9 +243,10 @@ struct __wt_reconcile {
typedef struct {
WT_UPDATE *upd; /* Update to write (or NULL) */
- wt_timestamp_t durable_ts; /* Transaction IDs, timestamps */
+ wt_timestamp_t start_durable_ts; /* Transaction IDs, timestamps */
wt_timestamp_t start_ts;
uint64_t start_txn;
+ wt_timestamp_t stop_durable_ts;
wt_timestamp_t stop_ts;
uint64_t stop_txn;
} WT_UPDATE_SELECT;
diff --git a/src/third_party/wiredtiger/src/include/reconcile.i b/src/third_party/wiredtiger/src/include/reconcile.i
index 89416ed12ec..6f605ec6030 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.i
+++ b/src/third_party/wiredtiger/src/include/reconcile.i
@@ -43,9 +43,9 @@ __wt_rec_need_split(WT_RECONCILE *r, size_t len)
* Initialize an address timestamp triplet.
*/
static inline void
-__wt_rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *newest_durable_ts,
- wt_timestamp_t *oldest_start_tsp, uint64_t *oldest_start_txnp, wt_timestamp_t *newest_stop_tsp,
- uint64_t *newest_stop_txnp)
+__wt_rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *start_durable_ts,
+ wt_timestamp_t *oldest_start_tsp, uint64_t *oldest_start_txnp, wt_timestamp_t *stop_durable_ts,
+ wt_timestamp_t *newest_stop_tsp, uint64_t *newest_stop_txnp)
{
/*
* If the page is not fixed-length column-store, where we don't maintain timestamps at all, set
@@ -53,13 +53,13 @@ __wt_rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *newest_durable_ts,
* corrected as we process key/value items. Otherwise, set the oldest/newest timestamps to
* simple durability.
*/
- *newest_durable_ts = WT_TS_NONE;
+ *start_durable_ts = WT_TS_NONE;
*oldest_start_tsp = WT_TS_MAX;
*oldest_start_txnp = WT_TXN_MAX;
+ *stop_durable_ts = WT_TS_NONE;
*newest_stop_tsp = WT_TS_NONE;
*newest_stop_txnp = WT_TXN_NONE;
if (r->page->type == WT_PAGE_COL_FIX) {
- *newest_durable_ts = WT_TS_NONE;
*oldest_start_tsp = WT_TS_NONE;
*oldest_start_txnp = WT_TXN_NONE;
*newest_stop_tsp = WT_TS_MAX;
@@ -72,13 +72,14 @@ __wt_rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *newest_durable_ts,
* Update the chunk's timestamp information.
*/
static inline void
-__wt_rec_addr_ts_update(WT_RECONCILE *r, wt_timestamp_t newest_durable_ts,
- wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t newest_stop_ts,
- uint64_t newest_stop_txn)
+__wt_rec_addr_ts_update(WT_RECONCILE *r, wt_timestamp_t start_durable_ts,
+ wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts,
+ wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn)
{
- r->cur_ptr->newest_durable_ts = WT_MAX(newest_durable_ts, r->cur_ptr->newest_durable_ts);
+ r->cur_ptr->start_durable_ts = WT_MAX(start_durable_ts, r->cur_ptr->start_durable_ts);
r->cur_ptr->oldest_start_ts = WT_MIN(oldest_start_ts, r->cur_ptr->oldest_start_ts);
r->cur_ptr->oldest_start_txn = WT_MIN(oldest_start_txn, r->cur_ptr->oldest_start_txn);
+ r->cur_ptr->stop_durable_ts = WT_MAX(stop_durable_ts, r->cur_ptr->stop_durable_ts);
r->cur_ptr->newest_stop_ts = WT_MAX(newest_stop_ts, r->cur_ptr->newest_stop_ts);
r->cur_ptr->newest_stop_txn = WT_MAX(newest_stop_txn, r->cur_ptr->newest_stop_txn);
}
@@ -195,15 +196,16 @@ __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *add
val->buf.data = addr->addr;
val->buf.size = addr->size;
val->cell_len = __wt_cell_pack_addr(session, &val->cell, cell_type, recno,
- addr->stop_durable_ts, addr->oldest_start_ts, addr->oldest_start_txn,
- addr->newest_stop_ts, addr->newest_stop_txn, val->buf.size);
+ addr->start_durable_ts, addr->oldest_start_ts, addr->oldest_start_txn,
+ addr->stop_durable_ts, addr->newest_stop_ts, addr->newest_stop_txn, val->buf.size);
} else {
WT_ASSERT(session, addr == NULL);
val->buf.data = vpack->data;
val->buf.size = vpack->size;
- val->cell_len = __wt_cell_pack_addr(session, &val->cell, cell_type, recno,
- vpack->newest_stop_durable_ts, vpack->oldest_start_ts, vpack->oldest_start_txn,
- vpack->newest_stop_ts, vpack->newest_stop_txn, val->buf.size);
+ val->cell_len =
+ __wt_cell_pack_addr(session, &val->cell, cell_type, recno, vpack->newest_start_durable_ts,
+ vpack->oldest_start_ts, vpack->oldest_start_txn, vpack->newest_stop_durable_ts,
+ vpack->newest_stop_ts, vpack->newest_stop_txn, val->buf.size);
}
val->len = val->cell_len + val->buf.size;
@@ -215,8 +217,8 @@ __wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ADDR *add
*/
static inline int
__wt_rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r, const void *data, size_t size,
- wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn,
- uint64_t rle)
+ wt_timestamp_t durable_start_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t durable_stop_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle)
{
WT_BTREE *btree;
WT_REC_KV *val;
@@ -242,13 +244,13 @@ __wt_rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r, const void *d
if (val->buf.size > btree->maxleafvalue) {
WT_STAT_DATA_INCR(session, rec_overflow_value);
- return (__wt_rec_cell_build_ovfl(
- session, r, val, WT_CELL_VALUE_OVFL, start_ts, start_txn, stop_ts, stop_txn, rle));
+ return (__wt_rec_cell_build_ovfl(session, r, val, WT_CELL_VALUE_OVFL, durable_start_ts,
+ start_ts, start_txn, durable_stop_ts, stop_ts, stop_txn, rle));
}
}
- val->cell_len = __wt_cell_pack_value(
- session, &val->cell, start_ts, start_txn, stop_ts, stop_txn, rle, val->buf.size);
+ val->cell_len = __wt_cell_pack_value(session, &val->cell, durable_start_ts, start_ts, start_txn,
+ durable_stop_ts, stop_ts, stop_txn, rle, val->buf.size);
val->len = val->cell_len + val->buf.size;
return (0);
@@ -259,8 +261,9 @@ __wt_rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r, const void *d
* Check for a dictionary match.
*/
static inline int
-__wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t start_ts,
- uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle, WT_REC_KV *val)
+__wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t start_durable_ts,
+ wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_durable_ts,
+ wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle, WT_REC_KV *val)
{
WT_REC_DICTIONARY *dp;
uint64_t offset;
@@ -296,8 +299,8 @@ __wt_rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t
* offset from the beginning of the page.
*/
offset = (uint64_t)WT_PTRDIFF(r->first_free, (uint8_t *)r->cur_ptr->image.mem + dp->offset);
- val->len = val->cell_len = __wt_cell_pack_copy(
- session, &val->cell, start_ts, start_txn, stop_ts, stop_txn, rle, offset);
+ val->len = val->cell_len = __wt_cell_pack_copy(session, &val->cell, start_durable_ts,
+ start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, rle, offset);
val->buf.data = NULL;
val->buf.size = 0;
}
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index db846698277..29f30a8879f 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -587,23 +587,26 @@ __ckpt_load(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v, WT_C
ckpt->size = (uint64_t)a.val;
/* Default to durability. */
- ret = __wt_config_subgets(session, v, "newest_durable_ts", &a);
+ ret = __wt_config_subgets(session, v, "start_durable_ts", &a);
WT_RET_NOTFOUND_OK(ret);
- ckpt->newest_durable_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_NONE : (uint64_t)a.val;
+ ckpt->start_durable_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_NONE : (uint64_t)a.val;
ret = __wt_config_subgets(session, v, "oldest_start_ts", &a);
WT_RET_NOTFOUND_OK(ret);
ckpt->oldest_start_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_NONE : (uint64_t)a.val;
ret = __wt_config_subgets(session, v, "oldest_start_txn", &a);
WT_RET_NOTFOUND_OK(ret);
ckpt->oldest_start_txn = ret == WT_NOTFOUND || a.len == 0 ? WT_TXN_NONE : (uint64_t)a.val;
+ ret = __wt_config_subgets(session, v, "stop_durable_ts", &a);
+ WT_RET_NOTFOUND_OK(ret);
+ ckpt->stop_durable_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_NONE : (uint64_t)a.val;
ret = __wt_config_subgets(session, v, "newest_stop_ts", &a);
WT_RET_NOTFOUND_OK(ret);
ckpt->newest_stop_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_MAX : (uint64_t)a.val;
ret = __wt_config_subgets(session, v, "newest_stop_txn", &a);
WT_RET_NOTFOUND_OK(ret);
ckpt->newest_stop_txn = ret == WT_NOTFOUND || a.len == 0 ? WT_TXN_MAX : (uint64_t)a.val;
- __wt_check_addr_validity(session, ckpt->oldest_start_ts, ckpt->oldest_start_txn,
- ckpt->newest_stop_ts, ckpt->newest_stop_txn);
+ __wt_check_addr_validity(session, ckpt->start_durable_ts, ckpt->oldest_start_ts,
+ ckpt->oldest_start_txn, ckpt->stop_durable_ts, ckpt->newest_stop_ts, ckpt->newest_stop_txn);
WT_RET(__wt_config_subgets(session, v, "write_gen", &a));
if (a.len == 0)
@@ -689,8 +692,9 @@ __wt_meta_ckptlist_to_meta(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_ITEM
WT_RET(__wt_raw_to_hex(session, ckpt->raw.data, ckpt->raw.size, &ckpt->addr));
}
- __wt_check_addr_validity(session, ckpt->oldest_start_ts, ckpt->oldest_start_txn,
- ckpt->newest_stop_ts, ckpt->newest_stop_txn);
+ __wt_check_addr_validity(session, ckpt->start_durable_ts, ckpt->oldest_start_ts,
+ ckpt->oldest_start_txn, ckpt->stop_durable_ts, ckpt->newest_stop_ts,
+ ckpt->newest_stop_txn);
WT_RET(__wt_buf_catfmt(session, buf, "%s%s", sep, ckpt->name));
sep = ",";
@@ -703,12 +707,13 @@ __wt_meta_ckptlist_to_meta(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_ITEM
*/
WT_RET(__wt_buf_catfmt(session, buf,
"=(addr=\"%.*s\",order=%" PRId64 ",time=%" PRIu64 ",size=%" PRId64
- ",newest_durable_ts=%" PRId64 ",oldest_start_ts=%" PRId64 ",oldest_start_txn=%" PRId64
- ",newest_stop_ts=%" PRId64 ",newest_stop_txn=%" PRId64 ",write_gen=%" PRId64 ")",
+ ",start_durable_ts=%" PRId64 ",oldest_start_ts=%" PRId64 ",oldest_start_txn=%" PRId64
+ ",stop_durable_ts=%" PRId64 ",newest_stop_ts=%" PRId64 ",newest_stop_txn=%" PRId64
+ ",write_gen=%" PRId64 ")",
(int)ckpt->addr.size, (char *)ckpt->addr.data, ckpt->order, ckpt->sec,
- (int64_t)ckpt->size, (int64_t)ckpt->newest_durable_ts, (int64_t)ckpt->oldest_start_ts,
- (int64_t)ckpt->oldest_start_txn, (int64_t)ckpt->newest_stop_ts,
- (int64_t)ckpt->newest_stop_txn, (int64_t)ckpt->write_gen));
+ (int64_t)ckpt->size, (int64_t)ckpt->start_durable_ts, (int64_t)ckpt->oldest_start_ts,
+ (int64_t)ckpt->oldest_start_txn, (int64_t)ckpt->stop_durable_ts,
+ (int64_t)ckpt->newest_stop_ts, (int64_t)ckpt->newest_stop_txn, (int64_t)ckpt->write_gen));
}
WT_RET(__wt_buf_catfmt(session, buf, ")"));
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c
index adf2db1a76d..0b5f3057376 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_col.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c
@@ -116,8 +116,8 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool delet
val = &r->v;
if (deleted) {
- val->cell_len = __wt_cell_pack_del(
- session, &val->cell, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX, cbulk->rle);
+ val->cell_len = __wt_cell_pack_del(session, &val->cell, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE,
+ WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, cbulk->rle);
val->buf.data = NULL;
val->buf.size = 0;
val->len = val->cell_len;
@@ -127,7 +127,7 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool delet
* which means we want the previous value seen, not the current value.
*/
WT_RET(__wt_rec_cell_build_val(session, r, cbulk->last.data, cbulk->last.size, WT_TS_NONE,
- WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX, cbulk->rle));
+ WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, cbulk->rle));
/* Boundary: split or write the page. */
if (WT_CROSSING_SPLIT_BND(r, val->len))
@@ -135,10 +135,11 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool delet
/* Copy the value onto the page. */
if (btree->dictionary)
- WT_RET(__wt_rec_dict_replace(
- session, r, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX, cbulk->rle, val));
+ WT_RET(__wt_rec_dict_replace(session, r, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE,
+ WT_TS_MAX, WT_TXN_MAX, cbulk->rle, val));
__wt_rec_image_copy(session, r, val);
- __wt_rec_addr_ts_update(r, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX);
+ __wt_rec_addr_ts_update(
+ r, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX);
/* Update the starting record number in case we split. */
r->recno += cbulk->rle;
@@ -178,12 +179,9 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Copy the value onto the page. */
__wt_rec_image_copy(session, r, val);
- /*
- * FIXME-prepare-support: audit the use of durable timestamps in this file, use both durable
- * timestamps.
- */
__wt_rec_addr_ts_update(r, addr->start_durable_ts, addr->oldest_start_ts,
- addr->oldest_start_txn, addr->newest_stop_ts, addr->newest_stop_txn);
+ addr->oldest_start_txn, addr->stop_durable_ts, addr->newest_stop_ts,
+ addr->newest_stop_txn);
}
return (0);
}
@@ -203,7 +201,7 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
WT_PAGE *child, *page;
WT_REC_KV *val;
WT_REF *ref;
- wt_timestamp_t newest_durable_ts, newest_stop_ts, oldest_start_ts;
+ wt_timestamp_t newest_stop_ts, oldest_start_ts, start_durable_ts, stop_durable_ts;
uint64_t newest_stop_txn, oldest_start_txn;
bool hazard;
@@ -285,16 +283,18 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
val->buf.size = __wt_cell_total_len(vpack);
val->cell_len = 0;
val->len = val->buf.size;
- newest_durable_ts = vpack->newest_stop_durable_ts;
+ start_durable_ts = vpack->newest_start_durable_ts;
oldest_start_ts = vpack->oldest_start_ts;
oldest_start_txn = vpack->oldest_start_txn;
+ stop_durable_ts = vpack->newest_stop_durable_ts;
newest_stop_ts = vpack->newest_stop_ts;
newest_stop_txn = vpack->newest_stop_txn;
} else {
__wt_rec_cell_build_addr(session, r, addr, NULL, false, ref->ref_recno);
- newest_durable_ts = addr->stop_durable_ts;
+ start_durable_ts = addr->start_durable_ts;
oldest_start_ts = addr->oldest_start_ts;
oldest_start_txn = addr->oldest_start_txn;
+ stop_durable_ts = addr->stop_durable_ts;
newest_stop_ts = addr->newest_stop_ts;
newest_stop_txn = addr->newest_stop_txn;
}
@@ -306,8 +306,8 @@ __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
/* Copy the value onto the page. */
__wt_rec_image_copy(session, r, val);
- __wt_rec_addr_ts_update(
- r, newest_durable_ts, oldest_start_ts, oldest_start_txn, newest_stop_ts, newest_stop_txn);
+ __wt_rec_addr_ts_update(r, start_durable_ts, oldest_start_ts, oldest_start_txn,
+ stop_durable_ts, newest_stop_ts, newest_stop_txn);
}
WT_INTL_FOREACH_END;
@@ -515,8 +515,9 @@ __wt_rec_col_fix_slvg(
*/
static int
__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_SALVAGE_COOKIE *salvage,
- WT_ITEM *value, wt_timestamp_t durable_ts, wt_timestamp_t start_ts, uint64_t start_txn,
- wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle, bool deleted, bool overflow_type)
+ WT_ITEM *value, wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle,
+ bool deleted, bool overflow_type)
{
WT_BTREE *btree;
WT_REC_KV *val;
@@ -555,20 +556,21 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_SALVAGE_COOKI
}
if (deleted) {
- val->cell_len =
- __wt_cell_pack_del(session, &val->cell, start_ts, start_txn, stop_ts, stop_txn, rle);
+ val->cell_len = __wt_cell_pack_del(session, &val->cell, start_durable_ts, start_ts,
+ start_txn, stop_durable_ts, stop_ts, stop_txn, rle);
val->buf.data = NULL;
val->buf.size = 0;
val->len = val->cell_len;
} else if (overflow_type) {
- val->cell_len = __wt_cell_pack_ovfl(session, &val->cell, WT_CELL_VALUE_OVFL, start_ts,
- start_txn, stop_ts, stop_txn, rle, value->size);
+ val->cell_len =
+ __wt_cell_pack_ovfl(session, &val->cell, WT_CELL_VALUE_OVFL, start_durable_ts, start_ts,
+ start_txn, stop_durable_ts, stop_ts, stop_txn, rle, value->size);
val->buf.data = value->data;
val->buf.size = value->size;
val->len = val->cell_len + value->size;
} else
- WT_RET(__wt_rec_cell_build_val(
- session, r, value->data, value->size, start_ts, start_txn, stop_ts, stop_txn, rle));
+ WT_RET(__wt_rec_cell_build_val(session, r, value->data, value->size, start_durable_ts,
+ start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, rle));
/* Boundary: split or write the page. */
if (__wt_rec_need_split(r, val->len))
@@ -576,9 +578,11 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_SALVAGE_COOKI
/* Copy the value onto the page. */
if (!deleted && !overflow_type && btree->dictionary)
- WT_RET(__wt_rec_dict_replace(session, r, start_ts, start_txn, stop_ts, stop_txn, rle, val));
+ WT_RET(__wt_rec_dict_replace(session, r, start_durable_ts, start_ts, start_txn,
+ stop_durable_ts, stop_ts, stop_txn, rle, val));
__wt_rec_image_copy(session, r, val);
- __wt_rec_addr_ts_update(r, durable_ts, start_ts, start_txn, stop_ts, stop_txn);
+ __wt_rec_addr_ts_update(
+ r, start_durable_ts, start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn);
/* Update the starting record number in case we split. */
r->recno += rle;
@@ -596,14 +600,15 @@ __wt_rec_col_var(
{
enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state;
struct {
- WT_ITEM *value; /* Value */
- wt_timestamp_t start_ts; /* Timestamps/TxnID */
+ WT_ITEM *value; /* Value */
+ wt_timestamp_t durable_start_ts; /* Timestamps/TxnID */
+ wt_timestamp_t start_ts;
uint64_t start_txn;
+ wt_timestamp_t durable_stop_ts;
wt_timestamp_t stop_ts;
uint64_t stop_txn;
bool deleted; /* If deleted */
} last;
- WT_ADDR *addr;
WT_BTREE *btree;
WT_CELL *cell;
WT_CELL_UNPACK *vpack, _vpack;
@@ -615,7 +620,7 @@ __wt_rec_col_var(
WT_PAGE *page;
WT_UPDATE *upd;
WT_UPDATE_SELECT upd_select;
- wt_timestamp_t durable_ts, newest_durable_ts, start_ts, stop_ts;
+ wt_timestamp_t start_durable_ts, start_ts, stop_durable_ts, stop_ts;
uint64_t n, nrepeat, repeat_count, rle, skip, src_recno;
uint64_t start_txn, stop_txn;
uint32_t i, size;
@@ -632,23 +637,12 @@ __wt_rec_col_var(
cbt = &r->update_modify_cbt;
cbt->iface.session = (WT_SESSION *)session;
- /*
- * Acquire the newest-durable timestamp for this page so we can roll it forward. If it exists,
- * it's in the WT_REF structure or the parent's disk image.
- */
- if ((addr = pageref->addr) == NULL)
- newest_durable_ts = WT_TS_NONE;
- else if (__wt_off_page(pageref->home, addr))
- newest_durable_ts = addr->stop_durable_ts;
- else {
- __wt_cell_unpack(session, pageref->home, pageref->addr, vpack);
- newest_durable_ts = vpack->newest_stop_durable_ts;
- }
-
/* Set the "last" values to cause failure if they're not set. */
last.value = r->last;
+ last.durable_start_ts = WT_TS_MAX;
last.start_ts = WT_TS_MAX;
last.start_txn = WT_TXN_MAX;
+ last.durable_stop_ts = WT_TS_MAX;
last.stop_ts = WT_TS_NONE;
last.stop_txn = WT_TXN_NONE;
last.deleted = false;
@@ -658,9 +652,10 @@ __wt_rec_col_var(
* [-Werror=maybe-uninitialized]
*/
/* NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) */
- durable_ts = WT_TS_NONE;
+ start_durable_ts = WT_TS_NONE;
start_ts = WT_TS_MAX;
start_txn = WT_TXN_MAX;
+ stop_durable_ts = WT_TS_NONE;
stop_ts = WT_TS_NONE;
stop_txn = WT_TS_NONE;
@@ -681,8 +676,10 @@ __wt_rec_col_var(
if (salvage != NULL && salvage->missing != 0) {
if (salvage->skip == 0) {
rle = salvage->missing;
+ last.durable_start_ts = WT_TS_NONE;
last.start_ts = WT_TS_NONE;
last.start_txn = WT_TXN_NONE;
+ last.durable_stop_ts = WT_TS_NONE;
last.stop_ts = WT_TS_MAX;
last.stop_txn = WT_TXN_MAX;
last.deleted = true;
@@ -694,7 +691,7 @@ __wt_rec_col_var(
salvage->take += salvage->missing;
} else
WT_ERR(__rec_col_var_helper(session, r, NULL, NULL, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE,
- WT_TS_MAX, WT_TXN_MAX, salvage->missing, true, false));
+ WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, salvage->missing, true, false));
}
/*
@@ -768,9 +765,10 @@ record_loop:
deleted = false;
if (upd != NULL) {
- durable_ts = upd_select.durable_ts;
+ start_durable_ts = upd_select.start_durable_ts;
start_ts = upd_select.start_ts;
start_txn = upd_select.start_txn;
+ stop_durable_ts = upd_select.stop_durable_ts;
stop_ts = upd_select.stop_ts;
stop_txn = upd_select.stop_txn;
@@ -787,9 +785,10 @@ record_loop:
size = upd->size;
break;
case WT_UPDATE_TOMBSTONE:
- durable_ts = WT_TS_NONE;
+ start_durable_ts = WT_TS_NONE;
start_ts = WT_TS_NONE;
start_txn = WT_TXN_NONE;
+ stop_durable_ts = WT_TS_NONE;
stop_ts = WT_TS_MAX;
stop_txn = WT_TXN_MAX;
deleted = true;
@@ -812,24 +811,21 @@ record_loop:
deleted = orig_deleted;
if (deleted) {
/* Set time pairs for the deleted key. */
- durable_ts = WT_TS_NONE;
+ start_durable_ts = WT_TS_NONE;
start_ts = WT_TS_NONE;
start_txn = WT_TXN_NONE;
+ stop_durable_ts = WT_TS_NONE;
stop_ts = WT_TS_MAX;
stop_txn = WT_TXN_MAX;
goto compare;
}
- /*
- * The key on the old disk image is unchanged. Use time pairs from the cell.
- *
- * FIXME-prepare-support: Currently, we don't store durable_ts in cell, which is a
- * problem we need to solve for prepared transactions.
- */
- durable_ts = newest_durable_ts;
+ /* The key on the old disk image is unchanged. Use time pairs from the cell. */
+ start_durable_ts = vpack->durable_start_ts;
start_ts = vpack->start_ts;
start_txn = vpack->start_txn;
+ stop_durable_ts = vpack->durable_stop_ts;
stop_ts = vpack->stop_ts;
stop_txn = vpack->stop_txn;
@@ -846,16 +842,18 @@ record_loop:
* We're going to copy the on-page cell, write out any record we're tracking.
*/
if (rle != 0) {
- WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, durable_ts,
- last.start_ts, last.start_txn, last.stop_ts, last.stop_txn, rle,
- last.deleted, false));
+ WT_ERR(__rec_col_var_helper(session, r, salvage, last.value,
+ last.durable_start_ts, last.start_ts, last.start_txn,
+ last.durable_stop_ts, last.stop_ts, last.stop_txn, rle, last.deleted,
+ false));
rle = 0;
}
last.value->data = vpack->data;
last.value->size = vpack->size;
- WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, durable_ts,
- start_ts, start_txn, stop_ts, stop_txn, repeat_count, false, true));
+ WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, start_durable_ts,
+ start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, repeat_count, false,
+ true));
/* Track if page has overflow items. */
r->ovfl_items = true;
@@ -890,7 +888,8 @@ compare:
* record number, we've been doing that all along.
*/
if (rle != 0) {
- if ((last.start_ts == start_ts && last.start_txn == start_txn &&
+ if ((last.durable_start_ts == start_durable_ts && last.start_ts == start_ts &&
+ last.start_txn == start_txn && last.durable_stop_ts == stop_durable_ts &&
last.stop_ts == stop_ts && last.stop_txn == stop_txn) &&
((deleted && last.deleted) ||
(!deleted && !last.deleted && last.value->size == size &&
@@ -901,15 +900,17 @@ compare:
* tombstone to write to disk and the deletion of the keys must be globally
* visible.
*/
- WT_ASSERT(session, (!deleted && !last.deleted) ||
- (last.start_ts == WT_TS_NONE && last.start_txn == WT_TXN_NONE &&
- last.stop_ts == WT_TS_MAX && last.stop_txn == WT_TXN_MAX));
+ WT_ASSERT(session,
+ (!deleted && !last.deleted) ||
+ (last.durable_start_ts == WT_TS_NONE && last.start_ts == WT_TS_NONE &&
+ last.start_txn == WT_TXN_NONE && last.durable_stop_ts == WT_TS_NONE &&
+ last.stop_ts == WT_TS_MAX && last.stop_txn == WT_TXN_MAX));
rle += repeat_count;
continue;
}
- WT_ERR(
- __rec_col_var_helper(session, r, salvage, last.value, durable_ts, last.start_ts,
- last.start_txn, last.stop_ts, last.stop_txn, rle, last.deleted, false));
+ WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, last.durable_start_ts,
+ last.start_ts, last.start_txn, last.durable_stop_ts, last.stop_ts, last.stop_txn,
+ rle, last.deleted, false));
}
/*
@@ -937,8 +938,10 @@ compare:
if (upd != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DISK))
__wt_free_update_list(session, &upd);
+ last.durable_start_ts = start_durable_ts;
last.start_ts = start_ts;
last.start_txn = start_txn;
+ last.durable_stop_ts = stop_durable_ts;
last.stop_ts = stop_ts;
last.stop_txn = stop_txn;
last.deleted = deleted;
@@ -1004,8 +1007,9 @@ compare:
* tombstone to write to disk and the deletion of the keys must be globally
* visible.
*/
- WT_ASSERT(session, last.start_ts == WT_TS_NONE &&
- last.start_txn == WT_TXN_NONE && last.stop_ts == WT_TS_MAX &&
+ WT_ASSERT(session, last.durable_start_ts == WT_TS_NONE &&
+ last.start_ts == WT_TS_NONE && last.start_txn == WT_TXN_NONE &&
+ last.durable_stop_ts == WT_TS_NONE && last.stop_ts == WT_TS_MAX &&
last.stop_txn == WT_TXN_MAX);
/*
* The record adjustment is decremented by one so we can naturally fall into the
@@ -1017,26 +1021,29 @@ compare:
src_recno += skip;
} else {
/* Set time pairs for the first deleted key in a deleted range. */
- durable_ts = WT_TS_NONE;
+ start_durable_ts = WT_TS_NONE;
start_ts = WT_TS_NONE;
start_txn = WT_TXN_NONE;
+ stop_durable_ts = WT_TS_NONE;
stop_ts = WT_TS_MAX;
stop_txn = WT_TXN_MAX;
}
} else if (upd == NULL) {
/* The updates on the key are all uncommitted so we write a deleted key to disk. */
- durable_ts = WT_TS_NONE;
+ start_durable_ts = WT_TS_NONE;
start_ts = WT_TS_NONE;
start_txn = WT_TXN_NONE;
+ stop_durable_ts = WT_TS_NONE;
stop_ts = WT_TS_MAX;
stop_txn = WT_TXN_MAX;
deleted = true;
} else {
/* Set time pairs for a key. */
- durable_ts = upd_select.durable_ts;
+ start_durable_ts = upd_select.start_durable_ts;
start_ts = upd_select.start_ts;
start_txn = upd_select.start_txn;
+ stop_durable_ts = upd_select.stop_durable_ts;
stop_ts = upd_select.stop_ts;
stop_txn = upd_select.stop_txn;
@@ -1056,9 +1063,10 @@ compare:
size = upd->size;
break;
case WT_UPDATE_TOMBSTONE:
- durable_ts = WT_TS_NONE;
+ start_durable_ts = WT_TS_NONE;
start_ts = WT_TS_NONE;
start_txn = WT_TXN_NONE;
+ stop_durable_ts = WT_TS_NONE;
stop_ts = WT_TS_MAX;
stop_txn = WT_TXN_MAX;
deleted = true;
@@ -1076,7 +1084,8 @@ compare:
/*
* FIXME-PM-1521: Follow up issue with clang in WT-5341.
*/
- if ((last.start_ts == start_ts && last.start_txn == start_txn &&
+ if ((last.durable_start_ts == start_durable_ts && last.start_ts == start_ts &&
+ last.start_txn == start_txn && last.durable_stop_ts == stop_durable_ts &&
last.stop_ts == stop_ts && last.stop_txn == stop_txn) &&
((deleted && last.deleted) ||
(!deleted && !last.deleted && last.value->size == size &&
@@ -1088,14 +1097,16 @@ compare:
* visible.
*/
WT_ASSERT(session, (!deleted && !last.deleted) ||
- (last.start_ts == WT_TS_NONE && last.start_txn == WT_TXN_NONE &&
+ (last.durable_start_ts == start_durable_ts && last.start_ts == WT_TS_NONE &&
+ last.start_txn == WT_TXN_NONE &&
+ last.durable_stop_ts == stop_durable_ts &&
last.stop_ts == WT_TS_MAX && last.stop_txn == WT_TXN_MAX));
++rle;
goto next;
}
- WT_ERR(
- __rec_col_var_helper(session, r, salvage, last.value, durable_ts, last.start_ts,
- last.start_txn, last.stop_ts, last.stop_txn, rle, last.deleted, false));
+ WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, last.durable_start_ts,
+ last.start_ts, last.start_txn, last.durable_stop_ts, last.stop_ts, last.stop_txn,
+ rle, last.deleted, false));
}
/*
@@ -1118,8 +1129,10 @@ compare:
__wt_free_update_list(session, &upd);
/* Ready for the next loop, reset the RLE counter. */
+ last.durable_start_ts = start_durable_ts;
last.start_ts = start_ts;
last.start_txn = start_txn;
+ last.durable_stop_ts = stop_durable_ts;
last.stop_ts = stop_ts;
last.stop_txn = stop_txn;
last.deleted = deleted;
@@ -1145,8 +1158,9 @@ next:
/* If we were tracking a record, write it. */
if (rle != 0)
- WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, durable_ts, last.start_ts,
- last.start_txn, last.stop_ts, last.stop_txn, rle, last.deleted, false));
+ WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, last.durable_start_ts,
+ last.start_ts, last.start_txn, last.durable_stop_ts, last.stop_ts, last.stop_txn, rle,
+ last.deleted, false));
/* Write the remnant page. */
ret = __wt_rec_split_finish(session, r);
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c
index 443ac43e186..0aba6bf9534 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_row.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c
@@ -80,8 +80,8 @@ __rec_cell_build_int_key(
WT_STAT_DATA_INCR(session, rec_overflow_key_internal);
*is_ovflp = true;
- return (__wt_rec_cell_build_ovfl(
- session, r, key, WT_CELL_KEY_OVFL, WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TXN_NONE, 0));
+ return (__wt_rec_cell_build_ovfl(session, r, key, WT_CELL_KEY_OVFL, WT_TS_NONE, WT_TS_NONE,
+ WT_TXN_NONE, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, 0));
}
key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size);
@@ -173,7 +173,7 @@ __rec_cell_build_leaf_key(
*is_ovflp = true;
return (__wt_rec_cell_build_ovfl(session, r, key, WT_CELL_KEY_OVFL, WT_TS_NONE,
- WT_TXN_NONE, WT_TS_NONE, WT_TXN_NONE, 0));
+ WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, 0));
}
return (__rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp));
}
@@ -206,7 +206,8 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */
cursor->key.data, cursor->key.size, &ovfl_key));
WT_RET(__wt_rec_cell_build_val(session, r, cursor->value.data, /* Build value cell */
- cursor->value.size, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX, 0));
+ cursor->value.size, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX,
+ 0));
/* Boundary: split or write the page. */
if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) {
@@ -229,11 +230,12 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
else {
r->all_empty_value = false;
if (btree->dictionary)
- WT_RET(__wt_rec_dict_replace(
- session, r, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX, 0, val));
+ WT_RET(__wt_rec_dict_replace(session, r, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE,
+ WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX, 0, val));
__wt_rec_image_copy(session, r, val);
}
- __wt_rec_addr_ts_update(r, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX);
+ __wt_rec_addr_ts_update(
+ r, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_NONE, WT_TS_MAX, WT_TXN_MAX);
/* Update compression state. */
__rec_key_state_update(r, ovfl_key);
@@ -277,12 +279,9 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Copy the key and value onto the page. */
__wt_rec_image_copy(session, r, key);
__wt_rec_image_copy(session, r, val);
- /*
- * FIXME-prepare-support: audit the use of durable timestamps in this file, use both durable
- * timestamps.
- */
- __wt_rec_addr_ts_update(r, addr->stop_durable_ts, addr->oldest_start_ts,
- addr->oldest_start_txn, addr->newest_stop_ts, addr->newest_stop_txn);
+ __wt_rec_addr_ts_update(r, addr->start_durable_ts, addr->oldest_start_ts,
+ addr->oldest_start_txn, addr->stop_durable_ts, addr->newest_stop_ts,
+ addr->newest_stop_txn);
/* Update compression state. */
__rec_key_state_update(r, ovfl_key);
@@ -307,7 +306,7 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_PAGE *child;
WT_REC_KV *key, *val;
WT_REF *ref;
- wt_timestamp_t newest_durable_ts, newest_stop_ts, oldest_start_ts;
+ wt_timestamp_t newest_stop_ts, oldest_start_ts, start_durable_ts, stop_durable_ts;
size_t key_overflow_size, size;
uint64_t newest_stop_txn, oldest_start_txn;
bool force, hazard, key_onpage_ovfl, ovfl_key;
@@ -438,9 +437,10 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
if (__wt_off_page(page, addr)) {
__wt_rec_cell_build_addr(session, r, addr, NULL, state == WT_CHILD_PROXY, WT_RECNO_OOB);
- newest_durable_ts = addr->stop_durable_ts;
+ start_durable_ts = addr->start_durable_ts;
oldest_start_ts = addr->oldest_start_ts;
oldest_start_txn = addr->oldest_start_txn;
+ stop_durable_ts = addr->stop_durable_ts;
newest_stop_ts = addr->newest_stop_ts;
newest_stop_txn = addr->newest_stop_txn;
} else {
@@ -463,9 +463,10 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
val->cell_len = 0;
val->len = val->buf.size;
}
- newest_durable_ts = vpack->newest_stop_durable_ts;
+ start_durable_ts = vpack->newest_start_durable_ts;
oldest_start_ts = vpack->oldest_start_ts;
oldest_start_txn = vpack->oldest_start_txn;
+ stop_durable_ts = vpack->newest_stop_durable_ts;
newest_stop_ts = vpack->newest_stop_ts;
newest_stop_txn = vpack->newest_stop_txn;
}
@@ -521,8 +522,8 @@ __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Copy the key and value onto the page. */
__wt_rec_image_copy(session, r, key);
__wt_rec_image_copy(session, r, val);
- __wt_rec_addr_ts_update(
- r, newest_durable_ts, oldest_start_ts, oldest_start_txn, newest_stop_ts, newest_stop_txn);
+ __wt_rec_addr_ts_update(r, start_durable_ts, oldest_start_ts, oldest_start_txn,
+ stop_durable_ts, newest_stop_ts, newest_stop_txn);
/* Update compression state. */
__rec_key_state_update(r, ovfl_key);
@@ -566,7 +567,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
WT_REC_KV *key, *val;
WT_UPDATE *upd;
WT_UPDATE_SELECT upd_select;
- wt_timestamp_t durable_ts, start_ts, stop_ts;
+ wt_timestamp_t start_durable_ts, start_ts, stop_durable_ts, stop_ts;
uint64_t start_txn, stop_txn;
bool ovfl_key;
@@ -585,9 +586,10 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
if ((upd = upd_select.upd) == NULL)
continue;
- durable_ts = upd_select.durable_ts;
+ start_durable_ts = upd_select.start_durable_ts;
start_ts = upd_select.start_ts;
start_txn = upd_select.start_txn;
+ stop_durable_ts = upd_select.stop_durable_ts;
stop_ts = upd_select.stop_ts;
stop_txn = upd_select.stop_txn;
@@ -599,12 +601,12 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
cbt->slot = UINT32_MAX;
WT_RET(__wt_value_return_upd(cbt, upd));
WT_RET(__wt_rec_cell_build_val(session, r, cbt->iface.value.data, cbt->iface.value.size,
- start_ts, start_txn, stop_ts, stop_txn, 0));
+ start_durable_ts, start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, 0));
break;
case WT_UPDATE_STANDARD:
/* Take the value from the update. */
- WT_ERR(__wt_rec_cell_build_val(
- session, r, upd->data, upd->size, start_ts, start_txn, stop_ts, stop_txn, 0));
+ WT_ERR(__wt_rec_cell_build_val(session, r, upd->data, upd->size, start_durable_ts,
+ start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, 0));
break;
case WT_UPDATE_TOMBSTONE:
continue;
@@ -642,11 +644,12 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
else {
r->all_empty_value = false;
if (btree->dictionary)
- WT_ERR(__wt_rec_dict_replace(
- session, r, start_ts, start_txn, stop_ts, stop_txn, 0, val));
+ WT_ERR(__wt_rec_dict_replace(session, r, start_durable_ts, start_ts, start_txn,
+ stop_durable_ts, stop_ts, stop_txn, 0, val));
__wt_rec_image_copy(session, r, val);
}
- __wt_rec_addr_ts_update(r, durable_ts, start_ts, start_txn, stop_ts, stop_txn);
+ __wt_rec_addr_ts_update(
+ r, start_durable_ts, start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn);
/* Update compression state. */
__rec_key_state_update(r, ovfl_key);
@@ -666,7 +669,8 @@ err:
*/
static inline int
__rec_cell_repack(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, WT_CELL_UNPACK *vpack,
- uint64_t start_txn, wt_timestamp_t start_ts, uint64_t stop_txn, wt_timestamp_t stop_ts)
+ wt_timestamp_t start_durable_ts, uint64_t start_txn, wt_timestamp_t start_ts,
+ wt_timestamp_t stop_durable_ts, uint64_t stop_txn, wt_timestamp_t stop_ts)
{
WT_DECL_ITEM(tmpval);
WT_DECL_RET;
@@ -685,7 +689,8 @@ __rec_cell_repack(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, WT
p = tmpval->data;
size = tmpval->size;
}
- WT_ERR(__wt_rec_cell_build_val(session, r, p, size, start_ts, start_txn, stop_ts, stop_txn, 0));
+ WT_ERR(__wt_rec_cell_build_val(session, r, p, size, start_durable_ts, start_ts, start_txn,
+ stop_durable_ts, stop_ts, stop_txn, 0));
err:
__wt_scr_free(session, &tmpval);
@@ -701,7 +706,6 @@ __wt_rec_row_leaf(
WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage)
{
static WT_UPDATE upd_tombstone = {.txnid = WT_TXN_NONE, .type = WT_UPDATE_TOMBSTONE};
- WT_ADDR *addr;
WT_BTREE *btree;
WT_CELL *cell;
WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
@@ -715,7 +719,7 @@ __wt_rec_row_leaf(
WT_ROW *rip;
WT_UPDATE *upd;
WT_UPDATE_SELECT upd_select;
- wt_timestamp_t durable_ts, newest_durable_ts, start_ts, stop_ts;
+ wt_timestamp_t start_durable_ts, start_ts, stop_durable_ts, stop_ts;
uint64_t slvg_skip, start_txn, stop_txn;
uint32_t i;
bool dictionary, key_onpage_ovfl, ovfl_key;
@@ -734,19 +738,6 @@ __wt_rec_row_leaf(
upd = NULL;
- /*
- * Acquire the newest-durable timestamp for this page so we can roll it forward. If it exists,
- * it's in the WT_REF structure or the parent's disk image.
- */
- if ((addr = pageref->addr) == NULL)
- newest_durable_ts = WT_TS_NONE;
- else if (__wt_off_page(pageref->home, addr))
- newest_durable_ts = addr->stop_durable_ts;
- else {
- __wt_cell_unpack(session, pageref->home, pageref->addr, vpack);
- newest_durable_ts = vpack->newest_stop_durable_ts;
- }
-
WT_RET(__wt_rec_split_init(session, r, page, 0, btree->maxleafpage_precomp));
/*
@@ -794,28 +785,17 @@ __wt_rec_row_leaf(
/* Look for an update. */
WT_ERR(__wt_rec_upd_select(session, r, NULL, rip, vpack, &upd_select));
if ((upd = upd_select.upd) != NULL) {
- durable_ts = upd_select.durable_ts;
+ start_durable_ts = upd_select.start_durable_ts;
start_ts = upd_select.start_ts;
start_txn = upd_select.start_txn;
+ stop_durable_ts = upd_select.stop_durable_ts;
stop_ts = upd_select.stop_ts;
stop_txn = upd_select.stop_txn;
} else {
- /*
- * FIXME: Temporary fix until the value cell has the durable timestamp. Currently, value
- * cell doesn't store the information of durable timestamp, so we lose the information
- * of aggregated durable timestamp information when the page is reconciled without
- * writing to the disk (in-memory page re-instantiate). As part of page re-instantiate
- * scenarios, the calculated aggregated durable timestamp gets lost and when the same
- * page gets reconciled again, we don't have any durable timestamp from the cell. Use
- * commit timestamp from the cell also as the durable timestamp instead of setting it to
- * zero until we store the durable timestamp in the cell.
- */
- if (newest_durable_ts != WT_TS_NONE)
- durable_ts = newest_durable_ts;
- else
- durable_ts = vpack->start_ts;
+ start_durable_ts = vpack->durable_start_ts;
start_ts = vpack->start_ts;
start_txn = vpack->start_txn;
+ stop_durable_ts = vpack->durable_stop_ts;
stop_ts = vpack->stop_ts;
stop_txn = vpack->stop_txn;
}
@@ -840,8 +820,8 @@ __wt_rec_row_leaf(
* Repack the cell if we clear the transaction ids in the cell.
*/
if (vpack->raw == WT_CELL_VALUE_COPY) {
- WT_ERR(__rec_cell_repack(
- session, btree, r, vpack, start_txn, start_ts, stop_txn, stop_ts));
+ WT_ERR(__rec_cell_repack(session, btree, r, vpack, start_durable_ts, start_txn,
+ start_ts, stop_durable_ts, stop_txn, stop_ts));
dictionary = true;
} else if (F_ISSET(vpack, WT_CELL_UNPACK_TIME_PAIRS_CLEARED)) {
@@ -856,12 +836,13 @@ __wt_rec_row_leaf(
val->buf.size = vpack->size;
/* Rebuild the cell. */
- val->cell_len = __wt_cell_pack_ovfl(session, &val->cell, vpack->raw, start_ts,
- start_txn, stop_ts, stop_txn, 0, val->buf.size);
+ val->cell_len =
+ __wt_cell_pack_ovfl(session, &val->cell, vpack->raw, start_durable_ts,
+ start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, 0, val->buf.size);
val->len = val->cell_len + val->buf.size;
} else
- WT_ERR(__rec_cell_repack(
- session, btree, r, vpack, start_txn, start_ts, stop_txn, stop_ts));
+ WT_ERR(__rec_cell_repack(session, btree, r, vpack, start_durable_ts, start_txn,
+ start_ts, stop_durable_ts, stop_txn, stop_ts));
dictionary = true;
} else {
@@ -883,14 +864,15 @@ __wt_rec_row_leaf(
case WT_UPDATE_MODIFY:
cbt->slot = WT_ROW_SLOT(page, rip);
WT_ERR(__wt_value_return_upd(cbt, upd));
- WT_ERR(__wt_rec_cell_build_val(session, r, cbt->iface.value.data,
- cbt->iface.value.size, start_ts, start_txn, stop_ts, stop_txn, 0));
+ WT_ERR(
+ __wt_rec_cell_build_val(session, r, cbt->iface.value.data, cbt->iface.value.size,
+ start_durable_ts, start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, 0));
dictionary = true;
break;
case WT_UPDATE_STANDARD:
/* Take the value from the update. */
- WT_ERR(__wt_rec_cell_build_val(
- session, r, upd->data, upd->size, start_ts, start_txn, stop_ts, stop_txn, 0));
+ WT_ERR(__wt_rec_cell_build_val(session, r, upd->data, upd->size, start_durable_ts,
+ start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn, 0));
dictionary = true;
break;
case WT_UPDATE_TOMBSTONE:
@@ -1024,11 +1006,12 @@ build:
else {
r->all_empty_value = false;
if (dictionary && btree->dictionary)
- WT_ERR(__wt_rec_dict_replace(
- session, r, start_ts, start_txn, stop_ts, stop_txn, 0, val));
+ WT_ERR(__wt_rec_dict_replace(session, r, start_durable_ts, start_ts, start_txn,
+ stop_durable_ts, stop_ts, stop_txn, 0, val));
__wt_rec_image_copy(session, r, val);
}
- __wt_rec_addr_ts_update(r, durable_ts, start_ts, start_txn, stop_ts, stop_txn);
+ __wt_rec_addr_ts_update(
+ r, start_durable_ts, start_ts, start_txn, stop_durable_ts, stop_ts, stop_txn);
/* Update compression state. */
__rec_key_state_update(r, ovfl_key);
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index 9d44e893d6d..ed8da3394b3 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -183,9 +183,10 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* both must be initialized.
*/
upd_select->upd = NULL;
- upd_select->durable_ts = WT_TS_NONE;
+ upd_select->start_durable_ts = WT_TS_NONE;
upd_select->start_ts = WT_TS_NONE;
upd_select->start_txn = WT_TXN_NONE;
+ upd_select->stop_durable_ts = WT_TS_NONE;
upd_select->stop_ts = WT_TS_MAX;
upd_select->stop_txn = WT_TXN_MAX;
@@ -194,7 +195,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
upd_memsize = 0;
checkpoint_timestamp = S2C(session)->txn_global.checkpoint_timestamp;
max_ts = WT_TS_NONE;
- tombstone_durable_ts = WT_TS_MAX;
+ tombstone_durable_ts = WT_TS_NONE;
max_txn = WT_TXN_NONE;
has_newer_updates = upd_saved = false;
is_hs_page = F_ISSET(S2BT(session), WT_BTREE_HS);
@@ -359,15 +360,15 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
}
if (upd != NULL) {
/* The beginning of the validity window is the selected update's time pair. */
- upd_select->durable_ts = upd_select->start_ts = upd->start_ts;
+ upd_select->start_durable_ts = upd_select->start_ts = upd->start_ts;
/* If durable timestamp is provided, use it. */
if (upd->durable_ts != WT_TS_NONE)
- upd_select->durable_ts = upd->durable_ts;
+ upd_select->start_durable_ts = upd->durable_ts;
upd_select->start_txn = upd->txnid;
/* Use the tombstone durable timestamp as the overall durable timestamp if it exists. */
- if (tombstone_durable_ts != WT_TS_MAX)
- upd_select->durable_ts = tombstone_durable_ts;
+ if (tombstone_durable_ts != WT_TS_NONE)
+ upd_select->stop_durable_ts = tombstone_durable_ts;
} else if (upd_select->stop_ts != WT_TS_NONE || upd_select->stop_txn != WT_TXN_NONE) {
/* If we only have a tombstone in the update list, we must have an ondisk value. */
WT_ASSERT(session, vpack != NULL);
@@ -381,12 +382,12 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* keep the same on-disk value but set the stop time pair to indicate that the validity
* window ends when this tombstone started.
*/
- upd_select->durable_ts = upd_select->start_ts = vpack->start_ts;
+ upd_select->start_durable_ts = upd_select->start_ts = vpack->start_ts;
upd_select->start_txn = vpack->start_txn;
/* Use the tombstone durable timestamp as the overall durable timestamp if it exists. */
- if (tombstone_durable_ts != WT_TS_MAX)
- upd_select->durable_ts = tombstone_durable_ts;
+ if (tombstone_durable_ts != WT_TS_NONE)
+ upd_select->stop_durable_ts = tombstone_durable_ts;
/*
* Leaving the update unset means that we can skip reconciling. If we've set the stop
@@ -432,6 +433,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
"Warning: fixing out-of-order timestamps remove at %s earlier than value at %s",
__wt_timestamp_to_string(upd_select->stop_ts, ts_string[0]),
__wt_timestamp_to_string(upd_select->start_ts, ts_string[1]));
+ upd_select->start_durable_ts = upd_select->stop_durable_ts;
upd_select->start_ts = upd_select->stop_ts;
upd_select->start_txn = upd_select->stop_txn;
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index d06692c71a2..3619f69456f 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -844,15 +844,17 @@ __rec_split_chunk_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK *
/* Don't touch the key item memory, that memory is reused. */
chunk->key.size = 0;
chunk->entries = 0;
- __wt_rec_addr_ts_init(r, &chunk->newest_durable_ts, &chunk->oldest_start_ts,
- &chunk->oldest_start_txn, &chunk->newest_stop_ts, &chunk->newest_stop_txn);
+ __wt_rec_addr_ts_init(r, &chunk->start_durable_ts, &chunk->oldest_start_ts,
+ &chunk->oldest_start_txn, &chunk->stop_durable_ts, &chunk->newest_stop_ts,
+ &chunk->newest_stop_txn);
chunk->min_recno = WT_RECNO_OOB;
/* Don't touch the key item memory, that memory is reused. */
chunk->min_key.size = 0;
chunk->min_entries = 0;
- __wt_rec_addr_ts_init(r, &chunk->min_newest_durable_ts, &chunk->min_oldest_start_ts,
- &chunk->min_oldest_start_txn, &chunk->min_newest_stop_ts, &chunk->min_newest_stop_txn);
+ __wt_rec_addr_ts_init(r, &chunk->min_start_durable_ts, &chunk->min_oldest_start_ts,
+ &chunk->min_oldest_start_txn, &chunk->min_stop_durable_ts, &chunk->min_newest_stop_ts,
+ &chunk->min_newest_stop_txn);
chunk->min_offset = 0;
/*
@@ -1270,9 +1272,10 @@ __wt_rec_split_crossing_bnd(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t ne
r->cur_ptr->min_recno = r->recno;
if (S2BT(session)->type == BTREE_ROW)
WT_RET(__rec_split_row_promote(session, r, &r->cur_ptr->min_key, r->page->type));
- r->cur_ptr->min_newest_durable_ts = r->cur_ptr->newest_durable_ts;
+ r->cur_ptr->min_start_durable_ts = r->cur_ptr->start_durable_ts;
r->cur_ptr->min_oldest_start_ts = r->cur_ptr->oldest_start_ts;
r->cur_ptr->min_oldest_start_txn = r->cur_ptr->oldest_start_txn;
+ r->cur_ptr->min_stop_durable_ts = r->cur_ptr->stop_durable_ts;
r->cur_ptr->min_newest_stop_ts = r->cur_ptr->newest_stop_ts;
r->cur_ptr->min_newest_stop_txn = r->cur_ptr->newest_stop_txn;
@@ -1325,10 +1328,10 @@ __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* boundaries and create a single chunk.
*/
prev_ptr->entries += cur_ptr->entries;
- prev_ptr->newest_durable_ts =
- WT_MAX(prev_ptr->newest_durable_ts, cur_ptr->newest_durable_ts);
+ prev_ptr->start_durable_ts = WT_MAX(prev_ptr->start_durable_ts, cur_ptr->start_durable_ts);
prev_ptr->oldest_start_ts = WT_MIN(prev_ptr->oldest_start_ts, cur_ptr->oldest_start_ts);
prev_ptr->oldest_start_txn = WT_MIN(prev_ptr->oldest_start_txn, cur_ptr->oldest_start_txn);
+ prev_ptr->stop_durable_ts = WT_MAX(prev_ptr->stop_durable_ts, cur_ptr->stop_durable_ts);
prev_ptr->newest_stop_ts = WT_MAX(prev_ptr->newest_stop_ts, cur_ptr->newest_stop_ts);
prev_ptr->newest_stop_txn = WT_MAX(prev_ptr->newest_stop_txn, cur_ptr->newest_stop_txn);
dsk = r->cur_ptr->image.mem;
@@ -1373,18 +1376,19 @@ __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r)
cur_ptr->recno = prev_ptr->min_recno;
WT_RET(
__wt_buf_set(session, &cur_ptr->key, prev_ptr->min_key.data, prev_ptr->min_key.size));
- cur_ptr->newest_durable_ts =
- WT_MAX(prev_ptr->newest_durable_ts, cur_ptr->newest_durable_ts);
+ cur_ptr->start_durable_ts = WT_MAX(prev_ptr->start_durable_ts, cur_ptr->start_durable_ts);
cur_ptr->oldest_start_ts = WT_MIN(prev_ptr->oldest_start_ts, cur_ptr->oldest_start_ts);
cur_ptr->oldest_start_txn = WT_MIN(prev_ptr->oldest_start_txn, cur_ptr->oldest_start_txn);
+ cur_ptr->stop_durable_ts = WT_MAX(prev_ptr->stop_durable_ts, cur_ptr->stop_durable_ts);
cur_ptr->newest_stop_ts = WT_MAX(prev_ptr->newest_stop_ts, cur_ptr->newest_stop_ts);
cur_ptr->newest_stop_txn = WT_MAX(prev_ptr->newest_stop_txn, cur_ptr->newest_stop_txn);
cur_ptr->image.size += len_to_move;
prev_ptr->entries = prev_ptr->min_entries;
- prev_ptr->newest_durable_ts = prev_ptr->min_newest_durable_ts;
+ prev_ptr->start_durable_ts = prev_ptr->min_start_durable_ts;
prev_ptr->oldest_start_ts = prev_ptr->min_oldest_start_ts;
prev_ptr->oldest_start_txn = prev_ptr->min_oldest_start_txn;
+ prev_ptr->stop_durable_ts = prev_ptr->min_stop_durable_ts;
prev_ptr->newest_stop_ts = prev_ptr->min_newest_stop_ts;
prev_ptr->newest_stop_txn = prev_ptr->min_newest_stop_txn;
prev_ptr->image.size -= len_to_move;
@@ -1760,14 +1764,11 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK *chunk
WT_RET(__wt_realloc_def(session, &r->multi_allocated, r->multi_next + 1, &r->multi));
multi = &r->multi[r->multi_next++];
- /*
- * FIXME-prepare-support: audit the use of durable timestamps in this file, use both durable
- * timestamps.
- */
/* Initialize the address (set the addr type for the parent). */
- multi->addr.stop_durable_ts = chunk->newest_durable_ts;
+ multi->addr.start_durable_ts = chunk->start_durable_ts;
multi->addr.oldest_start_ts = chunk->oldest_start_ts;
multi->addr.oldest_start_txn = chunk->oldest_start_txn;
+ multi->addr.stop_durable_ts = chunk->stop_durable_ts;
multi->addr.newest_stop_ts = chunk->newest_stop_ts;
multi->addr.newest_stop_txn = chunk->newest_stop_txn;
@@ -2164,7 +2165,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
ref = r->ref;
if (__wt_ref_is_root(ref)) {
__wt_checkpoint_tree_reconcile_update(
- session, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX);
+ session, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX);
WT_RET(bm->checkpoint(bm, session, NULL, btree->ckpt, false));
}
@@ -2203,9 +2204,10 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
mod->mod_disk_image = r->multi->disk_image;
r->multi->disk_image = NULL;
} else {
- __wt_checkpoint_tree_reconcile_update(session, r->multi->addr.stop_durable_ts,
+ __wt_checkpoint_tree_reconcile_update(session, r->multi->addr.start_durable_ts,
r->multi->addr.oldest_start_ts, r->multi->addr.oldest_start_txn,
- r->multi->addr.newest_stop_ts, r->multi->addr.newest_stop_txn);
+ r->multi->addr.stop_durable_ts, r->multi->addr.newest_stop_ts,
+ r->multi->addr.newest_stop_txn);
WT_RET(__wt_bt_write(session, r->wrapup_checkpoint, NULL, NULL, NULL, true,
F_ISSET(r, WT_REC_CHECKPOINT), r->wrapup_checkpoint_compressed));
}
@@ -2326,8 +2328,8 @@ err:
*/
int
__wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv, uint8_t type,
- wt_timestamp_t start_ts, uint64_t start_txn, wt_timestamp_t stop_ts, uint64_t stop_txn,
- uint64_t rle)
+ wt_timestamp_t start_durable_ts, wt_timestamp_t start_ts, uint64_t start_txn,
+ wt_timestamp_t stop_durable_ts, wt_timestamp_t stop_ts, uint64_t stop_txn, uint64_t rle)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -2382,8 +2384,8 @@ __wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *k
WT_ERR(__wt_buf_set(session, &kv->buf, addr, size));
/* Build the cell and return. */
- kv->cell_len = __wt_cell_pack_ovfl(
- session, &kv->cell, type, start_ts, start_txn, stop_ts, stop_txn, rle, kv->buf.size);
+ kv->cell_len = __wt_cell_pack_ovfl(session, &kv->cell, type, start_durable_ts, start_ts,
+ start_txn, stop_durable_ts, stop_ts, stop_txn, rle, kv->buf.size);
kv->len = kv->cell_len + kv->buf.size;
err:
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index d180ed076b7..d5e6a4071fe 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -1602,6 +1602,7 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char *config, const cha
WT_DECL_RET;
WT_SESSION *wt_session;
WT_SESSION_IMPL *s;
+ char ts_string[WT_TS_INT_STRING_SIZE];
const char *ckpt_cfg;
conn = S2C(session);
@@ -1624,8 +1625,12 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char *config, const cha
* Perform rollback to stable to ensure that the stable version is written to disk on a
* clean shutdown.
*/
- if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP))
+ if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP)) {
+ __wt_verbose(session, WT_VERB_RTS,
+ "Performing shutdown rollback to stable with stable timestamp: %s",
+ __wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string));
WT_TRET(__wt_rollback_to_stable(session, cfg, true));
+ }
s = NULL;
WT_TRET(__wt_open_internal_session(conn, "close_ckpt", true, 0, &s));
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 00d68c614eb..6570f9c16bd 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -1498,9 +1498,9 @@ __checkpoint_mark_skip(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force)
* Update a checkpoint based on reconciliation results.
*/
void
-__wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, wt_timestamp_t newest_durable_ts,
- wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t newest_stop_ts,
- uint64_t newest_stop_txn)
+__wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, wt_timestamp_t start_durable_ts,
+ wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t stop_durable_ts,
+ wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn)
{
WT_BTREE *btree;
WT_CKPT *ckpt, *ckptbase;
@@ -1516,9 +1516,10 @@ __wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, wt_timestamp_t n
WT_CKPT_FOREACH (ckptbase, ckpt)
if (F_ISSET(ckpt, WT_CKPT_ADD)) {
ckpt->write_gen = btree->write_gen;
- ckpt->newest_durable_ts = newest_durable_ts;
+ ckpt->start_durable_ts = start_durable_ts;
ckpt->oldest_start_ts = oldest_start_ts;
ckpt->oldest_start_txn = oldest_start_txn;
+ ckpt->stop_durable_ts = stop_durable_ts;
ckpt->newest_stop_ts = newest_stop_ts;
ckpt->newest_stop_txn = newest_stop_txn;
}
@@ -1567,7 +1568,7 @@ __checkpoint_tree(WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[
*/
if (is_checkpoint && btree->original) {
__wt_checkpoint_tree_reconcile_update(
- session, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX);
+ session, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX);
fake_ckpt = true;
goto fake;
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index e3ca72e3b43..70ca3553973 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -523,6 +523,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
WT_RECOVERY r;
WT_RECOVERY_FILE *metafile;
char *config;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
bool do_checkpoint, eviction_started, hs_exists, needs_rec, was_backup;
conn = S2C(session);
@@ -548,6 +549,19 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
metafile->c = metac;
/*
+ * We should check whether the history store file exists or not. If it does not, then we
+ * should not apply rollback to stable to each table. This might happen if we're upgrading from
+ * an older version.
+ */
+ metac->set_key(metac, WT_HS_URI);
+ ret = metac->search(metac);
+ if (ret == WT_NOTFOUND)
+ hs_exists = false;
+ WT_ERR_NOTFOUND_OK(ret);
+ /* Unpin the page from cache. */
+ WT_ERR(metac->reset(metac));
+
+ /*
* If no log was found (including if logging is disabled), or if the last checkpoint was done
* with logging disabled, recovery should not run. Scan the metadata to figure out the largest
* file ID.
@@ -630,17 +644,6 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
WT_NOT_READ(metafile, NULL);
/*
- * While we have the metadata cursor open, we should check whether the history store file exists
- * or not. If it does not, then we should not apply rollback to stable to each table. This might
- * happen if we're upgrading from an older version.
- */
- metac->set_key(metac, WT_HS_URI);
- ret = metac->search(metac);
- if (ret == WT_NOTFOUND)
- hs_exists = false;
- WT_ERR_NOTFOUND_OK(ret);
-
- /*
* We no longer need the metadata cursor: close it to avoid pinning any resources that could
* block eviction during recovery.
*/
@@ -746,6 +749,11 @@ done:
*/
conn->txn_global.oldest_timestamp = WT_TS_NONE;
conn->txn_global.has_oldest_timestamp = true;
+ __wt_verbose(session, WT_VERB_RTS,
+ "Performing recovery rollback_to_stable with stable timestamp: %s and oldest timestamp: "
+ "%s",
+ __wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string[0]),
+ __wt_timestamp_to_string(conn->txn_global.oldest_timestamp, ts_string[1]));
WT_ERR(__wt_rollback_to_stable(session, NULL, false));
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 28f8dcac8ba..052ead85833 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -18,6 +18,7 @@ __rollback_abort_newer_update(
WT_SESSION_IMPL *session, WT_UPDATE *first_upd, wt_timestamp_t rollback_timestamp)
{
WT_UPDATE *upd;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
for (upd = first_upd; upd != NULL; upd = upd->next) {
/*
@@ -40,6 +41,12 @@ __rollback_abort_newer_update(
upd == first_upd);
first_upd = upd->next;
+ __wt_verbose(session, WT_VERB_RTS,
+ "%s: Rollback to stable update aborted with durable timestamp: %s and stable "
+ "timestamp: %s",
+ S2BT(session)->dhandle->name, __wt_timestamp_to_string(upd->durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]));
+
upd->txnid = WT_TXN_ABORTED;
WT_STAT_CONN_INCR(session, txn_rts_upd_aborted);
upd->durable_ts = upd->start_ts = WT_TS_NONE;
@@ -145,6 +152,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
uint32_t hs_btree_id, session_flags;
uint8_t type;
int cmp;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
bool is_owner, valid_update_found;
hs_cursor = NULL;
@@ -229,15 +237,29 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
* the current version stop timestamp. Also it confirms that history store doesn't contains
* any newer version than the current version for the key.
*/
- if (hs_stop_ts <= rollback_timestamp)
+ if (hs_stop_ts <= rollback_timestamp) {
+ __wt_verbose(session, WT_VERB_RTS,
+ "%s :History store update valid with stop timestamp: %s and stable timestamp: %s",
+ S2BT(session)->dhandle->name, __wt_timestamp_to_string(hs_stop_ts, ts_string[0]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]));
break;
+ }
/* Stop processing when we find a stable update according to the given timestamp. */
if (durable_ts <= rollback_timestamp) {
+ __wt_verbose(session, WT_VERB_RTS,
+ "%s :History store update valid with durable timestamp: %s and stable timestamp: %s",
+ S2BT(session)->dhandle->name, __wt_timestamp_to_string(durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]));
valid_update_found = true;
break;
}
+ __wt_verbose(session, WT_VERB_RTS,
+ "%s :History store update aborted with durable timestamp: %s and stable timestamp: %s",
+ S2BT(session)->dhandle->name, __wt_timestamp_to_string(durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]));
+
newer_hs_ts = hs_start_ts;
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd));
WT_ERR(__wt_hs_modify(cbt, hs_upd));
@@ -257,8 +279,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
upd->durable_ts = durable_ts;
upd->start_ts = hs_start_ts;
__wt_verbose(session, WT_VERB_RTS, "Update restored from history store (txnid: %" PRIu64
- ", start_ts: %" PRIu64 ", durable_ts: %" PRIu64 ")",
- upd->txnid, upd->start_ts, upd->durable_ts);
+ ", start_ts: %s, durable_ts: %s",
+ upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]),
+ __wt_timestamp_to_string(upd->durable_ts, ts_string[1]));
/*
* Set the flag to indicate that this update has been restored from history store for
@@ -308,13 +331,22 @@ __rollback_abort_row_ondisk_kv(
WT_ITEM buf;
WT_UPDATE *upd;
size_t size;
+ char ts_string[3][WT_TS_INT_STRING_SIZE];
vpack = &_vpack;
upd = NULL;
__wt_row_leaf_value_cell(session, page, rip, NULL, vpack);
- if (vpack->start_ts > rollback_timestamp)
+ if (vpack->durable_start_ts > rollback_timestamp) {
+ __wt_verbose(session, WT_VERB_RTS,
+ "%s: On-disk update aborted with start durable timestamp: %s, commit timestamp: %s and "
+ "stable timestamp: %s",
+ S2BT(session)->dhandle->name,
+ __wt_timestamp_to_string(vpack->durable_start_ts, ts_string[0]),
+ __wt_timestamp_to_string(vpack->start_ts, ts_string[1]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[2]));
return (__rollback_row_ondisk_fixup_key(session, page, rip, rollback_timestamp, true));
- else if (vpack->stop_ts != WT_TS_MAX && vpack->stop_ts > rollback_timestamp) {
+ } else if (vpack->durable_stop_ts != WT_TS_NONE &&
+ vpack->durable_stop_ts > rollback_timestamp) {
/*
* Clear the remove operation from the key by inserting the original on-disk value as a
* standard update.
@@ -331,12 +363,13 @@ __rollback_abort_row_ondisk_kv(
WT_RET(__wt_update_alloc(session, &buf, &upd, &size, WT_UPDATE_STANDARD));
upd->txnid = vpack->start_txn;
- upd->durable_ts = vpack->start_ts;
+ upd->durable_ts = vpack->durable_start_ts;
upd->start_ts = vpack->start_ts;
WT_STAT_CONN_INCR(session, txn_rts_keys_restored);
__wt_verbose(session, WT_VERB_RTS,
- "Key restored (txnid: %" PRIu64 ", start_ts: %" PRIu64 ", durable_ts: %" PRIu64 ")",
- upd->txnid, upd->start_ts, upd->durable_ts);
+ "Key restored (txnid: %" PRIu64 ", start_ts: %s, durable_ts: %s", upd->txnid,
+ __wt_timestamp_to_string(upd->start_ts, ts_string[0]),
+ __wt_timestamp_to_string(upd->durable_ts, ts_string[1]));
} else
/* Stable version according to the timestamp. */
return (0);
@@ -447,16 +480,22 @@ __rollback_abort_row_reconciled_page(
WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
uint32_t multi_entry;
+ char ts_string[3][WT_TS_INT_STRING_SIZE];
if ((mod = page->modify) == NULL)
return (0);
- /*
- * FIXME-prepare-support: audit the use of durable timestamps in this file, use both durable
- * timestamps.
- */
if (mod->rec_result == WT_PM_REC_REPLACE &&
- mod->mod_replace.stop_durable_ts > rollback_timestamp) {
+ (mod->mod_replace.start_durable_ts > rollback_timestamp ||
+ mod->mod_replace.stop_durable_ts > rollback_timestamp)) {
+ __wt_verbose(session, WT_VERB_RTS,
+ "%s: Reconciled replace block page history store update removal On-disk with start "
+ "durable timestamp: %s, stop durable timestamp: %s and stable timestamp: %s",
+ S2BT(session)->dhandle->name,
+ __wt_timestamp_to_string(mod->mod_replace.start_durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(mod->mod_replace.stop_durable_ts, ts_string[1]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[2]));
+
WT_RET(__rollback_abort_row_reconciled_page_internal(session, mod->u1.r.disk_image,
mod->u1.r.replace.addr, mod->u1.r.replace.size, rollback_timestamp));
@@ -469,7 +508,17 @@ __rollback_abort_row_reconciled_page(
} else if (mod->rec_result == WT_PM_REC_MULTIBLOCK) {
for (multi = mod->mod_multi, multi_entry = 0; multi_entry < mod->mod_multi_entries;
++multi, ++multi_entry)
- if (multi->addr.stop_durable_ts > rollback_timestamp) {
+ if (multi->addr.start_durable_ts > rollback_timestamp ||
+ multi->addr.stop_durable_ts > rollback_timestamp) {
+ __wt_verbose(session, WT_VERB_RTS,
+ "%s: Reconciled multi block page history store update removal On-disk with "
+ "start durable timestamp: %s, stop durable timestamp: %s and stable "
+ "timestamp: %s",
+ S2BT(session)->dhandle->name,
+ __wt_timestamp_to_string(multi->addr.start_durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(multi->addr.stop_durable_ts, ts_string[1]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[2]));
+
WT_RET(__rollback_abort_row_reconciled_page_internal(session, multi->disk_image,
multi->addr.addr, multi->addr.size, rollback_timestamp));
@@ -543,11 +592,17 @@ __rollback_page_needs_abort(
WT_CELL_UNPACK vpack;
WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
- wt_timestamp_t multi_newest_durable_ts;
+ wt_timestamp_t durable_ts;
uint32_t i;
+ char ts_string[WT_TS_INT_STRING_SIZE];
+ const char *tag;
+ bool result;
addr = ref->addr;
mod = ref->page == NULL ? NULL : ref->page->modify;
+ durable_ts = WT_TS_NONE;
+ tag = "undefined state";
+ result = false;
/*
* The rollback operation should be performed on this page when any one of the following is
@@ -557,22 +612,34 @@ __rollback_page_needs_abort(
* 3. The on page address max durable timestamp.
* 4. The off page address max durable timestamp.
*/
- if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE)
- return (mod->mod_replace.stop_durable_ts > rollback_timestamp);
- else if (mod != NULL && mod->rec_result == WT_PM_REC_MULTIBLOCK) {
- multi_newest_durable_ts = WT_TS_NONE;
+ if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) {
+ tag = "reconciled replace block";
+ durable_ts = WT_MAX(mod->mod_replace.start_durable_ts, mod->mod_replace.stop_durable_ts);
+ result = (durable_ts > rollback_timestamp);
+ } else if (mod != NULL && mod->rec_result == WT_PM_REC_MULTIBLOCK) {
+ tag = "reconciled multi block";
/* Calculate the max durable timestamp by traversing all multi addresses. */
- for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i)
- multi_newest_durable_ts = WT_MAX(multi_newest_durable_ts, multi->addr.stop_durable_ts);
- return (multi_newest_durable_ts > rollback_timestamp);
+ for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ durable_ts = WT_MAX(durable_ts, multi->addr.start_durable_ts);
+ durable_ts = WT_MAX(durable_ts, multi->addr.stop_durable_ts);
+ }
+ result = (durable_ts > rollback_timestamp);
} else if (!__wt_off_page(ref->home, addr)) {
+ tag = "on page cell";
/* Check if the page is obsolete using the page disk address. */
__wt_cell_unpack(session, ref->home, (WT_CELL *)addr, &vpack);
- return (vpack.newest_stop_durable_ts > rollback_timestamp);
- } else if (addr != NULL)
- return (addr->stop_durable_ts > rollback_timestamp);
+ durable_ts = WT_MAX(vpack.newest_start_durable_ts, vpack.newest_stop_durable_ts);
+ result = (durable_ts > rollback_timestamp);
+ } else if (addr != NULL) {
+ tag = "address";
+ durable_ts = WT_MAX(addr->start_durable_ts, addr->stop_durable_ts);
+ result = (durable_ts > rollback_timestamp);
+ }
+
+ __wt_verbose(session, WT_VERB_RTS, "%p: page with %s durable timestamp: %s", (void *)ref, tag,
+ __wt_timestamp_to_string(durable_ts, ts_string));
- return (false);
+ return (result);
}
#ifdef HAVE_DIAGNOSTIC
@@ -647,7 +714,8 @@ __rollback_abort_newer_updates(
page = ref->page;
}
WT_STAT_CONN_INCR(session, txn_rts_pages_visited);
- __wt_verbose(session, WT_VERB_RTS, "%p: page rolled back", (void *)ref);
+ __wt_verbose(session, WT_VERB_RTS, "%p: page rolled back when page is modified: %s",
+ (void *)ref, __wt_page_is_modified(page) ? "true" : "false");
switch (page->type) {
case WT_PAGE_COL_FIX:
@@ -730,6 +798,11 @@ __rollback_to_stable_btree(WT_SESSION_IMPL *session, wt_timestamp_t rollback_tim
btree = S2BT(session);
conn = S2C(session);
+ __wt_verbose(session, WT_VERB_RTS,
+ "%s: Rollback to stable connection logging enabled: %s and btree logging enabled: %s",
+ btree->dhandle->name, FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ? "true" : "false",
+ !F_ISSET(btree, WT_BTREE_NO_LOGGING) ? "true" : "false");
+
/*
* Immediately durable files don't get their commits wiped. This case mostly exists to support
* the semantic required for the oplog in MongoDB - updates that have been made to the oplog
@@ -882,7 +955,8 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
WT_CURSOR *cursor;
WT_DECL_RET;
WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t newest_durable_ts, rollback_timestamp;
+ wt_timestamp_t max_durable_ts, start_durable_ts, stop_durable_ts, rollback_timestamp;
+ char ts_string[WT_TS_INT_STRING_SIZE];
const char *config, *uri;
bool durable_ts_found;
@@ -894,6 +968,8 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
* without a lock would violate protocol.
*/
WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp);
+ __wt_verbose(session, WT_VERB_RTS, "Performing rollback to stable with stable timestamp: %s",
+ __wt_timestamp_to_string(rollback_timestamp, ts_string));
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
WT_RET(__wt_metadata_cursor(session, &cursor));
@@ -911,19 +987,25 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
WT_ERR(cursor->get_value(cursor, &config));
/* Find out the max durable timestamp of the object from checkpoint. */
- newest_durable_ts = WT_TS_NONE;
+ start_durable_ts = stop_durable_ts = WT_TS_NONE;
durable_ts_found = false;
WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval));
__wt_config_subinit(session, &ckptconf, &cval);
for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) {
- ret = __wt_config_subgets(session, &cval, "newest_durable_ts", &durableval);
+ ret = __wt_config_subgets(session, &cval, "start_durable_ts", &durableval);
if (ret == 0) {
- newest_durable_ts = WT_MAX(newest_durable_ts, (wt_timestamp_t)durableval.val);
+ start_durable_ts = WT_MAX(start_durable_ts, (wt_timestamp_t)durableval.val);
+ durable_ts_found = true;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ ret = __wt_config_subgets(session, &cval, "stop_durable_ts", &durableval);
+ if (ret == 0) {
+ stop_durable_ts = WT_MAX(stop_durable_ts, (wt_timestamp_t)durableval.val);
durable_ts_found = true;
}
WT_ERR_NOTFOUND_OK(ret);
}
-
+ max_durable_ts = WT_MAX(start_durable_ts, stop_durable_ts);
ret = __wt_session_get_dhandle(session, uri, NULL, NULL, 0);
/* Ignore performing rollback to stable on files that don't exist. */
if (ret == ENOENT) {
@@ -936,18 +1018,22 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
/*
* The rollback operation should be performed on this file based on the following:
* 1. The tree is modified.
- * 2. The checkpoint durable timestamp is greater than the rollback timestamp.
+ * 2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp.
* 3. There is no durable timestamp in any checkpoint.
*/
- if (S2BT(session)->modified || newest_durable_ts > rollback_timestamp ||
- !durable_ts_found) {
- __wt_verbose(session, WT_VERB_RTS, "%s: file rolled back", uri);
+ if (S2BT(session)->modified || max_durable_ts > rollback_timestamp || !durable_ts_found) {
+ __wt_verbose(session, WT_VERB_RTS,
+ "%s: file rolled back with durable timestamp: %s, or when tree is modified: %s or "
+ "when durable time is not found: %s",
+ uri, __wt_timestamp_to_string(max_durable_ts, ts_string),
+ S2BT(session)->modified ? "true" : "false", !durable_ts_found ? "true" : "false");
WT_TRET(__rollback_to_stable_btree(session, rollback_timestamp));
} else
- __wt_verbose(session, WT_VERB_RTS, "%s: file skipped", uri);
+ __wt_verbose(session, WT_VERB_RTS, "%s: file skipped with durable timestamp: %s", uri,
+ __wt_timestamp_to_string(max_durable_ts, ts_string));
/* Cleanup any history store entries for this non-timestamped table. */
- if (newest_durable_ts == WT_TS_NONE && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) {
+ if (max_durable_ts == WT_TS_NONE && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) {
__wt_verbose(
session, WT_VERB_RTS, "%s: non-timestamped file history store cleanup", uri);
WT_TRET(__rollback_to_stable_btree_hs_cleanup(session, S2BT(session)->id));
diff --git a/src/third_party/wiredtiger/test/format/backup.c b/src/third_party/wiredtiger/test/format/backup.c
index 46b6e7b879c..f0123418888 100644
--- a/src/third_party/wiredtiger/test/format/backup.c
+++ b/src/third_party/wiredtiger/test/format/backup.c
@@ -327,8 +327,8 @@ backup(void *arg)
WT_CURSOR *backup_cursor;
WT_DECL_RET;
WT_SESSION *session;
- uint32_t src_id, src_prev;
u_int incremental, period;
+ uint32_t src_id;
const char *config, *key;
char cfg[512];
bool full, incr_full;
@@ -382,28 +382,11 @@ backup(void *arg)
full = true;
incr_full = false;
} else {
- /*
- * 75% of the time, use the most recent source id. 25% of the time, use the id
- * that is from two incremental backups prior. The handling of the active files for
- * the source one or two incrementals prior is unpleasant but necessary.
- */
- src_prev = mmrand(NULL, 1, 4) == 2 && g.backup_id >= 2 && full == false ? 2 : 1;
- if (src_prev == 2) {
- /*
- * If we're going back two incrementals ago, set active_prev to the other list
- * of active files (i.e. the active list that is not the immediate previous
- * list) and overwrite active_prev with the current one.
- */
- active_now = active_prev;
- if (active_prev == &active[0])
- active_prev = &active[1];
- else
- active_prev = &active[0];
- } else if (active_prev == &active[0])
+ if (active_prev == &active[0])
active_now = &active[1];
else
active_now = &active[0];
- src_id = g.backup_id - src_prev;
+ src_id = g.backup_id - 1;
testutil_check(__wt_snprintf(cfg, sizeof(cfg),
"incremental=(enabled,src_id=ID%u,this_id=ID%" PRIu32 ")", src_id,
g.backup_id++));
@@ -476,7 +459,7 @@ backup(void *arg)
* more incremental backups).
*/
if (full)
- incremental = g.c_logging_archive ? 1 : mmrand(NULL, 1, 5);
+ incremental = g.c_logging_archive ? 1 : mmrand(NULL, 1, 8);
if (--incremental == 0) {
check_copy();
/* We ran recovery in the backup directory, so next time it must be a full backup. */
diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c
index 8b7ba07fe9e..67e2821f52d 100644
--- a/src/third_party/wiredtiger/test/format/t.c
+++ b/src/third_party/wiredtiger/test/format/t.c
@@ -116,8 +116,9 @@ main(int argc, char *argv[])
{
uint64_t now, start;
u_int ops_seconds;
- int ch, onerun, reps;
+ int ch, reps;
const char *config, *home;
+ bool one_flag, quiet_flag;
custom_die = format_die; /* Local death handler. */
@@ -148,16 +149,13 @@ main(int argc, char *argv[])
(void)setenv("MALLOC_OPTIONS", "AJ", 1);
#endif
- /* Track progress unless we're re-directing output to a file. */
- g.c_quiet = isatty(1) ? 0 : 1;
-
/* Set values from the command line. */
home = NULL;
- onerun = 0;
+ one_flag = quiet_flag = false;
while ((ch = __wt_getopt(progname, argc, argv, "1C:c:h:lqrt:")) != EOF)
switch (ch) {
case '1': /* One run */
- onerun = 1;
+ one_flag = true;
break;
case 'C': /* wiredtiger_open config */
g.config_open = __wt_optarg;
@@ -172,7 +170,7 @@ main(int argc, char *argv[])
g.logging = true;
break;
case 'q': /* Quiet */
- g.c_quiet = 1;
+ quiet_flag = true;
break;
case 'r': /* Replay a run */
g.replay = true;
@@ -229,10 +227,13 @@ main(int argc, char *argv[])
g.c_runs = 1;
/*
- * Let the command line -1 flag override runs configured from other sources.
+ * Let the command line -1 and -q flags override values configured from other sources.
+ * Regardless, don't go all verbose if we're not talking to a terminal.
*/
- if (onerun)
+ if (one_flag)
g.c_runs = 1;
+ if (quiet_flag || !isatty(1))
+ g.c_quiet = 1;
/*
* Initialize locks to single-thread named checkpoints and backups, last last-record updates,
diff --git a/src/third_party/wiredtiger/test/suite/test_compat01.py b/src/third_party/wiredtiger/test/suite/test_compat01.py
index 2d959dad23c..4600d8e66cb 100644
--- a/src/third_party/wiredtiger/test/suite/test_compat01.py
+++ b/src/third_party/wiredtiger/test/suite/test_compat01.py
@@ -45,16 +45,16 @@ class test_compat01(wttest.WiredTigerTestCase, suite_subprocess):
# Declare the log versions that do and do not have prevlsn.
# Log version 1 does not have the prevlsn record.
# Log version 2 introduced that record.
- # Log version 3 continues to have that record.
- # Log version 4 continues to have that record.
+ # Log versions 3 and higher continue to have that record.
min_logv = 2
- latest_logv = 4
+ latest_logv = 5
# The API uses only the major and minor numbers but accepts with
# and without the patch number. Test both.
start_compat = [
- ('def', dict(compat1='none', logv1=4)),
- ('100', dict(compat1='10.0', logv1=4)),
+ ('def', dict(compat1='none', logv1=5)),
+ ('100', dict(compat1='10.0', logv1=5)),
+ ('33', dict(compat1='3.3', logv1=4)),
('32', dict(compat1='3.2', logv1=3)),
('31', dict(compat1="3.1", logv1=3)),
('30', dict(compat1="3.0", logv1=2)),
@@ -64,8 +64,9 @@ class test_compat01(wttest.WiredTigerTestCase, suite_subprocess):
('old_patch', dict(compat1="1.8.1", logv1=1)),
]
restart_compat = [
- ('def2', dict(compat2='none', logv2=4)),
- ('100_2', dict(compat2='10.0', logv2=4)),
+ ('def2', dict(compat2='none', logv2=5)),
+ ('100_2', dict(compat2='10.0', logv2=5)),
+ ('33_2', dict(compat2='3.3', logv2=4)),
('32_2', dict(compat2='3.2', logv2=3)),
('31_2', dict(compat2="3.1", logv2=3)),
('30_2', dict(compat2="3.0", logv2=2)),
diff --git a/src/third_party/wiredtiger/test/suite/test_compat02.py b/src/third_party/wiredtiger/test/suite/test_compat02.py
index 1a39448ac65..e3397d0d7ea 100644
--- a/src/third_party/wiredtiger/test/suite/test_compat02.py
+++ b/src/third_party/wiredtiger/test/suite/test_compat02.py
@@ -45,8 +45,7 @@ class test_compat02(wttest.WiredTigerTestCase, suite_subprocess):
# Declare the log versions that do and do not have prevlsn.
# Log version 1 does not have the prevlsn record.
# Log version 2 introduced that record.
- # Log version 3 continues to have that record.
- # Log version 4 continues to have that record.
+ # Log versions 3 and higher continue to have that record.
min_logv = 2
# Test detecting a not-yet-existing log version. This should
@@ -59,8 +58,9 @@ class test_compat02(wttest.WiredTigerTestCase, suite_subprocess):
# required minimum just for testing of parsing.
compat_create = [
- ('def', dict(create_rel='none', log_create=4)),
- ('100', dict(create_rel="10.0", log_create=4)),
+ ('def', dict(create_rel='none', log_create=5)),
+ ('100', dict(create_rel="10.0", log_create=5)),
+ ('33', dict(create_rel="3.3", log_create=4)),
('32', dict(create_rel="3.2", log_create=3)),
('31', dict(create_rel="3.1", log_create=3)),
('30', dict(create_rel="3.0", log_create=2)),
@@ -68,8 +68,9 @@ class test_compat02(wttest.WiredTigerTestCase, suite_subprocess):
]
compat_release = [
- ('def_rel', dict(rel='none', log_rel=4)),
- ('100_rel', dict(rel="10.0", log_rel=4)),
+ ('def_rel', dict(rel='none', log_rel=5)),
+ ('100_rel', dict(rel="10.0", log_rel=5)),
+ ('33_rel', dict(rel="3.3", log_rel=4)),
('32_rel', dict(rel="3.2", log_rel=3)),
('31_rel', dict(rel="3.1", log_rel=3)),
('30_rel', dict(rel="3.0", log_rel=2)),
@@ -85,8 +86,9 @@ class test_compat02(wttest.WiredTigerTestCase, suite_subprocess):
# This rule exemption applies to the minimum verison check as well.
compat_max = [
('future_max', dict(max_req=future_rel, log_max=future_logv)),
- ('def_max', dict(max_req='none', log_max=4)),
- ('100_max', dict(max_req="10.0", log_max=4)),
+ ('def_max', dict(max_req='none', log_max=5)),
+ ('100_max', dict(max_req="10.0", log_max=5)),
+ ('33_max', dict(max_req="3.3", log_max=4)),
('32_max', dict(max_req="3.2", log_max=3)),
('30_max', dict(max_req="3.0", log_max=2)),
('26_max', dict(max_req="2.6", log_max=1)),
@@ -96,8 +98,9 @@ class test_compat02(wttest.WiredTigerTestCase, suite_subprocess):
# Only the minimum version should exist below for each log version.
compat_min = [
('future_min', dict(min_req=future_rel, log_min=future_logv)),
- ('def_min', dict(min_req='none', log_min=4)),
- ('100_min', dict(min_req="10.0", log_min=4)),
+ ('def_min', dict(min_req='none', log_min=5)),
+ ('100_min', dict(min_req="10.0", log_min=5)),
+ ('33_min', dict(min_req="3.3", log_min=4)),
('31_min', dict(min_req="3.1", log_min=3)),
('30_min', dict(min_req="3.0", log_min=2)),
('26_min', dict(min_req="2.6", log_min=1)),
diff --git a/src/third_party/wiredtiger/test/suite/test_compat03.py b/src/third_party/wiredtiger/test/suite/test_compat03.py
index 36d07a25a37..6e4fc4cb190 100644
--- a/src/third_party/wiredtiger/test/suite/test_compat03.py
+++ b/src/third_party/wiredtiger/test/suite/test_compat03.py
@@ -45,8 +45,7 @@ class test_compat03(wttest.WiredTigerTestCase, suite_subprocess):
# Declare the log versions that do and do not have prevlsn.
# Log version 1 does not have the prevlsn record.
# Log version 2 introduced that record.
- # Log version 3 continues to have that record.
- # Log version 4 continues to have that record.
+ # Log versions 3 and higher continue to have that record.
min_logv = 2
# Test detecting a not-yet-existing log version. This should
@@ -59,8 +58,9 @@ class test_compat03(wttest.WiredTigerTestCase, suite_subprocess):
# required minimum just for testing of parsing.
compat_release = [
('future_rel', dict(rel=future_rel, log_rel=future_logv)),
- ('def_rel', dict(rel='none', log_rel=4)),
- ('100_rel', dict(rel='10.0', log_rel=4)),
+ ('def_rel', dict(rel='none', log_rel=5)),
+ ('100_rel', dict(rel="10.0", log_rel=5)),
+ ('33_rel', dict(rel="3.3", log_rel=4)),
('32_rel', dict(rel="3.2", log_rel=3)),
('31_rel', dict(rel="3.1", log_rel=3)),
('30_rel', dict(rel="3.0", log_rel=2)),
@@ -76,8 +76,9 @@ class test_compat03(wttest.WiredTigerTestCase, suite_subprocess):
# This rule exemption applies to the minimum verison check as well.
compat_max = [
('future_max', dict(max_req=future_rel, log_max=future_logv)),
- ('def_max', dict(max_req='none', log_max=4)),
- ('100_max', dict(max_req='10.0', log_max=4)),
+ ('def_max', dict(max_req='none', log_max=5)),
+ ('100_max', dict(max_req="10.0", log_max=5)),
+ ('33_max', dict(max_req="3.3", log_max=4)),
('32_max', dict(max_req="3.2", log_max=3)),
('30_max', dict(max_req="3.0", log_max=2)),
('26_max', dict(max_req="2.6", log_max=1)),
@@ -87,8 +88,9 @@ class test_compat03(wttest.WiredTigerTestCase, suite_subprocess):
# Only the minimum version should exist below for each log version.
compat_min = [
('future_min', dict(min_req=future_rel, log_min=future_logv)),
- ('def_min', dict(min_req='none', log_min=4)),
- ('100_min', dict(min_req='10.0', log_min=4)),
+ ('def_min', dict(min_req='none', log_min=5)),
+ ('100_min', dict(min_req="10.0", log_min=5)),
+ ('33_min', dict(min_req="3.3", log_min=4)),
('31_min', dict(min_req="3.1", log_min=3)),
('30_min', dict(min_req="3.0", log_min=2)),
('26_min', dict(min_req="2.6", log_min=1)),
diff --git a/src/third_party/wiredtiger/test/suite/test_compat04.py b/src/third_party/wiredtiger/test/suite/test_compat04.py
index ecd54458850..18b0ea2797d 100644
--- a/src/third_party/wiredtiger/test/suite/test_compat04.py
+++ b/src/third_party/wiredtiger/test/suite/test_compat04.py
@@ -45,8 +45,7 @@ class test_compat04(wttest.WiredTigerTestCase, suite_subprocess):
# Declare the log versions that do and do not have prevlsn.
# Log version 1 does not have the prevlsn record.
# Log version 2 introduced that record.
- # Log version 3 continues to have that record.
- # Log version 4 continues to have that record.
+ # Log versions 3 and higher continue to have that record.
min_logv = 2
# The outline of this test is that we create the database at the
@@ -56,15 +55,17 @@ class test_compat04(wttest.WiredTigerTestCase, suite_subprocess):
# should be successful for all directions.
#
create_release = [
- ('def_rel', dict(create_rel='none', log_crrel=4)),
- ('100_rel', dict(create_rel='10.0', log_crrel=4)),
- ('32_rel', dict(create_rel='3.2', log_crrel=3)),
+ ('def_rel', dict(create_rel='none', log_crrel=5)),
+ ('100_rel', dict(create_rel="10.0", log_crrel=5)),
+ ('33_rel', dict(create_rel="3.3", log_crrel=4)),
+ ('32_rel', dict(create_rel="3.2", log_crrel=3)),
('31_rel', dict(create_rel="3.1", log_crrel=3)),
('30_rel', dict(create_rel="3.0", log_crrel=2)),
('26_rel', dict(create_rel="2.6", log_crrel=1)),
]
reconfig_release = [
- ('100_rel', dict(rel="10.0", log_rel=4)),
+ ('100_rel', dict(rel="10.0", log_rel=5)),
+ ('33_rel', dict(rel="3.3", log_rel=4)),
('32_rel', dict(rel="3.2", log_rel=3)),
('31_rel', dict(rel="3.1", log_rel=3)),
('30_rel', dict(rel="3.0", log_rel=2)),
diff --git a/src/third_party/wiredtiger/test/suite/test_prepare_hs03.py b/src/third_party/wiredtiger/test/suite/test_prepare_hs03.py
new file mode 100644
index 00000000000..847bc0977c8
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_prepare_hs03.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+from helper import copy_wiredtiger_home
+import unittest, wiredtiger, wttest
+from wtdataset import SimpleDataSet
+import os, shutil
+from wtscenario import make_scenarios
+from wiredtiger import stat
+
+def timestamp_str(t):
+    """Return the integer timestamp t as a lowercase hexadecimal string with no prefix."""
+    return '{0:x}'.format(t)
+
+# test_prepare_hs03.py
+# Test that salvage, verify and a simulated crash all work with prepared transactions.
+class test_prepare_hs03(wttest.WiredTigerTestCase):
+    """Run salvage, verify and a simulated crash around prepared transactions.
+
+    The same workload runs in two scenarios: one that overwrites part of the
+    table's file before every salvage (corrupt=True) and one that leaves the
+    file untouched (corrupt=False).
+    """
+    # Force a small cache.
+    conn_config = 'cache_size=50MB,statistics=(fast)'
+
+    # Create a small table.
+    uri = "table:test_prepare_hs03"
+
+    scenarios = make_scenarios([
+        ('corrupt_table', dict(corrupt=True)),
+        ('dont_corrupt_table', dict(corrupt=False))
+    ])
+
+    def corrupt_table(self):
+        """Overwrite 4KB of the table file, starting at byte offset 1024, with junk."""
+        tablename = "test_prepare_hs03.wt"
+        self.assertTrue(os.path.exists(tablename))
+
+        # The table file is binary data, so open it in binary mode and write
+        # bytes; text mode would route the junk through a text encoder.
+        with open(tablename, 'rb+') as tablepointer:
+            tablepointer.seek(1024)
+            tablepointer.write(b'Bad!' * 1024)
+
+    def corrupt_salvage_verify(self):
+        """Optionally corrupt the table, then salvage and verify it.
+
+        The table file is only overwritten in the corrupt scenario; salvage
+        (with "force") and verify are called in both scenarios.
+        """
+        if self.corrupt:
+            self.corrupt_table()
+        self.session.salvage(self.uri, "force")
+        self.session.verify(self.uri, None)
+
+    def get_stat(self, stat):
+        """Return element [2] of the 'statistics:' cursor entry for the given key.
+
+        Note: inside this method the parameter name shadows the module-level
+        `stat` import; it is kept because it is part of the method signature.
+        """
+        stat_cursor = self.session.open_cursor('statistics:')
+        try:
+            return stat_cursor[stat][2]
+        finally:
+            # Close the cursor even if the lookup raises.
+            stat_cursor.close()
+
+    def prepare_updates(self, ds, nrows, nsessions, nkeys):
+        """Commit updates, layer prepared updates on top, and salvage/verify throughout.
+
+        ds        -- data set object used to generate keys via ds.key().
+        nrows     -- offset added to every key index used by this method.
+        nsessions -- number of extra sessions that each hold a prepared transaction.
+        nkeys     -- number of consecutive keys updated by each of those sessions.
+        """
+        # Insert some records with commit timestamp, corrupt file and call salvage, verify before checkpoint.
+
+        # Commit some updates to get eviction and history store fired up
+        commit_value = b"bbbbb" * 100
+        cursor = self.session.open_cursor(self.uri)
+        for i in range(1, nsessions * nkeys):
+            self.session.begin_transaction('isolation=snapshot')
+            cursor.set_key(ds.key(nrows + i))
+            cursor.set_value(commit_value)
+            self.assertEqual(cursor.insert(), 0)
+            self.session.commit_transaction('commit_timestamp=' + timestamp_str(1))
+        cursor.close()
+
+        # Corrupt the table, call salvage to recover data from the corrupted table and call verify
+        self.corrupt_salvage_verify()
+
+        # Call checkpoint
+        self.session.checkpoint()
+
+        hs_writes_start = self.get_stat(stat.conn.cache_write_hs)
+
+        # Have prepared updates in multiple sessions. This should ensure writing
+        # prepared updates to the history store
+        sessions = [0] * nsessions
+        cursors = [0] * nsessions
+        prepare_value = b"ccccc" * 100
+        for j in range(0, nsessions):
+            sessions[j] = self.conn.open_session()
+            sessions[j].begin_transaction('isolation=snapshot')
+            cursors[j] = sessions[j].open_cursor(self.uri)
+            # Each session will update many consecutive keys.
+            start = (j * nkeys)
+            end = start + nkeys
+            for i in range(start, end):
+                cursors[j].set_key(ds.key(nrows + i))
+                cursors[j].set_value(prepare_value)
+                self.assertEqual(cursors[j].insert(), 0)
+            sessions[j].prepare_transaction('prepare_timestamp=' + timestamp_str(4))
+
+        hs_writes = self.get_stat(stat.conn.cache_write_hs) - hs_writes_start
+
+        # The history store write counter must not have gone backwards.
+        # NOTE(review): ">= 0" also passes when nothing at all was written; if
+        # the intent is to require history store writes this needs to be a
+        # strict comparison -- confirm before tightening.
+        self.assertGreaterEqual(hs_writes, 0)
+
+        # Test if we can read prepared updates from the history store.
+        cursor = self.session.open_cursor(self.uri)
+        self.session.begin_transaction('read_timestamp=' + timestamp_str(3))
+        for i in range(1, nsessions * nkeys):
+            cursor.set_key(ds.key(nrows + i))
+            # The search should pass.
+            self.assertEqual(cursor.search(), 0)
+            # Correctness Test - commit_value should be visible
+            self.assertEqual(cursor.get_value(), commit_value)
+            # Correctness Test - prepare_value should NOT be visible
+            self.assertNotEqual(cursor.get_value(), prepare_value)
+        cursor.close()
+
+        # Close all cursors and sessions, this will cause prepared updates to be
+        # rolled back
+        for j in range(0, nsessions):
+            cursors[j].close()
+            sessions[j].close()
+
+        # Corrupt the table, call salvage to recover data from the corrupted table and call verify
+        self.corrupt_salvage_verify()
+
+        # End the read transaction started above before checkpointing.
+        self.session.commit_transaction()
+        self.session.checkpoint()
+
+        # Corrupt the table, call salvage to recover data from the corrupted table and call verify
+        self.corrupt_salvage_verify()
+
+        # Finally, search for the keys inserted with commit timestamp
+        cursor = self.session.open_cursor(self.uri)
+        self.pr('Read Keys')
+        self.session.begin_transaction('read_timestamp=' + timestamp_str(4))
+        for i in range(1, nkeys):
+            cursor.set_key(ds.key(nrows + i))
+            # The search should pass
+            self.assertEqual(cursor.search(), 0)
+            # Correctness Test - commit_value should be visible
+            self.assertEqual(cursor.get_value(), commit_value)
+        cursor.close()
+
+        self.session.commit_transaction()
+        self.session.checkpoint()
+
+        # Simulate a crash by copying to a new directory(RESTART).
+        copy_wiredtiger_home(".", "RESTART")
+
+        # Open the new directory.
+        self.conn = self.setUpConnectionOpen("RESTART")
+        self.session = self.setUpSessionOpen(self.conn)
+        cursor = self.session.open_cursor(self.uri)
+
+        # Search the keys inserted with commit timestamp after crash
+        self.session.begin_transaction('read_timestamp=' + timestamp_str(4))
+        for i in range(1, nkeys):
+            cursor.set_key(ds.key(nrows + i))
+            # The search should pass
+            self.assertEqual(cursor.search(), 0)
+            # Correctness Test - commit_value should be visible
+            self.assertEqual(cursor.get_value(), commit_value)
+            # Correctness Test - prepare_value should NOT be visible
+            self.assertNotEqual(cursor.get_value(), prepare_value)
+        cursor.close()
+        self.session.commit_transaction()
+
+        # After simulating a crash, corrupt the table, call salvage to recover data from the corrupted table
+        # and call verify
+        self.corrupt_salvage_verify()
+
+    @unittest.skip("Temporarily disabled")
+    def test_prepare_hs(self):
+        """Populate the table, bulk-load large values, then run prepare_updates (currently skipped)."""
+        nrows = 100
+        ds = SimpleDataSet(self, self.uri, nrows, key_format="S", value_format='u')
+        ds.populate()
+        bigvalue = b"aaaaa" * 100
+
+        # Initially load huge data
+        cursor = self.session.open_cursor(self.uri)
+        for i in range(1, 10000):
+            cursor.set_key(ds.key(nrows + i))
+            cursor.set_value(bigvalue)
+            self.assertEqual(cursor.insert(), 0)
+        cursor.close()
+        self.session.checkpoint()
+
+        # We put prepared updates in multiple sessions so that we do not hang
+        # because of cache being full with uncommitted updates.
+        nsessions = 3
+        nkeys = 4000
+        self.prepare_updates(ds, nrows, nsessions, nkeys)
+
+# When this file is executed directly (not imported), call wttest.run().
+if __name__ == '__main__':
+ wttest.run()