summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2021-02-03 13:58:53 +1100
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-02-03 03:20:38 +0000
commit7aa1b65641938719accd595bda3e45e97dc5f475 (patch)
tree06442d7ad52cf475dca61aa2ca96ed908f450388
parentea687c2a4bcd9937d02658043b5a2529943ef950 (diff)
downloadmongo-r4.4.4-rc0.tar.gz
Import wiredtiger: 332dddfe0e48eb1c263455d3db9219ec5f7cdc30 from branch mongodb-4.4r4.4.4-rc0
ref: a52cd5a47a..332dddfe0e for: 4.4.4 WT-6430 Move WT_CONN_SERVER flags into their own field WT-6504 Don't fallback to onpage value as base value if we see the onpage value in the history store WT-6567 Write "rollback to stable" subpage for Architecture Guide WT-6772 Add support for prepared updates in datastore for test_hs09 WT-6901 Write "cursor" subpage for Architecture Guide WT-7069 Enable column store configuration to history store WT-7089 Don't skip checkpointing objects that have obsolete pages WT-7091 Restrict usage of LSM to only operate in conjunction with compatible incremental backup mechanism WT-7117 RTS to skip modifies that are more than on-disk base update while restoring an update WT-7121 Include log-structured allocation python tests in WT WT-7126 Coverity analysis defect 116991: Explicit null dereferenced WT-7127 Coverity analysis defect 116992: Unchecked return value WT-7128 Coverity analysis defect 116993: Resource leak WT-7131 Tiered cursors should return error if configured with zero tiers
-rw-r--r--src/third_party/wiredtiger/dist/docs_data.py3
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_void1
-rw-r--r--src/third_party/wiredtiger/dist/stat_data.py2
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curnext.c8
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curprev.c8
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c2
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_capacity.c6
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_ckpt.c6
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_log.c12
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_open.c2
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_stat.c8
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_sweep.c6
-rw-r--r--src/third_party/wiredtiger/src/docs/arch-cursor.dox390
-rw-r--r--src/third_party/wiredtiger/src/docs/arch-index.dox4
-rw-r--r--src/third_party/wiredtiger/src/docs/arch-rts.dox113
-rw-r--r--src/third_party/wiredtiger/src/docs/spell.ok11
-rw-r--r--src/third_party/wiredtiger/src/history/hs_cursor.c34
-rw-r--r--src/third_party/wiredtiger/src/include/btree.h25
-rw-r--r--src/third_party/wiredtiger/src/include/connection.h63
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h9
-rw-r--r--src/third_party/wiredtiger/src/include/stat.h4
-rw-r--r--src/third_party/wiredtiger/src/include/txn_inline.h144
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in20
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_manager.c6
-rw-r--r--src/third_party/wiredtiger/src/session/session_api.c2
-rw-r--r--src/third_party/wiredtiger/src/support/stat.c14
-rw-r--r--src/third_party/wiredtiger/src/tiered/tiered_cursor.c18
-rw-r--r--src/third_party/wiredtiger/src/tiered/tiered_schema.c26
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ckpt.c46
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c40
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_list.c6
-rw-r--r--src/third_party/wiredtiger/test/format/config.c27
-rwxr-xr-xsrc/third_party/wiredtiger/test/suite/test_checkpoint08.py125
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs05.py9
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs06.py8
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs07.py11
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs08.py7
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs09.py39
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs10.py7
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs12.py10
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs13.py10
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs14.py29
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs20.py91
-rwxr-xr-xsrc/third_party/wiredtiger/test/suite/test_rollback_to_stable14.py304
-rw-r--r--src/third_party/wiredtiger/test/suite/test_tiered01.py7
-rw-r--r--src/third_party/wiredtiger/test/suite/test_tiered02.py63
-rw-r--r--src/third_party/wiredtiger/test/suite/test_tiered03.py97
48 files changed, 1655 insertions, 230 deletions
diff --git a/src/third_party/wiredtiger/dist/docs_data.py b/src/third_party/wiredtiger/dist/docs_data.py
index a1301c87057..9f2bb32486c 100644
--- a/src/third_party/wiredtiger/dist/docs_data.py
+++ b/src/third_party/wiredtiger/dist/docs_data.py
@@ -65,6 +65,9 @@ arch_doc_pages = [
ArchDocPage('arch-row',
['WT_BTREE'],
['src/include/btree.h']),
+ ArchDocPage('arch-rts',
+ [''],
+ ['src/txn/']),
ArchDocPage('arch-schema',
['WT_COLGROUP', 'WT_INDEX', 'WT_LSM_TREE', 'WT_TABLE'],
['src/include/intpack_inline.h', 'src/include/packing_inline.h',
diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void
index 5beea1cddc1..fca1ccc9810 100755
--- a/src/third_party/wiredtiger/dist/s_void
+++ b/src/third_party/wiredtiger/dist/s_void
@@ -78,6 +78,7 @@ func_ok()
-e '/int __wt_stat_dsrc_desc$/d' \
-e '/int __wt_stat_join_desc$/d' \
-e '/int __wt_stat_session_desc/d' \
+ -e '/int __wt_txn_read_upd_list$/d' \
-e '/int __wt_txn_rollback_required$/d' \
-e '/int __wt_win_directory_list_free$/d' \
-e '/int bdb_compare_reverse$/d' \
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 997337c7896..3ccc3e0b57e 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -856,8 +856,10 @@ conn_dsrc_stats = [
##########################################
# Transaction statistics
##########################################
+ TxnStat('txn_checkpoint_obsolete_applied', 'transaction checkpoints due to obsolete pages'),
TxnStat('txn_read_race_prepare_update', 'race to read prepared update retry'),
TxnStat('txn_rts_hs_removed', 'rollback to stable updates removed from history store'),
+ TxnStat('txn_rts_hs_restore_updates', 'rollback to stable restored updates from history store'),
TxnStat('txn_rts_hs_restore_tombstones', 'rollback to stable restored tombstones from history store'),
TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable hs records with stop timestamps older than newer records'),
TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'),
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index eb0dde936e8..3bbabe4e470 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "a52cd5a47a7e9af9e2c341e66f0ffdd9bc977930"
+ "commit": "332dddfe0e48eb1c263455d3db9219ec5f7cdc30"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index 05394d0ae98..63d187b7442 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -57,7 +57,7 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
cbt->iface.value.data = &cbt->v;
} else {
restart_read:
- WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+ WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
if (cbt->upd_value->type == WT_UPDATE_INVALID) {
cbt->v = 0;
cbt->iface.value.data = &cbt->v;
@@ -157,7 +157,7 @@ new_page:
__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
restart_read:
- WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+ WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
if (cbt->upd_value->type == WT_UPDATE_INVALID) {
++*skippedp;
@@ -232,7 +232,7 @@ restart_read:
cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
__wt_upd_value_clear(cbt->upd_value);
if (cbt->ins != NULL)
- WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+ WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
if (cbt->upd_value->type != WT_UPDATE_INVALID) {
if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) {
if (cbt->upd_value->tw.stop_txn != WT_TXN_NONE &&
@@ -365,7 +365,7 @@ restart_read_insert:
if ((ins = cbt->ins) != NULL) {
key->data = WT_INSERT_KEY(ins);
key->size = WT_INSERT_KEY_SIZE(ins);
- WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd, NULL));
+ WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd));
if (cbt->upd_value->type == WT_UPDATE_INVALID) {
++*skippedp;
continue;
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index bb2c3a9e05c..7fe24232317 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -197,7 +197,7 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
cbt->iface.value.data = &cbt->v;
} else {
restart_read:
- WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+ WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
if (cbt->upd_value->type == WT_UPDATE_INVALID) {
cbt->v = 0;
cbt->iface.value.data = &cbt->v;
@@ -297,7 +297,7 @@ new_page:
__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
restart_read:
- WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+ WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
if (cbt->upd_value->type == WT_UPDATE_INVALID) {
++*skippedp;
continue;
@@ -372,7 +372,7 @@ restart_read:
cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
__wt_upd_value_clear(cbt->upd_value);
if (cbt->ins != NULL)
- WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+ WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
if (cbt->upd_value->type != WT_UPDATE_INVALID) {
if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) {
if (cbt->upd_value->tw.stop_txn != WT_TXN_NONE &&
@@ -514,7 +514,7 @@ restart_read_insert:
if ((ins = cbt->ins) != NULL) {
key->data = WT_INSERT_KEY(ins);
key->size = WT_INSERT_KEY_SIZE(ins);
- WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd, NULL));
+ WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd));
if (cbt->upd_value->type == WT_UPDATE_INVALID) {
++*skippedp;
continue;
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index fc7e542a12e..e4f538d80df 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -230,7 +230,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, bool *vali
* update that's been deleted is not a valid key/value pair).
*/
if (cbt->ins != NULL) {
- WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL));
+ WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
if (cbt->upd_value->type != WT_UPDATE_INVALID) {
if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE)
return (0);
diff --git a/src/third_party/wiredtiger/src/conn/conn_capacity.c b/src/third_party/wiredtiger/src/conn/conn_capacity.c
index 2213f696e4f..2d0085dfb7b 100644
--- a/src/third_party/wiredtiger/src/conn/conn_capacity.c
+++ b/src/third_party/wiredtiger/src/conn/conn_capacity.c
@@ -72,7 +72,7 @@ __capacity_config(WT_SESSION_IMPL *session, const char *cfg[])
static bool
__capacity_server_run_chk(WT_SESSION_IMPL *session)
{
- return (F_ISSET(S2C(session), WT_CONN_SERVER_CAPACITY));
+ return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_CAPACITY));
}
/*
@@ -129,7 +129,7 @@ __capacity_server_start(WT_CONNECTION_IMPL *conn)
{
WT_SESSION_IMPL *session;
- F_SET(conn, WT_CONN_SERVER_CAPACITY);
+ FLD_SET(conn->server_flags, WT_CONN_SERVER_CAPACITY);
/*
* The capacity server gets its own session.
@@ -196,7 +196,7 @@ __wt_capacity_server_destroy(WT_SESSION_IMPL *session)
conn = S2C(session);
- F_CLR(conn, WT_CONN_SERVER_CAPACITY);
+ FLD_CLR(conn->server_flags, WT_CONN_SERVER_CAPACITY);
if (conn->capacity_tid_set) {
__wt_cond_signal(session, conn->capacity_cond);
WT_TRET(__wt_thread_join(session, &conn->capacity_tid));
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
index b71ac6c28f4..27a5121ab2e 100644
--- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -63,7 +63,7 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp)
static bool
__ckpt_server_run_chk(WT_SESSION_IMPL *session)
{
- return (F_ISSET(S2C(session), WT_CONN_SERVER_CHECKPOINT));
+ return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_CHECKPOINT));
}
/*
@@ -134,7 +134,7 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn)
if (conn->ckpt_session != NULL)
return (0);
- F_SET(conn, WT_CONN_SERVER_CHECKPOINT);
+ FLD_SET(conn->server_flags, WT_CONN_SERVER_CHECKPOINT);
/*
* The checkpoint server gets its own session.
@@ -201,7 +201,7 @@ __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
conn = S2C(session);
- F_CLR(conn, WT_CONN_SERVER_CHECKPOINT);
+ FLD_CLR(conn->server_flags, WT_CONN_SERVER_CHECKPOINT);
if (conn->ckpt_tid_set) {
__wt_cond_signal(session, conn->ckpt_cond);
WT_TRET(__wt_thread_join(session, &conn->ckpt_tid));
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index acba9ebb12c..0cb718d92f0 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -522,7 +522,7 @@ __wt_log_truncate_files(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool force)
conn = S2C(session);
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
return (0);
- if (!force && F_ISSET(conn, WT_CONN_SERVER_LOG) &&
+ if (!force && FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG) &&
FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
WT_RET_MSG(session, EINVAL, "Attempt to archive manually while a server is running");
@@ -566,7 +566,7 @@ __log_file_server(void *arg)
log = conn->log;
locked = false;
yield_count = 0;
- while (F_ISSET(conn, WT_CONN_SERVER_LOG)) {
+ while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) {
/*
* If there is a log file to close, make sure any outstanding write operations have
* completed, then fsync and close it.
@@ -838,7 +838,7 @@ __log_wrlsn_server(void *arg)
log = conn->log;
yield = 0;
WT_INIT_LSN(&prev);
- while (F_ISSET(conn, WT_CONN_SERVER_LOG)) {
+ while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) {
/*
* Write out any log record buffers if anything was done since last time. Only call the
* function to walk the slots if the system is not idle. On an idle system the alloc_lsn
@@ -908,7 +908,7 @@ __log_server(void *arg)
* records sitting in the buffer over the time it takes to sync out an earlier file.
*/
did_work = true;
- while (F_ISSET(conn, WT_CONN_SERVER_LOG)) {
+ while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) {
/*
* Slots depend on future activity. Force out buffered writes in case we are idle. This
* cannot be part of the wrlsn thread because of interaction advancing the write_lsn and a
@@ -1036,7 +1036,7 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
return (0);
- F_SET(conn, WT_CONN_SERVER_LOG);
+ FLD_SET(conn->server_flags, WT_CONN_SERVER_LOG);
/*
* Start the log close thread. It is not configurable. If logging is enabled, this thread runs.
@@ -1104,7 +1104,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn = S2C(session);
- F_CLR(conn, WT_CONN_SERVER_LOG);
+ FLD_CLR(conn->server_flags, WT_CONN_SERVER_LOG);
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
/*
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index 8d45b5eb678..0c93433a59e 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -77,7 +77,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* The LSM services are not shut down in this path (which is called when wiredtiger_open hits an
* error (as well as during normal shutdown). Assert they're not running.
*/
- WT_ASSERT(session, !F_ISSET(conn, WT_CONN_SERVER_LSM));
+ WT_ASSERT(session, !FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LSM));
/* Shut down the subsystems, ensuring workers see the state change. */
F_SET(conn, WT_CONN_CLOSING);
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
index 69989939c43..06ae5b14350 100644
--- a/src/third_party/wiredtiger/src/conn/conn_stat.c
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -528,7 +528,7 @@ __statlog_on_close(WT_SESSION_IMPL *session)
if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE))
return (0);
- if (F_ISSET(conn, WT_CONN_SERVER_STATISTICS))
+ if (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_STATISTICS))
WT_RET_MSG(session, EINVAL, "Attempt to log statistics while a server is running");
WT_RET(__wt_scr_alloc(session, strlen(conn->stat_path) + 128, &tmp));
@@ -547,7 +547,7 @@ err:
static bool
__statlog_server_run_chk(WT_SESSION_IMPL *session)
{
- return (F_ISSET(S2C(session), WT_CONN_SERVER_STATISTICS));
+ return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_STATISTICS));
}
/*
@@ -614,7 +614,7 @@ __statlog_start(WT_CONNECTION_IMPL *conn)
if (conn->stat_session != NULL)
return (0);
- F_SET(conn, WT_CONN_SERVER_STATISTICS);
+ FLD_SET(conn->server_flags, WT_CONN_SERVER_STATISTICS);
/* The statistics log server gets its own session. */
WT_RET(__wt_open_internal_session(conn, "statlog-server", true, 0, &conn->stat_session));
@@ -685,7 +685,7 @@ __wt_statlog_destroy(WT_SESSION_IMPL *session, bool is_close)
conn = S2C(session);
/* Stop the server thread. */
- F_CLR(conn, WT_CONN_SERVER_STATISTICS);
+ FLD_CLR(conn->server_flags, WT_CONN_SERVER_STATISTICS);
if (conn->stat_tid_set) {
__wt_cond_signal(session, conn->stat_cond);
WT_TRET(__wt_thread_join(session, &conn->stat_tid));
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index 0ab2b97e21e..7931f4e8a79 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -249,7 +249,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session)
static bool
__sweep_server_run_chk(WT_SESSION_IMPL *session)
{
- return (F_ISSET(S2C(session), WT_CONN_SERVER_SWEEP));
+ return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_SWEEP));
}
/*
@@ -375,7 +375,7 @@ __wt_sweep_create(WT_SESSION_IMPL *session)
conn = S2C(session);
/* Set first, the thread might run before we finish up. */
- F_SET(conn, WT_CONN_SERVER_SWEEP);
+ FLD_SET(conn->server_flags, WT_CONN_SERVER_SWEEP);
/*
* Handle sweep does enough I/O it may be called upon to perform slow operations for the block
@@ -406,7 +406,7 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session)
conn = S2C(session);
- F_CLR(conn, WT_CONN_SERVER_SWEEP);
+ FLD_CLR(conn->server_flags, WT_CONN_SERVER_SWEEP);
if (conn->sweep_tid_set) {
__wt_cond_signal(session, conn->sweep_cond);
WT_TRET(__wt_thread_join(session, &conn->sweep_tid));
diff --git a/src/third_party/wiredtiger/src/docs/arch-cursor.dox b/src/third_party/wiredtiger/src/docs/arch-cursor.dox
index 60e47c5a8ad..7eee860af86 100644
--- a/src/third_party/wiredtiger/src/docs/arch-cursor.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-cursor.dox
@@ -3,13 +3,393 @@
Cursors are used in WiredTiger to get and modify data.
A caller of WiredTiger uses WT_SESSION::open_cursor to create
a WT_CURSOR. Methods on the WT_CURSOR can then be used to
-position, iterate, get, and set data.
+position, iterate, get, and set data. In the typical case, a cursor
+will be used to access keys and values in a Btree. However, cursors
+can also be used to access indexed data, WiredTiger statistics, log files,
+and metadata. Additionally, cursors are used for managing backups.
-Depending on the <code>uri</code> used when creating a cursor, the cursor will
-be internally implemented as one of the many cursor structures that include
-WT_CURSOR_BTREE, WT_CURSOR_BACKUP, WT_CURSOR_INDEX, WT_CURSOR_LOG,
-WT_CURSOR_METADATA, WT_CURSOR_STAT. Each of these structures starts
+The various kinds of cursors are created by the WT_SESSION::open_cursor call.
+Depending on the <code>uri</code> used when opening a cursor, the cursor will
+be implemented internally as one of the many cursor structures that include
+\c WT_CURSOR_BTREE, \c WT_CURSOR_BACKUP, \c WT_CURSOR_INDEX, \c WT_CURSOR_LOG,
+\c WT_CURSOR_METADATA, \c WT_CURSOR_STAT, \c WT_CURSOR_TABLE. Each of these structures starts
with the common \c %WT_CURSOR structure, which contain all
of the data and method pointers that make up the public part of the API.
+Thus, any one of these "extended cursor structs" can be allocated and
+returned as a WT_CURSOR pointer. Since the method pointers are filled with
+specific implementations, for example \c __curtable_reset, a call to
+WT_CURSOR::reset will call the specific implementation function, and the first
+argument can be cast to be a pointer to the cursor struct used by the implementation.
+Thus, in our C code, we have something similar to a class hierarchy, having
+an abstract base class (WT_CURSOR) with virtual methods, and a number of
+implementation classes.
+
+The code for each cursor type's methods is generally organized in a single file.
+For example, a backup cursor is implemented in \c src/cursor/cur_backup.c .
+Similarly, shared or utility cursor methods are defined in \c src/cursor/cur_std.c .
+
+Several cursor types have the concept of subordinate cursors, or child
+cursors. For example, a table cursor is composed of subordinate file cursors,
+each representing a column group. A metadata cursor has a subordinate
+cursor that is a file cursor on the metadata file.
+
+Every open cursor appears on a list in \c WT_SESSION->cursors. This list is ordered
+in a way that cursors are closed when the session is closed. Thus, if a subordinate
+cursor needs to be closed before its parent, it must be listed before the parent.
+
+@section arch_cursor_raw Data translation
+
+Cursors that expose Btree data, like file, table and index cursors, return a set of keys and values,
+translating encoded data to types that match the schema. For example, if the \c value_format specified for
+a table is \c "iSi" (an integer, a null-terminated string, an integer), then data `{0, "abc", 5}` might
+be stored in the Btree in a packed format as:
+```
+0x80 0x61 0x62 0x63 0x00 0x85
+```
+When retrieved, the values are decoded and stored into typed variables whose addresses are passed to WT_CURSOR::get_value.
+
+As an aside, `0x80` represents `0` using a variable length encoding. Using 0x80 as zero allows negative integers
+to be stored (\c -1 is \c 0x7f) in a way that they will sort before zero and positive integers. Small integer values
+can be stored in a single byte. See comments in \c src/include/intpack_inline.h for more information.
+
+When creating a cursor via WT_SESSION::open_cursor, the *raw* flag can be used. This has the effect
+of disabling the translation provided by the schema, transferring a single block of unencoded data
+for the key or value.
+
+@section arch_cursor_file File cursors
+
+File cursors (also known as Btree cursors) are one of the fundamental kinds of cursors, allowing
+direct access to WiredTiger Btrees. The implementation structure for a file
+cursor is \c WT_CURSOR_BTREE. The file cursor methods are generally small wrappers around calls
+into the Btree layer, where the \c WT_CURSOR_BTREE structure is used. The Btree layer handles
+all aspects of cursor positioning, and transfers of raw key and value data.
+
+@section arch_cursor_table Table and index cursors
+
+A table cursor is a higher level concept built on file cursors. A WiredTiger table allows data
+to be physically split into separate Btrees; this is done via the concept of column groups. Column
+groups may be defined that contain a set of named columns (`columns` are synonymous with `fields`).
+Each column group's columns are then stored in a single Btree and may be looked up by the table's key.
+See @ref schema_column_groups for API details.
+
+Tables may also have indices. These are implemented as Btrees mapping the index key(s) to the
+main key(s) of the table. Indices may be added after the data is populated in the main table.
+This requires the index to be filled at the time it is added. Index cursors are implemented using
+\c WT_CURSOR_INDEX. Methods on this cursor that set and get values know to use the value that appears
+in the index as a key into the main table.
+See @ref schema_indices for API details.
+
+Internally, we define the concept of a "simple" table as one that has no named columns. *Columns*,
+that is the set of keys and values, may be optionally named when creating a table.
+With no named columns, column groups are not possible (as they must reference names),
+and all of a table's keys and values reside in a single file.
+Also without named columns, there is no possibility that a cursor can use projections.
+A "simple" table, by its nature, can be implemented by a single Btree file and needs no special translation.
+Thus, if a cursor is opened on a "simple" table, we can return a file cursor on the single file used
+to store its data, instead of a table cursor. This optimization means that every cursor method
+goes directly to the file cursor implementation, saving CPU time throughout the lifetime of the cursor.
+
+@section arch_cursor_projections Projections and plans
+
+Projections are an indication, when a cursor is opened, that the indicated values, and possibly some keys,
+should be returned by WT_CURSOR::get_value calls. This is only available for table cursors.
+If the table was configured with column groups, a projection has a bearing on which column group
+files must be opened in a cursor. When a subset of values is returned, it's possible that some
+column groups will not be needed. To implement projections and column groups, cursors use a *plan*.
+
+A plan is a string that indicates a series of actions that must be taken to retrieve the needed
+values from the subordinate cursors in the table cursor. Remember that each column group gets
+its own cursor. When a table is created, a default plan is created that asks to copy all
+columns from each column group cursor in order.
+When a projection is used for a cursor, a more complex plan may be created.
+A plan contains numbers and action letters. The numbers are arguments, the
+action letters are commands. The actions 'k' and 'v' indicate that the subordinate
+cursor indicated by the numeric argument will be used to get keys or values that follow.
+The 'n' action gets one or more next key or value columns from the cursor. The 's' skips
+one or more columns. Switching to another cursor resets the cursor to point at the first
+column in the indicated cursor. This allows a way to find keys and values in arbitrary
+order.
+
+As a contrived example, suppose we have a table with two keys
+\c "k1,k2" and four values \c "x1,x2,x3,x4". The column groups are created as follows:
+```
+ session->create(session, "colgroup:main:c1", "columns=(x1,x2)");
+ session->create(session, "colgroup:main:c2", "columns=(x3,x4)");
+```
+The column group \c "main:c1" has keys "k1,k2" and values "x1,x2";
+the column group \c "main:c2" has keys "k1,k2" and values "x3,x4". A table cursor
+on the main table will have two sub-cursors, one for each column group.
+Now consider the ill-considered projection that is opened as:
+```
+ session->open_cursor(session, "table:main(x1,x4,x3,k2,k1,x2,x4)", NULL, NULL, &cursor);
+
+```
+In this case, the plan is this string (with spacing added):
+```
+0v n 1v s n 1v n 0k s n 0k n 0v s n 1v s n
+```
+To break it down,
+- \c "0v" means use sub-cursor 0 and prepare to read values.
+- \c "n" means get the next value, that is \c "x1".
+- \c "1v" means use sub-cursor 1 and prepare to read values.
+- \c "s" means skip a value, that is \c "x3".
+- \c "n" means get the next value, that is \c "x4".
+- \c ...
+
+Notice that this plan requires many switches of cursors and several \c "s" (skip) operations.
+Each skip involves enough decoding of the data item to determine its length so its data can
+be skipped over.
+
+With the default (complete) projection, getting values is fast. Using the default plan, the needed
+columns are pulled out of each subordinate cursor one by one, and get copied to the caller's arguments.
+With projections, the simple algorithm following the plan works well if the columns in the projection
+are grouped by column group and requested in order. Without that discipline, as in this example,
+the performance will not be optimal.
+
+The implementation of plan creation and execution resides in the @ref arch-schema.
+
+@section arch_cursor_dump Dump cursors
+
+Dump cursors are used in two rather different ways. Regular dump cursors retrieve the raw keys and values
+and translate bytes either as raw characters or as hex values. This flavor of dump cursor is used
+by the `wt dump` or `wt dump -x` utility. JSON dump cursors do more sophisticated translation, returning a string that is a JSON
+formatted record, with the name for each key and its data and the name for each value and its corresponding data.
+Data is translated to either integral, floating point, or string depending on the format of the column in the schema.
+This flavor of dump cursor is used by the `wt dump -j` command as part of creating a JSON dump of a WiredTiger table or file.
+
+There is a single \c WT_CURSOR_DUMP struct that is used to implement both flavors.
+The dump code checks for the \c WT_CURSTD_DUMP_JSON flag and as needed, calls into functions
+in `src/cursor/cur_json.c` . The code in that file also implements several external functions that are used
+by the `wt` utility when loading JSON-dumped files. In particular, the \c __wt_json_token function
+returns individual JSON tokens from an input string.
+
+The JSON code used by the dump cursor uses some storage that hangs off of `cursor->json_private` which
+is typed as \c WT_CURSOR_JSON. When a JSON flavored cursor is created, the list of key column names
+and value column names is populated in \c WT_CURSOR_JSON. These names, obtained from the
+configuration string that created the table or file, are useful to have in advance,
+as they are used once per row to help fill out the JSON output. The functions that get
+rows iterate these names and unpack the corresponding column data,
+converting them into the appropriate JSON format for the data.
+
+@section arch_cursor_backup Backup cursors
+
+A backup cursor is used to manage backups. It is implemented using a \c WT_CURSOR_BACKUP structure.
+A backup cursor can be configured to do a full backup or an incremental backup.
+First, we'll look at full backups.
+
+A backup cursor for a full backup returns the set of files that need to be copied to achieve the
+backup. The backup cursor, when opened, ensures that it is the only backup cursor running
+in the system and returns an error if not. This is managed using the
+\c WT_CONNECTION_IMPL->hot_backup_start variable, which can only be accessed when the
+connection schema lock is held. A non-zero value means a hot backup is in progress.
+Closing the backup cursor sets it back to zero.
+
+Having an open backup influences actions
+elsewhere in WiredTiger, since part of the backup protocol involves the application copying
+whole data files. Thus, having an open backup may cause the block manager and log file server
+thread to avoid truncating data and log files. A truncation of a file being copied by the
+application would be unexpected. Also, open checkpoints are not deleted during the
+course of a backup.
+
+When the backup cursor is initialized, the complete set of files needed to back up is generated
+and stored in the cursor. This makes the backup's \c next function easy as it just returns the next
+file in the list.
+
+Incremental backups work much the same way, except that the file list is reduced to files that
+have changed since a previous backup referenced in the configuration when the cursor is opened.
+The other twist is that for each file returned, the caller does a duplicate operation on the backup
+cursor, and the duplicate code actually returns a specialized incremental backup cursor. This kind
+of cursor has its own \c next method that causes it to return information about individual pieces
+of this file that need to be copied. The code to implement incremental cursors
+is in \c src/cursor/cur_backup_incr.c .
+
+@section arch_cursor_join Join cursors
+
+Join cursors implement a join mechanism for WiredTiger. The idea is that joins can be configured
+by opening a special join cursor on a table, and attaching the table's index cursors to it, to
+return rows that match a filter, like:
+```
+(row.price > 100 and row.price <= 200) and row.in_stock < 10
+```
+Building up the conditions essentially creates a tree that is used for evaluation. This is stored
+in \c WT_CURSOR_JOIN, and is returned as the cursor object. Entries in the tree representing
+an index's participation in one clause are stored in \c WT_CURSOR_JOIN_ENTRY objects. \c WT_CURSOR_JOIN and
+\c WT_CURSOR_JOIN_ENTRY objects can be composed in a hierarchical manner, representing the shape of the tree representing
+the query. These two structures are somewhat static, being created when the join is created.
+
+The join cursor contains a pointer to a \c WT_CURSOR_JOIN_ITER, which in some sense encodes the "position"
+of the cursor. To get the next row that satisfies a join requires that multiple cursors be iterated.
+Generally, the "left-most" index cursor is iterated first. Using the example above, the \c price
+index would be iterated, skipping over any entries that did not satisfy `(row.price > 100 and row.price <= 200)`.
+Then other cursors are checked to see that any other conditions are satisfied.
+The join may be configured to use Bloom filters, and when that occurs,
+\c WT_CURSOR_JOIN_ENTRY objects contain the Bloom filter for individual index checks. This allows checks
+to occur quickly, at the expense of an initialization that occurs when the cursor begins iteration.
+
+During a join cursor iteration, multiple \c WT_CURSOR_JOIN_ITER objects may be created.
+There is one \c WT_CURSOR_JOIN_ITER object corresponding to each level of nesting (queries can have
+arbitrary nesting of \c AND/OR). Each \c WT_CURSOR_JOIN_ITER allows the question to be asked: does
+the current position of the main table cursor satisfy the join conditions for this part of the join tree?
+
+Part of the reason for this dynamic structure is that disjunctions may require some dynamic action.
+Consider
+```
+(row.price > 100 and row.in_stock < 10) .... or (row.aisle == 12 and row.on_sale == 1)
+```
+The last part of the query (being part of an OR clause) will be first executed after a number of items
+are returned that satisfy the first part of the query. If the join cursor is closed before then,
+it would be a waste to have opened subordinate cursors on the \c aisle and \c on_sale indices, and
+potentially computed bloom filters, etc.
+
+When positioned on an entry that represents a nested join, a new child \c WT_CURSOR_JOIN_ITER is
+created that will be bound to the nested \c WT_CURSOR_JOIN. That iterator is then used to generate candidate
+primary keys. When its iteration is completed, that iterator is destroyed and the parent iterator
+advances to the next entry. Thus, depending on how deeply joins are nested, a similarly deep
+stack of iterators is created.
+
+@section arch_cursor_duplicate Duplicating cursors
+
+Cursors may be duplicated; this occurs by passing a cursor to be duplicated as part of the
+WT_SESSION::open_cursor call. Cursor duplication does not occur in the cursor type code.
+Rather, a new cursor of the requested type is created, and the cursor's position is duplicated
+via a call to \c __wt_cursor_dup_position. This function gets the key from the original cursor
+in *raw* form (not converting it using \c key_format), sets the key in the new cursor, and
+does a search to set the position properly.
+
+@section arch_cursor_dhandle File cursors, Btrees and data handles
+
+File cursors, Btrees and data handles exist in a WiredTiger system as different ways to
+reference the data in a Btree. It is useful to understand the differences between these structures
+and how they are used.
+
+At the bottom is the data handle (also known as *dhandle*). This is an abstraction of an operating
+system file handle, with a set of flags and some reference counts. A Btree is a much larger
+abstraction, with a memory cache of key value pairs along with functions to read and write data
+as needed to and from the data file. A Btree is paired with a data handle to allow the transfer of data.
+Both the data handle and the Btree are owned by the connection. That is, they are shared among all sessions.
+
+File cursors, on the other hand, are owned by a session. When a session opens a cursor on a file for the
+first time in that session, a file cursor is created. This occurs even if the file may be opened by cursors
+in other sessions already. The session owns the cursor, and the cursor may only be used by that session.
+Open cursors do increment reference counts in the data handle, so that the data handle "knows" it is
+being used, so that the file may not be dropped, renamed, verified or salvaged.
+So when WT_CURSOR::close is called for a file cursor, the cursor's memory may be released (or
+retained if cached), and reference counts decremented. Other sessions may retain open cursors on that file;
+they are independent.
+
+@section arch_cursor_caching Cursor caching
+
+Cursors, upon closing, may be cached in the session. An open of the same URI will return a cached
+cursor if one is found matching the URI. Cursor caching is currently only done on file cursors.
+Because of the optimization for simple tables described above, cursors on simple tables are also cached.
+
+To help implement caching, two methods, \c cache and \c reopen have been added to the cursor API.
+These are not public. Their function is to perform cursor-type specific operations to change a cursor
+from an open state to a cached state (\c cache) and change from a cached state to the open state (\c reopen).
+
+When a cursor is opened the first time, it is marked as *cacheable* or not. Cursors that specify
+certain options, like bulk loading, random, or readonly, are not cacheable. When a cursor is closed, the
+cursor is checked if it is cacheable. If so and if cursor caching is enabled in the session, then it will be
+cached. Cached cursors live in a hash table that is owned by the session. A hash function on the URI is used
+to determine which hash bucket to use. We compute the hash of the URI once; its value is stored
+in the cursor for future use. Thus, caching a cursor (what happens within the type-specific \c cache function,
+e.g. \c __curfile_cache) is relatively quick:
+
+- Free storage that we don't want held (for example, storage used by the cursor's key and value).
+- Get a *weak* reference to the data handle (increment \c dhandle->session_ref).
+- Release the *strong* reference to the data handle (decrement \c dhandle->session_inuse).
+- Determine the hash bucket needed (using the hash value in the cursor).
+- Move the cursor from the session's open list to the list in the hash bucket.
+- Increment statistics and decrement the connection's open cursor count.
+- Set the cache cursor flag.
+- Unlock the dhandle.
+
+The change of reference from a *strong* to a *weak* is significant. When a dhandle's \c session_inuse
+(*strong* reference) drops to zero, it means that no cursor is open on the dhandle, and the only
+references are from cached cursors. In this state, the dhandle may be marked dead by the dhandle sweep.
+When the dhandle is dead, the dhandle's memory will still persist, but each session will eventually
+notice, during its cursor cache sweep, and "fully close" the cursor, removing it from the cache list,
+releasing its weak reference before freeing the cursor. Each session holding cached cursors must have
+some periodic activity that causes it to run its sweep, an occasional call to WT_SESSION::reset will suffice.
+After a dhandle has been dead for enough time, it is expected that all of its weak references will drop to zero,
+and the dhandle itself can be freed by the dhandle sweep.
+
+If there is a failure during the \c cache function, then we would want to fully close the cursor.
+Rather than having special case code to handle this rare condition, we instead call \c reopen to
+temporarily bring the cursor back to an open state, and turn cursor caching off temporarily in
+the session while we close the cursor, releasing all its resources and references.
+
+During a cursor open, if the cursor configuration options allow caching, we hash the uri, and look
+at the corresponding hash bucket in the session cursor cache. If we find a matching cursor, we
+call \c reopen on the cursor. This is what happens within the call to the type-specific \c reopen function
+(e.g. \c __curfile_reopen):
+
+- Lock the dhandle.
+- If the dhandle is no longer open, release it and mark the reopen to fail (but continue).
+- Get a "strong" reference to the data handle (increment \c dhandle->session_inuse).
+- Release the "weak" reference to the data handle (decrement \c dhandle->session_ref).
+- Increment statistics and the connection's open cursor count.
+- Move the handle from the hash bucket to the session's open list (the hash value was previously saved in the cursor)
+- Clear the cache cursor flag.
+- Update convenient pointers within the cursor to parts of the WT_BTREE that may have changed
+ while the cursor was cached.
+
+If the reopen fails (probably due to the dhandle no longer being open or being marked dead),
+we have ensured that enough of the cursor is opened so that it can be legally closed.
+We have studiously avoided having a cursor that is in some state that is half-open and half-closed,
+as it is hard to know how to dispose of it.
+
+@section arch_cursor_sweep Session cursor sweep
+
+Consider a large system that has many sessions using the same set of tables. When all sessions
+have closed a particular table, there will be no need to keep the underlying data handle open.
+In fact, if a WT_SESSION::drop call is called, we want to ensure that the data handle has been
+closed and the corresponding file is removed. Some systems, like Windows, require that all open
+file handles be closed before a file can be successfully removed from the file system. So there
+is motivation to periodically mark data handles that have no active references, and have sessions
+free cached cursors that have weak references to such marked data handles. The former job
+occurs in the connection sweep code. The latter job occurs in the session cursor cache sweep.
+
+The session cursor cache sweep currently happens in the WT_CURSOR::close call,
+and also on calls to WT_SESSION::reset. On one hand, we don't want the overhead
+of a sweep to occur too frequently. It is quite possible that both close and reset can be called a lot
+and there may be many thousands of cached cursors in the session. For that reason, we'd like
+to do the sweep in small increments, and not on each call. On the other hand, in a larger system,
+a session may be part of a pool servicing higher level requests. When a session completes its work,
+it may be left idle by the caller of WiredTiger, and such a session may
+then be idle for long periods of time. When cursor caching is enabled and sessions are not active,
+we want even occasional calls to WT_SESSION::reset to have a strong effect. We want occasional
+sweeps to keep up with freeing up references to data handles, so that otherwise unused data handles
+may in turn be freed eventually.
+
+Our solution to this is three-fold. First, every time we want to call the sweep, a countdown
+counter is used, so we only consider a sweep every \c WT_SESSION_CURSOR_SWEEP_COUNTDOWN times
+(currently `WT_SESSION_CURSOR_SWEEP_COUNTDOWN == 40`). Secondly, we won't sweep if it's already
+been done this second in time. Finally, we sweep by walking a small set of buckets, initially 5 out of
+typically 512 configured buckets. However, depending on how productive our sweep is, that is,
+how many references to closed data handles are freed, we may continue our walk. This should
+usually strike a good balance between not having a lot of overhead for sweeps, and keeping up with
+the need to free up shared resources.
+
+@section arch_cursor_debug_copy Debug copy
+
+When cursors are positioned, their data may point to data in the btree or data allocated in the cursor.
+A caller may use a pointer to that data until the next cursor call. After that, the pointer should not be
+considered valid. By default, WiredTiger does not enforce this. When opening a cursor, a \c "debug=copy" configuration
+flag can be used. This forces any data that is returned by WT_CURSOR::get_key or WT_CURSOR::get_value
+to be in malloc'd memory, and explicitly freed on the next API call. Systems that
+are instrumented to track memory references can detect the references to freed memory, thus latent
+bugs can be detected.
+
+The implementation is straightforward. The key and value in each cursor is represented as a \c WT_ITEM.
+A \c WT_ITEM includes a pointer and size, and it can point to arbitrary memory. However, the \c WT_ITEM
+also includes a memory buffer that may or may not be allocated. When the \c WT_ITEM pointer points
+to the item's own memory buffer, then it is already in malloc'd memory. When \c "debug=copy" is
+configured, it is a simple matter to check if a key and value being returned are already in the item's malloc'd memory.
+If not, memory is allocated, the copy is made and the item's pointer is updated. On the beginning
+of the next API call using that cursor, the item's malloc'd memory is overwritten and freed.
+Thus, in the presence of a memory tracker, uses
+of "stray" pointers will be detected. Even without a memory tracker, uses of "stray" pointers into
+the freed storage will likely yield the overwritten bytes, and not the previously seen key or value.
*/
diff --git a/src/third_party/wiredtiger/src/docs/arch-index.dox b/src/third_party/wiredtiger/src/docs/arch-index.dox
index 21af5d1f62f..52ebc0120d4 100644
--- a/src/third_party/wiredtiger/src/docs/arch-index.dox
+++ b/src/third_party/wiredtiger/src/docs/arch-index.dox
@@ -184,6 +184,10 @@ WiredTiger has a Python API that is useful for scripting and experimentation.
Row Stores are Btrees that have a variable size key and data.
+@subpage arch-rts
+
+Rollback to stable removes the unstable updates from the database.
+
@subpage arch-schema
A schema defines the format of the application data in WiredTiger.
diff --git a/src/third_party/wiredtiger/src/docs/arch-rts.dox b/src/third_party/wiredtiger/src/docs/arch-rts.dox
new file mode 100644
index 00000000000..451732f5776
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/arch-rts.dox
@@ -0,0 +1,113 @@
+/*! @arch_page arch-rts Rollback to stable
+
+Rollback to stable is an operation that retains only the modifications that
+are stable according to the stable timestamp and recovered checkpoint snapshot.
+The checkpoint transaction snapshot details that are saved at the end of every
+checkpoint are recovered and used as the recovered checkpoint snapshot.
+
+@section rts-overview Overview of rollback to stable
+
+Rollback to stable scans each and every table present in the database, except
+the metadata (see @ref arch-metadata), to remove the modifications from the table
+that are newer than the stable timestamp and the recovered checkpoint snapshot.
+
+In the process of removing newer modifications from the table, all the in-memory
+updates are aborted and the on-disk version updates are replaced with an update
+from the history store; otherwise, the data store version is removed.
+
+Rollback to stable is performed in three phases
+1. WT startup
+2. WT shutdown
+3. Application initiated
+
+To improve the performance of rollback to stable operation, rollback to stable
+will perform only on particular tables that need rollback. Rollback to stable
+doesn't operate on logged tables as the updates on these are stable when the
+transaction gets committed.
+
+@section rts-stable-update Stable update of rollback to stable
+
+According to rollback to stable, the stable version of an update is an update that
+has a durable timestamp less than or equal to the stable timestamp and whose
+transaction id is committed according to the checkpoint snapshot.
+
+@section rts-preconditions Pre-conditions required for rollback to stable
+
+To perform rollback to stable, there shouldn't be any transaction activity happening
+in WiredTiger.
+
+@section rts-table-check Checks performed on a table by rollback to stable
+
+Rollback to stable considers a table to be processed for rollback based on the following
+conditions.
+1. Table is modified
+2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp.
+3. There is no durable timestamp in any checkpoint.
+4. Has prepared updates
+5. Has updates from transactions greater than checkpoint snapshot (only in restart phase)
+
+There are some special conditions where the table is skipped for rollback to stable.
+1. Empty file
+2. Table has timestamp updates but there is no stable timestamp set.
+3. Files that don't exist
+4. Files that are corrupted.
+
+If the table has no timestamp updates, then the history store table is
+scanned to remove any historical versions related to this table as these
+older versions are no longer required.
+
+Once all the tables are processed for rollback to stable, at the end the history store
+is processed to remove any unstable updates more than the stable timestamp.
+
+@section rts-how How rollback to stable fixes the unstable updates
+
+Once a table is identified to perform rollback to stable, rollback to stable reads its pages
+into the cache if they are not already there and processes them to remove the unstable updates.
+
+There are two types of rollbacks that rollback to stable performs:
+1. Rolling back unstable fast truncate
+2. Rolling back unstable updates
+
+All internal pages are traversed to rollback unstable fast truncate operations and
+leaf pages are traversed to remove the unstable updates.
+
+Once the leaf page is identified to rollback the updates, it is performed in the
+following order.
+1. Check smallest insert lists on the page
+2. Traverse through all the on-disk keys
+ a. Check update list
+ b. Check insert list
+ c. Check the on-disk version if no stable update found in the update list.
+3. Traverse through the reconciled pages to abort any history store updates.
+
+@section rts-abort-update How rollback to stable aborts in-memory updates
+
+Traverse through all the updates in the update list and abort them until a stable
+update is found.
+
+@section rts-abort-on-disk-update How rollback to stable aborts on-disk update
+
+If the start time pair is not stable, try to find a valid update from the history
+store that is stable to replace the on-disk version; otherwise, remove the on-disk
+key. If the stop time pair exists and is not stable, restore the
+on-disk update again into the update list.
+
+To remove any existing update, rollback to stable adds a globally visible tombstone to
+the key's update list, and this key will get removed later during reconciliation.
+
+Note: As of now, rollback to stable doesn't work on removing on-disk columnar updates.
+
+@section rts-hs-search Rollback to stable history store search
+
+Rollback to stable searches the history store to find a stable update to replace an
+unstable update in the data store. It searches the history store with the given data
+store key and a maximum timestamp, and traverses back until a stable update is found. If
+no valid update is found, the data store key is removed.
+
+@section rts-page-skip Skipping reading unnecessary pages into memory
+
+To improve performance, rollback to stable doesn't load pages that don't have any unstable
+updates to be removed. It determines this during the tree walk by verifying the time aggregated
+values against the stable timestamp or recovered checkpoint snapshot.
+
+*/
diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok
index be45196ff97..5a37d260c1e 100644
--- a/src/third_party/wiredtiger/src/docs/spell.ok
+++ b/src/third_party/wiredtiger/src/docs/spell.ok
@@ -17,6 +17,7 @@ Christoph
Collet's
Coverity
Coverity's
+CURSTD
DB's
DBTs
DHANDLE
@@ -51,6 +52,8 @@ Google's
HyperDex
HyperLevelDB
IEC
+IMPL
+ITER
JDK
JIRA
JavaScript
@@ -114,6 +117,7 @@ Yann
Za
Zstd
aR
+abc
abstime
ack'ed
ajn
@@ -149,6 +153,7 @@ bufs
builtin
builtins
bzip
+cacheable
cachesize
calc
callbk
@@ -180,6 +185,7 @@ crashless
crc
curfile
cursortype
+curtable
customerABC
cv
dN
@@ -197,6 +203,7 @@ dbformat
dbm
dbt
decl
+decrement
decrementing
decrypt
decrypted
@@ -291,6 +298,7 @@ html
htmlinclude
huffman
hugepage
+iSi
icount
ie
iflag
@@ -299,6 +307,7 @@ indices
init
insn
intl
+intpack
inuse
io
ip
@@ -469,6 +478,7 @@ rmw
ro
rotn
rpc
+rts
runnable
runtime
rwlock
@@ -497,6 +507,7 @@ src
ssd
startsync
startuml
+startup
statlog
stderr
stdout
diff --git a/src/third_party/wiredtiger/src/history/hs_cursor.c b/src/third_party/wiredtiger/src/history/hs_cursor.c
index 333ec0e25bb..6baaf94847c 100644
--- a/src/third_party/wiredtiger/src/history/hs_cursor.c
+++ b/src/third_party/wiredtiger/src/history/hs_cursor.c
@@ -180,7 +180,7 @@ __wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t bt
static int
__hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare,
- WT_ITEM *on_disk_buf, WT_TIME_WINDOW *on_disk_tw)
+ WT_ITEM *base_value_buf)
{
WT_CURSOR *hs_cursor;
WT_CURSOR_BTREE *hs_cbt;
@@ -333,14 +333,14 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, hs_cursor), true);
if (ret == WT_NOTFOUND) {
/*
- * Fallback to the onpage value as the base value.
+ * Fallback to the provided value as the base value.
*
* Work around of clang analyzer complaining the value is never read as it is reset
* again by the following WT_ERR macro.
*/
WT_NOT_READ(ret, 0);
orig_hs_value_buf = hs_value;
- hs_value = on_disk_buf;
+ hs_value = base_value_buf;
upd_type = WT_UPDATE_STANDARD;
break;
}
@@ -356,9 +356,9 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts_tmp, &hs_counter_tmp));
if (hs_btree_id != btree_id) {
- /* Fallback to the onpage value as the base value. */
+ /* Fallback to the provided value as the base value. */
orig_hs_value_buf = hs_value;
- hs_value = on_disk_buf;
+ hs_value = base_value_buf;
upd_type = WT_UPDATE_STANDARD;
break;
}
@@ -366,24 +366,22 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
if (cmp != 0) {
- /* Fallback to the onpage value as the base value. */
+ /* Fallback to the provided value as the base value. */
orig_hs_value_buf = hs_value;
- hs_value = on_disk_buf;
+ hs_value = base_value_buf;
upd_type = WT_UPDATE_STANDARD;
break;
}
/*
- * If we find a history store record that either corresponds to the on-disk value or is
- * newer than it then we should use the on-disk value as the base value and apply our
- * modifies on top of it.
+ * If the stop time pair on the tombstone in the history store is already globally
+ * visible fall back to the base value. This is possible in scenarios where the latest
+ * updates are aborted by RTS according to stable timestamp.
*/
- if (on_disk_tw->start_ts < hs_start_ts_tmp ||
- (on_disk_tw->start_ts == hs_start_ts_tmp &&
- on_disk_tw->start_txn <= hs_cbt->upd_value->tw.start_txn)) {
- /* Fallback to the onpage value as the base value. */
+ if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
+ /* Fallback to the provided value as the base value. */
orig_hs_value_buf = hs_value;
- hs_value = on_disk_buf;
+ hs_value = base_value_buf;
upd_type = WT_UPDATE_STANDARD;
break;
}
@@ -451,7 +449,7 @@ err:
*/
int
__wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format, uint64_t recno,
- WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf, WT_TIME_WINDOW *on_disk_tw)
+ WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf)
{
WT_BTREE *btree;
WT_DECL_RET;
@@ -460,8 +458,8 @@ __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma
WT_RET(__wt_hs_cursor_open(session));
WT_WITH_BTREE(session, CUR2BT(session->hs_cursor),
- (ret = __hs_find_upd_int(session, btree->id, key, value_format, recno, upd_value,
- allow_prepare, on_disk_buf, on_disk_tw)));
+ (ret = __hs_find_upd_int(
+ session, btree->id, key, value_format, recno, upd_value, allow_prepare, base_value_buf)));
WT_TRET(__wt_hs_cursor_close(session));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index f11210061b1..79addcfd048 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -252,18 +252,19 @@ struct __wt_btree {
* Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't automatically generate these
* flag values for that reason, there's no way to start at an offset.
*/
-#define WT_BTREE_ALTER 0x000100u /* Handle is for alter */
-#define WT_BTREE_BULK 0x000200u /* Bulk-load handle */
-#define WT_BTREE_CLOSED 0x000400u /* Handle closed */
-#define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */
-#define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */
-#define WT_BTREE_NO_CHECKPOINT 0x002000u /* Disable checkpoints */
-#define WT_BTREE_NO_LOGGING 0x004000u /* Disable logging */
-#define WT_BTREE_READONLY 0x008000u /* Handle is readonly */
-#define WT_BTREE_SALVAGE 0x010000u /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x020000u /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x040000u /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x080000u /* Handle is for verify */
+#define WT_BTREE_ALTER 0x000100u /* Handle is for alter */
+#define WT_BTREE_BULK 0x000200u /* Bulk-load handle */
+#define WT_BTREE_CLOSED 0x000400u /* Handle closed */
+#define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */
+#define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */
+#define WT_BTREE_NO_CHECKPOINT 0x002000u /* Disable checkpoints */
+#define WT_BTREE_NO_LOGGING 0x004000u /* Disable logging */
+#define WT_BTREE_OBSOLETE_PAGES 0x008000u /* Handle has obsolete pages */
+#define WT_BTREE_READONLY 0x010000u /* Handle is readonly */
+#define WT_BTREE_SALVAGE 0x020000u /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x040000u /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x080000u /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x100000u /* Handle is for verify */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index f62570a8041..5af42c1a0e7 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -548,35 +548,42 @@ struct __wt_connection_impl {
*/
WT_FILE_SYSTEM *file_system;
+/*
+ * Server subsystem flags.
+ */
+/* AUTOMATIC FLAG VALUE GENERATION START */
+#define WT_CONN_SERVER_CAPACITY 0x01u
+#define WT_CONN_SERVER_CHECKPOINT 0x02u
+#define WT_CONN_SERVER_LOG 0x04u
+#define WT_CONN_SERVER_LSM 0x08u
+#define WT_CONN_SERVER_STATISTICS 0x10u
+#define WT_CONN_SERVER_SWEEP 0x20u
+ /* AUTOMATIC FLAG VALUE GENERATION STOP */
+ uint32_t server_flags;
+
/* AUTOMATIC FLAG VALUE GENERATION START */
-#define WT_CONN_CACHE_CURSORS 0x0000001u
-#define WT_CONN_CACHE_POOL 0x0000002u
-#define WT_CONN_CKPT_SYNC 0x0000004u
-#define WT_CONN_CLOSING 0x0000008u
-#define WT_CONN_CLOSING_NO_MORE_OPENS 0x0000010u
-#define WT_CONN_CLOSING_TIMESTAMP 0x0000020u
-#define WT_CONN_COMPATIBILITY 0x0000040u
-#define WT_CONN_DATA_CORRUPTION 0x0000080u
-#define WT_CONN_EVICTION_RUN 0x0000100u
-#define WT_CONN_FILE_CLOSE_SYNC 0x0000200u
-#define WT_CONN_HS_OPEN 0x0000400u
-#define WT_CONN_INCR_BACKUP 0x0000800u
-#define WT_CONN_IN_MEMORY 0x0001000u
-#define WT_CONN_LEAK_MEMORY 0x0002000u
-#define WT_CONN_LSM_MERGE 0x0004000u
-#define WT_CONN_OPTRACK 0x0008000u
-#define WT_CONN_PANIC 0x0010000u
-#define WT_CONN_READONLY 0x0020000u
-#define WT_CONN_RECONFIGURING 0x0040000u
-#define WT_CONN_RECOVERING 0x0080000u
-#define WT_CONN_SALVAGE 0x0100000u
-#define WT_CONN_SERVER_CAPACITY 0x0200000u
-#define WT_CONN_SERVER_CHECKPOINT 0x0400000u
-#define WT_CONN_SERVER_LOG 0x0800000u
-#define WT_CONN_SERVER_LSM 0x1000000u
-#define WT_CONN_SERVER_STATISTICS 0x2000000u
-#define WT_CONN_SERVER_SWEEP 0x4000000u
-#define WT_CONN_WAS_BACKUP 0x8000000u
+#define WT_CONN_CACHE_CURSORS 0x000001u
+#define WT_CONN_CACHE_POOL 0x000002u
+#define WT_CONN_CKPT_SYNC 0x000004u
+#define WT_CONN_CLOSING 0x000008u
+#define WT_CONN_CLOSING_NO_MORE_OPENS 0x000010u
+#define WT_CONN_CLOSING_TIMESTAMP 0x000020u
+#define WT_CONN_COMPATIBILITY 0x000040u
+#define WT_CONN_DATA_CORRUPTION 0x000080u
+#define WT_CONN_EVICTION_RUN 0x000100u
+#define WT_CONN_FILE_CLOSE_SYNC 0x000200u
+#define WT_CONN_HS_OPEN 0x000400u
+#define WT_CONN_INCR_BACKUP 0x000800u
+#define WT_CONN_IN_MEMORY 0x001000u
+#define WT_CONN_LEAK_MEMORY 0x002000u
+#define WT_CONN_LSM_MERGE 0x004000u
+#define WT_CONN_OPTRACK 0x008000u
+#define WT_CONN_PANIC 0x010000u
+#define WT_CONN_READONLY 0x020000u
+#define WT_CONN_RECONFIGURING 0x040000u
+#define WT_CONN_RECOVERING 0x080000u
+#define WT_CONN_SALVAGE 0x100000u
+#define WT_CONN_WAS_BACKUP 0x200000u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 68e636cdebf..405ae73401b 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -763,8 +763,8 @@ extern int __wt_hs_delete_key_from_ts(
WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format,
- uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf,
- WT_TIME_WINDOW *on_disk_tw) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
@@ -2070,7 +2070,10 @@ static inline int __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
uint64_t recno, WT_UPDATE *upd, WT_CELL_UNPACK_KV *vpack)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
- WT_UPDATE *upd, WT_UPDATE **prepare_updp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+static inline int __wt_txn_read_upd_list_internal(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ WT_UPDATE *upd, WT_UPDATE **prepare_updp, WT_UPDATE **restored_updp)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_txn_search_check(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline int __wt_txn_update_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index e1c6cea488a..3635dd41d1b 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -783,8 +783,10 @@ struct __wt_connection_stats {
int64_t txn_rts_keys_removed;
int64_t txn_rts_keys_restored;
int64_t txn_rts_hs_restore_tombstones;
+ int64_t txn_rts_hs_restore_updates;
int64_t txn_rts_sweep_hs_keys;
int64_t txn_rts_hs_removed;
+ int64_t txn_checkpoint_obsolete_applied;
int64_t txn_update_conflict;
};
@@ -996,8 +998,10 @@ struct __wt_dsrc_stats {
int64_t txn_rts_keys_removed;
int64_t txn_rts_keys_restored;
int64_t txn_rts_hs_restore_tombstones;
+ int64_t txn_rts_hs_restore_updates;
int64_t txn_rts_sweep_hs_keys;
int64_t txn_rts_hs_removed;
+ int64_t txn_checkpoint_obsolete_applied;
int64_t txn_update_conflict;
};
diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h
index 9aa86728d59..f4a4c552ddb 100644
--- a/src/third_party/wiredtiger/src/include/txn_inline.h
+++ b/src/third_party/wiredtiger/src/include/txn_inline.h
@@ -836,18 +836,21 @@ __wt_upd_alloc_tombstone(WT_SESSION_IMPL *session, WT_UPDATE **updp, size_t *siz
}
/*
- * __wt_txn_read_upd_list --
- * Get the first visible update in a list (or NULL if none are visible).
+ * __wt_txn_read_upd_list_internal --
+ * Internal helper function to get the first visible update in a list (or NULL if none are
+ * visible).
*/
static inline int
-__wt_txn_read_upd_list(
- WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, WT_UPDATE **prepare_updp)
+__wt_txn_read_upd_list_internal(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd,
+ WT_UPDATE **prepare_updp, WT_UPDATE **restored_updp)
{
WT_VISIBLE_TYPE upd_visible;
uint8_t prepare_state, type;
if (prepare_updp != NULL)
*prepare_updp = NULL;
+ if (restored_updp != NULL)
+ *restored_updp = NULL;
__wt_upd_value_clear(cbt->upd_value);
for (; upd != NULL; upd = upd->next) {
@@ -888,6 +891,16 @@ __wt_txn_read_upd_list(
F_ISSET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS))
*prepare_updp = upd;
+ /*
+ * Save the restored update to use it as the base value update in case we need to reach the
+ * history store instead of the on-disk value.
+ */
+ if (restored_updp != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_HS) &&
+ type == WT_UPDATE_STANDARD) {
+ WT_ASSERT(session, *restored_updp == NULL);
+ *restored_updp = upd;
+ }
+
if (upd_visible == WT_VISIBLE_PREPARE) {
/* Ignore the prepared update, if transaction configuration says so. */
if (F_ISSET(session->txn, WT_TXN_IGNORE_PREPARE))
@@ -916,6 +929,16 @@ __wt_txn_read_upd_list(
}
/*
+ * __wt_txn_read_upd_list --
+ * Get the first visible update in a list (or NULL if none are visible).
+ */
+static inline int
+__wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ return __wt_txn_read_upd_list_internal(session, cbt, upd, NULL, NULL);
+}
+
+/*
* __wt_txn_read --
* Get the first visible update in a chain. This function will first check the update list
* supplied as a function argument. If there is no visible update, it will check the onpage
@@ -927,14 +950,14 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint
WT_UPDATE *upd, WT_CELL_UNPACK_KV *vpack)
{
WT_TIME_WINDOW tw;
- WT_UPDATE *prepare_upd;
+ WT_UPDATE *prepare_upd, *restored_upd;
bool have_stop_tw, retry;
- prepare_upd = NULL;
+ prepare_upd = restored_upd = NULL;
retry = true;
retry:
- WT_RET(__wt_txn_read_upd_list(session, cbt, upd, &prepare_upd));
+ WT_RET(__wt_txn_read_upd_list_internal(session, cbt, upd, &prepare_upd, &restored_upd));
if (WT_UPDATE_DATA_VALUE(cbt->upd_value) ||
(cbt->upd_value->type == WT_UPDATE_MODIFY && cbt->upd_value->skip_buf))
return (0);
@@ -947,66 +970,77 @@ retry:
}
/*
- * When we inspected the update list we may have seen a tombstone leaving us with a valid stop
- * time window, we don't want to overwrite this stop time window.
+ * Skip retrieving the on-disk value when there exists a restored update from history store in
+ * the update list. Having a restored update as part of the update list indicates that the
+ * existing on-disk value is unstable.
*/
- have_stop_tw = WT_TIME_WINDOW_HAS_STOP(&cbt->upd_value->tw);
-
- /* Check the ondisk value. */
- if (vpack == NULL) {
- WT_TIME_WINDOW_INIT(&tw);
- WT_RET(__wt_value_return_buf(cbt, cbt->ref, &cbt->upd_value->buf, &tw));
+ if (restored_upd != NULL) {
+ WT_ASSERT(session, !WT_IS_HS(session->dhandle));
+ cbt->upd_value->buf.data = restored_upd->data;
+ cbt->upd_value->buf.size = restored_upd->size;
} else {
- WT_TIME_WINDOW_COPY(&tw, &vpack->tw);
- cbt->upd_value->buf.data = vpack->data;
- cbt->upd_value->buf.size = vpack->size;
- }
-
- /*
- * If the stop time point is set, that means that there is a tombstone at that time. If it is
- * not prepared and it is visible to our txn it means we've just spotted a tombstone and should
- * return "not found", except scanning the history store during rollback to stable and when we
- * are told to ignore non-globally visible tombstones.
- */
- if (!have_stop_tw && __wt_txn_tw_stop_visible(session, &tw) &&
- !F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE)) {
- cbt->upd_value->buf.data = NULL;
- cbt->upd_value->buf.size = 0;
- cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts;
- cbt->upd_value->tw.stop_ts = tw.stop_ts;
- cbt->upd_value->tw.stop_txn = tw.stop_txn;
- cbt->upd_value->tw.prepare = tw.prepare;
- cbt->upd_value->type = WT_UPDATE_TOMBSTONE;
- return (0);
- }
-
- /* Store the stop time pair of the history store record that is returning. */
- if (!have_stop_tw && WT_TIME_WINDOW_HAS_STOP(&tw) && WT_IS_HS(session->dhandle)) {
- cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts;
- cbt->upd_value->tw.stop_ts = tw.stop_ts;
- cbt->upd_value->tw.stop_txn = tw.stop_txn;
- cbt->upd_value->tw.prepare = tw.prepare;
- }
+ /*
+ * When we inspected the update list we may have seen a tombstone leaving us with a valid
+ * stop time window, we don't want to overwrite this stop time window.
+ */
+ have_stop_tw = WT_TIME_WINDOW_HAS_STOP(&cbt->upd_value->tw);
+
+ /* Check the ondisk value. */
+ if (vpack == NULL) {
+ WT_TIME_WINDOW_INIT(&tw);
+ WT_RET(__wt_value_return_buf(cbt, cbt->ref, &cbt->upd_value->buf, &tw));
+ } else {
+ WT_TIME_WINDOW_COPY(&tw, &vpack->tw);
+ cbt->upd_value->buf.data = vpack->data;
+ cbt->upd_value->buf.size = vpack->size;
+ }
- /* If the start time point is visible then we need to return the ondisk value. */
- if (WT_IS_HS(session->dhandle) || __wt_txn_tw_start_visible(session, &tw)) {
- if (cbt->upd_value->skip_buf) {
+ /*
+ * If the stop time point is set, that means that there is a tombstone at that time. If it
+ * is not prepared and it is visible to our txn it means we've just spotted a tombstone and
+ * should return "not found", except scanning the history store during rollback to stable
+ * and when we are told to ignore non-globally visible tombstones.
+ */
+ if (!have_stop_tw && __wt_txn_tw_stop_visible(session, &tw) &&
+ !F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE)) {
cbt->upd_value->buf.data = NULL;
cbt->upd_value->buf.size = 0;
+ cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts;
+ cbt->upd_value->tw.stop_ts = tw.stop_ts;
+ cbt->upd_value->tw.stop_txn = tw.stop_txn;
+ cbt->upd_value->tw.prepare = tw.prepare;
+ cbt->upd_value->type = WT_UPDATE_TOMBSTONE;
+ return (0);
+ }
+
+ /* Store the stop time pair of the history store record that is returning. */
+ if (!have_stop_tw && WT_TIME_WINDOW_HAS_STOP(&tw) && WT_IS_HS(session->dhandle)) {
+ cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts;
+ cbt->upd_value->tw.stop_ts = tw.stop_ts;
+ cbt->upd_value->tw.stop_txn = tw.stop_txn;
+ cbt->upd_value->tw.prepare = tw.prepare;
+ }
+
+ /* If the start time point is visible then we need to return the ondisk value. */
+ if (WT_IS_HS(session->dhandle) || __wt_txn_tw_start_visible(session, &tw)) {
+ if (cbt->upd_value->skip_buf) {
+ cbt->upd_value->buf.data = NULL;
+ cbt->upd_value->buf.size = 0;
+ }
+ cbt->upd_value->tw.durable_start_ts = tw.durable_start_ts;
+ cbt->upd_value->tw.start_ts = tw.start_ts;
+ cbt->upd_value->tw.start_txn = tw.start_txn;
+ cbt->upd_value->tw.prepare = tw.prepare;
+ cbt->upd_value->type = WT_UPDATE_STANDARD;
+ return (0);
}
- cbt->upd_value->tw.durable_start_ts = tw.durable_start_ts;
- cbt->upd_value->tw.start_ts = tw.start_ts;
- cbt->upd_value->tw.start_txn = tw.start_txn;
- cbt->upd_value->tw.prepare = tw.prepare;
- cbt->upd_value->type = WT_UPDATE_STANDARD;
- return (0);
}
/* If there's no visible update in the update chain or ondisk, check the history store file. */
if (F_ISSET(S2C(session), WT_CONN_HS_OPEN) && !F_ISSET(session->dhandle, WT_DHANDLE_HS)) {
__wt_timing_stress(session, WT_TIMING_STRESS_HS_SEARCH);
WT_RET(__wt_hs_find_upd(session, key, cbt->iface.value_format, recno, cbt->upd_value, false,
- &cbt->upd_value->buf, &tw));
+ &cbt->upd_value->buf));
}
/*
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index d59138e1bb0..8b760c6df42 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -5875,12 +5875,16 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1468
/*! transaction: rollback to stable restored tombstones from history store */
#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1469
+/*! transaction: rollback to stable restored updates from history store */
+#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1470
/*! transaction: rollback to stable sweeping history store keys */
-#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1470
+#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1471
/*! transaction: rollback to stable updates removed from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1471
+#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1472
+/*! transaction: transaction checkpoints due to obsolete pages */
+#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1473
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1472
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1474
/*!
* @}
@@ -6484,12 +6488,16 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2201
/*! transaction: rollback to stable restored tombstones from history store */
#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2202
+/*! transaction: rollback to stable restored updates from history store */
+#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES 2203
/*! transaction: rollback to stable sweeping history store keys */
-#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2203
+#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2204
/*! transaction: rollback to stable updates removed from history store */
-#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2204
+#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2205
+/*! transaction: transaction checkpoints due to obsolete pages */
+#define WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED 2206
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2205
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2207
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
index 707ef28c086..1ca3f32f262 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -213,7 +213,7 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session)
manager->lsm_worker_cookies[i].session = worker_session;
}
- F_SET(conn, WT_CONN_SERVER_LSM);
+ FLD_SET(conn->server_flags, WT_CONN_SERVER_LSM);
/* Start the LSM manager thread. */
WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid, __lsm_worker_manager,
@@ -269,7 +269,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
removed = 0;
/* Clear the LSM server flag. */
- F_CLR(conn, WT_CONN_SERVER_LSM);
+ FLD_CLR(conn->server_flags, WT_CONN_SERVER_LSM);
WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || manager->lsm_workers == 0);
if (manager->lsm_workers > 0) {
@@ -351,7 +351,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
conn = S2C(session);
dhandle_locked = false;
- while (F_ISSET(conn, WT_CONN_SERVER_LSM)) {
+ while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LSM)) {
__wt_sleep(0, 10000);
if (TAILQ_EMPTY(&conn->lsmqh))
continue;
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 7ebdf736c2f..9581144489a 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -1760,7 +1760,7 @@ __transaction_sync_run_chk(WT_SESSION_IMPL *session)
conn = S2C(session);
- return (FLD_ISSET(conn->flags, WT_CONN_SERVER_LOG));
+ return (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG));
}
/*
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 1ee255be706..100fa7b3ed8 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -210,8 +210,10 @@ static const char *const __stats_dsrc_desc[] = {
"transaction: rollback to stable keys removed",
"transaction: rollback to stable keys restored",
"transaction: rollback to stable restored tombstones from history store",
+ "transaction: rollback to stable restored updates from history store",
"transaction: rollback to stable sweeping history store keys",
"transaction: rollback to stable updates removed from history store",
+ "transaction: transaction checkpoints due to obsolete pages",
"transaction: update conflicts",
};
@@ -456,8 +458,10 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->txn_rts_keys_removed = 0;
stats->txn_rts_keys_restored = 0;
stats->txn_rts_hs_restore_tombstones = 0;
+ stats->txn_rts_hs_restore_updates = 0;
stats->txn_rts_sweep_hs_keys = 0;
stats->txn_rts_hs_removed = 0;
+ stats->txn_checkpoint_obsolete_applied = 0;
stats->txn_update_conflict = 0;
}
@@ -689,8 +693,10 @@ __wt_stat_dsrc_aggregate_single(WT_DSRC_STATS *from, WT_DSRC_STATS *to)
to->txn_rts_keys_removed += from->txn_rts_keys_removed;
to->txn_rts_keys_restored += from->txn_rts_keys_restored;
to->txn_rts_hs_restore_tombstones += from->txn_rts_hs_restore_tombstones;
+ to->txn_rts_hs_restore_updates += from->txn_rts_hs_restore_updates;
to->txn_rts_sweep_hs_keys += from->txn_rts_sweep_hs_keys;
to->txn_rts_hs_removed += from->txn_rts_hs_removed;
+ to->txn_checkpoint_obsolete_applied += from->txn_checkpoint_obsolete_applied;
to->txn_update_conflict += from->txn_update_conflict;
}
@@ -928,8 +934,10 @@ __wt_stat_dsrc_aggregate(WT_DSRC_STATS **from, WT_DSRC_STATS *to)
to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed);
to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored);
to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones);
+ to->txn_rts_hs_restore_updates += WT_STAT_READ(from, txn_rts_hs_restore_updates);
to->txn_rts_sweep_hs_keys += WT_STAT_READ(from, txn_rts_sweep_hs_keys);
to->txn_rts_hs_removed += WT_STAT_READ(from, txn_rts_hs_removed);
+ to->txn_checkpoint_obsolete_applied += WT_STAT_READ(from, txn_checkpoint_obsolete_applied);
to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict);
}
@@ -1416,8 +1424,10 @@ static const char *const __stats_connection_desc[] = {
"transaction: rollback to stable keys removed",
"transaction: rollback to stable keys restored",
"transaction: rollback to stable restored tombstones from history store",
+ "transaction: rollback to stable restored updates from history store",
"transaction: rollback to stable sweeping history store keys",
"transaction: rollback to stable updates removed from history store",
+ "transaction: transaction checkpoints due to obsolete pages",
"transaction: update conflicts",
};
@@ -1929,8 +1939,10 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->txn_rts_keys_removed = 0;
stats->txn_rts_keys_restored = 0;
stats->txn_rts_hs_restore_tombstones = 0;
+ stats->txn_rts_hs_restore_updates = 0;
stats->txn_rts_sweep_hs_keys = 0;
stats->txn_rts_hs_removed = 0;
+ stats->txn_checkpoint_obsolete_applied = 0;
stats->txn_update_conflict = 0;
}
@@ -2453,8 +2465,10 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed);
to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored);
to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones);
+ to->txn_rts_hs_restore_updates += WT_STAT_READ(from, txn_rts_hs_restore_updates);
to->txn_rts_sweep_hs_keys += WT_STAT_READ(from, txn_rts_sweep_hs_keys);
to->txn_rts_hs_removed += WT_STAT_READ(from, txn_rts_hs_removed);
+ to->txn_checkpoint_obsolete_applied += WT_STAT_READ(from, txn_checkpoint_obsolete_applied);
to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict);
}
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
index 26c750fb496..1694e57dbc3 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
@@ -34,8 +34,7 @@ __curtiered_open_cursors(WT_CURSOR_TIERED *curtiered)
dhandle = NULL;
tiered = curtiered->tiered;
- if (tiered->ntiers == 0)
- return (0);
+ WT_ASSERT(session, tiered->ntiers > 0);
/*
* If the key is pointing to memory that is pinned by a chunk cursor, take a copy before closing
@@ -1017,21 +1016,14 @@ err:
* documents avoids biasing towards small chunks. Then return the cursor on the chunk we have
* picked.
*/
-static int
+static void
__curtiered_random_chunk(WT_SESSION_IMPL *session, WT_CURSOR_TIERED *curtiered, WT_CURSOR **cursor)
{
- u_int i, ntiers;
-
- /*
- * If the tree is empty we cannot do a random lookup, so return a WT_NOTFOUND.
- */
- if ((ntiers = curtiered->tiered->ntiers) == 0)
- return (WT_NOTFOUND);
+ u_int i;
/* TODO: make randomness respect tree size. */
- i = __wt_random(&session->rnd) % ntiers;
+ i = __wt_random(&session->rnd) % curtiered->tiered->ntiers;
*cursor = curtiered->cursors[i];
- return (0);
}
/*
@@ -1055,7 +1047,7 @@ __curtiered_next_random(WT_CURSOR *cursor)
WT_ERR(__curtiered_enter(curtiered, false));
for (;;) {
- WT_ERR(__curtiered_random_chunk(session, curtiered, &c));
+ __curtiered_random_chunk(session, curtiered, &c);
/*
* This call to next_random on the chunk can potentially end in WT_NOTFOUND if the chunk we
* picked is empty. We want to retry in that case.
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_schema.c b/src/third_party/wiredtiger/src/tiered/tiered_schema.c
index dc153b31e43..6e7dd84c0e3 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_schema.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_schema.c
@@ -15,12 +15,16 @@
int
__wt_tiered_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const char *config)
{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM ckey, cval, tierconf;
WT_DECL_RET;
+ int ntiers;
char *meta_value;
const char *cfg[] = {WT_CONFIG_BASE(session, tiered_meta), config, NULL};
const char *metadata;
metadata = NULL;
+ ntiers = 0;
/* If it can be opened, it already exists. */
if ((ret = __wt_metadata_search(session, uri, &meta_value)) != WT_NOTFOUND) {
@@ -30,12 +34,24 @@ __wt_tiered_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, co
}
WT_RET_NOTFOUND_OK(ret);
+ /* A tiered cursor must specify at least one underlying table */
+ WT_RET(__wt_config_gets(session, cfg, "tiered.tiers", &tierconf));
+ __wt_config_subinit(session, &cparser, &tierconf);
+
+ while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
+ ++ntiers;
+ WT_RET_NOTFOUND_OK(ret);
+
+ if (ntiers == 0)
+ WT_RET_MSG(session, EINVAL, "tiered table must specify at least one tier");
+
if (!F_ISSET(S2C(session), WT_CONN_READONLY)) {
WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata));
WT_ERR(__wt_metadata_insert(session, uri, metadata));
}
err:
+ __wt_free(session, meta_value);
__wt_free(session, metadata);
return (ret);
}
@@ -188,14 +204,14 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[])
/* Point to some items in the copy to save re-parsing. */
WT_RET(__wt_config_gets(session, tiered_cfg, "tiered.tiers", &tierconf));
- /*
- * Count the number of tiers.
- */
+ /* Count the number of tiers. */
__wt_config_subinit(session, &cparser, &tierconf);
while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
++tiered->ntiers;
WT_RET_NOTFOUND_OK(ret);
+ WT_ASSERT(session, tiered->ntiers > 0);
+
WT_RET(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_calloc_def(session, tiered->ntiers, &tiered->tiers));
@@ -204,7 +220,7 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_config_next(&cparser, &ckey, &cval));
WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)ckey.len, ckey.str));
WT_ERR(__wt_session_get_dhandle(session, (const char *)buf->data, NULL, cfg, 0));
- __wt_atomic_addi32(&session->dhandle->session_inuse, 1);
+ (void)__wt_atomic_addi32(&session->dhandle->session_inuse, 1);
/* Load in reverse order (based on LSM logic). */
tiered->tiers[(tiered->ntiers - 1) - i] = session->dhandle;
WT_ERR(__wt_session_release_dhandle(session));
@@ -247,7 +263,7 @@ __wt_tiered_close(WT_SESSION_IMPL *session, WT_TIERED *tiered)
__wt_free(session, tiered->value_format);
if (tiered->tiers != NULL) {
for (i = 0; i < tiered->ntiers; i++)
- __wt_atomic_subi32(&tiered->tiers[i]->session_inuse, 1);
+ (void)__wt_atomic_subi32(&tiered->tiers[i]->session_inuse, 1);
__wt_free(session, tiered->tiers);
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index d9a15ed067c..887abaa503d 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -1415,14 +1415,17 @@ __checkpoint_lock_dirty_tree(
if (now > btree->clean_ckpt_timer)
skip_ckpt = false;
}
- if (skip_ckpt) {
+
+ /* Skip the clean btree unless the btree has obsolete pages. */
+ if (skip_ckpt && !F_ISSET(btree, WT_BTREE_OBSOLETE_PAGES)) {
F_SET(btree, WT_BTREE_SKIP_CKPT);
goto skip;
}
}
- /* If we have to process this btree for any reason, reset the timer. */
+ /* If we have to process this btree for any reason, reset the timer and obsolete pages flag. */
WT_BTREE_CLEAN_CKPT(session, btree, 0);
+ F_CLR(btree, WT_BTREE_OBSOLETE_PAGES);
/* Get the list of checkpoints for this file. */
WT_ERR(__wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase));
@@ -1486,6 +1489,35 @@ skip:
}
/*
+ * __checkpoint_apply_obsolete --
+ * Returns true if the checkpoint is obsolete.
+ */
+static bool
+__checkpoint_apply_obsolete(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_CKPT *ckpt)
+{
+ wt_timestamp_t stop_ts;
+
+ stop_ts = WT_TS_MAX;
+ if (ckpt->size != 0) {
+ /*
+ * If the checkpoint has a valid stop timestamp, mark the btree as having obsolete pages.
+ * This flag is used to avoid skipping the btree until the obsolete check is performed on
+ * the checkpoints.
+ */
+ if (ckpt->ta.newest_stop_ts != WT_TS_MAX) {
+ F_SET(btree, WT_BTREE_OBSOLETE_PAGES);
+ stop_ts = ckpt->ta.newest_stop_durable_ts;
+ }
+ if (__wt_txn_visible_all(session, ckpt->ta.newest_stop_txn, stop_ts)) {
+ WT_STAT_CONN_DATA_INCR(session, txn_checkpoint_obsolete_applied);
+ return (true);
+ }
+ }
+
+ return (false);
+}
+
+/*
* __checkpoint_mark_skip --
* Figure out whether the checkpoint can be skipped for a tree.
*/
@@ -1523,9 +1555,17 @@ __checkpoint_mark_skip(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force)
F_CLR(btree, WT_BTREE_SKIP_CKPT);
if (!btree->modified && !force) {
deleted = 0;
- WT_CKPT_FOREACH (ckptbase, ckpt)
+ WT_CKPT_FOREACH (ckptbase, ckpt) {
+ /*
+ * Don't skip objects that have obsolete pages, so that those pages can be removed as part of
+ * checkpoint cleanup.
+ */
+ if (__checkpoint_apply_obsolete(session, btree, ckpt))
+ return (0);
+
if (F_ISSET(ckpt, WT_CKPT_DELETE))
++deleted;
+ }
/*
* Complicated test: if the tree is clean and last two checkpoints have the same name
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index b30cf03be69..9ad6b7abd6d 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -251,12 +251,22 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
WT_ERR(hs_cursor->get_value(
hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value));
type = (uint8_t)type_full;
- if (type == WT_UPDATE_MODIFY)
- WT_ERR(__wt_modify_apply_item(
- session, S2BT(session)->value_format, &full_value, hs_value->data));
- else {
- WT_ASSERT(session, type == WT_UPDATE_STANDARD);
- WT_ERR(__wt_buf_set(session, &full_value, hs_value->data, hs_value->size));
+
+ /*
+ * Do not include history store updates greater than the on-disk data store version when
+ * constructing a full update to restore. Comparing timestamps is safe here, unlike in the search
+ * flow where the timestamps may be reset during reconciliation: RTS detects that an on-disk
+ * update is unstable based on its properly written timestamp, so comparing that timestamp
+ * against the history store update's timestamp shouldn't cause any problem.
+ */
+ if (hs_start_ts <= unpack->tw.start_ts) {
+ if (type == WT_UPDATE_MODIFY)
+ WT_ERR(__wt_modify_apply_item(
+ session, S2BT(session)->value_format, &full_value, hs_value->data));
+ else {
+ WT_ASSERT(session, type == WT_UPDATE_STANDARD);
+ WT_ERR(__wt_buf_set(session, &full_value, hs_value->data, hs_value->size));
+ }
}
/*
@@ -280,9 +290,10 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
*/
if (!replace && hs_stop_durable_ts <= rollback_timestamp) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "history store update valid with stop timestamp: %s and stable timestamp: %s",
+ "history store update valid with stop timestamp: %s, stable timestamp: %s and type: "
+ "%" PRIu8,
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[0]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[1]));
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), type);
break;
}
@@ -290,22 +301,23 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
if (hs_durable_ts <= rollback_timestamp) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"history store update valid with start timestamp: %s, durable timestamp: %s, stop "
- "timestamp: %s and stable timestamp: %s",
+ "timestamp: %s, stable timestamp: %s and type: %" PRIu8,
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[3]));
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), type);
+ WT_ASSERT(session, cbt->upd_value->tw.start_ts < unpack->tw.start_ts);
valid_update_found = true;
break;
}
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"history store update aborted with start timestamp: %s, durable timestamp: %s, stop "
- "timestamp: %s and stable timestamp: %s",
+ "timestamp: %s, stable timestamp: %s and type: %" PRIu8,
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[3]));
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), type);
/*
* Start time point of the current record may be used as stop time point of the previous
@@ -329,6 +341,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
* list. Otherwise remove the key by adding a tombstone.
*/
if (valid_update_found) {
+ WT_ASSERT(session, cbt->upd_value->tw.start_ts < unpack->tw.start_ts);
WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL));
upd->txnid = cbt->upd_value->tw.start_txn;
@@ -336,7 +349,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
upd->start_ts = cbt->upd_value->tw.start_ts;
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"update restored from history store (txnid: %" PRIu64
- ", start_ts: %s, durable_ts: %s",
+ ", start_ts: %s and durable_ts: %s",
upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]),
__wt_timestamp_to_string(upd->durable_ts, ts_string[1]));
@@ -345,6 +358,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
* the rollback to stable operation.
*/
F_SET(upd, WT_UPDATE_RESTORED_FROM_HS);
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_updates);
/*
* We have a tombstone on the original update chain and it is behind the stable
diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c
index 13a3577745f..1c9cae21bbf 100644
--- a/src/third_party/wiredtiger/src/utilities/util_list.c
+++ b/src/third_party/wiredtiger/src/utilities/util_list.c
@@ -88,6 +88,12 @@ list_init_block(WT_SESSION *session, const char *key, WT_BLOCK *block)
wt_api = session->connection->get_extension_api(session->connection);
if ((ret = wt_api->metadata_search(wt_api, session, key, &config)) != 0)
WT_ERR(util_err(session, ret, "%s: WT_EXTENSION_API.metadata_search", key));
+ /*
+ * The config variable should be set and not NULL, but Coverity is convinced otherwise. This is
+ * an infrequent code path. Just add this extra conditional to make it happy.
+ */
+ if (config == NULL)
+ goto err;
if ((ret = wt_api->config_parser_open(wt_api, session, config, strlen(config), &parser)) != 0)
WT_ERR(util_err(session, ret, "WT_EXTENSION_API.config_parser_open"));
if ((ret = parser->get(parser, "allocation_size", &cval)) == 0)
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index 13d91b793e3..f244df90452 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -31,6 +31,7 @@
static void config_backup_incr(void);
static void config_backup_incr_granularity(void);
+static void config_backup_incr_log_compatibility_check(void);
static void config_backward_compatible(void);
static void config_cache(void);
static void config_checkpoint(void);
@@ -267,12 +268,8 @@ config_backup_incr(void)
* archival doesn't seem as useful as testing backup, let the backup configuration override.
*/
if (config_is_perm("backup.incremental")) {
- if (g.c_backup_incr_flag == INCREMENTAL_LOG) {
- if (g.c_logging_archive && config_is_perm("logging.archive"))
- testutil_die(EINVAL, "backup.incremental=log is incompatible with logging.archive");
- if (g.c_logging_archive)
- config_single("logging.archive=0", false);
- }
+ if (g.c_backup_incr_flag == INCREMENTAL_LOG)
+ config_backup_incr_log_compatibility_check();
if (g.c_backup_incr_flag == INCREMENTAL_BLOCK)
config_backup_incr_granularity();
return;
@@ -761,6 +758,23 @@ config_in_memory_reset(void)
}
/*
+ * config_backup_incr_log_compatibility_check --
+ * Backup incremental log compatibility check.
+ */
+static void
+config_backup_incr_log_compatibility_check(void)
+{
+ /*
+ * Incremental backup using log files is incompatible with logging archival. Disable logging
+ * archival if log incremental backup is set.
+ */
+ if (g.c_logging_archive && config_is_perm("logging.archive"))
+ testutil_die(EINVAL, "backup.incremental=log is incompatible with logging.archive");
+ if (g.c_logging_archive)
+ config_single("logging.archive=0", false);
+}
+
+/*
* config_lsm_reset --
* LSM configuration review.
*/
@@ -801,6 +815,7 @@ config_lsm_reset(void)
case 2:
/* 50% */
config_single("backup.incremental=log", false);
+ config_backup_incr_log_compatibility_check();
break;
}
}
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint08.py b/src/third_party/wiredtiger/test/suite/test_checkpoint08.py
new file mode 100755
index 00000000000..047b36dcd50
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint08.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_checkpoint08.py
+# Test that the btree checkpoint is not skipped if there are obsolete pages.
+
+import wiredtiger, wttest
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+
+def timestamp_str(t):
+ return '%x' % t
+
+class test_checkpoint08(wttest.WiredTigerTestCase):
+ conn_config = 'cache_size=50MB,log=(enabled),statistics=(all)'
+ session_config = 'isolation=snapshot'
+
+ def get_stat(self, uri):
+ stat_uri = 'statistics:' + uri
+ stat_cursor = self.session.open_cursor(stat_uri)
+ val = stat_cursor[stat.dsrc.btree_clean_checkpoint_timer][2]
+ stat_cursor.close()
+ return val
+
+ def test_checkpoint08(self):
+ self.uri1 = 'table:ckpt08.1'
+ self.file1 = 'file:ckpt08.1.wt'
+ self.uri2 = 'table:ckpt08.2'
+ self.file2 = 'file:ckpt08.2.wt'
+ self.hsfile = 'file:WiredTigerHS.wt'
+ self.session.create(self.uri1, 'key_format=i,value_format=i')
+ self.session.create(self.uri2, 'key_format=i,value_format=i')
+
+ # Pin oldest and stable to timestamp 1.
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1) +
+ ',stable_timestamp=' + timestamp_str(1))
+
+ # Setup: Insert some data into both tables and checkpoint it. Then modify
+ # both tables and checkpoint again. Verify the clean skip timer is not set
+ # for either of the modified tables.
+ c1 = self.session.open_cursor(self.uri1, None)
+ c2 = self.session.open_cursor(self.uri2, None)
+
+ self.session.begin_transaction()
+ c1[1] = 1
+ c2[1] = 1
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
+
+ self.session.begin_transaction()
+ c1[1] = 10
+ c2[1] = 10
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
+
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(3))
+ self.session.checkpoint(None)
+
+ # Modify both tables and reverify.
+ self.session.begin_transaction()
+ c1[3] = 3
+ c2[3] = 3
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(4))
+
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(4))
+ self.session.checkpoint(None)
+
+ val = self.get_stat(self.uri1)
+ self.assertEqual(val, 0)
+ val = self.get_stat(self.uri2)
+ self.assertEqual(val, 0)
+ hsval = self.get_stat(self.hsfile)
+ self.assertNotEqual(hsval, 0)
+
+ # Modify both tables and reverify once the oldest timestamp has moved.
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(4))
+ self.session.begin_transaction()
+ c1[4] = 4
+ c2[4] = 4
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))
+
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(5))
+ self.session.checkpoint(None)
+
+ val = self.get_stat(self.uri1)
+ self.assertEqual(val, 0)
+ val = self.get_stat(self.uri2)
+ self.assertEqual(val, 0)
+ hsval = self.get_stat(self.hsfile)
+ self.assertEqual(hsval, 0)
+
+ stat_cursor = self.session.open_cursor('statistics:file:WiredTigerHS.wt', None, None)
+ obsolete_applied = stat_cursor[stat.dsrc.txn_checkpoint_obsolete_applied][2]
+ self.assertEqual(obsolete_applied, 1)
+ stat_cursor.close()
+
+ c1.close()
+ c2.close()
+ self.session.close()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_hs05.py b/src/third_party/wiredtiger/test/suite/test_hs05.py
index f2d93a40547..745d22d5480 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs05.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs05.py
@@ -30,6 +30,7 @@ from helper import copy_wiredtiger_home
import wiredtiger, wttest
from wiredtiger import stat
from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
def timestamp_str(t):
return '%x' % t
@@ -44,6 +45,12 @@ class test_hs05(wttest.WiredTigerTestCase):
conn_config += 'eviction_updates_target=100,eviction_updates_trigger=100'
session_config = 'isolation=snapshot'
stable = 1
+ key_format_values = [
+ ('column', dict(key_format='r')),
+ ('integer', dict(key_format='i')),
+ ('string', dict(key_format='S'))
+ ]
+ scenarios = make_scenarios(key_format_values)
def get_stat(self, stat):
stat_cursor = self.session.open_cursor('statistics:')
@@ -71,7 +78,7 @@ class test_hs05(wttest.WiredTigerTestCase):
# Create a small table.
uri = "table:test_hs05"
nrows = 100
- ds = SimpleDataSet(self, uri, nrows, key_format="S", value_format='u')
+ ds = SimpleDataSet(self, uri, nrows, key_format=self.key_format, value_format='u')
ds.populate()
bigvalue = b"aaaaa" * 100
diff --git a/src/third_party/wiredtiger/test/suite/test_hs06.py b/src/third_party/wiredtiger/test/suite/test_hs06.py
index 967f1519807..cc38097da7f 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs06.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs06.py
@@ -47,8 +47,7 @@ class test_hs06(wttest.WiredTigerTestCase):
conn_config = 'cache_size=50MB,statistics=(fast)'
session_config = 'isolation=snapshot'
key_format_values = [
- # The commented columnar tests needs to be enabled once columnar page instantiated is fixed in (WT-6061).
- # ('column', dict(key_format='r')),
+ ('column', dict(key_format='r')),
('integer', dict(key_format='i')),
('string', dict(key_format='S'))
]
@@ -210,6 +209,11 @@ class test_hs06(wttest.WiredTigerTestCase):
self.session.rollback_transaction()
def test_hs_prepare_reads(self):
+ # Prepare reads are currently not supported with the columnar store.
+ # Remove this once prepare reads are supported in WT-6061.
+ if self.key_format == 'r':
+ return
+
# Create a small table.
uri = "table:test_hs06"
create_params = 'key_format={},value_format=S'.format(self.key_format)
diff --git a/src/third_party/wiredtiger/test/suite/test_hs07.py b/src/third_party/wiredtiger/test/suite/test_hs07.py
index 14ad15c3281..37b451f3b79 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs07.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs07.py
@@ -30,6 +30,7 @@ import time
from helper import copy_wiredtiger_home
import wiredtiger, wttest
from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
def timestamp_str(t):
return '%x' % t
@@ -42,6 +43,12 @@ class test_hs07(wttest.WiredTigerTestCase):
'eviction_updates_target=80,log=(enabled)')
session_config = 'isolation=snapshot'
+ key_format_values = (
+ ('column', dict(key_format='r')),
+ ('int', dict(key_format='i'))
+ )
+ scenarios = make_scenarios(key_format_values)
+
def large_updates(self, uri, value, ds, nrows, commit_ts):
# Update a large number of records, we'll hang if the history store table isn't working.
session = self.session
@@ -70,11 +77,11 @@ class test_hs07(wttest.WiredTigerTestCase):
# behavior.
uri = "table:las07_main"
ds = SimpleDataSet(
- self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)')
+ self, uri, 0, key_format=self.key_format, value_format="S", config='log=(enabled=false)')
ds.populate()
uri2 = "table:las07_extra"
- ds2 = SimpleDataSet(self, uri2, 0, key_format="i", value_format="S")
+ ds2 = SimpleDataSet(self, uri2, 0, key_format=self.key_format, value_format="S")
ds2.populate()
# Pin oldest and stable to timestamp 1.
diff --git a/src/third_party/wiredtiger/test/suite/test_hs08.py b/src/third_party/wiredtiger/test/suite/test_hs08.py
index 9899842d529..9388121cdad 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs08.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs08.py
@@ -38,6 +38,11 @@ def timestamp_str(t):
class test_hs08(wttest.WiredTigerTestCase):
conn_config = 'cache_size=100MB,statistics=(all)'
session_config = 'isolation=snapshot'
+ key_format_values = [
+ ('column', dict(key_format='r')),
+ ('integer', dict(key_format='i')),
+ ]
+ scenarios = make_scenarios(key_format_values)
def get_stat(self, stat):
stat_cursor = self.session.open_cursor('statistics:')
@@ -47,7 +52,7 @@ class test_hs08(wttest.WiredTigerTestCase):
def test_modify_insert_to_hs(self):
uri = "table:test_hs08"
- create_params = 'value_format=S,key_format=i'
+ create_params = 'value_format=S,key_format={}'.format(self.key_format)
value1 = 'a' * 1000
self.session.create(uri, create_params)
diff --git a/src/third_party/wiredtiger/test/suite/test_hs09.py b/src/third_party/wiredtiger/test/suite/test_hs09.py
index ac34e3f7b17..4bed2791808 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs09.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs09.py
@@ -38,12 +38,11 @@ def timestamp_str(t):
# second newest committed version to history store.
class test_hs09(wttest.WiredTigerTestCase):
# Force a small cache.
- conn_config = 'cache_size=50MB,statistics=(fast)'
+ conn_config = 'cache_size=20MB,statistics=(fast)'
session_config = 'isolation=snapshot'
uri = "table:test_hs09"
key_format_values = [
- # The commented columnar tests needs to be enabled once columnar page instantiated is fixed in (WT-6061).
- #('column', dict(key_format='r')),
+ ('column', dict(key_format='r')),
('integer', dict(key_format='i')),
('string', dict(key_format='S')),
]
@@ -54,20 +53,31 @@ class test_hs09(wttest.WiredTigerTestCase):
return str(i)
return i
- def check_ckpt_hs(self, expected_data_value, expected_hs_value, expected_hs_start_ts, expected_hs_stop_ts):
+ def check_ckpt_hs(self, expected_data_value, expected_hs_value, expected_hs_start_ts,
+ expected_hs_stop_ts, expect_prepared_in_datastore = False):
session = self.conn.open_session(self.session_config)
session.checkpoint()
- # Check the data file value
+ # Check the data file value.
cursor = session.open_cursor(self.uri, None, 'checkpoint=WiredTigerCheckpoint')
+
+ # If we are expecting prepared updates in the datastore, start an explicit transaction with
+ # the ignore_prepare flag to avoid getting a WT_PREPARE_CONFLICT error.
+ if expect_prepared_in_datastore:
+ session.begin_transaction("ignore_prepare=true")
+
for _, value in cursor:
self.assertEqual(value, expected_data_value)
+
+ if expect_prepared_in_datastore:
+ session.rollback_transaction()
+
cursor.close()
- # Check the history store file value
+ # Check the history store file value.
cursor = session.open_cursor("file:WiredTigerHS.wt", None, 'checkpoint=WiredTigerCheckpoint')
for _, _, hs_start_ts, _, hs_stop_ts, _, type, value in cursor:
- # No WT_UPDATE_TOMBSTONE in the history store
+ # No WT_UPDATE_TOMBSTONE in the history store.
self.assertNotEqual(type, 5)
- # No WT_UPDATE_BIRTHMARK in the history store
+ # No WT_UPDATE_BIRTHMARK in the history store.
self.assertNotEqual(type, 1)
# WT_UPDATE_STANDARD
if (type == 4):
@@ -100,7 +110,7 @@ class test_hs09(wttest.WiredTigerTestCase):
cursor[self.create_key(i)] = value2
self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
- # Uncommitted changes
+ # Uncommitted changes.
self.session.begin_transaction()
for i in range(1, 11):
cursor[self.create_key(i)] = value3
@@ -108,6 +118,11 @@ class test_hs09(wttest.WiredTigerTestCase):
self.check_ckpt_hs(value2, value1, 2, 3)
def test_prepared_updates_not_written_to_hs(self):
+ # Prepare reads are currently not supported with the columnar store.
+ # Remove this once prepare reads are supported in WT-6061.
+ if self.key_format == 'r':
+ return
+
# Create a small table.
create_params = 'key_format={},value_format=S'.format(self.key_format)
self.session.create(self.uri, create_params)
@@ -130,13 +145,15 @@ class test_hs09(wttest.WiredTigerTestCase):
cursor[self.create_key(i)] = value2
self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
- # Prepare some updates
+ # Prepare some updates.
self.session.begin_transaction()
for i in range(1, 11):
cursor[self.create_key(i)] = value3
self.session.prepare_transaction('prepare_timestamp=' + timestamp_str(4))
- self.check_ckpt_hs(value2, value1, 2, 3)
+ # We can expect prepared values to show up in the data store if eviction runs between now
+ # and the time when we open a cursor on the user table.
+ self.check_ckpt_hs(value2, value1, 2, 3, True)
self.session.commit_transaction('commit_timestamp=' + timestamp_str(5) +
',durable_timestamp=' + timestamp_str(5))
diff --git a/src/third_party/wiredtiger/test/suite/test_hs10.py b/src/third_party/wiredtiger/test/suite/test_hs10.py
index f41f18bb999..ef92195daf9 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs10.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs10.py
@@ -38,6 +38,11 @@ def timestamp_str(t):
class test_hs10(wttest.WiredTigerTestCase):
conn_config = 'cache_size=2MB,statistics=(all),eviction=(threads_max=1)'
session_config = 'isolation=snapshot'
+ key_format_values = (
+ ('column', dict(key_format='r')),
+ ('int', dict(key_format='i'))
+ )
+ scenarios = make_scenarios(key_format_values)
def get_stat(self, stat):
stat_cursor = self.session.open_cursor('statistics:')
@@ -48,7 +53,7 @@ class test_hs10(wttest.WiredTigerTestCase):
def test_modify_insert_to_hs(self):
uri = "table:test_hs10"
uri2 = "table:test_hs10_otherdata"
- create_params = 'value_format=S,key_format=i'
+ create_params = 'value_format=S,key_format={}'.format(self.key_format)
value1 = 'a' * 1000
value2 = 'b' * 1000
self.session.create(uri, create_params)
diff --git a/src/third_party/wiredtiger/test/suite/test_hs12.py b/src/third_party/wiredtiger/test/suite/test_hs12.py
index ed332dc3349..a4e9199323c 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs12.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs12.py
@@ -27,6 +27,7 @@
# OTHER DEALINGS IN THE SOFTWARE.
import wiredtiger, wttest, time
+from wtscenario import make_scenarios
def timestamp_str(t):
return '%x' % t
@@ -36,10 +37,17 @@ def timestamp_str(t):
class test_hs12(wttest.WiredTigerTestCase):
conn_config = 'cache_size=2MB,statistics=(all),eviction=(threads_max=1)'
session_config = 'isolation=snapshot'
+ key_format_values = [
+ # The commented-out columnar test needs to be enabled once the columnar
+ # modify type update is fixed (WT-5550).
+ # ('column', dict(key_format='r')),
+ ('integer', dict(key_format='i')),
+ ]
+ scenarios = make_scenarios(key_format_values)
def test_modify_append_to_string(self):
uri = "table:test_reverse_modify01_notimestamp"
- create_params = 'value_format=S,key_format=i'
+ create_params = 'value_format=S,key_format={}'.format(self.key_format)
value1 = 'abcedfghijklmnopqrstuvwxyz' * 5
value2 = 'b' * 100
valuebig = 'e' * 1000
diff --git a/src/third_party/wiredtiger/test/suite/test_hs13.py b/src/third_party/wiredtiger/test/suite/test_hs13.py
index 7ada5d7a0b6..27c6c6d4ade 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs13.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs13.py
@@ -27,6 +27,8 @@
# OTHER DEALINGS IN THE SOFTWARE.
import wiredtiger, wttest
+from wtscenario import make_scenarios
+
def timestamp_str(t):
return '%x' % t
@@ -35,10 +37,16 @@ def timestamp_str(t):
class test_hs13(wttest.WiredTigerTestCase):
conn_config = 'cache_size=2MB,statistics=(all),eviction=(threads_max=1)'
session_config = 'isolation=snapshot'
+ key_format_values = [
+ # The commented-out columnar test needs to be enabled once the columnar modify type update is fixed (WT-5550).
+ # ('column', dict(key_format='r')),
+ ('integer', dict(key_format='i'))
+ ]
+ scenarios = make_scenarios(key_format_values)
def test_reverse_modifies_constructed_after_eviction(self):
uri = "table:test_hs13"
- create_params = 'value_format=S,key_format=i'
+ create_params = 'value_format=S,key_format={}'.format(self.key_format)
value1 = 'a' * 10000
value2 = 'b' * 10000
value3 = 'e' * 10000
diff --git a/src/third_party/wiredtiger/test/suite/test_hs14.py b/src/third_party/wiredtiger/test/suite/test_hs14.py
index ebd5f471f2b..4e8f5148da1 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs14.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs14.py
@@ -27,6 +27,7 @@
# OTHER DEALINGS IN THE SOFTWARE.
import time, wiredtiger, wttest
+from wtscenario import make_scenarios
def timestamp_str(t):
return '%x' % t
@@ -37,10 +38,20 @@ def timestamp_str(t):
class test_hs14(wttest.WiredTigerTestCase):
conn_config = 'cache_size=50MB'
session_config = 'isolation=snapshot'
+ key_format_values = [
+ ('column', dict(key_format='r')),
+ ('string', dict(key_format='S'))
+ ]
+ scenarios = make_scenarios(key_format_values)
+
+ def create_key(self, i):
+ if self.key_format == 'S':
+ return str(i)
+ return i
def test_hs14(self):
uri = 'table:test_hs14'
- self.session.create(uri, 'key_format=S,value_format=S')
+ self.session.create(uri, 'key_format={},value_format=S'.format(self.key_format))
self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1))
cursor = self.session.open_cursor(uri)
@@ -52,22 +63,22 @@ class test_hs14(wttest.WiredTigerTestCase):
for i in range(1, 10000):
self.session.begin_transaction()
- cursor[str(i)] = value1
+ cursor[self.create_key(i)] = value1
self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
self.session.begin_transaction()
- cursor[str(i)] = value2
+ cursor[self.create_key(i)] = value2
self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
self.session.begin_transaction()
- cursor[str(i)] = value3
+ cursor[self.create_key(i)] = value3
self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
self.session.begin_transaction()
- cursor[str(i)] = value4
+ cursor[self.create_key(i)] = value4
self.session.commit_transaction('commit_timestamp=' + timestamp_str(4))
start = time.time()
self.session.begin_transaction('read_timestamp=' + timestamp_str(3))
for i in range(1, 10000):
- self.assertEqual(cursor[str(i)], value3)
+ self.assertEqual(cursor[self.create_key(i)], value3)
self.session.rollback_transaction()
end = time.time()
@@ -76,17 +87,17 @@ class test_hs14(wttest.WiredTigerTestCase):
for i in range(1, 10000):
self.session.begin_transaction()
- cursor.set_key(str(i))
+ cursor.set_key(self.create_key(i))
cursor.remove()
self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))
self.session.begin_transaction()
- cursor[str(i)] = value5
+ cursor[self.create_key(i)] = value5
self.session.commit_transaction('commit_timestamp=' + timestamp_str(10))
start = time.time()
self.session.begin_transaction('read_timestamp=' + timestamp_str(9))
for i in range(1, 10000):
- cursor.set_key(str(i))
+ cursor.set_key(self.create_key(i))
self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND)
self.session.rollback_transaction()
end = time.time()
diff --git a/src/third_party/wiredtiger/test/suite/test_hs20.py b/src/third_party/wiredtiger/test/suite/test_hs20.py
new file mode 100644
index 00000000000..2a9f7d02c93
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_hs20.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import time, wiredtiger, wttest
+
+# test_hs20.py
+# Ensure we never reconstruct a reverse modify update in the history store based on the onpage overflow value
+def timestamp_str(t):
+ return '%x' % t
+
+class test_hs20(wttest.WiredTigerTestCase):
+ conn_config = 'cache_size=50MB,eviction=(threads_max=1)'
+ session_config = 'isolation=snapshot'
+
+ def test_hs20(self):
+ uri = 'table:test_hs20'
+ # Set a very small maximum leaf value to trigger writing overflow values
+ self.session.create(uri, 'key_format=S,value_format=S,leaf_value_max=10B')
+ cursor = self.session.open_cursor(uri)
+ self.conn.set_timestamp(
+ 'oldest_timestamp=' + timestamp_str(1) + ',stable_timestamp=' + timestamp_str(1))
+
+ value1 = 'a' * 500
+ value2 = 'b' * 50
+
+ # Insert a value that is larger than the maximum leaf value.
+ for i in range(0, 10):
+ self.session.begin_transaction()
+ cursor[str(i)] = value1
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(2))
+
+ # Do 2 modifies.
+ for i in range(0, 10):
+ self.session.begin_transaction()
+ cursor.set_key(str(i))
+ mods = [wiredtiger.Modify('B', 500, 1)]
+ self.assertEqual(cursor.modify(mods), 0)
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))
+
+ for i in range(0, 10):
+ self.session.begin_transaction()
+ cursor.set_key(str(i))
+ mods = [wiredtiger.Modify('C', 501, 1)]
+ self.assertEqual(cursor.modify(mods), 0)
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(4))
+
+ # Insert more data to trigger eviction.
+ for i in range(10, 100000):
+ self.session.begin_transaction()
+ cursor[str(i)] = value2
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))
+
+ # Update the overflow values.
+ for i in range(0, 10):
+ self.session.begin_transaction()
+ cursor[str(i)] = value2
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))
+
+ # Do a checkpoint to move the overflow values to the history store but keep the current in-memory disk image.
+ self.session.checkpoint()
+
+ # Search the first modifies.
+ for i in range(0, 10):
+ self.session.begin_transaction('read_timestamp=' + timestamp_str(3))
+ self.assertEqual(cursor[str(i)], value1 + "B")
+ self.session.rollback_transaction()
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable14.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable14.py
new file mode 100755
index 00000000000..ea88a33a066
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable14.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, shutil, threading, time
+from helper import copy_wiredtiger_home
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+from wiredtiger import stat, wiredtiger_strerror, WiredTigerError, WT_ROLLBACK
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+from wtthread import checkpoint_thread, op_thread
+from time import sleep
+
+def timestamp_str(t):
+ return '%x' % t
+
+def mod_val(value, char, location, nbytes=1):
+ return value[0:location] + char + value[location+nbytes:]
+
+def retry_rollback(self, name, txn_session, code):
+ retry_limit = 100
+ retries = 0
+ completed = False
+ saved_exception = None
+ while not completed and retries < retry_limit:
+ if retries != 0:
+ self.pr("Retrying operation for " + name)
+ if txn_session:
+ txn_session.rollback_transaction()
+ sleep(0.1)
+ if txn_session:
+ txn_session.begin_transaction('isolation=snapshot')
+ self.pr("Began new transaction for " + name)
+ try:
+ code()
+ completed = True
+ except WiredTigerError as e:
+ rollback_str = wiredtiger_strerror(WT_ROLLBACK)
+ if rollback_str not in str(e):
+ raise(e)
+ retries += 1
+ saved_exception = e
+ if not completed and saved_exception:
+ raise(saved_exception)
+
+# test_rollback_to_stable14.py
+# Test that the rollback to stable operation uses the proper base update while restoring modifies from the history store.
+class test_rollback_to_stable14(test_rollback_to_stable_base):
+ session_config = 'isolation=snapshot'
+
+ prepare_values = [
+ ('no_prepare', dict(prepare=False)),
+ ('prepare', dict(prepare=True))
+ ]
+
+ scenarios = make_scenarios(prepare_values)
+
+ def conn_config(self):
+ config = 'cache_size=8MB,statistics=(all),statistics_log=(json,on_close,wait=1),log=(enabled=true),timing_stress_for_test=[history_store_checkpoint_delay]'
+ return config
+
+ def simulate_crash_restart(self, olddir, newdir):
+ ''' Simulate a crash from olddir and restart in newdir. '''
+ # with the connection still open, copy files to new directory
+ shutil.rmtree(newdir, ignore_errors=True)
+ os.mkdir(newdir)
+ for fname in os.listdir(olddir):
+ fullname = os.path.join(olddir, fname)
+ # Skip lock file on Windows since it is locked
+ if os.path.isfile(fullname) and \
+ "WiredTiger.lock" not in fullname and \
+ "Tmplog" not in fullname and \
+ "Preplog" not in fullname:
+ shutil.copy(fullname, newdir)
+ #
+ # close the original connection and open to new directory
+ # NOTE: This really cannot test the difference between the
+ # write-no-sync (off) version of log_flush and the sync
+ # version since we're not crashing the system itself.
+ #
+ self.close_conn()
+ self.conn = self.setUpConnectionOpen(newdir)
+ self.session = self.setUpSessionOpen(self.conn)
+
+ def test_rollback_to_stable(self):
+ nrows = 1500
+
+ # Create a table without logging.
+ self.pr("create/populate table")
+ uri = "table:rollback_to_stable14"
+ ds = SimpleDataSet(
+ self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)')
+ ds.populate()
+
+ # Pin oldest and stable to timestamp 10.
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) +
+ ',stable_timestamp=' + timestamp_str(10))
+
+ value_a = "aaaaa" * 100
+
+ value_modQ = mod_val(value_a, 'Q', 0)
+ value_modR = mod_val(value_modQ, 'R', 1)
+ value_modS = mod_val(value_modR, 'S', 2)
+ value_modT = mod_val(value_modS, 'T', 3)
+
+ # Perform a combination of modifies and updates.
+ self.pr("large updates and modifies")
+ self.large_updates(uri, value_a, ds, nrows, 20)
+ self.large_modifies(uri, 'Q', ds, 0, 1, nrows, 30)
+ self.large_modifies(uri, 'R', ds, 1, 1, nrows, 40)
+ self.large_modifies(uri, 'S', ds, 2, 1, nrows, 50)
+ self.large_modifies(uri, 'T', ds, 3, 1, nrows, 60)
+
+ # Verify data is visible and correct.
+ self.check(value_a, uri, nrows, 20)
+ self.check(value_modQ, uri, nrows, 30)
+ self.check(value_modR, uri, nrows, 40)
+ self.check(value_modS, uri, nrows, 50)
+ self.check(value_modT, uri, nrows, 60)
+
+ # Pin stable to timestamp 60 if prepare is set, otherwise 50.
+ if self.prepare:
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(60))
+ else:
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(50))
+
+ # Create a checkpoint thread
+ done = threading.Event()
+ ckpt = checkpoint_thread(self.conn, done)
+ try:
+ self.pr("start checkpoint")
+ ckpt.start()
+
+ # Perform several modifies in parallel with checkpoint.
+ # Rollbacks may occur when checkpoint is running, so retry as needed.
+ self.pr("modifies")
+ retry_rollback(self, 'modify ds1, W', None,
+ lambda: self.large_modifies(uri, 'W', ds, 4, 1, nrows, 70))
+ retry_rollback(self, 'modify ds1, X', None,
+ lambda: self.large_modifies(uri, 'X', ds, 5, 1, nrows, 80))
+ retry_rollback(self, 'modify ds1, Y', None,
+ lambda: self.large_modifies(uri, 'Y', ds, 6, 1, nrows, 90))
+ retry_rollback(self, 'modify ds1, Z', None,
+ lambda: self.large_modifies(uri, 'Z', ds, 7, 1, nrows, 100))
+ finally:
+ done.set()
+ ckpt.join()
+
+ # Simulate a server crash and restart.
+ self.pr("restart")
+ self.simulate_crash_restart(".", "RESTART")
+ self.pr("restart complete")
+
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ calls = stat_cursor[stat.conn.txn_rts][2]
+ hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2]
+ hs_sweep = stat_cursor[stat.conn.txn_rts_sweep_hs_keys][2]
+ hs_restore_updates = stat_cursor[stat.conn.txn_rts_hs_restore_updates][2]
+ keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+ keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2]
+ pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2]
+ upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2]
+ stat_cursor.close()
+
+ self.assertEqual(calls, 0)
+ self.assertEqual(keys_removed, 0)
+ self.assertEqual(hs_restore_updates, nrows)
+ self.assertEqual(keys_restored, 0)
+ self.assertEqual(upd_aborted, 0)
+ self.assertGreater(pages_visited, 0)
+ self.assertGreaterEqual(hs_removed, nrows)
+ self.assertGreaterEqual(hs_sweep, 0)
+
+ # Check that the correct data is seen at and after the stable timestamp.
+ self.check(value_a, uri, nrows, 20)
+ self.check(value_modQ, uri, nrows, 30)
+ self.check(value_modR, uri, nrows, 40)
+ self.check(value_modS, uri, nrows, 50)
+
+ # The test may output the following message in eviction under cache pressure. Ignore that.
+ self.ignoreStdoutPatternIfExists("oldest pinned transaction ID rolled back for eviction")
+
+ def test_rollback_to_stable_same_ts(self):
+ nrows = 1500
+ # Scenario: modifies R, S and T all use timestamp 60 without prepare (51/55/60 with prepare), then crash and restart with stable at 50.
+ # Create a table without logging.
+ self.pr("create/populate table")
+ uri = "table:rollback_to_stable14"
+ ds = SimpleDataSet(
+ self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)')
+ ds.populate()
+
+ # Pin oldest and stable to timestamp 10.
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) +
+ ',stable_timestamp=' + timestamp_str(10))
+
+ value_a = "aaaaa" * 100
+
+ value_modQ = mod_val(value_a, 'Q', 0)
+ value_modR = mod_val(value_modQ, 'R', 1)
+ value_modS = mod_val(value_modR, 'S', 2)
+ value_modT = mod_val(value_modS, 'T', 3)
+
+ # Perform a combination of modifies and updates.
+ self.pr("large updates and modifies")
+ self.large_updates(uri, value_a, ds, nrows, 20)
+ self.large_modifies(uri, 'Q', ds, 0, 1, nrows, 30)
+ # Prepared transactions cannot always reuse the same timestamp, so use distinct timestamps (51, 55, 60); like 60 in the other branch, all are newer than the stable timestamp of 50 set below.
+ if self.prepare:
+ self.large_modifies(uri, 'R', ds, 1, 1, nrows, 51)
+ self.large_modifies(uri, 'S', ds, 2, 1, nrows, 55)
+ self.large_modifies(uri, 'T', ds, 3, 1, nrows, 60)
+ else:
+ self.large_modifies(uri, 'R', ds, 1, 1, nrows, 60)
+ self.large_modifies(uri, 'S', ds, 2, 1, nrows, 60)
+ self.large_modifies(uri, 'T', ds, 3, 1, nrows, 60)
+
+ # Verify data is visible and correct.
+ self.check(value_a, uri, nrows, 20)
+ self.check(value_modQ, uri, nrows, 30)
+ self.check(value_modT, uri, nrows, 60)
+ # Move stable to 50: later than modify Q (30) but earlier than modifies R, S and T (51-60).
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(50))
+
+ # Create a checkpoint thread; it is signalled through the 'done' event and joined in the finally clause.
+ done = threading.Event()
+ ckpt = checkpoint_thread(self.conn, done)
+ try:
+ self.pr("start checkpoint")
+ ckpt.start()
+
+ # Perform several modifies in parallel with checkpoint.
+ # Rollbacks may occur when checkpoint is running, so retry as needed.
+ self.pr("modifies")
+ retry_rollback(self, 'modify ds1, W', None,
+ lambda: self.large_modifies(uri, 'W', ds, 4, 1, nrows, 70))
+ retry_rollback(self, 'modify ds1, X', None,
+ lambda: self.large_modifies(uri, 'X', ds, 5, 1, nrows, 80))
+ retry_rollback(self, 'modify ds1, Y', None,
+ lambda: self.large_modifies(uri, 'Y', ds, 6, 1, nrows, 90))
+ retry_rollback(self, 'modify ds1, Z', None,
+ lambda: self.large_modifies(uri, 'Z', ds, 7, 1, nrows, 100))
+ finally:
+ done.set()
+ ckpt.join()
+
+ # Simulate a server crash and restart.
+ self.pr("restart")
+ self.simulate_crash_restart(".", "RESTART")
+ self.pr("restart complete")
+
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ calls = stat_cursor[stat.conn.txn_rts][2]
+ hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2]
+ hs_restore_updates = stat_cursor[stat.conn.txn_rts_hs_restore_updates][2]
+ hs_sweep = stat_cursor[stat.conn.txn_rts_sweep_hs_keys][2]
+ keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+ keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2]
+ pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2]
+ upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2]
+ stat_cursor.close()
+ # Validate the txn_rts* statistics gathered above (element [2] of each entry is presumably the value; confirm).
+ self.assertEqual(calls, 0)
+ self.assertEqual(keys_removed, 0)
+ self.assertEqual(hs_restore_updates, nrows)
+ self.assertEqual(keys_restored, 0)
+ self.assertEqual(upd_aborted, 0)
+ self.assertGreater(pages_visited, 0)
+ self.assertGreaterEqual(hs_removed, nrows * 3)
+ self.assertGreaterEqual(hs_sweep, 0)
+
+ # Check that the data written at timestamps 20 and 30, both earlier than the stable timestamp of 50, is still seen.
+ self.check(value_a, uri, nrows, 20)
+ self.check(value_modQ, uri, nrows, 30)
+
+ # The test may output the following message in eviction under cache pressure. Ignore that.
+ self.ignoreStdoutPatternIfExists("oldest pinned transaction ID rolled back for eviction")
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered01.py b/src/third_party/wiredtiger/test/suite/test_tiered01.py
index 2a41c3ff7ef..9a7066fd708 100644
--- a/src/third_party/wiredtiger/test/suite/test_tiered01.py
+++ b/src/third_party/wiredtiger/test/suite/test_tiered01.py
@@ -71,5 +71,12 @@ class test_tiered01(wttest.WiredTigerTestCase):
# self.session.drop(self.uri)
+ # Creating a tiered table with an empty tier list (tiers=()) must raise WiredTigerError matching the "at least one tier" message.
+ def test_no_tiers(self):
+ msg = '/tiered table must specify at least one tier/'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.create(self.uri, 'type=tiered,key_format=S,tiered=(tiers=())'),
+ msg)
+
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered02.py b/src/third_party/wiredtiger/test/suite/test_tiered02.py
new file mode 100644
index 00000000000..17eb3073c39
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_tiered02.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2021 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wtscenario, wttest
+from wtdataset import SimpleDataSet
+
+# test_tiered02.py
+# Test block-log-structured tree configuration options.
+class test_tiered02(wttest.WiredTigerTestCase):
+ K = 1024
+ M = 1024 * K
+ G = 1024 * M
+ uri = "file:test_tiered02"
+
+ # Scenarios set nrecs to 10 or 10000 (the [0.9, 0.1] list presumably weights them). NOTE(review): nrecs is only used in the verbose message below.
+ record_count_scenarios = wtscenario.quick_scenarios(
+ 'nrecs', [10, 10000], [0.9, 0.1])
+
+ scenarios = wtscenario.make_scenarios(record_count_scenarios, prune=100, prunelong=500)
+
+ # Populate a log-structured file via SimpleDataSet(..., 10), checkpoint, populate again with 10000, then reopen and populate with 1000.
+ def test_tiered(self):
+ args = 'key_format=S,block_allocation=log-structured'
+ self.verbose(3,
+ 'Test log-structured allocation with config: ' + args + ' count: ' + str(self.nrecs))
+ # Disabled nrecs-driven variant: ds = SimpleDataSet(self, self.uri, self.nrecs, config=args)
+ ds = SimpleDataSet(self, self.uri, 10, config=args)
+ ds.populate()
+ self.session.checkpoint()
+ ds = SimpleDataSet(self, self.uri, 10000, config=args)
+ ds.populate()
+ # Reopen the connection and populate the same URI once more, this time with a 1000-sized dataset.
+ self.reopen_conn()
+ ds = SimpleDataSet(self, self.uri, 1000, config=args)
+ ds.populate()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered03.py b/src/third_party/wiredtiger/test/suite/test_tiered03.py
new file mode 100644
index 00000000000..624387c21a3
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_tiered03.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2021 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os, re
+import wiredtiger, wtscenario, wttest
+from wtdataset import SimpleDataSet
+
+# test_tiered03.py
+# Test block-log-structured tree configuration options.
+class test_tiered03(wttest.WiredTigerTestCase):
+ K = 1024
+ M = 1024 * K
+ G = 1024 * M
+ uri = 'file:test_tiered03'
+
+ # Scenarios set nrecs to 10 or 10000 (the [0.9, 0.1] list presumably weights them). NOTE(review): nrecs is only used in the verbose message below.
+ record_count_scenarios = wtscenario.quick_scenarios(
+ 'nrecs', [10, 10000], [0.9, 0.1])
+
+ scenarios = wtscenario.make_scenarios(record_count_scenarios, prune=100, prunelong=500)
+
+ # Test sharing data between a primary and a secondary
+ def test_sharing(self):
+ args = 'block_allocation=log-structured'
+ self.verbose(3,
+ 'Test log-structured allocation with config: ' + args + ' count: ' + str(self.nrecs))
+ ds = SimpleDataSet(self, self.uri, 10, config=args)
+ ds.populate()
+ ds.check()
+ self.session.checkpoint()
+ ds.check()
+
+ # Create a secondary database in a SECONDARY directory under the test home, with its own connection and session.
+ dir2 = os.path.join(self.home, 'SECONDARY')
+ os.mkdir(dir2)
+ conn2 = self.setUpConnectionOpen(dir2)
+ session2 = conn2.open_session()
+
+ # Reference the tree from the secondary: copy the primary's metadata entry to 'file:../test_tiered03' with ",readonly=1" appended.
+ metac = self.session.open_cursor('metadata:')
+ metac2 = session2.open_cursor('metadata:', None, 'readonly=0')
+ uri2 = self.uri[:5] + '../' + self.uri[5:]
+ metac2[uri2] = metac[self.uri] + ",readonly=1"
+ # Check the first dataset through a cursor opened on the secondary.
+ cursor2 = session2.open_cursor(uri2)
+ ds.check_cursor(cursor2)
+ cursor2.close()
+ # Populate a larger dataset (10000) on the primary and checkpoint it.
+ newds = SimpleDataSet(self, self.uri, 10000, config=args)
+ newds.populate()
+ newds.check()
+ self.session.checkpoint()
+ newds.check()
+
+ # Check we can still read from the last checkpoint
+ cursor2 = session2.open_cursor(uri2)
+ ds.check_cursor(cursor2)
+ cursor2.close()
+
+ # Bump to new checkpoint: pull the checkpoint=(...) clause out of the primary's metadata (dropping the leading comma) and alter the secondary with it.
+ origmeta = metac[self.uri]
+ checkpoint = re.search(r',checkpoint=\(.+?\)\)', origmeta).group(0)[1:]
+ self.pr('Orig checkpoint: ' + checkpoint)
+ session2.alter(uri2, checkpoint)
+ self.pr('New metadata on secondaery: ' + metac2[uri2])
+
+ # Check that we can see the new data
+ cursor2 = session2.open_cursor(uri2)
+ newds.check_cursor(cursor2)
+
+if __name__ == '__main__':
+ wttest.run()