summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2021-01-27 16:56:06 +1100
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-01-27 06:35:56 +0000
commit: e00fae3a427a480ea7090385392653a9fa6cd1d7 (patch)
tree: 22d2dcb6d50a45a24f4db747d3fc4ddfa3106551
parent: 657fd55617da405757b94bc9973df40394a18e5b (diff)
download: mongo-e00fae3a427a480ea7090385392653a9fa6cd1d7.tar.gz
Import wiredtiger: 462a8434b56a1274b2d8cf8dd91240021df294c7 from branch mongodb-5.0
ref: e39ffb5541..462a8434b5
for: 4.9.0

WT-6673 Rollback to stable to fix the inconsistent checkpoint by removing updates outside of the checkpoint snapshot
WT-7121 Include log-structured allocation python tests in WT
WT-7126 Coverity analysis defect 116991: Explicit null dereferenced
WT-7127 Coverity analysis defect 116992: Unchecked return value
WT-7128 Coverity analysis defect 116993: Resource leak
WT-7131 Tiered cursors should return error if configured with zero tiers

Reverted ticket(s):
WT-7091 Restrict usage of LSM to only operate in conjunction with compatible incremental backup mechanism
-rw-r--r-- src/third_party/wiredtiger/dist/stat_data.py 1
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_dhandle.c 31
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_backup.c 5
-rw-r--r-- src/third_party/wiredtiger/src/docs/backup.dox 3
-rw-r--r-- src/third_party/wiredtiger/src/include/cell_inline.h 10
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h 1
-rw-r--r-- src/third_party/wiredtiger/src/include/stat.h 2
-rw-r--r-- src/third_party/wiredtiger/src/include/wiredtiger.in 28
-rw-r--r-- src/third_party/wiredtiger/src/meta/meta_ckpt.c 14
-rw-r--r-- src/third_party/wiredtiger/src/os_posix/os_fs.c 10
-rw-r--r-- src/third_party/wiredtiger/src/support/stat.c 7
-rw-r--r-- src/third_party/wiredtiger/src/tiered/tiered_cursor.c 18
-rw-r--r-- src/third_party/wiredtiger/src/tiered/tiered_schema.c 26
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_recover.c 125
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c 206
-rw-r--r-- src/third_party/wiredtiger/src/utilities/util_list.c 6
-rw-r--r-- src/third_party/wiredtiger/test/format/config.c 22
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py 147
-rwxr-xr-x src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py 3
-rwxr-xr-x src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py 2
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_tiered01.py 7
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_tiered02.py 63
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_tiered03.py 97
24 files changed, 678 insertions, 158 deletions
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 997337c7896..9e745e53163 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -860,6 +860,7 @@ conn_dsrc_stats = [
TxnStat('txn_rts_hs_removed', 'rollback to stable updates removed from history store'),
TxnStat('txn_rts_hs_restore_tombstones', 'rollback to stable restored tombstones from history store'),
TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable hs records with stop timestamps older than newer records'),
+ TxnStat('txn_rts_inconsistent_ckpt', 'rollback to stable inconsistent checkpoint'),
TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'),
TxnStat('txn_rts_keys_restored', 'rollback to stable keys restored'),
TxnStat('txn_rts_sweep_hs_keys', 'rollback to stable sweeping history store keys'),
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index a0113950ef4..9a018da1f30 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.0",
- "commit": "e39ffb554160de902060cd063c4b1547ff6d5e1e"
+ "commit": "462a8434b56a1274b2d8cf8dd91240021df294c7"
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index b9302ccb602..7d8e4bdb8db 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -917,6 +917,37 @@ restart:
}
/*
+ * __wt_dhandle_update_write_gens --
+ * Update the open dhandles write generation, run write generation and base write generation
+ * number.
+ */
+void
+__wt_dhandle_update_write_gens(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+
+ conn = S2C(session);
+
+ for (dhandle = NULL;;) {
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session, WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q));
+ if (dhandle == NULL)
+ break;
+ btree = (WT_BTREE *)dhandle->handle;
+
+ WT_ASSERT(session, btree != NULL);
+
+ /*
+ * Initialize the btree write generation numbers after rollback to stable so that the
+ * transaction ids of the pages will be reset when loaded from disk to memory.
+ */
+ btree->write_gen = btree->base_write_gen = btree->run_write_gen =
+ WT_MAX(btree->write_gen, conn->base_write_gen);
+ }
+}
+
+/*
* __wt_verbose_dump_handles --
* Dump information about all data handles.
*/
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
index f576a3ffa12..4cb1ae86f0f 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_backup.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -611,11 +611,6 @@ __backup_config(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[
session, EINVAL, "Incremental primary cursor must have a known source identifier");
F_SET(cb, WT_CURBACKUP_INCR);
}
-
- /* Return an error if block-based incremental backup is performed with open LSM trees. */
- if (incremental_config && !TAILQ_EMPTY(&conn->lsmqh))
- WT_ERR_MSG(session, ENOTSUP, "LSM does not work with block-based incremental backup");
-
err:
if (ret != 0 && cb->incr_src != NULL) {
F_CLR(cb->incr_src, WT_BLKINCR_INUSE);
diff --git a/src/third_party/wiredtiger/src/docs/backup.dox b/src/third_party/wiredtiger/src/docs/backup.dox
index 265253d8959..82979f16423 100644
--- a/src/third_party/wiredtiger/src/docs/backup.dox
+++ b/src/third_party/wiredtiger/src/docs/backup.dox
@@ -171,9 +171,6 @@ database directory has not been opened and recovery run. Once recovery
has run in a backup directory, you can no longer back up to that
database directory.
-Block-based incremental backup does not work with LSM trees. An error
-will be returned in that case.
-
An example of opening the backup data source for block-based incremental backup:
@snippet ex_all.c incremental block backup
diff --git a/src/third_party/wiredtiger/src/include/cell_inline.h b/src/third_party/wiredtiger/src/include/cell_inline.h
index 8a81adcd896..54a8790ac58 100644
--- a/src/third_party/wiredtiger/src/include/cell_inline.h
+++ b/src/third_party/wiredtiger/src/include/cell_inline.h
@@ -970,6 +970,16 @@ __cell_unpack_window_cleanup(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk
* No delete txnid=MAX, ts=MAX, txnid=MAX, ts=MAX,
* durable_ts=NONE durable_ts=NONE
*/
+
+ /*
+ * Don't reset the transaction ids in rollback to stable when called from recovery because
+ * rollback to stable in addition to stable timestamp also depends on transaction ids from the
+ * page that are read into cache to decide if an update needs to be rolled back.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING) &&
+ F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))
+ return;
+
if (dsk->write_gen == 0 || dsk->write_gen > S2BT(session)->base_write_gen)
return;
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 68e636cdebf..6f2ced383f4 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1675,6 +1675,7 @@ extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session);
extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst);
extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...);
extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...);
+extern void __wt_dhandle_update_write_gens(WT_SESSION_IMPL *session);
extern void __wt_encrypt_size(
WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep);
extern void __wt_err_func(
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index e1c6cea488a..3562800fcd0 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -780,6 +780,7 @@ struct __wt_connection_stats {
int64_t rec_time_window_stop_txn;
int64_t txn_read_race_prepare_update;
int64_t txn_rts_hs_stop_older_than_newer_start;
+ int64_t txn_rts_inconsistent_ckpt;
int64_t txn_rts_keys_removed;
int64_t txn_rts_keys_restored;
int64_t txn_rts_hs_restore_tombstones;
@@ -993,6 +994,7 @@ struct __wt_dsrc_stats {
int64_t rec_time_window_stop_txn;
int64_t txn_read_race_prepare_update;
int64_t txn_rts_hs_stop_older_than_newer_start;
+ int64_t txn_rts_inconsistent_ckpt;
int64_t txn_rts_keys_removed;
int64_t txn_rts_keys_restored;
int64_t txn_rts_hs_restore_tombstones;
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index d59138e1bb0..69559d2c3ac 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -5869,18 +5869,20 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
* than newer records
*/
#define WT_STAT_CONN_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 1466
+/*! transaction: rollback to stable inconsistent checkpoint */
+#define WT_STAT_CONN_TXN_RTS_INCONSISTENT_CKPT 1467
/*! transaction: rollback to stable keys removed */
-#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1467
+#define WT_STAT_CONN_TXN_RTS_KEYS_REMOVED 1468
/*! transaction: rollback to stable keys restored */
-#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1468
+#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1469
/*! transaction: rollback to stable restored tombstones from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1469
+#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1470
/*! transaction: rollback to stable sweeping history store keys */
-#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1470
+#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1471
/*! transaction: rollback to stable updates removed from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1471
+#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1472
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1472
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1473
/*!
* @}
@@ -6478,18 +6480,20 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
* than newer records
*/
#define WT_STAT_DSRC_TXN_RTS_HS_STOP_OLDER_THAN_NEWER_START 2199
+/*! transaction: rollback to stable inconsistent checkpoint */
+#define WT_STAT_DSRC_TXN_RTS_INCONSISTENT_CKPT 2200
/*! transaction: rollback to stable keys removed */
-#define WT_STAT_DSRC_TXN_RTS_KEYS_REMOVED 2200
+#define WT_STAT_DSRC_TXN_RTS_KEYS_REMOVED 2201
/*! transaction: rollback to stable keys restored */
-#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2201
+#define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2202
/*! transaction: rollback to stable restored tombstones from history store */
-#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2202
+#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2203
/*! transaction: rollback to stable sweeping history store keys */
-#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2203
+#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2204
/*! transaction: rollback to stable updates removed from history store */
-#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2204
+#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2205
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2205
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2206
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index 150880625a6..424f0dde8b9 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -1044,19 +1044,19 @@ __wt_meta_sysinfo_set(WT_SESSION_IMPL *session)
}
/* Record snapshot information in metadata for checkpoint. */
- if (txn->snapshot_count > 0) {
- WT_ERR(__wt_buf_fmt(session, buf,
- WT_SYSTEM_CKPT_SNAPSHOT_MIN "=%" PRIu64 "," WT_SYSTEM_CKPT_SNAPSHOT_MAX "=%" PRIu64
- "," WT_SYSTEM_CKPT_SNAPSHOT_COUNT "=%" PRIu32
- "," WT_SYSTEM_CKPT_SNAPSHOT "=[",
- txn->snap_min, txn->snap_max, txn->snapshot_count));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ WT_SYSTEM_CKPT_SNAPSHOT_MIN "=%" PRIu64 "," WT_SYSTEM_CKPT_SNAPSHOT_MAX "=%" PRIu64
+ "," WT_SYSTEM_CKPT_SNAPSHOT_COUNT "=%" PRIu32,
+ txn->snap_min, txn->snap_max, txn->snapshot_count));
+ if (txn->snapshot_count > 0) {
+ WT_ERR(__wt_buf_catfmt(session, buf, "," WT_SYSTEM_CKPT_SNAPSHOT "=["));
for (snap_count = 0; snap_count < txn->snapshot_count - 1; ++snap_count)
WT_ERR(__wt_buf_catfmt(session, buf, "%" PRIu64 "%s", txn->snapshot[snap_count], ","));
WT_ERR(__wt_buf_catfmt(session, buf, "%" PRIu64 "%s", txn->snapshot[snap_count], "]"));
- WT_ERR(__wt_metadata_update(session, WT_SYSTEM_CKPT_SNAPSHOT_URI, buf->data));
}
+ WT_ERR(__wt_metadata_update(session, WT_SYSTEM_CKPT_SNAPSHOT_URI, buf->data));
/* Record the base write gen in metadata as part of checkpoint */
WT_ERR(__wt_buf_fmt(
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c
index ede22518c26..a472273de48 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c
@@ -748,7 +748,10 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const cha
f |= O_CLOEXEC;
#endif
WT_SYSCALL_RETRY(((pfh->fd = open(name, f, 0444)) == -1 ? -1 : 0), ret);
- if (ret != 0)
+ /* Return error if the file not found during rollback to stable. */
+ if (ret != 0 && F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))
+ WT_ERR(__wt_errno());
+ else if (ret != 0)
WT_ERR_MSG(session, ret, "%s: handle-open: open-directory", name);
WT_ERR(__posix_open_file_cloexec(session, pfh->fd, name));
goto directory_open;
@@ -800,7 +803,10 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const cha
/* Create/Open the file. */
WT_SYSCALL_RETRY(((pfh->fd = open(name, f, mode)) == -1 ? -1 : 0), ret);
- if (ret != 0)
+ /* Return error if the file not found during rollback to stable. */
+ if (ret != 0 && F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))
+ WT_ERR(ENOENT);
+ else if (ret != 0)
WT_ERR_MSG(session, ret,
pfh->direct_io ? "%s: handle-open: open: failed with direct I/O configured, some "
"filesystem types do not support direct I/O" :
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 1ee255be706..90de7cc2e94 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -207,6 +207,7 @@ static const char *const __stats_dsrc_desc[] = {
"reconciliation: records written including a stop transaction ID",
"transaction: race to read prepared update retry",
"transaction: rollback to stable hs records with stop timestamps older than newer records",
+ "transaction: rollback to stable inconsistent checkpoint",
"transaction: rollback to stable keys removed",
"transaction: rollback to stable keys restored",
"transaction: rollback to stable restored tombstones from history store",
@@ -453,6 +454,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->rec_time_window_stop_txn = 0;
stats->txn_read_race_prepare_update = 0;
stats->txn_rts_hs_stop_older_than_newer_start = 0;
+ stats->txn_rts_inconsistent_ckpt = 0;
stats->txn_rts_keys_removed = 0;
stats->txn_rts_keys_restored = 0;
stats->txn_rts_hs_restore_tombstones = 0;
@@ -686,6 +688,7 @@ __wt_stat_dsrc_aggregate_single(WT_DSRC_STATS *from, WT_DSRC_STATS *to)
to->rec_time_window_stop_txn += from->rec_time_window_stop_txn;
to->txn_read_race_prepare_update += from->txn_read_race_prepare_update;
to->txn_rts_hs_stop_older_than_newer_start += from->txn_rts_hs_stop_older_than_newer_start;
+ to->txn_rts_inconsistent_ckpt += from->txn_rts_inconsistent_ckpt;
to->txn_rts_keys_removed += from->txn_rts_keys_removed;
to->txn_rts_keys_restored += from->txn_rts_keys_restored;
to->txn_rts_hs_restore_tombstones += from->txn_rts_hs_restore_tombstones;
@@ -925,6 +928,7 @@ __wt_stat_dsrc_aggregate(WT_DSRC_STATS **from, WT_DSRC_STATS *to)
to->txn_read_race_prepare_update += WT_STAT_READ(from, txn_read_race_prepare_update);
to->txn_rts_hs_stop_older_than_newer_start +=
WT_STAT_READ(from, txn_rts_hs_stop_older_than_newer_start);
+ to->txn_rts_inconsistent_ckpt += WT_STAT_READ(from, txn_rts_inconsistent_ckpt);
to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed);
to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored);
to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones);
@@ -1413,6 +1417,7 @@ static const char *const __stats_connection_desc[] = {
"reconciliation: records written including a stop transaction ID",
"transaction: race to read prepared update retry",
"transaction: rollback to stable hs records with stop timestamps older than newer records",
+ "transaction: rollback to stable inconsistent checkpoint",
"transaction: rollback to stable keys removed",
"transaction: rollback to stable keys restored",
"transaction: rollback to stable restored tombstones from history store",
@@ -1926,6 +1931,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->rec_time_window_stop_txn = 0;
stats->txn_read_race_prepare_update = 0;
stats->txn_rts_hs_stop_older_than_newer_start = 0;
+ stats->txn_rts_inconsistent_ckpt = 0;
stats->txn_rts_keys_removed = 0;
stats->txn_rts_keys_restored = 0;
stats->txn_rts_hs_restore_tombstones = 0;
@@ -2450,6 +2456,7 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
to->txn_read_race_prepare_update += WT_STAT_READ(from, txn_read_race_prepare_update);
to->txn_rts_hs_stop_older_than_newer_start +=
WT_STAT_READ(from, txn_rts_hs_stop_older_than_newer_start);
+ to->txn_rts_inconsistent_ckpt += WT_STAT_READ(from, txn_rts_inconsistent_ckpt);
to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed);
to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored);
to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones);
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
index 26c750fb496..1694e57dbc3 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
@@ -34,8 +34,7 @@ __curtiered_open_cursors(WT_CURSOR_TIERED *curtiered)
dhandle = NULL;
tiered = curtiered->tiered;
- if (tiered->ntiers == 0)
- return (0);
+ WT_ASSERT(session, tiered->ntiers > 0);
/*
* If the key is pointing to memory that is pinned by a chunk cursor, take a copy before closing
@@ -1017,21 +1016,14 @@ err:
* documents avoids biasing towards small chunks. Then return the cursor on the chunk we have
* picked.
*/
-static int
+static void
__curtiered_random_chunk(WT_SESSION_IMPL *session, WT_CURSOR_TIERED *curtiered, WT_CURSOR **cursor)
{
- u_int i, ntiers;
-
- /*
- * If the tree is empty we cannot do a random lookup, so return a WT_NOTFOUND.
- */
- if ((ntiers = curtiered->tiered->ntiers) == 0)
- return (WT_NOTFOUND);
+ u_int i;
/* TODO: make randomness respect tree size. */
- i = __wt_random(&session->rnd) % ntiers;
+ i = __wt_random(&session->rnd) % curtiered->tiered->ntiers;
*cursor = curtiered->cursors[i];
- return (0);
}
/*
@@ -1055,7 +1047,7 @@ __curtiered_next_random(WT_CURSOR *cursor)
WT_ERR(__curtiered_enter(curtiered, false));
for (;;) {
- WT_ERR(__curtiered_random_chunk(session, curtiered, &c));
+ __curtiered_random_chunk(session, curtiered, &c);
/*
* This call to next_random on the chunk can potentially end in WT_NOTFOUND if the chunk we
* picked is empty. We want to retry in that case.
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_schema.c b/src/third_party/wiredtiger/src/tiered/tiered_schema.c
index dc153b31e43..6e7dd84c0e3 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_schema.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_schema.c
@@ -15,12 +15,16 @@
int
__wt_tiered_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const char *config)
{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM ckey, cval, tierconf;
WT_DECL_RET;
+ int ntiers;
char *meta_value;
const char *cfg[] = {WT_CONFIG_BASE(session, tiered_meta), config, NULL};
const char *metadata;
metadata = NULL;
+ ntiers = 0;
/* If it can be opened, it already exists. */
if ((ret = __wt_metadata_search(session, uri, &meta_value)) != WT_NOTFOUND) {
@@ -30,12 +34,24 @@ __wt_tiered_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, co
}
WT_RET_NOTFOUND_OK(ret);
+ /* A tiered cursor must specify at least one underlying table */
+ WT_RET(__wt_config_gets(session, cfg, "tiered.tiers", &tierconf));
+ __wt_config_subinit(session, &cparser, &tierconf);
+
+ while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
+ ++ntiers;
+ WT_RET_NOTFOUND_OK(ret);
+
+ if (ntiers == 0)
+ WT_RET_MSG(session, EINVAL, "tiered table must specify at least one tier");
+
if (!F_ISSET(S2C(session), WT_CONN_READONLY)) {
WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata));
WT_ERR(__wt_metadata_insert(session, uri, metadata));
}
err:
+ __wt_free(session, meta_value);
__wt_free(session, metadata);
return (ret);
}
@@ -188,14 +204,14 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[])
/* Point to some items in the copy to save re-parsing. */
WT_RET(__wt_config_gets(session, tiered_cfg, "tiered.tiers", &tierconf));
- /*
- * Count the number of tiers.
- */
+ /* Count the number of tiers. */
__wt_config_subinit(session, &cparser, &tierconf);
while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
++tiered->ntiers;
WT_RET_NOTFOUND_OK(ret);
+ WT_ASSERT(session, tiered->ntiers > 0);
+
WT_RET(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_calloc_def(session, tiered->ntiers, &tiered->tiers));
@@ -204,7 +220,7 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_config_next(&cparser, &ckey, &cval));
WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)ckey.len, ckey.str));
WT_ERR(__wt_session_get_dhandle(session, (const char *)buf->data, NULL, cfg, 0));
- __wt_atomic_addi32(&session->dhandle->session_inuse, 1);
+ (void)__wt_atomic_addi32(&session->dhandle->session_inuse, 1);
/* Load in reverse order (based on LSM logic). */
tiered->tiers[(tiered->ntiers - 1) - i] = session->dhandle;
WT_ERR(__wt_session_release_dhandle(session));
@@ -247,7 +263,7 @@ __wt_tiered_close(WT_SESSION_IMPL *session, WT_TIERED *tiered)
__wt_free(session, tiered->value_format);
if (tiered->tiers != NULL) {
for (i = 0; i < tiered->ntiers; i++)
- __wt_atomic_subi32(&tiered->tiers[i]->session_inuse, 1);
+ (void)__wt_atomic_subi32(&tiered->tiers[i]->session_inuse, 1);
__wt_free(session, tiered->tiers);
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index e81451a96b3..89d8e7528d9 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -506,9 +506,10 @@ __recovery_set_checkpoint_snapshot(WT_SESSION_IMPL *session)
* snapshot max.
*/
WT_ASSERT(session,
- conn->recovery_ckpt_snapshot_count == counter &&
- conn->recovery_ckpt_snapshot[0] == conn->recovery_ckpt_snap_min &&
- conn->recovery_ckpt_snapshot[counter - 1] < conn->recovery_ckpt_snap_max);
+ ((conn->recovery_ckpt_snapshot_count == 0) ||
+ (conn->recovery_ckpt_snapshot_count == counter &&
+ conn->recovery_ckpt_snapshot[0] == conn->recovery_ckpt_snap_min &&
+ conn->recovery_ckpt_snapshot[counter - 1] < conn->recovery_ckpt_snap_max)));
}
err:
@@ -742,12 +743,14 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
char *config;
char ts_string[2][WT_TS_INT_STRING_SIZE];
bool do_checkpoint, eviction_started, hs_exists, needs_rec, was_backup;
+ bool rts_executed, no_log_recovery;
conn = S2C(session);
WT_CLEAR(r);
WT_INIT_LSN(&r.ckpt_lsn);
config = NULL;
do_checkpoint = hs_exists = true;
+ rts_executed = no_log_recovery = false;
eviction_started = false;
was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);
@@ -760,13 +763,14 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
F_SET(conn, WT_CONN_RECOVERING);
WT_ERR(__recovery_set_ckpt_base_write_gen(&r));
- WT_ERR(__recovery_set_checkpoint_snapshot(session));
WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac));
metafile = &r.files[WT_METAFILE_ID];
metafile->c = metac;
+ WT_ERR(__recovery_set_checkpoint_timestamp(&r));
+ WT_ERR(__recovery_set_oldest_timestamp(&r));
/*
* If no log was found (including if logging is disabled), or if the last checkpoint was done
* with logging disabled, recovery should not run. Scan the metadata to figure out the largest
@@ -781,6 +785,8 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
* earlier time.
*/
WT_ERR(__recovery_file_scan(&r));
+ no_log_recovery = true;
+
/*
* The array can be re-allocated in recovery_file_scan. Reset our pointer after scanning all
* the files.
@@ -793,7 +799,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
else
do_checkpoint = false;
WT_ERR(__hs_exists(session, metac, cfg, &hs_exists));
- goto done;
+ goto rollback_to_stable;
}
/*
@@ -867,6 +873,46 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
r.files[0].c = NULL;
WT_ERR(metac->close(metac));
+rollback_to_stable:
+ /*
+ * Perform rollback to stable only when the following conditions met.
+ * 1. The connection is not read-only. A read-only connection expects that there shouldn't be
+ * any changes that need to be done on the database other than reading.
+ * 2. The history store file was found in the metadata.
+ */
+ if (hs_exists && !F_ISSET(conn, WT_CONN_READONLY)) {
+ /* Start the eviction threads for rollback to stable if not already started. */
+ WT_ERR(__wt_evict_create(session));
+ eviction_started = true;
+
+ WT_ERR(__recovery_set_checkpoint_snapshot(session));
+ WT_ASSERT(session,
+ conn->txn_global.has_stable_timestamp == false &&
+ conn->txn_global.stable_timestamp == WT_TS_NONE);
+
+ /*
+ * Set the stable timestamp from recovery timestamp and process the trees for rollback to
+ * stable.
+ */
+ conn->txn_global.stable_timestamp = conn->txn_global.recovery_timestamp;
+ conn->txn_global.has_stable_timestamp = false;
+
+ if (conn->txn_global.recovery_timestamp != WT_TS_NONE)
+ conn->txn_global.has_stable_timestamp = true;
+
+ __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RTS,
+ "performing recovery rollback_to_stable with stable timestamp: %s and oldest timestamp: "
+ "%s",
+ __wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string[0]),
+ __wt_timestamp_to_string(conn->txn_global.oldest_timestamp, ts_string[1]));
+ rts_executed = true;
+ WT_ERR(__wt_rollback_to_stable(session, NULL, true));
+ }
+
+ /* Don't run recovery if no log was found. */
+ if (no_log_recovery)
+ goto done;
+
/*
* Now, recover all the files apart from the metadata. Pass WT_LOGSCAN_RECOVER so that old logs
* get truncated.
@@ -904,10 +950,13 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
/*
* Recovery can touch more data than fits in cache, so it relies on regular eviction to manage
- * paging. Start eviction threads for recovery without history store cursors.
+ * paging. Start eviction threads if not already started for recovery without history store
+ * cursors.
*/
- WT_ERR(__wt_evict_create(session));
- eviction_started = true;
+ if (!eviction_started) {
+ WT_ERR(__wt_evict_create(session));
+ eviction_started = true;
+ }
/*
* Always run recovery even if it was a clean shutdown only if this is not a read-only
@@ -925,60 +974,26 @@ __wt_txn_recover(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(ret);
done:
- WT_ERR(__recovery_set_checkpoint_timestamp(&r));
- WT_ERR(__recovery_set_oldest_timestamp(&r));
- /*
- * Perform rollback to stable only when the following conditions met.
- * 1. The connection is not read-only. A read-only connection expects that there shouldn't be
- * any changes that need to be done on the database other than reading.
- * 2. The history store file was found in the metadata.
- */
- if (hs_exists && !F_ISSET(conn, WT_CONN_READONLY)) {
- /* Start the eviction threads for rollback to stable if not already started. */
- if (!eviction_started) {
- WT_ERR(__wt_evict_create(session));
- eviction_started = true;
- }
-
- /*
- * Currently, rollback to stable only needs to make changes to tables that use timestamps.
- * That is because eviction does not run in parallel with a checkpoint, so content that is
- * written never uses transaction IDs newer than the checkpoint's transaction ID and thus
- * never needs to be rolled back. Once eviction is allowed while a checkpoint is active, it
- * will be necessary to take the page write generation number into account during rollback
- * to stable. For example, a page with write generation 10 and txnid 20 is written in one
- * checkpoint, and in the next restart a new page with write generation 30 and txnid 20 is
- * written. The rollback to stable operation should only rollback the latest page changes
- * solely based on the write generation numbers.
- */
- WT_ASSERT(session,
- conn->txn_global.has_stable_timestamp == false &&
- conn->txn_global.stable_timestamp == WT_TS_NONE);
-
- /*
- * Set the stable timestamp from recovery timestamp and process the trees for rollback to
- * stable.
- */
- conn->txn_global.stable_timestamp = conn->txn_global.recovery_timestamp;
- conn->txn_global.has_stable_timestamp = false;
-
- if (conn->txn_global.recovery_timestamp != WT_TS_NONE)
- conn->txn_global.has_stable_timestamp = true;
- __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RTS,
- "Performing recovery rollback_to_stable with stable timestamp: %s and oldest timestamp: "
- "%s",
- __wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string[0]),
- __wt_timestamp_to_string(conn->txn_global.oldest_timestamp, ts_string[1]));
-
- WT_ERR(__wt_rollback_to_stable(session, NULL, false));
- } else if (do_checkpoint)
+ if (do_checkpoint || rts_executed)
/*
* Forcibly log a checkpoint so the next open is fast and keep the metadata up to date with
* the checkpoint LSN and archiving.
*/
WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
+ if (rts_executed) {
+ /* Initialize the connection's base write generation after rollback to stable. */
+ WT_ERR(__wt_metadata_init_base_write_gen(session));
+
+ /*
+ * Update the open dhandles write generations and base write generation with the
+ * connection's base write generation because the recovery checkpoint writes the pages to
+ * disk with new write generation number which contains transaction ids that are needed to
+ * reset later.
+ */
+ __wt_dhandle_update_write_gens(session);
+ }
/*
* If we're downgrading and have newer log files, force an archive, no matter what the archive
* setting is.
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index b30cf03be69..23a01efbd2e 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -8,9 +8,15 @@
#include "wt_internal.h"
+#define WT_CHECK_RECOVERY_FLAG_TXNID_CKPT_SNAPMIN(session, txnid) \
+ (F_ISSET(S2C(session), WT_CONN_RECOVERING) && (txnid) >= S2C(session)->recovery_ckpt_snap_min)
+
/* Enable rollback to stable verbose messaging during recovery. */
-#define WT_VERB_RECOVERY_RTS(session) \
- (F_ISSET(S2C(session), WT_CONN_RECOVERING) ? WT_VERB_RECOVERY | WT_VERB_RTS : WT_VERB_RTS)
+#define WT_VERB_RECOVERY_RTS(session) \
+ (F_ISSET(S2C(session), WT_CONN_RECOVERING) ? \
+ WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS | WT_VERB_RTS : \
+ WT_VERB_RTS)
+
/*
* __rollback_abort_newer_update --
* Abort updates in an update change with timestamps newer than the rollback timestamp. Also,
@@ -149,6 +155,53 @@ err:
}
/*
+ * __rollback_check_if_txnid_non_committed --
+ * Check if the transaction id is non committed.
+ */
+static bool
+__rollback_check_if_txnid_non_committed(WT_SESSION_IMPL *session, uint64_t txnid)
+{
+ WT_CONNECTION_IMPL *conn;
+ bool found;
+
+ conn = S2C(session);
+
+ /* If not in recovery, assume all the data is committed. */
+ if (!F_ISSET(conn, WT_CONN_RECOVERING))
+ return (false);
+
+ /*
+ * Only a full checkpoint writes the metadata with a snapshot. If the recovered checkpoint
+ * snapshot details are zero then return false, i.e., the updates are committed.
+ */
+ if (conn->recovery_ckpt_snap_min == 0 && conn->recovery_ckpt_snap_max == 0)
+ return (false);
+
+ /*
+ * Snapshot data:
+ * ids < recovery_ckpt_snap_min are committed,
+ * ids > recovery_ckpt_snap_max are non committed,
+ * everything else is committed unless it is found in the recovery_ckpt_snapshot array.
+ */
+ if (txnid < conn->recovery_ckpt_snap_min)
+ return (false);
+ else if (txnid > conn->recovery_ckpt_snap_max)
+ return (true);
+
+ /*
+ * Return false when the recovery snapshot count is 0, which means there are no uncommitted
+ * transaction ids.
+ */
+ if (conn->recovery_ckpt_snapshot_count == 0)
+ return (false);
+
+ WT_BINARY_SEARCH(
+ txnid, conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count, found);
+
+ return (found);
+}
+
+/*
* __rollback_row_ondisk_fixup_key --
* Abort updates in the history store and replace the on-disk value with an update that
* satisfies the given timestamp.
@@ -274,38 +327,48 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
/*
* Stop processing when we find the newer version value of this key is stable according to
- * the current version stop timestamp when it is not appending the selected update to the
- * update chain. Also it confirms that history store doesn't contains any newer version than
- * the current version for the key.
+ * the current version stop timestamp and transaction id when it is not appending the
+ * selected update to the update chain. Also it confirms that history store doesn't contain
+ * any newer version than the current version for the key.
*/
- if (!replace && hs_stop_durable_ts <= rollback_timestamp) {
+ if (!replace &&
+ (!__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.stop_txn)) &&
+ (hs_stop_durable_ts <= rollback_timestamp)) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "history store update valid with stop timestamp: %s and stable timestamp: %s",
+ "history store update valid with stop timestamp: %s, stable timestamp: %s and txnid: "
+ "%" PRIu64,
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[0]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[1]));
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[1]),
+ cbt->upd_value->tw.stop_txn);
break;
}
- /* Stop processing when we find a stable update according to the given timestamp. */
- if (hs_durable_ts <= rollback_timestamp) {
+ /*
+ * Stop processing when we find a stable update according to the given timestamp and
+ * transaction id.
+ */
+ if (!__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.start_txn) &&
+ (hs_durable_ts <= rollback_timestamp)) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"history store update valid with start timestamp: %s, durable timestamp: %s, stop "
- "timestamp: %s and stable timestamp: %s",
+ "timestamp: %s, stable timestamp: %s and txnid: %" PRIu64,
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[3]));
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[3]),
+ cbt->upd_value->tw.start_txn);
valid_update_found = true;
break;
}
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"history store update aborted with start timestamp: %s, durable timestamp: %s, stop "
- "timestamp: %s and stable timestamp: %s",
+ "timestamp: %s, stable timestamp: %s, start txnid: %" PRIu64 " and stop txnid: %" PRIu64,
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
- __wt_timestamp_to_string(rollback_timestamp, ts_string[3]));
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), cbt->upd_value->tw.start_txn,
+ cbt->upd_value->tw.stop_txn);
/*
* Start time point of the current record may be used as stop time point of the previous
@@ -331,7 +394,16 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
if (valid_update_found) {
WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL));
- upd->txnid = cbt->upd_value->tw.start_txn;
+ /*
+ * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because
+ * the connection's write generation will be initialized after rollback to stable and the
+ * updates in the cache will be problematic. The transaction id of pages which are on
+ * disk will be automatically reset as part of unpacking the cell when loaded to cache.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ upd->txnid = WT_TXN_NONE;
+ else
+ upd->txnid = cbt->upd_value->tw.start_txn;
upd->durable_ts = cbt->upd_value->tw.durable_start_ts;
upd->start_ts = cbt->upd_value->tw.start_ts;
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
@@ -353,7 +425,17 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
if (hs_stop_durable_ts <= rollback_timestamp &&
hs_stop_durable_ts < newer_hs_durable_ts) {
WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL));
- tombstone->txnid = cbt->upd_value->tw.stop_txn;
+ /*
+ * Set the transaction id of updates to WT_TXN_NONE when called from recovery,
+ * because the connection's write generation will be initialized after rollback to
+ * stable and the updates in the cache will be problematic. The transaction id of
+ * pages which are on disk will be automatically reset as part of unpacking the
+ * cell when loaded to cache.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ tombstone->txnid = WT_TXN_NONE;
+ else
+ tombstone->txnid = cbt->upd_value->tw.stop_txn;
tombstone->durable_ts = cbt->upd_value->tw.durable_stop_ts;
tombstone->start_ts = cbt->upd_value->tw.stop_ts;
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
@@ -443,14 +525,15 @@ __rollback_abort_row_ondisk_kv(
WT_STAT_CONN_DATA_INCR(session, txn_rts_sweep_hs_keys);
} else
return (0);
- } else if (vpack->tw.durable_start_ts > rollback_timestamp ||
+ } else if (((vpack->tw.durable_start_ts > rollback_timestamp) ||
+ __rollback_check_if_txnid_non_committed(session, vpack->tw.start_txn)) ||
(!WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared)) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"on-disk update aborted with start durable timestamp: %s, commit timestamp: %s, "
- "prepared: %s and stable timestamp: %s",
+ "prepared: %s, stable timestamp: %s and txnid : %" PRIu64,
__wt_timestamp_to_string(vpack->tw.durable_start_ts, ts_string[0]),
__wt_timestamp_to_string(vpack->tw.start_ts, ts_string[1]), prepared ? "true" : "false",
- __wt_timestamp_to_string(rollback_timestamp, ts_string[2]));
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[2]), vpack->tw.start_txn);
if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
return (__rollback_row_ondisk_fixup_key(session, page, rip, rollback_timestamp, true));
else {
@@ -462,7 +545,9 @@ __rollback_abort_row_ondisk_kv(
WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed);
}
} else if (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) &&
- (vpack->tw.durable_stop_ts > rollback_timestamp || prepared)) {
+ (((vpack->tw.durable_stop_ts > rollback_timestamp) ||
+ __rollback_check_if_txnid_non_committed(session, vpack->tw.stop_txn)) ||
+ prepared)) {
/*
* Clear the remove operation from the key by inserting the original on-disk value as a
* standard update.
@@ -470,17 +555,28 @@ __rollback_abort_row_ondisk_kv(
WT_RET(__wt_page_cell_data_ref(session, page, vpack, &buf));
WT_ERR(__wt_upd_alloc(session, &buf, WT_UPDATE_STANDARD, &upd, NULL));
- upd->txnid = vpack->tw.start_txn;
+ /*
+ * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because the
+ * connection's write generation will be initialized after rollback to stable and the
+ * updates in the cache will be problematic. The transaction id of pages which are on disk
+ * will be automatically reset as part of unpacking the cell when loaded to cache.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ upd->txnid = WT_TXN_NONE;
+ else
+ upd->txnid = vpack->tw.start_txn;
upd->durable_ts = vpack->tw.durable_start_ts;
upd->start_ts = vpack->tw.start_ts;
F_SET(upd, WT_UPDATE_RESTORED_FROM_DS);
WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_restored);
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "key restored with commit timestamp: %s, durable timestamp: %s txnid: %" PRIu64
- "and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64
+ "key restored with commit timestamp: %s, durable timestamp: %s, stable timestamp: %s, "
+ "txnid: %" PRIu64
+ " and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64
", prepared: %s",
__wt_timestamp_to_string(upd->start_ts, ts_string[0]),
- __wt_timestamp_to_string(upd->durable_ts, ts_string[1]), upd->txnid,
+ __wt_timestamp_to_string(upd->durable_ts, ts_string[1]),
+ __wt_timestamp_to_string(rollback_timestamp, ts_string[2]), upd->txnid,
__wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[2]),
__wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[3]), vpack->tw.stop_txn,
prepared ? "true" : "false");
@@ -735,6 +831,7 @@ __rollback_page_needs_abort(
WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
wt_timestamp_t durable_ts;
+ uint64_t newest_txn;
uint32_t i;
char ts_string[WT_TS_INT_STRING_SIZE];
const char *tag;
@@ -743,12 +840,14 @@ __rollback_page_needs_abort(
addr = ref->addr;
mod = ref->page == NULL ? NULL : ref->page->modify;
durable_ts = WT_TS_NONE;
+ newest_txn = WT_TXN_NONE;
tag = "undefined state";
prepared = result = false;
/*
* The rollback operation should be performed on this page when any one of the following is
- * greater than the given timestamp:
+ * greater than the given timestamp or during recovery if the newest transaction id on the page
+ * is greater than or equal to recovered checkpoint snapshot min:
* 1. The reconciled replace page max durable timestamp.
* 2. The reconciled multi page max durable timestamp.
* 3. The on page address max durable timestamp.
@@ -775,17 +874,22 @@ __rollback_page_needs_abort(
__wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &vpack);
durable_ts = __rollback_get_ref_max_durable_timestamp(session, &vpack.ta);
prepared = vpack.ta.prepare;
- result = (durable_ts > rollback_timestamp) || prepared;
+ newest_txn = vpack.ta.newest_txn;
+ result = (durable_ts > rollback_timestamp) || prepared ||
+ WT_CHECK_RECOVERY_FLAG_TXNID_CKPT_SNAPMIN(session, newest_txn);
} else if (addr != NULL) {
tag = "address";
durable_ts = __rollback_get_ref_max_durable_timestamp(session, &addr->ta);
prepared = addr->ta.prepare;
- result = (durable_ts > rollback_timestamp) || prepared;
+ newest_txn = addr->ta.newest_txn;
+ result = (durable_ts > rollback_timestamp) || prepared ||
+ WT_CHECK_RECOVERY_FLAG_TXNID_CKPT_SNAPMIN(session, newest_txn);
}
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
- "%p: page with %s durable timestamp: %s and prepared updates: %s", (void *)ref, tag,
- __wt_timestamp_to_string(durable_ts, ts_string), prepared ? "true" : "false");
+ "%p: page with %s durable timestamp: %s, newest txn: %" PRIu64 " and prepared updates: %s",
+ (void *)ref, tag, __wt_timestamp_to_string(durable_ts, ts_string), newest_txn,
+ prepared ? "true" : "false");
return (result);
}
@@ -904,6 +1008,9 @@ __rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollbac
WT_DECL_RET;
WT_REF *child_ref, *ref;
+ /* Set this flag to return error instead of panic if file is corrupted. */
+ F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
+
/* Walk the tree, marking commits aborted where appropriate. */
ref = NULL;
while ((ret = __wt_tree_walk_custom_skip(session, &ref, __wt_rts_page_skip, &rollback_timestamp,
@@ -917,6 +1024,7 @@ __rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollbac
} else
WT_RET(__rollback_abort_newer_updates(session, ref, rollback_timestamp));
+ F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
return (ret);
}
@@ -1151,12 +1259,14 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
WT_TXN_GLOBAL *txn_global;
wt_timestamp_t max_durable_ts, newest_start_durable_ts, newest_stop_durable_ts,
rollback_timestamp;
+ uint64_t rollback_txnid;
size_t addr_size;
char ts_string[2][WT_TS_INT_STRING_SIZE];
const char *config, *uri;
- bool durable_ts_found, prepared_updates;
+ bool durable_ts_found, prepared_updates, has_txn_updates_gt_than_ckpt_snap;
txn_global = &S2C(session)->txn_global;
+ rollback_txnid = 0;
addr_size = 0;
/*
@@ -1173,6 +1283,13 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
WT_RET(__wt_metadata_cursor(session, &cursor));
+ if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
+ __wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
+ "Recovered checkpoint snapshot min: %" PRIu64 ", snapshot max: %" PRIu64
+ ", snapshot count: %" PRIu32,
+ S2C(session)->recovery_ckpt_snap_min, S2C(session)->recovery_ckpt_snap_max,
+ S2C(session)->recovery_ckpt_snapshot_count);
+
while ((ret = cursor->next(cursor)) == 0) {
WT_ERR(cursor->get_key(cursor, &uri));
@@ -1187,7 +1304,7 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
/* Find out the max durable timestamp of the object from checkpoint. */
newest_start_durable_ts = newest_stop_durable_ts = WT_TS_NONE;
- durable_ts_found = prepared_updates = false;
+ durable_ts_found = prepared_updates = has_txn_updates_gt_than_ckpt_snap = false;
WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval));
__wt_config_subinit(session, &ckptconf, &cval);
for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) {
@@ -1210,12 +1327,22 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
prepared_updates = true;
}
WT_ERR_NOTFOUND_OK(ret, false);
+ ret = __wt_config_subgets(session, &cval, "newest_txn", &value);
+ if (value.len != 0)
+ rollback_txnid = (uint64_t)value.val;
+ WT_ERR_NOTFOUND_OK(ret, false);
ret = __wt_config_subgets(session, &cval, "addr", &value);
if (ret == 0)
addr_size = value.len;
WT_ERR_NOTFOUND_OK(ret, false);
}
max_durable_ts = WT_MAX(newest_start_durable_ts, newest_stop_durable_ts);
+ has_txn_updates_gt_than_ckpt_snap =
+ WT_CHECK_RECOVERY_FLAG_TXNID_CKPT_SNAPMIN(session, rollback_txnid);
+
+ /* Increment the inconsistent checkpoint stats counter. */
+ if (has_txn_updates_gt_than_ckpt_snap)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_inconsistent_ckpt);
/*
* The rollback to stable will skip the tables during recovery and shutdown in the following
@@ -1257,15 +1384,18 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
* 1. The tree is modified.
* 2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp.
* 3. There is no durable timestamp in any checkpoint.
+ * 4. The checkpoint newest txn is greater than or equal to the snapshot min txn id.
*/
if (S2BT(session)->modified || max_durable_ts > rollback_timestamp || prepared_updates ||
- !durable_ts_found) {
+ !durable_ts_found || has_txn_updates_gt_than_ckpt_snap) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"tree rolled back with durable timestamp: %s, or when tree is modified: %s or "
- "prepared updates: %s or when durable time is not found: %s",
+ "prepared updates: %s or when durable time is not found: %s or txnid is greater than "
+ "recovery checkpoint snap min: %s",
__wt_timestamp_to_string(max_durable_ts, ts_string[0]),
S2BT(session)->modified ? "true" : "false", prepared_updates ? "true" : "false",
- !durable_ts_found ? "true" : "false");
+ !durable_ts_found ? "true" : "false",
+ has_txn_updates_gt_than_ckpt_snap ? "true" : "false");
WT_TRET(__rollback_to_stable_btree(session, rollback_timestamp));
} else
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
@@ -1287,6 +1417,14 @@ __rollback_to_stable_btree_apply(WT_SESSION_IMPL *session)
WT_TRET(__rollback_to_stable_btree_hs_truncate(session, S2BT(session)->id));
WT_TRET(__wt_session_release_dhandle(session));
+
+ /*
+ * Continue when the table is corrupted and proceed to perform rollback to stable on other
+ * tables.
+ */
+ if (ret == WT_ERROR && F_ISSET(S2C(session), WT_CONN_DATA_CORRUPTION))
+ continue;
+
WT_ERR(ret);
}
WT_ERR_NOTFOUND_OK(ret, false);
diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c
index 13a3577745f..1c9cae21bbf 100644
--- a/src/third_party/wiredtiger/src/utilities/util_list.c
+++ b/src/third_party/wiredtiger/src/utilities/util_list.c
@@ -88,6 +88,12 @@ list_init_block(WT_SESSION *session, const char *key, WT_BLOCK *block)
wt_api = session->connection->get_extension_api(session->connection);
if ((ret = wt_api->metadata_search(wt_api, session, key, &config)) != 0)
WT_ERR(util_err(session, ret, "%s: WT_EXTENSION_API.metadata_search", key));
+ /*
+ * The config variable should be set and not NULL, but Coverity is convinced otherwise. This is
+ * an infrequent code path. Just add this extra conditional to make it happy.
+ */
+ if (config == NULL)
+ goto err;
if ((ret = wt_api->config_parser_open(wt_api, session, config, strlen(config), &parser)) != 0)
WT_ERR(util_err(session, ret, "WT_EXTENSION_API.config_parser_open"));
if ((ret = parser->get(parser, "allocation_size", &cval)) == 0)
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index 13d91b793e3..46b475b219d 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -784,25 +784,11 @@ config_lsm_reset(void)
config_single("transaction.timestamps=off", false);
}
- /*
- * LSM does not work with block-based incremental backup, change the incremental backup
- * mechanism if block based in configured.
- */
+ /* LSM may not work with backups, turn off backups if lsm is configured. */
if (g.c_backups) {
- if (config_is_perm("backup.incremental") && g.c_backup_incr_flag == INCREMENTAL_BLOCK)
- testutil_die(EINVAL, "LSM does not work with backup.incremental=block configuration.");
-
- if (g.c_backup_incr_flag == INCREMENTAL_BLOCK)
- switch (mmrand(NULL, 1, 2)) {
- case 1:
- /* 50% */
- config_single("backup.incremental=off", false);
- break;
- case 2:
- /* 50% */
- config_single("backup.incremental=log", false);
- break;
- }
+ if (config_is_perm("backup"))
+ testutil_die(EINVAL, "LSM is incompatible with backup configurations");
+ config_single("backup=off", false);
}
}
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py
new file mode 100644
index 00000000000..0f705301bcd
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, shutil, threading, time
+from wtthread import checkpoint_thread, op_thread
+from helper import copy_wiredtiger_home
+import wiredtiger, wttest
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+from wiredtiger import stat
+
+# test_checkpoint_snapshot02.py
+# This test is to run checkpoint and eviction in parallel with timing
+# stress for checkpoint and let eviction write more data than checkpoint.
+#
+
+def timestamp_str(t):
+ return '%x' % t
+class test_checkpoint_snapshot02(wttest.WiredTigerTestCase):
+
+ # Create a table.
+ uri = "table:test_checkpoint_snapshot02"
+ nrows = 1000
+
+ def conn_config(self):
+ config = 'cache_size=5MB,statistics=(all),statistics_log=(json,on_close,wait=1),log=(enabled=true),timing_stress_for_test=[checkpoint_slow]'
+ return config
+
+ def large_updates(self, uri, value, ds, nrows):
+ # Update a large number of records.
+ session = self.session
+ cursor = session.open_cursor(uri)
+ for i in range(0, nrows):
+ session.begin_transaction()
+ cursor[ds.key(i)] = value
+ session.commit_transaction()
+ cursor.close()
+
+ def check(self, check_value, uri, nrows):
+ session = self.session
+ session.begin_transaction()
+ cursor = session.open_cursor(uri)
+ count = 0
+ for k, v in cursor:
+ self.assertEqual(v, check_value)
+ count += 1
+ session.commit_transaction()
+ self.assertEqual(count, nrows)
+
+ def test_checkpoint_snapshot(self):
+
+ ds = SimpleDataSet(self, self.uri, 0, key_format="S", value_format="S",config='log=(enabled=false)')
+ ds.populate()
+ valuea = "aaaaa" * 100
+ valueb = "bbbbb" * 100
+ valuec = "ccccc" * 100
+ valued = "ddddd" * 100
+
+ cursor = self.session.open_cursor(self.uri)
+ self.large_updates(self.uri, valuea, ds, self.nrows)
+
+ self.check(valuea, self.uri, self.nrows)
+
+ self.session.begin_transaction()
+
+ # Create a checkpoint thread
+ done = threading.Event()
+ ckpt = checkpoint_thread(self.conn, done)
+ try:
+ ckpt.start()
+
+ # Check for the value to wait for checkpoint to start.
+ cursor = self.session.open_cursor(self.uri)
+ count = 0
+ for k, v in cursor:
+ self.assertEqual(v, valuea)
+ count += 1
+ self.assertEqual(count, self.nrows)
+
+ # Insert some data from the transaction which is running before
+ # checkpoint started
+ for i in range(0, self.nrows):
+ cursor.set_key(ds.key(i))
+ cursor.set_value(valueb)
+ self.assertEqual(cursor.insert(), 0)
+ self.session.commit_transaction()
+
+ self.large_updates(self.uri, valuec, ds, self.nrows)
+ self.large_updates(self.uri, valued, ds, self.nrows)
+
+ finally:
+ done.set()
+ ckpt.join()
+
+ # Simulate a crash by copying to a new directory (RESTART).
+ copy_wiredtiger_home(".", "RESTART")
+
+ # Open the new directory.
+ self.conn = self.setUpConnectionOpen("RESTART")
+ self.session = self.setUpSessionOpen(self.conn)
+
+ # Check the table contains the last checkpointed value.
+ self.check(valuea, self.uri, self.nrows)
+
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ inconsistent_ckpt = stat_cursor[stat.conn.txn_rts_inconsistent_ckpt][2]
+ hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2]
+ keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+ keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2]
+ pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2]
+ upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2]
+ stat_cursor.close()
+
+ self.assertGreater(inconsistent_ckpt, 0)
+ self.assertGreater(hs_removed, 0)
+ self.assertEqual(upd_aborted, 0)
+ self.assertEqual(keys_removed, 0)
+ self.assertEqual(keys_restored, 0)
+ self.assertGreaterEqual(pages_visited, 0)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py
index f3c93509d63..7cfc3ba2fe7 100755
--- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable05.py
@@ -133,12 +133,11 @@ class test_rollback_to_stable05(test_rollback_to_stable_base):
self.assertEqual(calls, 1)
self.assertEqual(keys_removed, 0)
self.assertEqual(keys_restored, 0)
+ self.assertGreaterEqual(pages_visited, 0)
if self.in_memory:
- self.assertGreaterEqual(pages_visited, 0)
self.assertEqual(upd_aborted, 0)
self.assertEqual(hs_removed, 0)
else:
- self.assertEqual(pages_visited, 0)
self.assertEqual(upd_aborted, 0)
self.assertEqual(hs_removed, 0)
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py
index 4ac28066596..631213e665b 100755
--- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py
@@ -143,7 +143,7 @@ class test_rollback_to_stable12(test_rollback_to_stable_base):
self.assertGreater(pages_visited, 0)
self.assertGreaterEqual(hs_removed, 0)
self.assertEqual(hs_sweep, 0)
- self.assertGreater(pages_walk_skipped, 0)
+ self.assertGreaterEqual(pages_walk_skipped, 0)
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered01.py b/src/third_party/wiredtiger/test/suite/test_tiered01.py
index 2a41c3ff7ef..9a7066fd708 100644
--- a/src/third_party/wiredtiger/test/suite/test_tiered01.py
+++ b/src/third_party/wiredtiger/test/suite/test_tiered01.py
@@ -71,5 +71,12 @@ class test_tiered01(wttest.WiredTigerTestCase):
# self.session.drop(self.uri)
+ # It is an error to configure a tiered table with no tiers
+ def test_no_tiers(self):
+ msg = '/tiered table must specify at least one tier/'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.create(self.uri, 'type=tiered,key_format=S,tiered=(tiers=())'),
+ msg)
+
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered02.py b/src/third_party/wiredtiger/test/suite/test_tiered02.py
new file mode 100644
index 00000000000..17eb3073c39
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_tiered02.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2021 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wtscenario, wttest
+from wtdataset import SimpleDataSet
+
+# test_tiered02.py
+# Test block-log-structured tree configuration options.
+class test_tiered02(wttest.WiredTigerTestCase):
+ K = 1024
+ M = 1024 * K
+ G = 1024 * M
+ uri = "file:test_tiered02"
+
+ # Occasionally add a lot of records, so that merges (and bloom) happen.
+ record_count_scenarios = wtscenario.quick_scenarios(
+ 'nrecs', [10, 10000], [0.9, 0.1])
+
+ scenarios = wtscenario.make_scenarios(record_count_scenarios, prune=100, prunelong=500)
+
+ # Test populating a log-structured object across a checkpoint and a connection reopen.
+ def test_tiered(self):
+ args = 'key_format=S,block_allocation=log-structured'
+ self.verbose(3,
+ 'Test log-structured allocation with config: ' + args + ' count: ' + str(self.nrecs))
+ #ds = SimpleDataSet(self, self.uri, self.nrecs, config=args)
+ ds = SimpleDataSet(self, self.uri, 10, config=args)
+ ds.populate()
+ self.session.checkpoint()
+ ds = SimpleDataSet(self, self.uri, 10000, config=args)
+ ds.populate()
+
+ self.reopen_conn()
+ ds = SimpleDataSet(self, self.uri, 1000, config=args)
+ ds.populate()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_tiered03.py b/src/third_party/wiredtiger/test/suite/test_tiered03.py
new file mode 100644
index 00000000000..624387c21a3
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_tiered03.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2021 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os, re
+import wiredtiger, wtscenario, wttest
+from wtdataset import SimpleDataSet
+
+# test_tiered03.py
+#    Share a file that uses log-structured block allocation between a primary
+#    database and a secondary database that references it read-only.
+class test_tiered03(wttest.WiredTigerTestCase):
+    K = 1024
+    M = 1024 * K
+    G = 1024 * M
+    uri = 'file:test_tiered03'
+
+    # Candidate values for nrecs and their weights.
+    # NOTE: nrecs is currently only reported in the verbose message of
+    # test_sharing; the data sets there are created with fixed sizes.
+    record_count_scenarios = wtscenario.quick_scenarios(
+        'nrecs', [10, 10000], [0.9, 0.1])
+
+    scenarios = wtscenario.make_scenarios(record_count_scenarios, prune=100, prunelong=500)
+
+    # Test sharing data between a primary and a secondary
+    def test_sharing(self):
+        args = 'block_allocation=log-structured'
+        self.verbose(3,
+            'Test log-structured allocation with config: ' + args + ' count: ' + str(self.nrecs))
+        ds = SimpleDataSet(self, self.uri, 10, config=args)
+        ds.populate()
+        ds.check()
+        self.session.checkpoint()
+        ds.check()
+
+        # Create a secondary database in a subdirectory of self.home
+        dir2 = os.path.join(self.home, 'SECONDARY')
+        os.mkdir(dir2)
+        conn2 = self.setUpConnectionOpen(dir2)
+        session2 = conn2.open_session()
+
+        # Reference the tree from the secondary: copy the primary's metadata
+        # entry under a URI with '../' inserted after the 'file:' prefix
+        # (the secondary lives one directory below self.home), and append
+        # readonly=1 to the copied configuration.
+        metac = self.session.open_cursor('metadata:')
+        metac2 = session2.open_cursor('metadata:', None, 'readonly=0')
+        prefix_len = len('file:')
+        uri2 = self.uri[:prefix_len] + '../' + self.uri[prefix_len:]
+        metac2[uri2] = metac[self.uri] + ",readonly=1"
+
+        cursor2 = session2.open_cursor(uri2)
+        ds.check_cursor(cursor2)
+        cursor2.close()
+
+        # Add more data on the primary and checkpoint it
+        newds = SimpleDataSet(self, self.uri, 10000, config=args)
+        newds.populate()
+        newds.check()
+        self.session.checkpoint()
+        newds.check()
+
+        # Check we can still read from the last checkpoint
+        cursor2 = session2.open_cursor(uri2)
+        ds.check_cursor(cursor2)
+        cursor2.close()
+
+        # Bump to new checkpoint: extract the 'checkpoint=(...)' entry from
+        # the primary's metadata (dropping the leading comma) and apply it to
+        # the secondary's entry.
+        origmeta = metac[self.uri]
+        match = re.search(r',checkpoint=\(.+?\)\)', origmeta)
+        # Fail with a readable message rather than an AttributeError on None.
+        self.assertIsNotNone(match, 'no checkpoint entry in metadata: ' + origmeta)
+        checkpoint = match.group(0)[1:]
+        self.pr('Orig checkpoint: ' + checkpoint)
+        session2.alter(uri2, checkpoint)
+        self.pr('New metadata on secondary: ' + metac2[uri2])
+
+        # Check that we can see the new data
+        cursor2 = session2.open_cursor(uri2)
+        newds.check_cursor(cursor2)
+
+        # Release the cursors opened by this test, as done for the earlier
+        # cursor2 checks.
+        cursor2.close()
+        metac2.close()
+        metac.close()
+
+# When this file is executed directly as a script, hand off to wttest.run().
+if __name__ == '__main__':
+ wttest.run()