summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Etienne Petrel <etienne.petrel@mongodb.com> 2022-04-11 01:09:51 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-04-11 01:40:12 +0000
commit 3c273f809e6ddba55dba616d79bd07df398cd37f (patch)
tree 96f4c7f1a8a278f2e7212d2f56d896434d57f709 /src
parent 10e1b9f8e33cf369031229706d76a24d7deace5c (diff)
download mongo-3c273f809e6ddba55dba616d79bd07df398cd37f.tar.gz
Import wiredtiger: ef59b9e2c174ec2e89a16cf417426f2d5805aff7 from branch mongodb-master
ref: 55d6761f7e..ef59b9e2c1
for: 6.0.0-rc0
WT-5927 Checkpoint cursors
Diffstat (limited to 'src')
-rw-r--r-- src/third_party/wiredtiger/dist/api_data.py | 14
-rw-r--r-- src/third_party/wiredtiger/dist/s_string.ok | 1
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_curnext.c | 8
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_curprev.c | 8
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_debug.c | 8
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_handle.c | 23
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_page.c | 6
-rw-r--r-- src/third_party/wiredtiger/src/config/config_def.c | 11
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_backup.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_file.c | 328
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_hs.c | 39
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_stat.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/docs/durability-checkpoint.dox | 23
-rw-r--r-- src/third_party/wiredtiger/src/docs/timestamp-prepare.dox | 21
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_lru.c | 4
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_page.c | 11
-rw-r--r-- src/third_party/wiredtiger/src/history/hs_cursor.c | 25
-rw-r--r-- src/third_party/wiredtiger/src/include/btree_inline.h | 2
-rw-r--r-- src/third_party/wiredtiger/src/include/cursor.h | 17
-rw-r--r-- src/third_party/wiredtiger/src/include/cursor_inline.h | 8
-rw-r--r-- src/third_party/wiredtiger/src/include/dhandle.h | 15
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h | 27
-rw-r--r-- src/third_party/wiredtiger/src/include/meta.h | 17
-rw-r--r-- src/third_party/wiredtiger/src/include/session.h | 11
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.h | 26
-rw-r--r-- src/third_party/wiredtiger/src/include/txn_inline.h | 7
-rw-r--r-- src/third_party/wiredtiger/src/include/wiredtiger.in | 4
-rw-r--r-- src/third_party/wiredtiger/src/include/wt_internal.h | 2
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_cursor.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_merge.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/meta/meta_ckpt.c | 567
-rw-r--r-- src/third_party/wiredtiger/src/schema/schema_worker.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/session/session_api.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/session/session_dhandle.c | 309
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c | 91
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_ckpt.c | 242
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_recover.c | 111
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 2
-rw-r--r-- src/third_party/wiredtiger/test/checkpoint/checkpointer.c | 57
-rw-r--r-- src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c | 4
-rw-r--r-- src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h | 2
-rw-r--r-- src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c | 11
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_bulk02.py | 4
-rwxr-xr-x src/third_party/wiredtiger/test/suite/test_checkpoint01.py | 20
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint10.py | 193
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint11.py | 252
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint12.py | 126
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint13.py | 158
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint14.py | 191
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint15.py | 178
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint16.py | 126
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint17.py | 140
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint18.py | 163
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint19.py | 169
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint20.py | 175
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint21.py | 185
-rwxr-xr-x src/third_party/wiredtiger/test/suite/test_gc05.py | 62
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_hs06.py | 38
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_hs09.py | 9
-rwxr-xr-x src/third_party/wiredtiger/test/suite/wtthread.py | 15
61 files changed, 3887 insertions, 393 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 416c18669a9..b972ca6fcef 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -1518,9 +1518,16 @@ methods = {
to the bit count (except for the last set of values loaded)'''),
Config('checkpoint', '', r'''
the name of a checkpoint to open (the reserved name
- "WiredTigerCheckpoint" opens the most recent internal
+ "WiredTigerCheckpoint" opens the most recent
checkpoint taken for the object). The cursor does not
support data modification'''),
+ Config('checkpoint_use_history', 'true', r'''
+ when opening a checkpoint cursor, open history store cursors and retrieve
+ snapshot and timestamp information from the checkpoint. This is in general
+ required for correct reads; if setting it to false the caller must ensure
+ that the checkpoint is self-contained in the data store: timestamps are not
+ in use and the object was quiescent when the checkpoint was taken''',
+ type='boolean', undoc=True),
Config('checkpoint_wait', 'true', r'''
wait for the checkpoint lock, if \c checkpoint_wait=false, open the
cursor without taking a lock, returning EBUSY if the operation
@@ -1530,6 +1537,11 @@ methods = {
configure debug specific behavior on a cursor. Generally only
used for internal testing purposes''',
type='category', subconfig=[
+ Config('checkpoint_read_timestamp', '', r'''
+ read the checkpoint using the specified timestamp. The supplied value
+ must not be older than the checkpoint's oldest timestamp. Ignored if
+ not reading from a checkpoint''',
+ undoc=True),
Config('dump_version', 'false', r'''
open a version cursor, which is a debug cursor on a table that
enables iteration through the history of values for a given key.''',
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index d741e8c8f97..0c228ecb5f2 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -686,6 +686,7 @@ compressStream
compressibility
concat
cond
+conditionalize
conf
confchk
config
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 5fac5718f5f..664bcb61e0d 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "55d6761f7e8673c1dbb9f382c155d721dc1ce722"
+ "commit": "ef59b9e2c174ec2e89a16cf417426f2d5805aff7"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index 6b49248bf68..ac62b46560f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -325,10 +325,16 @@ restart_read:
* Note: it's important that we're checking the on-disk value for global visibility, and not
* whatever __wt_txn_read returned, which might be something else. (If it's something else,
* we can't cache it; but in that case the on-disk value cannot be globally visible.)
+ *
+ * If we're reading from a checkpoint, it's sufficient to check visibility against the
+ * checkpoint's snapshot. Don't check global visibility, because that checks the current
+ * state of the world rather than the checkpoint state.
*/
cbt->cip_saved = cip;
if (rle > 1 &&
- __wt_txn_visible_all(session, unpack.tw.start_txn, unpack.tw.durable_start_ts)) {
+ (WT_READING_CHECKPOINT(session) ?
+ __wt_txn_visible(session, unpack.tw.start_txn, unpack.tw.durable_start_ts) :
+ __wt_txn_visible_all(session, unpack.tw.start_txn, unpack.tw.durable_start_ts))) {
/*
* Copy the value into cbt->tmp to cache it. This is perhaps unfortunate, because
* copying isn't free, but it's currently necessary. The memory we're copying might be
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 6373874fc65..c49dab8a90f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -483,10 +483,16 @@ restart_read:
* Note: it's important that we're checking the on-disk value for global visibility, and not
* whatever __wt_txn_read returned, which might be something else. (If it's something else,
* we can't cache it; but in that case the on-disk value cannot be globally visible.)
+ *
+ * If we're reading from a checkpoint, it's sufficient to check visibility against the
+ * checkpoint's snapshot. Don't check global visibility, because that checks the current
+ * state of the world rather than the checkpoint state.
*/
cbt->cip_saved = cip;
if (rle > 1 &&
- __wt_txn_visible_all(session, unpack.tw.start_txn, unpack.tw.durable_start_ts)) {
+ (WT_READING_CHECKPOINT(session) ?
+ __wt_txn_visible(session, unpack.tw.start_txn, unpack.tw.durable_start_ts) :
+ __wt_txn_visible_all(session, unpack.tw.start_txn, unpack.tw.durable_start_ts))) {
/*
* Copy the value into cbt->tmp to cache it. This is perhaps unfortunate, because
* copying isn't free, but it's currently necessary. The memory we're copying might be
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 5accecd938e..1fd622f29ad 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -252,10 +252,12 @@ __debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile)
WT_ERR(__wt_scr_alloc(session, 512, &ds->t2));
/*
- * Set up history store support, opening a history store cursor on demand. Return error if that
- * doesn't work, except while running in-memory configuration.
+ * Set up history store support, opening a history store cursor on demand, except while running
+ * in-memory configuration, or when reading a checkpoint that has no corresponding history store
+ * checkpoint.
*/
- if (!F_ISSET(conn, WT_CONN_IN_MEMORY) && !WT_IS_HS(session->dhandle))
+ if (!F_ISSET(conn, WT_CONN_IN_MEMORY) && !WT_IS_HS(session->dhandle) &&
+ !(WT_READING_CHECKPOINT(session) && session->hs_checkpoint == NULL))
WT_ERR(__wt_curhs_open(session, NULL, &ds->hs_cursor));
if (ds->hs_cursor != NULL) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index 08ef39e9b82..4aa541b28ab 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -8,7 +8,7 @@
#include "wt_internal.h"
-static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
+static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt, bool);
static int __btree_get_last_recno(WT_SESSION_IMPL *);
static int __btree_page_sizes(WT_SESSION_IMPL *);
static int __btree_preload(WT_SESSION_IMPL *);
@@ -84,13 +84,16 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
btree->dhandle = dhandle;
/* Checkpoint and verify files are readonly. */
- if (dhandle->checkpoint != NULL || F_ISSET(btree, WT_BTREE_VERIFY) ||
+ if (WT_DHANDLE_IS_CHECKPOINT(dhandle) || F_ISSET(btree, WT_BTREE_VERIFY) ||
F_ISSET(S2C(session), WT_CONN_READONLY))
F_SET(btree, WT_BTREE_READONLY);
/* Get the checkpoint information for this name/checkpoint pair. */
WT_RET(__wt_meta_checkpoint(session, dhandle->name, dhandle->checkpoint, &ckpt));
+ /* Set the order number. */
+ dhandle->checkpoint_order = ckpt.order;
+
/*
* Bulk-load is only permitted on newly created files, not any empty file -- see the checkpoint
* code for a discussion.
@@ -107,7 +110,7 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
}
/* Initialize and configure the WT_BTREE structure. */
- WT_ERR(__btree_conf(session, &ckpt));
+ WT_ERR(__btree_conf(session, &ckpt, WT_DHANDLE_IS_CHECKPOINT(dhandle)));
/* Connect to the underlying block manager. */
WT_ERR(__wt_blkcache_open(
@@ -300,7 +303,7 @@ __wt_btree_config_encryptor(
* Configure a WT_BTREE structure.
*/
static int
-__btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
+__btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt, bool is_ckpt)
{
WT_BTREE *btree;
WT_CONFIG_ITEM cval, metadata;
@@ -551,12 +554,14 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
*
* Rollback to stable does not operate on logged tables and metadata, so it is skipped.
*
- * The only scenario where the checkpoint run write generation number is less than the
- * connection last checkpoint base write generation number is when rollback to stable doesn't
- * happen during the recovery due to the unavailability of history store file.
+ * The only scenarios where the checkpoint run write generation number is less than the
+ * connection last checkpoint base write generation number are when rollback to stable doesn't
+ * happen during the recovery due to the unavailability of history store file, or when reading a
+ * checkpoint.
*/
- if (!F_ISSET(conn, WT_CONN_RECOVERING) || F_ISSET(btree, WT_BTREE_LOGGED) ||
- ckpt->run_write_gen < conn->last_ckpt_base_write_gen)
+ if ((!F_ISSET(conn, WT_CONN_RECOVERING) || F_ISSET(btree, WT_BTREE_LOGGED) ||
+ ckpt->run_write_gen < conn->last_ckpt_base_write_gen) &&
+ !is_ckpt)
btree->base_write_gen = btree->run_write_gen;
else
btree->base_write_gen = ckpt->run_write_gen;
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index bdc946a18a5..4b5a5c23109 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -996,8 +996,12 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, bool *preparedp)
*
* The visibility information is not referenced on the page so we need to ensure that
* the value is globally visible at the point in time where we read the page into cache.
+ *
+ * Skip if reading from a checkpoint because visible_all uses the current oldest txnid,
+ * which is not in general the checkpoint's oldest txnid, and may make things visible
+ * that shouldn't be.
*/
- if (!btree->huffman_value &&
+ if (!btree->huffman_value && !WT_READING_CHECKPOINT(session) &&
(WT_TIME_WINDOW_IS_EMPTY(&unpack.tw) ||
(!WT_TIME_WINDOW_HAS_STOP(&unpack.tw) &&
__wt_txn_tw_start_visible_all(session, &unpack.tw))))
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index ab219c960e4..83439a4e232 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -373,6 +373,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_log_flush[] = {
{"sync", "string", NULL, "choices=[\"off\",\"on\"]", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}};
static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor_debug_subconfigs[] = {
+ {"checkpoint_read_timestamp", "string", NULL, NULL, NULL, 0},
{"dump_version", "boolean", NULL, NULL, NULL, 0},
{"release_evict", "boolean", NULL, NULL, NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}};
@@ -386,8 +387,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor_incremental_subconfi
static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = {
{"append", "boolean", NULL, NULL, NULL, 0}, {"bulk", "string", NULL, NULL, NULL, 0},
{"checkpoint", "string", NULL, NULL, NULL, 0},
+ {"checkpoint_use_history", "boolean", NULL, NULL, NULL, 0},
{"checkpoint_wait", "boolean", NULL, NULL, NULL, 0},
- {"debug", "category", NULL, NULL, confchk_WT_SESSION_open_cursor_debug_subconfigs, 2},
+ {"debug", "category", NULL, NULL, confchk_WT_SESSION_open_cursor_debug_subconfigs, 3},
{"dump", "string", NULL,
"choices=[\"hex\",\"json\",\"pretty\",\"pretty_hex\","
"\"print\"]",
@@ -1287,14 +1289,15 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator",
{"WT_SESSION.log_flush", "sync=on", confchk_WT_SESSION_log_flush, 1},
{"WT_SESSION.log_printf", "", NULL, 0},
{"WT_SESSION.open_cursor",
- "append=false,bulk=false,checkpoint=,checkpoint_wait=true,"
- "debug=(dump_version=false,release_evict=false),dump=,"
+ "append=false,bulk=false,checkpoint=,checkpoint_use_history=true,"
+ "checkpoint_wait=true,debug=(checkpoint_read_timestamp=,"
+ "dump_version=false,release_evict=false),dump=,"
"incremental=(consolidate=false,enabled=false,file=,"
"force_stop=false,granularity=16MB,src_id=,this_id=),"
"next_random=false,next_random_sample_size=0,overwrite=true,"
"prefix_search=false,raw=false,read_once=false,readonly=false,"
"skip_sort_check=false,statistics=,target=",
- confchk_WT_SESSION_open_cursor, 17},
+ confchk_WT_SESSION_open_cursor, 18},
{"WT_SESSION.prepare_transaction", "prepare_timestamp=", confchk_WT_SESSION_prepare_transaction,
1},
{"WT_SESSION.query_timestamp", "get=read", confchk_WT_SESSION_query_timestamp, 1},
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
index 4b90b0c0712..85b2dcf27ed 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_backup.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -360,7 +360,7 @@ __backup_add_id(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval)
* Get the most recent checkpoint name. For now just use the one that is part of the metadata.
* We only care whether or not a checkpoint exists, so immediately free it.
*/
- ret = __wt_meta_checkpoint_last_name(session, WT_METAFILE_URI, &ckpt);
+ ret = __wt_meta_checkpoint_last_name(session, WT_METAFILE_URI, &ckpt, NULL, NULL);
__wt_free(session, ckpt);
WT_ERR_NOTFOUND_OK(ret, true);
if (ret == WT_NOTFOUND) {
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index d373c089c06..a7ecac9af15 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -16,6 +16,73 @@ WT_STAT_USECS_HIST_INCR_FUNC(opread, perf_hist_opread_latency, 100)
WT_STAT_USECS_HIST_INCR_FUNC(opwrite, perf_hist_opwrite_latency, 100)
/*
+ * Wrapper for substituting checkpoint state when doing checkpoint cursor operations.
+ *
+ * A checkpoint cursor has two extra things in it: a dummy transaction (always), and a dhandle for
+ * the corresponding history store checkpoint (mostly but not always).
+ *
+ * If there's a checkpoint transaction, it means we're a checkpoint cursor. In that case we
+ * substitute the transaction into the session, and also stick the checkpoint name of the history
+ * store dhandle in the session for when the history store is opened. After the operation completes
+ * we then undo it all.
+ *
+ * If the current transaction is _already_ a checkpoint cursor dummy transaction, however, do
+ * nothing. This happens when the history store logic opens history store cursors inside checkpoint
+ * cursor operations on the data store. In that case we want to keep the existing state.
+ */
+#define WT_WITH_CHECKPOINT(session, cbt, op) \
+ do { \
+ WT_TXN *__saved_txn; \
+ \
+ if ((cbt)->checkpoint_txn != NULL) { \
+ __saved_txn = (session)->txn; \
+ if (F_ISSET(__saved_txn, WT_TXN_IS_CHECKPOINT)) \
+ __saved_txn = NULL; \
+ else { \
+ (session)->txn = (cbt)->checkpoint_txn; \
+ if ((cbt)->checkpoint_hs_dhandle != NULL) { \
+ WT_ASSERT(session, session->hs_checkpoint == NULL); \
+ session->hs_checkpoint = (cbt)->checkpoint_hs_dhandle->checkpoint; \
+ } \
+ } \
+ } else \
+ __saved_txn = NULL; \
+ op; \
+ if (__saved_txn != NULL) { \
+ (session)->txn = __saved_txn; \
+ session->hs_checkpoint = NULL; \
+ } \
+ } while (0)
+
+/*
+ * __curfile_check_cbt_txn --
+ * Enforce restrictions on nesting checkpoint cursors. The only nested cursors we should get to
+ * from a checkpoint cursor are cursors for the corresponding history store checkpoint.
+ */
+static inline int
+__curfile_check_cbt_txn(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_TXN *txn;
+
+ txn = session->txn;
+
+ /* If not reading a checkpoint, everything's fine. */
+ if (cbt->checkpoint_txn == NULL)
+ return (0);
+
+ /*
+ * It is ok if the current transaction is already a checkpoint transaction. Assert that we are
+ * the history store.
+ */
+ if (F_ISSET(txn, WT_TXN_IS_CHECKPOINT)) {
+ WT_ASSERT(session, WT_IS_HS(cbt->dhandle));
+ WT_ASSERT(session, WT_DHANDLE_IS_CHECKPOINT(cbt->dhandle));
+ }
+
+ return (0);
+}
+
+/*
* __curfile_compare --
* WT_CURSOR->compare method for the btree cursor type.
*/
@@ -90,7 +157,10 @@ __curfile_next(WT_CURSOR *cursor)
CURSOR_API_CALL(cursor, session, next, CUR2BT(cbt));
WT_ERR(__cursor_copy_release(cursor));
- WT_ERR(__wt_btcur_next(cbt, false));
+ WT_ERR(__curfile_check_cbt_txn(session, cbt));
+
+ WT_WITH_CHECKPOINT(session, cbt, ret = __wt_btcur_next(cbt, false));
+ WT_ERR(ret);
/* Next maintains a position, key and value. */
WT_ASSERT(session,
@@ -117,7 +187,10 @@ __wt_curfile_next_random(WT_CURSOR *cursor)
CURSOR_API_CALL(cursor, session, next, CUR2BT(cbt));
WT_ERR(__cursor_copy_release(cursor));
- WT_ERR(__wt_btcur_next_random(cbt));
+ WT_ERR(__curfile_check_cbt_txn(session, cbt));
+
+ WT_WITH_CHECKPOINT(session, cbt, ret = __wt_btcur_next_random(cbt));
+ WT_ERR(ret);
/* Next-random maintains a position, key and value. */
WT_ASSERT(session,
@@ -143,7 +216,10 @@ __curfile_prev(WT_CURSOR *cursor)
CURSOR_API_CALL(cursor, session, prev, CUR2BT(cbt));
WT_ERR(__cursor_copy_release(cursor));
- WT_ERR(__wt_btcur_prev(cbt, false));
+ WT_ERR(__curfile_check_cbt_txn(session, cbt));
+
+ WT_WITH_CHECKPOINT(session, cbt, ret = __wt_btcur_prev(cbt, false));
+ WT_ERR(ret);
/* Prev maintains a position, key and value. */
WT_ASSERT(session,
@@ -197,8 +273,11 @@ __curfile_search(WT_CURSOR *cursor)
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__cursor_checkkey(cursor));
+ WT_ERR(__curfile_check_cbt_txn(session, cbt));
+
time_start = __wt_clock(session);
- WT_ERR(__wt_btcur_search(cbt));
+ WT_WITH_CHECKPOINT(session, cbt, ret = __wt_btcur_search(cbt));
+ WT_ERR(ret);
time_stop = __wt_clock(session);
__wt_stat_usecs_hist_incr_opread(session, WT_CLOCKDIFF_US(time_stop, time_start));
@@ -228,8 +307,11 @@ __curfile_search_near(WT_CURSOR *cursor, int *exact)
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__cursor_checkkey(cursor));
+ WT_ERR(__curfile_check_cbt_txn(session, cbt));
+
time_start = __wt_clock(session);
- WT_ERR(__wt_btcur_search_near(cbt, exact));
+ WT_WITH_CHECKPOINT(session, cbt, ret = __wt_btcur_search_near(cbt, exact));
+ WT_ERR(ret);
time_stop = __wt_clock(session);
__wt_stat_usecs_hist_incr_opread(session, WT_CLOCKDIFF_US(time_stop, time_start));
@@ -512,6 +594,17 @@ err:
WT_ASSERT(session, session->dhandle == NULL || session->dhandle->session_inuse > 0);
+ /* Free any private transaction set up for a checkpoint cursor. */
+ if (cbt->checkpoint_txn != NULL)
+ __wt_txn_close_checkpoint_cursor(session, &cbt->checkpoint_txn);
+
+ /* Close any history store handle set up for a checkpoint cursor. */
+ if (cbt->checkpoint_hs_dhandle != NULL) {
+ WT_WITH_DHANDLE(
+ session, cbt->checkpoint_hs_dhandle, WT_TRET(__wt_session_release_dhandle(session)));
+ cbt->checkpoint_hs_dhandle = NULL;
+ }
+
__wt_cursor_close(cursor);
/*
@@ -647,12 +740,145 @@ __curfile_reopen(WT_CURSOR *cursor, bool sweep_check_only)
}
/*
+ * __curfile_setup_checkpoint --
+ * Open helper code for checkpoint cursors.
+ */
+static int
+__curfile_setup_checkpoint(WT_CURSOR_BTREE *cbt, const char *cfg[], WT_DATA_HANDLE *hs_dhandle,
+ WT_CKPT_SNAPSHOT *ckpt_snapshot)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = CUR2S(cbt);
+
+ /*
+ * It is important that reading from a checkpoint also reads from the corresponding
+ * checkpoint of the history store and also uses matching snapshot and timestamp data;
+ * otherwise all kinds of things go wrong. The logic for getting a matching set is complex (what
+ * it means to be "matching" is also complex) and is explained in session_dhandle.c. This
+ * comment explains what happens once we get a matching set so that subsequent reads work
+ * correctly.
+ *
+ * 1. When we get here, if we are opening a data store checkpoint, our "current" dhandle in the
+ * session is the data store checkpoint, hs_dhandle is the matching history store checkpoint,
+ * and ckpt_snapshot contains the snapshot and timestamp data. It is at least theoretically
+ * possible for hs_dhandle to be null; this means there is no corresponding history store
+ * checkpoint. In this case we will avoid trying to open it later.
+ *
+ * We keep the history store checkpoint dhandle in the checkpoint cursor, and hold it open as
+ * long as the checkpoint cursor remains open. It is never directly used, but it ensures that
+ * the history store checkpoint will not be removed under us and any history store lookups done
+ * via the checkpoint cursor (which open the history store separately themselves) will be able
+ * to open the right version of the history store. This is essential for unnamed checkpoints as
+ * they turn over frequently and asynchronously. It is, strictly speaking, not necessary for
+ * named checkpoints, because as long as a named checkpoint data store cursor is open that named
+ * checkpoint cannot be changed. However, making the behavior conditional would introduce
+ * substantial interface complexity to little benefit.
+ *
+ * 2. When we get here, if we are opening a history store checkpoint, our "current" dhandle in
+ * the session is the history store checkpoint, hs_dhandle is null, and ckpt_snapshot contains
+ * the checkpoint's snapshot and timestamp information.
+ *
+ * If we are opening a history store checkpoint directly from the application (normally the
+ * application should never do this, but one or two tests do) we will get snapshot information
+ * matching the checkpoint. If we are opening a history store checkpoint internally, as part of
+ * an operation on a data store checkpoint cursor, we will have explicitly opened the right
+ * history store checkpoint. The snapshot information may be from a newer checkpoint, but will
+ * not be used.
+ *
+ * 3. To make visibility checks work correctly relative to the checkpoint snapshot, we concoct a
+ * dummy transaction and load the snapshot data into it. This transaction lives in the
+ * checkpoint cursor. It is substituted into session->txn during checkpoint cursor operations.
+ * Note that we do _not_ substitute into txn_shared, so using a checkpoint cursor does not cause
+ * interactions with other threads and in particular does not affect the pinned timestamp
+ * computation. The read timestamp associated with the checkpoint is kept in the dummy
+ * transaction, and there's a (single) special case in the visibility code to check it instead
+ * of the normal read timestamp in txn_shared.
+ *
+ * Global visibility checks that can occur during checkpoint cursor operations need to be
+ * special-cased, because global visibility checks against the current world and not the
+ * checkpoint. There are only a few of these and it seemed more effective to conditionalize them
+ * directly rather than tinkering with the visibility code itself.
+ *
+ * 4. We do not substitute into session->txn if we are already in a checkpoint cursor (that is,
+ * if session->txn is a checkpoint cursor dummy transaction) -- this happens when doing history
+ * store accesses within a data store operation, and means that the history store accesses use
+ * the same snapshot and timestamp information as the data store accesses, which is important
+ * for consistency.
+ *
+ * 5. Because the checkpoint cursor in use is not itself visible in various parts of the
+ * system (most notably the history store code) anything else we need to know about
+ * elsewhere also gets substituted into the session at this point. Currently the only such item
+ * is the name for the matching history store checkpoint.
+ *
+ * 6. When accessing the history store, we will use the history store checkpoint name stashed in
+ * the session if there is one.
+ */
+
+ /* We may have gotten a history store handle, but not if we're the history store. */
+ WT_ASSERT(session, !WT_IS_HS(session->dhandle) || hs_dhandle == NULL);
+
+ /* We should always have snapshot data, though it might be degenerate. */
+ WT_ASSERT(session, ckpt_snapshot != NULL);
+
+ /*
+ * Override the read timestamp if explicitly provided. Otherwise it's the stable timestamp from
+ * the checkpoint. Replace it in the snapshot info if necessary.
+ */
+ WT_ERR_NOTFOUND_OK(
+ __wt_config_gets_def(session, cfg, "debug.checkpoint_read_timestamp", 0, &cval), true);
+ if (ret == 0) {
+ if (cval.len > 0 && cval.val == 0)
+ /*
+ * Allow setting "0" explicitly to mean "none". Otherwise 0 is rejected by the timestamp
+ * parser. Note that the default is not "none", it is the checkpoint's stable timestamp.
+ */
+ ckpt_snapshot->stable_ts = WT_TXN_NONE;
+ else if (cval.val != 0) {
+ WT_ERR(__wt_txn_parse_timestamp(
+ session, "checkpoint_read", &ckpt_snapshot->stable_ts, &cval));
+ /*
+ * Fail if the read timestamp is less than checkpoint's oldest timestamp. Since this is
+ * a debug setting it's not super critical to make it a usable interface, and for
+ * testing it's usually more illuminating to fail if something unexpected happens. If we
+ * end up exposing the checkpoint read timestamp, it might be better to have this always
+ * round up instead, since there's no useful way for the application to get the
+ * checkpoint's oldest timestamp itself.
+ */
+ if (ckpt_snapshot->stable_ts < ckpt_snapshot->oldest_ts)
+ WT_ERR_MSG(session, EINVAL,
+ "checkpoint_read_timestamp must not be before the checkpoint oldest "
+ "timestamp");
+ }
+ }
+
+ /*
+ * Always create the dummy transaction. If we're opening the history store from inside a data
+ * store checkpoint cursor, we'll end up not using it, but we can't easily tell from here
+ * whether that's the case. Pass in the snapshot info.
+ */
+ WT_ERR(__wt_txn_init_checkpoint_cursor(session, ckpt_snapshot, &cbt->checkpoint_txn));
+
+ /*
+ * Stow the history store handle on success. (It will be released further up the call chain if
+ * we fail.)
+ */
+ WT_ASSERT(session, ret == 0);
+ cbt->checkpoint_hs_dhandle = hs_dhandle;
+
+err:
+ return (ret);
+}
+
+/*
* __curfile_create --
* Open a cursor for a given btree handle.
*/
static int
__curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], bool bulk,
- bool bitmap, WT_CURSOR **cursorp)
+ bool bitmap, WT_DATA_HANDLE *hs_dhandle, WT_CKPT_SNAPSHOT *ckpt_snapshot, WT_CURSOR **cursorp)
{
WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */
__wt_cursor_get_value, /* get-value */
@@ -707,8 +933,17 @@ __curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[],
*/
__wt_cursor_dhandle_incr_use(session);
- if (session->dhandle->checkpoint != NULL)
- F_SET(cbt, WT_CBT_NO_TXN | WT_CBT_NO_TRACKING);
+ if (WT_READING_CHECKPOINT(session)) {
+ /* Checkpoint cursor. */
+ if (bulk)
+ /* Fail now; otherwise we fail further down and then segfault trying to recover. */
+ WT_RET_MSG(session, EINVAL, "checkpoints are read-only and cannot be bulk-loaded");
+ WT_RET(__curfile_setup_checkpoint(cbt, cfg, hs_dhandle, ckpt_snapshot));
+ } else {
+ /* We should not have been given the bits used by checkpoint cursors. */
+ WT_ASSERT(session, hs_dhandle == NULL);
+ WT_ASSERT(session, ckpt_snapshot->snapshot_txns == NULL);
+ }
if (bulk) {
F_SET(cursor, WT_CURSTD_BULK);
@@ -767,10 +1002,11 @@ err:
__wt_cursor_dhandle_decr_use(session);
/*
- * Our caller expects to release the data handle if we fail. Disconnect it from the cursor
- * before closing.
+ * Our caller expects to release the data handles if we fail. Disconnect both the main and
+ * any history store handle from the cursor before closing.
*/
cbt->dhandle = NULL;
+ cbt->checkpoint_hs_dhandle = NULL;
WT_TRET(__curfile_close(cursor));
*cursorp = NULL;
@@ -787,15 +1023,25 @@ int
__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[],
WT_CURSOR **cursorp)
{
+ WT_CKPT_SNAPSHOT ckpt_snapshot;
WT_CONFIG_ITEM cval;
+ WT_DATA_HANDLE *hs_dhandle;
WT_DECL_RET;
uint32_t flags;
- bool bitmap, bulk, checkpoint_wait;
+ bool bitmap, bulk, checkpoint_use_history, checkpoint_wait;
+ ckpt_snapshot.snapshot_max = WT_TXN_MAX;
+ ckpt_snapshot.snapshot_min = WT_TXN_MAX;
+ ckpt_snapshot.snapshot_txns = NULL;
+ ckpt_snapshot.snapshot_count = 0;
+ hs_dhandle = NULL;
bitmap = bulk = false;
+ checkpoint_use_history = true;
checkpoint_wait = true;
flags = 0;
+ WT_ASSERT(session, WT_BTREE_PREFIX(uri));
+
/*
* Decode the bulk configuration settings. In memory databases ignore bulk load.
*/
@@ -825,25 +1071,71 @@ __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, c
if (bulk)
LF_SET(WT_BTREE_BULK | WT_DHANDLE_EXCLUSIVE);
- WT_ASSERT(session, WT_BTREE_PREFIX(uri));
+ /* Find out if we're supposed to avoid opening the history store. */
+ WT_RET(__wt_config_gets_def(session, cfg, "checkpoint_use_history", 0, &cval));
+ if (cval.len > 0)
+ checkpoint_use_history = (cval.val != 0);
- /* Get the handle and lock it while the cursor is using it. */
/*
- * If we are opening exclusive and don't want a bulk cursor open to fail with EBUSY due to a
- * database-wide checkpoint, get the handle while holding the checkpoint lock.
+ * This open path is used for checkpoint cursors and bulk cursors as well as ordinary cursors.
+ * Several considerations apply as a result.
+ *
+ * 1. For bulk cursors we need to do an exclusive open. In this case, a running database-wide
+ * checkpoint can result in EBUSY. To avoid this, we can take the checkpoint lock while opening
+ * the dhandle, which causes us to block until any running checkpoint finishes. This is
+ * controlled by the "checkpoint_wait" config. Nothing else does an exclusive open, so the path
+ * with the checkpoint lock is not otherwise reachable.
+ *
+ * 2. For checkpoint cursors it is not safe to take the checkpoint lock here, because the LSM
+ * code opens checkpoint cursors while holding the schema lock and the checkpoint lock is
+ * supposed to come before the schema lock. If there should ever be some reason to do an
+ * exclusive open of a checkpoint cursor, something will have to give.
+ *
+ * 3. If we are opening a checkpoint cursor, we need two dhandles, one for the tree we're
+ * actually trying to open and (unless that's itself the history store) one for the history
+ * store, and also a copy of the snapshot and timestamp metadata for the checkpoint. It's
+ * necessary for data correctness for all three of these to match. There's a complicated scheme
+ * for getting a matching set while avoiding races with a running checkpoint inside the open
+ * logic (see session_dhandle.c) that we fortunately don't need to think about here.
+ *
+ * 4. The LSM code also opens cursors on single-file checkpoints with no corresponding history
+ * store or snapshot information. It takes steps to make sure everything in the checkpoint is
+ * globally visible and sets checkpoint_use_history=false to indicate we shouldn't try to open
+ * the history store or retrieve the snapshot. If we were to try, we'd fail and the LSM code
+ * would get upset.
+ *
+ * 5. To avoid a proliferation of cases, and to avoid repeatedly parsing config strings, we
+ * always pass down the return arguments for the history store dhandle and checkpoint snapshot
+ * information (except for the bulk-only case and the LSM case) and pass the results on to
+ * __curfile_create. We will not get anything back unless we are actually opening a checkpoint
+ * cursor. The open code takes care of the special case of opening a checkpoint cursor on the
+ * history store. (This is not normally done by applications; but it is done by a couple tests,
+ * and furthermore any internally opened history store cursors come through here, so this case
+ * does matter.)
*/
+
+ /* Get the handle and lock it while the cursor is using it. */
if (LF_ISSET(WT_DHANDLE_EXCLUSIVE) && checkpoint_wait)
WT_WITH_CHECKPOINT_LOCK(
- session, ret = __wt_session_get_btree_ckpt(session, uri, cfg, flags));
+ session, ret = __wt_session_get_btree_ckpt(session, uri, cfg, flags, NULL, NULL));
+ else if (checkpoint_use_history)
+ ret = __wt_session_get_btree_ckpt(session, uri, cfg, flags, &hs_dhandle, &ckpt_snapshot);
else
- ret = __wt_session_get_btree_ckpt(session, uri, cfg, flags);
+ ret = __wt_session_get_btree_ckpt(session, uri, cfg, flags, NULL, NULL);
WT_RET(ret);
- WT_ERR(__curfile_create(session, owner, cfg, bulk, bitmap, cursorp));
+ WT_ERR(
+ __curfile_create(session, owner, cfg, bulk, bitmap, hs_dhandle, &ckpt_snapshot, cursorp));
return (0);
err:
+ if (hs_dhandle != NULL)
+ WT_WITH_DHANDLE(session, hs_dhandle, WT_TRET(__wt_session_release_dhandle(session)));
+
+ /* If a snapshot array was returned and hasn't been moved elsewhere, discard it now. */
+ __wt_free(session, ckpt_snapshot.snapshot_txns);
+
/* If the cursor could not be opened, release the handle. */
WT_TRET(__wt_session_release_dhandle(session));
return (ret);
diff --git a/src/third_party/wiredtiger/src/cursor/cur_hs.c b/src/third_party/wiredtiger/src/cursor/cur_hs.c
index 191daf61e29..6b7114bc27f 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_hs.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_hs.c
@@ -24,17 +24,36 @@ __curhs_file_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR *
{
WT_CURSOR *cursor;
WT_DECL_RET;
- const char *open_cursor_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL};
+ size_t len;
+ char *tmp;
+ const char *open_cursor_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL, NULL};
+
+ if (WT_READING_CHECKPOINT(session)) {
+ /*
+ * Propagate the checkpoint setting to the history cursor. Use the indicated history store
+ * checkpoint. If that's null, it means there is no history store checkpoint to read and we
+ * aren't supposed to come here.
+ */
+ WT_ASSERT(session, session->hs_checkpoint != NULL);
+ len = strlen("checkpoint=") + strlen(session->hs_checkpoint) + 1;
+ WT_RET(__wt_malloc(session, len, &tmp));
+ WT_ERR(__wt_snprintf(tmp, len, "checkpoint=%s", session->hs_checkpoint));
+ open_cursor_cfg[1] = tmp;
+ } else
+ tmp = NULL;
WT_WITHOUT_DHANDLE(
session, ret = __wt_open_cursor(session, WT_HS_URI, owner, open_cursor_cfg, &cursor));
- WT_RET(ret);
+ WT_ERR(ret);
/* History store cursors should always ignore tombstones. */
F_SET(cursor, WT_CURSTD_IGNORE_TOMBSTONE);
*cursorp = cursor;
- return (0);
+
+err:
+ __wt_free(session, tmp);
+ return (ret);
}
/*
@@ -430,9 +449,12 @@ __curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
/*
* If the stop time pair on the tombstone in the history store is already globally visible
- * we can skip it.
+ * we can skip it. But only if we aren't reading from a checkpoint. If we're reading from a
+ * checkpoint, we need to see the world as of the checkpoint, and visible-all checks refer
+ * to the current world.
*/
- if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
+ if (!WT_READING_CHECKPOINT(session) &&
+ __wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
WT_STAT_CONN_DATA_INCR(session, cursor_prev_hs_tombstone);
continue;
}
@@ -526,9 +548,12 @@ __curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
/*
* If the stop time pair on the tombstone in the history store is already globally visible
- * we can skip it.
+ * we can skip it. But only if we aren't reading from a checkpoint. If we're reading from a
+ * checkpoint, we need to see the world as of the checkpoint, and visible-all checks refer
+ * to the current world.
*/
- if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
+ if (!WT_READING_CHECKPOINT(session) &&
+ __wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
WT_STAT_CONN_DATA_INCR(session, cursor_next_hs_tombstone);
continue;
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
index 2df2cd7410b..ef8be97bc40 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_stat.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -408,7 +408,7 @@ __curstat_file_init(
return (0);
}
- WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, 0));
+ WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, 0, NULL, NULL));
dhandle = session->dhandle;
/*
diff --git a/src/third_party/wiredtiger/src/docs/durability-checkpoint.dox b/src/third_party/wiredtiger/src/docs/durability-checkpoint.dox
index 3b9425833d5..bda7c1ca1c9 100644
--- a/src/third_party/wiredtiger/src/docs/durability-checkpoint.dox
+++ b/src/third_party/wiredtiger/src/docs/durability-checkpoint.dox
@@ -68,14 +68,19 @@ until those changes are made.
@section checkpoint_cursors Checkpoint cursors
-Cursors are normally opened in the most recent version of a data source.
-However, the \c checkpoint configuration string may be provided
-to WT_SESSION::open_cursor, opening a read-only, static view of the
-data source. This provides a limited form of time-travel, as the static
+Cursors are normally opened in the live version of a data source.
+However, it is also possible to open a read-only, static view of the
+data source as of a checkpoint.
+This is done by passing the \c checkpoint configuration string to
+WT_SESSION::open_cursor.
+This provides a limited form of time-travel, as the static
view is not changed by subsequent checkpoints and will persist until
-the checkpoint cursor is closed. While it is not an error to set a read
-timestamp in a transaction including a checkpoint cursor, it also has no
-effect on the data returned by the checkpoint cursor.
+the checkpoint cursor is closed.
+Checkpoint cursors ignore the currently running transaction; they are (in a
+sense) their own transactions.
+When timestamps are in use, a checkpoint cursor reads at the time
+associated with the checkpoint, which is normally the stable timestamp
+as of the time the checkpoint was taken.
@section checkpoint_naming Checkpoint naming
@@ -90,8 +95,8 @@ replaced, they can be used to save the state of the data for later use.
Internal checkpoints, that is, checkpoints not named by the application, use the
reserved name \c WiredTigerCheckpoint. (All checkpoint names beginning with this
-string are reserved.) Applications can open the most recent of these checkpoints
-by specifying \c WiredTigerCheckpoint as the checkpoint name to
+string are reserved.) Applications can open the most recent checkpoint (whether
+internal or named) by specifying \c WiredTigerCheckpoint as the checkpoint name to
WT_SESSION::open_cursor.
The name "all" is also reserved as it is used when dropping checkpoints.
diff --git a/src/third_party/wiredtiger/src/docs/timestamp-prepare.dox b/src/third_party/wiredtiger/src/docs/timestamp-prepare.dox
index 55dac15dbf8..b629f30d032 100644
--- a/src/third_party/wiredtiger/src/docs/timestamp-prepare.dox
+++ b/src/third_party/wiredtiger/src/docs/timestamp-prepare.dox
@@ -69,6 +69,14 @@ This scenario is not currently detected by WiredTiger; applications are
responsible for avoiding it.
In future versions such transactions might fail.
+\warning
+Similarly, if a transaction has a durable timestamp later than its commit
+timestamp, and a checkpoint is taken while the global stable timestamp
+is between these points, the transaction may or may not be visible
+when the checkpoint is opened with a checkpoint cursor; the behavior
+is unspecified.
+Applications should avoid this situation.
+
@section timestamp_prepare_ignore_prepare Configuring ignore_prepare
The WT_SESSION::begin_transaction method includes the \c ignore_prepare
@@ -89,4 +97,17 @@ transaction to make updates. This can cause data inconsistency problems with
the commit or rollback of the prepared transaction, or the disappearance of
a prepared update by overwriting it.
+Checkpoints taken while a transaction is prepared but not committed
+will not include the prepared transaction; reading from the checkpoint
+with a checkpoint cursor will behave as if the prepared transaction
+did not exist.
+This is comparable to the \c ignore_prepare behavior and carries the
+same consequences: reading the checkpoint and reading the live
+database at the checkpoint's time after the prepared transaction is
+resolved may produce different values.
+This situation can only arise if the stable timestamp is advanced
+(and a checkpoint then taken) while a transaction is prepared and
+still unresolved; applications wishing to rule it out can avoid
+advancing the stable timestamp while a prepared transaction is unresolved.
+
*/
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 462c387871e..90ed9b40b4b 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -2588,8 +2588,8 @@ __verbose_dump_cache_single(WT_SESSION_IMPL *session, uint64_t *total_bytesp,
dhandle = session->dhandle;
btree = dhandle->handle;
WT_RET(__wt_msg(session, "%s(%s%s)%s%s:", dhandle->name,
- dhandle->checkpoint != NULL ? "checkpoint=" : "",
- dhandle->checkpoint != NULL ? dhandle->checkpoint : "<live>",
+ WT_DHANDLE_IS_CHECKPOINT(dhandle) ? "checkpoint=" : "",
+ WT_DHANDLE_IS_CHECKPOINT(dhandle) ? dhandle->checkpoint : "<live>",
btree->evict_disabled != 0 ? " eviction disabled" : "",
btree->evict_disabled_open ? " at open" : ""));
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 15e2286a429..2bc165326df 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -76,7 +76,16 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
evict_flags = LF_ISSET(WT_READ_NO_SPLIT) ? WT_EVICT_CALL_NO_SPLIT : 0;
FLD_SET(evict_flags, WT_EVICT_CALL_URGENT);
- WT_RET(__wt_curhs_cache(session));
+ /*
+ * There is no need to cache a history store cursor if evicting a read-only page. That includes
+ * pages from a checkpoint. Note that opening a history store cursor on a checkpoint page from
+ * here will explode because the identity of the matching history store checkpoint isn't
+ * available.
+ */
+ if (ref->page != NULL && !__wt_page_evict_clean(ref->page)) {
+ WT_ASSERT(session, !WT_READING_CHECKPOINT(session));
+ WT_RET(__wt_curhs_cache(session));
+ }
(void)__wt_atomic_addv32(&btree->evict_busy, 1);
ret = __wt_evict(session, ref, previous_state, evict_flags);
(void)__wt_atomic_subv32(&btree->evict_busy, 1);
diff --git a/src/third_party/wiredtiger/src/history/hs_cursor.c b/src/third_party/wiredtiger/src/history/hs_cursor.c
index f9155063fe1..18bf2830c7d 100644
--- a/src/third_party/wiredtiger/src/history/hs_cursor.c
+++ b/src/third_party/wiredtiger/src/history/hs_cursor.c
@@ -94,7 +94,23 @@ __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
key->size = WT_PTRDIFF(p, recno_key_buf);
}
- WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
+ /*
+ * If reading from a checkpoint, it is possible to get here because the history store is
+ * currently open, but not be able to get a cursor because there was no history store in the
+ * checkpoint. We know this is the case if there's no history store checkpoint name stashed in
+ * the session. In this case, behave the same as if we searched and found nothing. Otherwise, we
+ * should be able to open a cursor on the selected checkpoint; if we fail because it's somehow
+ * disappeared, that's a problem and we shouldn't just silently return no data.
+ */
+ if (WT_READING_CHECKPOINT(session) && session->hs_checkpoint == NULL) {
+ ret = 0;
+ goto done;
+ }
+
+ WT_ERR_NOTFOUND_OK(__wt_curhs_open(session, NULL, &hs_cursor), true);
+ /* Do this separately for now because the behavior below is confusing if it triggers. */
+ WT_ASSERT(session, ret != WT_NOTFOUND);
+ WT_ERR(ret);
/*
* After positioning our cursor, we're stepping backwards to find the correct update. Since the
@@ -104,9 +120,12 @@ __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
* A reader without a timestamp should read the largest timestamp in the range, however cursor
* search near if given a 0 timestamp will place at the top of the range and hide the records
* below it. As such we need to adjust a 0 timestamp to the timestamp max value.
+ *
+ * If reading a checkpoint, use the checkpoint read timestamp instead.
*/
- read_timestamp =
- txn_shared->read_timestamp == WT_TS_NONE ? WT_TS_MAX : txn_shared->read_timestamp;
+ read_timestamp = WT_READING_CHECKPOINT(session) ? session->txn->checkpoint_read_timestamp :
+ txn_shared->read_timestamp;
+ read_timestamp = read_timestamp == WT_TS_NONE ? WT_TS_MAX : read_timestamp;
hs_cursor->set_key(hs_cursor, 4, btree_id, key, read_timestamp, UINT64_MAX);
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true);
diff --git a/src/third_party/wiredtiger/src/include/btree_inline.h b/src/third_party/wiredtiger/src/include/btree_inline.h
index a3f38030c50..7857277c948 100644
--- a/src/third_party/wiredtiger/src/include/btree_inline.h
+++ b/src/third_party/wiredtiger/src/include/btree_inline.h
@@ -707,7 +707,7 @@ __wt_tree_modify_set(WT_SESSION_IMPL *session)
*/
if (!S2BT(session)->modified) {
/* Assert we never dirty a checkpoint handle. */
- WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+ WT_ASSERT(session, !WT_READING_CHECKPOINT(session));
S2BT(session)->modified = true;
WT_FULL_BARRIER();
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
index 2a2638660d6..ce0f223a5d8 100644
--- a/src/third_party/wiredtiger/src/include/cursor.h
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -196,6 +196,15 @@ struct __wt_cursor_btree {
WT_UPDATE_VALUE *upd_value, _upd_value;
/*
+ * Bits used by checkpoint cursor: a private transaction, used to provide the proper read
+ * snapshot, and a reference to the corresponding history store checkpoint, which keeps it from
+ * disappearing under us if it's unnamed and also tracks its identity for use in history store
+ * accesses.
+ */
+ WT_TXN *checkpoint_txn;
+ WT_DATA_HANDLE *checkpoint_hs_dhandle;
+
+ /*
* Fixed-length column-store items are a single byte, and it's simpler and cheaper to allocate
* the space for it now than keep checking to see if we need to grow the buffer.
*/
@@ -223,11 +232,9 @@ struct __wt_cursor_btree {
#define WT_CBT_ITERATE_PREV 0x010u /* Prev iteration configuration */
#define WT_CBT_ITERATE_RETRY_NEXT 0x020u /* Prepare conflict by next. */
#define WT_CBT_ITERATE_RETRY_PREV 0x040u /* Prepare conflict by prev. */
-#define WT_CBT_NO_TRACKING 0x080u /* Non tracking cursor. */
-#define WT_CBT_NO_TXN 0x100u /* Non-txn cursor (e.g. a checkpoint) */
-#define WT_CBT_READ_ONCE 0x200u /* Page in with WT_READ_WONT_NEED */
-#define WT_CBT_SEARCH_SMALLEST 0x400u /* Row-store: small-key insert list */
-#define WT_CBT_VAR_ONPAGE_MATCH 0x800u /* Var-store: on-page recno match */
+#define WT_CBT_READ_ONCE 0x080u /* Page in with WT_READ_WONT_NEED */
+#define WT_CBT_SEARCH_SMALLEST 0x100u /* Row-store: small-key insert list */
+#define WT_CBT_VAR_ONPAGE_MATCH 0x200u /* Var-store: on-page recno match */
/* AUTOMATIC FLAG VALUE GENERATION STOP 32 */
#define WT_CBT_POSITION_MASK /* Flags associated with position */ \
diff --git a/src/third_party/wiredtiger/src/include/cursor_inline.h b/src/third_party/wiredtiger/src/include/cursor_inline.h
index 65262eeea08..b16b2086e53 100644
--- a/src/third_party/wiredtiger/src/include/cursor_inline.h
+++ b/src/third_party/wiredtiger/src/include/cursor_inline.h
@@ -225,7 +225,7 @@ __cursor_reset(WT_CURSOR_BTREE *cbt)
/* If the cursor was active, deactivate it. */
if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
- if (!F_ISSET(cbt, WT_CBT_NO_TRACKING))
+ if (!WT_READING_CHECKPOINT(session))
__cursor_leave(session);
F_CLR(cbt, WT_CBT_ACTIVE);
}
@@ -234,7 +234,7 @@ __cursor_reset(WT_CURSOR_BTREE *cbt)
* When the count of active cursors in the session goes to zero, there are no active cursors,
* and we can release any snapshot we're holding for read committed isolation.
*/
- if (session->ncursors == 0 && !F_ISSET(cbt, WT_CBT_NO_TXN))
+ if (session->ncursors == 0 && !WT_READING_CHECKPOINT(session))
__wt_txn_read_last(session);
/* If we're not holding a cursor reference, we're done. */
@@ -400,7 +400,7 @@ __wt_cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter)
/* Activate the file cursor. */
if (!F_ISSET(cbt, WT_CBT_ACTIVE)) {
- if (!F_ISSET(cbt, WT_CBT_NO_TRACKING))
+ if (!WT_READING_CHECKPOINT(session))
WT_RET(__cursor_enter(session));
F_SET(cbt, WT_CBT_ACTIVE);
}
@@ -408,7 +408,7 @@ __wt_cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter)
/*
* If this is an ordinary transactional cursor, make sure we are set up to read.
*/
- if (!F_ISSET(cbt, WT_CBT_NO_TXN))
+ if (!WT_READING_CHECKPOINT(session))
__wt_txn_cursor_op(session);
return (0);
}
diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h
index ad967876010..829d766f7c4 100644
--- a/src/third_party/wiredtiger/src/include/dhandle.h
+++ b/src/third_party/wiredtiger/src/include/dhandle.h
@@ -55,6 +55,8 @@
WT_DHANDLE_ACQUIRE(dhandle); \
} while (0)
+#define WT_DHANDLE_IS_CHECKPOINT(dhandle) ((dhandle)->checkpoint != NULL)
+
/*
* WT_DATA_HANDLE --
* A handle for a generic named data source.
@@ -64,12 +66,13 @@ struct __wt_data_handle {
TAILQ_ENTRY(__wt_data_handle) q;
TAILQ_ENTRY(__wt_data_handle) hashq;
- const char *name; /* Object name as a URI */
- uint64_t name_hash; /* Hash of name */
- const char *checkpoint; /* Checkpoint name (or NULL) */
- const char **cfg; /* Configuration information */
- const char *meta_base; /* Base metadata configuration */
- size_t meta_base_length; /* Base metadata length */
+ const char *name; /* Object name as a URI */
+ uint64_t name_hash; /* Hash of name */
+ const char *checkpoint; /* Checkpoint name (or NULL) */
+ int64_t checkpoint_order; /* Checkpoint order number, when applicable */
+ const char **cfg; /* Configuration information */
+ const char *meta_base; /* Base metadata configuration */
+ size_t meta_base_length; /* Base metadata length */
#ifdef HAVE_DIAGNOSTIC
const char *orig_meta_base; /* Copy of the base metadata configuration */
#endif
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 7ebf7ba83e1..8c7eb120d56 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1049,10 +1049,14 @@ extern int __wt_meta_block_metadata(WT_SESSION_IMPL *session, const char *config
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint,
WT_CKPT *ckpt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_meta_checkpoint_by_name(WT_SESSION_IMPL *session, const char *uri,
+ const char *checkpoint, int64_t *orderp, uint64_t *timep)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_meta_checkpoint_last_name(WT_SESSION_IMPL *session, const char *fname,
- const char **namep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_meta_checkpoint_last_name(
+ WT_SESSION_IMPL *session, const char *fname, const char **namep, int64_t *orderp, uint64_t *timep)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_ckptlist_get(WT_SESSION_IMPL *session, const char *fname, bool update,
WT_CKPT **ckptbasep, size_t *allocated) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_ckptlist_get_from_config(WT_SESSION_IMPL *session, bool update,
@@ -1064,8 +1068,17 @@ extern int __wt_meta_ckptlist_to_meta(WT_SESSION_IMPL *session, WT_CKPT *ckptbas
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_ckptlist_update_config(WT_SESSION_IMPL *session, WT_CKPT *ckptbase,
const char *oldcfg, char **newcfgp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_meta_sysinfo_set(WT_SESSION_IMPL *session)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_meta_read_checkpoint_oldest(WT_SESSION_IMPL *session, const char *ckpt_name,
+ wt_timestamp_t *timestampp, uint64_t *ckpttime) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_meta_read_checkpoint_snapshot(WT_SESSION_IMPL *session, const char *ckpt_name,
+ uint64_t *snap_min, uint64_t *snap_max, uint64_t **snapshot, uint32_t *snapshot_count,
+ uint64_t *ckpttime) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_meta_read_checkpoint_timestamp(WT_SESSION_IMPL *session, const char *ckpt_name,
+ wt_timestamp_t *timestampp, uint64_t *ckpttime) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_meta_sysinfo_clear(WT_SESSION_IMPL *session, const char *name, size_t namelen)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_meta_sysinfo_set(WT_SESSION_IMPL *session, bool full, const char *name,
+ size_t namelen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_track_checkpoint(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_meta_track_destroy(WT_SESSION_IMPL *session)
@@ -1376,7 +1389,8 @@ extern int __wt_session_create(WT_SESSION_IMPL *session, const char *uri, const
extern int __wt_session_cursor_cache_sweep(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[],
- uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ uint32_t flags, WT_DATA_HANDLE **hs_dhandlep, WT_CKPT_SNAPSHOT *ckpt_snapshot)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_session_get_dhandle(WT_SESSION_IMPL *session, const char *uri,
const char *checkpoint, const char *cfg[], uint32_t flags)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -1553,6 +1567,8 @@ extern int __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char **cfg)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_txn_init_checkpoint_cursor(WT_SESSION_IMPL *session, WT_CKPT_SNAPSHOT *snapinfo,
+ WT_TXN **txn_ret) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_is_blocking(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[])
@@ -1862,6 +1878,7 @@ extern void __wt_timestamp_to_hex_string(wt_timestamp_t ts, char *hex_timestamp)
extern void __wt_txn_bump_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_clear_durable_timestamp(WT_SESSION_IMPL *session);
extern void __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session);
+extern void __wt_txn_close_checkpoint_cursor(WT_SESSION_IMPL *session, WT_TXN **txn_arg);
extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session);
diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h
index 7daa9e5e162..c94e62101f5 100644
--- a/src/third_party/wiredtiger/src/include/meta.h
+++ b/src/third_party/wiredtiger/src/include/meta.h
@@ -38,11 +38,15 @@
#define WT_SYSTEM_CKPT_URI "system:checkpoint" /* Checkpoint timestamp URI */
#define WT_SYSTEM_OLDEST_TS "oldest_timestamp" /* Oldest timestamp name */
#define WT_SYSTEM_OLDEST_URI "system:oldest" /* Oldest timestamp URI */
+#define WT_SYSTEM_TS_TIME "checkpoint_time" /* Checkpoint wall time */
+#define WT_SYSTEM_TS_WRITE_GEN "write_gen" /* Checkpoint write generation */
#define WT_SYSTEM_CKPT_SNAPSHOT "snapshots" /* List of snapshots */
#define WT_SYSTEM_CKPT_SNAPSHOT_MIN "snapshot_min" /* Snapshot minimum */
#define WT_SYSTEM_CKPT_SNAPSHOT_MAX "snapshot_max" /* Snapshot maximum */
#define WT_SYSTEM_CKPT_SNAPSHOT_COUNT "snapshot_count" /* Snapshot count */
#define WT_SYSTEM_CKPT_SNAPSHOT_URI "system:checkpoint_snapshot" /* Checkpoint snapshot URI */
+#define WT_SYSTEM_CKPT_SNAPSHOT_TIME "checkpoint_time" /* Checkpoint wall time */
+#define WT_SYSTEM_CKPT_SNAPSHOT_WRITE_GEN "write_gen" /* Checkpoint write generation */
#define WT_SYSTEM_BASE_WRITE_GEN_URI "system:checkpoint_base_write_gen" /* Base write gen URI */
#define WT_SYSTEM_BASE_WRITE_GEN "base_write_gen" /* Base write gen name */
@@ -165,3 +169,16 @@ struct __wt_ckpt {
/* AUTOMATIC FLAG VALUE GENERATION STOP 32 */
uint32_t flags;
};
+
+/*
+ * WT_CKPT_SNAPSHOT --
+ * Snapshot and timestamp information associated with a checkpoint.
+ */
+struct __wt_ckpt_snapshot {
+ uint64_t oldest_ts;
+ uint64_t stable_ts;
+ uint64_t snapshot_min;
+ uint64_t snapshot_max;
+ uint64_t *snapshot_txns;
+ uint32_t snapshot_count;
+};
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index 58551f5b702..2ef1f440fa5 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -141,11 +141,19 @@ struct __wt_session_impl {
void *block_manager; /* Block-manager support */
int (*block_manager_cleanup)(WT_SESSION_IMPL *);
+ const char *hs_checkpoint; /* History store checkpoint name, during checkpoint cursor ops */
+
/* Checkpoint handles */
WT_DATA_HANDLE **ckpt_handle; /* Handle list */
u_int ckpt_handle_next; /* Next empty slot */
size_t ckpt_handle_allocated; /* Bytes allocated */
+ /* Named checkpoint drop list, during a checkpoint */
+ WT_ITEM *ckpt_drop_list;
+
+ /* Checkpoint time of current checkpoint, during a checkpoint */
+ uint64_t current_ckpt_sec;
+
/*
* Operations acting on handles.
*
@@ -288,3 +296,6 @@ struct __wt_session_impl {
WT_SESSION_STATS stats;
};
+
+/* Consider moving this to session_inline.h if it ever appears. */
+#define WT_READING_CHECKPOINT(s) ((s)->dhandle != NULL && WT_DHANDLE_IS_CHECKPOINT((s)->dhandle))
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 7c60dfc23bb..c167adeeb15 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -290,6 +290,11 @@ struct __wt_txn {
*/
wt_timestamp_t prepare_timestamp;
+ /*
+ * Timestamp used for reading via a checkpoint cursor instead of txn_shared->read_timestamp.
+ */
+ wt_timestamp_t checkpoint_read_timestamp;
+
/* Array of modifications by this transaction. */
WT_TXN_OP *mod;
size_t mod_alloc;
@@ -333,16 +338,17 @@ struct __wt_txn {
#define WT_TXN_HAS_TS_DURABLE 0x00020u
#define WT_TXN_HAS_TS_PREPARE 0x00040u
#define WT_TXN_IGNORE_PREPARE 0x00080u
-#define WT_TXN_PREPARE 0x00100u
-#define WT_TXN_PREPARE_IGNORE_API_CHECK 0x00200u
-#define WT_TXN_READONLY 0x00400u
-#define WT_TXN_RUNNING 0x00800u
-#define WT_TXN_SHARED_TS_DURABLE 0x01000u
-#define WT_TXN_SHARED_TS_READ 0x02000u
-#define WT_TXN_SYNC_SET 0x04000u
-#define WT_TXN_TS_ROUND_PREPARED 0x08000u
-#define WT_TXN_TS_ROUND_READ 0x10000u
-#define WT_TXN_UPDATE 0x20000u
+#define WT_TXN_IS_CHECKPOINT 0x00100u
+#define WT_TXN_PREPARE 0x00200u
+#define WT_TXN_PREPARE_IGNORE_API_CHECK 0x00400u
+#define WT_TXN_READONLY 0x00800u
+#define WT_TXN_RUNNING 0x01000u
+#define WT_TXN_SHARED_TS_DURABLE 0x02000u
+#define WT_TXN_SHARED_TS_READ 0x04000u
+#define WT_TXN_SYNC_SET 0x08000u
+#define WT_TXN_TS_ROUND_PREPARED 0x10000u
+#define WT_TXN_TS_ROUND_READ 0x20000u
+#define WT_TXN_UPDATE 0x40000u
/* AUTOMATIC FLAG VALUE GENERATION STOP 32 */
uint32_t flags;
diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h
index 25e823b661c..e0ff2f35a26 100644
--- a/src/third_party/wiredtiger/src/include/txn_inline.h
+++ b/src/third_party/wiredtiger/src/include/txn_inline.h
@@ -697,8 +697,8 @@ __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
if (txn->isolation == WT_ISO_READ_UNCOMMITTED)
return (true);
- /* Otherwise, we should be called with a snapshot or we are in a checkpoint cursor. */
- WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) || session->dhandle->checkpoint != NULL);
+ /* Otherwise, we should be called with a snapshot. */
+ WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
return (__wt_txn_visible_id_snapshot(
id, txn->snap_min, txn->snap_max, txn->snapshot, txn->snapshot_count));
@@ -728,6 +728,9 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id, wt_timestamp_t timestamp
if (!F_ISSET(txn, WT_TXN_SHARED_TS_READ) || timestamp == WT_TS_NONE)
return (true);
+ if (WT_READING_CHECKPOINT(session))
+ return (timestamp <= txn->checkpoint_read_timestamp);
+
return (timestamp <= txn_shared->read_timestamp);
}
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 7c832ec6ff0..9dd3aae862f 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -911,8 +911,8 @@ struct __wt_session {
* Bulk-loaded bitmap values must end on a byte boundary relative to the bit count (except
* for the last set of values loaded)., a string; default \c false.}
* @config{checkpoint, the name of a checkpoint to open (the reserved name
- * "WiredTigerCheckpoint" opens the most recent internal checkpoint taken for the object).
- * The cursor does not support data modification., a string; default empty.}
+ * "WiredTigerCheckpoint" opens the most recent checkpoint taken for the object). The cursor
+ * does not support data modification., a string; default empty.}
* @config{debug = (, configure debug specific behavior on a cursor. Generally only used
* for internal testing purposes., a set of related configuration options defined below.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;dump_version, open a version cursor\, which is a debug
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index 1fa544ab841..8c393ff2a7d 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -113,6 +113,8 @@ struct __wt_cell_unpack_kv;
typedef struct __wt_cell_unpack_kv WT_CELL_UNPACK_KV;
struct __wt_ckpt;
typedef struct __wt_ckpt WT_CKPT;
+struct __wt_ckpt_snapshot;
+typedef struct __wt_ckpt_snapshot WT_CKPT_SNAPSHOT;
struct __wt_col;
typedef struct __wt_col WT_COL;
struct __wt_col_fix_auxiliary_header;
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index f4055f7b706..3cb9a9bf46e 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -452,7 +452,7 @@ __clsm_open_cursors(WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_
return (0);
ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
- ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
+ ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw,checkpoint_use_history=false";
ckpt_cfg[2] = NULL;
/*
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
index 71e32ecf3d7..1d04124a4a3 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_merge.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
@@ -502,7 +502,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
* Open a handle on the new chunk before application threads attempt to access it, opening it
* pre-loads internal pages into the file system cache.
*/
- cfg[1] = "checkpoint=" WT_CHECKPOINT;
+ cfg[1] = "checkpoint=" WT_CHECKPOINT ",checkpoint_use_history=false";
WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
WT_TRET(dest->close(dest));
dest = NULL;
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index f4c09631439..259ebac5aac 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -9,7 +9,7 @@
#include "wt_internal.h"
static int __ckpt_last(WT_SESSION_IMPL *, const char *, WT_CKPT *);
-static int __ckpt_last_name(WT_SESSION_IMPL *, const char *, const char **);
+static int __ckpt_last_name(WT_SESSION_IMPL *, const char *, const char **, int64_t *, uint64_t *);
static int __ckpt_load(WT_SESSION_IMPL *, WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, WT_CKPT *);
static int __ckpt_named(WT_SESSION_IMPL *, const char *, const char *, WT_CKPT *);
static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *, bool);
@@ -146,10 +146,12 @@ err:
/*
* __wt_meta_checkpoint_last_name --
- * Return the last unnamed checkpoint's name.
+ * Return the last unnamed checkpoint's name. Return the order number and wall-clock time if
+ * requested so the caller can check for races with a currently running checkpoint.
*/
int
-__wt_meta_checkpoint_last_name(WT_SESSION_IMPL *session, const char *fname, const char **namep)
+__wt_meta_checkpoint_last_name(
+ WT_SESSION_IMPL *session, const char *fname, const char **namep, int64_t *orderp, uint64_t *timep)
{
WT_DECL_RET;
char *config;
@@ -157,13 +159,64 @@ __wt_meta_checkpoint_last_name(WT_SESSION_IMPL *session, const char *fname, cons
config = NULL;
/* Retrieve the metadata entry for the file. */
- WT_ERR(__wt_metadata_search(session, fname, &config));
+ WT_RET(__wt_metadata_search(session, fname, &config));
/* Check the major/minor version numbers. */
WT_ERR(__ckpt_version_chk(session, fname, config));
/* Retrieve the name of the last unnamed checkpoint. */
- WT_ERR(__ckpt_last_name(session, config, namep));
+ WT_ERR(__ckpt_last_name(session, config, namep, orderp, timep));
+
+err:
+ __wt_free(session, config);
+ return (ret);
+}
+
+/*
+ * __wt_meta_checkpoint_by_name --
+ * Look up the requested named checkpoint in the metadata and return its order and time
+ * information.
+ */
+int
+__wt_meta_checkpoint_by_name(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint,
+ int64_t *orderp, uint64_t *timep)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM a, k, v;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ char *config;
+
+ conn = S2C(session);
+ config = NULL;
+ *orderp = 0;
+ *timep = 0;
+
+ /* Retrieve the metadata entry. */
+ WT_RET(__wt_metadata_search(session, uri, &config));
+
+ /* Check the major/minor version numbers. */
+ WT_ERR(__ckpt_version_chk(session, uri, config));
+
+ WT_ERR(__wt_config_getones(session, config, "checkpoint", &v));
+ __wt_config_subinit(session, &ckptconf, &v);
+
+ /*
+ * Take the first match: there should never be more than a single checkpoint of any name.
+ */
+ while (__wt_config_next(&ckptconf, &k, &v) == 0)
+ if (WT_STRING_MATCH(checkpoint, k.str, k.len)) {
+
+ WT_ERR(__wt_config_subgets(session, &v, "order", &a));
+ if (a.val > 0)
+ *orderp = a.val;
+ WT_ERR(__wt_config_subgets(session, &v, "write_gen", &a));
+ if ((uint64_t)a.val >= conn->base_write_gen) {
+ WT_ERR(__wt_config_subgets(session, &v, "time", &a));
+ *timep = (uint64_t)a.val;
+ }
+ break;
+ }
err:
__wt_free(session, config);
@@ -308,40 +361,55 @@ __ckpt_last(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt)
/*
* __ckpt_last_name --
- * Return the name associated with the file's last unnamed checkpoint.
+ * Return the name associated with the file's last unnamed checkpoint. Except: in keeping with
+ * global snapshot/timestamp metadata being about the most recent checkpoint (named or unnamed),
+ * we return the most recent checkpoint (named or unnamed), since all callers need a checkpoint
+ * that matches the snapshot info they're using.
*/
static int
-__ckpt_last_name(WT_SESSION_IMPL *session, const char *config, const char **namep)
+__ckpt_last_name(WT_SESSION_IMPL *session, const char *config, const char **namep, int64_t *orderp,
+ uint64_t *timep)
{
WT_CONFIG ckptconf;
WT_CONFIG_ITEM a, k, v;
+ WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ uint64_t time;
int64_t found;
+ conn = S2C(session);
*namep = NULL;
+ time = 0;
WT_ERR(__wt_config_getones(session, config, "checkpoint", &v));
__wt_config_subinit(session, &ckptconf, &v);
for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) {
- /*
- * We only care about unnamed checkpoints; applications may not use any matching prefix as a
- * checkpoint name, the comparison is pretty simple.
- */
- if (k.len < strlen(WT_CHECKPOINT) ||
- strncmp(k.str, WT_CHECKPOINT, strlen(WT_CHECKPOINT)) != 0)
- continue;
- /* Ignore checkpoints before the ones we've already seen. */
+ /* Ignore checkpoints before (by the order numbering) the ones we've already seen. */
WT_ERR(__wt_config_subgets(session, &v, "order", &a));
if (found && a.val < found)
continue;
+ found = a.val;
+
+ /* If the write generation is current, extract the wall-clock time for matching purposes. */
+ WT_ERR(__wt_config_subgets(session, &v, "write_gen", &a));
+ if ((uint64_t)a.val >= conn->base_write_gen) {
+ WT_ERR(__wt_config_subgets(session, &v, "time", &a));
+ time = (uint64_t)a.val;
+ } else
+ time = 0;
__wt_free(session, *namep);
WT_ERR(__wt_strndup(session, k.str, k.len, namep));
- found = a.val;
}
if (!found)
ret = WT_NOTFOUND;
+ else {
+ if (orderp != NULL)
+ *orderp = found;
+ if (timep != NULL)
+ *timep = time;
+ }
if (0) {
err:
@@ -566,12 +634,9 @@ __meta_ckptlist_allocate_new_ckpt(
WT_SESSION_IMPL *session, WT_CKPT **ckptbasep, size_t *allocated, const char *config)
{
WT_CKPT *ckptbase, *ckpt;
- WT_CONNECTION_IMPL *conn;
size_t slot;
- uint64_t most_recent;
ckptbase = *ckptbasep;
- conn = S2C(session);
slot = 0;
if (ckptbase != NULL)
@@ -598,27 +663,29 @@ __meta_ckptlist_allocate_new_ckpt(
ckpt = &ckptbase[slot];
ckpt->order = (slot == 0) ? 1 : ckptbase[slot - 1].order + 1;
- __wt_seconds(session, &ckpt->sec);
+
+ ckpt->sec = session->current_ckpt_sec;
+ WT_ASSERT(session, ckpt->sec > 0);
+
/*
- * Update time value for most recent checkpoint, not letting it move backwards. It is possible
- * to race here, so use atomic CAS. This code relies on the fact that anyone we race with will
- * only increase (never decrease) the most recent checkpoint time value.
+ * If we're adding a checkpoint, in general it should be newer than the previous one according
+ * to the time field. However, we don't try to crosscheck that here because it's not quite
+ * always true, and ultimately it doesn't matter.
+ *
+ * First, if the previous checkpoint is from an earlier database run its time might be off,
+ * either because of issues with the system clock or because the checkpoint clock got run
+ * forward (see notes in txn_ckpt.c) and we crashed and restarted and are still behind it. This
+ * could be ruled out by checking the write generation.
*
- * Update the time for this checkpoint not letting it move backwards. If tiered storage is in
- * use move it at least up to the most recent flush. Then move it up to at least the most
- * checkpoint.
+ * Second, a single-tree checkpoint can occur while a global checkpoint is in progress. In that
+ * case the global checkpoint will have an earlier time, but might get to the tree in question
+ * later. With WT-8695 this should only be possible with the metadata, so we could rule it out
+ * by only checking non-metadata files.
*
- * NOTE: reading the most recent flush time is not an ordered read because currently checkpoint
- * and flush tier are mutually exclusive.
+ * Third, it appears to be possible for a close checkpoint to occur while a global checkpoint is
+ * in progress, with the same consequences. There doesn't seem to be any obvious way to detect
+ * and rule out this case.
*/
- for (;;) {
- WT_ORDERED_READ(most_recent, conn->ckpt_most_recent);
- ckpt->sec = WT_MAX(ckpt->sec, conn->flush_most_recent);
- ckpt->sec = WT_MAX(ckpt->sec, most_recent);
- if (ckpt->sec == most_recent ||
- __wt_atomic_cas64(&conn->ckpt_most_recent, most_recent, ckpt->sec))
- break;
- }
/* Either load block mods from the config, or from the previous checkpoint. */
WT_RET(
@@ -984,7 +1051,7 @@ err:
/*
* __wt_metadata_correct_base_write_gen --
- * Update the connection's base write generation from all files in metadata at then end of the
+ * Update the connection's base write generation from all files in metadata at the end of the
* recovery checkpoint.
*/
int
@@ -1268,76 +1335,177 @@ __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
}
/*
+ * __meta_print_snapshot --
+ * Generate the text form of the checkpoint's snapshot for recording in the metadata.
+ */
+static int
+__meta_print_snapshot(WT_SESSION_IMPL *session, WT_ITEM *buf)
+{
+ WT_TXN *txn;
+ uint32_t snap_count;
+
+ /* The snapshot written is the one held by this session's transaction. */
+ txn = session->txn;
+
+ /* Start the buffer over with the snapshot bounds and the number of IDs in the list. */
+ WT_RET(__wt_buf_fmt(session, buf,
+ WT_SYSTEM_CKPT_SNAPSHOT_MIN "=%" PRIu64 "," WT_SYSTEM_CKPT_SNAPSHOT_MAX "=%" PRIu64
+ "," WT_SYSTEM_CKPT_SNAPSHOT_COUNT "=%" PRIu32,
+ txn->snap_min, txn->snap_max, txn->snapshot_count));
+
+ /*
+ * Only emit the bracketed ID list when it is non-empty. Every ID but the last is followed by a
+ * comma; the last one (snap_count == snapshot_count - 1 after the loop) closes the list.
+ */
+ if (txn->snapshot_count > 0) {
+ WT_RET(__wt_buf_catfmt(session, buf, "," WT_SYSTEM_CKPT_SNAPSHOT "=["));
+ for (snap_count = 0; snap_count < txn->snapshot_count - 1; ++snap_count)
+ WT_RET(__wt_buf_catfmt(session, buf, "%" PRIu64 "%s", txn->snapshot[snap_count], ","));
+
+ WT_RET(__wt_buf_catfmt(session, buf, "%" PRIu64 "%s", txn->snapshot[snap_count], "]"));
+ }
+
+ /*
+ * Append the checkpoint wall-clock time and the connection's base write generation.
+ * NOTE(review): these are written under the WT_SYSTEM_TS_* keys while the reader looks them up
+ * with WT_SYSTEM_CKPT_SNAPSHOT_TIME/_WRITE_GEN; presumably the macros expand to the same
+ * strings -- confirm.
+ */
+ WT_RET(__wt_buf_catfmt(session, buf,
+ "," WT_SYSTEM_TS_TIME "=%" PRIu64 "," WT_SYSTEM_TS_WRITE_GEN "=%" PRIu64,
+ session->current_ckpt_sec, S2C(session)->base_write_gen));
+
+ return (0);
+}
+
+/*
+ * __meta_sysinfo_update --
+ * Helper to update a system metadata entry for the most recent and/or a named checkpoint. If
+ * "full" is set the value is stored under "uri"; if "name" is not null the value is also stored
+ * under "uri.name". The name is given as pointer plus length and need not be nul-terminated.
+ * The buffer "buf" is only used (to build the per-name key) when "name" is not null.
+ */
+static int
+__meta_sysinfo_update(WT_SESSION_IMPL *session, bool full, const char *name, size_t namelen,
+ WT_ITEM *buf, const char *uri, const char *value)
+{
+ /* Most-recent-checkpoint entry. */
+ if (full)
+ WT_RET(__wt_metadata_update(session, uri, value));
+ /* Per-name entry, keyed as "<uri>.<name>". */
+ if (name != NULL) {
+ WT_RET(__wt_buf_fmt(session, buf, "%s.%.*s", uri, (int)namelen, name));
+ WT_RET(__wt_metadata_update(session, buf->data, value));
+ }
+ return (0);
+}
+
+/*
+ * __meta_sysinfo_remove --
+ * Helper to remove a system metadata entry for the most recent and/or a named checkpoint. If
+ * "full" is set the entry "uri" is removed; if "name" is not null the entry "uri.name" is also
+ * removed. The removals go through WT_RET_NOTFOUND_OK, so (per the macro name) an entry that
+ * is already absent is not reported as an error. The buffer "buf" is only used (to build the
+ * per-name key) when "name" is not null.
+ */
+static int
+__meta_sysinfo_remove(WT_SESSION_IMPL *session, bool full, const char *name, size_t namelen,
+ WT_ITEM *buf, const char *uri)
+{
+ /* Most-recent-checkpoint entry. */
+ if (full)
+ WT_RET_NOTFOUND_OK(__wt_metadata_remove(session, uri));
+ /* Per-name entry, keyed as "<uri>.<name>". */
+ if (name != NULL) {
+ WT_RET(__wt_buf_fmt(session, buf, "%s.%.*s", uri, (int)namelen, name));
+ WT_RET_NOTFOUND_OK(__wt_metadata_remove(session, buf->data));
+ }
+ return (0);
+}
+
+/*
* __wt_meta_sysinfo_set --
* Set the system information in the metadata.
*/
int
-__wt_meta_sysinfo_set(WT_SESSION_IMPL *session)
+__wt_meta_sysinfo_set(WT_SESSION_IMPL *session, bool full, const char *name, size_t namelen)
{
- WT_DECL_ITEM(buf);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(uribuf);
+ WT_DECL_ITEM(valbuf);
WT_DECL_RET;
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
wt_timestamp_t oldest_timestamp;
- uint32_t snap_count;
char hex_timestamp[WT_TS_HEX_STRING_SIZE];
char ts_string[2][WT_TS_INT_STRING_SIZE];
- txn_global = &S2C(session)->txn_global;
-
- txn = session->txn;
- WT_ERR(__wt_scr_alloc(session, 1024, &buf));
/*
- * We need to record the timestamp of the checkpoint in the metadata. The timestamp value is set
- * at a higher level, either in checkpoint or in recovery.
+ * Write the checkpoint timestamp and snapshot information to the metadata. For any full
+ * checkpoint, including most named checkpoints, write the most recent checkpoint's entries. For
+ * all named checkpoints, whether or not full, write it to that checkpoint's entries by name.
+ * This writes out two copies for most named checkpoints, but that's ok.
+ *
+ * The most recent checkpoint's entries are
+ * system:checkpoint (contains checkpoint_timestamp=TS)
+ * system:oldest (contains oldest_timestamp=TS)
+ * system:checkpoint_snapshot (contains snapshot_{min,max}=TXN, snapshot_count=N,
+ * snapshots=[TXN,TXN,...])
+ * and a named checkpoint's entries are
+ * system:checkpoint.NAME
+ * system:oldest.NAME
+ * system:checkpoint_snapshot.NAME
+ * with the same contents.
+ *
+ * All three entries also include time=SEC and write_gen=WRITE-GEN, where the time is the wall
+ * clock time (not timestamp) from the checkpoint and the write generation is the base write
+ * generation as of when the checkpoint was taken. This information relates the metadata info to
+ * specific tree-level checkpoints.
+ *
+ * We also write the base write generation to system:checkpoint_base_write_gen for full
+ * checkpoints. This information doesn't appear needed for named checkpoints and isn't written.
+ *
+ * The checkpoint timestamp written is set by higher-level code, either in checkpoint or in
+ * recovery.
+ *
+ * We also need to record the oldest timestamp in the metadata so we can set it on startup. The
+ * checkpoint's oldest timestamp is the minimum of the current oldest timestamp and the
+ * checkpoint timestamp.
+ *
+ * For both timestamps, don't store zero entries in the metadata: remove the entry instead. This
+ * avoids downgrade issues if the metadata is opened with an older version of WiredTiger that
+ * doesn't understand the new entry.
*/
- __wt_timestamp_to_hex_string(txn_global->meta_ckpt_timestamp, hex_timestamp);
- /*
- * Don't leave a zero entry in the metadata: remove it. This avoids downgrade issues if the
- * metadata is opened with an older version of WiredTiger that does not understand the new
- * entry.
- */
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ txn = session->txn;
+ if (name != NULL)
+ WT_ERR(__wt_scr_alloc(session, namelen + 128, &uribuf));
+ WT_ERR(__wt_scr_alloc(session, 1024, &valbuf));
+
+ /* Handle the checkpoint timestamp. */
+
+ __wt_timestamp_to_hex_string(txn_global->meta_ckpt_timestamp, hex_timestamp);
if (strcmp(hex_timestamp, "0") == 0)
- WT_ERR_NOTFOUND_OK(__wt_metadata_remove(session, WT_SYSTEM_CKPT_URI), false);
+ WT_ERR(__meta_sysinfo_remove(session, full, name, namelen, uribuf, WT_SYSTEM_CKPT_URI));
else {
- WT_ERR(__wt_buf_fmt(session, buf, WT_SYSTEM_CKPT_TS "=\"%s\"", hex_timestamp));
- WT_ERR(__wt_metadata_update(session, WT_SYSTEM_CKPT_URI, buf->data));
+ WT_ERR(__wt_buf_fmt(session, valbuf,
+ WT_SYSTEM_CKPT_TS "=\"%s\"," WT_SYSTEM_TS_TIME "=%" PRIu64 "," WT_SYSTEM_TS_WRITE_GEN
+ "=%" PRIu64,
+ hex_timestamp, session->current_ckpt_sec, conn->base_write_gen));
+ WT_ERR(__meta_sysinfo_update(
+ session, full, name, namelen, uribuf, WT_SYSTEM_CKPT_URI, valbuf->data));
}
/*
- * We also need to record the oldest timestamp in the metadata so we can set it on startup. We
- * should set the checkpoint's oldest timestamp as the minimum of the current oldest timestamp
- * and the checkpoint timestamp.
+ * Handle the oldest timestamp.
*
* Cache the oldest timestamp and use a read barrier to prevent us from reading two different
* values of the oldest timestamp.
*/
+
oldest_timestamp = txn_global->oldest_timestamp;
WT_READ_BARRIER();
__wt_timestamp_to_hex_string(
WT_MIN(oldest_timestamp, txn_global->meta_ckpt_timestamp), hex_timestamp);
+
if (strcmp(hex_timestamp, "0") == 0)
- WT_ERR_NOTFOUND_OK(__wt_metadata_remove(session, WT_SYSTEM_OLDEST_URI), false);
+ WT_ERR(__meta_sysinfo_remove(session, full, name, namelen, uribuf, WT_SYSTEM_OLDEST_URI));
else {
- WT_ERR(__wt_buf_fmt(session, buf, WT_SYSTEM_OLDEST_TS "=\"%s\"", hex_timestamp));
- WT_ERR(__wt_metadata_update(session, WT_SYSTEM_OLDEST_URI, buf->data));
+ WT_ERR(__wt_buf_fmt(session, valbuf,
+ WT_SYSTEM_OLDEST_TS "=\"%s\"," WT_SYSTEM_TS_TIME "=%" PRIu64 "," WT_SYSTEM_TS_WRITE_GEN
+ "=%" PRIu64,
+ hex_timestamp, session->current_ckpt_sec, conn->base_write_gen));
+ WT_ERR(__meta_sysinfo_update(
+ session, full, name, namelen, uribuf, WT_SYSTEM_OLDEST_URI, valbuf->data));
}
- /* Record snapshot information in metadata for checkpoint. */
- WT_ERR(__wt_buf_fmt(session, buf,
- WT_SYSTEM_CKPT_SNAPSHOT_MIN "=%" PRIu64 "," WT_SYSTEM_CKPT_SNAPSHOT_MAX "=%" PRIu64
- "," WT_SYSTEM_CKPT_SNAPSHOT_COUNT "=%" PRIu32,
- txn->snap_min, txn->snap_max, txn->snapshot_count));
+ /* Handle the snapshot information. */
- if (txn->snapshot_count > 0) {
- WT_ERR(__wt_buf_catfmt(session, buf, "," WT_SYSTEM_CKPT_SNAPSHOT "=["));
- for (snap_count = 0; snap_count < txn->snapshot_count - 1; ++snap_count)
- WT_ERR(__wt_buf_catfmt(session, buf, "%" PRIu64 "%s", txn->snapshot[snap_count], ","));
+ WT_ERR(__meta_print_snapshot(session, valbuf));
+ WT_ERR(__meta_sysinfo_update(
+ session, full, name, namelen, uribuf, WT_SYSTEM_CKPT_SNAPSHOT_URI, valbuf->data));
- WT_ERR(__wt_buf_catfmt(session, buf, "%" PRIu64 "%s", txn->snapshot[snap_count], "]"));
- }
- WT_ERR(__wt_metadata_update(session, WT_SYSTEM_CKPT_SNAPSHOT_URI, buf->data));
+ /* Print what we did. */
__wt_verbose(session, WT_VERB_CHECKPOINT_PROGRESS,
"saving checkpoint snapshot min: %" PRIu64 ", snapshot max: %" PRIu64
@@ -1347,19 +1515,260 @@ __wt_meta_sysinfo_set(WT_SESSION_IMPL *session)
txn->snap_min, txn->snap_max, txn->snapshot_count,
__wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string[0]),
__wt_timestamp_to_string(txn_global->meta_ckpt_timestamp, ts_string[1]),
- S2C(session)->base_write_gen);
+ conn->base_write_gen);
- /* Record the base write gen in metadata as part of checkpoint */
- WT_ERR(__wt_buf_fmt(
- session, buf, WT_SYSTEM_BASE_WRITE_GEN "=%" PRIu64, S2C(session)->base_write_gen));
- WT_ERR(__wt_metadata_update(session, WT_SYSTEM_BASE_WRITE_GEN_URI, buf->data));
+ /*
+ * Record the base write gen in metadata as part of full checkpoints.
+ *
+ * Note that "full" here means what it does in __txn_checkpoint: the user didn't give an
+ * explicit list of trees to checkpoint. It is allowed (though currently not sensible) for the
+ * user to do that with a named checkpoint, in which case we don't want to make this change.
+ */
+ if (full) {
+ WT_ERR(__wt_buf_fmt(
+ session, valbuf, WT_SYSTEM_BASE_WRITE_GEN "=%" PRIu64, conn->base_write_gen));
+ WT_ERR(__wt_metadata_update(session, WT_SYSTEM_BASE_WRITE_GEN_URI, valbuf->data));
+ }
err:
- __wt_scr_free(session, &buf);
+ __wt_scr_free(session, &valbuf);
+ if (name != NULL)
+ __wt_scr_free(session, &uribuf);
+ return (ret);
+}
+
+/*
+ * __wt_meta_sysinfo_clear --
+ * Clear the system information (for a named checkpoint) from the metadata. Removes the
+ * per-name checkpoint timestamp, oldest timestamp and snapshot entries; the entries for the
+ * most recent checkpoint are left alone ("full" is passed as false).
+ */
+int
+__wt_meta_sysinfo_clear(WT_SESSION_IMPL *session, const char *name, size_t namelen)
+{
+ WT_DECL_ITEM(uribuf);
+ WT_DECL_RET;
+
+ /* Scratch buffer for building the "<uri>.<name>" keys; sized for the name plus the URI. */
+ WT_RET(__wt_scr_alloc(session, namelen + 128, &uribuf));
+
+ /* One removal per kind of per-checkpoint system entry. */
+ WT_ERR(__meta_sysinfo_remove(session, false, name, namelen, uribuf, WT_SYSTEM_CKPT_URI));
+ WT_ERR(__meta_sysinfo_remove(session, false, name, namelen, uribuf, WT_SYSTEM_OLDEST_URI));
+ WT_ERR(
+ __meta_sysinfo_remove(session, false, name, namelen, uribuf, WT_SYSTEM_CKPT_SNAPSHOT_URI));
+
+err:
+ __wt_scr_free(session, &uribuf);
+ return (ret);
+}
+
+/*
+ * __wt_meta_read_checkpoint_snapshot --
+ * Fetch the snapshot data for a checkpoint from the metadata file. Reads the selected named
+ * checkpoint's snapshot, or if the checkpoint name passed is null, the most recent checkpoint's
+ * snapshot. The snapshot list returned is allocated and must be freed by the caller.
+ *
+ * If no metadata entry exists the outputs describe an empty snapshot (min and max WT_TXN_NONE,
+ * null list, zero count) and the call succeeds. If ckpttime is not null it receives the
+ * checkpoint's wall-clock time when the recorded write generation is current, and 0 in every
+ * other case.
+ */
+int
+__wt_meta_read_checkpoint_snapshot(WT_SESSION_IMPL *session, const char *ckpt_name,
+ uint64_t *snap_min, uint64_t *snap_max, uint64_t **snapshot, uint32_t *snapshot_count,
+ uint64_t *ckpttime)
+{
+ WT_CONFIG list;
+ WT_CONFIG_ITEM cval;
+ WT_CONFIG_ITEM k;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ uint32_t counter;
+ char *sys_config;
+
+ conn = S2C(session);
+ counter = 0;
+ sys_config = NULL;
+
+ /*
+ * There's an issue with checkpoints produced by some old versions having bad snapshot data.
+ * (See WT-8395.) We should ignore those snapshots when we can identify them. This only applies
+ * to reading the last checkpoint during recovery, however, so it is done in our caller. (In
+ * other cases, for WiredTigerCheckpoint the checkpoint taken after recovery will have replaced
+ * any old and broken snapshot; and for named checkpoints, the broken versions didn't write out
+ * snapshot information at all anyway.)
+ */
+
+ /* Initialize to an empty snapshot. */
+ *snap_min = WT_TXN_NONE;
+ *snap_max = WT_TXN_NONE;
+ *snapshot = NULL;
+ *snapshot_count = 0;
+
+ /*
+ * Initialize the checkpoint time as well: every path below that doesn't find a current write
+ * generation and a time (including a missing metadata entry) must report 0 rather than leave
+ * the caller's variable untouched.
+ */
+ if (ckpttime != NULL)
+ *ckpttime = 0;
+
+ /* Fetch the metadata string. A missing entry is not an error: sys_config just stays null. */
+ if (ckpt_name == NULL)
+ WT_ERR_NOTFOUND_OK(
+ __wt_metadata_search(session, WT_SYSTEM_CKPT_SNAPSHOT_URI, &sys_config), false);
+ else {
+ /* Named checkpoints are stored under "<uri>.<name>". */
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_fmt(session, tmp, "%s.%s", WT_SYSTEM_CKPT_SNAPSHOT_URI, ckpt_name));
+ WT_ERR_NOTFOUND_OK(__wt_metadata_search(session, tmp->data, &sys_config), false);
+ }
+
+ /* Extract the components of the metadata string. */
+ if (sys_config != NULL) {
+ WT_CLEAR(cval);
+ if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_MIN, &cval) == 0 &&
+ cval.len != 0)
+ *snap_min = (uint64_t)cval.val;
+
+ if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_MAX, &cval) == 0 &&
+ cval.len != 0)
+ *snap_max = (uint64_t)cval.val;
+
+ if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_COUNT, &cval) == 0 &&
+ cval.len != 0)
+ *snapshot_count = (uint32_t)cval.val;
+
+ /*
+ * Copy the transaction ID list into a freshly allocated array sized by the recorded count.
+ * NOTE(review): the copy loop is bounded by the list in the metadata, not by the recorded
+ * count; only the diagnostic assertion below cross-checks the two -- confirm the metadata
+ * can be trusted here.
+ */
+ if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT, &cval) == 0 &&
+ cval.len != 0) {
+ __wt_config_subinit(session, &list, &cval);
+ WT_ERR(__wt_calloc_def(session, *snapshot_count, snapshot));
+ while (__wt_config_subget_next(&list, &k) == 0)
+ (*snapshot)[counter++] = (uint64_t)k.val;
+ }
+
+ if (ckpttime != NULL) {
+ /*
+ * If the write generation is current, extract the checkpoint time. Otherwise keep the 0
+ * set above.
+ */
+ WT_ERR_NOTFOUND_OK(
+ __wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_WRITE_GEN, &cval),
+ false);
+ if (cval.val != 0 && (uint64_t)cval.val >= conn->base_write_gen) {
+ WT_ERR_NOTFOUND_OK(
+ __wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_TIME, &cval),
+ false);
+ if (cval.val != 0)
+ *ckpttime = (uint64_t)cval.val;
+ }
+ }
+
+ /*
+ * Make sure that the snapshot is self-consistent. The snapshot array should contain only
+ * transaction IDs between min and max.
+ */
+ WT_ASSERT(session,
+ *snapshot == NULL ||
+ (*snapshot_count == counter && (*snapshot)[0] == *snap_min &&
+ (*snapshot)[counter - 1] < *snap_max));
+ }
+
+err:
+ __wt_free(session, sys_config);
+ if (ckpt_name != NULL)
+ __wt_scr_free(session, &tmp);
return (ret);
}
/*
+ * __meta_retrieve_timestamp --
+ * Retrieve a timestamp from the metadata. Not present explicitly means WT_TXN_NONE.
+ */
+static int
+__meta_retrieve_timestamp(WT_SESSION_IMPL *session, const char *system_uri,
+ const char *timestamp_name, wt_timestamp_t *timestampp, uint64_t *ckpttime)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ char *sys_config;
+
+ conn = S2C(session);
+ sys_config = NULL;
+ /*
+ * Defaults for "entry or key not present".
+ * NOTE(review): a timestamp is initialized from the transaction-ID constant WT_TXN_NONE;
+ * presumably it has the same value as WT_TS_NONE -- confirm.
+ */
+ *timestampp = WT_TXN_NONE;
+ if (ckpttime != NULL)
+ *ckpttime = 0;
+
+ /* Search the metadata for the system information. */
+ WT_ERR_NOTFOUND_OK(__wt_metadata_search(session, system_uri, &sys_config), false);
+ if (sys_config != NULL) {
+ /* Clear the item first so a not-found lookup leaves a zero length behind. */
+ WT_CLEAR(cval);
+ WT_ERR_NOTFOUND_OK(__wt_config_getones(session, sys_config, timestamp_name, &cval), false);
+ if (cval.len != 0) {
+ __wt_verbose(session, WT_VERB_RECOVERY, "Recovery %s %.*s", timestamp_name,
+ (int)cval.len, cval.str);
+ WT_ERR(__wt_txn_parse_timestamp_raw(session, timestamp_name, timestampp, &cval));
+ }
+
+ if (ckpttime != NULL) {
+ /* If the write generation is current, extract the checkpoint time. Otherwise we use 0.
+ */
+ WT_ERR_NOTFOUND_OK(
+ __wt_config_getones(session, sys_config, WT_SYSTEM_TS_WRITE_GEN, &cval), false);
+ if (cval.val != 0 && (uint64_t)cval.val >= conn->base_write_gen) {
+ WT_ERR_NOTFOUND_OK(
+ __wt_config_getones(session, sys_config, WT_SYSTEM_TS_TIME, &cval), false);
+ if (cval.val != 0)
+ *ckpttime = (uint64_t)cval.val;
+ }
+ }
+ }
+
+err:
+ /* The metadata search result is owned here and released on every path. */
+ __wt_free(session, sys_config);
+ return (ret);
+}
+
+/*
+ * __meta_retrieve_a_checkpoint_timestamp --
+ * Fetch a timestamp associated with the checkpoint from the metadata. If the checkpoint name
+ * passed is null, returns the timestamp from the most recent checkpoint. Also returns the
+ * checkpoint wall-clock time the timestamp came from (which is a time, but not a timestamp...)
+ *
+ * Here "checkpoint timestamp" means "a timestamp in a checkpoint". This variance in terminology is
+ * confusing, but at this point not readily avoided.
+ *
+ * The "uri" argument is the base system metadata key and "key" the name of the timestamp within
+ * that entry; for a named checkpoint the entry looked up is "uri.ckpt_name".
+ */
+static int
+__meta_retrieve_a_checkpoint_timestamp(WT_SESSION_IMPL *session, const char *ckpt_name,
+ const char *uri, const char *key, wt_timestamp_t *timestampp, uint64_t *ckpttime)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ /* Most recent checkpoint: the base URI is the key, no scratch buffer is needed. */
+ if (ckpt_name == NULL)
+ return (__meta_retrieve_timestamp(session, uri, key, timestampp, ckpttime));
+
+ /* Named checkpoint: build "<uri>.<name>" in a scratch buffer and look that up. */
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_fmt(session, tmp, "%s.%s", uri, ckpt_name));
+ WT_ERR(__meta_retrieve_timestamp(session, tmp->data, key, timestampp, ckpttime));
+err:
+ __wt_scr_free(session, &tmp);
+ return (ret);
+}
+
+/*
+ * __wt_meta_read_checkpoint_timestamp --
+ * Fetch a checkpoint's checkpoint timestamp, aka stable timestamp, from the metadata. If the
+ * checkpoint name passed is null, returns the timestamp from the most recent checkpoint.
+ *
+ * Here "checkpoint timestamp" means "the stable timestamp saved with a checkpoint". This variance
+ * in terminology is confusing, but at this point not readily avoided.
+ *
+ * The value is read from the WT_SYSTEM_CKPT_TS key of the WT_SYSTEM_CKPT_URI entry. The ckpttime
+ * argument may be null if the caller doesn't want the checkpoint's wall-clock time.
+ */
+int
+__wt_meta_read_checkpoint_timestamp(
+ WT_SESSION_IMPL *session, const char *ckpt_name, wt_timestamp_t *timestampp, uint64_t *ckpttime)
+{
+ return (__meta_retrieve_a_checkpoint_timestamp(
+ session, ckpt_name, WT_SYSTEM_CKPT_URI, WT_SYSTEM_CKPT_TS, timestampp, ckpttime));
+}
+
+/*
+ * __wt_meta_read_checkpoint_oldest --
+ * Fetch a checkpoint's oldest timestamp from the metadata. If the checkpoint name passed is
+ * null, returns the timestamp from the most recent checkpoint.
+ *
+ * The value is read from the WT_SYSTEM_OLDEST_TS key of the WT_SYSTEM_OLDEST_URI entry. The
+ * ckpttime argument may be null if the caller doesn't want the checkpoint's wall-clock time.
+ */
+int
+__wt_meta_read_checkpoint_oldest(
+ WT_SESSION_IMPL *session, const char *ckpt_name, wt_timestamp_t *timestampp, uint64_t *ckpttime)
+{
+ return (__meta_retrieve_a_checkpoint_timestamp(
+ session, ckpt_name, WT_SYSTEM_OLDEST_URI, WT_SYSTEM_OLDEST_TS, timestampp, ckpttime));
+}
+
+/*
* __ckpt_version_chk --
* Check the version major/minor numbers.
*/
diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c
index c8c8ecd7558..3ad0ef19a9b 100644
--- a/src/third_party/wiredtiger/src/schema/schema_worker.c
+++ b/src/third_party/wiredtiger/src/schema/schema_worker.c
@@ -28,7 +28,7 @@ __wt_exclusive_handle_operation(WT_SESSION_IMPL *session, const char *uri,
WT_RET(ret);
}
- WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, open_flags));
+ WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, open_flags, NULL, NULL));
WT_SAVE_DHANDLE(session, ret = file_func(session, cfg));
WT_TRET(__wt_session_release_dhandle(session));
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 8adaee16387..d98f236b851 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -143,7 +143,7 @@ __wt_session_copy_values(WT_SESSION_IMPL *session)
WT_ASSERT(session,
txn_shared->pinned_id != WT_TXN_NONE ||
(WT_BTREE_PREFIX(cursor->uri) &&
- F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN)));
+ WT_DHANDLE_IS_CHECKPOINT(((WT_CURSOR_BTREE *)cursor)->dhandle)));
#endif
WT_RET(__cursor_localvalue(cursor));
}
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
index 685bc6e0c46..07e18bd1e43 100644
--- a/src/third_party/wiredtiger/src/session/session_dhandle.c
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -283,55 +283,301 @@ __wt_session_release_dhandle(WT_SESSION_IMPL *session)
}
/*
+ * __session_fetch_checkpoint_meta --
+ * Retrieve information about the selected checkpoint. Notes on the returned values are found
+ * under __session_lookup_checkpoint.
+ */
+static int
+__session_fetch_checkpoint_meta(WT_SESSION_IMPL *session, const char *ckpt_name,
+  WT_CKPT_SNAPSHOT *info_ret, uint64_t *snapshot_time_ret, uint64_t *stable_time_ret,
+  uint64_t *oldest_time_ret)
+{
+    /* Stable timestamp first, then oldest; each also reports a checkpoint time. */
+    WT_RET(__wt_meta_read_checkpoint_timestamp(
+      session, ckpt_name, &info_ret->stable_ts, stable_time_ret));
+    WT_RET(
+      __wt_meta_read_checkpoint_oldest(session, ckpt_name, &info_ret->oldest_ts, oldest_time_ret));
+
+    /* Then the snapshot: bounds, transaction list, list length, and its checkpoint time. */
+    WT_RET(__wt_meta_read_checkpoint_snapshot(session, ckpt_name, &info_ret->snapshot_min,
+      &info_ret->snapshot_max, &info_ret->snapshot_txns, &info_ret->snapshot_count,
+      snapshot_time_ret));
+
+    /* A snapshot with at least one real bound is returned exactly as read. */
+    if (info_ret->snapshot_min != WT_TXN_NONE || info_ret->snapshot_max != WT_TXN_NONE)
+        return (0);
+
+    /*
+     * We read a null snapshot. Leaving both bounds at WT_TXN_NONE would make nothing visible,
+     * which is clearly not useful, so move both to WT_TXN_MAX and make everything visible
+     * instead. (Whether that is desirable isn't entirely clear, but the alternatives are to fail,
+     * which doesn't help, or to somehow tell the checkpoint cursor to run without a dummy
+     * transaction, which doesn't work.) A null snapshot must not carry a transaction list.
+     */
+    WT_ASSERT(session, info_ret->snapshot_txns == NULL && info_ret->snapshot_count == 0);
+    info_ret->snapshot_min = WT_TXN_MAX;
+    info_ret->snapshot_max = WT_TXN_MAX;
+
+    return (0);
+}
+
+/*
+ * __session_open_hs_ckpt --
+ * Get a btree handle for the requested checkpoint of the history store and return it.
+ */
+static int
+__session_open_hs_ckpt(WT_SESSION_IMPL *session, const char *checkpoint, const char *cfg[],
+  uint32_t flags, int64_t order_expected, WT_DATA_HANDLE **hs_dhandlep)
+{
+    /* Open the requested history store checkpoint; on success it becomes session->dhandle. */
+    WT_RET(__wt_session_get_dhandle(session, WT_HS_URI, checkpoint, cfg, flags));
+
+    if (session->dhandle->checkpoint_order == order_expected) {
+        /* The handle stays in the session; also hand it back explicitly for convenience. */
+        *hs_dhandlep = session->dhandle;
+        return (0);
+    }
+
+    /*
+     * The checkpoint we opened is not the one the caller looked up. Give the handle back and
+     * report EBUSY so the caller can retry.
+     */
+    WT_RET(__wt_session_release_dhandle(session));
+    return (__wt_set_return(session, EBUSY));
+}
+
+/*
* __wt_session_get_btree_ckpt --
- * Check the configuration strings for a checkpoint name, get a btree handle for the given name,
- * set session->dhandle.
+ * Check the configuration strings for a checkpoint name. If opening a checkpoint, resolve the
+ * checkpoint name, get a btree handle for it, load that into the session, and if requested with
+ * non-null pointers, also resolve a matching history store checkpoint, open a handle for it,
+ * return that, and also find and return the corresponding snapshot/timestamp metadata. The
+ * transactions array in the snapshot info is allocated and must be freed by the caller on
+ * success. If not opening a checkpoint, the history store dhandle and snapshot info are
+ * immaterial; if the return pointers are not null, send back nulls and in particular never
+ * allocate or open anything.
*/
int
-__wt_session_get_btree_ckpt(
- WT_SESSION_IMPL *session, const char *uri, const char *cfg[], uint32_t flags)
+__wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[],
+ uint32_t flags, WT_DATA_HANDLE **hs_dhandlep, WT_CKPT_SNAPSHOT *ckpt_snapshot)
{
WT_CONFIG_ITEM cval;
WT_DECL_RET;
- const char *checkpoint;
- bool last_ckpt;
+ uint64_t ds_time, hs_time, oldest_time, snapshot_time, stable_time;
+ int64_t ds_order, hs_order;
+ const char *checkpoint, *hs_checkpoint;
+ bool is_unnamed_ckpt, must_resolve;
- last_ckpt = false;
+ ds_time = hs_time = oldest_time = snapshot_time = stable_time = 0;
+ ds_order = hs_order = 0;
checkpoint = NULL;
+ hs_checkpoint = NULL;
+
+ /* These should only be set together. Asking for only one doesn't make sense. */
+ WT_ASSERT(session, (hs_dhandlep == NULL) == (ckpt_snapshot == NULL));
+
+ if (hs_dhandlep != NULL)
+ *hs_dhandlep = NULL;
+ if (ckpt_snapshot != NULL) {
+ ckpt_snapshot->oldest_ts = WT_TS_NONE;
+ ckpt_snapshot->stable_ts = WT_TS_NONE;
+ ckpt_snapshot->snapshot_min = WT_TXN_MAX;
+ ckpt_snapshot->snapshot_max = WT_TXN_MAX;
+ ckpt_snapshot->snapshot_txns = NULL;
+ ckpt_snapshot->snapshot_count = 0;
+ }
/*
* This function exists to handle checkpoint configuration. Callers that never open a checkpoint
* call the underlying function directly.
*/
WT_RET_NOTFOUND_OK(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
- if (cval.len != 0) {
- /*
- * The internal checkpoint name is special, find the last unnamed checkpoint of the object.
- */
- if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
- last_ckpt = true;
-retry:
- WT_RET(__wt_meta_checkpoint_last_name(session, uri, &checkpoint));
- } else
- WT_RET(__wt_strndup(session, cval.str, cval.len, &checkpoint));
+ if (cval.len == 0) {
+ /* We are not opening a checkpoint. This is the simple case; retire it immediately. */
+ return (__wt_session_get_dhandle(session, uri, NULL, cfg, flags));
}
- ret = __wt_session_get_dhandle(session, uri, checkpoint, cfg, flags);
- __wt_free(session, checkpoint);
-
/*
- * There's a potential race: we get the name of the most recent unnamed checkpoint, but if it's
- * discarded (or locked so it can be discarded) by the time we try to open it, we'll fail the
- * open. Retry in those cases, a new "last" checkpoint should surface, and we can't return an
- * error, the application will be justifiably upset if we can't open the last checkpoint
- * instance of an object.
+ * Here and below is only for checkpoints.
+ *
+ * Ultimately, unless we're being opened from a context where we won't ever need to access the
+ * history store, we need two dhandles and a set of snapshot/timestamp info that all match.
+ *
+ * "Match" here is a somewhat complex issue. In the simple case, it means trees and a snapshot
+ * that came from the same global checkpoint. But because checkpoints skip clean trees, either
+ * tree can potentially be from an earlier global checkpoint. This means we cannot readily
+ * identify matching trees by looking at them (or by looking at their metadata either) -- both
+ * the order numbers and the wall clock times can easily be different. Consequently we don't try
+ * to actively find or check matching trees; instead we rely on the system to not produce
+ * mutually inconsistent checkpoints, and read out whatever exists taking active steps to avoid
+ * racing with a currently running checkpoint.
+ *
+ * Note that this fundamentally relies on partial checkpoints being prohibited. In the presence
+ * of partial checkpoints we would have to actively find matching trees, and in many cases
+ * (because old unnamed checkpoints are garbage collected) the proper matching history store
+ * wouldn't exist any more and we'd be stuck.
+ *
+ * The scheme is as follows: 1. Read checkpoint info out of the metadata, and retry until we get
+ * a consistent set; then 2. Open both dhandles and retry the whole thing if we didn't get the
+ * trees we expected.
+ *
+ * For the first part, we look up the requested checkpoint in both the data store and history
+ * store's metadata (either by name or for WiredTigerCheckpoint by picking the most recent
+ * checkpoint), and look up the snapshot and timestamps in the global metadata. For all of these
+ * we retrieve the wall clock time of the checkpoint, which we'll use to check for consistency.
+ * For the trees we also retrieve the order numbers of the checkpoints, which we'll use to check
+ * that the dhandles we open are the ones we wanted. (For unnamed checkpoints, they must be,
+ * because unnamed checkpoints are never replaced, but for named checkpoints it's possible for
+ * the open to race with regeneration of the checkpoint.)
+ *
+ * Because the snapshot and timestamp information is always written by every checkpoint, and is
+ * written last, it always gives the wall clock time of the most recent completed global
+ * checkpoint. If either the data store or history store checkpoint has a newer wall clock time,
+ * it must be from a currently running checkpoint and does not match the snapshot; therefore we
+ * must retry or fail. If both have the same or an older wall clock time, they are from the same
+ * or an older checkpoint and can be presumed to match.
+ *
+ * A slight complication is that the snapshot and timestamp information is three separate pieces
+ * of metadata; we read the time from all three and if they don't agree, it must be because a
+ * checkpoint is finishing at this very moment, so we retry.
*
- * The check against WT_NOTFOUND is correct: if there was no checkpoint for the object (that is,
- * the object has never been in a checkpoint), we returned immediately after the call to search
- * for that name.
+ * (It is actually slightly more complicated: either timestamp might not be present, in which
+ * case the time will read back as zero. The snapshot is written last, and always written, so we
+ * accept the timestamp times if they are less than or equal to the snapshot time. We are only
+ * racing if they are newer.)
+ *
+ * This scheme relies on the fact we take steps to make sure that the checkpoint wall clock time
+ * does not run backward, and that successive checkpoints are never given the same wall clock
+ * time. Note that we use the write generation to ignore wall clock times from previous database
+ * opens (all such are treated as 0) -- anything from a previous database open can't have been
+ * produced by a currently running checkpoint and can be presumed to match. This is done so we
+ * don't get in trouble if the system clock moves backwards between runs, and also to avoid
+ * possible issues if the checkpoint clock runs forward. (See notes about that in txn_ckpt.c.)
+ * Furthermore, this avoids any confusion potentially caused by older versions not including the
+ * checkpoint time in the snapshot and timestamp metadata.
+ *
+ * Also note that only the exact name "WiredTigerCheckpoint" needs to be resolved. Requests to
+ * open specific versions, such as "WiredTigerCheckpoint.6", must be looked up like named
+ * checkpoints but are otherwise still treated as unnamed. This is necessary so that the
+ * matching history store checkpoint we find can be itself opened later.
+ *
+ * It is also at least theoretically possible for there to be no matching history store
+ * checkpoint. If looking up the checkpoint names finds no history store checkpoint, its name
+ * will come back as null and we must avoid trying to open it, either here or later on in the
+ * life of the checkpoint cursor.
*/
- if (last_ckpt && (ret == WT_NOTFOUND || ret == EBUSY))
- goto retry;
+
+ if (strcmp(uri, WT_HS_URI) == 0)
+ /* We're opening the history store directly, so don't open it twice. */
+ hs_dhandlep = NULL;
+
+ /* Test for the internal checkpoint name (WiredTigerCheckpoint). */
+ must_resolve = WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len);
+ is_unnamed_ckpt = cval.len >= strlen(WT_CHECKPOINT) && WT_PREFIX_MATCH(cval.str, WT_CHECKPOINT);
+
+ /* This is the top of a retry loop. */
+ do {
+ ret = 0;
+
+ if (ckpt_snapshot != NULL)
+ /* We're about to re-fetch this; discard the prior version. No effect the first time. */
+ __wt_free(session, ckpt_snapshot->snapshot_txns);
+
+ /* Look up the data store checkpoint. */
+ if (must_resolve)
+ WT_RET(__wt_meta_checkpoint_last_name(session, uri, &checkpoint, &ds_order, &ds_time));
+ else {
+ /* Copy the checkpoint name. */
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &checkpoint));
+
+ /* Look up the checkpoint and get its time and order information. */
+ WT_RET(__wt_meta_checkpoint_by_name(session, uri, checkpoint, &ds_order, &ds_time));
+ }
+
+ /* Look up the history store checkpoint. */
+ if (hs_dhandlep != NULL) {
+ if (must_resolve)
+ WT_RET_NOTFOUND_OK(__wt_meta_checkpoint_last_name(
+ session, WT_HS_URI, &hs_checkpoint, &hs_order, &hs_time));
+ else {
+ ret =
+ __wt_meta_checkpoint_by_name(session, WT_HS_URI, checkpoint, &hs_order, &hs_time);
+ WT_RET_NOTFOUND_OK(ret);
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ else
+ WT_RET(__wt_strdup(session, checkpoint, &hs_checkpoint));
+ }
+ }
+
+ /*
+ * If we were asked for snapshot metadata, fetch it now, including the time (comparable to
+ * checkpoint times) for each element.
+ */
+ if (ckpt_snapshot != NULL) {
+ WT_RET(__session_fetch_checkpoint_meta(session, is_unnamed_ckpt ? NULL : checkpoint,
+ ckpt_snapshot, &snapshot_time, &stable_time, &oldest_time));
+
+ /*
+ * Check if we raced with a running checkpoint.
+ *
+ * If either timestamp metadata time is newer than the snapshot, we read in the middle
+ * of that material being updated and we need to retry. If that didn't happen, then
+ * check if either the data store or history store checkpoint time is newer than the
+ * metadata time. In either case we need to retry.
+ *
+ * Otherwise we have successfully gotten a matching set, as described above.
+ *
+ * If there is no history store checkpoint, its time will be zero, which will be
+ * accepted.
+ *
+ * We skip the test entirely if we aren't trying to return a snapshot (and therefore not
+ * history either) because there's nothing to check, and if we didn't retrieve the
+ * snapshot its time will be 0 and the check will fail gratuitously and lead to retrying
+ * forever.
+ */
+
+ if (ds_time > snapshot_time || hs_time > snapshot_time || stable_time > snapshot_time ||
+ oldest_time > snapshot_time)
+ ret = __wt_set_return(session, EBUSY);
+ }
+
+ if (ret == 0) {
+ /* Get a handle for the data store. */
+ ret = __wt_session_get_dhandle(session, uri, checkpoint, cfg, flags);
+ if (ret == 0 && session->dhandle->checkpoint_order != ds_order) {
+ /* The tree we opened is newer than the one we expected; need to retry. */
+ WT_TRET(__wt_session_release_dhandle(session));
+ WT_TRET(__wt_set_return(session, EBUSY));
+ }
+ }
+
+ if (ret == 0 && hs_checkpoint != NULL) {
+ /* Get a handle for the history store. */
+ WT_ASSERT(session, hs_dhandlep != NULL);
+ WT_WITHOUT_DHANDLE(session,
+ ret =
+ __session_open_hs_ckpt(session, hs_checkpoint, cfg, flags, hs_order, hs_dhandlep));
+ if (ret != 0)
+ WT_TRET(__wt_session_release_dhandle(session));
+ }
+
+ /* Drop the names; we don't need them any more. Nulls the pointers; retry relies on that. */
+ __wt_free(session, checkpoint);
+ __wt_free(session, hs_checkpoint);
+
+ /*
+ * There's a potential race: we get the name of the most recent unnamed checkpoint, but if
+ * it's discarded (or locked so it can be discarded) by the time we try to open it, we'll
+ * fail the open. Retry in those cases; a new version checkpoint should surface, and we
+ * can't return an error. The application will be justifiably upset if we can't open the
+ * last checkpoint instance of an object.
+ *
+ * The WT_NOTFOUND condition will eventually clear; some unnamed checkpoint existed when we
+ * looked up the name (otherwise we would have failed then) so a new one must be in progress.
+ *
+ * At this point we should either have ret == 0 and the handles we were asked for, or ret !=
+ * 0 and no handles.
+ *
+ * For named checkpoints, we don't retry, I guess because the application ought not to try
+ * to open its checkpoints while regenerating them.
+ */
+
+ } while (is_unnamed_ckpt && (ret == WT_NOTFOUND || ret == EBUSY));
+
return (ret);
}
@@ -458,7 +704,8 @@ __session_get_dhandle(WT_SESSION_IMPL *session, const char *uri, const char *che
/*
* __wt_session_get_dhandle --
- * Get a data handle for the given name, set session->dhandle.
+ * Get a data handle for the given name, set session->dhandle. Optionally if we opened a
+ * checkpoint return its checkpoint order number.
*/
int
__wt_session_get_dhandle(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint,
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 641d3ed0e5b..9c5bf2d2022 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -2063,6 +2063,97 @@ __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret)
}
/*
+ * __wt_txn_init_checkpoint_cursor --
+ * Create a transaction object for a checkpoint cursor. On success, takes charge of the snapshot
+ * array passed down, which should have been allocated separately, and nulls the pointer. (On
+ * failure, the caller must destroy it.)
+ */
+int
+__wt_txn_init_checkpoint_cursor(
+  WT_SESSION_IMPL *session, WT_CKPT_SNAPSHOT *snapinfo, WT_TXN **txn_ret)
+{
+    WT_TXN *ckpt_txn;
+
+    /*
+     * Allocate just the WT_TXN structure itself, without the variable-length snapshot array at
+     * its end. The code that reads the snapshot allocates the list on its own; copying it here
+     * would serve no purpose, and reworking the read path so the allocation could be controlled
+     * from this function is not worthwhile.
+     */
+    WT_RET(__wt_calloc_one(session, &ckpt_txn));
+
+    /* Being read-only, this transaction has no ID and will never acquire one. */
+    ckpt_txn->id = WT_TXN_NONE;
+
+    /* Reads run under snapshot isolation. */
+    ckpt_txn->isolation = WT_ISO_SNAPSHOT;
+
+    /* Adopt the snapshot: the transaction list and its length, then the bounds. */
+    ckpt_txn->snapshot = snapinfo->snapshot_txns;
+    ckpt_txn->snapshot_count = snapinfo->snapshot_count;
+    ckpt_txn->snap_min = snapinfo->snapshot_min;
+    ckpt_txn->snap_max = snapinfo->snapshot_max;
+
+    /*
+     * The transaction list now belongs to the dummy transaction. Clear the caller's pointer so
+     * the list cannot be freed a second time if something above us fails after we return.
+     */
+    snapinfo->snapshot_txns = NULL;
+
+    /* The checkpoint's stable timestamp serves as the read timestamp; flag it if present. */
+    ckpt_txn->checkpoint_read_timestamp = snapinfo->stable_ts;
+    if (snapinfo->stable_ts != WT_TS_NONE)
+        F_SET(ckpt_txn, WT_TXN_SHARED_TS_READ);
+
+    /*
+     * Set the remaining flags. Prepared values are always ignored, because they can get into
+     * checkpoints.
+     *
+     * By default checkpoints do not write prepared values, but such values can appear when pages
+     * get evicted. Whether a given prepared value from a given prepared, not-yet-committed
+     * transaction shows up is therefore arbitrary and unpredictable; failing on one serves no
+     * data integrity purpose and would only make the system flaky.
+     *
+     * There is a problem, however. If stable moves forward, a prepared transaction is allowed to
+     * commit before stable as long as its durable timestamp is after stable. Such a transaction
+     * can be committed (in execution time) after the checkpoint was taken, yet with a commit
+     * timestamp below the checkpoint's stable timestamp. It then exists in the live database and
+     * is visible when read as of the checkpoint timestamp, but does not exist in the checkpoint,
+     * which is inconsistent. Probably nothing can be done about this short of making prepared
+     * transactions durable in prepared state, which is a Big Deal, so applications that use
+     * prepared transactions and this commit leeway need to be cognizant of the issue.
+     */
+    F_SET(ckpt_txn,
+      WT_TXN_RUNNING | WT_TXN_READONLY | WT_TXN_IS_CHECKPOINT | WT_TXN_HAS_SNAPSHOT |
+        WT_TXN_IGNORE_PREPARE);
+
+    *txn_ret = ckpt_txn;
+    return (0);
+}
+
+/*
+ * __wt_txn_close_checkpoint_cursor --
+ * Dispose of the private transaction object in a checkpoint cursor.
+ */
+void
+__wt_txn_close_checkpoint_cursor(WT_SESSION_IMPL *session, WT_TXN **txn_arg)
+{
+    WT_TXN *doomed;
+
+    /* Detach the transaction from its owner before tearing it down. */
+    doomed = *txn_arg;
+    *txn_arg = NULL;
+
+    /*
+     * The snapshot list was allocated separately rather than living in the array at the end of
+     * the transaction structure, so it has to be released on its own before the structure.
+     */
+    WT_ASSERT(session, doomed->snapshot != doomed->__snapshot);
+    __wt_free(session, doomed->snapshot);
+
+    __wt_free(session, doomed);
+}
+
+/*
* __wt_txn_stats_update --
* Update the transaction statistics for return to the application.
*/
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 84140d18c18..eb40a4655ea 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -13,6 +13,7 @@ static int __checkpoint_lock_dirty_tree(WT_SESSION_IMPL *, bool, bool, bool, con
static int __checkpoint_mark_skip(WT_SESSION_IMPL *, WT_CKPT *, bool);
static int __checkpoint_presync(WT_SESSION_IMPL *, const char *[]);
static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]);
+static int __drop_list_execute(WT_SESSION_IMPL *session, WT_ITEM *drop_list);
/*
* __checkpoint_name_ok --
@@ -257,7 +258,7 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
}
/* Should not be called with anything other than a live btree handle. */
- WT_ASSERT(session, WT_DHANDLE_BTREE(session->dhandle) && session->dhandle->checkpoint == NULL);
+ WT_ASSERT(session, WT_DHANDLE_BTREE(session->dhandle) && !WT_READING_CHECKPOINT(session));
btree = S2BT(session);
@@ -784,6 +785,81 @@ __txn_checkpoint_can_skip(
}
/*
+ * __txn_checkpoint_establish_time --
+ * Get a time (wall time, not a timestamp) for this checkpoint. The time is left in the session.
+ */
+static void
+__txn_checkpoint_establish_time(WT_SESSION_IMPL *session)
+{
+    WT_CONNECTION_IMPL *conn;
+    uint64_t candidate, published;
+
+    conn = S2C(session);
+
+    /*
+     * Start from the current wall clock and, if tiered storage is in use, raise it to at least
+     * the most recent flush time. NOTE: the flush time is not read with an ordered read (nor
+     * re-read on retry) because checkpoint and flush tier are currently mutually exclusive.
+     *
+     * The connection tracks the time of the most recent checkpoint. We publish our time there,
+     * first pushing it past the published value when necessary, so that the most recent
+     * checkpoint time never moves backwards and this checkpoint never shares a time with the
+     * previous one.
+     *
+     * It is possible to run the clock a good way ahead of real time if one tries (e.g. with many
+     * fast schema operations that each generate a checkpoint of the metadata; some tests, such as
+     * f_ops, do this). This is not expected in real use, nor expected to cause significant
+     * deviation from wall clock time: in a real database of any size a full checkpoint takes more
+     * than a second and schema operations are rare. Furthermore, although these times are saved
+     * on disk and displayed by 'wt list', operationally they are only used in restricted ways:
+     *    - managing the interaction between hot backups and checkpointing, where the absolute
+     *      time does not matter;
+     *    - tracking when tiered storage was last flushed in order to avoid redoing work, where
+     *      the absolute time does not matter;
+     *    - detecting and retrying races between opening checkpoint cursors and checkpoints in
+     *      progress, which only cares about ordering and only since the last database open.
+     *
+     * Currently the checkpoint time can move backwards if it has been run forward and a crash (or
+     * shutdown) and restart then happen before the wall clock has caught up. That follows from
+     * the naive way the value is initialized at startup; if it ever matters, it can be adjusted
+     * during startup much as the base write generation is. The checkpoint cursor opening code
+     * was set up specifically so that this does not matter.
+     *
+     * We can race with other threads here, hence the atomic CAS. This relies on anyone we race
+     * with only ever increasing (never decreasing) the most recent checkpoint time.
+     *
+     * The time is stored in the session rather than passed around explicitly, because passing it
+     * explicitly runs afoul of the type signatures of the functions passed to schema_worker.
+     */
+
+    __wt_seconds(session, &candidate);
+    candidate = WT_MAX(candidate, conn->flush_most_recent);
+
+    /* Re-read the published time and retry until our compare-and-swap goes through. */
+    do {
+        WT_ORDERED_READ(published, conn->ckpt_most_recent);
+        if (candidate <= published)
+            candidate = published + 1;
+    } while (!__wt_atomic_cas64(&conn->ckpt_most_recent, published, candidate));
+
+    /* No checkpoint time may already be set in this session. */
+    WT_ASSERT(session, session->current_ckpt_sec == 0);
+    session->current_ckpt_sec = candidate;
+}
+
+/*
+ * __txn_checkpoint_clear_time --
+ * Clear the current checkpoint time in the session.
+ */
+static void
+__txn_checkpoint_clear_time(WT_SESSION_IMPL *session)
+{
+ /* A checkpoint time must currently be set; zero is the "no checkpoint time" value. */
+ WT_ASSERT(session, session->current_ckpt_sec > 0);
+ /* Reset to zero, the state __txn_checkpoint_establish_time asserts before setting a time. */
+ session->current_ckpt_sec = 0;
+}
+
+/*
* __txn_checkpoint --
* Checkpoint a database or a list of objects in the database.
*/
@@ -792,6 +868,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
struct timespec tsp;
WT_CACHE *cache;
+ WT_CONFIG_ITEM cval;
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *hs_dhandle;
WT_DECL_RET;
@@ -800,9 +877,11 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN_ISOLATION saved_isolation;
wt_off_t hs_size;
wt_timestamp_t ckpt_tmp_ts;
+ size_t namelen;
uint64_t fsync_duration_usecs, generation, hs_ckpt_duration_usecs;
- uint64_t time_start_fsync, time_stop_fsync, time_start_hs, time_stop_hs;
+ uint64_t time_start_fsync, time_start_hs, time_stop_fsync, time_stop_hs;
u_int i;
+ const char *name;
bool can_skip, failed, full, idle, logging, tracking, use_timestamp;
void *saved_meta_next;
@@ -822,6 +901,16 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
return (0);
}
+ /* Check if this is a named checkpoint. */
+ WT_RET(__wt_config_gets(session, cfg, "name", &cval));
+ if (cval.len != 0) {
+ name = cval.str;
+ namelen = cval.len;
+ } else {
+ name = NULL;
+ namelen = 0;
+ }
+
/*
* Do a pass over the configuration arguments and figure out what kind of checkpoint this is.
*/
@@ -842,6 +931,12 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
conn->ckpt_write_pages = 0;
/*
+ * Get a time (wall time, not a timestamp) for this checkpoint. This will be applied to all the
+ * trees so they match. The time is left in the session.
+ */
+ __txn_checkpoint_establish_time(session);
+
+ /*
* Update the global oldest ID so we do all possible cleanup.
*
* This is particularly important for compact, so that all dirty pages can be fully written.
@@ -976,9 +1071,17 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* checkpoint.
*/
session->dhandle = NULL;
- /* We have to set the system information before we release the snapshot. */
- if (full)
- WT_ERR(__wt_meta_sysinfo_set(session));
+
+ /*
+ * We have to update the system information before we release the snapshot. Drop the system
+ * information for checkpoints we're dropping first in case the names overlap.
+ */
+ if (session->ckpt_drop_list != NULL) {
+ __drop_list_execute(session, session->ckpt_drop_list);
+ __wt_scr_free(session, &session->ckpt_drop_list);
+ }
+ if (full || name != NULL)
+ WT_ERR(__wt_meta_sysinfo_set(session, full, name, namelen));
/* Release the snapshot so we aren't pinning updates in cache. */
__wt_txn_release_snapshot(session);
@@ -1140,6 +1243,11 @@ err:
session, session->ckpt_handle[i], WT_TRET(__wt_session_release_dhandle(session)));
}
+ if (session->ckpt_drop_list != NULL)
+ __wt_scr_free(session, &session->ckpt_drop_list);
+
+ __txn_checkpoint_clear_time(session);
+
__wt_free(session, session->ckpt_handle);
session->ckpt_handle_allocated = session->ckpt_handle_next = 0;
@@ -1238,11 +1346,49 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting)
}
/*
+ * __drop_list_execute --
+ * Clear the system info (snapshot and timestamp info) for the named checkpoints on the drop
+ * list.
+ */
+static int
+__drop_list_execute(WT_SESSION_IMPL *session, WT_ITEM *drop_list)
+{
+    WT_CONFIG parser;
+    WT_CONFIG_ITEM key, value;
+    WT_DECL_RET;
+
+    /* The list is formatted as (name, name, ...,) so the config parser can walk it for us. */
+    __wt_config_init(session, &parser, drop_list->data);
+    for (;;) {
+        ret = __wt_config_next(&parser, &key, &value);
+        if (ret != 0)
+            break;
+
+        /* Each key is a checkpoint name; clear its snapshot and timestamp information. */
+        WT_RET(__wt_meta_sysinfo_clear(session, key.str, key.len));
+    }
+
+    /* The walk stopped on a nonzero return: WT_NOTFOUND is tolerated, anything else is returned. */
+    WT_RET_NOTFOUND_OK(ret);
+
+    return (0);
+}
+
+/*
+ * __drop_list_add --
+ * Add a checkpoint name to the list of (named) checkpoints being dropped. The list is produced
+ * by the first tree in the checkpoint (it must be the same in every tree, so it only needs to
+ * be produced once) and used at the top level to drop the snapshot and timestamp metadata for
+ * those checkpoints. Note that while there are several places in this file where WT_CKPT_DELETE
+ * is cleared on the fly, meaning the checkpoint won't actually be dropped, none of these apply
+ * to named checkpoints.
+ */
+static int
+__drop_list_add(WT_SESSION_IMPL *session, WT_ITEM *drop_list, const char *name)
+{
+    /* Append the name and a trailing comma, matching the (name, name, ...,) list format. */
+    WT_RET(__wt_buf_catfmt(session, drop_list, "%s,", name));
+    return (0);
+}
+
+/*
* __drop --
* Drop all checkpoints with a specific name.
*/
-static void
-__drop(WT_CKPT *ckptbase, const char *name, size_t len)
+static int
+__drop(
+ WT_SESSION_IMPL *session, WT_ITEM *drop_list, WT_CKPT *ckptbase, const char *name, size_t len)
{
WT_CKPT *ckpt;
@@ -1258,16 +1404,23 @@ __drop(WT_CKPT *ckptbase, const char *name, size_t len)
F_SET(ckpt, WT_CKPT_DELETE);
} else
WT_CKPT_FOREACH (ckptbase, ckpt)
- if (WT_STRING_MATCH(ckpt->name, name, len))
+ if (WT_STRING_MATCH(ckpt->name, name, len)) {
+ /* Remember the names of named checkpoints we're dropping. */
+ if (drop_list != NULL)
+ WT_RET(__drop_list_add(session, drop_list, ckpt->name));
F_SET(ckpt, WT_CKPT_DELETE);
+ }
+
+ return (0);
}
/*
* __drop_from --
* Drop all checkpoints after, and including, the named checkpoint.
*/
-static void
-__drop_from(WT_CKPT *ckptbase, const char *name, size_t len)
+static int
+__drop_from(
+ WT_SESSION_IMPL *session, WT_ITEM *drop_list, WT_CKPT *ckptbase, const char *name, size_t len)
{
WT_CKPT *ckpt;
bool matched;
@@ -1276,9 +1429,13 @@ __drop_from(WT_CKPT *ckptbase, const char *name, size_t len)
* There's a special case -- if the name is "all", then we delete all of the checkpoints.
*/
if (WT_STRING_MATCH("all", name, len)) {
- WT_CKPT_FOREACH (ckptbase, ckpt)
+ WT_CKPT_FOREACH (ckptbase, ckpt) {
+ /* Remember the names of named checkpoints we're dropping. */
+ if (drop_list != NULL && !WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
+ WT_RET(__drop_list_add(session, drop_list, ckpt->name));
F_SET(ckpt, WT_CKPT_DELETE);
- return;
+ }
+ return (0);
}
/*
@@ -1291,16 +1448,22 @@ __drop_from(WT_CKPT *ckptbase, const char *name, size_t len)
continue;
matched = true;
+ /* Remember the names of named checkpoints we're dropping. */
+ if (drop_list != NULL && !WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
+ WT_RET(__drop_list_add(session, drop_list, ckpt->name));
F_SET(ckpt, WT_CKPT_DELETE);
}
+
+ return (0);
}
/*
* __drop_to --
* Drop all checkpoints before, and including, the named checkpoint.
*/
-static void
-__drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
+static int
+__drop_to(
+ WT_SESSION_IMPL *session, WT_ITEM *drop_list, WT_CKPT *ckptbase, const char *name, size_t len)
{
WT_CKPT *ckpt, *mark;
@@ -1314,14 +1477,19 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
mark = ckpt;
if (mark == NULL)
- return;
+ return (0);
WT_CKPT_FOREACH (ckptbase, ckpt) {
+ /* Remember the names of named checkpoints we're dropping. */
+ if (drop_list != NULL && !WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
+ WT_RET(__drop_list_add(session, drop_list, ckpt->name));
F_SET(ckpt, WT_CKPT_DELETE);
if (ckpt == mark)
break;
}
+
+ return (0);
}
/*
@@ -1444,6 +1612,7 @@ __checkpoint_lock_dirty_tree(
WT_CONFIG_ITEM cval, k, v;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
+ WT_ITEM *drop_list;
size_t ckpt_bytes_allocated;
uint64_t now;
char *name_alloc;
@@ -1454,6 +1623,7 @@ __checkpoint_lock_dirty_tree(
ckpt = ckptbase = NULL;
ckpt_bytes_allocated = 0;
dhandle = session->dhandle;
+ drop_list = NULL;
name_alloc = NULL;
seen_ckpt_add = false;
@@ -1539,6 +1709,13 @@ __checkpoint_lock_dirty_tree(
cval.len = 0;
WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
if (cval.len != 0) {
+ /* Gather the list of named checkpoints to drop (if any) from the first tree visited. */
+ if (session->ckpt_drop_list == NULL) {
+ WT_ERR(__wt_scr_alloc(session, cval.len + 10, &session->ckpt_drop_list));
+ WT_ERR(__wt_buf_set(session, session->ckpt_drop_list, "(", 1));
+ drop_list = session->ckpt_drop_list;
+ }
+
__wt_config_subinit(session, &dropconf, &cval);
while ((ret = __wt_config_next(&dropconf, &k, &v)) == 0) {
/* Disallow unsafe checkpoint names. */
@@ -1548,21 +1725,28 @@ __checkpoint_lock_dirty_tree(
WT_ERR(__checkpoint_name_ok(session, v.str, v.len, true));
if (v.len == 0)
- __drop(ckptbase, k.str, k.len);
+ WT_ERR(__drop(session, drop_list, ckptbase, k.str, k.len));
else if (WT_STRING_MATCH("from", k.str, k.len))
- __drop_from(ckptbase, v.str, v.len);
+ WT_ERR(__drop_from(session, drop_list, ckptbase, v.str, v.len));
else if (WT_STRING_MATCH("to", k.str, k.len))
- __drop_to(ckptbase, v.str, v.len);
+ WT_ERR(__drop_to(session, drop_list, ckptbase, v.str, v.len));
else
WT_ERR_MSG(session, EINVAL, "unexpected value for checkpoint key: %.*s",
(int)k.len, k.str);
}
WT_ERR_NOTFOUND_OK(ret, false);
+
+ if (drop_list != NULL)
+ WT_ERR(__wt_buf_catfmt(session, drop_list, ")"));
}
}
- /* Drop checkpoints with the same name as the one we're taking. */
- __drop(ckptbase, name, strlen(name));
+ /*
+ * Drop checkpoints with the same name as the one we're taking. We don't need to add this to the
+ * drop list for snapshot/timestamp metadata because the metadata will be replaced by the new
+ * checkpoint.
+ */
+ WT_ERR(__drop(session, NULL, ckptbase, name, strlen(name)));
/* Set the name of the new entry at the end of the list. */
WT_CKPT_FOREACH (ckptbase, ckpt)
@@ -2038,16 +2222,21 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_CONFIG_ITEM cval;
WT_DECL_RET;
- bool force;
+ bool force, standalone;
/* Should not be called with a checkpoint handle. */
- WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+ WT_ASSERT(session, !WT_READING_CHECKPOINT(session));
/* We must hold the metadata lock if checkpointing the metadata. */
WT_ASSERT(session,
!WT_IS_METADATA(session->dhandle) ||
FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_METADATA));
+ /* If we're already in a global checkpoint, don't get a new time. Otherwise, we need one. */
+ standalone = session->current_ckpt_sec == 0;
+ if (standalone)
+ __txn_checkpoint_establish_time(session);
+
WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
force = cval.val != 0;
WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg));
@@ -2056,6 +2245,9 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
ret = __checkpoint_tree(session, true, cfg);
done:
+ if (standalone)
+ __txn_checkpoint_clear_time(session);
+
/* Do not store the cached checkpoint list when checkpointing a single file alone. */
__wt_meta_saved_ckptlist_free(session);
return (ret);
@@ -2075,7 +2267,7 @@ __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[])
bm = S2BT(session)->bm;
/* Should not be called with a checkpoint handle. */
- WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+ WT_ASSERT(session, !WT_READING_CHECKPOINT(session));
/* Unnecessary if checkpoint_sync has been configured "off". */
if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC))
@@ -2138,12 +2330,16 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
if (need_tracking)
WT_RET(__wt_meta_track_on(session));
+ __txn_checkpoint_establish_time(session);
+
WT_SAVE_DHANDLE(
session, ret = __checkpoint_lock_dirty_tree(session, false, false, need_tracking, NULL));
WT_ASSERT(session, ret == 0);
if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT))
ret = __checkpoint_tree(session, false, NULL);
+ __txn_checkpoint_clear_time(session);
+
/* Do not store the cached checkpoint list when closing the handle. */
__wt_meta_saved_ckptlist_free(session);
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index f78ef3cbc5c..537e73d1e7c 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -374,40 +374,6 @@ __txn_log_recover(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, WT_LS
}
/*
- * __recovery_retrieve_timestamp --
- * Retrieve a timestamp from the metadata.
- */
-static int
-__recovery_retrieve_timestamp(
- WT_RECOVERY *r, const char *system_uri, const char *timestamp_name, wt_timestamp_t *timestampp)
-{
- WT_CONFIG_ITEM cval;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- char *sys_config;
-
- sys_config = NULL;
-
- session = r->session;
-
- /* Search the metadata for the system information. */
- WT_ERR_NOTFOUND_OK(__wt_metadata_search(session, system_uri, &sys_config), false);
- if (sys_config != NULL) {
- WT_CLEAR(cval);
- WT_ERR_NOTFOUND_OK(__wt_config_getones(session, sys_config, timestamp_name, &cval), false);
- if (cval.len != 0) {
- __wt_verbose(session, WT_VERB_RECOVERY, "Recovery %s %.*s", timestamp_name,
- (int)cval.len, cval.str);
- WT_ERR(__wt_txn_parse_timestamp_raw(session, timestamp_name, timestampp, &cval));
- }
- }
-
-err:
- __wt_free(session, sys_config);
- return (ret);
-}
-
-/*
* __recovery_set_checkpoint_timestamp --
* Set the checkpoint timestamp as retrieved from the metadata file.
*/
@@ -421,14 +387,12 @@ __recovery_set_checkpoint_timestamp(WT_RECOVERY *r)
session = r->session;
conn = S2C(session);
+
/*
* Read the system checkpoint information from the metadata file and save the stable timestamp
* of the last checkpoint for later query. This gets saved in the connection.
*/
- ckpt_timestamp = 0;
-
- WT_RET(
- __recovery_retrieve_timestamp(r, WT_SYSTEM_CKPT_URI, WT_SYSTEM_CKPT_TS, &ckpt_timestamp));
+ WT_RET(__wt_meta_read_checkpoint_timestamp(r->session, NULL, &ckpt_timestamp, NULL));
/*
* Set the recovery checkpoint timestamp and the metadata checkpoint timestamp so that the
@@ -460,10 +424,7 @@ __recovery_set_oldest_timestamp(WT_RECOVERY *r)
* Read the system checkpoint information from the metadata file and save the oldest timestamp
* of the last checkpoint for later query. This gets saved in the connection.
*/
- oldest_timestamp = 0;
-
- WT_RET(__recovery_retrieve_timestamp(
- r, WT_SYSTEM_OLDEST_URI, WT_SYSTEM_OLDEST_TS, &oldest_timestamp));
+ WT_RET(__wt_meta_read_checkpoint_oldest(r->session, NULL, &oldest_timestamp, NULL));
conn->txn_global.oldest_timestamp = oldest_timestamp;
conn->txn_global.has_oldest_timestamp = oldest_timestamp != WT_TS_NONE;
@@ -480,22 +441,9 @@ __recovery_set_oldest_timestamp(WT_RECOVERY *r)
static int
__recovery_set_checkpoint_snapshot(WT_SESSION_IMPL *session)
{
- WT_CONFIG list;
- WT_CONFIG_ITEM cval;
- WT_CONFIG_ITEM k;
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- uint32_t counter;
- char *sys_config;
- sys_config = NULL;
conn = S2C(session);
- counter = 0;
-
- /* Initialize the recovery checkpoint snapshot variables to default values. */
- conn->recovery_ckpt_snap_min = WT_TXN_NONE;
- conn->recovery_ckpt_snap_max = WT_TXN_NONE;
- conn->recovery_ckpt_snapshot_count = 0;
/*
* WiredTiger versions 10.0.1 onward have a valid checkpoint snapshot on-disk. There was a bug
@@ -507,53 +455,22 @@ __recovery_set_checkpoint_snapshot(WT_SESSION_IMPL *session)
* available, assume that the snapshot is valid, otherwise restoring from a backup won't work.
*/
if (__wt_version_defined(conn->recovery_version) &&
- __wt_version_lte(conn->recovery_version, (WT_VERSION){10, 0, 0}))
+ __wt_version_lte(conn->recovery_version, (WT_VERSION){10, 0, 0})) {
+ /* Return an empty snapshot. */
+ conn->recovery_ckpt_snap_min = WT_TXN_NONE;
+ conn->recovery_ckpt_snap_max = WT_TXN_NONE;
+ conn->recovery_ckpt_snapshot = NULL;
+ conn->recovery_ckpt_snapshot_count = 0;
return (0);
+ }
/*
* Read the system checkpoint information from the metadata file and save the snapshot related
- * details of the last checkpoint for later query. This gets saved in the connection.
+ * details of the last checkpoint in the connection for later query.
*/
- WT_ERR_NOTFOUND_OK(
- __wt_metadata_search(session, WT_SYSTEM_CKPT_SNAPSHOT_URI, &sys_config), false);
- if (sys_config != NULL) {
- WT_CLEAR(cval);
- if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_MIN, &cval) == 0 &&
- cval.len != 0)
- conn->recovery_ckpt_snap_min = (uint64_t)cval.val;
-
- if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_MAX, &cval) == 0 &&
- cval.len != 0)
- conn->recovery_ckpt_snap_max = (uint64_t)cval.val;
-
- if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT_COUNT, &cval) == 0 &&
- cval.len != 0)
- conn->recovery_ckpt_snapshot_count = (uint32_t)cval.val;
-
- if (__wt_config_getones(session, sys_config, WT_SYSTEM_CKPT_SNAPSHOT, &cval) == 0 &&
- cval.len != 0) {
- __wt_config_subinit(session, &list, &cval);
- WT_ERR(__wt_calloc_def(
- session, conn->recovery_ckpt_snapshot_count, &conn->recovery_ckpt_snapshot));
- while (__wt_config_subget_next(&list, &k) == 0)
- conn->recovery_ckpt_snapshot[counter++] = (uint64_t)k.val;
- }
-
- /*
- * Make sure that checkpoint snapshot does not have any unexpected value. The recovered
- * snapshot array should contain the values between recovered snapshot min and recovered
- * snapshot max.
- */
- WT_ASSERT(session,
- conn->recovery_ckpt_snapshot == NULL ||
- (conn->recovery_ckpt_snapshot_count == counter &&
- conn->recovery_ckpt_snapshot[0] == conn->recovery_ckpt_snap_min &&
- conn->recovery_ckpt_snapshot[counter - 1] < conn->recovery_ckpt_snap_max));
- }
-
-err:
- __wt_free(session, sys_config);
- return (ret);
+ return (__wt_meta_read_checkpoint_snapshot(session, NULL, &conn->recovery_ckpt_snap_min,
+ &conn->recovery_ckpt_snap_max, &conn->recovery_ckpt_snapshot,
+ &conn->recovery_ckpt_snapshot_count, NULL));
}
/*
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index b4112f6f416..3b3a30b819b 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -1329,7 +1329,7 @@ __rollback_to_stable_btree(WT_SESSION_IMPL *session, wt_timestamp_t rollback_tim
return (0);
/* There is never anything to do for checkpoint handles. */
- if (session->dhandle->checkpoint != NULL)
+ if (WT_READING_CHECKPOINT(session))
return (0);
/* There is nothing to do on an empty tree. */
diff --git a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
index 6e12630ec5d..cf88ee2b423 100644
--- a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
+++ b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
@@ -164,7 +164,6 @@ real_checkpointer(void)
const char *checkpoint_config;
checkpoint_config = "use_timestamp=false";
- g.ts_oldest = 0;
verify_ts = WT_TS_NONE;
if (g.running == 0)
@@ -191,7 +190,7 @@ real_checkpointer(void)
* Check for consistency of online data, here we don't expect to see the version at the
* checkpoint just a consistent view across all tables.
*/
- if ((ret = verify_consistency(session, WT_TS_NONE)) != 0)
+ if ((ret = verify_consistency(session, WT_TS_NONE, false)) != 0)
return (log_print_err("verify_consistency (online)", ret, 1));
if (g.use_timestamps) {
@@ -216,12 +215,12 @@ real_checkpointer(void)
if (!g.running)
goto done;
- /*
- * Verify the content of the checkpoint at the stable timestamp. We can't verify checkpoints
- * without timestamps as such we don't perform a verification here in the non-timestamped
- * scenario.
- */
- if (g.use_timestamps && (ret = verify_consistency(session, verify_ts)) != 0)
+ /* Verify the checkpoint we just wrote. */
+ if ((ret = verify_consistency(session, WT_TS_NONE, true)) != 0)
+ return (log_print_err("verify_consistency (checkpoint)", ret, 1));
+
+ /* Verify the content of the database at the verify timestamp. */
+ if (g.use_timestamps && (ret = verify_consistency(session, verify_ts, false)) != 0)
return (log_print_err("verify_consistency (timestamps)", ret, 1));
/* Advance the oldest timestamp to the most recently set stable timestamp. */
@@ -318,12 +317,13 @@ do_cursor_prev(table_type type, WT_CURSOR *cursor)
* The key/values should match across all tables.
*/
int
-verify_consistency(WT_SESSION *session, wt_timestamp_t verify_ts)
+verify_consistency(WT_SESSION *session, wt_timestamp_t verify_ts, bool use_checkpoint)
{
WT_CURSOR **cursors;
uint64_t key_count;
int i, reference_table, ret, t_ret;
- char cfg_buf[128], next_uri[128];
+ char cfg_buf[128], ckpt_buf[128], next_uri[128];
+ const char *ckpt;
ret = t_ret = 0;
key_count = 0;
@@ -331,16 +331,30 @@ verify_consistency(WT_SESSION *session, wt_timestamp_t verify_ts)
if (cursors == NULL)
return (log_print_err("verify_consistency", ENOMEM, 1));
- if (verify_ts != WT_TS_NONE)
- testutil_check(__wt_snprintf(cfg_buf, sizeof(cfg_buf),
- "isolation=snapshot,read_timestamp=%" PRIx64 ",roundup_timestamps=read", verify_ts));
- else
- testutil_check(__wt_snprintf(cfg_buf, sizeof(cfg_buf), "isolation=snapshot"));
- testutil_check(session->begin_transaction(session, cfg_buf));
+ if (use_checkpoint) {
+ testutil_check(
+ __wt_snprintf(ckpt_buf, sizeof(ckpt_buf), "checkpoint=%s", g.checkpoint_name));
+ ckpt = ckpt_buf;
+ } else {
+ ckpt = NULL;
+ if (verify_ts != WT_TS_NONE)
+ testutil_check(__wt_snprintf(cfg_buf, sizeof(cfg_buf),
+ "isolation=snapshot,read_timestamp=%" PRIx64 ",roundup_timestamps=read", verify_ts));
+ else
+ testutil_check(__wt_snprintf(cfg_buf, sizeof(cfg_buf), "isolation=snapshot"));
+ testutil_check(session->begin_transaction(session, cfg_buf));
+ }
for (i = 0; i < g.ntables; i++) {
+ /*
+ * TODO: LSM doesn't currently support reading from checkpoints.
+ */
+ if (g.cookies[i].type == LSM && use_checkpoint) {
+ cursors[i] = NULL;
+ continue;
+ }
testutil_check(__wt_snprintf(next_uri, sizeof(next_uri), "table:__wt%04d", i));
- if ((ret = session->open_cursor(session, next_uri, NULL, NULL, &cursors[i])) != 0) {
+ if ((ret = session->open_cursor(session, next_uri, NULL, ckpt, &cursors[i])) != 0) {
(void)log_print_err("verify_consistency:session.open_cursor", ret, 1);
goto err;
}
@@ -356,7 +370,7 @@ verify_consistency(WT_SESSION *session, wt_timestamp_t verify_ts)
/* There's no way to verify LSM-only runs. */
if (cursors[reference_table] == NULL) {
- printf("LSM-only, skipping checkpoint verification\n");
+ printf("LSM-only, skipping verification\n");
goto err;
}
@@ -404,8 +418,8 @@ verify_consistency(WT_SESSION *session, wt_timestamp_t verify_ts)
}
}
}
- printf("Finished verifying with %d tables and %" PRIu64 " keys at timestamp %" PRIu64 "\n",
- g.ntables, key_count, verify_ts);
+ printf("Finished verifying%s with %d tables and %" PRIu64 " keys at timestamp %" PRIu64 "\n",
+ use_checkpoint ? " a checkpoint" : "", g.ntables, key_count, verify_ts);
fflush(stdout);
err:
@@ -413,7 +427,8 @@ err:
if (cursors[i] != NULL && (ret = cursors[i]->close(cursors[i])) != 0)
(void)log_print_err("verify_consistency:cursor close", ret, 1);
}
- testutil_check(session->commit_transaction(session, NULL));
+ if (!use_checkpoint)
+ testutil_check(session->commit_transaction(session, NULL));
free(cursors);
return (ret);
}
diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
index cc6964d7db3..8328c58aec2 100644
--- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
+++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
@@ -182,6 +182,7 @@ main(int argc, char *argv[])
/* Start time at 1 since 0 is not a valid timestamp. */
g.ts_stable = 1;
+ g.ts_oldest = 1;
printf("%s: process %" PRIu64 "\n", progname, (uint64_t)getpid());
for (cnt = 1; (runs == 0 || cnt <= runs) && g.status == 0; ++cnt) {
@@ -220,7 +221,7 @@ main(int argc, char *argv[])
break;
}
- verify_consistency(session, WT_TS_NONE);
+ verify_consistency(session, WT_TS_NONE, false);
goto run_complete;
}
@@ -343,7 +344,6 @@ cleanup(bool remove_dir)
{
g.running = 0;
g.ntables_created = 0;
- g.ts_oldest = 0;
if (remove_dir)
testutil_make_work_dir(g.home);
diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h
index 1d8bd72ab0c..280a6bfac24 100644
--- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h
+++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h
@@ -94,4 +94,4 @@ int log_print_err_worker(const char *, int, const char *, int, int);
void start_checkpoints(void);
int start_workers(void);
const char *type_to_string(table_type);
-int verify_consistency(WT_SESSION *, wt_timestamp_t);
+int verify_consistency(WT_SESSION *, wt_timestamp_t, bool);
diff --git a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
index 609b3718f1a..c60e7140bf0 100644
--- a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c
@@ -120,14 +120,9 @@ op(WT_SESSION *session, WT_RAND_STATE *rnd, WT_CURSOR **cpp)
/* Loop to open an object handle. */
for (i = __wt_random(rnd) % uris; !done; __wt_yield()) {
- /*
- * Use a checkpoint handle for 50% of reads.
- *
- * FIXME-WT-5927: Checkpoint cursors are known to have issues in durable history so we've
- * removing the use of checkpoint handles in this test. As part of WT-5927, we should either
- * re-enable the testing of checkpoint cursors or remove this comment.
- */
- ret = session->open_cursor(session, uri_list[i], NULL, NULL, &cursor);
+ /* Use a checkpoint handle for 50% of reads. */
+ ret = session->open_cursor(session, uri_list[i], NULL,
+ readonly && (i % 2 == 0) ? "checkpoint=WiredTigerCheckpoint" : NULL, &cursor);
if (ret != EBUSY) {
testutil_check(ret);
break;
diff --git a/src/third_party/wiredtiger/test/suite/test_bulk02.py b/src/third_party/wiredtiger/test/suite/test_bulk02.py
index 107c0dc9417..7de95a83e3e 100644
--- a/src/third_party/wiredtiger/test/suite/test_bulk02.py
+++ b/src/third_party/wiredtiger/test/suite/test_bulk02.py
@@ -73,8 +73,8 @@ class test_bulkload_checkpoint(wttest.WiredTigerTestCase, suite_subprocess):
# Close the bulk cursor.
cursor.close()
- # In the case of named checkpoints, verify they're still there,
- # reflecting an empty file.
+ # The checkpoint skipped the table because of the open bulk cursor, so although the
+ # checkpoint itself appears to exist, the table isn't in it and can't be opened.
if self.ckpt_type == 'named':
self.assertRaises(wiredtiger.WiredTigerError,
lambda: self.session.open_cursor(self.uri, None, 'checkpoint=myckpt'))
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint01.py b/src/third_party/wiredtiger/test/suite/test_checkpoint01.py
index 75d3485be0a..89da3973dbe 100755
--- a/src/third_party/wiredtiger/test/suite/test_checkpoint01.py
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint01.py
@@ -402,6 +402,8 @@ class test_checkpoint_empty(wttest.WiredTigerTestCase):
cursor = self.session.open_cursor(self.uri, None, "checkpoint=ckpt")
self.assertEquals(cursor.next(), wiredtiger.WT_NOTFOUND)
+ # Check that if we create an unnamed and then a named checkpoint, opening
+ # WiredTigerCheckpoint opens the most recent (the named) checkpoint.
def test_checkpoint_empty_six(self):
self.session.create(self.uri, "key_format=S,value_format=S")
self.session.checkpoint()
@@ -416,7 +418,25 @@ class test_checkpoint_empty(wttest.WiredTigerTestCase):
cursor = self.session.open_cursor(
self.uri, None, "checkpoint=WiredTigerCheckpoint")
+ self.assertEquals(cursor.next(), 0)
+
+ # Check that if we create a named and then an unnamed checkpoint, opening
+ # WiredTigerCheckpoint opens the most recent (the unnamed) checkpoint.
+ def test_checkpoint_empty_seven(self):
+ self.session.create(self.uri, "key_format=S,value_format=S")
+ self.session.checkpoint('name=ckpt')
+ cursor = self.session.open_cursor(
+ self.uri, None, "checkpoint=WiredTigerCheckpoint")
self.assertEquals(cursor.next(), wiredtiger.WT_NOTFOUND)
+ cursor.close()
+
+ cursor = self.session.open_cursor(self.uri, None)
+ cursor["key"] = "value"
+ self.session.checkpoint()
+
+ cursor = self.session.open_cursor(
+ self.uri, None, "checkpoint=WiredTigerCheckpoint")
+ self.assertEquals(cursor.next(), 0)
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint10.py b/src/third_party/wiredtiger/test/suite/test_checkpoint10.py
new file mode 100644
index 00000000000..57c4b98de7f
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint10.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import threading, time
+import wttest
+from wtthread import checkpoint_thread, named_checkpoint_thread
+from helper import simulate_crash_restart
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint10.py
+# Test what happens if we create an inconsistent checkpoint and then try to
+# open it for read. No timestamps in this version.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+ session_config = 'isolation=snapshot'
+
+ format_values = [
+ ('column-fix', dict(key_format='r', value_format='8t',
+ extraconfig=',allocation_size=512,leaf_page_max=512')),
+ ('column', dict(key_format='r', value_format='S', extraconfig='')),
+ ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+ ]
+ overlap_values = [
+ ('no-overlap', dict(do_overlap=False)),
+ ('overlap', dict(do_overlap=True)),
+ ]
+ name_values = [
+ # Combining a reopen with an unnamed checkpoint will not work as intended because the
+ # reopen makes a new checkpoint, so that combination is not listed here.
+ ('named', dict(second_checkpoint='second_checkpoint', do_reopen=False)),
+ ('named_reopen', dict(second_checkpoint='second_checkpoint', do_reopen=True)),
+ ('unnamed', dict(second_checkpoint=None, do_reopen=False)),
+ ]
+ log_values = [
+ ('nonlogged', dict(do_log=False)),
+ ('logged', dict(do_log=True)),
+ ]
+ scenarios = make_scenarios(format_values, overlap_values, name_values, log_values)
+
+ def conn_config(self):
+ cfg = 'statistics=(all),timing_stress_for_test=[checkpoint_slow]'
+ if self.do_log:
+ cfg += ',log=(enabled=true)'
+ return cfg
+
+ def large_updates(self, uri, ds, nrows, value):
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value
+ if i % 101 == 0:
+ self.session.commit_transaction()
+ self.session.begin_transaction()
+ self.session.commit_transaction()
+ cursor.close()
+
+ # "expected" is a list of maps from values to counts of values.
+ def check(self, ds, ckpt, expected):
+ if ckpt is None:
+ ckpt = 'WiredTigerCheckpoint'
+ cursor = self.session.open_cursor(ds.uri, None, 'checkpoint=' + ckpt)
+ #self.session.begin_transaction()
+ seen = {}
+ for k, v in cursor:
+ if v in seen:
+ seen[v] += 1
+ else:
+ seen[v] = 1
+ #for v in seen:
+ # self.prout("seen {}: {}".format(v if self.value_format == '8t' else v[0], seen[v]))
+ self.assertTrue(seen in expected)
+ #self.session.rollback_transaction()
+ cursor.close()
+
+ def test_checkpoint(self):
+ uri = 'table:checkpoint10'
+ nrows = 10000
+ overlap = 5000 if self.do_overlap else 0
+ morerows = 10000
+
+ # Create a table.
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config=self.extraconfig)
+ ds.populate()
+
+ if self.value_format == '8t':
+ morerows *= 5
+ value_a = 97
+ value_b = 98
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+
+ # Write some data.
+ self.large_updates(uri, ds, nrows, value_a)
+ # Write this data out now so we aren't waiting for it while trying to
+ # race with the later data.
+ self.session.checkpoint()
+
+ # Write some more data, and hold the transaction open.
+ session2 = self.conn.open_session()
+ cursor2 = session2.open_cursor(uri)
+ session2.begin_transaction()
+ for i in range(nrows - overlap + 1, nrows + morerows + 1):
+ cursor2[ds.key(i)] = value_b
+
+ # Checkpoint in the background.
+ done = threading.Event()
+ if self.second_checkpoint is None:
+ ckpt = checkpoint_thread(self.conn, done)
+ else:
+ ckpt = named_checkpoint_thread(self.conn, done, self.second_checkpoint)
+ try:
+ ckpt.start()
+
+ # Wait for checkpoint to start before committing.
+ ckpt_started = 0
+ while not ckpt_started:
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ ckpt_started = stat_cursor[stat.conn.txn_checkpoint_running][2]
+ stat_cursor.close()
+ time.sleep(1)
+
+ session2.commit_transaction()
+ finally:
+ done.set()
+ ckpt.join()
+
+ # Reopen if desired to cycle the write generations.
+ if self.do_reopen:
+ self.reopen_conn()
+
+ # There are two states we should be able to produce: one with the original
+ # data and one with the additional data.
+ #
+ # It is ok to see either in the checkpoint (since the checkpoint could
+ # reasonably include or not include the second txn) but not ok to see
+ # an intermediate state.
+ expected_a = { value_a: nrows }
+ expected_b = { value_a: nrows - overlap, value_b: overlap + morerows }
+ expected = [expected_a, expected_b]
+
+ # For FLCS, because the table expands under uncommitted data, we should
+ # see zeros once the additional data's been written (that is, always strictly
+ # before the checkpoint) if we don't see the actual values.
+ expected_flcs_a = { value_a: nrows, 0: morerows }
+ expected_flcs = [expected_flcs_a, expected_b]
+
+ # Now read the checkpoint.
+ self.check(ds, self.second_checkpoint, expected_flcs if self.value_format == '8t' else expected)
+
+ # If we haven't died yet, pretend to crash and run RTS to see if the
+ # checkpoint was inconsistent.
+ # (This only works if we didn't reopen the connection, so don't bother if we did.)
+ if not self.do_reopen:
+ simulate_crash_restart(self, ".", "RESTART")
+
+ # Make sure we did get an inconsistent checkpoint.
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ inconsistent_ckpt = stat_cursor[stat.conn.txn_rts_inconsistent_ckpt][2]
+ stat_cursor.close()
+ self.assertGreater(inconsistent_ckpt, 0)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint11.py b/src/third_party/wiredtiger/test/suite/test_checkpoint11.py
new file mode 100644
index 00000000000..48ab46da033
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint11.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import threading, time
+import wttest
+from wtthread import checkpoint_thread, named_checkpoint_thread
+from helper import simulate_crash_restart
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint11.py
+# Test what happens if we create an inconsistent checkpoint and then try to
+# open it for read. This version uses timestamps.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+ conn_config = 'statistics=(all),timing_stress_for_test=[checkpoint_slow]'
+ session_config = 'isolation=snapshot'
+
+ format_values = [
+ ('column-fix', dict(key_format='r', value_format='8t',
+ extraconfig=',allocation_size=512,leaf_page_max=512')),
+ ('column', dict(key_format='r', value_format='S', extraconfig='')),
+ ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+ ]
+ overlap_values = [
+ ('overlap', dict(do_overlap=True)),
+ ('no-overlap', dict(do_overlap=False, long_only=True)),
+ ]
+ stable_ts_values = [
+ ('5', dict(stable_ts=5)),
+ ('15', dict(stable_ts=15, long_only=True)),
+ ('25', dict(stable_ts=25)),
+ # Cannot do 35: we need to commit at 30 after starting the checkpoint.
+ ]
+ advance_values = [
+ ('no-advance', dict(do_advance=False)),
+ ('advance', dict(do_advance=True)),
+ ]
+ name_values = [
+ # Reopening and unnamed checkpoints will not work as intended because the reopen makes
+ # a new checkpoint.
+ ('named', dict(second_checkpoint='second_checkpoint', do_reopen=False)),
+ ('named_reopen', dict(second_checkpoint='second_checkpoint', do_reopen=True)),
+ ('unnamed', dict(second_checkpoint=None, do_reopen=False, long_only=True)),
+ ]
+ scenarios = make_scenarios(format_values,
+ overlap_values, stable_ts_values, advance_values, name_values)
+
+ def large_updates(self, uri, ds, nrows, value, ts):
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value
+ if i % 101 == 0:
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+ self.session.begin_transaction()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+ cursor.close()
+
+ # "ts_expected" is a map from timestamps to lists of maps from values to counts of values.
+ # That is, ts_expected[ts] is a list of maps from values to counts of values; the map of
+ # values to counts that we see should be in that list.
+ def check(self, ds, ckpt, ts_expected):
+ if ckpt is None:
+ ckpt = 'WiredTigerCheckpoint'
+ ts_seen = {}
+ for ts in ts_expected:
+ cfg = 'checkpoint=' + ckpt
+ if ts is not None:
+ cfg += ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(ts) + ')'
+ cursor = self.session.open_cursor(ds.uri, None, cfg)
+ #self.session.begin_transaction()
+ seen = {}
+ for k, v in cursor:
+ if v in seen:
+ seen[v] += 1
+ else:
+ seen[v] = 1
+ #for v in seen:
+ # pv = v if self.value_format == '8t' else v[0]
+ # self.prout("at {} seen {}: {}".format(ts, pv, seen[v]))
+ ts_seen[ts] = seen
+ #self.session.rollback_transaction()
+ cursor.close()
+ # Check in a separate loop so that all the values have been examined before failing.
+ for ts in ts_expected:
+ expected = ts_expected[ts]
+ seen = ts_seen[ts]
+ self.assertTrue(seen in expected)
+
+ def test_checkpoint(self):
+ uri = 'table:checkpoint11'
+ nrows = 10000
+ overlap = 5000 if self.do_overlap else 0
+ morerows = 10000
+
+ # Create a table.
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config=self.extraconfig)
+ ds.populate()
+
+ if self.value_format == '8t':
+ morerows *= 5
+ value_a = 97
+ value_b = 98
+ value_c = 99
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+ value_c = "ccccc" * 100
+
+ # Pin oldest and stable timestamps to 5.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(5) +
+ ',stable_timestamp=' + self.timestamp_str(5))
+
+ # Write some data at time 10.
+ self.large_updates(uri, ds, nrows, value_a, 10)
+
+ # Write some more data at time 20.
+ self.large_updates(uri, ds, nrows, value_b, 20)
+
+ # Write this data out now so we aren't waiting for it while trying to
+ # race with the later data.
+ self.session.checkpoint()
+
+ # Write some further data, and hold the transaction open. Eventually commit at time 30.
+ session2 = self.conn.open_session()
+ cursor2 = session2.open_cursor(uri)
+ session2.begin_transaction()
+ #for i in range(1, nrows + 1, 10):
+ for i in range(nrows - overlap + 1, nrows + morerows + 1):
+ cursor2[ds.key(i)] = value_c
+
+ # Optionally move stable forward.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(self.stable_ts))
+
+ # Checkpoint in the background.
+ done = threading.Event()
+ if self.second_checkpoint is None:
+ ckpt = checkpoint_thread(self.conn, done)
+ else:
+ ckpt = named_checkpoint_thread(self.conn, done, self.second_checkpoint)
+ try:
+ ckpt.start()
+
+ # Wait for checkpoint to start before committing.
+ ckpt_started = 0
+ while not ckpt_started:
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ ckpt_started = stat_cursor[stat.conn.txn_checkpoint_running][2]
+ stat_cursor.close()
+ time.sleep(1)
+
+ session2.commit_transaction('commit_timestamp=' + self.timestamp_str(30))
+ finally:
+ done.set()
+ ckpt.join()
+
+ # Reopen if desired to cycle the write generations.
+ if self.do_reopen:
+ self.reopen_conn()
+
+ # There are two states we should be able to produce. In all cases we should
+ # see all the original data (value_a at time 10, value_b at time 20). If
+ # the value_c transaction appears in the checkpoint we should see all the
+ # additional data as well (value_c at time 30); otherwise reading past time 20
+ # should yield value_b.
+ #
+ # It is ok to see either state in the checkpoint (since the checkpoint could
+ # reasonably include or not include the second txn) but not ok to see
+ # an intermediate state, and in particular we must not see _part_ of the value_c
+ # data.
+ expected_5 = {}
+ expected_15 = { value_a: nrows }
+ expected_25 = { value_b: nrows }
+ expected_35_a = { value_b: nrows }
+ expected_35_b = { value_b: nrows - overlap, value_c: overlap + morerows }
+ expected = {
+ 5: [expected_5],
+ 15: [expected_15],
+ 25: [expected_25],
+ 35: [expected_35_a, expected_35_b]
+ }
+ # When reading without an explicit timestamp, we should see the state as of
+ # the stable timestamp when the checkpoint was taken.
+ expected[None] = expected[self.stable_ts]
+
+ # For FLCS, because the table expands under uncommitted data, we should
+ # see zeros once the additional data's been written (that is, always strictly
+ # before the checkpoint) if we don't see the actual values.
+ expected_5_flcs = { 0: nrows + morerows }
+ expected_15_flcs = { value_a: nrows, 0: morerows }
+ expected_25_flcs = { value_b: nrows, 0: morerows }
+ expected_35_flcs_a = { value_b: nrows, 0: morerows }
+ expected_35_flcs_b = { value_b: nrows - overlap, value_c: overlap + morerows }
+ expected_flcs = {
+ 5: [expected_5_flcs],
+ 15: [expected_15_flcs],
+ 25: [expected_25_flcs],
+ 35: [expected_35_flcs_a, expected_35_flcs_b]
+ }
+ expected_flcs[None] = expected_flcs[self.stable_ts]
+
+ if self.do_advance:
+ # Move oldest up in case that interferes with handling the checkpoint.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(50))
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(50))
+
+ # Now read the checkpoint.
+ self.check(ds, self.second_checkpoint, expected_flcs if self.value_format == '8t' else expected)
+
+ # If we haven't died yet, pretend to crash and run RTS to see if the
+ # checkpoint was inconsistent.
+ # (This only works if we didn't reopen the connection, so don't bother if we did.)
+ if not self.do_reopen:
+ simulate_crash_restart(self, ".", "RESTART")
+
+ # Make sure we did get an inconsistent checkpoint.
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ inconsistent_ckpt = stat_cursor[stat.conn.txn_rts_inconsistent_ckpt][2]
+ stat_cursor.close()
+ self.assertGreater(inconsistent_ckpt, 0)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint12.py b/src/third_party/wiredtiger/test/suite/test_checkpoint12.py
new file mode 100644
index 00000000000..71e432bd196
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint12.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint12.py
+# Make sure you can't read from a checkpoint while you have a prepared transaction.
+# (This is to make sure that any transaction shenanigans involved in reading from
+# checkpoints don't interfere with the blanket ban on doing other operations after
+# preparing.)
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+ conn_config = ''
+ session_config = 'isolation=snapshot'
+
+ operation_values = [
+ ('search', dict(op='search')),
+ ('next', dict(op='next')),
+ ('prev', dict(op='prev')),
+ ('search_near', dict(op='search_near')),
+ ]
+ scenarios = make_scenarios(operation_values)
+
+ # No need to run this on more than one btree type.
+ key_format = 'r'
+ value_format = 'S'
+
+ def large_updates(self, uri, ds, nrows, value, ts):
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value
+ if i % 101 == 0:
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+ self.session.begin_transaction()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+ cursor.close()
+
+ def operate(self, ckpt_cursor):
+ if self.op == 'search':
+ ckpt_cursor.search()
+ elif self.op == 'next':
+ ckpt_cursor.next()
+ elif self.op == 'prev':
+ ckpt_cursor.prev()
+ elif self.op == 'search_near':
+ ckpt_cursor.search_near()
+ else:
+ self.assertTrue(False)
+
+ def test_checkpoint(self):
+ uri = 'table:checkpoint12'
+ nrows = 1000
+
+ # Create a table.
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format)
+ ds.populate()
+
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ value_c = 99
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+ value_c = "ccccc" * 100
+
+ # Pin oldest and stable timestamps to 5.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(5) +
+ ',stable_timestamp=' + self.timestamp_str(5))
+
+ # Write some data at time 10.
+ self.large_updates(uri, ds, nrows, value_a, 10)
+
+ # Make a checkpoint.
+ self.session.checkpoint()
+
+ # Write some more data at time 20.
+ self.large_updates(uri, ds, nrows, value_a, 20)
+
+ # Open the checkpoint.
+ ckpt_cursor = self.session.open_cursor(uri, None, 'checkpoint=WiredTigerCheckpoint')
+ ckpt_cursor.set_key(ds.key(1))
+
+ # Write some further data, and prepare it at time 30.
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows // 2):
+ cursor[ds.key(i)] = value_b
+ self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(30))
+
+ # Now try reading the checkpoint.
+ msg = '/Invalid argument/'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.operate(ckpt_cursor), msg)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint13.py b/src/third_party/wiredtiger/test/suite/test_checkpoint13.py
new file mode 100644
index 00000000000..f6f08cc9d4f
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint13.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint13.py: API restrictions on checkpoint cursors
+#
+# - You may read from a checkpoint cursor while in a transaction.
+# (The checkpoint cursor is its own private transaction.)
+#
+# - You may not read from a checkpoint prior to its oldest timestamp.
+#
+# - You may not regen or drop a named checkpoint with a cursor open.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+ conn_config = ''
+ session_config = 'isolation=snapshot'
+
+ ckptname_values = [
+ ('named', dict(checkpoint_name='my_ckpt')),
+ ('unnamed', dict(checkpoint_name=None)),
+ ]
+ scenarios = make_scenarios(ckptname_values)
+
+ # No need to run this on more than one btree type.
+ key_format = 'r'
+ value_format = 'S'
+
+ def updates(self, uri, ds, nrows, value, ts):
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value
+ if i % 101 == 0:
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+ self.session.begin_transaction()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+ cursor.close()
+
+ def test_checkpoint(self):
+ uri = 'table:checkpoint13'
+ nrows = 10
+
+ # Create a table.
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format)
+ ds.populate()
+
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ value_c = 99
+ else:
+ value_a = "aaaaa" * 10
+ value_b = "bbbbb" * 10
+ value_c = "ccccc" * 10
+
+ # Set oldest and stable timestamps to 10.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) +
+ ',stable_timestamp=' + self.timestamp_str(10))
+
+ # Write some data at time 20.
+ self.updates(uri, ds, nrows, value_a, 20)
+
+ # Make it stable.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(20))
+
+ # Make a checkpoint.
+ if self.checkpoint_name is not None:
+ self.session.checkpoint('name=' + self.checkpoint_name)
+ read_name = self.checkpoint_name
+ else:
+ self.session.checkpoint()
+ read_name = 'WiredTigerCheckpoint'
+
+ # Write some more data at time 30 to make sure it's not seen.
+ self.updates(uri, ds, nrows, value_b, 30)
+
+ # Open the checkpoint.
+ ckpt_cursor = self.session.open_cursor(uri, None, 'checkpoint=' + read_name)
+
+ # We should be able to read.
+ self.assertEqual(ckpt_cursor[ds.key(1)], value_a)
+
+ # We should also be able to read within a transaction.
+ self.session.begin_transaction()
+ self.assertEqual(ckpt_cursor[ds.key(1)], value_a)
+ self.session.rollback_transaction()
+
+ # Close this cursor.
+ ckpt_cursor.close()
+
+ # Opening the cursor at time 10 should produce no data.
+ ckpt_cursor = self.session.open_cursor(uri, None, 'checkpoint=' + read_name +
+ ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(10) + ')')
+ ckpt_cursor.set_key(ds.key(1))
+ self.assertEqual(ckpt_cursor.search(), wiredtiger.WT_NOTFOUND)
+ ckpt_cursor.close()
+
+ # Opening the cursor at time 5 should fail.
+ def tryit():
+ return self.session.open_cursor(uri, None, 'checkpoint=' + read_name +
+ ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(5) + ')')
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: tryit(), '/before the checkpoint oldest/')
+
+ if self.checkpoint_name is not None:
+ # Open the cursor.
+ ckpt_cursor = self.session.open_cursor(uri, None, 'checkpoint=' + read_name)
+
+ # Updating the checkpoint should fail.
+ def tryregen():
+ self.session.checkpoint('name=' + self.checkpoint_name)
+ # This produces EBUSY, but self.raisesBusy() from wttest does not work.
+ # Including "dropped" in the expected message is not optimal, since we aren't
+ # dropping the checkpoint (that regenerating it drops it first is an internal
+ # detail) but I guess it can't be helped.
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: tryregen(), '/cannot be dropped/')
+
+ # Dropping the checkpoint should fail.
+ def trydrop():
+ self.session.checkpoint('drop=(' + self.checkpoint_name + ')')
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: trydrop(), '/cannot be dropped/')
+
+ ckpt_cursor.close()
+
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint14.py b/src/third_party/wiredtiger/test/suite/test_checkpoint14.py
new file mode 100644
index 00000000000..0fdda7f8975
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint14.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import threading, time
+import wttest
+from wtthread import checkpoint_thread, named_checkpoint_thread
+from helper import simulate_crash_restart
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint14.py
+#
+# Make sure each checkpoint has its own snapshot by creating two successive
+# inconsistent checkpoints and reading both of them.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+ conn_config = 'statistics=(all),timing_stress_for_test=[checkpoint_slow]'
+ session_config = 'isolation=snapshot'
+
+ format_values = [
+ ('column-fix', dict(key_format='r', value_format='8t',
+ extraconfig=',allocation_size=512,leaf_page_max=512')),
+ ('column', dict(key_format='r', value_format='S', extraconfig='')),
+ ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+ ]
+ name_values = [
+ ('nn', dict(first_checkpoint='first_checkpoint', second_checkpoint='second_checkpoint')),
+ ('nu', dict(first_checkpoint='first_checkpoint', second_checkpoint=None)),
+ # This doesn't work because there's no way to open the first unnamed checkpoint.
+ #('un', dict(first_checkpoint=None, second_checkpoint='second_checkpoint')),
+ ]
+ scenarios = make_scenarios(format_values, name_values)
+
+ def large_updates(self, uri, ds, nrows, value):
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value
+ if i % 101 == 0:
+ self.session.commit_transaction()
+ self.session.begin_transaction()
+ self.session.commit_transaction()
+ cursor.close()
+
+ # Check that the checkpoint holds exactly "nrows" rows, each with the value "value".
+ def check(self, ds, ckpt, nrows, value):
+ if ckpt is None:
+ ckpt = 'WiredTigerCheckpoint'
+ cursor = self.session.open_cursor(ds.uri, None, 'checkpoint=' + ckpt)
+ #self.session.begin_transaction()
+ count = 0
+ for k, v in cursor:
+ self.assertEqual(v, value)
+ count += 1
+ self.assertEqual(count, nrows)
+ #self.session.rollback_transaction()
+ cursor.close()
+
+ def test_checkpoint(self):
+ uri = 'table:checkpoint14'
+ nrows = 10000
+
+ # Create a table.
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config=self.extraconfig)
+ ds.populate()
+
+ if self.value_format == '8t':
+ nrows *= 5
+ value_a = 97
+ value_b = 98
+ value_c = 99
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+ value_c = "ccccc" * 100
+
+ # Write some baseline data.
+ self.large_updates(uri, ds, nrows, value_a)
+ # Write this data out now so we aren't waiting for it while trying to
+ # race with the later data.
+ self.session.checkpoint()
+
+ # Write some more data, and hold the transaction open.
+ session2 = self.conn.open_session()
+ cursor2 = session2.open_cursor(uri)
+ session2.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor2[ds.key(i)] = value_b
+
+ # Checkpoint in the background.
+ done = threading.Event()
+ if self.first_checkpoint is None:
+ ckpt = checkpoint_thread(self.conn, done)
+ else:
+ ckpt = named_checkpoint_thread(self.conn, done, self.first_checkpoint)
+ try:
+ ckpt.start()
+
+ # Wait for checkpoint to start before committing.
+ ckpt_started = 0
+ while not ckpt_started:
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ ckpt_started = stat_cursor[stat.conn.txn_checkpoint_running][2]
+ stat_cursor.close()
+ time.sleep(1)
+
+ session2.commit_transaction()
+ finally:
+ done.set()
+ ckpt.join()
+
+ # Rinse and repeat.
+ session2.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor2[ds.key(i)] = value_c
+
+ # Checkpoint in the background.
+ done = threading.Event()
+ if self.second_checkpoint is None:
+ ckpt = checkpoint_thread(self.conn, done)
+ else:
+ ckpt = named_checkpoint_thread(self.conn, done, self.second_checkpoint)
+ try:
+ ckpt.start()
+ # Sleep a bit so that checkpoint starts before committing last transaction.
+ time.sleep(2)
+ session2.commit_transaction()
+ finally:
+ done.set()
+ ckpt.join()
+
+ # Other tests check for whether the visibility of a partially-written transaction
+ # is handled correctly. Here we're interested in whether the visibility mechanism
+ # is using the right snapshot for the checkpoint we're reading. So insist that we
+ # not see the value_b transaction in the first checkpoint, or the value_c transaction
+ # in the second checkpoint. If test machine lag causes either transaction to commit
+ # before the checkpoint starts, we'll see value_b in the first checkpoint and/or
+ # value_c in the second. But also, if we end up using the second checkpoint's snapshot
+ # for the first checkpoint, we'll see value_b. So if this happens more than once in a
+ # blue moon we should probably strengthen the test so we can more reliably distinguish
+ # the cases, probably by doing a third transaction/checkpoint pair.
+ #
+ # If we end up using the first checkpoint's snapshot for reading the second checkpoint,
+ # we'll most likely see no data at all; that would be a serious failure if it happened.
+
+ # Read the checkpoints.
+ self.check(ds, self.first_checkpoint, nrows, value_a)
+ self.check(ds, self.second_checkpoint, nrows, value_b)
+
+ # If we haven't died yet, pretend to crash, and run RTS to see if the
+ # (second) checkpoint was inconsistent. Unfortunately we can't readily
+ # check on both.
+ simulate_crash_restart(self, ".", "RESTART")
+
+ # Make sure we did get an inconsistent checkpoint.
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ inconsistent_ckpt = stat_cursor[stat.conn.txn_rts_inconsistent_ckpt][2]
+ stat_cursor.close()
+ self.assertGreater(inconsistent_ckpt, 0)
+
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint15.py b/src/third_party/wiredtiger/test/suite/test_checkpoint15.py
new file mode 100644
index 00000000000..fd387aee49e
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint15.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import threading, time
+import wttest
+import wiredtiger
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint15.py
+#
+# Make sure each checkpoint has its own timestamp info by writing out
+# multiple checkpoints with different times and reading all of them.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+ session_config = 'isolation=snapshot'
+
+ format_values = [
+ ('column-fix', dict(key_format='r', value_format='8t',
+ extraconfig=',allocation_size=512,leaf_page_max=512')),
+ ('column', dict(key_format='r', value_format='S', extraconfig='')),
+ ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+ ]
+ name_values = [
+ ('nnn', dict(first_checkpoint='first_checkpoint',
+ second_checkpoint='second_checkpoint',
+ third_checkpoint='third_checkpoint')),
+ ('nnu', dict(first_checkpoint='first_checkpoint',
+ second_checkpoint='second_checkpoint',
+ third_checkpoint=None)),
+ # These don't work because there's no way to open an unnamed checkpoint if it's
+ # not the most recent checkpoint.
+ #('nun', dict(first_checkpoint='first_checkpoint',
+ # second_checkpoint=None,
+ # third_checkpoint='third_checkpoint')),
+ #('unn', dict(first_checkpoint=None,
+ # second_checkpoint='second_checkpoint',
+ # third_checkpoint='third_checkpoint')),
+ ]
+ scenarios = make_scenarios(format_values, name_values)
+
+ def large_updates(self, uri, ds, nrows, value, ts):
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value
+ if i % 101 == 0:
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+ self.session.begin_transaction()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+ cursor.close()
+
+ def do_checkpoint(self, ckpt_name):
+ if ckpt_name is None:
+ self.session.checkpoint()
+ else:
+ self.session.checkpoint('name=' + ckpt_name)
+
+ def check(self, ds, ckpt, nrows, value, ts):
+ if ckpt is None:
+ ckpt = 'WiredTigerCheckpoint'
+ if ts is None:
+ tsstr = ''
+ else:
+ tsstr = ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(ts) + ')'
+ cursor = self.session.open_cursor(ds.uri, None, 'checkpoint=' + ckpt + tsstr)
+ #self.session.begin_transaction()
+ count = 0
+ for k, v in cursor:
+ self.assertEqual(v, value)
+ count += 1
+ self.assertEqual(count, nrows)
+ #self.session.rollback_transaction()
+ cursor.close()
+
+ def checkfail(self, ds, ckpt, ts):
+ if ckpt is None:
+ ckpt = 'WiredTigerCheckpoint'
+ if ts is None:
+ tsstr = ''
+ else:
+ tsstr = ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(ts) + ')'
+ def tryit():
+ return self.session.open_cursor(ds.uri, None, 'checkpoint=' + ckpt + tsstr)
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: tryit(), '/before the checkpoint oldest/')
+
+ def test_checkpoint(self):
+ uri = 'table:checkpoint15'
+ nrows = 1000
+
+ # Create a table.
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config=self.extraconfig)
+ ds.populate()
+
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ value_c = 99
+ value_d = 100
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+ value_c = "ccccc" * 100
+ value_d = "ddddd" * 100
+
+ # Set oldest and stable to 5.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(5) +
+ ',stable_timestamp=' + self.timestamp_str(5))
+
+ # Write some stable baseline data at time 10.
+ self.large_updates(uri, ds, nrows, value_a, 10)
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10))
+ self.session.checkpoint()
+
+ # Write some more data at time 20.
+ self.large_updates(uri, ds, nrows, value_b, 20)
+
+ # Mark this data stable and checkpoint it.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(20))
+ self.do_checkpoint(self.first_checkpoint)
+
+ # Rinse and repeat twice more. Move oldest forward too.
+ self.large_updates(uri, ds, nrows, value_c, 30)
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(30))
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(15))
+ self.do_checkpoint(self.second_checkpoint)
+
+ self.large_updates(uri, ds, nrows, value_d, 40)
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(40))
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(25))
+ self.do_checkpoint(self.third_checkpoint)
+
+ # Read the checkpoints.
+ self.check(ds, self.first_checkpoint, nrows, value_a, 10)
+ self.check(ds, self.first_checkpoint, nrows, value_b, 20)
+ self.check(ds, self.first_checkpoint, nrows, value_b, None)
+
+ self.checkfail(ds, self.second_checkpoint, 10)
+ self.check(ds, self.second_checkpoint, nrows, value_b, 20)
+ self.check(ds, self.second_checkpoint, nrows, value_c, 30)
+ self.check(ds, self.second_checkpoint, nrows, value_c, None)
+
+ self.checkfail(ds, self.third_checkpoint, 10)
+ self.checkfail(ds, self.third_checkpoint, 20)
+ self.check(ds, self.third_checkpoint, nrows, value_c, 30)
+ self.check(ds, self.third_checkpoint, nrows, value_d, 40)
+ self.check(ds, self.third_checkpoint, nrows, value_d, None)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint16.py b/src/third_party/wiredtiger/test/suite/test_checkpoint16.py
new file mode 100644
index 00000000000..c0d84f1ddda
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint16.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import threading, time
+import wttest
+import wiredtiger
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint16.py
+#
+# Make sure a table that's clean when checkpointed can still be read in
+# that checkpoint.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+    # Scenario: two tables are written and checkpointed, then only the second
+    # one is modified before another checkpoint. The first table must still be
+    # readable from that later checkpoint.
+    session_config = 'isolation=snapshot'
+
+    # Key/value formats (plus extra table configuration) to run against.
+    format_values = [
+        ('column-fix', dict(key_format='r', value_format='8t',
+            extraconfig=',allocation_size=512,leaf_page_max=512')),
+        ('column', dict(key_format='r', value_format='S', extraconfig='')),
+        ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+    ]
+    # Run with the second checkpoint both named and unnamed.
+    name_values = [
+        ('named', dict(second_checkpoint='second_checkpoint')),
+        ('unnamed', dict(second_checkpoint=None)),
+    ]
+    scenarios = make_scenarios(format_values, name_values)
+
+    # Set keys 1..nrows of the dataset to value, starting a fresh transaction
+    # after every key number that is a multiple of 101.
+    def large_updates(self, ds, nrows, value):
+        session = self.session
+        writer = session.open_cursor(ds.uri)
+        session.begin_transaction()
+        for keyno in range(1, nrows + 1):
+            writer[ds.key(keyno)] = value
+            if keyno % 101 != 0:
+                continue
+            session.commit_transaction()
+            session.begin_transaction()
+        session.commit_transaction()
+        writer.close()
+
+    # Take a checkpoint; a name of None means an unnamed checkpoint.
+    def do_checkpoint(self, ckpt_name):
+        if ckpt_name is not None:
+            self.session.checkpoint('name=' + ckpt_name)
+            return
+        self.session.checkpoint()
+
+    # Scan the table through the given checkpoint (WiredTigerCheckpoint when
+    # ckpt is None) and require exactly nrows rows, every one holding value.
+    def check(self, ds, ckpt, nrows, value):
+        ckpt_name = 'WiredTigerCheckpoint' if ckpt is None else ckpt
+        reader = self.session.open_cursor(ds.uri, None, 'checkpoint=' + ckpt_name)
+        #self.session.begin_transaction()
+        seen = 0
+        for _, v in reader:
+            self.assertEqual(v, value)
+            seen += 1
+        self.assertEqual(seen, nrows)
+        #self.session.rollback_transaction()
+        reader.close()
+
+    def test_checkpoint(self):
+        nrows = 1000
+
+        # Create two tables with the same configuration.
+        datasets = []
+        for uri in ('table:checkpoint16a', 'table:checkpoint16b'):
+            ds = SimpleDataSet(
+                self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+                config=self.extraconfig)
+            ds.populate()
+            datasets.append(ds)
+        ds1, ds2 = datasets
+
+        # The '8t' value format takes integer values; the others take strings.
+        fixed_length = self.value_format == '8t'
+        value_a = 97 if fixed_length else "aaaaa" * 100
+        value_b = 98 if fixed_length else "bbbbb" * 100
+
+        # Set oldest and stable to 5.
+        self.conn.set_timestamp('oldest_timestamp={},stable_timestamp={}'.format(
+            self.timestamp_str(5), self.timestamp_str(5)))
+
+        # Write the same data to both tables and checkpoint it.
+        for ds in (ds1, ds2):
+            self.large_updates(ds, nrows, value_a)
+        self.session.checkpoint()
+
+        # Dirty only table 2, then take the second checkpoint.
+        self.large_updates(ds2, nrows, value_b)
+        self.do_checkpoint(self.second_checkpoint)
+
+        # Table 1 was clean for the second checkpoint; it must still be
+        # readable from it and hold the original data.
+        self.check(ds1, self.second_checkpoint, nrows, value_a)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint17.py b/src/third_party/wiredtiger/test/suite/test_checkpoint17.py
new file mode 100644
index 00000000000..cb769aa5540
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint17.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import threading, time
+import wttest
+import wiredtiger
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint17.py
+#
+# Make sure that if the history store is clean when a checkpoint is taken
+# that we can still access it via the checkpoint.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+    # Scenario: history is written and checkpointed, then disjoint keys are
+    # written before another checkpoint. The older versions must still be
+    # readable through that later checkpoint.
+    session_config = 'isolation=snapshot'
+
+    # Key/value formats (plus extra table configuration) to run against.
+    format_values = [
+        ('column-fix', dict(key_format='r', value_format='8t',
+            extraconfig=',allocation_size=512,leaf_page_max=512')),
+        ('column', dict(key_format='r', value_format='S', extraconfig='')),
+        ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+    ]
+    # Run with the second checkpoint both named and unnamed.
+    name_values = [
+        ('named', dict(second_checkpoint='second_checkpoint')),
+        ('unnamed', dict(second_checkpoint=None)),
+    ]
+    scenarios = make_scenarios(format_values, name_values)
+
+    # Set keys lo..hi-1 of the dataset to value, committing at timestamp ts.
+    # A fresh transaction is started after every key number that is a
+    # multiple of 101.
+    def large_updates(self, ds, lo, hi, value, ts):
+        session = self.session
+        writer = session.open_cursor(ds.uri)
+        session.begin_transaction()
+        for keyno in range(lo, hi):
+            writer[ds.key(keyno)] = value
+            if keyno % 101 != 0:
+                continue
+            session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+            session.begin_transaction()
+        session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+        writer.close()
+
+    # Take a checkpoint; a name of None means an unnamed checkpoint.
+    def do_checkpoint(self, ckpt_name):
+        if ckpt_name is not None:
+            self.session.checkpoint('name=' + ckpt_name)
+            return
+        self.session.checkpoint()
+
+    # Scan the table through the given checkpoint (WiredTigerCheckpoint when
+    # ckpt is None), optionally at read timestamp ts. Exactly nrows rows must
+    # hold value. For the '8t' format, rows reading as 0 are tallied
+    # separately and must number exactly zeros.
+    def check(self, ds, ckpt, nrows, value, zeros, ts):
+        cfg = 'checkpoint=' + ('WiredTigerCheckpoint' if ckpt is None else ckpt)
+        if ts is not None:
+            cfg += ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(ts) + ')'
+        reader = self.session.open_cursor(ds.uri, None, cfg)
+        #self.session.begin_transaction()
+        matched = 0
+        zero_rows = 0
+        for _, v in reader:
+            if self.value_format == '8t' and v == 0:
+                zero_rows += 1
+                continue
+            self.assertEqual(v, value)
+            matched += 1
+        self.assertEqual(matched, nrows)
+        if self.value_format == '8t':
+            self.assertEqual(zero_rows, zeros)
+        #self.session.rollback_transaction()
+        reader.close()
+
+    def test_checkpoint(self):
+        uri = 'table:checkpoint17'
+        nrows = 1000
+
+        # Create a table.
+        ds = SimpleDataSet(
+            self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+            config=self.extraconfig)
+        ds.populate()
+
+        # The '8t' value format takes integer values; the others take strings.
+        if self.value_format == '8t':
+            value_a, value_b, value_c, value_d = 97, 98, 99, 100
+        else:
+            value_a, value_b, value_c, value_d = (
+                "aaaaa" * 100, "bbbbb" * 100, "ccccc" * 100, "ddddd" * 100)
+
+        # Set oldest and stable to 5.
+        self.conn.set_timestamp('oldest_timestamp={},stable_timestamp={}'.format(
+            self.timestamp_str(5), self.timestamp_str(5)))
+
+        # Write three generations of the same keys, then make them stable and
+        # checkpoint them.
+        for value, ts in ((value_a, 10), (value_b, 20), (value_c, 30)):
+            self.large_updates(ds, 1, nrows + 1, value, ts)
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(30))
+        self.session.checkpoint()
+
+        # Write some disjoint data that should not generate more history.
+        self.large_updates(ds, nrows + 1, 2 * nrows + 1, value_d, 40)
+
+        # Mark this data stable and checkpoint it.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(40))
+        self.do_checkpoint(self.second_checkpoint)
+
+        # Make sure each generation can still be read at its own timestamp.
+        for value, ts in ((value_a, 10), (value_b, 20), (value_c, 30)):
+            self.check(ds, self.second_checkpoint, nrows, value, nrows, ts)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint18.py b/src/third_party/wiredtiger/test/suite/test_checkpoint18.py
new file mode 100644
index 00000000000..2e5d4236167
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint18.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import threading, time
+import wttest
+import wiredtiger
+from wiredtiger import stat
+from wtthread import checkpoint_thread
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint18.py
+#
+# Make sure that when we open a cursor we secure the proper matching
+# history store checkpoint, and don't bobble or lose it if the database
+# moves on. Non-timestamp version.
+#
+# It doesn't make sense to run this test for named checkpoints, because
+# regenerating a named checkpoint with the cursor open isn't allowed and
+# generating two different checkpoints with different names doesn't make
+# an interesting scenario. The concern is getting the matching version
+# of WiredTigerCheckpoint and hanging onto it.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+    # Scenario: a checkpoint cursor is opened and then the database takes
+    # another checkpoint; the already-open cursor must keep reading the data
+    # it was opened on.
+    conn_config = 'statistics=(all),timing_stress_for_test=[checkpoint_slow]'
+    session_config = 'isolation=snapshot'
+
+    # Key/value formats (plus extra table configuration) to run against.
+    format_values = [
+        ('column-fix', dict(key_format='r', value_format='8t',
+            extraconfig=',allocation_size=512,leaf_page_max=512')),
+        ('column', dict(key_format='r', value_format='S', extraconfig='')),
+        ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+    ]
+    scenarios = make_scenarios(format_values)
+
+    # Set keys 1..nrows of the dataset to value, starting a fresh transaction
+    # after every key number that is a multiple of 101.
+    def large_updates(self, ds, nrows, value):
+        session = self.session
+        writer = session.open_cursor(ds.uri)
+        session.begin_transaction()
+        for keyno in range(1, nrows + 1):
+            writer[ds.key(keyno)] = value
+            if keyno % 101 != 0:
+                continue
+            session.commit_transaction()
+            session.begin_transaction()
+        session.commit_transaction()
+        writer.close()
+
+    # Take a checkpoint; a name of None means an unnamed checkpoint.
+    def do_checkpoint(self, ckpt_name):
+        if ckpt_name is not None:
+            self.session.checkpoint('name=' + ckpt_name)
+            return
+        self.session.checkpoint()
+
+    # Scan an already-open cursor and require exactly nrows rows, every one
+    # holding value. The caller keeps ownership of the cursor.
+    def check(self, cursor, nrows, value):
+        #self.session.begin_transaction()
+        seen = 0
+        for _, v in cursor:
+            self.assertEqual(v, value)
+            seen += 1
+        self.assertEqual(seen, nrows)
+        #self.session.rollback_transaction()
+
+    def test_checkpoint(self):
+        uri = 'table:checkpoint18'
+        nrows = 10000
+
+        # Create a table.
+        ds = SimpleDataSet(
+            self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+            config=self.extraconfig)
+        ds.populate()
+
+        # The '8t' value format takes integer values and runs with five times
+        # as many rows; the other formats take strings.
+        if self.value_format == '8t':
+            nrows *= 5
+            value_a, value_b, value_c = 97, 98, 99
+        else:
+            value_a, value_b, value_c = "aaaaa" * 100, "bbbbb" * 100, "ccccc" * 100
+
+        # Write some baseline data and checkpoint it.
+        self.large_updates(ds, nrows, value_a)
+        self.session.checkpoint()
+
+        # Write more data. Touch odd keys only. Hold the transaction open.
+        session2 = self.conn.open_session()
+        cursor2 = session2.open_cursor(uri)
+        session2.begin_transaction()
+        for keyno in range(1, nrows + 1, 2):
+            cursor2[ds.key(keyno)] = value_b
+
+        # Commit the transaction with a background checkpoint so we get part of it
+        # in the checkpoint.
+        done = threading.Event()
+        ckpt_thread = checkpoint_thread(self.conn, done)
+        try:
+            ckpt_thread.start()
+
+            # Poll the statistics, sleeping a second after each look, until a
+            # checkpoint is reported as running; only then commit.
+            while True:
+                stat_cursor = self.session.open_cursor('statistics:', None, None)
+                ckpt_running = stat_cursor[stat.conn.txn_checkpoint_running][2]
+                stat_cursor.close()
+                time.sleep(1)
+                if ckpt_running:
+                    break
+
+            session2.commit_transaction()
+        finally:
+            # Always stop and reap the background thread, even on failure.
+            done.set()
+            ckpt_thread.join()
+
+        # Now write the other (even) keys.
+        session2.begin_transaction()
+        for keyno in range(2, nrows + 2, 2):
+            cursor2[ds.key(keyno)] = value_c
+        session2.commit_transaction()
+
+        # Open the checkpoint now.
+        ckpt_cursor = self.session.open_cursor(uri, None, 'checkpoint=WiredTigerCheckpoint')
+
+        # Take another checkpoint. This will write out the rest of the partial
+        # transaction and remove its history store footprint, so reading from the
+        # checkpoint the cursor was opened on will fail if the cursor hasn't
+        # secured itself a reference to its matching history store checkpoint.
+        self.session.checkpoint()
+
+        # Make sure we can still read the table through the cursor opened
+        # before this last checkpoint. We shouldn't see either value_b or value_c.
+        self.check(ckpt_cursor, nrows, value_a)
+        ckpt_cursor.close()
+
+        # Note that it would be nice to crosscheck that the earlier checkpoint was in fact
+        # inconsistent. Could do that by copying the database before the last checkpoint
+        # and opening the copy here, I guess. FUTURE?
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint19.py b/src/third_party/wiredtiger/test/suite/test_checkpoint19.py
new file mode 100644
index 00000000000..3a6e92dfe8d
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint19.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import threading, time
+import wttest
+import wiredtiger
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint19.py
+#
+# Make sure that when we open a cursor we secure the proper matching
+# history store checkpoint, and don't bobble or lose it if the database
+# moves on. Timestamped version.
+#
+# It doesn't make sense to run this test for named checkpoints, because
+# regenerating a named checkpoint with the cursor open isn't allowed and
+# generating two different checkpoints with different names doesn't make
+# an interesting scenario. The concern is getting the matching version
+# of WiredTigerCheckpoint and hanging onto it.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+    # Scenario: checkpoint cursors are opened at several read timestamps and
+    # then the database takes another checkpoint with oldest advanced; the
+    # already-open cursors must keep reading the history they were opened on.
+    session_config = 'isolation=snapshot'
+
+    # Key/value formats (plus extra table configuration) to run against.
+    format_values = [
+        ('column-fix', dict(key_format='r', value_format='8t',
+            extraconfig=',allocation_size=512,leaf_page_max=512')),
+        ('column', dict(key_format='r', value_format='S', extraconfig='')),
+        ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+    ]
+    scenarios = make_scenarios(format_values)
+
+    # Set keys 1..nrows of the dataset to value, committing at timestamp ts.
+    # A fresh transaction is started after every key number that is a
+    # multiple of 101.
+    def large_updates(self, ds, nrows, value, ts):
+        cursor = self.session.open_cursor(ds.uri)
+        self.session.begin_transaction()
+        for i in range(1, nrows + 1):
+            cursor[ds.key(i)] = value
+            if i % 101 == 0:
+                self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+                self.session.begin_transaction()
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+        cursor.close()
+
+    # Take a checkpoint; a name of None means an unnamed checkpoint.
+    def do_checkpoint(self, ckpt_name):
+        if ckpt_name is None:
+            self.session.checkpoint()
+        else:
+            self.session.checkpoint('name=' + ckpt_name)
+
+    # Scan an already-open cursor and require exactly nrows rows, alternating
+    # between oddvalue and evenvalue starting with oddvalue. The caller keeps
+    # ownership of the cursor.
+    def check(self, cursor, nrows, oddvalue, evenvalue):
+        #self.session.begin_transaction()
+        count = 0
+        for k, v in cursor:
+            # Alas, count is even when the key number is odd.
+            if count % 2 == 0:
+                self.assertEqual(v, oddvalue)
+            else:
+                self.assertEqual(v, evenvalue)
+            count += 1
+        self.assertEqual(count, nrows)
+        #self.session.rollback_transaction()
+
+    def test_checkpoint(self):
+        # Use a table name matching this test (it previously reused the
+        # checkpoint17 name, a copy-paste leftover).
+        uri = 'table:checkpoint19'
+        nrows = 1000
+
+        # Create a table.
+        ds = SimpleDataSet(
+            self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+            config=self.extraconfig)
+        ds.populate()
+
+        # The '8t' value format takes integer values; the others take strings.
+        if self.value_format == '8t':
+            value_a = 97
+            value_b = 98
+            value_c = 99
+            value_d = 100
+        else:
+            value_a = "aaaaa" * 100
+            value_b = "bbbbb" * 100
+            value_c = "ccccc" * 100
+            value_d = "ddddd" * 100
+
+        # Set oldest and stable to 5.
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(5) +
+            ',stable_timestamp=' + self.timestamp_str(5))
+
+        # Write some baseline data and checkpoint it.
+        self.large_updates(ds, nrows, value_a, 10)
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10))
+        self.session.checkpoint()
+
+        # Write more data. Touch odd keys only. Do two rounds to make sure there's history.
+        cursor = self.session.open_cursor(uri)
+        self.session.begin_transaction()
+        for i in range(1, nrows + 1, 2):
+            cursor[ds.key(i)] = value_b
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(20))
+
+        self.session.begin_transaction()
+        for i in range(1, nrows + 1, 2):
+            cursor[ds.key(i)] = value_c
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(30))
+
+        # Now checkpoint.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(30))
+        self.session.checkpoint()
+
+        # Now write the other keys.
+        self.session.begin_transaction()
+        for i in range(2, nrows + 2, 2):
+            cursor[ds.key(i)] = value_d
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(40))
+
+        # All writes are done; release the write cursor instead of leaving it
+        # open for the rest of the test.
+        cursor.close()
+
+        # Open the checkpoint now.
+        # Since we can't change the cursor read timestamp on the fly, get a separate one
+        # for each read time we're interested in.
+        def cfgstr(ts):
+            return 'checkpoint=WiredTigerCheckpoint,debug=(checkpoint_read_timestamp={})'.format(
+                self.timestamp_str(ts))
+        ckpt10 = self.session.open_cursor(uri, None, cfgstr(10))
+        ckpt20 = self.session.open_cursor(uri, None, cfgstr(20))
+        ckpt30 = self.session.open_cursor(uri, None, cfgstr(30))
+
+        # Take another checkpoint. Advance oldest so this checkpoint will throw away
+        # some of the history. (Because we've interleaved the keys, we're guaranteed
+        # that every history store page involved will be rewritten.) Reading from the
+        # first checkpoint will fail if the cursor hasn't secured itself a reference
+        # to its matching history store checkpoint.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(40))
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(35))
+        self.session.checkpoint()
+
+        # Make sure we can still read the history. Even keys were only
+        # rewritten at time 40, after these cursors' checkpoint, so they
+        # read as value_a at every timestamp checked here.
+        self.check(ckpt10, nrows, value_a, value_a)
+        self.check(ckpt20, nrows, value_b, value_a)
+        self.check(ckpt30, nrows, value_c, value_a)
+
+        ckpt10.close()
+        ckpt20.close()
+        ckpt30.close()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint20.py b/src/third_party/wiredtiger/test/suite/test_checkpoint20.py
new file mode 100644
index 00000000000..73135307bd1
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint20.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import threading, time
+import wttest
+import wiredtiger
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint20.py
+#
+# Test reading a checkpoint that contains prepared data.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+    # Scenario: a checkpoint is taken while a prepared transaction is
+    # outstanding, and the checkpoint is then read at several timestamps.
+
+    # Key/value formats (plus extra table configuration) to run against.
+    format_values = [
+        ('column-fix', dict(key_format='r', value_format='8t',
+            extraconfig=',allocation_size=512,leaf_page_max=512')),
+        ('column', dict(key_format='r', value_format='S', extraconfig='')),
+        ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+    ]
+    # Stable timestamp at checkpoint time: before (15) or after (25) the
+    # prepare timestamp of 20 used in the test.
+    stable_ts_values = [
+        ('15', dict(stable_ts=15)),
+        ('25', dict(stable_ts=25)),
+    ]
+    # Run with the checkpoint both named and unnamed.
+    name_values = [
+        ('named', dict(first_checkpoint='first_checkpoint')),
+        ('unnamed', dict(first_checkpoint=None)),
+    ]
+    scenarios = make_scenarios(format_values, stable_ts_values, name_values)
+
+    # Set keys 1..nrows to value through a cursor on uri, committing at
+    # timestamp ts. A fresh transaction is started after every key number
+    # that is a multiple of 101.
+    def large_updates(self, uri, ds, nrows, value, ts):
+        cursor = self.session.open_cursor(uri)
+        self.session.begin_transaction()
+        for i in range(1, nrows + 1):
+            cursor[ds.key(i)] = value
+            if i % 101 == 0:
+                self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+                self.session.begin_transaction()
+        self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+        cursor.close()
+
+    # Read every 10th key in lo..hi-1 at timestamp ts through a
+    # debug=(release_evict) cursor, checking each holds value, then reset the
+    # cursor after each read.
+    def evict(self, ds, lo, hi, value, ts):
+        evict_cursor = self.session.open_cursor(ds.uri, None, "debug=(release_evict)")
+        self.session.begin_transaction('read_timestamp=' + self.timestamp_str(ts))
+        # Evict every 10th key. FUTURE: when that's possible, evict each page exactly once.
+        for k in range(lo, hi, 10):
+            v = evict_cursor[ds.key(k)]
+            self.assertEqual(v, value)
+            self.assertEqual(evict_cursor.reset(), 0)
+        self.session.rollback_transaction()
+        # Release the cursor rather than leaving it open for the rest of the test.
+        evict_cursor.close()
+
+    # Take a checkpoint; a name of None means an unnamed checkpoint.
+    def do_checkpoint(self, ckpt_name):
+        if ckpt_name is None:
+            self.session.checkpoint()
+        else:
+            self.session.checkpoint('name=' + ckpt_name)
+
+    # Scan the table through the given checkpoint (WiredTigerCheckpoint when
+    # ckpt is None), optionally at read timestamp ts, and require exactly
+    # nrows rows, every one holding value.
+    def check(self, ds, ckpt, nrows, value, ts):
+        if ckpt is None:
+            ckpt = 'WiredTigerCheckpoint'
+        cfg = 'checkpoint=' + ckpt
+        if ts is not None:
+            cfg += ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(ts) + ')'
+        cursor = self.session.open_cursor(ds.uri, None, cfg)
+        #self.session.begin_transaction()
+        count = 0
+        for k, v in cursor:
+            self.assertEqual(v, value)
+            count += 1
+        #self.session.rollback_transaction()
+        self.assertEqual(count, nrows)
+        cursor.close()
+
+    # Search for one key through the given checkpoint (optionally at read
+    # timestamp ts) and require the search to raise a WiredTigerError whose
+    # message matches 'conflict with a prepared'.
+    def checkfail(self, ds, ckpt, key, ts):
+        if ckpt is None:
+            ckpt = 'WiredTigerCheckpoint'
+        cfg = 'checkpoint=' + ckpt
+        if ts is not None:
+            cfg += ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(ts) + ')'
+        cursor = self.session.open_cursor(ds.uri, None, cfg)
+        #self.session.begin_transaction()
+        cursor.set_key(ds.key(key))
+        self.assertRaisesException(wiredtiger.WiredTigerError,
+            lambda: cursor.search(),
+            '/conflict with a prepared/')
+        #self.session.rollback_transaction()
+        cursor.close()
+
+    def test_checkpoint(self):
+        uri = 'table:checkpoint20'
+        nrows = 10000
+
+        # Create a table.
+        ds = SimpleDataSet(
+            self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+            config=self.extraconfig)
+        ds.populate()
+
+        # The '8t' value format takes integer values; the others take strings.
+        if self.value_format == '8t':
+            value_a = 97
+            value_b = 98
+            value_c = 99
+        else:
+            value_a = "aaaaa" * 100
+            value_b = "bbbbb" * 100
+            value_c = "ccccc" * 100
+
+        # Pin oldest and stable timestamps to 5.
+        self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(5) +
+            ',stable_timestamp=' + self.timestamp_str(5))
+
+        # Write some data at time 10.
+        self.large_updates(uri, ds, nrows, value_a, 10)
+
+        # Prepare some more data at time 20.
+        session2 = self.conn.open_session()
+        cursor2 = session2.open_cursor(uri)
+        session2.begin_transaction()
+        for i in range(nrows // 2 + 1, nrows):
+            cursor2[ds.key(i)] = value_b
+        session2.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20))
+
+        # Evict the lot. Otherwise the checkpoint won't write the prepared data.
+        # Read at 10 to do the eviction to avoid tripping on the prepared transaction.
+        self.evict(ds, 1, nrows + 1, value_a, 10)
+
+        # Checkpoint.
+        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(self.stable_ts))
+        self.do_checkpoint(self.first_checkpoint)
+
+        # Commit the prepared transaction so it isn't in the way.
+        session2.timestamp_transaction('commit_timestamp=' + self.timestamp_str(20))
+        session2.commit_transaction('durable_timestamp=' + self.timestamp_str(30))
+
+        # The second session has done its job; release its cursor and the
+        # session instead of leaving them open for the rest of the test.
+        cursor2.close()
+        session2.close()
+
+        # Read the checkpoint.
+        # We decided that checkpoint cursors should always use ignore_prepare, so we
+        # should always see value_a.
+        self.check(ds, self.first_checkpoint, nrows, value_a, 10)
+        self.check(ds, self.first_checkpoint, nrows, value_a, 20)
+        self.check(ds, self.first_checkpoint, nrows, value_a, None)
+
+        # Without ignore_prepare, we'd want to check that one of the prepared keys fails.
+        #self.checkfail(ds, self.first_checkpoint, nrows // 2 + 1, 20)
+        #if self.stable_ts >= 20:
+        #    self.checkfail(ds, self.first_checkpoint, nrows // 2 + 1, None)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint21.py b/src/third_party/wiredtiger/test/suite/test_checkpoint21.py
new file mode 100644
index 00000000000..b43060da2b3
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint21.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import threading, time
+import wttest
+import wiredtiger
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_checkpoint21.py
+#
+# Test reading a checkpoint that contains data from a committed but not
+# durable transaction.
+#
+# A transaction that prepared after stable is still allowed to commit before
+# stable if stable advanced in the meantime, but its durable timestamp must be
+# after stable as of commit; the commit time can therefore be before the durable
+# time. If a checkpoint occurs in this window, there is a bit of a dilemma: from
+# one point of view, the data from these transactions should *not* be visible
+# because that's what happens if we crash and recover to the same checkpoint.
+# From another, it *should* because otherwise (and if we don't crash), reading
+# from the checkpoint and the live database at the same read timestamp will
+# produce different results.
+#
+# It is not clear what the proper solution is; for the time being the expedient
+# approach is to allow the data to appear. As far as I know the entirety of such
+# transactions should always be written out by the checkpoint (because they're
+# committed) -- we will therefore never see a torn transaction, which is the
+# most important consideration.
+#
+# This test sets up such a transaction, evicts half of it, then checkpoints the
+# rest, and checks that it is all visible by reading the checkpoint.
+
+class test_checkpoint(wttest.WiredTigerTestCase):
+ # Reads a checkpoint taken while a transaction is committed (at 25) but not
+ # yet durable (durable at 35, stable at 30) and checks what a checkpoint
+ # cursor returns at several read timestamps.
+ #
+ # The scenario dict entries below are read back in this class as
+ # self.key_format, self.value_format, self.extraconfig and
+ # self.first_checkpoint.
+
+ format_values = [
+ ('column-fix', dict(key_format='r', value_format='8t',
+ extraconfig=',allocation_size=512,leaf_page_max=512')),
+ ('column', dict(key_format='r', value_format='S', extraconfig='')),
+ ('string_row', dict(key_format='S', value_format='S', extraconfig='')),
+ ]
+ name_values = [
+ ('named', dict(first_checkpoint='first_checkpoint')),
+ ('unnamed', dict(first_checkpoint=None)),
+ ]
+ scenarios = make_scenarios(format_values, name_values)
+
+
+ # Write `value` to ds.key(1) .. ds.key(nrows) through a cursor on `uri`,
+ # committing with commit_timestamp `ts`.
+ # NOTE(review): indentation is collapsed in this rendering; presumably the
+ # commit/begin pair after `if i % 101 == 0:` is the body of that `if`, so the
+ # writes are committed in batches -- confirm against the real file.
+ def large_updates(self, uri, ds, nrows, value, ts):
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value
+ if i % 101 == 0:
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+ self.session.begin_transaction()
+ self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
+ cursor.close()
+
+ # Read every 10th key in range(lo, hi) at read_timestamp `ts` through a cursor
+ # opened with debug=(release_evict), asserting that each value equals `value`
+ # and that evict_cursor.reset() returns 0, then roll the reading transaction
+ # back.
+ # NOTE(review): evict_cursor is not explicitly closed in this method.
+ def evict(self, ds, lo, hi, value, ts):
+ evict_cursor = self.session.open_cursor(ds.uri, None, "debug=(release_evict)")
+ self.session.begin_transaction('read_timestamp=' + self.timestamp_str(ts))
+ # Evict every 10th key. FUTURE: when that's possible, evict each page exactly once.
+ for k in range(lo, hi, 10):
+ v = evict_cursor[ds.key(k)]
+ self.assertEqual(v, value)
+ self.assertEqual(evict_cursor.reset(), 0)
+ self.session.rollback_transaction()
+
+ # Take an unnamed checkpoint when ckpt_name is None, otherwise a checkpoint
+ # named ckpt_name.
+ def do_checkpoint(self, ckpt_name):
+ if ckpt_name is None:
+ self.session.checkpoint()
+ else:
+ self.session.checkpoint('name=' + ckpt_name)
+
+ # Open a checkpoint cursor on ds.uri and assert that iterating it yields
+ # exactly `nrows` entries, each with value `value`.
+ # ckpt: checkpoint name; None selects 'WiredTigerCheckpoint'.
+ # ts: when not None, passed as debug=(checkpoint_read_timestamp=...).
+ def check(self, ds, ckpt, nrows, value, ts):
+ if ckpt is None:
+ ckpt = 'WiredTigerCheckpoint'
+ cfg = 'checkpoint=' + ckpt
+ if ts is not None:
+ cfg += ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(ts) + ')'
+ cursor = self.session.open_cursor(ds.uri, None, cfg)
+ #self.session.begin_transaction()
+ count = 0
+ for k, v in cursor:
+ self.assertEqual(v, value)
+ count += 1
+ #self.session.rollback_transaction()
+ self.assertEqual(count, nrows)
+ cursor.close()
+
+ # Open a checkpoint cursor with the same configuration rules as check() and
+ # assert that searching for ds.key(key) raises wiredtiger.WiredTigerError
+ # matching '/conflict with a prepared/'.
+ # NOTE(review): no caller of checkfail is visible in this class.
+ def checkfail(self, ds, ckpt, key, ts):
+ if ckpt is None:
+ ckpt = 'WiredTigerCheckpoint'
+ cfg = 'checkpoint=' + ckpt
+ if ts is not None:
+ cfg += ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(ts) + ')'
+ cursor = self.session.open_cursor(ds.uri, None, cfg)
+ #self.session.begin_transaction()
+ cursor.set_key(ds.key(key))
+ self.assertRaisesException(wiredtiger.WiredTigerError,
+ lambda: cursor.search(),
+ '/conflict with a prepared/')
+ #self.session.rollback_transaction()
+ cursor.close()
+
+ # Scenario: write value_a at 10, prepare value_b at 20, advance stable to 30,
+ # commit at 25 with durable 35, evict part of the table, checkpoint, then
+ # verify the checkpoint contents at read timestamps 15, 25, default and 0.
+ def test_checkpoint(self):
+ uri = 'table:checkpoint21'
+ nrows = 10000
+
+ # Create a table.
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config=self.extraconfig)
+ ds.populate()
+
+ # NOTE(review): value_c is assigned in both branches but is not used later
+ # in this method.
+ if self.value_format == '8t':
+ value_a = 97
+ value_b = 98
+ value_c = 99
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+ value_c = "ccccc" * 100
+
+ # Pin oldest and stable timestamps to 5.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(5) +
+ ',stable_timestamp=' + self.timestamp_str(5))
+
+ # Write some data at time 10.
+ self.large_updates(uri, ds, nrows, value_a, 10)
+
+ # Prepare some more data at time 20.
+ # NOTE(review): this cursor is not closed anywhere in the method as shown.
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value_b
+ self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20))
+
+ # Move stable up to 30.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(30))
+
+ # Commit the transaction at 25 but make it durable at 35.
+ self.session.timestamp_transaction('commit_timestamp=' + self.timestamp_str(25))
+ self.session.commit_transaction('durable_timestamp=' + self.timestamp_str(35))
+
+ # Evict half the pages to make sure they get written with the updated values.
+ self.evict(ds, 1, nrows // 2 + 1, value_b, 25)
+
+ # Checkpoint the rest while stable is still 30.
+ self.do_checkpoint(self.first_checkpoint)
+
+ # Read the checkpoint.
+ self.check(ds, self.first_checkpoint, nrows, value_a, 15)
+ self.check(ds, self.first_checkpoint, nrows, value_b, 25)
+ self.check(ds, self.first_checkpoint, nrows, value_b, None) # default read ts
+ self.check(ds, self.first_checkpoint, nrows, value_b, 0) # no read ts
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_gc05.py b/src/third_party/wiredtiger/test/suite/test_gc05.py
index 325d035d96f..fb48abac55e 100755
--- a/src/third_party/wiredtiger/test/suite/test_gc05.py
+++ b/src/third_party/wiredtiger/test/suite/test_gc05.py
@@ -28,22 +28,41 @@
from test_gc01 import test_gc_base
from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
# test_gc05.py
# Verify a locked checkpoint is not removed during garbage collection.
+
class test_gc05(test_gc_base):
conn_config = 'cache_size=50MB,statistics=(all)'
+ format_values = [
+ ('column', dict(key_format='r', value_format='S', extraconfig='')),
+ ('column_fix', dict(key_format='r', value_format='8t',
+ extraconfig=',allocation_size=512,leaf_page_max=512')),
+ ('integer_row', dict(key_format='i', value_format='S', extraconfig='')),
+ ]
+ named_values = [
+ ('named', dict(named=True)),
+ ('anonymous', dict(named=False)),
+ ]
+ scenarios = make_scenarios(format_values, named_values)
+
def test_gc(self):
uri = "table:gc05"
create_params = 'value_format=S,key_format=i'
self.session.create(uri, create_params)
nrows = 10000
+ value_u = "uuuuu" * 100
+ value_v = "vvvvv" * 100
+ value_w = "wwwww" * 100
value_x = "xxxxx" * 100
value_y = "yyyyy" * 100
value_z = "zzzzz" * 100
- ds = SimpleDataSet(self, uri, 0, key_format="i", value_format="S")
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config=self.extraconfig)
ds.populate()
# Set the oldest and stable timestamps to 10.
@@ -55,23 +74,32 @@ class test_gc05(test_gc_base):
self.large_updates(uri, value_y, ds, nrows, 30)
self.large_updates(uri, value_z, ds, nrows, 40)
+ # Move stable to 35 so there's something to checkpoint.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(35))
+
# Perform a checkpoint.
- self.session.checkpoint("name=checkpoint_one")
+ if self.named:
+ self.session.checkpoint("name=checkpoint_one")
+ else:
+ self.session.checkpoint()
# Check statistics.
self.check_gc_stats()
# Open a cursor to the checkpoint just performed.
- ckpt_cursor = self.session.open_cursor(uri, None, "checkpoint=checkpoint_one")
+ if self.named:
+ ckpt_cursor = self.session.open_cursor(uri, None, "checkpoint=checkpoint_one")
+ else:
+ ckpt_cursor = self.session.open_cursor(uri, None, "checkpoint=WiredTigerCheckpoint")
# Move the oldest and stable timestamps to 40.
self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(40) +
',stable_timestamp=' + self.timestamp_str(40))
# Insert values with varying timestamps.
- self.large_updates(uri, value_z, ds, nrows, 50)
- self.large_updates(uri, value_y, ds, nrows, 60)
- self.large_updates(uri, value_x, ds, nrows, 70)
+ self.large_updates(uri, value_u, ds, nrows, 50)
+ self.large_updates(uri, value_v, ds, nrows, 60)
+ self.large_updates(uri, value_w, ds, nrows, 70)
# Move the oldest and stable timestamps to 70.
self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(70) +
@@ -81,14 +109,30 @@ class test_gc05(test_gc_base):
self.session.checkpoint()
self.check_gc_stats()
- # Verify checkpoint_one still exists and contains the expected values.
+ # Verify the open checkpoint still exists and contains the expected values.
for i in range(0, nrows):
ckpt_cursor.set_key(i)
- ckpt_cursor.search()
- self.assertEqual(value_z, ckpt_cursor.get_value())
+ self.assertEqual(ckpt_cursor.search(), 0)
+ self.assertEqual(ckpt_cursor.get_value(), value_y)
# Close checkpoint cursor.
ckpt_cursor.close()
+ if self.named:
+ # If we named the checkpoint, it should still exist and still have the same values.
+ ckpt_cursor = self.session.open_cursor(uri, None, "checkpoint=checkpoint_one")
+ for i in range(0, nrows):
+ ckpt_cursor.set_key(i)
+ self.assertEqual(ckpt_cursor.search(), 0)
+ self.assertEqual(ckpt_cursor.get_value(), value_y)
+ else:
+ # If we didn't, reopening should get the most recent checkpoint.
+ ckpt_cursor = self.session.open_cursor(uri, None, "checkpoint=WiredTigerCheckpoint")
+ for i in range(0, nrows):
+ ckpt_cursor.set_key(i)
+ self.assertEqual(ckpt_cursor.search(), 0)
+ self.assertEqual(ckpt_cursor.get_value(), value_w)
+
+
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_hs06.py b/src/third_party/wiredtiger/test/suite/test_hs06.py
index 983a9480400..5eac8554261 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs06.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs06.py
@@ -96,18 +96,36 @@ class test_hs06(wttest.WiredTigerTestCase):
self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(2))
self.session.checkpoint()
- # Check the checkpoint wrote the expected values.
- #
- # FIXME-WT-5927: Checkpoint cursors are known to have issues in durable history so we've
- # removing the use of checkpoint handles in this test. As part of WT-5927, we should either
- # re-enable the testing of checkpoint cursors or remove this comment.
- #
- # cursor2 = self.session.open_cursor(uri, None, 'checkpoint=WiredTigerCheckpoint')
- cursor2 = self.session.open_cursor(uri)
- self.session.begin_transaction('read_timestamp=' + self.timestamp_str(2))
+ # Check the checkpoint wrote the expected values. We should get the stable data by
+ # default.
+ cursor2 = self.session.open_cursor(uri, None, 'checkpoint=WiredTigerCheckpoint')
+ for key, value in cursor2:
+ self.assertEqual(value, value1)
+ cursor2.close()
+
+ # Also check with an explicit read timestamp.
+ ckpt_ts = ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(2) + ')'
+ cursor2 = self.session.open_cursor(uri, None, 'checkpoint=WiredTigerCheckpoint' + ckpt_ts)
for key, value in cursor2:
self.assertEqual(value, value1)
- self.session.commit_transaction()
+ cursor2.close()
+
+ # At least for the moment, we can also read the unstable values out of the checkpoint
+ # if we ask for them.
+ ckpt_ts = ',debug=(checkpoint_read_timestamp=' + self.timestamp_str(3) + ')'
+ cursor2 = self.session.open_cursor(uri, None, 'checkpoint=WiredTigerCheckpoint' + ckpt_ts)
+ for key, value in cursor2:
+ self.assertEqual(value, value2)
+ cursor2.close()
+
+ # Check what happens if we explicitly read the checkpoint with no timestamp.
+ # Should get the most recent data.
+ # (A timestamp string of "0" is explicitly allowed here to override the checkpoint's
+ # own timestamp.)
+ ckpt_ts = ',debug=(checkpoint_read_timestamp=0)'
+ cursor2 = self.session.open_cursor(uri, None, 'checkpoint=WiredTigerCheckpoint' + ckpt_ts)
+ for key, value in cursor2:
+ self.assertEqual(value, value2)
cursor2.close()
start_usage = self.get_non_page_image_memory_usage()
diff --git a/src/third_party/wiredtiger/test/suite/test_hs09.py b/src/third_party/wiredtiger/test/suite/test_hs09.py
index 6175fbbfed9..82681e286cc 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs09.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs09.py
@@ -61,17 +61,12 @@ class test_hs09(wttest.WiredTigerTestCase):
# Check the data file value.
cursor = session.open_cursor(self.uri, None, 'checkpoint=WiredTigerCheckpoint')
- # If we are expecting prepapred updates in the datastore, start an explicit transaction with
- # ignore prepare flag to avoid getting a WT_PREPARE_CONFLICT error.
- if expect_prepared_in_datastore:
- session.begin_transaction("ignore_prepare=true")
+ # We no longer need to do anything special if we are expecting prepared updates
+ # in the datastore, because checkpoint cursors always set ignore_prepare.
for _, value in cursor:
self.assertEqual(value, expected_data_value)
- if expect_prepared_in_datastore:
- session.rollback_transaction()
-
cursor.close()
# Check the history store file value.
cursor = session.open_cursor("file:WiredTigerHS.wt", None, 'checkpoint=WiredTigerCheckpoint')
diff --git a/src/third_party/wiredtiger/test/suite/wtthread.py b/src/third_party/wiredtiger/test/suite/wtthread.py
index e4620e188e0..156336e6298 100755
--- a/src/third_party/wiredtiger/test/suite/wtthread.py
+++ b/src/third_party/wiredtiger/test/suite/wtthread.py
@@ -43,6 +43,21 @@ class checkpoint_thread(threading.Thread):
sess.checkpoint()
sess.close()
+class named_checkpoint_thread(threading.Thread):
+ def __init__(self, conn, done, ckpt_name):
+ self.conn = conn
+ self.done = done
+ self.ckpt_name = ckpt_name
+ threading.Thread.__init__(self)
+
+ def run(self):
+ sess = self.conn.open_session()
+ while not self.done.isSet():
+ # Sleep for 10 milliseconds.
+ time.sleep(0.001)
+ sess.checkpoint('name=' + self.ckpt_name)
+ sess.close()
+
class flush_tier_thread(threading.Thread):
def __init__(self, conn, done):
self.conn = conn