summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlex Gorrod <alexander.gorrod@mongodb.com>2017-08-08 11:53:07 +1000
committerMichael Cahill <michael.cahill@mongodb.com>2017-08-08 11:53:07 +1000
commitf584aa9493ab810fc918252dfd580add6502cd68 (patch)
tree03cdb86df34c7293d48a41b1b6a9447ea0b81825
parent89693f98876e11c8fae9b44303380aa1fd6e9be8 (diff)
downloadmongo-f584aa9493ab810fc918252dfd580add6502cd68.tar.gz
WT-3388 Add new API that allows forgetting history newer than a timestamp (#3506)
* Add in lookaside rollback, by walking the lookaside table. We do need to review lookaside records, since reconciliation will choose to write updates newer than the pinned timestamp into the lookaside file. * It's OK to read at a newer timestamp than the stable timestamp, remove a check that stopped that.
-rw-r--r--dist/api_data.py2
-rw-r--r--dist/filelist1
-rw-r--r--dist/flags.py1
-rwxr-xr-xdist/s_longlines1
-rwxr-xr-xdist/s_void2
-rw-r--r--examples/c/ex_all.c4
-rw-r--r--lang/java/java_doc.i1
-rw-r--r--src/btree/bt_compact.c9
-rw-r--r--src/btree/bt_handle.c21
-rw-r--r--src/btree/bt_walk.c89
-rw-r--r--src/config/config_def.c4
-rw-r--r--src/conn/conn_api.c20
-rw-r--r--src/include/config.h79
-rw-r--r--src/include/connection.h3
-rw-r--r--src/include/extern.h5
-rw-r--r--src/include/flags.h21
-rw-r--r--src/include/txn.i26
-rw-r--r--src/include/wiredtiger.in26
-rw-r--r--src/txn/txn.c6
-rw-r--r--src/txn/txn_rollback_to_stable.c398
-rw-r--r--test/suite/test_timestamp04.py142
21 files changed, 769 insertions, 92 deletions
diff --git a/dist/api_data.py b/dist/api_data.py
index aa1e47879cb..3a0822b33b2 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -1264,6 +1264,8 @@ methods = {
transaction_timestamps'''),
]),
+'WT_CONNECTION.rollback_to_stable' : Method([]),
+
'WT_SESSION.reconfigure' : Method(session_config),
# There are 4 variants of the wiredtiger_open configurations.
diff --git a/dist/filelist b/dist/filelist
index f96bb8b6f2f..33ede795c69 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -201,4 +201,5 @@ src/txn/txn_ext.c
src/txn/txn_log.c
src/txn/txn_nsnap.c
src/txn/txn_recover.c
+src/txn/txn_rollback_to_stable.c
src/txn/txn_timestamp.c
diff --git a/dist/flags.py b/dist/flags.py
index 48952768c18..05ffb8851a2 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -23,7 +23,6 @@ flags = {
],
'page_read' : [
'READ_CACHE',
- 'READ_COMPACT',
'READ_NOTFOUND_OK',
'READ_NO_EMPTY',
'READ_NO_EVICT',
diff --git a/dist/s_longlines b/dist/s_longlines
index 736ea36e6d4..69fe22ac3ee 100755
--- a/dist/s_longlines
+++ b/dist/s_longlines
@@ -10,6 +10,7 @@ l=`(cd .. &&
find src -name '*.in') |
sed -e '/checksum\/power8/d' \
-e '/checksum\/zseries/d' \
+ -e '/config\/config_def\.c/d' \
-e '/dist\/stat_data\.py/d' \
-e '/include\/extern\.h/d' \
-e '/include\/extern_posix\.h/d' \
diff --git a/dist/s_void b/dist/s_void
index d7f2c81a211..4cb34d0ec97 100755
--- a/dist/s_void
+++ b/dist/s_void
@@ -48,6 +48,8 @@ func_ok()
-e '/int __page_write_gen_wrapped_check$/d' \
-e '/int __posix_terminate$/d' \
-e '/int __rec_destroy_session$/d' \
+ -e '/int __tree_walk_skip_count_callback$/d' \
+ -e '/int __txn_rollback_to_stable_custom_skip$/d' \
-e '/int __win_terminate$/d' \
-e '/int __wt_block_compact_end$/d' \
-e '/int __wt_block_compact_start$/d' \
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index 7dce4744db3..672c54fa1f6 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -866,6 +866,10 @@ transaction_ops(WT_CONNECTION *conn, WT_SESSION *session)
/*! [set oldest timestamp] */
error_check(conn->set_timestamp(conn, "oldest_timestamp=2a"));
/*! [set oldest timestamp] */
+
+ /*! [rollback to stable] */
+ error_check(conn->rollback_to_stable(conn, ""));
+ /*! [rollback to stable] */
}
/*! [Implement WT_COLLATOR] */
diff --git a/lang/java/java_doc.i b/lang/java/java_doc.i
index a1efb1b63a8..c1062f313f3 100644
--- a/lang/java/java_doc.i
+++ b/lang/java/java_doc.i
@@ -63,6 +63,7 @@ COPYDOC(__wt_connection, WT_CONNECTION, is_new)
COPYDOC(__wt_connection, WT_CONNECTION, open_session)
COPYDOC(__wt_connection, WT_CONNECTION, query_timestamp)
COPYDOC(__wt_connection, WT_CONNECTION, set_timestamp)
+COPYDOC(__wt_connection, WT_CONNECTION, rollback_to_stable)
COPYDOC(__wt_connection, WT_CONNECTION, load_extension)
COPYDOC(__wt_connection, WT_CONNECTION, add_data_source)
COPYDOC(__wt_connection, WT_CONNECTION, add_collator)
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index c6a412aa84e..c8d9c7a5bc5 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -141,8 +141,9 @@ __wt_compact(WT_SESSION_IMPL *session)
* read, set its generation to a low value so it is evicted
* quickly.
*/
- WT_ERR(__wt_tree_walk(session, &ref,
- WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
+ WT_ERR(__wt_tree_walk_custom_skip(session, &ref,
+ __wt_compact_page_skip, NULL,
+ WT_READ_NO_GEN | WT_READ_WONT_NEED));
if (ref == NULL)
break;
@@ -173,7 +174,8 @@ err: if (ref != NULL)
* Return if compaction requires we read this page.
*/
int
-__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
+__wt_compact_page_skip(
+ WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp)
{
WT_BM *bm;
WT_DECL_RET;
@@ -181,6 +183,7 @@ __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
u_int type;
const uint8_t *addr;
+ WT_UNUSED(context);
/*
* Skip deleted pages, rewriting them doesn't seem useful; in a better
* world we'd write the parent to delete the page.
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 5fad76849dc..5d356d7d47f 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -875,3 +875,24 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
return (0);
}
+
+/*
+ * __wt_btree_immediately_durable --
+ * Check whether this btree is configured for immediate durability.
+ *
+ * Operates on the session's current btree. Returns true only when
+ * WT_CONN_LOG_ENABLED is set in the connection's log_flags and
+ * WT_BTREE_NO_LOGGING is not set on the btree.
+ */
+bool
+__wt_btree_immediately_durable(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /*
+ * This is used to determine whether timestamped updates should
+ * be rolled back for this btree. It's likely that the particular
+ * test required here will change when rollback to stable is
+ * supported with in-memory configurations.
+ */
+ return (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) &&
+ !F_ISSET(btree, WT_BTREE_NO_LOGGING));
+}
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index d783f8f6e71..3fdafcebfb9 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -285,7 +285,9 @@ __ref_initial_descent_prev(
*/
static inline int
__tree_walk_internal(WT_SESSION_IMPL *session,
- WT_REF **refp, uint64_t *walkcntp, uint64_t *skipleafcntp, uint32_t flags)
+ WT_REF **refp, uint64_t *walkcntp,
+ int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *),
+ void *func_cookie, uint32_t flags)
{
WT_BTREE *btree;
WT_DECL_RET;
@@ -486,14 +488,9 @@ restart: /*
if (skip)
break;
empty_internal = false;
- } else if (LF_ISSET(WT_READ_COMPACT)) {
- /*
- * Compaction has relatively complex tests to
- * decide if a page can be skipped, call out
- * to a helper function.
- */
- WT_ERR(__wt_compact_page_skip(
- session, ref, &skip));
+ } else if (skip_func != NULL) {
+ WT_ERR(skip_func(session,
+ ref, func_cookie, &skip));
if (skip)
break;
} else {
@@ -505,23 +502,6 @@ restart: /*
break;
}
- /*
- * Optionally skip leaf pages: when the skip-leaf-count
- * variable is non-zero, skip some count of leaf pages,
- * then take the next leaf page we can.
- *
- * The reason to do some of this work here (rather than
- * in our caller), is because we can look at the cell
- * and know it's a leaf page without reading it into
- * memory. If this page is disk-based, crack the cell
- * to figure out it's a leaf page without reading it.
- */
- if (skipleafcntp != NULL &&
- *skipleafcntp > 0 && __ref_is_leaf(ref)) {
- --*skipleafcntp;
- break;
- }
-
ret = __wt_page_swap(session, couple, ref,
WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);
@@ -654,7 +634,8 @@ err: WT_LEAVE_PAGE_INDEX(session);
int
__wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
{
- return (__tree_walk_internal(session, refp, NULL, NULL, flags));
+ return (__tree_walk_internal(
+ session, refp, NULL, NULL, NULL, flags));
}
/*
@@ -666,7 +647,56 @@ int
__wt_tree_walk_count(WT_SESSION_IMPL *session,
WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
{
- return (__tree_walk_internal(session, refp, walkcntp, NULL, flags));
+ return (__tree_walk_internal(
+ session, refp, walkcntp, NULL, NULL, flags));
+}
+
+/*
+ * __wt_tree_walk_custom_skip --
+ * Walk the tree calling a custom function to decide whether to skip refs.
+ *
+ * The skip function and its opaque cookie are handed unchanged to
+ * __tree_walk_internal together with the caller's flags; no walk counter
+ * is supplied. The skip function receives the session, the candidate
+ * ref, the cookie and a bool it sets to true to have the ref skipped.
+ */
+int
+__wt_tree_walk_custom_skip(
+ WT_SESSION_IMPL *session, WT_REF **refp,
+ int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *),
+ void *func_cookie, uint32_t flags)
+{
+ return (__tree_walk_internal(session, refp,
+ NULL, skip_func, func_cookie, flags));
+}
+
+/*
+ * __tree_walk_skip_count_callback --
+ * Optionally skip leaf pages.
+ * When the skip-leaf-count variable is non-zero, skip some count of leaf
+ * pages, then take the next leaf page we can.
+ *
+ * The reason to do some of this work here is that we can look at the cell
+ * and know it's a leaf page without reading it into memory. If this page is
+ * disk-based, crack the cell to figure out it's a leaf page without reading
+ * it.
+ *
+ * The context argument must be a non-NULL pointer to the uint64_t count of
+ * leaf pages still to skip; the count is decremented each time a leaf is
+ * skipped on its account. Always returns 0, reporting the decision through
+ * skipp.
+ */
+static int
+__tree_walk_skip_count_callback(
+ WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp)
+{
+ uint64_t *skipleafcntp;
+
+ skipleafcntp = (uint64_t *)context;
+ WT_ASSERT(session, skipleafcntp != NULL);
+
+ /*
+ * Skip deleted pages visible to us. These are skipped without
+ * consuming any of the skip count.
+ */
+ if (ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, ref, false))
+ *skipp = true;
+ else if (*skipleafcntp > 0 && __ref_is_leaf(ref)) {
+ --*skipleafcntp;
+ *skipp = true;
+ } else
+ *skipp = false;
+ return (0);
}
/*
@@ -688,7 +718,8 @@ __wt_tree_walk_skip(
* decrementing the count.
*/
do {
- WT_RET(__tree_walk_internal(session, refp, NULL, skipleafcntp,
+ WT_RET(__tree_walk_internal(session, refp, NULL,
+ __tree_walk_skip_count_callback, skipleafcntp,
WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
/*
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 2fca9dcf69f..f0c4e47e7e7 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -1143,6 +1143,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"timing_stress_for_test=,verbose=",
confchk_WT_CONNECTION_reconfigure, 21
},
+ { "WT_CONNECTION.rollback_to_stable",
+ "",
+ NULL, 0
+ },
{ "WT_CONNECTION.set_file_system",
"",
NULL, 0
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index ded0e39b218..df71ddf18f6 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -1262,6 +1262,25 @@ err: API_END_RET(session, ret);
}
/*
+ * __conn_rollback_to_stable --
+ * WT_CONNECTION->rollback_to_stable method.
+ *
+ * Wraps __wt_txn_rollback_to_stable in the standard connection API
+ * entry and exit macros; the work itself is done by that function.
+ */
+static int
+__conn_rollback_to_stable(WT_CONNECTION *wt_conn, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+
+ CONNECTION_API_CALL(
+ conn, session, rollback_to_stable, config, cfg);
+ WT_TRET(__wt_txn_rollback_to_stable(session, cfg));
+err: API_END_RET(session, ret);
+}
+
+/*
* __conn_config_append --
* Append an entry to a config stack.
*/
@@ -2207,6 +2226,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
__conn_open_session,
__conn_query_timestamp,
__conn_set_timestamp,
+ __conn_rollback_to_stable,
__conn_load_extension,
__conn_add_data_source,
__conn_add_collator,
diff --git a/src/include/config.h b/src/include/config.h
index 8bd201eea18..d02ec21023b 100644
--- a/src/include/config.h
+++ b/src/include/config.h
@@ -60,45 +60,46 @@ struct __wt_config_parser_impl {
#define WT_CONFIG_ENTRY_WT_CONNECTION_open_session 8
#define WT_CONFIG_ENTRY_WT_CONNECTION_query_timestamp 9
#define WT_CONFIG_ENTRY_WT_CONNECTION_reconfigure 10
-#define WT_CONFIG_ENTRY_WT_CONNECTION_set_file_system 11
-#define WT_CONFIG_ENTRY_WT_CONNECTION_set_timestamp 12
-#define WT_CONFIG_ENTRY_WT_CURSOR_close 13
-#define WT_CONFIG_ENTRY_WT_CURSOR_reconfigure 14
-#define WT_CONFIG_ENTRY_WT_SESSION_alter 15
-#define WT_CONFIG_ENTRY_WT_SESSION_begin_transaction 16
-#define WT_CONFIG_ENTRY_WT_SESSION_checkpoint 17
-#define WT_CONFIG_ENTRY_WT_SESSION_close 18
-#define WT_CONFIG_ENTRY_WT_SESSION_commit_transaction 19
-#define WT_CONFIG_ENTRY_WT_SESSION_compact 20
-#define WT_CONFIG_ENTRY_WT_SESSION_create 21
-#define WT_CONFIG_ENTRY_WT_SESSION_drop 22
-#define WT_CONFIG_ENTRY_WT_SESSION_join 23
-#define WT_CONFIG_ENTRY_WT_SESSION_log_flush 24
-#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 25
-#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 26
-#define WT_CONFIG_ENTRY_WT_SESSION_rebalance 27
-#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 28
-#define WT_CONFIG_ENTRY_WT_SESSION_rename 29
-#define WT_CONFIG_ENTRY_WT_SESSION_reset 30
-#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 31
-#define WT_CONFIG_ENTRY_WT_SESSION_salvage 32
-#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 33
-#define WT_CONFIG_ENTRY_WT_SESSION_strerror 34
-#define WT_CONFIG_ENTRY_WT_SESSION_timestamp_transaction 35
-#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 36
-#define WT_CONFIG_ENTRY_WT_SESSION_truncate 37
-#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 38
-#define WT_CONFIG_ENTRY_WT_SESSION_verify 39
-#define WT_CONFIG_ENTRY_colgroup_meta 40
-#define WT_CONFIG_ENTRY_file_config 41
-#define WT_CONFIG_ENTRY_file_meta 42
-#define WT_CONFIG_ENTRY_index_meta 43
-#define WT_CONFIG_ENTRY_lsm_meta 44
-#define WT_CONFIG_ENTRY_table_meta 45
-#define WT_CONFIG_ENTRY_wiredtiger_open 46
-#define WT_CONFIG_ENTRY_wiredtiger_open_all 47
-#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 48
-#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 49
+#define WT_CONFIG_ENTRY_WT_CONNECTION_rollback_to_stable 11
+#define WT_CONFIG_ENTRY_WT_CONNECTION_set_file_system 12
+#define WT_CONFIG_ENTRY_WT_CONNECTION_set_timestamp 13
+#define WT_CONFIG_ENTRY_WT_CURSOR_close 14
+#define WT_CONFIG_ENTRY_WT_CURSOR_reconfigure 15
+#define WT_CONFIG_ENTRY_WT_SESSION_alter 16
+#define WT_CONFIG_ENTRY_WT_SESSION_begin_transaction 17
+#define WT_CONFIG_ENTRY_WT_SESSION_checkpoint 18
+#define WT_CONFIG_ENTRY_WT_SESSION_close 19
+#define WT_CONFIG_ENTRY_WT_SESSION_commit_transaction 20
+#define WT_CONFIG_ENTRY_WT_SESSION_compact 21
+#define WT_CONFIG_ENTRY_WT_SESSION_create 22
+#define WT_CONFIG_ENTRY_WT_SESSION_drop 23
+#define WT_CONFIG_ENTRY_WT_SESSION_join 24
+#define WT_CONFIG_ENTRY_WT_SESSION_log_flush 25
+#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 26
+#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 27
+#define WT_CONFIG_ENTRY_WT_SESSION_rebalance 28
+#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 29
+#define WT_CONFIG_ENTRY_WT_SESSION_rename 30
+#define WT_CONFIG_ENTRY_WT_SESSION_reset 31
+#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 32
+#define WT_CONFIG_ENTRY_WT_SESSION_salvage 33
+#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 34
+#define WT_CONFIG_ENTRY_WT_SESSION_strerror 35
+#define WT_CONFIG_ENTRY_WT_SESSION_timestamp_transaction 36
+#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 37
+#define WT_CONFIG_ENTRY_WT_SESSION_truncate 38
+#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 39
+#define WT_CONFIG_ENTRY_WT_SESSION_verify 40
+#define WT_CONFIG_ENTRY_colgroup_meta 41
+#define WT_CONFIG_ENTRY_file_config 42
+#define WT_CONFIG_ENTRY_file_meta 43
+#define WT_CONFIG_ENTRY_index_meta 44
+#define WT_CONFIG_ENTRY_lsm_meta 45
+#define WT_CONFIG_ENTRY_table_meta 46
+#define WT_CONFIG_ENTRY_wiredtiger_open 47
+#define WT_CONFIG_ENTRY_wiredtiger_open_all 48
+#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 49
+#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 50
/*
* configuration section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/include/connection.h b/src/include/connection.h
index 7bd4a818495..2370444b681 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -377,6 +377,9 @@ struct __wt_connection_impl {
volatile uint64_t las_verb_gen_read;
volatile uint64_t las_verb_gen_write;
+ /* Set of btree IDs not being rolled back */
+ uint8_t *stable_rollback_bitstring;
+
/* Locked: collator list */
TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh;
diff --git a/src/include/extern.h b/src/include/extern.h
index 34aabea8e81..afd4c874cf1 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -91,7 +91,7 @@ extern int __wt_bloom_intersection(WT_BLOOM *bloom, WT_BLOOM *other) WT_GCC_FUNC
extern int __wt_bloom_close(WT_BLOOM *bloom) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_compact(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_compact_page_skip( WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_cursor_key_order_check( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt);
@@ -140,6 +140,7 @@ extern int __wt_btree_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBU
extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno);
extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern bool __wt_btree_immediately_durable(WT_SESSION_IMPL *session);
extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -180,6 +181,7 @@ extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, cons
extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_tree_walk_custom_skip( WT_SESSION_IMPL *session, WT_REF **refp, int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), void *func_cookie, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk_skip( WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -806,6 +808,7 @@ extern int __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM
extern int __wt_txn_named_snapshot_config(WT_SESSION_IMPL *session, const char *cfg[], bool *has_create, bool *has_drops) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session);
extern int __wt_txn_recover(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_global_query_timestamp( WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/include/flags.h b/src/include/flags.h
index 4f7b59c7849..243716c2ecb 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -38,17 +38,16 @@
#define WT_LOG_FSYNC 0x00000008
#define WT_LOG_SYNC_ENABLED 0x00000010
#define WT_READ_CACHE 0x00000001
-#define WT_READ_COMPACT 0x00000002
-#define WT_READ_NOTFOUND_OK 0x00000004
-#define WT_READ_NO_EMPTY 0x00000008
-#define WT_READ_NO_EVICT 0x00000010
-#define WT_READ_NO_GEN 0x00000020
-#define WT_READ_NO_WAIT 0x00000040
-#define WT_READ_PREV 0x00000080
-#define WT_READ_RESTART_OK 0x00000100
-#define WT_READ_SKIP_INTL 0x00000200
-#define WT_READ_TRUNCATE 0x00000400
-#define WT_READ_WONT_NEED 0x00000800
+#define WT_READ_NOTFOUND_OK 0x00000002
+#define WT_READ_NO_EMPTY 0x00000004
+#define WT_READ_NO_EVICT 0x00000008
+#define WT_READ_NO_GEN 0x00000010
+#define WT_READ_NO_WAIT 0x00000020
+#define WT_READ_PREV 0x00000040
+#define WT_READ_RESTART_OK 0x00000080
+#define WT_READ_SKIP_INTL 0x00000100
+#define WT_READ_TRUNCATE 0x00000200
+#define WT_READ_WONT_NEED 0x00000400
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_INTERNAL 0x00000002
#define WT_SESSION_LOCKED_CHECKPOINT 0x00000004
diff --git a/src/include/txn.i b/src/include/txn.i
index d693633fabe..30f29e0f5d0 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -36,7 +36,7 @@ __wt_timestamp_set(wt_timestamp_t *dest, const wt_timestamp_t *src)
* Check if a timestamp is equal to the special "zero" time.
*/
static inline bool
-__wt_timestamp_iszero(wt_timestamp_t *ts)
+__wt_timestamp_iszero(const wt_timestamp_t *ts)
{
return (ts->val == 0);
}
@@ -86,7 +86,7 @@ __wt_timestamp_set(wt_timestamp_t *dest, const wt_timestamp_t *src)
* Check if a timestamp is equal to the special "zero" time.
*/
static inline bool
-__wt_timestamp_iszero(wt_timestamp_t *ts)
+__wt_timestamp_iszero(const wt_timestamp_t *ts)
{
static const wt_timestamp_t zero_timestamp;
@@ -728,3 +728,25 @@ __wt_txn_am_oldest(WT_SESSION_IMPL *session)
return (true);
}
+
+/*
+ * __wt_txn_are_any_active --
+ * Check whether there are any running transactions.
+ *
+ * On success, sets *any_active to true when the global oldest
+ * transaction ID differs from the global current transaction ID, and
+ * returns 0. If refreshing the oldest ID fails, that error is returned
+ * and *any_active is not written.
+ */
+static inline int
+__wt_txn_are_any_active(WT_SESSION_IMPL *session, bool *any_active)
+{
+ WT_TXN_GLOBAL *txn_global;
+
+ txn_global = &S2C(session)->txn_global;
+
+ /*
+ * Ensure the oldest ID is as up to date as possible so we can use a
+ * simple check to find if there are any running transactions.
+ */
+ WT_RET(__wt_txn_update_oldest(session,
+ WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
+
+ *any_active = (txn_global->oldest_id != txn_global->current);
+ return (0);
+}
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 09939577c45..ef863176ba8 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -2273,6 +2273,32 @@ struct __wt_connection {
*/
int __F(set_timestamp)(
WT_CONNECTION *connection, const char *config);
+
+ /*!
+ * Rollback in-memory non-logged state to an earlier point in time.
+ *
+ * This method uses a timestamp to define the rollback point, and thus
+ * requires that the application uses timestamps and that the
+ * stable_timestamp has been set via a call to
+ * WT_CONNECTION::set_timestamp. Any updates to checkpoint durable
+ * tables that are more recent than the stable timestamp are removed.
+ *
+ * This method requires that there are no active cursor operations
+ * for the duration of the call.
+ *
+ * Any updates made to logged tables will not be rolled back. Any
+ * updates made without an associated timestamp will not be rolled
+ * back. See @ref transaction_timestamps.
+ *
+ * @snippet ex_all.c rollback to stable
+ *
+ * @param connection the connection handle
+ * @configempty{WT_CONNECTION.rollback_to_stable, see dist/api_data.py}
+ * @errors
+ */
+ int __F(rollback_to_stable)(
+ WT_CONNECTION *connection, const char *config);
+
/*! @} */
/*!
diff --git a/src/txn/txn.c b/src/txn/txn.c
index c7e7999d887..191f7e0ba0f 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -456,12 +456,6 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET_MSG(session, EINVAL,
"read timestamp %.*s older than oldest timestamp",
(int)cval.len, cval.str);
- if (!__wt_timestamp_iszero(&stable_timestamp) &&
- __wt_timestamp_cmp(
- &txn->read_timestamp, &stable_timestamp) > 0)
- WT_RET_MSG(session, EINVAL,
- "read timestamp %.*s newer than stable timestamp",
- (int)cval.len, cval.str);
__wt_txn_set_read_timestamp(session);
txn->isolation = WT_ISO_SNAPSHOT;
diff --git a/src/txn/txn_rollback_to_stable.c b/src/txn/txn_rollback_to_stable.c
new file mode 100644
index 00000000000..9c02322c526
--- /dev/null
+++ b/src/txn/txn_rollback_to_stable.c
@@ -0,0 +1,398 @@
+/*-
+ * Copyright (c) 2014-2017 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_TIMESTAMPS
+/*
+ * __txn_rollback_to_stable_lookaside_fixup --
+ * Remove any updates that need to be rolled back from the lookaside file.
+ *
+ * Walks the lookaside table with a lookaside cursor and removes each
+ * record whose timestamp is newer than the stable timestamp, except for
+ * records whose btree ID is set in the connection's
+ * stable_rollback_bitstring. The connection's lookaside record count is
+ * reduced by the number of records removed. Returns 0 on success or an
+ * error code.
+ */
+static int
+__txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_DECL_TIMESTAMP(rollback_timestamp)
+ WT_ITEM las_addr, las_key, las_timestamp;
+ WT_TXN_GLOBAL *txn_global;
+ uint64_t las_counter, las_txnid, remove_cnt;
+ uint32_t las_id, session_flags;
+
+ conn = S2C(session);
+ cursor = NULL;
+ remove_cnt = 0;
+ session_flags = 0; /* [-Werror=maybe-uninitialized] */
+ WT_CLEAR(las_timestamp);
+
+ /*
+ * Copy the stable timestamp, otherwise we'd need to lock it each time
+ * it's accessed. Even though the stable timestamp isn't supposed to be
+ * updated while rolling back, accessing it without a lock would
+ * violate protocol.
+ */
+ txn_global = &S2C(session)->txn_global;
+ __wt_readlock(session, &txn_global->rwlock);
+ __wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp);
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ __wt_las_cursor(session, &cursor, &session_flags);
+
+ /* Discard pages we read as soon as we're done with them. */
+ F_SET(session, WT_SESSION_NO_CACHE);
+
+ /* Walk the file. */
+ for (; (ret = cursor->next(cursor)) == 0; ) {
+ WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter,
+ &las_txnid, &las_timestamp, &las_key));
+
+ /*
+ * Check the file ID so we can skip durable tables: a set bit
+ * marks a btree that is not being rolled back.
+ */
+ if (__bit_test(conn->stable_rollback_bitstring, las_id))
+ continue;
+
+ /*
+ * Entries with no timestamp will have a timestamp of zero,
+ * which will fail the following check and cause them to never
+ * be removed.
+ */
+ if (__wt_timestamp_cmp(
+ &rollback_timestamp, las_timestamp.data) < 0) {
+ WT_ERR(cursor->remove(cursor));
+ ++remove_cnt;
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+ /*
+ * If there were races to remove records, we can over-count. Underflow
+ * isn't fatal, but check anyway so we don't skew low over time.
+ */
+ if (remove_cnt > conn->las_record_cnt)
+ conn->las_record_cnt = 0;
+ else if (remove_cnt > 0)
+ (void)__wt_atomic_sub64(&conn->las_record_cnt, remove_cnt);
+
+ F_CLR(session, WT_SESSION_NO_CACHE);
+
+ return (ret);
+}
+
+/*
+ * __txn_abort_newer_update --
+ * Abort updates in an update chain with timestamps newer than the
+ * rollback timestamp.
+ *
+ * Each such update has its transaction ID set to WT_TXN_ABORTED and its
+ * timestamp reset to zero; the other updates in the chain are left
+ * untouched.
+ */
+static void
+__txn_abort_newer_update(WT_SESSION_IMPL *session,
+ WT_UPDATE *upd, wt_timestamp_t *rollback_timestamp)
+{
+ WT_UPDATE *next_upd;
+ bool aborted_one;
+
+ aborted_one = false;
+ for (next_upd = upd; next_upd != NULL; next_upd = next_upd->next) {
+ /*
+ * Updates with no timestamp will have a timestamp of zero
+ * which will fail the following check and cause them to never
+ * be aborted.
+ */
+ if (__wt_timestamp_cmp(
+ rollback_timestamp, &next_upd->timestamp) < 0) {
+ next_upd->txnid = WT_TXN_ABORTED;
+ __wt_timestamp_set_zero(&next_upd->timestamp);
+
+ /*
+ * If any updates are aborted, all newer updates
+ * better be aborted as well.
+ *
+ * NOTE(review): the assertion below is only reached
+ * when aborted_one is false, which makes its
+ * "!aborted_one ||" condition always true, so as
+ * written it verifies nothing. Confirm the intended
+ * check before relying on it.
+ */
+ if (!aborted_one)
+ WT_ASSERT(session,
+ !aborted_one || upd == next_upd);
+ aborted_one = true;
+ }
+ }
+}
+
+/*
+ * __txn_abort_newer_row_skip --
+ * Apply the update abort check to each entry in an insert skip list.
+ *
+ * Every insert's update chain is passed to __txn_abort_newer_update
+ * with the same rollback timestamp.
+ */
+static void
+__txn_abort_newer_row_skip(WT_SESSION_IMPL *session,
+ WT_INSERT_HEAD *head, wt_timestamp_t *rollback_timestamp)
+{
+ WT_INSERT *ins;
+
+ WT_SKIP_FOREACH(ins, head)
+ __txn_abort_newer_update(session, ins->upd, rollback_timestamp);
+}
+
+/*
+ * __txn_abort_newer_row_leaf --
+ * Abort updates on a row leaf page with timestamps newer than the
+ * rollback timestamp.
+ *
+ * Covers three places updates can live on the page: the insert list
+ * ahead of the first on-disk key, the update chain of each on-disk key,
+ * and the insert list that follows each on-disk key.
+ */
+static void
+__txn_abort_newer_row_leaf(
+ WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp)
+{
+ WT_INSERT_HEAD *insert;
+ WT_ROW *rip;
+ WT_UPDATE *upd;
+ uint32_t i;
+
+ /*
+ * Review the insert list for keys before the first entry on the disk
+ * page.
+ */
+ if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
+ __txn_abort_newer_row_skip(
+ session, insert, rollback_timestamp);
+
+ /*
+ * Review updates that belong to keys that are on the disk image,
+ * as well as for keys inserted since the page was read from disk.
+ */
+ WT_ROW_FOREACH(page, rip, i) {
+ if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
+ __txn_abort_newer_update(
+ session, upd, rollback_timestamp);
+
+ if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
+ __txn_abort_newer_row_skip(
+ session, insert, rollback_timestamp);
+ }
+}
+
+/*
+ * __txn_abort_newer_updates --
+ * Abort updates on this page newer than the timestamp.
+ *
+ * Row-store leaf pages are processed, row-store internal pages are
+ * ignored, and any other page type fails with EINVAL. Returns 0
+ * otherwise.
+ */
+static int
+__txn_abort_newer_updates(
+ WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t *rollback_timestamp)
+{
+ WT_PAGE *page;
+
+ page = ref->page;
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ /*
+ * There is nothing to do for internal pages, since we aren't
+ * rolling back far enough to potentially include reconciled
+ * changes - and thus won't need to roll back structure
+ * changes on internal pages.
+ */
+ break;
+ case WT_PAGE_ROW_LEAF:
+ __txn_abort_newer_row_leaf(session, page, rollback_timestamp);
+ break;
+ default:
+ WT_RET_MSG(session, EINVAL, "rollback_to_stable "
+ "is only supported for row store btrees");
+ }
+
+ return (0);
+}
+
+/*
+ * __txn_rollback_to_stable_custom_skip --
+ * Tree-walk skip callback for rollback to stable: decide whether the
+ * walk should visit this ref.
+ *
+ * Sets *skipp to false for refs in the WT_REF_MEM or WT_REF_DELETED
+ * state and to true for every other state. The session and context
+ * arguments are unused, and the function always returns 0.
+ */
+static int
+__txn_rollback_to_stable_custom_skip(
+ WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(context);
+
+ /* Review all pages that are in memory, as well as deleted refs. */
+ if (ref->state == WT_REF_MEM || ref->state == WT_REF_DELETED)
+ *skipp = false;
+ else
+ *skipp = true;
+ return (0);
+}
+
+/*
+ * __txn_rollback_to_stable_btree_walk --
+ * Walk the session's current btree, undoing changes newer than the
+ * rollback timestamp.
+ *
+ * For each ref the walk returns: a saved page deletion with a newer
+ * timestamp is rolled back, and if the page is modified its newer
+ * updates are aborted. Unmodified pages are otherwise skipped. Returns
+ * the walk's final return value, or the error from aborting updates.
+ *
+ * NOTE(review): a failure in __txn_abort_newer_updates returns from
+ * inside the loop while the walk's current ref is still held by this
+ * function - confirm whether that ref needs to be released first.
+ */
+static int
+__txn_rollback_to_stable_btree_walk(
+ WT_SESSION_IMPL *session, wt_timestamp_t *rollback_timestamp)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_REF *ref;
+
+ /* Walk the tree, marking commits aborted where appropriate. */
+ ref = NULL;
+ while ((ret = __wt_tree_walk_custom_skip(session, &ref,
+ __txn_rollback_to_stable_custom_skip,
+ NULL, WT_READ_NO_EVICT)) == 0 && ref != NULL) {
+ page = ref->page;
+
+ /* Review deleted page saved to the ref. */
+ if (ref->page_del != NULL && __wt_timestamp_cmp(
+ rollback_timestamp, &ref->page_del->timestamp) < 0)
+ __wt_delete_page_rollback(session, ref);
+
+ if (!__wt_page_is_modified(page))
+ continue;
+
+ WT_RET(__txn_abort_newer_updates(
+ session, ref, rollback_timestamp));
+ }
+ return (ret);
+}
+
+/*
+ * __txn_rollback_to_stable_btree --
+ * Called for each open handle - choose to either skip or wipe the commits.
+ * Immediately durable handles are skipped after recording their btree ID
+ * in the connection's stable_rollback_bitstring; checkpoint handles and
+ * empty trees are skipped. Any other handle must be a row-store btree
+ * (EINVAL otherwise) and is walked, with eviction locked out of the file,
+ * using a copy of the global stable timestamp as the rollback timestamp.
+ * Returns 0 when the handle is skipped, otherwise the result of taking
+ * exclusive eviction access or of the tree walk.
+ */
+static int
+__txn_rollback_to_stable_btree(
+ WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_DECL_TIMESTAMP(rollback_timestamp)
+ WT_BTREE *btree;
+ WT_TXN_GLOBAL *txn_global;
+
+ /* The configuration is not used by this function. */
+ WT_UNUSED(cfg);
+
+ btree = S2BT(session);
+ txn_global = &S2C(session)->txn_global;
+
+ /*
+ * Immediately durable files don't get their commits wiped. This case
+ * mostly exists to support the semantic required for the oplog in
+ * MongoDB - updates that have been made to the oplog should not be
+ * aborted. It also wouldn't be safe to roll back updates for any
+ * table that had its records logged, since those updates would be
+ * recovered after a crash making them inconsistent.
+ */
+ if (__wt_btree_immediately_durable(session)) {
+ /*
+ * Add the btree ID to the bitstring, so we can exclude any
+ * lookaside entries for this btree.
+ */
+ __bit_set(
+ S2C(session)->stable_rollback_bitstring, btree->id);
+ return (0);
+ }
+
+ /* There is never anything to do for checkpoint handles */
+ if (session->dhandle->checkpoint != NULL)
+ return (0);
+
+ /* There is nothing to do on an empty tree. */
+ if (btree->root.page == NULL)
+ return (0);
+
+ /* Only row-store btrees can be rolled back. */
+ if (btree->type != BTREE_ROW)
+ WT_RET_MSG(session, EINVAL, "rollback_to_stable "
+ "is only supported for row store btrees");
+
+ /*
+ * Copy the stable timestamp, otherwise we'd need to lock it each time
+ * it's accessed. Even though the stable timestamp isn't supposed to be
+ * updated while rolling back, accessing it without a lock would
+ * violate protocol.
+ */
+ __wt_readlock(session, &txn_global->rwlock);
+ __wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp);
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ /*
+ * Ensure the eviction server is out of the file - we don't
+ * want it messing with us. This step shouldn't be required, but
+ * it simplifies some of the reasoning about what state trees can
+ * be in.
+ */
+ WT_RET(__wt_evict_file_exclusive_on(session));
+ ret = __txn_rollback_to_stable_btree_walk(
+ session, &rollback_timestamp);
+ /* Let eviction back in whether or not the walk succeeded. */
+ __wt_evict_file_exclusive_off(session);
+
+ return (ret);
+}
+
+/*
+ * __txn_rollback_to_stable_check --
+ *	Ensure the rollback request is reasonable: a stable timestamp must
+ *	have been set and there must be no active transactions. Returns
+ *	EINVAL if either condition fails.
+ */
+static int
+__txn_rollback_to_stable_check(WT_SESSION_IMPL *session)
+{
+	WT_TXN_GLOBAL *txn_global;
+	bool stable_unset, txn_active;
+
+	txn_global = &S2C(session)->txn_global;
+
+	/* Sample the stable timestamp while holding the global read lock. */
+	__wt_readlock(session, &txn_global->rwlock);
+	stable_unset = __wt_timestamp_iszero(&txn_global->stable_timestamp);
+	__wt_readunlock(session, &txn_global->rwlock);
+	if (stable_unset)
+		WT_RET_MSG(session, EINVAL, "rollback_to_stable requires a "
+		    "stable timestamp");
+
+	/*
+	 * Help the user - see if they have any active transactions. I'd
+	 * like to check the transaction running flag, but that would
+	 * require peeking into all open sessions, which isn't really
+	 * kosher.
+	 */
+	WT_RET(__wt_txn_are_any_active(session, &txn_active));
+	if (txn_active)
+		WT_RET_MSG(session, EINVAL,
+		    "rollback_to_stable illegal with active transactions");
+
+	return (0);
+}
+#endif
+
+/*
+ * __wt_txn_rollback_to_stable --
+ * Rollback all in-memory state related to timestamps more recent than
+ * the stable timestamp, then remove the matching content from the
+ * lookaside file. Fails with EINVAL when WiredTiger was built without
+ * timestamp support, when no stable timestamp has been set, or when
+ * there are active transactions.
+ */
+int
+__wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
+{
+#ifndef HAVE_TIMESTAMPS
+ WT_UNUSED(cfg);
+
+ WT_RET_MSG(session, EINVAL, "rollback_to_stable "
+ "requires a version of WiredTiger built with timestamp "
+ "support");
+#else
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ /* Reject the request before allocating anything. */
+ WT_RET(__txn_rollback_to_stable_check(session));
+
+ /*
+ * Allocate the bitstring in which the per-btree function records the
+ * IDs of immediately durable btrees, sized by the connection's next
+ * file ID. From here on, errors must go through the err label so the
+ * bitstring is freed.
+ */
+ WT_RET(__bit_alloc(session,
+ conn->next_file_id, &conn->stable_rollback_bitstring));
+ WT_ERR(__wt_conn_btree_apply(session,
+ NULL, __txn_rollback_to_stable_btree, NULL, cfg));
+
+ /*
+ * Clear any offending content from the lookaside file. This must be
+ * done after the in-memory application, since the process of walking
+ * trees in cache populates the bitstring that is used to check which
+ * lookaside records should be removed.
+ */
+ WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session));
+ /* The bitstring is released on both the success and error paths. */
+err: __wt_free(session, conn->stable_rollback_bitstring);
+ return (ret);
+#endif
+}
diff --git a/test/suite/test_timestamp04.py b/test/suite/test_timestamp04.py
new file mode 100644
index 00000000000..146326834db
--- /dev/null
+++ b/test/suite/test_timestamp04.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2017 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_timestamp04.py
+# Timestamps: Test that rollback_to_stable obeys expected visibility rules
+#
+
+import datetime
+import random
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+def timestamp_str(t):
+    """Return the integer timestamp t formatted as lowercase hexadecimal."""
+    hex_digits = '%x' % t
+    return hex_digits
+
+def timestamp_ret_str(t):
+    """Return t as hex, prefixed with '0' if needed to make the length even."""
+    digits = timestamp_str(t)
+    return digits if len(digits) % 2 == 0 else '0' + digits
+
+class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess):
+    """Test that rollback_to_stable obeys the expected visibility rules."""
+
+    tablename = 'test_timestamp04'
+    uri = 'table:' + tablename
+
+    # Only the row-store scenario is enabled; the column-store and LSM
+    # scenarios are kept here, disabled.
+    scenarios = make_scenarios([
+        #('col', dict(extra_config=',key_format=r')),
+        #('lsm', dict(extra_config=',type=lsm')),
+        ('row', dict(extra_config=',memory_page_max=32k,leaf_page_max=8k,internal_page_max=8k')),
+    ])
+
+    # Rollback only works for non-durable tables
+    conn_config = 'cache_size=20MB,log=(enabled=false)'
+
+    def check(self, session, txn_config, expected, missing=False):
+        """Check what a cursor on self.uri sees.
+
+        session    -- session used to open the cursor.
+        txn_config -- if set, the check runs inside a transaction begun with
+                      this configuration and committed afterwards.
+        expected   -- dict mapping each key to its expected first value field.
+        missing    -- when False, iterating the cursor must yield exactly
+                      `expected` (ignoring rows whose first value field is 0)
+                      and every key must be found with the expected value;
+                      when True, searching for each key must return
+                      WT_NOTFOUND.
+        """
+        if txn_config:
+            session.begin_transaction(txn_config)
+        c = session.open_cursor(self.uri, None)
+        if not missing:
+            actual = dict((k, v) for k, v, pad in c if v != 0)
+            self.assertEqual(actual, expected)
+        # Search for the expected items as well as iterating
+        for k, v in expected.items():
+            if not missing:
+                self.assertEqual(c[k][0], v, "for key " + str(k))
+            else:
+                c.set_key(k)
+                self.assertEqual(c.search(), wiredtiger.WT_NOTFOUND)
+        c.close()
+        if txn_config:
+            session.commit_transaction()
+
+    def test_basic(self):
+        """Roll back to a stable timestamp twice and verify what is visible."""
+        if not wiredtiger.timestamp_build():
+            self.skipTest('requires a timestamp build')
+
+        # Configure small page sizes to ensure eviction comes through and we have a
+        # somewhat complex tree
+        self.session.create(self.uri,
+            'key_format=i,value_format=iS,memory_page_max=16k,leaf_page_max=8k' + self.extra_config)
+        c = self.session.open_cursor(self.uri)
+
+        # Insert keys each with timestamp=key, in some order
+        key_range = 10000
+        # Floor division keeps these integers on both Python 2 and 3; they are
+        # used as slice bounds and formatted with '%x'.
+        half = key_range // 2
+        quarter = key_range // 4
+        keys = list(range(1, key_range + 1))
+
+        for k in keys:
+            self.session.begin_transaction()
+            c[k] = (1, 'the quick brown fox')
+            self.session.commit_transaction('commit_timestamp=' + timestamp_str(k))
+            # Setup an oldest timestamp to ensure state remains in cache.
+            if k == 1:
+                self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1))
+
+        # Roll back half the timestamps.
+        self.conn.set_timestamp('stable_timestamp=' + timestamp_str(half))
+        self.conn.rollback_to_stable()
+
+        # Now check that we see the expected state when reading at the stable
+        # timestamp: the first half of the keys are present, and every key
+        # in the second half (starting with key half + 1) is gone.
+        self.check(self.session, 'read_timestamp=' + timestamp_str(half),
+            dict((k, 1) for k in keys[:half]))
+        self.check(self.session, 'read_timestamp=' + timestamp_str(half),
+            dict((k, 1) for k in keys[half:]), missing=True)
+
+        # Bump the oldest timestamp, we're not going back...
+        self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(half))
+
+        # Update the values again in preparation for rolling back more
+        for k in keys:
+            self.session.begin_transaction()
+            c[k] = (2, 'jumped over the lazy dog')
+            self.session.commit_transaction('commit_timestamp=' + timestamp_str(k + key_range))
+
+        # Now we should have every key with value 2
+        self.check(self.session, 'read_timestamp=' + timestamp_str(2 * key_range),
+            dict((k, 2) for k in keys[:]))
+
+        # Roll back all but the first quarter of the new commits: updates for
+        # keys up to quarter + 1 have timestamps at or below the new stable
+        # timestamp and are kept.
+        self.conn.set_timestamp('stable_timestamp=' + timestamp_str(1 + key_range + quarter))
+        self.conn.rollback_to_stable()
+
+        # Only the first half of the keys should remain: keys up to
+        # quarter + 1 have a value of 2, the rest have a value of 1. The
+        # second half of the keys should still be missing.
+        self.check(self.session, 'read_timestamp=' + timestamp_str(2 * key_range),
+            dict((k, (2 if k <= quarter + 1 else 1)) for k in keys[:half]))
+        self.check(self.session, 'read_timestamp=' + timestamp_str(half),
+            dict((k, 1) for k in keys[half:]), missing=True)
+
+# Run this test through the wttest runner when the file is executed directly.
+if __name__ == '__main__':
+ wttest.run()