summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/strict.m44
-rw-r--r--src/third_party/wiredtiger/dist/s_string.ok2
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_async.c2
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/async/async_api.c3
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c8
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c3
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_ret.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_sync.c5
-rw-r--r--src/third_party/wiredtiger/src/btree/row_srch.c9
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_capacity.c3
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_join.c24
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_std.c2
-rw-r--r--src/third_party/wiredtiger/src/include/btmem.h17
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i34
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h8
-rw-r--r--src/third_party/wiredtiger/src/include/gcc.h43
-rw-r--r--src/third_party/wiredtiger/src/include/lint.h9
-rw-r--r--src/third_party/wiredtiger/src/include/msvc.h5
-rw-r--r--src/third_party/wiredtiger/src/include/mutex.i6
-rw-r--r--src/third_party/wiredtiger/src/include/reconcile.h6
-rw-r--r--src/third_party/wiredtiger/src/include/serial.i32
-rw-r--r--src/third_party/wiredtiger/src/include/txn.i3
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c31
-rw-r--r--src/third_party/wiredtiger/src/support/modify.c355
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c35
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c4
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c89
-rw-r--r--src/third_party/wiredtiger/test/format/format.h8
-rw-r--r--src/third_party/wiredtiger/test/format/ops.c21
-rw-r--r--src/third_party/wiredtiger/test/format/snap.c188
-rw-r--r--src/third_party/wiredtiger/test/format/util.c42
-rw-r--r--src/third_party/wiredtiger/test/suite/test_compat02.py8
-rwxr-xr-xsrc/third_party/wiredtiger/test/suite/test_txn19.py230
34 files changed, 844 insertions, 399 deletions
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/strict.m4 b/src/third_party/wiredtiger/build_posix/aclocal/strict.m4
index 3600a39fe43..b912335fd16 100644
--- a/src/third_party/wiredtiger/build_posix/aclocal/strict.m4
+++ b/src/third_party/wiredtiger/build_posix/aclocal/strict.m4
@@ -134,6 +134,10 @@ AC_DEFUN([AM_CLANG_WARNINGS], [
w="$w -Wno-unused-command-line-argument";;
esac
+ # We occasionally use an extra semicolon to indicate an empty loop or
+ # conditional body.
+ w="$w -Wno-extra-semi-stmt"
+
# Ignore unrecognized options.
w="$w -Wno-unknown-warning-option"
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 6ce1ad16a5f..563236661aa 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -1072,6 +1072,7 @@ ownp
pR
pS
packv
+pagedump
pagesize
parens
pareto
@@ -1325,6 +1326,7 @@ unmodify
unordered
unpackv
unpadded
+unreconciled
unreferenced
unregister
unsized
diff --git a/src/third_party/wiredtiger/examples/c/ex_async.c b/src/third_party/wiredtiger/examples/c/ex_async.c
index e9ffad4807c..85f783092fa 100644
--- a/src/third_party/wiredtiger/examples/c/ex_async.c
+++ b/src/third_party/wiredtiger/examples/c/ex_async.c
@@ -37,7 +37,7 @@ static const char *home;
#elif defined(_WIN32)
#define ATOMIC_ADD(v, val) (_InterlockedExchangeAdd(&(v), val) + val)
#else
-#define ATOMIC_ADD(v, val) __sync_add_and_fetch(&(v), val)
+#define ATOMIC_ADD(v, val) __atomic_add_fetch(&(v), val, __ATOMIC_SEQ_CST)
#endif
static int global_error = 0;
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index b50a7f09165..2673308c46e 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "1a1197ef3c891458cd73290ad9b01c1e969f7e86",
+ "commit": "280c572c8097a322e429a349f73135266f3faacf",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-4.2"
diff --git a/src/third_party/wiredtiger/src/async/async_api.c b/src/third_party/wiredtiger/src/async/async_api.c
index 9f9aa979139..0ef85b8cd28 100644
--- a/src/third_party/wiredtiger/src/async/async_api.c
+++ b/src/third_party/wiredtiger/src/async/async_api.c
@@ -160,8 +160,7 @@ retry:
WT_RET(__async_get_format(conn, uri, config, op));
op->unique_id = __wt_atomic_add64(&async->op_id, 1);
op->optype = WT_AOP_NONE;
- (void)__wt_atomic_store32(
- &async->ops_index, (i + 1) % conn->async_size);
+ async->ops_index = (i + 1) % conn->async_size;
*opp = op;
return (0);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 4d94bcdb23e..d045405f85a 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -1506,8 +1506,11 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
if (!F_ISSET(cursor, WT_CURSTD_KEY_INT) ||
!F_ISSET(cursor, WT_CURSTD_VALUE_INT))
WT_ERR(__wt_btcur_search(cbt));
+
+ WT_ERR(__wt_modify_pack(cursor, &modify, entries, nentries));
+
orig = cursor->value.size;
- WT_ERR(__wt_modify_apply_api(session, cursor, entries, nentries));
+ WT_ERR(__wt_modify_apply(cursor, modify->data));
new = cursor->value.size;
WT_ERR(__cursor_size_chk(session, &cursor->value));
@@ -1527,8 +1530,7 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
F_CLR(cursor, WT_CURSTD_OVERWRITE);
if (cursor->value.size <= 64 || __cursor_chain_exceeded(cbt))
ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD);
- else if ((ret =
- __wt_modify_pack(session, &modify, entries, nentries)) == 0)
+ else
ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFY);
if (overwrite)
F_SET(cursor, WT_CURSTD_OVERWRITE);
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 685fb983718..9f5cadfecd0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -718,6 +718,7 @@ __wt_debug_page(
*/
int
__wt_debug_cursor_page(void *cursor_arg, const char *ofile)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
WT_CURSOR *cursor;
WT_CURSOR_BTREE *cbt;
@@ -889,7 +890,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
if (split_gen != 0)
WT_RET(ds->f(ds, ", split-gen=%" PRIu64, split_gen));
if (mod != NULL)
- WT_RET(ds->f(ds, ", write-gen=%" PRIu32, mod->write_gen));
+ WT_RET(ds->f(ds, ", page-state=%" PRIu32, mod->page_state));
WT_RET(ds->f(ds,
", memory-size %" WT_SIZET_FMT, page->memory_footprint));
WT_RET(ds->f(ds, "\n"));
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
index 52277efb85d..d41f76c6442 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ret.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -250,7 +250,7 @@ __wt_value_return_upd(WT_SESSION_IMPL *session,
* updates.
*/
while (i > 0)
- WT_ERR(__wt_modify_apply(session, cursor, listp[--i]->data));
+ WT_ERR(__wt_modify_apply(cursor, listp[--i]->data));
err: if (allocated_bytes != 0)
__wt_free(session, listp);
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index a7d34d49f84..d2ac866bc59 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -322,6 +322,10 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* discarded), that is not wasted effort because
* checkpoint doesn't need to write the page again.
*
* Once the transaction has given up its snapshot it
+ * is no longer safe to reconcile pages. That happens
+ * prior to the final metadata checkpoint.
+ *
* XXX Only attempt this eviction when there are no
* readers older than the checkpoint. Otherwise, a bug
* in eviction can mark the page clean and discard
@@ -331,6 +335,7 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
if (!WT_PAGE_IS_INTERNAL(page) &&
page->read_gen == WT_READGEN_WONT_NEED &&
!tried_eviction &&
+ F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT) &&
(!F_ISSET(txn, WT_TXN_HAS_TS_READ) ||
txn->read_timestamp ==
conn->txn_global.pinned_timestamp)) {
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 8603d329c15..a01ef5a49a7 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -30,6 +30,15 @@ __search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
if ((ins = WT_SKIP_LAST(ins_head)) == NULL)
return (0);
+ /*
+ * Since the head of the skip list doesn't get mutated within this
+ * function, the compiler may move the assignment above into the
+ * loop below if it needs to (and may read a different value on each
+ * loop due to other threads mutating the skip list).
+ *
+ * Place a read barrier here to avoid this issue.
+ */
+ WT_READ_BARRIER();
key.data = WT_INSERT_KEY(ins);
key.size = WT_INSERT_KEY_SIZE(ins);
diff --git a/src/third_party/wiredtiger/src/conn/conn_capacity.c b/src/third_party/wiredtiger/src/conn/conn_capacity.c
index a75bdd259c4..38052a8e412 100644
--- a/src/third_party/wiredtiger/src/conn/conn_capacity.c
+++ b/src/third_party/wiredtiger/src/conn/conn_capacity.c
@@ -270,8 +270,7 @@ __capacity_reserve(uint64_t *reservation, uint64_t bytes, uint64_t capacity,
* If the reservation clock is out of date, bring it
* to within a second of a current time.
*/
- (void)__wt_atomic_store64(reservation,
- (now_ns - WT_BILLION) + res_len);
+ *reservation = (now_ns - WT_BILLION) + res_len;
} else
res_value = now_ns;
diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c
index 07bfe02a142..12be6929022 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_join.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_join.c
@@ -508,10 +508,8 @@ __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
if (!passed) {
if (iter != NULL &&
(iter->is_equal ||
- F_ISSET(end, WT_CURJOIN_END_LT))) {
- WT_RET(__curjoin_iter_bump(iter));
+ F_ISSET(end, WT_CURJOIN_END_LT)))
return (WT_NOTFOUND);
- }
if (!disjunction)
return (WT_NOTFOUND);
iter = NULL;
@@ -606,6 +604,9 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
WT_ITEM v;
bool bloom_found;
+ /* We cannot have a bloom filter on a join entry with subordinates. */
+ WT_ASSERT(session, entry->bloom == NULL || entry->subjoin == NULL);
+
if (entry->subjoin == NULL && iter != NULL &&
(iter->end_pos + iter->end_skip >= entry->ends_next ||
(iter->end_skip > 0 &&
@@ -633,16 +634,19 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
bloom_found = true;
}
if (entry->subjoin != NULL) {
+ /*
+ * If we have a subordinate join, the membership
+ * check is delegated to it.
+ */
WT_ASSERT(session,
iter == NULL || entry->subjoin == iter->child->cjoin);
- ret = __curjoin_entries_in_range(session, entry->subjoin,
- key, iter == NULL ? NULL : iter->child);
+ WT_ERR(__curjoin_entries_in_range(session, entry->subjoin,
+ key, iter == NULL ? NULL : iter->child));
if (iter != NULL &&
- WT_CURJOIN_ITER_CONSUMED(iter->child)) {
- WT_ERR(__curjoin_iter_bump(iter));
- ret = WT_NOTFOUND;
- }
- return (ret);
+ WT_CURJOIN_ITER_CONSUMED(iter->child))
+ return (WT_NOTFOUND);
+ /* There's nothing more to do for this node. */
+ return (0);
}
if (entry->index != NULL) {
/*
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
index 073df6eaaf6..22d067ef90e 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_std.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -931,7 +931,7 @@ __cursor_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
/* Get the current value, apply the modifications. */
WT_ERR(cursor->search(cursor));
- WT_ERR(__wt_modify_apply_api(session, cursor, entries, nentries));
+ WT_ERR(__wt_modify_apply_api(cursor, entries, nentries));
/* We know both key and value are set, "overwrite" doesn't matter. */
ret = cursor->update(cursor);
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index a7c289a7b7f..03643f473e1 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -489,10 +489,21 @@ struct __wt_page_modify {
WT_SPINLOCK page_lock; /* Page's spinlock */
/*
- * The write generation is incremented when a page is modified, a page
- * is clean if the write generation is 0.
+ * The page state is incremented when a page is modified.
+ *
+ * WT_PAGE_CLEAN --
+ * The page is clean.
+ * WT_PAGE_DIRTY_FIRST --
+ * The page is in this state after the first operation that marks a
+ * page dirty, or when reconciliation is checking to see if it has
+ * done enough work to be able to mark the page clean.
+ * WT_PAGE_DIRTY --
+ * Two or more updates have been added to the page.
*/
- uint32_t write_gen;
+#define WT_PAGE_CLEAN 0
+#define WT_PAGE_DIRTY_FIRST 1
+#define WT_PAGE_DIRTY 2
+ uint32_t page_state;
#define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */
#define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index d0679a9fb38..3fa5d60f1f1 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -34,7 +34,8 @@ __wt_page_is_empty(WT_PAGE *page)
static inline bool
__wt_page_evict_clean(WT_PAGE *page)
{
- return (page->modify == NULL || (page->modify->write_gen == 0 &&
+ return (page->modify == NULL ||
+ (page->modify->page_state == WT_PAGE_CLEAN &&
page->modify->rec_result == 0));
}
@@ -45,7 +46,8 @@ __wt_page_evict_clean(WT_PAGE *page)
static inline bool
__wt_page_is_modified(WT_PAGE *page)
{
- return (page->modify != NULL && page->modify->write_gen != 0);
+ return (page->modify != NULL &&
+ page->modify->page_state != WT_PAGE_CLEAN);
}
/*
@@ -496,19 +498,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD));
last_running = 0;
- if (page->modify->write_gen == 0)
+ if (page->modify->page_state == WT_PAGE_CLEAN)
last_running = S2C(session)->txn_global.last_running;
/*
- * We depend on atomic-add being a write barrier, that is, a barrier to
- * ensure all changes to the page are flushed before updating the page
- * write generation and/or marking the tree dirty, otherwise checkpoints
+ * We depend on the atomic operation being a write barrier, that is, a
+ * barrier to ensure all changes to the page are flushed before updating
+ * the page state and/or marking the tree dirty, otherwise checkpoints
* and/or page reconciliation might be looking at a clean page/tree.
*
* Every time the page transitions from clean to dirty, update the cache
* and transactional information.
+ *
+ * The page state can only ever be incremented above dirty by the number
+ * of concurrently running threads, so the counter will never approach
+ * the point where it would wrap.
*/
- if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) {
+ if (page->modify->page_state < WT_PAGE_DIRTY &&
+ __wt_atomic_add32(&page->modify->page_state, 1) ==
+ WT_PAGE_DIRTY_FIRST) {
__wt_cache_dirty_incr(session, page);
/*
@@ -579,7 +587,17 @@ __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
* Allow the call to be made on clean pages.
*/
if (__wt_page_is_modified(page)) {
- page->modify->write_gen = 0;
+ /*
+ * The only part where ordering matters is during
+ * reconciliation where updates on other threads are performing
+ * writes to the page state that need to be visible to the
+ * reconciliation thread.
+ *
+ * Since clearing of the page state is not going to be happening
+ * during reconciliation on a separate thread, there's no write
+ * barrier needed here.
+ */
+ page->modify->page_state = WT_PAGE_CLEAN;
__wt_cache_dirty_decr(session, page);
}
}
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 80046127d3f..5dbd7115684 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -277,7 +277,7 @@ extern int __wt_curtable_get_value(WT_CURSOR *cursor, ...) WT_GCC_FUNC_DECL_ATTR
extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_debug_cursor_page(void *cursor_arg, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_debug_cursor_page(void *cursor_arg, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_disk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_mode_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, uint32_t checksum, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -503,9 +503,9 @@ extern int __wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char
extern int __wt_metadata_set_base_write_gen(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_metadata_turtle_rewrite(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_metadata_update( WT_SESSION_IMPL *session, const char *key, const char *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_modify_apply( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_modify_apply_api(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_modify_pack(WT_SESSION_IMPL *session, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_modify_apply(WT_CURSOR *cursor, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_modify_apply_api(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_modify_pack(WT_CURSOR *cursor, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h
index a9d271ed0bd..7ee64cb663f 100644
--- a/src/third_party/wiredtiger/src/include/gcc.h
+++ b/src/third_party/wiredtiger/src/include/gcc.h
@@ -89,38 +89,40 @@
* swap) operations.
*/
-#ifdef __clang__
/*
- * We avoid __sync_bool_compare_and_swap with due to problems with optimization
- * with some versions of clang. See http://llvm.org/bugs/show_bug.cgi?id=21499
- * for details.
+ * We've hit optimization bugs with Clang 3.5 in the past when using the atomic
+ * builtins. See http://llvm.org/bugs/show_bug.cgi?id=21499 for details.
*/
-#define WT_ATOMIC_CAS(ptr, old, new) \
- (__sync_val_compare_and_swap(ptr, old, new) == (old))
-#else
-#define WT_ATOMIC_CAS(ptr, old, new) \
- __sync_bool_compare_and_swap(ptr, old, new)
+#if defined(__clang__) && \
+ defined(__clang_major__) && defined(__clang_minor__) && \
+ (((__clang_major__ == 3) && (__clang_minor__ <= 5)) || \
+ (__clang_major__ < 3))
+#error "Clang versions 3.5 and earlier are unsupported by WiredTiger"
#endif
+
+#define WT_ATOMIC_CAS(ptr, oldp, new) \
+ __atomic_compare_exchange_n( \
+ ptr, oldp, new, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#define WT_ATOMIC_CAS_FUNC(name, vp_arg, old_arg, new_arg) \
static inline bool \
__wt_atomic_cas##name(vp_arg, old_arg, new_arg) \
{ \
- return (WT_ATOMIC_CAS(vp, old, new)); \
+ return (WT_ATOMIC_CAS(vp, &old, new)); \
}
WT_ATOMIC_CAS_FUNC(8, uint8_t *vp, uint8_t old, uint8_t new)
WT_ATOMIC_CAS_FUNC(16, uint16_t *vp, uint16_t old, uint16_t new)
WT_ATOMIC_CAS_FUNC(32, uint32_t *vp, uint32_t old, uint32_t new)
WT_ATOMIC_CAS_FUNC(v32, \
- volatile uint32_t *vp, volatile uint32_t old, volatile uint32_t new)
+ volatile uint32_t *vp, uint32_t old, volatile uint32_t new)
WT_ATOMIC_CAS_FUNC(i32, int32_t *vp, int32_t old, int32_t new)
WT_ATOMIC_CAS_FUNC(iv32, \
- volatile int32_t *vp, volatile int32_t old, volatile int32_t new)
+ volatile int32_t *vp, int32_t old, volatile int32_t new)
WT_ATOMIC_CAS_FUNC(64, uint64_t *vp, uint64_t old, uint64_t new)
WT_ATOMIC_CAS_FUNC(v64, \
- volatile uint64_t *vp, volatile uint64_t old, volatile uint64_t new)
+ volatile uint64_t *vp, uint64_t old, volatile uint64_t new)
WT_ATOMIC_CAS_FUNC(i64, int64_t *vp, int64_t old, int64_t new)
WT_ATOMIC_CAS_FUNC(iv64, \
- volatile int64_t *vp, volatile int64_t old, volatile int64_t new)
+ volatile int64_t *vp, int64_t old, volatile int64_t new)
WT_ATOMIC_CAS_FUNC(size, size_t *vp, size_t old, size_t new)
/*
@@ -130,29 +132,24 @@ WT_ATOMIC_CAS_FUNC(size, size_t *vp, size_t old, size_t new)
static inline bool
__wt_atomic_cas_ptr(void *vp, void *old, void *new)
{
- return (WT_ATOMIC_CAS((void **)vp, old, new));
+ return (WT_ATOMIC_CAS((void **)vp, &old, new));
}
#define WT_ATOMIC_FUNC(name, ret, vp_arg, v_arg) \
static inline ret \
__wt_atomic_add##name(vp_arg, v_arg) \
{ \
- return (__sync_add_and_fetch(vp, v)); \
+ return (__atomic_add_fetch(vp, v, __ATOMIC_SEQ_CST)); \
} \
static inline ret \
__wt_atomic_fetch_add##name(vp_arg, v_arg) \
{ \
- return (__sync_fetch_and_add(vp, v)); \
-} \
-static inline ret \
-__wt_atomic_store##name(vp_arg, v_arg) \
-{ \
- return (__sync_lock_test_and_set(vp, v)); \
+ return (__atomic_fetch_add(vp, v, __ATOMIC_SEQ_CST)); \
} \
static inline ret \
__wt_atomic_sub##name(vp_arg, v_arg) \
{ \
- return (__sync_sub_and_fetch(vp, v)); \
+ return (__atomic_sub_fetch(vp, v, __ATOMIC_SEQ_CST)); \
}
WT_ATOMIC_FUNC(8, uint8_t, uint8_t *vp, uint8_t v)
WT_ATOMIC_FUNC(16, uint16_t, uint16_t *vp, uint16_t v)
diff --git a/src/third_party/wiredtiger/src/include/lint.h b/src/third_party/wiredtiger/src/include/lint.h
index 903b0238b37..5d7cee531c2 100644
--- a/src/third_party/wiredtiger/src/include/lint.h
+++ b/src/third_party/wiredtiger/src/include/lint.h
@@ -35,15 +35,6 @@ __wt_atomic_fetch_add##name(type *vp, type v) \
return (orig); \
} \
static inline ret \
-__wt_atomic_store##name(type *vp, type v) \
-{ \
- type orig; \
- \
- orig = *vp; \
- *vp = v; \
- return (orig); \
-} \
-static inline ret \
__wt_atomic_sub##name(type *vp, type v) \
{ \
*vp -= v; \
diff --git a/src/third_party/wiredtiger/src/include/msvc.h b/src/third_party/wiredtiger/src/include/msvc.h
index 1586dae22b8..f4d8dc942f6 100644
--- a/src/third_party/wiredtiger/src/include/msvc.h
+++ b/src/third_party/wiredtiger/src/include/msvc.h
@@ -45,11 +45,6 @@ __wt_atomic_fetch_add##name(type *vp, type v) \
return (_InterlockedExchangeAdd ## s((t *)(vp), (t)(v))); \
} \
static inline ret \
-__wt_atomic_store##name(type *vp, type v) \
-{ \
- return (_InterlockedExchange ## s((t *)(vp), (t)(v))); \
-} \
-static inline ret \
__wt_atomic_sub##name(type *vp, type v) \
{ \
return (_InterlockedExchangeAdd ## s((t *)(vp), - (t)v) - (v)); \
diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i
index 15e7218dd28..660ee22ed96 100644
--- a/src/third_party/wiredtiger/src/include/mutex.i
+++ b/src/third_party/wiredtiger/src/include/mutex.i
@@ -68,7 +68,7 @@ __wt_spin_trylock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
WT_UNUSED(session);
- return (__sync_lock_test_and_set(&t->lock, 1) == 0 ? 0 : EBUSY);
+ /* __atomic_test_and_set returns true if the lock was already held. */
+ return (!__atomic_test_and_set(&t->lock, __ATOMIC_ACQUIRE) ? 0 : EBUSY);
}
/*
@@ -82,7 +82,7 @@ __wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
WT_UNUSED(session);
- while (__sync_lock_test_and_set(&t->lock, 1)) {
+ while (__atomic_test_and_set(&t->lock, __ATOMIC_ACQUIRE)) {
for (i = 0; t->lock && i < WT_SPIN_COUNT; i++)
WT_PAUSE();
if (t->lock)
@@ -99,7 +99,7 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
WT_UNUSED(session);
- __sync_lock_release(&t->lock);
+ __atomic_clear(&t->lock, __ATOMIC_RELEASE);
}
#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX || \
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
index be6440c27bc..c3c46ec11c5 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.h
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -20,12 +20,6 @@ struct __wt_reconcile {
uint32_t flags; /* Caller's configuration */
/*
- * Track start/stop write generation to decide if all changes to the
- * page are written.
- */
- uint32_t orig_write_gen;
-
- /*
* Track start/stop checkpoint generations to decide if lookaside table
* records are correct.
*/
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
index 1c67a84adbf..701f73df84f 100644
--- a/src/third_party/wiredtiger/src/include/serial.i
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -7,29 +7,6 @@
*/
/*
- * __page_write_gen_wrapped_check --
- * Confirm the page's write generation number won't wrap.
- */
-static inline int
-__page_write_gen_wrapped_check(WT_PAGE *page)
-{
- /*
- * Check to see if the page's write generation is about to wrap (wildly
- * unlikely as it implies 4B updates between clean page reconciliations,
- * but technically possible), and fail the update.
- *
- * The check is outside of the serialization mutex because the page's
- * write generation is going to be a hot cache line, so technically it's
- * possible for the page's write generation to wrap between the test and
- * our subsequent modification of it. However, the test is (4B-1M), and
- * there cannot be a million threads that have done the test but not yet
- * completed their modification.
- */
- return (page->modify->write_gen >
- UINT32_MAX - WT_MILLION ? WT_RESTART : 0);
-}
-
-/*
* __insert_simple_func --
* Worker function to add a WT_INSERT entry to the middle of a skiplist.
*/
@@ -163,9 +140,6 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
new_ins = *new_insp;
*new_insp = NULL;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/*
* Acquire the page's spinlock unless we already have exclusive access.
* Then call the worker function.
@@ -215,9 +189,6 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
new_ins = *new_insp;
*new_insp = NULL;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
simple = true;
for (i = 0; i < skipdepth; i++)
if (new_ins->next[i] == NULL)
@@ -272,9 +243,6 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
upd = *updp;
*updp = NULL;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/*
* All structure setup must be flushed before the structure is entered
* into the list. We need a write barrier here, our callers depend on
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index ce58f9f7301..e9c6f7f8e9d 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -232,8 +232,6 @@ __wt_txn_resolve_prepared_op(
continue;
if (upd->txnid != txn->id)
break;
- if (op->u.op_upd == NULL)
- op->u.op_upd = upd;
++(*resolved_update_countp);
@@ -844,6 +842,7 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
return (WT_VISIBLE_TRUE);
}
+
/*
* __wt_txn_upd_durable --
* Can the current transaction make the given update durable.
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index f7eeada4c8e..477894bcf14 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -456,14 +456,20 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * The page only might be clean; if the write generation is
- * unchanged since reconciliation started, it's clean.
+ * We set the page state to mark it as having been dirtied for
+ * the first time prior to reconciliation. A failed atomic cas
+ * indicates that an update has taken place during
+ * reconciliation.
*
- * If the write generation changed, the page has been written
- * since reconciliation started and remains dirty (that can't
- * happen when evicting, the page is exclusively locked).
+ * The page only might be clean; if the page state is unchanged
+ * since reconciliation started, it's clean.
+ *
+ * If the page state changed, the page has been written since
+ * reconciliation started and remains dirty (that can't happen
+ * when evicting, the page is exclusively locked).
*/
- if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0))
+ if (__wt_atomic_cas32(
+ &mod->page_state, WT_PAGE_DIRTY_FIRST, WT_PAGE_CLEAN))
__wt_cache_dirty_decr(session, page);
else
WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
@@ -602,13 +608,22 @@ __rec_init(WT_SESSION_IMPL *session,
r->page = page;
/*
- * Save the page's write generation before reading the page.
* Save the transaction generations before reading the page.
* These are all ordered reads, but we only need one.
*/
r->orig_btree_checkpoint_gen = btree->checkpoint_gen;
r->orig_txn_checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT);
- WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Update the page state to indicate that all currently installed
+ * updates will be included in this reconciliation if it would mark the
+ * page clean.
+ *
+ * Add a full barrier to make it more likely that a thread adding an
+ * update will see this state change.
+ */
+ page->modify->page_state = WT_PAGE_DIRTY_FIRST;
+ WT_FULL_BARRIER();
/*
* Cache the oldest running transaction ID. This is used to check
diff --git a/src/third_party/wiredtiger/src/support/modify.c b/src/third_party/wiredtiger/src/support/modify.c
index be1b1970da6..e8260cb41b6 100644
--- a/src/third_party/wiredtiger/src/support/modify.c
+++ b/src/third_party/wiredtiger/src/support/modify.c
@@ -8,28 +8,60 @@
#include "wt_internal.h"
+#define WT_MODIFY_FOREACH_BEGIN(mod, p, nentries, napplied) do { \
+ const size_t *__p = p; \
+ const uint8_t *__data = \
+ (const uint8_t *)(__p + (size_t)(nentries) * 3); \
+ int __i; \
+ for (__i = 0; __i < (nentries); ++__i) { \
+ memcpy(&(mod).data.size, __p++, sizeof(size_t)); \
+ memcpy(&(mod).offset, __p++, sizeof(size_t)); \
+ memcpy(&(mod).size, __p++, sizeof(size_t)); \
+ (mod).data.data = __data; \
+ __data += (mod).data.size; \
+ if (__i < (napplied)) \
+ continue;
+
+#define WT_MODIFY_FOREACH_REVERSE(mod, p, nentries, napplied, datasz) do {\
+ const size_t *__p = (p) + (size_t)(nentries) * 3; \
+ const uint8_t *__data = (const uint8_t *)__p + (datasz); \
+ int __i; \
+ for (__i = (napplied); __i < (nentries); ++__i) { \
+ memcpy(&(mod).size, --__p, sizeof(size_t)); \
+ memcpy(&(mod).offset, --__p, sizeof(size_t)); \
+ memcpy(&(mod).data.size, --__p, sizeof(size_t)); \
+ (mod).data.data = (__data -= (mod).data.size);
+
+#define WT_MODIFY_FOREACH_END \
+ } \
+} while (0)
+
/*
* __wt_modify_pack --
* Pack a modify structure into a buffer.
*/
int
-__wt_modify_pack(WT_SESSION_IMPL *session,
+__wt_modify_pack(WT_CURSOR *cursor,
WT_ITEM **modifyp, WT_MODIFY *entries, int nentries)
{
WT_ITEM *modify;
- size_t len, *p;
+ WT_SESSION_IMPL *session;
+ size_t diffsz, len, *p;
uint8_t *data;
int i;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
/*
* Build the in-memory modify value. It's the entries count, followed
* by the modify structure offsets written in order, followed by the
* data (data at the end to minimize unaligned reads/writes).
*/
len = sizeof(size_t); /* nentries */
- for (i = 0; i < nentries; ++i) {
+ for (i = 0, diffsz = 0; i < nentries; ++i) {
len += 3 * sizeof(size_t); /* WT_MODIFY fields */
len += entries[i].data.size; /* data */
+ diffsz += entries[i].size; /* bytes touched */
}
WT_RET(__wt_scr_alloc(session, len, &modify));
@@ -48,6 +80,18 @@ __wt_modify_pack(WT_SESSION_IMPL *session,
}
modify->size = WT_PTRDIFF(data, modify->data);
*modifyp = modify;
+
+ /*
+ * Update statistics. This is the common path called by
+ * WT_CURSOR::modify implementations.
+ */
+ WT_STAT_CONN_INCR(session, cursor_modify);
+ WT_STAT_DATA_INCR(session, cursor_modify);
+ WT_STAT_CONN_INCRV(session, cursor_modify_bytes, cursor->value.size);
+ WT_STAT_DATA_INCRV(session, cursor_modify_bytes, cursor->value.size);
+ WT_STAT_CONN_INCRV(session, cursor_modify_bytes_touch, diffsz);
+ WT_STAT_DATA_INCRV(session, cursor_modify_bytes_touch, diffsz);
+
return (0);
}
@@ -56,51 +100,46 @@ __wt_modify_pack(WT_SESSION_IMPL *session,
* Apply a single modify structure change to the buffer.
*/
static int
-__modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
- size_t data_size, size_t offset, size_t size, const uint8_t *data)
+__modify_apply_one(
+ WT_SESSION_IMPL *session, WT_ITEM *value, WT_MODIFY *modify, bool sformat)
{
- WT_ITEM *value;
- size_t len;
- uint8_t *from, *to;
- bool sformat;
+ size_t data_size, item_offset, offset, size;
+ const uint8_t *data, *from;
+ uint8_t *to;
- value = &cursor->value;
- sformat = cursor->value_format[0] == 'S';
+ data = modify->data.data;
+ data_size = modify->data.size;
+ offset = modify->offset;
+ size = modify->size;
/*
* Grow the buffer to the maximum size we'll need. This is pessimistic
* because it ignores replacement bytes, but it's a simpler calculation.
*
- * Grow the buffer before we fast-path the expected case. This function
- * is often called using a cursor buffer referencing on-page memory and
- * it's easy to overwrite a page. A side-effect of growing the buffer is
- * to ensure the buffer's value is in buffer-local memory.
+ * Grow the buffer first. This function is often called using a cursor
+ * buffer referencing on-page memory and it's easy to overwrite a page.
+ * A side-effect of growing the buffer is to ensure the buffer's value
+ * is in buffer-local memory.
*
* Because the buffer may reference an overflow item, the data may not
* start at the start of the buffer's memory and we have to correct for
* that.
*/
- len = WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0;
- WT_RET(__wt_buf_grow(session, value,
- len + WT_MAX(value->size, offset) + data_size + (sformat ? 1 : 0)));
+ item_offset =
+ WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0;
+ WT_RET(__wt_buf_grow(session, value, item_offset +
+ WT_MAX(value->size, offset) + data_size + (sformat ? 1 : 0)));
/*
- * Fast-path the expected case, where we're overwriting a set of bytes
+ * Fast-path the common case, where we're overwriting a set of bytes
* that already exist in the buffer.
*/
if (value->size > offset + data_size && data_size == size) {
- memmove((uint8_t *)value->data + offset, data, data_size);
+ memcpy((uint8_t *)value->data + offset, data, data_size);
return (0);
}
/*
- * Decrement the size to discard the trailing nul (done after growing
- * the buffer to ensure it can be restored without further checking).
- */
- if (sformat)
- --value->size;
-
- /*
* If appending bytes past the end of the value, initialize gap bytes
* and copy the new bytes into place.
*/
@@ -108,12 +147,8 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
if (value->size < offset)
memset((uint8_t *)value->data + value->size,
sformat ? ' ' : 0, offset - value->size);
- memmove((uint8_t *)value->data + offset, data, data_size);
+ memcpy((uint8_t *)value->data + offset, data, data_size);
value->size = offset + data_size;
-
- /* Restore the trailing nul. */
- if (sformat)
- ((char *)value->data)[value->size++] = '\0';
return (0);
}
@@ -125,9 +160,12 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
if (value->size < offset + size)
size = value->size - offset;
+ WT_ASSERT(session, value->size + (data_size - size) +
+ (sformat ? 1 : 0) <= value->memsize);
+
if (data_size == size) { /* Overwrite */
/* Copy in the new data. */
- memmove((uint8_t *)value->data + offset, data, data_size);
+ memcpy((uint8_t *)value->data + offset, data, data_size);
/*
* The new data must overlap the buffer's end (else, we'd use
@@ -137,7 +175,7 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
value->size = offset + data_size;
} else { /* Shrink or grow */
/* Move trailing data forward/backward to its new location. */
- from = (uint8_t *)value->data + (offset + size);
+ from = (const uint8_t *)value->data + (offset + size);
WT_ASSERT(session, WT_DATA_IN_ITEM(value) &&
from + (value->size - (offset + size)) <=
(uint8_t *)value->mem + value->memsize);
@@ -148,7 +186,7 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
memmove(to, from, value->size - (offset + size));
/* Copy in the new data. */
- memmove((uint8_t *)value->data + offset, data, data_size);
+ memcpy((uint8_t *)value->data + offset, data, data_size);
/*
* Correct the size. This works because of how the C standard
@@ -165,49 +203,134 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
value->size += (data_size - size);
}
- /* Restore the trailing nul. */
- if (sformat)
- ((char *)value->data)[value->size++] = '\0';
-
return (0);
}
/*
- * __wt_modify_apply_api --
- * Apply a single set of WT_MODIFY changes to a buffer, the cursor API
- * interface.
+ * __modify_fast_path --
+ * Process a set of modifications, applying any that can be made in place,
+ * and check if the remaining ones are sorted and non-overlapping.
*/
-int
-__wt_modify_apply_api(WT_SESSION_IMPL *session,
- WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
- WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+static void
+__modify_fast_path(
+ WT_ITEM *value, const size_t *p, int nentries,
+ int *nappliedp, bool *overlapp, size_t *dataszp, size_t *destszp)
{
- size_t modified;
- int i;
+ WT_MODIFY current, prev;
+ size_t datasz, destoff;
+ bool fastpath, first;
+
+ *overlapp = true;
+
+ datasz = destoff = 0;
+ WT_CLEAR(current);
+ WT_CLEAR(prev); /* [-Werror=maybe-uninitialized] */
- for (modified = 0, i = 0; i < nentries; ++i) {
- modified += entries[i].size;
- WT_RET(__modify_apply_one(session, cursor, entries[i].data.size,
- entries[i].offset, entries[i].size, entries[i].data.data));
- }
/*
- * This API is used by some external test functions with a NULL
- * session pointer - they don't expect statistics to be incremented.
+ * If the modifications are sorted and don't overlap in the old or new
+ * values, we can do a fast application of all the modifications
+ * in a single pass.
+ *
+ * The requirement for ordering is unfortunate, but modifications are
+ * performed in order, and applications specify byte offsets based on
+ * that. In other words, byte offsets are cumulative, modifications
+ * that shrink or grow the data affect subsequent modification's byte
+ * offsets.
*/
- if (session != NULL) {
- WT_STAT_CONN_INCR(session, cursor_modify);
- WT_STAT_DATA_INCR(session, cursor_modify);
- WT_STAT_CONN_INCRV(session,
- cursor_modify_bytes, cursor->value.size);
- WT_STAT_DATA_INCRV(session,
- cursor_modify_bytes, cursor->value.size);
- WT_STAT_CONN_INCRV(session,
- cursor_modify_bytes_touch, modified);
- WT_STAT_DATA_INCRV(session,
- cursor_modify_bytes_touch, modified);
- }
+ fastpath = first = true;
+ *nappliedp = 0;
+ WT_MODIFY_FOREACH_BEGIN(current, p, nentries, 0) {
+ datasz += current.data.size;
- return (0);
+ if (fastpath && current.data.size == current.size &&
+ current.offset + current.size <= value->size) {
+ memcpy((uint8_t *)value->data + current.offset,
+ current.data.data, current.data.size);
+ ++(*nappliedp);
+ continue;
+ }
+ fastpath = false;
+
+ /* Step over the bytes before the current block. */
+ if (first)
+ destoff = current.offset;
+ else {
+ /* Check that entries are sorted and non-overlapping. */
+ if (current.offset < prev.offset + prev.size ||
+ current.offset < prev.offset + prev.data.size)
+ return;
+ destoff += current.offset - (prev.offset + prev.size);
+ }
+
+ /*
+ * If the source is past the end of the current value, we have
+ * to deal with padding bytes. Don't try to fast-path padding
+ * bytes; it's not common and adds branches to the loop
+ * applying the changes.
+ */
+ if (current.offset + current.size > value->size)
+ return;
+
+ /*
+ * If copying this block overlaps with the next one, we can't
+ * build the value in reverse order.
+ */
+ if (current.size != current.data.size &&
+ current.offset + current.size > destoff)
+ return;
+
+ /* Step over the current modification. */
+ destoff += current.data.size;
+
+ prev = current;
+ first = false;
+ } WT_MODIFY_FOREACH_END;
+
+ /* Step over the final unmodified block. */
+ destoff += value->size - (current.offset + current.size);
+
+ *overlapp = false;
+ *dataszp = datasz;
+ *destszp = destoff;
+ return;
+}
+
+/*
+ * __modify_apply_no_overlap --
+ * Apply a single set of WT_MODIFY changes to a buffer, where the changes
+ * are in sorted order and none of the changes overlap.
+ */
+static void
+__modify_apply_no_overlap(WT_SESSION_IMPL *session, WT_ITEM *value,
+ const size_t *p, int nentries, int napplied, size_t datasz, size_t destsz)
+{
+ WT_MODIFY current;
+ size_t sz;
+ const uint8_t *from;
+ uint8_t *to;
+
+ from = (const uint8_t *)value->data + value->size;
+ to = (uint8_t *)value->data + destsz;
+ WT_MODIFY_FOREACH_REVERSE(current, p, nentries, napplied, datasz) {
+ /* Move the current unmodified block into place if necessary. */
+ sz = WT_PTRDIFF(to, value->data) -
+ (current.offset + current.data.size);
+ from -= sz;
+ to -= sz;
+ WT_ASSERT(session, from >= (const uint8_t *)value->data &&
+ to >= (uint8_t *)value->data);
+ WT_ASSERT(session,
+ from + sz <= (const uint8_t *)value->data + value->size);
+
+ if (to != from)
+ memmove(to, from, sz);
+
+ from -= current.size;
+ to -= current.data.size;
+ memcpy(to, current.data.data, current.data.size);
+ } WT_MODIFY_FOREACH_END;
+
+ value->size = destsz;
}
/*
@@ -215,31 +338,91 @@ __wt_modify_apply_api(WT_SESSION_IMPL *session,
* Apply a single set of WT_MODIFY changes to a buffer.
*/
int
-__wt_modify_apply(
- WT_SESSION_IMPL *session, WT_CURSOR *cursor, const void *modify)
+__wt_modify_apply(WT_CURSOR *cursor, const void *modify)
{
- size_t data_size, nentries, offset, size;
+ WT_ITEM *value;
+ WT_MODIFY mod;
+ WT_SESSION_IMPL *session;
+ size_t datasz, destsz, item_offset, tmp;
const size_t *p;
- const uint8_t *data;
+ int napplied, nentries;
+ bool overlap, sformat;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+ sformat = cursor->value_format[0] == 'S';
+ value = &cursor->value;
/*
- * Get the number of entries, and set a second pointer to reference the
- * change data. The modify string isn't necessarily aligned for size_t
- * access, copy to be sure.
+ * Get the number of modify entries and set a second pointer to
+ * reference the replacement data.
*/
p = modify;
- memcpy(&nentries, p++, sizeof(size_t));
- data = (uint8_t *)modify +
- sizeof(size_t) + (nentries * 3 * sizeof(size_t));
-
- /* Step through the list of entries, applying them in order. */
- for (; nentries-- > 0; data += data_size) {
- memcpy(&data_size, p++, sizeof(size_t));
- memcpy(&offset, p++, sizeof(size_t));
- memcpy(&size, p++, sizeof(size_t));
- WT_RET(__modify_apply_one(
- session, cursor, data_size, offset, size, data));
+ memcpy(&tmp, p++, sizeof(size_t));
+ nentries = (int)tmp;
+
+ /*
+ * Grow the buffer first. This function is often called using a cursor
+ * buffer referencing on-page memory and it's easy to overwrite a page.
+ * A side-effect of growing the buffer is to ensure the buffer's value
+ * is in buffer-local memory.
+ *
+ * Because the buffer may reference an overflow item, the data may not
+ * start at the start of the buffer's memory and we have to correct for
+ * that.
+ */
+ item_offset = WT_DATA_IN_ITEM(value) ?
+ WT_PTRDIFF(value->data, value->mem) : 0;
+ WT_RET(__wt_buf_grow(session, value, item_offset + value->size));
+
+ /*
+ * Decrement the size to discard the trailing nul (done after growing
+ * the buffer to ensure it can be restored without further checking).
+ */
+ if (sformat)
+ --value->size;
+
+ __modify_fast_path(
+ value, p, nentries, &napplied, &overlap, &datasz, &destsz);
+
+ if (napplied == nentries)
+ goto done;
+
+ if (!overlap) {
+ /* Grow the buffer first, correcting for the data offset. */
+ WT_RET(__wt_buf_grow(session, value, item_offset +
+ WT_MAX(destsz, value->size) + (sformat ? 1 : 0)));
+
+ __modify_apply_no_overlap(
+ session, value, p, nentries, napplied, datasz, destsz);
+ goto done;
}
+ WT_MODIFY_FOREACH_BEGIN(mod, p, nentries, napplied) {
+ WT_RET(__modify_apply_one(session, value, &mod, sformat));
+ } WT_MODIFY_FOREACH_END;
+
+done: /* Restore the trailing nul. */
+ if (sformat)
+ ((char *)value->data)[value->size++] = '\0';
+
return (0);
}
+
+/*
+ * __wt_modify_apply_api --
+ * Apply a single set of WT_MODIFY changes to a buffer, the cursor API
+ * interface.
+ */
+int
+__wt_modify_apply_api(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+{
+ WT_DECL_ITEM(modify);
+ WT_DECL_RET;
+
+ WT_ERR(__wt_modify_pack(cursor, &modify, entries, nentries));
+ WT_ERR(__wt_modify_apply(cursor, modify->data));
+
+err: __wt_scr_free((WT_SESSION_IMPL *)cursor->session, &modify);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index f888a470c8c..b3085080956 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -664,13 +664,15 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
WT_TXN *txn;
WT_TXN_OP *op;
WT_UPDATE *upd;
- wt_timestamp_t op_timestamp;
+ wt_timestamp_t durable_op_timestamp, op_timestamp, prev_op_timestamp;
u_int i;
const char *open_cursor_cfg[] = {
WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
bool op_zero_ts, upd_zero_ts;
txn = &session->txn;
+ cursor = NULL;
+ durable_op_timestamp = prev_op_timestamp = WT_TS_NONE;
/*
* Debugging checks on timestamps, if user requested them.
@@ -728,13 +730,15 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
WT_WITH_BTREE(session, op->btree,
ret = __wt_btcur_search_uncommitted(
(WT_CURSOR_BTREE *)cursor, &upd));
- WT_TRET(cursor->close(cursor));
if (ret != 0)
WT_RET_MSG(session, EINVAL,
"prepared update restore failed");
- op->u.op_upd = upd;
} else
- upd = op->u.op_upd->next;
+ upd = op->u.op_upd;
+
+ WT_ASSERT(session, upd != NULL);
+ op_timestamp = upd->start_ts;
+
/*
* Skip over any aborted update structures, internally
* created update structures or ones from our own
@@ -749,6 +753,22 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
* first valid update in the chain. They're in
* most recent order.
*/
+ if (upd != NULL) {
+ prev_op_timestamp = upd->start_ts;
+ durable_op_timestamp = upd->durable_ts;
+ }
+
+ /*
+ * We no longer need to access the update structure so
+ * it's safe to release our reference to the page.
+ */
+ if (cursor != NULL) {
+ WT_ASSERT(
+ session, F_ISSET(txn, WT_TXN_PREPARE));
+ WT_RET(cursor->close(cursor));
+ cursor = NULL;
+ }
+
if (upd == NULL)
continue;
/*
@@ -760,7 +780,7 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
* Check timestamps are used in order.
*/
op_zero_ts = !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
- upd_zero_ts = upd->start_ts == WT_TS_NONE;
+ upd_zero_ts = prev_op_timestamp == WT_TS_NONE;
if (op_zero_ts != upd_zero_ts)
WT_RET_MSG(session, EINVAL,
"per-key timestamps used inconsistently");
@@ -772,7 +792,6 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
if (op_zero_ts)
continue;
- op_timestamp = op->u.op_upd->start_ts;
/*
* Only if the update structure doesn't have a timestamp
* then use the one in the transaction structure.
@@ -780,11 +799,11 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
if (op_timestamp == WT_TS_NONE)
op_timestamp = txn->commit_timestamp;
if (F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS) &&
- op_timestamp < upd->start_ts)
+ op_timestamp < prev_op_timestamp)
WT_RET_MSG(session, EINVAL,
"out of order commit timestamps");
if (F_ISSET(txn, WT_TXN_TS_DURABLE_KEYS) &&
- txn->durable_timestamp < upd->durable_ts)
+ txn->durable_timestamp < durable_op_timestamp)
WT_RET_MSG(session, EINVAL,
"out of order durable timestamps");
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index a5e3e139178..504b2c0e8b4 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -155,7 +155,7 @@ __txn_op_apply(
* than using cursor modify to create a partial update
* (for no particular reason than simplicity).
*/
- WT_ERR(__wt_modify_apply(session, cursor, value.data));
+ WT_ERR(__wt_modify_apply(cursor, value.data));
WT_ERR(cursor->insert(cursor));
}
break;
@@ -222,7 +222,7 @@ __txn_op_apply(
* than using cursor modify to create a partial update
* (for no particular reason than simplicity).
*/
- WT_ERR(__wt_modify_apply(session, cursor, value.data));
+ WT_ERR(__wt_modify_apply(cursor, value.data));
WT_ERR(cursor->insert(cursor));
}
break;
diff --git a/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c b/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c
index 879a8e96c6a..5a413c0df3b 100644
--- a/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c
@@ -37,7 +37,7 @@
#define DATASIZE 1024
#define MAX_MODIFY_ENTRIES 37 /* Maximum modify vectors */
-static WT_MODIFY entries[1000]; /* Entries vector */
+static WT_MODIFY entries[MAX_MODIFY_ENTRIES]; /* Entries vector */
static int nentries; /* Entries count */
/*
@@ -50,7 +50,6 @@ static char modify_repl[MAX_REPL_BYTES * 2]; /* Replacement bytes */
static WT_RAND_STATE rnd; /* RNG state */
-#if DEBUG
/*
* show --
* Dump out a buffer.
@@ -62,15 +61,10 @@ show(WT_ITEM *buf, const char *tag)
const uint8_t *a;
fprintf(stderr, "%s: %" WT_SIZET_FMT " bytes\n\t", tag, buf->size);
- for (a = buf->data, i = 0; i < buf->size; ++i, ++a) {
- if (isprint(*a))
- fprintf(stderr, " %c", *a);
- else
- fprintf(stderr, " %#x", *a);
- }
+ for (a = buf->data, i = 0; i < buf->size; ++i, ++a)
+ fprintf(stderr, " %c", isprint(*a) ? *a : '.');
fprintf(stderr, "\n");
}
-#endif
/*
* modify_repl_init --
@@ -82,7 +76,7 @@ modify_repl_init(void)
size_t i;
for (i = 0; i < sizeof(modify_repl); ++i)
- modify_repl[i] = "zyxwvutsrqponmlkjihgfedcba"[i % 26];
+ modify_repl[i] = 'Z' - (i % 26);
}
/*
@@ -95,13 +89,13 @@ modify_build(void)
int i;
/* Mess up the entries. */
- memset(entries, 0xff, MAX_MODIFY_ENTRIES * sizeof(entries[0]));
+ memset(entries, 0xff, sizeof(entries));
/*
* Randomly select a number of byte changes, offsets and lengths.
* Allow a value of 0, the API should accept it.
*/
- nentries = (int)(__wt_random(&rnd) % MAX_MODIFY_ENTRIES);
+ nentries = (int)(__wt_random(&rnd) % (MAX_MODIFY_ENTRIES + 1));
for (i = 0; i < nentries; ++i) {
entries[i].data.data =
modify_repl + __wt_random(&rnd) % MAX_REPL_BYTES;
@@ -115,7 +109,7 @@ modify_build(void)
printf(
"%d: {%.*s} %" WT_SIZET_FMT " bytes replacing %"
WT_SIZET_FMT " bytes @ %" WT_SIZET_FMT "\n",
- i, (int)entries[i].data.size, entries[i].data.data,
+ i, (int)entries[i].data.size, (char *)entries[i].data.data,
entries[i].data.size, entries[i].size, entries[i].offset);
#endif
}
@@ -217,16 +211,29 @@ slow_apply_api(WT_ITEM *orig)
* Compare two results.
*/
static void
-compare(WT_ITEM *local, WT_ITEM *library)
+compare(WT_ITEM *orig, WT_ITEM *local, WT_ITEM *library)
{
-#if DEBUG
+ size_t i, max;
+ const uint8_t *p, *t;
+
+ max = WT_MIN(local->size, library->size);
if (local->size != library->size ||
memcmp(local->data, library->data, local->size) != 0) {
- fprintf(stderr, "results differ\n");
+ for (i = 0,
+ p = local->data, t = library->data; i < max; ++i, ++p, ++t)
+ if (*p != *t)
+ break;
+ fprintf(stderr, "results differ: ");
+ if (i == max)
+ fprintf(stderr,
+ "identical up to %" WT_SIZET_FMT " bytes\n", max);
+ else
+ fprintf(stderr,
+ "first mismatch at offset %" WT_SIZET_FMT "\n", i);
+ show(orig, "original");
show(local, "local results");
show(library, "library results");
}
-#endif
testutil_assert(
local->size == library->size && memcmp(
local->data, library->data, local->size) == 0);
@@ -250,16 +257,22 @@ compare(WT_ITEM *local, WT_ITEM *library)
* calculate-modify API.
*/
static void
-modify_run(bool verbose)
+modify_run(TEST_OPTS *opts)
{
WT_CURSOR *cursor, _cursor;
WT_DECL_RET;
WT_ITEM *localA, _localA, *localB, _localB;
+ WT_SESSION_IMPL *session;
size_t len;
int i, j;
+ u_char *p;
+ bool verbose;
+
+ session = (WT_SESSION_IMPL *)opts->session;
+ verbose = opts->verbose;
/* Initialize the RNG. */
- __wt_random_init_seed(NULL, &rnd);
+ __wt_random_init_seed(session, &rnd);
/* Set up replacement information. */
modify_repl_init();
@@ -271,18 +284,24 @@ modify_run(bool verbose)
memset(&_localB, 0, sizeof(_localB));
cursor = &_cursor;
memset(&_cursor, 0, sizeof(_cursor));
+ cursor->session = (WT_SESSION *)session;
cursor->value_format = "u";
#define NRUNS 10000
for (i = 0; i < NRUNS; ++i) {
/* Create an initial value. */
len = (size_t)(__wt_random(&rnd) % MAX_REPL_BYTES);
- testutil_check(__wt_buf_set(NULL, localA, modify_repl, len));
+ testutil_check(__wt_buf_set(session, localA, modify_repl, len));
for (j = 0; j < 1000; ++j) {
+ /* Make lower case so modifications are easy to see. */
+ for (p = localA->mem;
+ WT_PTRDIFF(p, localA->mem) < localA->size; p++)
+ *p = __wt_tolower(*p);
+
/* Copy the current value into the second item. */
testutil_check(__wt_buf_set(
- NULL, localB, localA->data, localA->size));
+ session, localB, localA->data, localA->size));
/*
* Create a random set of modify vectors, run the
@@ -291,12 +310,12 @@ modify_run(bool verbose)
* of modify.
*/
modify_build();
- testutil_check(__wt_buf_set(
- NULL, &cursor->value, localA->data, localA->size));
+ testutil_check(__wt_buf_set(session,
+ &cursor->value, localA->data, localA->size));
testutil_check(__wt_modify_apply_api(
- NULL, cursor, entries, nentries));
+ cursor, entries, nentries));
slow_apply_api(localA);
- compare(localA, &cursor->value);
+ compare(localB, localA, &cursor->value);
/*
* Call the WiredTiger function to build a modification
@@ -305,18 +324,18 @@ modify_run(bool verbose)
* against our implementation of modify.
*/
nentries = WT_ELEMENTS(entries);
- ret = wiredtiger_calc_modify(NULL,
+ ret = wiredtiger_calc_modify(opts->session,
localB, localA,
WT_MAX(localB->size, localA->size) + 100,
entries, &nentries);
if (ret == WT_NOTFOUND)
continue;
testutil_check(ret);
- testutil_check(__wt_buf_set(
- NULL, &cursor->value, localB->data, localB->size));
+ testutil_check(__wt_buf_set(session,
+ &cursor->value, localB->data, localB->size));
testutil_check(__wt_modify_apply_api(
- NULL, cursor, entries, nentries));
- compare(localA, &cursor->value);
+ cursor, entries, nentries));
+ compare(localB, localA, &cursor->value);
}
if (verbose) {
printf("%d (%d%%)\r", i, (i * 100) / NRUNS);
@@ -326,9 +345,9 @@ modify_run(bool verbose)
if (verbose)
printf("%d (100%%)\n", i);
- __wt_buf_free(NULL, localA);
- __wt_buf_free(NULL, localB);
- __wt_buf_free(NULL, &cursor->value);
+ __wt_buf_free(session, localA);
+ __wt_buf_free(session, localB);
+ __wt_buf_free(session, &cursor->value);
}
int
@@ -342,9 +361,11 @@ main(int argc, char *argv[])
testutil_make_work_dir(opts->home);
testutil_check(
wiredtiger_open(opts->home, NULL, "create", &opts->conn));
+ testutil_check(
+ opts->conn->open_session(opts->conn, NULL, NULL, &opts->session));
/* Run the test. */
- modify_run(opts->verbose);
+ modify_run(opts);
testutil_cleanup(opts);
return (EXIT_SUCCESS);
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index 7cba583b2b4..9d97a2d0428 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -70,6 +70,7 @@ typedef struct {
char *home_config; /* Run CONFIG file path */
char *home_init; /* Initialize home command */
char *home_log; /* Operation log file path */
+ char *home_pagedump; /* Page dump filename */
char *home_rand; /* RNG log file path */
char *home_salvage_copy; /* Salvage copy command */
char *home_stats; /* Statistics file path */
@@ -266,6 +267,8 @@ typedef enum { NEXT, PREV, SEARCH, SEARCH_NEAR } read_operation;
typedef struct {
thread_op op; /* Operation */
+ uint64_t opid; /* Operation ID */
+
uint64_t keyno; /* Row number */
uint64_t ts; /* Read/commit timestamp */
@@ -311,6 +314,8 @@ typedef struct {
WT_ITEM *lastkey, _lastkey;
bool repeatable_reads; /* if read ops repeatable */
+ bool repeatable_wrap; /* if circular buffer wrapped */
+ uint64_t opid; /* Operation ID */
uint64_t read_ts; /* read timestamp */
uint64_t commit_ts; /* commit timestamp */
SNAP_OPS *snap, *snap_first, snap_list[512];
@@ -348,10 +353,9 @@ void key_gen_teardown(WT_ITEM *);
void key_init(void);
WT_THREAD_RET lrt(void *);
void path_setup(const char *);
-void print_item(const char *, WT_ITEM *);
-void print_item_data(const char *, const uint8_t *, size_t);
int read_row_worker(WT_CURSOR *, uint64_t, WT_ITEM *, WT_ITEM *, bool);
uint32_t rng(WT_RAND_STATE *);
+void snap_init(TINFO *, uint64_t, bool);
void snap_repeat_single(WT_CURSOR *, TINFO *);
int snap_repeat_txn(WT_CURSOR *, TINFO *);
void snap_repeat_update(TINFO *, bool);
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index a27dec3dd0c..7adfb795694 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -338,11 +338,10 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
buf, sizeof(buf), "read_timestamp=%" PRIx64, ts));
ret = session->timestamp_transaction(session, buf);
if (ret == 0) {
- tinfo->read_ts = ts;
- tinfo->repeatable_reads = true;
+ snap_init(tinfo, ts, true);
logop(session,
"begin snapshot read-ts=%" PRIu64 " (repeatable)",
- tinfo->read_ts);
+ ts);
return;
}
if (ret != EINVAL)
@@ -371,11 +370,9 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
testutil_check(pthread_rwlock_unlock(&g.ts_lock));
- tinfo->read_ts = ts;
- tinfo->repeatable_reads = false;
-
- logop(session, "begin snapshot read-ts=%" PRIu64 " (not repeatable)",
- tinfo->read_ts);
+ snap_init(tinfo, ts, false);
+ logop(session,
+ "begin snapshot read-ts=%" PRIu64 " (not repeatable)", ts);
}
/*
@@ -415,9 +412,7 @@ begin_transaction(TINFO *tinfo, u_int *iso_configp)
wiredtiger_begin_transaction(session, config);
- tinfo->read_ts = WT_TS_NONE;
- tinfo->repeatable_reads = false;
-
+ snap_init(tinfo, WT_TS_NONE, false);
logop(session, "begin %s", log);
}
@@ -719,8 +714,6 @@ ops(void *arg)
begin_transaction_ts(tinfo, &iso_config);
else
begin_transaction(tinfo, &iso_config);
-
- tinfo->snap_first = tinfo->snap;
intxn = true;
}
@@ -899,7 +892,7 @@ remove_instead_of_truncate:
*/
greater_than = mmrand(&tinfo->rnd, 0, 1) == 1;
range = g.rows < 20 ?
- 1 : mmrand(&tinfo->rnd, 1, (u_int)g.rows / 20);
+ 0 : mmrand(&tinfo->rnd, 0, (u_int)g.rows / 20);
tinfo->last = tinfo->keyno;
if (greater_than) {
if (g.c_reverse) {
diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c
index a1853c56db9..b38f6958f1c 100644
--- a/src/third_party/wiredtiger/test/format/snap.c
+++ b/src/third_party/wiredtiger/test/format/snap.c
@@ -29,6 +29,22 @@
#include "format.h"
/*
+ * snap_init --
+ * Initialize the repeatable operation tracking.
+ */
+void
+snap_init(TINFO *tinfo, uint64_t read_ts, bool repeatable_reads)
+{
+ ++tinfo->opid;
+
+ tinfo->snap_first = tinfo->snap;
+
+ tinfo->read_ts = read_ts;
+ tinfo->repeatable_reads = repeatable_reads;
+ tinfo->repeatable_wrap = false;
+}
+
+/*
* snap_track --
* Add a single snapshot isolation returned value to the list.
*/
@@ -40,10 +56,12 @@ snap_track(TINFO *tinfo, thread_op op)
snap = tinfo->snap;
snap->op = op;
+ snap->opid = tinfo->opid;
snap->keyno = tinfo->keyno;
snap->ts = WT_TS_NONE;
snap->repeatable = false;
snap->last = op == TRUNCATE ? tinfo->last : 0;
+ snap->ksize = snap->vsize = 0;
if (op == INSERT && g.type == ROW) {
ip = tinfo->key;
@@ -63,15 +81,43 @@ snap_track(TINFO *tinfo, thread_op op)
memcpy(snap->vdata, ip->data, snap->vsize = ip->size);
}
+ /* Move to the next slot, wrap at the end of the circular buffer. */
+ if (++tinfo->snap >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
+ tinfo->snap = tinfo->snap_list;
+
/*
- * Move to the next slot, wrap at the end of the circular buffer.
- *
* It's possible to pass this transaction's buffer starting point and
- * start replacing our own entries. That's OK, we just skip earlier
- * operations when we check.
+ * start replacing our own entries. If that happens, we can't repeat
+ * operations because we don't know which ones were previously modified.
*/
- if (++tinfo->snap >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
- tinfo->snap = tinfo->snap_list;
+ if (tinfo->snap->opid == tinfo->opid)
+ tinfo->repeatable_wrap = true;
+}
+
+/*
+ * print_item_data --
+ * Display a single data/size pair, with a tag.
+ */
+static void
+print_item_data(const char *tag, const uint8_t *data, size_t size)
+{
+ static const char hex[] = "0123456789abcdef";
+ u_char ch;
+
+ fprintf(stderr, "%s {", tag);
+ if (g.type == FIX)
+ fprintf(stderr, "0x%02x", data[0]);
+ else
+ for (; size > 0; --size, ++data) {
+ ch = data[0];
+ if (__wt_isprint(ch))
+ fprintf(stderr, "%c", (int)ch);
+ else
+ fprintf(stderr, "%c%c",
+ (int)hex[(data[0] & 0xf0) >> 4],
+ (int)hex[data[0] & 0x0f]);
+ }
+ fprintf(stderr, "}\n");
}
/*
@@ -83,10 +129,14 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
{
WT_DECL_RET;
WT_ITEM *key, *value;
+ uint64_t keyno;
uint8_t bitfield;
+ testutil_assert(snap->op != TRUNCATE);
+
key = tinfo->key;
value = tinfo->value;
+ keyno = snap->keyno;
/*
* Retrieve the key/value pair by key. Row-store inserts have a unique
@@ -100,10 +150,10 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
switch (g.type) {
case FIX:
case VAR:
- cursor->set_key(cursor, snap->keyno);
+ cursor->set_key(cursor, keyno);
break;
case ROW:
- key_gen(key, snap->keyno);
+ key_gen(key, keyno);
cursor->set_key(cursor, key);
break;
}
@@ -125,12 +175,11 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
}
/* Check for simple matches. */
- if (ret == 0 &&
- snap->op != REMOVE && snap->op != TRUNCATE &&
+ if (ret == 0 && snap->op != REMOVE &&
value->size == snap->vsize &&
memcmp(value->data, snap->vdata, value->size) == 0)
return (0);
- if (ret == WT_NOTFOUND && (snap->op == REMOVE || snap->op == TRUNCATE))
+ if (ret == WT_NOTFOUND && snap->op == REMOVE)
return (0);
/*
@@ -142,18 +191,23 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
if (ret == WT_NOTFOUND &&
snap->vsize == 1 && *(uint8_t *)snap->vdata == 0)
return (0);
- if ((snap->op == REMOVE || snap->op == TRUNCATE) &&
+ if (snap->op == REMOVE &&
value->size == 1 && *(uint8_t *)value->data == 0)
return (0);
}
/* Things went pear-shaped. */
+#ifdef HAVE_DIAGNOSTIC
+ fprintf(stderr,
+ "snapshot-isolation error: Dumping page to %s\n", g.home_pagedump);
+ testutil_check(__wt_debug_cursor_page(cursor, g.home_pagedump));
+#endif
switch (g.type) {
case FIX:
testutil_die(ret,
"snapshot-isolation: %" PRIu64 " search: "
"expected {0x%02x}, found {0x%02x}",
- snap->keyno,
+ keyno,
snap->op == REMOVE ? 0 : *(uint8_t *)snap->vdata,
ret == WT_NOTFOUND ? 0 : *(uint8_t *)value->data);
/* NOTREACHED */
@@ -177,8 +231,7 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
/* NOTREACHED */
case VAR:
fprintf(stderr,
- "snapshot-isolation %" PRIu64 " search mismatch\n",
- snap->keyno);
+ "snapshot-isolation %" PRIu64 " search mismatch\n", keyno);
if (snap->op == REMOVE)
fprintf(stderr, "expected {deleted}\n");
@@ -190,8 +243,7 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
print_item_data(" found", value->data, value->size);
testutil_die(ret,
- "snapshot-isolation: %" PRIu64 " search mismatch",
- snap->keyno);
+ "snapshot-isolation: %" PRIu64 " search mismatch", keyno);
/* NOTREACHED */
}
@@ -209,7 +261,7 @@ snap_ts_clear(TINFO *tinfo, uint64_t ts)
SNAP_OPS *snap;
int count;
- /* Check from the first operation to the last. */
+ /* Check from the first slot to the last. */
for (snap = tinfo->snap_list,
count = WT_ELEMENTS(tinfo->snap_list); count > 0; --count, ++snap)
if (snap->repeatable && snap->ts <= ts)
@@ -253,12 +305,18 @@ snap_repeat_ok_match(SNAP_OPS *current, SNAP_OPS *a)
* committed successfully.
*/
static bool
-snap_repeat_ok_commit(
- TINFO *tinfo, SNAP_OPS *current, SNAP_OPS *first, SNAP_OPS *last)
+snap_repeat_ok_commit(TINFO *tinfo, SNAP_OPS *current)
{
SNAP_OPS *p;
/*
+ * Truncates can't be repeated, we don't know the exact range of records
+ * that were removed (if any).
+ */
+ if (current->op == TRUNCATE)
+ return (false);
+
+ /*
* For updates, check for subsequent changes to the record and don't
* repeat the read. For reads, check for either subsequent or previous
* changes to the record and don't repeat the read. (The reads are
@@ -266,13 +324,10 @@ snap_repeat_ok_commit(
* do the repeatable read in that case.)
*/
for (p = current;;) {
- /*
- * Wrap at the end of the circular buffer; "last" is the element
- * after the last element we want to test.
- */
+ /* Wrap at the end of the circular buffer. */
if (++p >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
p = tinfo->snap_list;
- if (p == last)
+ if (p->opid != tinfo->opid)
break;
if (!snap_repeat_ok_match(current, p))
@@ -282,21 +337,18 @@ snap_repeat_ok_commit(
if (current->op != READ)
return (true);
for (p = current;;) {
- /*
- * Wrap at the beginning of the circular buffer; "first" is the
- * last element we want to test.
- */
- if (p == first)
- return (true);
+ /* Wrap at the beginning of the circular buffer. */
if (--p < tinfo->snap_list)
p = &tinfo->snap_list[
WT_ELEMENTS(tinfo->snap_list) - 1];
+ if (p->opid != tinfo->opid)
+ break;
if (!snap_repeat_ok_match(current, p))
return (false);
}
- /* NOTREACHED */
+ return (true);
}
/*
@@ -305,7 +357,7 @@ snap_repeat_ok_commit(
* transaction has rolled back.
*/
static bool
-snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current, SNAP_OPS *first)
+snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current)
{
SNAP_OPS *p;
@@ -318,21 +370,18 @@ snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current, SNAP_OPS *first)
* the read in that case.
*/
for (p = current;;) {
- /*
- * Wrap at the beginning of the circular buffer; "first" is the
- * last element we want to test.
- */
- if (p == first)
- return (true);
+ /* Wrap at the beginning of the circular buffer. */
if (--p < tinfo->snap_list)
p = &tinfo->snap_list[
WT_ELEMENTS(tinfo->snap_list) - 1];
+ if (p->opid != tinfo->opid)
+ break;
if (!snap_repeat_ok_match(current, p))
return (false);
}
- /* NOTREACHED */
+ return (true);
}
/*
@@ -342,31 +391,21 @@ snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current, SNAP_OPS *first)
int
snap_repeat_txn(WT_CURSOR *cursor, TINFO *tinfo)
{
- SNAP_OPS *current, *stop;
+ SNAP_OPS *current;
+
+ /* If we wrapped the buffer, we can't repeat operations. */
+ if (tinfo->repeatable_wrap)
+ return (0);
/* Check from the first operation we saved to the last. */
- for (current = tinfo->snap_first, stop = tinfo->snap;; ++current) {
+ for (current = tinfo->snap_first;; ++current) {
/* Wrap at the end of the circular buffer. */
if (current >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
current = tinfo->snap_list;
- if (current == stop)
+ if (current->opid != tinfo->opid)
break;
- /*
- * We don't test all of the records in a truncate range, only
- * the first because that matches the rest of the isolation
- * checks. If a truncate range was from the start of the table,
- * switch to the record at the end. This is done in the first
- * routine that considers if operations are repeatable, and the
- * rest of those functions depend on it already being done.
- */
- if (current->op == TRUNCATE && current->keyno == 0) {
- current->keyno = current->last;
- testutil_assert(current->keyno != 0);
- }
-
- if (snap_repeat_ok_commit(
- tinfo, current, tinfo->snap_first, stop))
+ if (snap_repeat_ok_commit(tinfo, current))
WT_RET(snap_verify(cursor, tinfo, current));
}
@@ -381,19 +420,18 @@ snap_repeat_txn(WT_CURSOR *cursor, TINFO *tinfo)
void
snap_repeat_update(TINFO *tinfo, bool committed)
{
- SNAP_OPS *start, *stop;
+ SNAP_OPS *current;
- /*
- * Check from the first operation we saved to the last. It's possible
- * to update none at all if we did exactly the number of operations
- * in the circular buffer, it will look like we didn't do any. That's
- * OK, it's a big enough buffer that it's not going to matter.
- */
- for (start = tinfo->snap_first, stop = tinfo->snap;; ++start) {
+ /* If we wrapped the buffer, we can't repeat operations. */
+ if (tinfo->repeatable_wrap)
+ return;
+
+ /* Check from the first operation we saved to the last. */
+ for (current = tinfo->snap_first;; ++current) {
/* Wrap at the end of the circular buffer. */
- if (start >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
- start = tinfo->snap_list;
- if (start == stop)
+ if (current >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
+ current = tinfo->snap_list;
+ if (current->opid != tinfo->opid)
break;
/*
@@ -401,23 +439,23 @@ snap_repeat_update(TINFO *tinfo, bool committed)
* timestamp chosen wasn't older than all concurrently running
* uncommitted updates.
*/
- if (!tinfo->repeatable_reads && start->op == READ)
+ if (!tinfo->repeatable_reads && current->op == READ)
continue;
/*
* Second, check based on the transaction resolution (the rules
* are different if the transaction committed or rolled back).
*/
- start->repeatable = committed ? snap_repeat_ok_commit(
- tinfo, start, tinfo->snap_first, stop) :
- snap_repeat_ok_rollback(tinfo, start, tinfo->snap_first);
+ current->repeatable = committed ?
+ snap_repeat_ok_commit(tinfo, current) :
+ snap_repeat_ok_rollback(tinfo, current);
/*
* Repeat reads at the transaction's read timestamp and updates
* at the commit timestamp.
*/
- if (start->repeatable)
- start->ts = start->op == READ ?
+ if (current->repeatable)
+ current->ts = current->op == READ ?
tinfo->read_ts : tinfo->commit_ts;
}
}
diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c
index 6f7783a3a32..91d9bf51697 100644
--- a/src/third_party/wiredtiger/test/format/util.c
+++ b/src/third_party/wiredtiger/test/format/util.c
@@ -330,6 +330,12 @@ path_setup(const char *home)
g.home_log = dmalloc(len);
testutil_check(__wt_snprintf(g.home_log, len, "%s/%s", g.home, "log"));
+ /* Page dump file. */
+ len = strlen(g.home) + strlen("pagedump") + 2;
+ g.home_pagedump = dmalloc(len);
+ testutil_check(__wt_snprintf(
+ g.home_pagedump, len, "%s/%s", g.home, "pagedump"));
+
/* RNG log file. */
len = strlen(g.home) + strlen("rand") + 2;
g.home_rand = dmalloc(len);
@@ -675,39 +681,3 @@ alter(void *arg)
testutil_check(session->close(session, NULL));
return (WT_THREAD_RET_VALUE);
}
-
-/*
- * print_item_data --
- * Display a single data/size pair, with a tag.
- */
-void
-print_item_data(const char *tag, const uint8_t *data, size_t size)
-{
- static const char hex[] = "0123456789abcdef";
- u_char ch;
-
- fprintf(stderr, "\t%s {", tag);
- if (g.type == FIX)
- fprintf(stderr, "0x%02x", data[0]);
- else
- for (; size > 0; --size, ++data) {
- ch = data[0];
- if (__wt_isprint(ch))
- fprintf(stderr, "%c", (int)ch);
- else
- fprintf(stderr, "%x%x",
- (u_int)hex[(data[0] & 0xf0) >> 4],
- (u_int)hex[data[0] & 0x0f]);
- }
- fprintf(stderr, "}\n");
-}
-
-/*
- * print_item --
- * Display a single data/size pair, with a tag.
- */
-void
-print_item(const char *tag, WT_ITEM *item)
-{
- print_item_data(tag, item->data, item->size);
-}
diff --git a/src/third_party/wiredtiger/test/suite/test_compat02.py b/src/third_party/wiredtiger/test/suite/test_compat02.py
index de5862513a8..a92c3f54300 100644
--- a/src/third_party/wiredtiger/test/suite/test_compat02.py
+++ b/src/third_party/wiredtiger/test/suite/test_compat02.py
@@ -133,6 +133,13 @@ class test_compat02(wttest.WiredTigerTestCase, suite_subprocess):
# version. That configuration needs an existing database for it to be
# useful. Test for success or failure based on the relative versions
# configured.
+
+ # Turn on checkpoint verbose to debug a rare occurrence of a test
+ # hanging, most likely during the checkpoint of conn.close.
+ self.pr("Closing connection")
+ self.conn.reconfigure('verbose=(checkpoint)')
+ with self.expectedStdoutPattern('.'):
+ self.conn.close()
compat_str = ''
if (self.max_req != 'none'):
compat_str += 'compatibility=(require_max="%s"),' % self.max_req
@@ -140,7 +147,6 @@ class test_compat02(wttest.WiredTigerTestCase, suite_subprocess):
compat_str += 'compatibility=(require_min="%s"),' % self.min_req
if (self.rel != 'none'):
compat_str += 'compatibility=(release="%s"),' % self.rel
- self.conn.close()
log_str = 'log=(enabled,file_max=%s,archive=false),' % self.logmax
restart_config = log_str + compat_str
self.pr("Restart conn " + restart_config)
diff --git a/src/third_party/wiredtiger/test/suite/test_txn19.py b/src/third_party/wiredtiger/test/suite/test_txn19.py
index 604d8bed8bb..c63e9f9c6e9 100755
--- a/src/third_party/wiredtiger/test/suite/test_txn19.py
+++ b/src/third_party/wiredtiger/test/suite/test_txn19.py
@@ -53,6 +53,20 @@ def corrupt(fname, truncate, offset, writeit):
if writeit:
log.write(writeit)
+def copy_for_crash_restart(olddir, newdir):
+ ''' Simulate a crash from olddir and restart in newdir. '''
+ # with the connection still open, copy files to new directory
+ shutil.rmtree(newdir, ignore_errors=True)
+ os.mkdir(newdir)
+ for fname in os.listdir(olddir):
+ fullname = os.path.join(olddir, fname)
+ # Skip lock file on Windows since it is locked
+ if os.path.isfile(fullname) and \
+ "WiredTiger.lock" not in fullname and \
+ "Tmplog" not in fullname and \
+ "Preplog" not in fullname:
+ shutil.copy(fullname, newdir)
+
class test_txn19(wttest.WiredTigerTestCase, suite_subprocess):
base_config = 'log=(archive=false,enabled,file_max=100K),' + \
'transaction_sync=(enabled,method=none)'
@@ -158,20 +172,6 @@ class test_txn19(wttest.WiredTigerTestCase, suite_subprocess):
self.tty('LOGS ' + msg + ': ' + str(i) + ' is empty')
self.tty('LOGS ' + msg + ': ' + str(loglist))
- def copy_for_crash_restart(self, olddir, newdir):
- ''' Simulate a crash from olddir and restart in newdir. '''
- # with the connection still open, copy files to new directory
- shutil.rmtree(newdir, ignore_errors=True)
- os.mkdir(newdir)
- for fname in os.listdir(olddir):
- fullname = os.path.join(olddir, fname)
- # Skip lock file on Windows since it is locked
- if os.path.isfile(fullname) and \
- "WiredTiger.lock" not in fullname and \
- "Tmplog" not in fullname and \
- "Preplog" not in fullname:
- shutil.copy(fullname, newdir)
-
# Generate a value that is a bit over half the size of the log file.
def valuegen(self, i):
return str(i) + 'A' * (1024 * 60) # ~60K
@@ -280,7 +280,7 @@ class test_txn19(wttest.WiredTigerTestCase, suite_subprocess):
self.session.create(self.uri, self.create_params)
self.inserts([x for x in range(0, self.nrecords)])
newdir = "RESTART"
- self.copy_for_crash_restart(self.home, newdir)
+ copy_for_crash_restart(self.home, newdir)
self.close_conn()
#self.show_logs(newdir, 'before corruption')
self.corrupt_log(newdir)
@@ -346,12 +346,210 @@ class test_txn19(wttest.WiredTigerTestCase, suite_subprocess):
newdir2 = "RESTART2"
self.inserts([self.nrecords, self.nrecords + 1])
expect.extend([self.nrecords, self.nrecords + 1])
- self.copy_for_crash_restart(newdir, newdir2)
+ copy_for_crash_restart(newdir, newdir2)
self.checks(expect)
self.reopen_conn(newdir)
self.checks(expect)
self.reopen_conn(newdir2, self.conn_config)
self.checks(expect)
+class test_txn19_meta(wttest.WiredTigerTestCase, suite_subprocess):
+ base_config = 'log=(archive=false,enabled,file_max=100K),' + \
+ 'transaction_sync=(enabled,method=none)'
+ conn_config = base_config
+
+ # The type of corruption to be applied
+ corruption_scenarios = [
+ ('removal', dict(kind='removal', f=lambda fname:
+ os.remove(fname))),
+ ('truncate', dict(kind='truncate', f=lambda fname:
+ corrupt(fname, True, 0, None))),
+ ('truncate-middle', dict(kind='truncate-middle', f=lambda fname:
+ corrupt(fname, True, 1024 * 25, None))),
+ ('zero-begin', dict(kind='zero', f=lambda fname:
+ corrupt(fname, False, 0, '\0' * 4096))),
+ ('zero-trunc', dict(kind='zero', f=lambda fname:
+ corrupt(fname, True, 0, '\0' * 4096))),
+ ('zero-end', dict(kind='zero-end', f=lambda fname:
+ corrupt(fname, False, -1, '\0' * 4096))),
+ ('garbage-begin', dict(kind='garbage-begin', f=lambda fname:
+ corrupt(fname, False, 0, 'Bad!' * 1024))),
+ ('garbage-middle', dict(kind='garbage-middle', f=lambda fname:
+ corrupt(fname, False, 1024 * 25, 'Bad!' * 1024))),
+ ('garbage-end', dict(kind='garbage-end', f=lambda fname:
+ corrupt(fname, False, -1, 'Bad!' * 1024))),
+ ]
+ # File to be corrupted
+ filename_scenarios = [
+ ('WiredTiger', dict(filename='WiredTiger')),
+ ('WiredTiger.basecfg', dict(filename='WiredTiger.basecfg')),
+ ('WiredTiger.turtle', dict(filename='WiredTiger.turtle')),
+ ('WiredTiger.wt', dict(filename='WiredTiger.wt')),
+ ('WiredTigerLAS.wt', dict(filename='WiredTigerLAS.wt')),
+ ]
+
+ # In many cases, wiredtiger_open without any salvage options will
+ # just work. We list those cases here.
+ openable = [
+ "removal:WiredTiger.basecfg",
+ "removal:WiredTiger.turtle",
+ "removal:WiredTigerLAS.wt",
+ "truncate:WiredTiger",
+ "truncate:WiredTiger.basecfg",
+ "truncate:WiredTigerLAS.wt",
+ "truncate-middle:WiredTiger",
+ "truncate-middle:WiredTiger.basecfg",
+ "truncate-middle:WiredTiger.turtle",
+ "truncate-middle:WiredTiger.wt",
+ "truncate-middle:WiredTigerLAS.wt",
+ "zero:WiredTiger",
+ "zero:WiredTiger.basecfg",
+ "zero:WiredTigerLAS.wt",
+ "zero-end:WiredTiger",
+ "zero-end:WiredTiger.basecfg",
+ "zero-end:WiredTiger.turtle",
+ "zero-end:WiredTiger.wt",
+ "zero-end:WiredTigerLAS.wt",
+ "garbage-begin:WiredTiger",
+ "garbage-begin:WiredTigerLAS.wt",
+ "garbage-middle:WiredTiger",
+ "garbage-middle:WiredTiger.basecfg",
+ "garbage-middle:WiredTiger.turtle",
+ "garbage-middle:WiredTiger.wt",
+ "garbage-middle:WiredTigerLAS.wt",
+ "garbage-end:WiredTiger",
+ "garbage-end:WiredTiger.turtle",
+ "garbage-end:WiredTiger.wt",
+ "garbage-end:WiredTigerLAS.wt",
+ ]
+
+ # The cases for which salvage will not work, represented in the
+ # form (self.kind + ':' + self.filename)
+ not_salvageable = [
+ "removal:WiredTiger.turtle",
+ "removal:WiredTiger.wt",
+ "truncate:WiredTiger.wt",
+ "zero:WiredTiger.wt",
+ "garbage-begin:WiredTiger.basecfg",
+ "garbage-begin:WiredTiger.wt",
+ "garbage-end:WiredTiger.basecfg",
+ ]
+
+ scenarios = make_scenarios(corruption_scenarios, filename_scenarios)
+ uri = 'table:test_txn19_meta_'
+ ntables = 5
+ create_params = 'key_format=i,value_format=S'
+ nrecords = 1000 # records per table.
+ suffixes = [ str(x) for x in range(0, ntables)] # [ '0', '1', ... ]
+
+ def valuegen(self, i):
+ return str(i) + 'A' * 1024
+
+ # Insert a list of keys
+ def inserts(self, keylist):
+ for suffix in self.suffixes:
+ c = self.session.open_cursor(self.uri + suffix)
+ for i in keylist:
+ c[i] = self.valuegen(i)
+ c.close()
+
+ def checks(self, expectlist):
+ for suffix in self.suffixes:
+ c = self.session.open_cursor(self.uri + suffix, None, None)
+ gotlist = []
+ for key, value in c:
+ gotlist.append(key)
+ self.assertEqual(self.valuegen(key), value)
+ self.assertEqual(expectlist, gotlist)
+ c.close()
+
+ def corrupt_meta(self, homedir):
+ # Mark this test as having corrupted files
+ self.databaseCorrupted()
+ filename = os.path.join(homedir, self.filename)
+ self.f(filename)
+
+ def is_openable(self):
+ key = self.kind + ':' + self.filename
+ return key in self.openable
+
+ def is_salvageable(self):
+ key = self.kind + ':' + self.filename
+ return key not in self.not_salvageable
+
+ def test_corrupt_meta(self):
+ errfile = 'list.err'
+ outfile = 'list.out'
+ newdir = "RESTART"
+ newdir2 = "RESTART2"
+ expect = list(range(0, self.nrecords))
+ salvage_config = self.base_config + ',salvage=true'
+
+ for suffix in self.suffixes:
+ self.session.create(self.uri + suffix, self.create_params)
+ self.inserts(expect)
+
+ # Simulate a crash by copying the contents of the directory
+ # before closing. After we corrupt the copy, make another
+ # copy of the corrupted directory.
+ #
+ # The first corrupted copy will be used to run:
+ # wiredtiger_open without salvage flag, followed by:
+ # wiredtiger_open with salvage flag.
+ # The second directory will be used to run:
+ # wiredtiger_open with salvage flag first.
+
+ copy_for_crash_restart(self.home, newdir)
+ self.close_conn()
+ self.corrupt_meta(newdir)
+ copy_for_crash_restart(newdir, newdir2)
+
+ # In cases of corruption, we cannot always call wiredtiger_open
+ # directly, because there may be a panic, and abort() is called
+ # in diagnostic mode which terminates the Python interpreter.
+ #
+ # Running any wt command externally to Python allows
+ # us to observe the failure or success safely.
+ # Use -R to force recover=on, which is the default for
+ # wiredtiger_open, (wt utilities normally have recover=error)
+
+ expect_fail = not self.is_openable()
+ self.runWt(['-h', newdir, '-C', self.base_config, '-R', 'list'],
+ errfilename=errfile, outfilename=outfile, failure=expect_fail,
+ closeconn=False)
+
+ if expect_fail:
+ self.check_file_contains_one_of(errfile,
+ ['/unknown configuration key/',
+ '/handle-open:/',
+ '/turtle file read error: WT_NOTFOUND: item not found/',
+ 'WT_ERROR: non-specific WiredTiger error',
+ 'WT_TRY_SALVAGE: database corruption detected'])
+
+ for salvagedir in [ newdir, newdir2 ]:
+ # Removing the 'WiredTiger.turtle' file has weird behavior:
+ # Immediately doing wiredtiger_open (without salvage) succeeds.
+ # Following that, wiredtiger_open w/ salvage also succeeds.
+ #
+ # But, immediately after the corruption, if we run
+ # wiredtiger_open with salvage, it will fail.
+ # This anomaly should be fixed or explained.
+ if salvagedir == newdir and self.kind == 'removal' and \
+ self.filename == 'WiredTiger.turtle':
+ continue
+
+ if self.is_salvageable():
+ self.reopen_conn(salvagedir, salvage_config)
+ self.checks(expect)
+ else:
+ # Certain cases are not currently salvageable, they result in
+ # an error during the wiredtiger_open. But the nature of the
+ # messages produced during the error is variable by which case
+ # it is, and even variable from system to system.
+ with self.expectedStdoutPattern('.'):
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.reopen_conn(salvagedir, salvage_config),
+ '/.*/')
+
if __name__ == '__main__':
wttest.run()