diff options
34 files changed, 844 insertions, 399 deletions
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/strict.m4 b/src/third_party/wiredtiger/build_posix/aclocal/strict.m4 index 3600a39fe43..b912335fd16 100644 --- a/src/third_party/wiredtiger/build_posix/aclocal/strict.m4 +++ b/src/third_party/wiredtiger/build_posix/aclocal/strict.m4 @@ -134,6 +134,10 @@ AC_DEFUN([AM_CLANG_WARNINGS], [ w="$w -Wno-unused-command-line-argument";; esac + # We occasionally use an extra semicolon to indicate an empty loop or + # conditional body. + w="$w -Wno-extra-semi-stmt" + # Ignore unrecognized options. w="$w -Wno-unknown-warning-option" diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 6ce1ad16a5f..563236661aa 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -1072,6 +1072,7 @@ ownp pR pS packv +pagedump pagesize parens pareto @@ -1325,6 +1326,7 @@ unmodify unordered unpackv unpadded +unreconciled unreferenced unregister unsized diff --git a/src/third_party/wiredtiger/examples/c/ex_async.c b/src/third_party/wiredtiger/examples/c/ex_async.c index e9ffad4807c..85f783092fa 100644 --- a/src/third_party/wiredtiger/examples/c/ex_async.c +++ b/src/third_party/wiredtiger/examples/c/ex_async.c @@ -37,7 +37,7 @@ static const char *home; #elif defined(_WIN32) #define ATOMIC_ADD(v, val) (_InterlockedExchangeAdd(&(v), val) + val) #else -#define ATOMIC_ADD(v, val) __sync_add_and_fetch(&(v), val) +#define ATOMIC_ADD(v, val) __atomic_add_fetch(&(v), val, __ATOMIC_SEQ_CST) #endif static int global_error = 0; diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index b50a7f09165..2673308c46e 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "1a1197ef3c891458cd73290ad9b01c1e969f7e86", + "commit": "280c572c8097a322e429a349f73135266f3faacf", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.2" diff --git a/src/third_party/wiredtiger/src/async/async_api.c b/src/third_party/wiredtiger/src/async/async_api.c index 9f9aa979139..0ef85b8cd28 100644 --- a/src/third_party/wiredtiger/src/async/async_api.c +++ b/src/third_party/wiredtiger/src/async/async_api.c @@ -160,8 +160,7 @@ retry: WT_RET(__async_get_format(conn, uri, config, op)); op->unique_id = __wt_atomic_add64(&async->op_id, 1); op->optype = WT_AOP_NONE; - (void)__wt_atomic_store32( - &async->ops_index, (i + 1) % conn->async_size); + async->ops_index = (i + 1) % conn->async_size; *opp = op; return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 4d94bcdb23e..d045405f85a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -1506,8 +1506,11 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) if (!F_ISSET(cursor, WT_CURSTD_KEY_INT) || !F_ISSET(cursor, WT_CURSTD_VALUE_INT)) WT_ERR(__wt_btcur_search(cbt)); + + WT_ERR(__wt_modify_pack(cursor, &modify, entries, nentries)); + orig = cursor->value.size; - WT_ERR(__wt_modify_apply_api(session, cursor, entries, nentries)); + WT_ERR(__wt_modify_apply(cursor, modify->data)); new = cursor->value.size; WT_ERR(__cursor_size_chk(session, &cursor->value)); @@ -1527,8 +1530,7 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) F_CLR(cursor, WT_CURSTD_OVERWRITE); if (cursor->value.size <= 64 || __cursor_chain_exceeded(cbt)) ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD); - else if ((ret = - __wt_modify_pack(session, &modify, entries, nentries)) == 0) + else ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFY); if (overwrite) F_SET(cursor, WT_CURSTD_OVERWRITE); diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 685fb983718..9f5cadfecd0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -718,6 +718,7 @@ __wt_debug_page( */ int __wt_debug_cursor_page(void *cursor_arg, const char *ofile) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { WT_CURSOR *cursor; WT_CURSOR_BTREE *cbt; @@ -889,7 +890,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) if (split_gen != 0) WT_RET(ds->f(ds, ", split-gen=%" PRIu64, split_gen)); if (mod != NULL) - WT_RET(ds->f(ds, ", write-gen=%" PRIu32, mod->write_gen)); + WT_RET(ds->f(ds, ", page-state=%" PRIu32, mod->page_state)); WT_RET(ds->f(ds, ", memory-size %" WT_SIZET_FMT, page->memory_footprint)); WT_RET(ds->f(ds, "\n")); diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index 52277efb85d..d41f76c6442 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -250,7 +250,7 @@ __wt_value_return_upd(WT_SESSION_IMPL *session, * updates. */ while (i > 0) - WT_ERR(__wt_modify_apply(session, cursor, listp[--i]->data)); + WT_ERR(__wt_modify_apply(cursor, listp[--i]->data)); err: if (allocated_bytes != 0) __wt_free(session, listp); diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index a7d34d49f84..d2ac866bc59 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -322,6 +322,10 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * discarded), that is not wasted effort because * checkpoint doesn't need to write the page again. * + * Once the transaction has given up it's snapshot it + * is no longer safe to reconcile pages. That happens + * prior to the final metadata checkpoint. + * * XXX Only attempt this eviction when there are no * readers older than the checkpoint. Otherwise, a bug * in eviction can mark the page clean and discard @@ -331,6 +335,7 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_WONT_NEED && !tried_eviction && + F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT) && (!F_ISSET(txn, WT_TXN_HAS_TS_READ) || txn->read_timestamp == conn->txn_global.pinned_timestamp)) { diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 8603d329c15..a01ef5a49a7 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -30,6 +30,15 @@ __search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, if ((ins = WT_SKIP_LAST(ins_head)) == NULL) return (0); + /* + * Since the head of the skip list doesn't get mutated within this + * function, the compiler may move this assignment above within the + * loop below if it needs to (and may read a different value on each + * loop due to other threads mutating the skip list). + * + * Place a read barrier here to avoid this issue. + */ + WT_READ_BARRIER(); key.data = WT_INSERT_KEY(ins); key.size = WT_INSERT_KEY_SIZE(ins); diff --git a/src/third_party/wiredtiger/src/conn/conn_capacity.c b/src/third_party/wiredtiger/src/conn/conn_capacity.c index a75bdd259c4..38052a8e412 100644 --- a/src/third_party/wiredtiger/src/conn/conn_capacity.c +++ b/src/third_party/wiredtiger/src/conn/conn_capacity.c @@ -270,8 +270,7 @@ __capacity_reserve(uint64_t *reservation, uint64_t bytes, uint64_t capacity, * If the reservation clock is out of date, bring it * to within a second of a current time. */ - (void)__wt_atomic_store64(reservation, - (now_ns - WT_BILLION) + res_len); + *reservation = (now_ns - WT_BILLION) + res_len; } else res_value = now_ns; diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c index 07bfe02a142..12be6929022 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_join.c +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -508,10 +508,8 @@ __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, if (!passed) { if (iter != NULL && (iter->is_equal || - F_ISSET(end, WT_CURJOIN_END_LT))) { - WT_RET(__curjoin_iter_bump(iter)); + F_ISSET(end, WT_CURJOIN_END_LT))) return (WT_NOTFOUND); - } if (!disjunction) return (WT_NOTFOUND); iter = NULL; @@ -606,6 +604,9 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, WT_ITEM v; bool bloom_found; + /* We cannot have a bloom filter on a join entry with subordinates. */ + WT_ASSERT(session, entry->bloom == NULL || entry->subjoin == NULL); + if (entry->subjoin == NULL && iter != NULL && (iter->end_pos + iter->end_skip >= entry->ends_next || (iter->end_skip > 0 && @@ -633,16 +634,19 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, bloom_found = true; } if (entry->subjoin != NULL) { + /* + * If we have a subordinate join, the membership + * check is delegated to it. + */ WT_ASSERT(session, iter == NULL || entry->subjoin == iter->child->cjoin); - ret = __curjoin_entries_in_range(session, entry->subjoin, - key, iter == NULL ? NULL : iter->child); + WT_ERR(__curjoin_entries_in_range(session, entry->subjoin, + key, iter == NULL ? NULL : iter->child)); if (iter != NULL && - WT_CURJOIN_ITER_CONSUMED(iter->child)) { - WT_ERR(__curjoin_iter_bump(iter)); - ret = WT_NOTFOUND; - } - return (ret); + WT_CURJOIN_ITER_CONSUMED(iter->child)) + return (WT_NOTFOUND); + /* There's nothing more to do for this node. */ + return (0); } if (entry->index != NULL) { /* diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index 073df6eaaf6..22d067ef90e 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -931,7 +931,7 @@ __cursor_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) /* Get the current value, apply the modifications. */ WT_ERR(cursor->search(cursor)); - WT_ERR(__wt_modify_apply_api(session, cursor, entries, nentries)); + WT_ERR(__wt_modify_apply_api(cursor, entries, nentries)); /* We know both key and value are set, "overwrite" doesn't matter. */ ret = cursor->update(cursor); diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index a7c289a7b7f..03643f473e1 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -489,10 +489,21 @@ struct __wt_page_modify { WT_SPINLOCK page_lock; /* Page's spinlock */ /* - * The write generation is incremented when a page is modified, a page - * is clean if the write generation is 0. + * The page state is incremented when a page is modified. + * + * WT_PAGE_CLEAN -- + * The page is clean. + * WT_PAGE_DIRTY_FIRST -- + * The page is in this state after the first operation that marks a + * page dirty, or when reconciliation is checking to see if it has + * done enough work to be able to mark the page clean. + * WT_PAGE_DIRTY -- + * Two or more updates have been added to the page. */ - uint32_t write_gen; +#define WT_PAGE_CLEAN 0 +#define WT_PAGE_DIRTY_FIRST 1 +#define WT_PAGE_DIRTY 2 + uint32_t page_state; #define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */ #define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index d0679a9fb38..3fa5d60f1f1 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -34,7 +34,8 @@ __wt_page_is_empty(WT_PAGE *page) static inline bool __wt_page_evict_clean(WT_PAGE *page) { - return (page->modify == NULL || (page->modify->write_gen == 0 && + return (page->modify == NULL || + (page->modify->page_state == WT_PAGE_CLEAN && page->modify->rec_result == 0)); } @@ -45,7 +46,8 @@ __wt_page_evict_clean(WT_PAGE *page) static inline bool __wt_page_is_modified(WT_PAGE *page) { - return (page->modify != NULL && page->modify->write_gen != 0); + return (page->modify != NULL && + page->modify->page_state != WT_PAGE_CLEAN); } /* @@ -496,19 +498,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD)); last_running = 0; - if (page->modify->write_gen == 0) + if (page->modify->page_state == WT_PAGE_CLEAN) last_running = S2C(session)->txn_global.last_running; /* - * We depend on atomic-add being a write barrier, that is, a barrier to - * ensure all changes to the page are flushed before updating the page - * write generation and/or marking the tree dirty, otherwise checkpoints + * We depend on the atomic operation being a write barrier, that is, a + * barrier to ensure all changes to the page are flushed before updating + * the page state and/or marking the tree dirty, otherwise checkpoints * and/or page reconciliation might be looking at a clean page/tree. * * Every time the page transitions from clean to dirty, update the cache * and transactional information. + * + * The page state can only ever be incremented above dirty by the number + * of concurrently running threads, so the counter will never approach + * the point where it would wrap. */ - if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) { + if (page->modify->page_state < WT_PAGE_DIRTY && + __wt_atomic_add32(&page->modify->page_state, 1) == + WT_PAGE_DIRTY_FIRST) { __wt_cache_dirty_incr(session, page); /* @@ -579,7 +587,17 @@ __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page) * Allow the call to be made on clean pages. */ if (__wt_page_is_modified(page)) { - page->modify->write_gen = 0; + /* + * The only part where ordering matters is during + * reconciliation where updates on other threads are performing + * writes to the page state that need to be visible to the + * reconciliation thread. + * + * Since clearing of the page state is not going to be happening + * during reconciliation on a separate thread, there's no write + * barrier needed here. + */ + page->modify->page_state = WT_PAGE_CLEAN; __wt_cache_dirty_decr(session, page); } } diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 80046127d3f..5dbd7115684 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -277,7 +277,7 @@ extern int __wt_curtable_get_value(WT_CURSOR *cursor, ...) WT_GCC_FUNC_DECL_ATTR extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_debug_cursor_page(void *cursor_arg, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_debug_cursor_page(void *cursor_arg, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_disk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_mode_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, uint32_t checksum, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -503,9 +503,9 @@ extern int __wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char extern int __wt_metadata_set_base_write_gen(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_metadata_turtle_rewrite(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_metadata_update( WT_SESSION_IMPL *session, const char *key, const char *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_modify_apply( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_modify_apply_api(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_modify_pack(WT_SESSION_IMPL *session, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_modify_apply(WT_CURSOR *cursor, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_modify_apply_api(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_modify_pack(WT_CURSOR *cursor, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h index a9d271ed0bd..7ee64cb663f 100644 --- a/src/third_party/wiredtiger/src/include/gcc.h +++ b/src/third_party/wiredtiger/src/include/gcc.h @@ -89,38 +89,40 @@ * swap) operations. */ -#ifdef __clang__ /* - * We avoid __sync_bool_compare_and_swap with due to problems with optimization - * with some versions of clang. See http://llvm.org/bugs/show_bug.cgi?id=21499 - * for details. + * We've hit optimization bugs with Clang 3.5 in the past when using the atomic + * builtins. See http://llvm.org/bugs/show_bug.cgi?id=21499 for details. */ -#define WT_ATOMIC_CAS(ptr, old, new) \ - (__sync_val_compare_and_swap(ptr, old, new) == (old)) -#else -#define WT_ATOMIC_CAS(ptr, old, new) \ - __sync_bool_compare_and_swap(ptr, old, new) +#if defined(__clang__) && \ + defined(__clang_major__) && defined(__clang_minor__) && \ + (((__clang_major__ == 3) && (__clang_minor__ <= 5)) || \ + (__clang_major__ < 3)) +#error "Clang versions 3.5 and earlier are unsupported by WiredTiger" #endif + +#define WT_ATOMIC_CAS(ptr, oldp, new) \ + __atomic_compare_exchange_n( \ + ptr, oldp, new, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) #define WT_ATOMIC_CAS_FUNC(name, vp_arg, old_arg, new_arg) \ static inline bool \ __wt_atomic_cas##name(vp_arg, old_arg, new_arg) \ { \ - return (WT_ATOMIC_CAS(vp, old, new)); \ + return (WT_ATOMIC_CAS(vp, &old, new)); \ } WT_ATOMIC_CAS_FUNC(8, uint8_t *vp, uint8_t old, uint8_t new) WT_ATOMIC_CAS_FUNC(16, uint16_t *vp, uint16_t old, uint16_t new) WT_ATOMIC_CAS_FUNC(32, uint32_t *vp, uint32_t old, uint32_t new) WT_ATOMIC_CAS_FUNC(v32, \ - volatile uint32_t *vp, volatile uint32_t old, volatile uint32_t new) + volatile uint32_t *vp, uint32_t old, volatile uint32_t new) WT_ATOMIC_CAS_FUNC(i32, int32_t *vp, int32_t old, int32_t new) WT_ATOMIC_CAS_FUNC(iv32, \ - volatile int32_t *vp, volatile int32_t old, volatile int32_t new) + volatile int32_t *vp, int32_t old, volatile int32_t new) WT_ATOMIC_CAS_FUNC(64, uint64_t *vp, uint64_t old, uint64_t new) WT_ATOMIC_CAS_FUNC(v64, \ - volatile uint64_t *vp, volatile uint64_t old, volatile uint64_t new) + volatile uint64_t *vp, uint64_t old, volatile uint64_t new) WT_ATOMIC_CAS_FUNC(i64, int64_t *vp, int64_t old, int64_t new) WT_ATOMIC_CAS_FUNC(iv64, \ - volatile int64_t *vp, volatile int64_t old, volatile int64_t new) + volatile int64_t *vp, int64_t old, volatile int64_t new) WT_ATOMIC_CAS_FUNC(size, size_t *vp, size_t old, size_t new) /* @@ -130,29 +132,24 @@ WT_ATOMIC_CAS_FUNC(size, size_t *vp, size_t old, size_t new) static inline bool __wt_atomic_cas_ptr(void *vp, void *old, void *new) { - return (WT_ATOMIC_CAS((void **)vp, old, new)); + return (WT_ATOMIC_CAS((void **)vp, &old, new)); } #define WT_ATOMIC_FUNC(name, ret, vp_arg, v_arg) \ static inline ret \ __wt_atomic_add##name(vp_arg, v_arg) \ { \ - return (__sync_add_and_fetch(vp, v)); \ + return (__atomic_add_fetch(vp, v, __ATOMIC_SEQ_CST)); \ } \ static inline ret \ __wt_atomic_fetch_add##name(vp_arg, v_arg) \ { \ - return (__sync_fetch_and_add(vp, v)); \ -} \ -static inline ret \ -__wt_atomic_store##name(vp_arg, v_arg) \ -{ \ - return (__sync_lock_test_and_set(vp, v)); \ + return (__atomic_fetch_add(vp, v, __ATOMIC_SEQ_CST)); \ } \ static inline ret \ __wt_atomic_sub##name(vp_arg, v_arg) \ { \ - return (__sync_sub_and_fetch(vp, v)); \ + return (__atomic_sub_fetch(vp, v, __ATOMIC_SEQ_CST)); \ } WT_ATOMIC_FUNC(8, uint8_t, uint8_t *vp, uint8_t v) WT_ATOMIC_FUNC(16, uint16_t, uint16_t *vp, uint16_t v) diff --git a/src/third_party/wiredtiger/src/include/lint.h b/src/third_party/wiredtiger/src/include/lint.h index 903b0238b37..5d7cee531c2 100644 --- a/src/third_party/wiredtiger/src/include/lint.h +++ b/src/third_party/wiredtiger/src/include/lint.h @@ -35,15 +35,6 @@ __wt_atomic_fetch_add##name(type *vp, type v) \ return (orig); \ } \ static inline ret \ -__wt_atomic_store##name(type *vp, type v) \ -{ \ - type orig; \ - \ - orig = *vp; \ - *vp = v; \ - return (orig); \ -} \ -static inline ret \ __wt_atomic_sub##name(type *vp, type v) \ { \ *vp -= v; \ diff --git a/src/third_party/wiredtiger/src/include/msvc.h b/src/third_party/wiredtiger/src/include/msvc.h index 1586dae22b8..f4d8dc942f6 100644 --- a/src/third_party/wiredtiger/src/include/msvc.h +++ b/src/third_party/wiredtiger/src/include/msvc.h @@ -45,11 +45,6 @@ __wt_atomic_fetch_add##name(type *vp, type v) \ return (_InterlockedExchangeAdd ## s((t *)(vp), (t)(v))); \ } \ static inline ret \ -__wt_atomic_store##name(type *vp, type v) \ -{ \ - return (_InterlockedExchange ## s((t *)(vp), (t)(v))); \ -} \ -static inline ret \ __wt_atomic_sub##name(type *vp, type v) \ { \ return (_InterlockedExchangeAdd ## s((t *)(vp), - (t)v) - (v)); \ diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i index 15e7218dd28..660ee22ed96 100644 --- a/src/third_party/wiredtiger/src/include/mutex.i +++ b/src/third_party/wiredtiger/src/include/mutex.i @@ -68,7 +68,7 @@ __wt_spin_trylock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) { WT_UNUSED(session); - return (__sync_lock_test_and_set(&t->lock, 1) == 0 ? 0 : EBUSY); + return (__atomic_test_and_set(&t->lock, __ATOMIC_ACQUIRE) ? 0 : EBUSY); } /* @@ -82,7 +82,7 @@ __wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) WT_UNUSED(session); - while (__sync_lock_test_and_set(&t->lock, 1)) { + while (__atomic_test_and_set(&t->lock, __ATOMIC_ACQUIRE)) { for (i = 0; t->lock && i < WT_SPIN_COUNT; i++) WT_PAUSE(); if (t->lock) @@ -99,7 +99,7 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) { WT_UNUSED(session); - __sync_lock_release(&t->lock); + __atomic_clear(&t->lock, __ATOMIC_RELEASE); } #elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX || \ diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index be6440c27bc..c3c46ec11c5 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -20,12 +20,6 @@ struct __wt_reconcile { uint32_t flags; /* Caller's configuration */ /* - * Track start/stop write generation to decide if all changes to the - * page are written. - */ - uint32_t orig_write_gen; - - /* * Track start/stop checkpoint generations to decide if lookaside table * records are correct. */ diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index 1c67a84adbf..701f73df84f 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -7,29 +7,6 @@ */ /* - * __page_write_gen_wrapped_check -- - * Confirm the page's write generation number won't wrap. - */ -static inline int -__page_write_gen_wrapped_check(WT_PAGE *page) -{ - /* - * Check to see if the page's write generation is about to wrap (wildly - * unlikely as it implies 4B updates between clean page reconciliations, - * but technically possible), and fail the update. - * - * The check is outside of the serialization mutex because the page's - * write generation is going to be a hot cache line, so technically it's - * possible for the page's write generation to wrap between the test and - * our subsequent modification of it. However, the test is (4B-1M), and - * there cannot be a million threads that have done the test but not yet - * completed their modification. - */ - return (page->modify->write_gen > - UINT32_MAX - WT_MILLION ? WT_RESTART : 0); -} - -/* * __insert_simple_func -- * Worker function to add a WT_INSERT entry to the middle of a skiplist. */ @@ -163,9 +140,6 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, new_ins = *new_insp; *new_insp = NULL; - /* Check for page write generation wrap. */ - WT_RET(__page_write_gen_wrapped_check(page)); - /* * Acquire the page's spinlock unless we already have exclusive access. * Then call the worker function. @@ -215,9 +189,6 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, new_ins = *new_insp; *new_insp = NULL; - /* Check for page write generation wrap. */ - WT_RET(__page_write_gen_wrapped_check(page)); - simple = true; for (i = 0; i < skipdepth; i++) if (new_ins->next[i] == NULL) @@ -272,9 +243,6 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, upd = *updp; *updp = NULL; - /* Check for page write generation wrap. */ - WT_RET(__page_write_gen_wrapped_check(page)); - /* * All structure setup must be flushed before the structure is entered * into the list. We need a write barrier here, our callers depend on diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index ce58f9f7301..e9c6f7f8e9d 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -232,8 +232,6 @@ __wt_txn_resolve_prepared_op( continue; if (upd->txnid != txn->id) break; - if (op->u.op_upd == NULL) - op->u.op_upd = upd; ++(*resolved_update_countp); @@ -844,6 +842,7 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) return (WT_VISIBLE_TRUE); } + /* * __wt_txn_upd_durable -- * Can the current transaction make the given update durable. diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index f7eeada4c8e..477894bcf14 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -456,14 +456,20 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * The page only might be clean; if the write generation is - * unchanged since reconciliation started, it's clean. + * We set the page state to mark it as having been dirtied for + * the first time prior to reconciliation. A failed atomic cas + * indicates that an update has taken place during + * reconciliation. * - * If the write generation changed, the page has been written - * since reconciliation started and remains dirty (that can't - * happen when evicting, the page is exclusively locked). + * The page only might be clean; if the page state is unchanged + * since reconciliation started, it's clean. + * + * If the page state changed, the page has been written since + * reconciliation started and remains dirty (that can't happen + * when evicting, the page is exclusively locked). */ - if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0)) + if (__wt_atomic_cas32( + &mod->page_state, WT_PAGE_DIRTY_FIRST, WT_PAGE_CLEAN)) __wt_cache_dirty_decr(session, page); else WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); @@ -602,13 +608,22 @@ __rec_init(WT_SESSION_IMPL *session, r->page = page; /* - * Save the page's write generation before reading the page. * Save the transaction generations before reading the page. * These are all ordered reads, but we only need one. */ r->orig_btree_checkpoint_gen = btree->checkpoint_gen; r->orig_txn_checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT); - WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); + + /* + * Update the page state to indicate that all currently installed + * updates will be included in this reconciliation if it would mark the + * page clean. + * + * Add a write barrier to make it more likely that a thread adding an + * update will see this state change. + */ + page->modify->page_state = WT_PAGE_DIRTY_FIRST; + WT_FULL_BARRIER(); /* * Cache the oldest running transaction ID. This is used to check diff --git a/src/third_party/wiredtiger/src/support/modify.c b/src/third_party/wiredtiger/src/support/modify.c index be1b1970da6..e8260cb41b6 100644 --- a/src/third_party/wiredtiger/src/support/modify.c +++ b/src/third_party/wiredtiger/src/support/modify.c @@ -8,28 +8,60 @@ #include "wt_internal.h" +#define WT_MODIFY_FOREACH_BEGIN(mod, p, nentries, napplied) do { \ + const size_t *__p = p; \ + const uint8_t *__data = \ + (const uint8_t *)(__p + (size_t)(nentries) * 3); \ + int __i; \ + for (__i = 0; __i < (nentries); ++__i) { \ + memcpy(&(mod).data.size, __p++, sizeof(size_t)); \ + memcpy(&(mod).offset, __p++, sizeof(size_t)); \ + memcpy(&(mod).size, __p++, sizeof(size_t)); \ + (mod).data.data = __data; \ + __data += (mod).data.size; \ + if (__i < (napplied)) \ + continue; + +#define WT_MODIFY_FOREACH_REVERSE(mod, p, nentries, napplied, datasz) do {\ + const size_t *__p = (p) + (size_t)(nentries) * 3; \ + const uint8_t *__data = (const uint8_t *)__p + datasz; \ + int __i; \ + for (__i = (napplied); __i < (nentries); ++__i) { \ + memcpy(&(mod).size, --__p, sizeof(size_t)); \ + memcpy(&(mod).offset, --__p, sizeof(size_t)); \ + memcpy(&(mod).data.size, --__p, sizeof(size_t)); \ + (mod).data.data = (__data -= (mod).data.size); + +#define WT_MODIFY_FOREACH_END \ + } \ +} while (0) + /* * __wt_modify_pack -- * Pack a modify structure into a buffer. */ int -__wt_modify_pack(WT_SESSION_IMPL *session, +__wt_modify_pack(WT_CURSOR *cursor, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) { WT_ITEM *modify; - size_t len, *p; + WT_SESSION_IMPL *session; + size_t diffsz, len, *p; uint8_t *data; int i; + session = (WT_SESSION_IMPL *)cursor->session; + /* * Build the in-memory modify value. It's the entries count, followed * by the modify structure offsets written in order, followed by the * data (data at the end to minimize unaligned reads/writes). */ len = sizeof(size_t); /* nentries */ - for (i = 0; i < nentries; ++i) { + for (i = 0, diffsz = 0; i < nentries; ++i) { len += 3 * sizeof(size_t); /* WT_MODIFY fields */ len += entries[i].data.size; /* data */ + diffsz += entries[i].size; /* bytes touched */ } WT_RET(__wt_scr_alloc(session, len, &modify)); @@ -48,6 +80,18 @@ __wt_modify_pack(WT_SESSION_IMPL *session, } modify->size = WT_PTRDIFF(data, modify->data); *modifyp = modify; + + /* + * Update statistics. This is the common path called by + * WT_CURSOR::modify implementations. + */ + WT_STAT_CONN_INCR(session, cursor_modify); + WT_STAT_DATA_INCR(session, cursor_modify); + WT_STAT_CONN_INCRV(session, cursor_modify_bytes, cursor->value.size); + WT_STAT_DATA_INCRV(session, cursor_modify_bytes, cursor->value.size); + WT_STAT_CONN_INCRV(session, cursor_modify_bytes_touch, diffsz); + WT_STAT_DATA_INCRV(session, cursor_modify_bytes_touch, diffsz); + return (0); } @@ -56,51 +100,46 @@ __wt_modify_pack(WT_SESSION_IMPL *session, * Apply a single modify structure change to the buffer. */ static int -__modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, - size_t data_size, size_t offset, size_t size, const uint8_t *data) +__modify_apply_one( + WT_SESSION_IMPL *session, WT_ITEM *value, WT_MODIFY *modify, bool sformat) { - WT_ITEM *value; - size_t len; - uint8_t *from, *to; - bool sformat; + size_t data_size, item_offset, offset, size; + const uint8_t *data, *from; + uint8_t *to; - value = &cursor->value; - sformat = cursor->value_format[0] == 'S'; + data = modify->data.data; + data_size = modify->data.size; + offset = modify->offset; + size = modify->size; /* * Grow the buffer to the maximum size we'll need. This is pessimistic * because it ignores replacement bytes, but it's a simpler calculation. * - * Grow the buffer before we fast-path the expected case. This function - * is often called using a cursor buffer referencing on-page memory and - * it's easy to overwrite a page. A side-effect of growing the buffer is - * to ensure the buffer's value is in buffer-local memory. + * Grow the buffer first. This function is often called using a cursor + * buffer referencing on-page memory and it's easy to overwrite a page. + * A side-effect of growing the buffer is to ensure the buffer's value + * is in buffer-local memory. * * Because the buffer may reference an overflow item, the data may not * start at the start of the buffer's memory and we have to correct for * that. */ - len = WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0; - WT_RET(__wt_buf_grow(session, value, - len + WT_MAX(value->size, offset) + data_size + (sformat ? 1 : 0))); + item_offset = + WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0; + WT_RET(__wt_buf_grow(session, value, item_offset + + WT_MAX(value->size, offset) + data_size + (sformat ? 1 : 0))); /* - * Fast-path the expected case, where we're overwriting a set of bytes + * Fast-path the common case, where we're overwriting a set of bytes * that already exist in the buffer. */ if (value->size > offset + data_size && data_size == size) { - memmove((uint8_t *)value->data + offset, data, data_size); + memcpy((uint8_t *)value->data + offset, data, data_size); return (0); } /* - * Decrement the size to discard the trailing nul (done after growing - * the buffer to ensure it can be restored without further checking). - */ - if (sformat) - --value->size; - - /* * If appending bytes past the end of the value, initialize gap bytes * and copy the new bytes into place. */ @@ -108,12 +147,8 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, if (value->size < offset) memset((uint8_t *)value->data + value->size, sformat ? ' ' : 0, offset - value->size); - memmove((uint8_t *)value->data + offset, data, data_size); + memcpy((uint8_t *)value->data + offset, data, data_size); value->size = offset + data_size; - - /* Restore the trailing nul. */ - if (sformat) - ((char *)value->data)[value->size++] = '\0'; return (0); } @@ -125,9 +160,12 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, if (value->size < offset + size) size = value->size - offset; + WT_ASSERT(session, value->size + (data_size - size) + + (sformat ? 1 : 0) <= value->memsize); + if (data_size == size) { /* Overwrite */ /* Copy in the new data. */ - memmove((uint8_t *)value->data + offset, data, data_size); + memcpy((uint8_t *)value->data + offset, data, data_size); /* * The new data must overlap the buffer's end (else, we'd use @@ -137,7 +175,7 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, value->size = offset + data_size; } else { /* Shrink or grow */ /* Move trailing data forward/backward to its new location. */ - from = (uint8_t *)value->data + (offset + size); + from = (const uint8_t *)value->data + (offset + size); WT_ASSERT(session, WT_DATA_IN_ITEM(value) && from + (value->size - (offset + size)) <= (uint8_t *)value->mem + value->memsize); @@ -148,7 +186,7 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, memmove(to, from, value->size - (offset + size)); /* Copy in the new data. */ - memmove((uint8_t *)value->data + offset, data, data_size); + memcpy((uint8_t *)value->data + offset, data, data_size); /* * Correct the size. This works because of how the C standard @@ -165,49 +203,134 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, value->size += (data_size - size); } - /* Restore the trailing nul. */ - if (sformat) - ((char *)value->data)[value->size++] = '\0'; - return (0); } /* - * __wt_modify_apply_api -- - * Apply a single set of WT_MODIFY changes to a buffer, the cursor API - * interface. + * __modify_fast_path -- + * Process a set of modifications, applying any that can be made in place, + * and check if the remaining ones are sorted and non-overlapping. */ -int -__wt_modify_apply_api(WT_SESSION_IMPL *session, - WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) - WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +static void +__modify_fast_path( + WT_ITEM *value, const size_t *p, int nentries, + int *nappliedp, bool *overlapp, size_t *dataszp, size_t *destszp) { - size_t modified; - int i; + WT_MODIFY current, prev; + size_t datasz, destoff; + bool fastpath, first; + + *overlapp = true; + + datasz = destoff = 0; + WT_CLEAR(current); + WT_CLEAR(prev); /* [-Werror=maybe-uninitialized] */ - for (modified = 0, i = 0; i < nentries; ++i) { - modified += entries[i].size; - WT_RET(__modify_apply_one(session, cursor, entries[i].data.size, - entries[i].offset, entries[i].size, entries[i].data.data)); - } /* - * This API is used by some external test functions with a NULL - * session pointer - they don't expect statistics to be incremented. + * If the modifications are sorted and don't overlap in the old or new + * values, we can do a fast application of all the modifications + * modifications in a single pass. + * + * The requirement for ordering is unfortunate, but modifications are + * performed in order, and applications specify byte offsets based on + * that. In other words, byte offsets are cumulative, modifications + * that shrink or grow the data affect subsequent modification's byte + * offsets. */ - if (session != NULL) { - WT_STAT_CONN_INCR(session, cursor_modify); - WT_STAT_DATA_INCR(session, cursor_modify); - WT_STAT_CONN_INCRV(session, - cursor_modify_bytes, cursor->value.size); - WT_STAT_DATA_INCRV(session, - cursor_modify_bytes, cursor->value.size); - WT_STAT_CONN_INCRV(session, - cursor_modify_bytes_touch, modified); - WT_STAT_DATA_INCRV(session, - cursor_modify_bytes_touch, modified); - } + fastpath = first = true; + *nappliedp = 0; + WT_MODIFY_FOREACH_BEGIN(current, p, nentries, 0) { + datasz += current.data.size; - return (0); + if (fastpath && current.data.size == current.size && + current.offset + current.size <= value->size) { + memcpy((uint8_t *)value->data + current.offset, + current.data.data, current.data.size); + ++(*nappliedp); + continue; + } + fastpath = false; + + /* Step over the bytes before the current block. */ + if (first) + destoff = current.offset; + else { + /* Check that entries are sorted and non-overlapping. */ + if (current.offset < prev.offset + prev.size || + current.offset < prev.offset + prev.data.size) + return; + destoff += current.offset - (prev.offset + prev.size); + } + + /* + * If the source is past the end of the current value, we have + * to deal with padding bytes. Don't try to fast-path padding + * bytes; it's not common and adds branches to the loop + * applying the changes. + */ + if (current.offset + current.size > value->size) + return; + + /* + * If copying this block overlaps with the next one, we can't + * build the value in reverse order. + */ + if (current.size != current.data.size && + current.offset + current.size > destoff) + return; + + /* Step over the current modification. */ + destoff += current.data.size; + + prev = current; + first = false; + } WT_MODIFY_FOREACH_END; + + /* Step over the final unmodified block. */ + destoff += value->size - (current.offset + current.size); + + *overlapp = false; + *dataszp = datasz; + *destszp = destoff; + return; +} + +/* + * __modify_apply_no_overlap -- + * Apply a single set of WT_MODIFY changes to a buffer, where the changes + * are in sorted order and none of the changes overlap. + */ +static void +__modify_apply_no_overlap(WT_SESSION_IMPL *session, WT_ITEM *value, + const size_t *p, int nentries, int napplied, size_t datasz, size_t destsz) +{ + WT_MODIFY current; + size_t sz; + const uint8_t *from; + uint8_t *to; + + from = (const uint8_t *)value->data + value->size; + to = (uint8_t *)value->data + destsz; + WT_MODIFY_FOREACH_REVERSE(current, p, nentries, napplied, datasz) { + /* Move the current unmodified block into place if necessary. */ + sz = WT_PTRDIFF(to, value->data) - + (current.offset + current.data.size); + from -= sz; + to -= sz; + WT_ASSERT(session, from >= (const uint8_t *)value->data && + to >= (uint8_t *)value->data); + WT_ASSERT(session, + from + sz <= (const uint8_t *)value->data + value->size); + + if (to != from) + memmove(to, from, sz); + + from -= current.size; + to -= current.data.size; + memcpy(to, current.data.data, current.data.size); + } WT_MODIFY_FOREACH_END; + + value->size = destsz; } /* @@ -215,31 +338,91 @@ __wt_modify_apply_api(WT_SESSION_IMPL *session, * Apply a single set of WT_MODIFY changes to a buffer. */ int -__wt_modify_apply( - WT_SESSION_IMPL *session, WT_CURSOR *cursor, const void *modify) +__wt_modify_apply(WT_CURSOR *cursor, const void *modify) { - size_t data_size, nentries, offset, size; + WT_ITEM *value; + WT_MODIFY mod; + WT_SESSION_IMPL *session; + size_t datasz, destsz, item_offset, tmp; const size_t *p; - const uint8_t *data; + int napplied, nentries; + bool overlap, sformat; + + session = (WT_SESSION_IMPL *)cursor->session; + sformat = cursor->value_format[0] == 'S'; + value = &cursor->value; /* - * Get the number of entries, and set a second pointer to reference the - * change data. The modify string isn't necessarily aligned for size_t - * access, copy to be sure. + * Get the number of modify entries and set a second pointer to + * reference the replacement data. */ p = modify; - memcpy(&nentries, p++, sizeof(size_t)); - data = (uint8_t *)modify + - sizeof(size_t) + (nentries * 3 * sizeof(size_t)); - - /* Step through the list of entries, applying them in order. */ - for (; nentries-- > 0; data += data_size) { - memcpy(&data_size, p++, sizeof(size_t)); - memcpy(&offset, p++, sizeof(size_t)); - memcpy(&size, p++, sizeof(size_t)); - WT_RET(__modify_apply_one( - session, cursor, data_size, offset, size, data)); + memcpy(&tmp, p++, sizeof(size_t)); + nentries = (int)tmp; + + /* + * Grow the buffer first. This function is often called using a cursor + * buffer referencing on-page memory and it's easy to overwrite a page. + * A side-effect of growing the buffer is to ensure the buffer's value + * is in buffer-local memory. + * + * Because the buffer may reference an overflow item, the data may not + * start at the start of the buffer's memory and we have to correct for + * that. + */ + item_offset = WT_DATA_IN_ITEM(value) ? + WT_PTRDIFF(value->data, value->mem) : 0; + WT_RET(__wt_buf_grow(session, value, item_offset + value->size)); + + /* + * Decrement the size to discard the trailing nul (done after growing + * the buffer to ensure it can be restored without further checking). + */ + if (sformat) + --value->size; + + __modify_fast_path( + value, p, nentries, &napplied, &overlap, &datasz, &destsz); + + if (napplied == nentries) + goto done; + + if (!overlap) { + /* Grow the buffer first, correcting for the data offset. */ + WT_RET(__wt_buf_grow(session, value, item_offset + + WT_MAX(destsz, value->size) + (sformat ? 1 : 0))); + + __modify_apply_no_overlap( + session, value, p, nentries, napplied, datasz, destsz); + goto done; } + WT_MODIFY_FOREACH_BEGIN(mod, p, nentries, napplied) { + WT_RET(__modify_apply_one(session, value, &mod, sformat)); + } WT_MODIFY_FOREACH_END; + +done: /* Restore the trailing nul. */ + if (sformat) + ((char *)value->data)[value->size++] = '\0'; + return (0); } + +/* + * __wt_modify_apply_api -- + * Apply a single set of WT_MODIFY changes to a buffer, the cursor API + * interface. + */ +int +__wt_modify_apply_api(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + WT_DECL_ITEM(modify); + WT_DECL_RET; + + WT_ERR(__wt_modify_pack(cursor, &modify, entries, nentries)); + WT_ERR(__wt_modify_apply(cursor, modify->data)); + +err: __wt_scr_free((WT_SESSION_IMPL *)cursor->session, &modify); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index f888a470c8c..b3085080956 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -664,13 +664,15 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session) WT_TXN *txn; WT_TXN_OP *op; WT_UPDATE *upd; - wt_timestamp_t op_timestamp; + wt_timestamp_t durable_op_timestamp, op_timestamp, prev_op_timestamp; u_int i; const char *open_cursor_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; bool op_zero_ts, upd_zero_ts; txn = &session->txn; + cursor = NULL; + durable_op_timestamp = prev_op_timestamp = WT_TS_NONE; /* * Debugging checks on timestamps, if user requested them. @@ -728,13 +730,15 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session) WT_WITH_BTREE(session, op->btree, ret = __wt_btcur_search_uncommitted( (WT_CURSOR_BTREE *)cursor, &upd)); - WT_TRET(cursor->close(cursor)); if (ret != 0) WT_RET_MSG(session, EINVAL, "prepared update restore failed"); - op->u.op_upd = upd; } else - upd = op->u.op_upd->next; + upd = op->u.op_upd; + + WT_ASSERT(session, upd != NULL); + op_timestamp = upd->start_ts; + /* * Skip over any aborted update structures, internally * created update structures or ones from our own @@ -749,6 +753,22 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session) * first valid update in the chain. They're in * most recent order. */ + if (upd != NULL) { + prev_op_timestamp = upd->start_ts; + durable_op_timestamp = upd->durable_ts; + } + + /* + * We no longer need to access the update structure so + * it's safe to release our reference to the page. + */ + if (cursor != NULL) { + WT_ASSERT( + session, F_ISSET(txn, WT_TXN_PREPARE)); + WT_RET(cursor->close(cursor)); + cursor = NULL; + } + if (upd == NULL) continue; /* @@ -760,7 +780,7 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session) * Check timestamps are used in order. */ op_zero_ts = !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT); - upd_zero_ts = upd->start_ts == WT_TS_NONE; + upd_zero_ts = prev_op_timestamp == WT_TS_NONE; if (op_zero_ts != upd_zero_ts) WT_RET_MSG(session, EINVAL, "per-key timestamps used inconsistently"); @@ -772,7 +792,6 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session) if (op_zero_ts) continue; - op_timestamp = op->u.op_upd->start_ts; /* * Only if the update structure doesn't have a timestamp * then use the one in the transaction structure. @@ -780,11 +799,11 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session) if (op_timestamp == WT_TS_NONE) op_timestamp = txn->commit_timestamp; if (F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS) && - op_timestamp < upd->start_ts) + op_timestamp < prev_op_timestamp) WT_RET_MSG(session, EINVAL, "out of order commit timestamps"); if (F_ISSET(txn, WT_TXN_TS_DURABLE_KEYS) && - txn->durable_timestamp < upd->durable_ts) + txn->durable_timestamp < durable_op_timestamp) WT_RET_MSG(session, EINVAL, "out of order durable timestamps"); } diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index a5e3e139178..504b2c0e8b4 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -155,7 +155,7 @@ __txn_op_apply( * than using cursor modify to create a partial update * (for no particular reason than simplicity). */ - WT_ERR(__wt_modify_apply(session, cursor, value.data)); + WT_ERR(__wt_modify_apply(cursor, value.data)); WT_ERR(cursor->insert(cursor)); } break; @@ -222,7 +222,7 @@ __txn_op_apply( * than using cursor modify to create a partial update * (for no particular reason than simplicity). */ - WT_ERR(__wt_modify_apply(session, cursor, value.data)); + WT_ERR(__wt_modify_apply(cursor, value.data)); WT_ERR(cursor->insert(cursor)); } break; diff --git a/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c b/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c index 879a8e96c6a..5a413c0df3b 100644 --- a/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c @@ -37,7 +37,7 @@ #define DATASIZE 1024 #define MAX_MODIFY_ENTRIES 37 /* Maximum modify vectors */ -static WT_MODIFY entries[1000]; /* Entries vector */ +static WT_MODIFY entries[MAX_MODIFY_ENTRIES]; /* Entries vector */ static int nentries; /* Entries count */ /* @@ -50,7 +50,6 @@ static char modify_repl[MAX_REPL_BYTES * 2]; /* Replacement bytes */ static WT_RAND_STATE rnd; /* RNG state */ -#if DEBUG /* * show -- * Dump out a buffer. @@ -62,15 +61,10 @@ show(WT_ITEM *buf, const char *tag) const uint8_t *a; fprintf(stderr, "%s: %" WT_SIZET_FMT " bytes\n\t", tag, buf->size); - for (a = buf->data, i = 0; i < buf->size; ++i, ++a) { - if (isprint(*a)) - fprintf(stderr, " %c", *a); - else - fprintf(stderr, " %#x", *a); - } + for (a = buf->data, i = 0; i < buf->size; ++i, ++a) + fprintf(stderr, " %c", isprint(*a) ? *a : '.'); fprintf(stderr, "\n"); } -#endif /* * modify_repl_init -- @@ -82,7 +76,7 @@ modify_repl_init(void) size_t i; for (i = 0; i < sizeof(modify_repl); ++i) - modify_repl[i] = "zyxwvutsrqponmlkjihgfedcba"[i % 26]; + modify_repl[i] = 'Z' - (i % 26); } /* @@ -95,13 +89,13 @@ modify_build(void) int i; /* Mess up the entries. */ - memset(entries, 0xff, MAX_MODIFY_ENTRIES * sizeof(entries[0])); + memset(entries, 0xff, sizeof(entries)); /* * Randomly select a number of byte changes, offsets and lengths. * Allow a value of 0, the API should accept it. */ - nentries = (int)(__wt_random(&rnd) % MAX_MODIFY_ENTRIES); + nentries = (int)(__wt_random(&rnd) % (MAX_MODIFY_ENTRIES + 1)); for (i = 0; i < nentries; ++i) { entries[i].data.data = modify_repl + __wt_random(&rnd) % MAX_REPL_BYTES; @@ -115,7 +109,7 @@ modify_build(void) printf( "%d: {%.*s} %" WT_SIZET_FMT " bytes replacing %" WT_SIZET_FMT " bytes @ %" WT_SIZET_FMT "\n", - i, (int)entries[i].data.size, entries[i].data.data, + i, (int)entries[i].data.size, (char *)entries[i].data.data, entries[i].data.size, entries[i].size, entries[i].offset); #endif } @@ -217,16 +211,29 @@ slow_apply_api(WT_ITEM *orig) * Compare two results. */ static void -compare(WT_ITEM *local, WT_ITEM *library) +compare(WT_ITEM *orig, WT_ITEM *local, WT_ITEM *library) { -#if DEBUG + size_t i, max; + const uint8_t *p, *t; + + max = WT_MIN(local->size, library->size); if (local->size != library->size || memcmp(local->data, library->data, local->size) != 0) { - fprintf(stderr, "results differ\n"); + for (i = 0, + p = local->data, t = library->data; i < max; ++i, ++p, ++t) + if (*p != *t) + break; + fprintf(stderr, "results differ: "); + if (max == 0) + fprintf(stderr, + "identical up to %" WT_SIZET_FMT " bytes\n", max); + else + fprintf(stderr, + "first mismatch at offset %" WT_SIZET_FMT "\n", i); + show(orig, "original"); show(local, "local results"); show(library, "library results"); } -#endif testutil_assert( local->size == library->size && memcmp( local->data, library->data, local->size) == 0); @@ -250,16 +257,22 @@ compare(WT_ITEM *local, WT_ITEM *library) * calculate-modify API. */ static void -modify_run(bool verbose) +modify_run(TEST_OPTS *opts) { WT_CURSOR *cursor, _cursor; WT_DECL_RET; WT_ITEM *localA, _localA, *localB, _localB; + WT_SESSION_IMPL *session; size_t len; int i, j; + u_char *p; + bool verbose; + + session = (WT_SESSION_IMPL *)opts->session; + verbose = opts->verbose; /* Initialize the RNG. */ - __wt_random_init_seed(NULL, &rnd); + __wt_random_init_seed(session, &rnd); /* Set up replacement information. */ modify_repl_init(); @@ -271,18 +284,24 @@ modify_run(bool verbose) memset(&_localB, 0, sizeof(_localB)); cursor = &_cursor; memset(&_cursor, 0, sizeof(_cursor)); + cursor->session = (WT_SESSION *)session; cursor->value_format = "u"; #define NRUNS 10000 for (i = 0; i < NRUNS; ++i) { /* Create an initial value. */ len = (size_t)(__wt_random(&rnd) % MAX_REPL_BYTES); - testutil_check(__wt_buf_set(NULL, localA, modify_repl, len)); + testutil_check(__wt_buf_set(session, localA, modify_repl, len)); for (j = 0; j < 1000; ++j) { + /* Make lower case so modifications are easy to see. */ + for (p = localA->mem; + WT_PTRDIFF(p, localA->mem) < localA->size; p++) + *p = __wt_tolower(*p); + /* Copy the current value into the second item. */ testutil_check(__wt_buf_set( - NULL, localB, localA->data, localA->size)); + session, localB, localA->data, localA->size)); /* * Create a random set of modify vectors, run the @@ -291,12 +310,12 @@ modify_run(bool verbose) * of modify. */ modify_build(); - testutil_check(__wt_buf_set( - NULL, &cursor->value, localA->data, localA->size)); + testutil_check(__wt_buf_set(session, + &cursor->value, localA->data, localA->size)); testutil_check(__wt_modify_apply_api( - NULL, cursor, entries, nentries)); + cursor, entries, nentries)); slow_apply_api(localA); - compare(localA, &cursor->value); + compare(localB, localA, &cursor->value); /* * Call the WiredTiger function to build a modification @@ -305,18 +324,18 @@ modify_run(bool verbose) * against our implementation of modify. */ nentries = WT_ELEMENTS(entries); - ret = wiredtiger_calc_modify(NULL, + ret = wiredtiger_calc_modify(opts->session, localB, localA, WT_MAX(localB->size, localA->size) + 100, entries, &nentries); if (ret == WT_NOTFOUND) continue; testutil_check(ret); - testutil_check(__wt_buf_set( - NULL, &cursor->value, localB->data, localB->size)); + testutil_check(__wt_buf_set(session, + &cursor->value, localB->data, localB->size)); testutil_check(__wt_modify_apply_api( - NULL, cursor, entries, nentries)); - compare(localA, &cursor->value); + cursor, entries, nentries)); + compare(localB, localA, &cursor->value); } if (verbose) { printf("%d (%d%%)\r", i, (i * 100) / NRUNS); @@ -326,9 +345,9 @@ modify_run(bool verbose) if (verbose) printf("%d (100%%)\n", i); - __wt_buf_free(NULL, localA); - __wt_buf_free(NULL, localB); - __wt_buf_free(NULL, &cursor->value); + __wt_buf_free(session, localA); + __wt_buf_free(session, localB); + __wt_buf_free(session, &cursor->value); } int @@ -342,9 +361,11 @@ main(int argc, char *argv[]) testutil_make_work_dir(opts->home); testutil_check( wiredtiger_open(opts->home, NULL, "create", &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &opts->session)); /* Run the test. */ - modify_run(opts->verbose); + modify_run(opts); testutil_cleanup(opts); return (EXIT_SUCCESS); diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index 7cba583b2b4..9d97a2d0428 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -70,6 +70,7 @@ typedef struct { char *home_config; /* Run CONFIG file path */ char *home_init; /* Initialize home command */ char *home_log; /* Operation log file path */ + char *home_pagedump; /* Page dump filename */ char *home_rand; /* RNG log file path */ char *home_salvage_copy; /* Salvage copy command */ char *home_stats; /* Statistics file path */ @@ -266,6 +267,8 @@ typedef enum { NEXT, PREV, SEARCH, SEARCH_NEAR } read_operation; typedef struct { thread_op op; /* Operation */ + uint64_t opid; /* Operation ID */ + uint64_t keyno; /* Row number */ uint64_t ts; /* Read/commit timestamp */ @@ -311,6 +314,8 @@ typedef struct { WT_ITEM *lastkey, _lastkey; bool repeatable_reads; /* if read ops repeatable */ + bool repeatable_wrap; /* if circular buffer wrapped */ + uint64_t opid; /* Operation ID */ uint64_t read_ts; /* read timestamp */ uint64_t commit_ts; /* commit timestamp */ SNAP_OPS *snap, *snap_first, snap_list[512]; @@ -348,10 +353,9 @@ void key_gen_teardown(WT_ITEM *); void key_init(void); WT_THREAD_RET lrt(void *); void path_setup(const char *); -void print_item(const char *, WT_ITEM *); -void print_item_data(const char *, const uint8_t *, size_t); int read_row_worker(WT_CURSOR *, uint64_t, WT_ITEM *, WT_ITEM *, bool); uint32_t rng(WT_RAND_STATE *); +void snap_init(TINFO *, uint64_t, bool); void snap_repeat_single(WT_CURSOR *, TINFO *); int snap_repeat_txn(WT_CURSOR *, TINFO *); void snap_repeat_update(TINFO *, bool); diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index a27dec3dd0c..7adfb795694 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -338,11 +338,10 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp) buf, sizeof(buf), "read_timestamp=%" PRIx64, ts)); ret = session->timestamp_transaction(session, buf); if (ret == 0) { - tinfo->read_ts = ts; - tinfo->repeatable_reads = true; + snap_init(tinfo, ts, true); logop(session, "begin snapshot read-ts=%" PRIu64 " (repeatable)", - tinfo->read_ts); + ts); return; } if (ret != EINVAL) @@ -371,11 +370,9 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp) testutil_check(pthread_rwlock_unlock(&g.ts_lock)); - tinfo->read_ts = ts; - tinfo->repeatable_reads = false; - - logop(session, "begin snapshot read-ts=%" PRIu64 " (not repeatable)", - tinfo->read_ts); + snap_init(tinfo, ts, false); + logop(session, + "begin snapshot read-ts=%" PRIu64 " (not repeatable)", ts); } /* @@ -415,9 +412,7 @@ begin_transaction(TINFO *tinfo, u_int *iso_configp) wiredtiger_begin_transaction(session, config); - tinfo->read_ts = WT_TS_NONE; - tinfo->repeatable_reads = false; - + snap_init(tinfo, WT_TS_NONE, false); logop(session, "begin %s", log); } @@ -719,8 +714,6 @@ ops(void *arg) begin_transaction_ts(tinfo, &iso_config); else begin_transaction(tinfo, &iso_config); - - tinfo->snap_first = tinfo->snap; intxn = true; } @@ -899,7 +892,7 @@ remove_instead_of_truncate: */ greater_than = mmrand(&tinfo->rnd, 0, 1) == 1; range = g.rows < 20 ? - 1 : mmrand(&tinfo->rnd, 1, (u_int)g.rows / 20); + 0 : mmrand(&tinfo->rnd, 0, (u_int)g.rows / 20); tinfo->last = tinfo->keyno; if (greater_than) { if (g.c_reverse) { diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c index a1853c56db9..b38f6958f1c 100644 --- a/src/third_party/wiredtiger/test/format/snap.c +++ b/src/third_party/wiredtiger/test/format/snap.c @@ -29,6 +29,22 @@ #include "format.h" /* + * snap_init -- + * Initialize the repeatable operation tracking. + */ +void +snap_init(TINFO *tinfo, uint64_t read_ts, bool repeatable_reads) +{ + ++tinfo->opid; + + tinfo->snap_first = tinfo->snap; + + tinfo->read_ts = read_ts; + tinfo->repeatable_reads = repeatable_reads; + tinfo->repeatable_wrap = false; +} + +/* * snap_track -- * Add a single snapshot isolation returned value to the list. */ @@ -40,10 +56,12 @@ snap_track(TINFO *tinfo, thread_op op) snap = tinfo->snap; snap->op = op; + snap->opid = tinfo->opid; snap->keyno = tinfo->keyno; snap->ts = WT_TS_NONE; snap->repeatable = false; snap->last = op == TRUNCATE ? tinfo->last : 0; + snap->ksize = snap->vsize = 0; if (op == INSERT && g.type == ROW) { ip = tinfo->key; @@ -63,15 +81,43 @@ snap_track(TINFO *tinfo, thread_op op) memcpy(snap->vdata, ip->data, snap->vsize = ip->size); } + /* Move to the next slot, wrap at the end of the circular buffer. */ + if (++tinfo->snap >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) + tinfo->snap = tinfo->snap_list; + /* - * Move to the next slot, wrap at the end of the circular buffer. - * * It's possible to pass this transaction's buffer starting point and - * start replacing our own entries. That's OK, we just skip earlier - * operations when we check. + * start replacing our own entries. If that happens, we can't repeat + * operations because we don't know which ones were previously modified. */ - if (++tinfo->snap >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) - tinfo->snap = tinfo->snap_list; + if (tinfo->snap->opid == tinfo->opid) + tinfo->repeatable_wrap = true; +} + +/* + * print_item_data -- + * Display a single data/size pair, with a tag. + */ +static void +print_item_data(const char *tag, const uint8_t *data, size_t size) +{ + static const char hex[] = "0123456789abcdef"; + u_char ch; + + fprintf(stderr, "%s {", tag); + if (g.type == FIX) + fprintf(stderr, "0x%02x", data[0]); + else + for (; size > 0; --size, ++data) { + ch = data[0]; + if (__wt_isprint(ch)) + fprintf(stderr, "%c", (int)ch); + else + fprintf(stderr, "%x%x", + (u_int)hex[(data[0] & 0xf0) >> 4], + (u_int)hex[data[0] & 0x0f]); + } + fprintf(stderr, "}\n"); } /* @@ -83,10 +129,14 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap) { WT_DECL_RET; WT_ITEM *key, *value; + uint64_t keyno; uint8_t bitfield; + testutil_assert(snap->op != TRUNCATE); + key = tinfo->key; value = tinfo->value; + keyno = snap->keyno; /* * Retrieve the key/value pair by key. Row-store inserts have a unique @@ -100,10 +150,10 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap) switch (g.type) { case FIX: case VAR: - cursor->set_key(cursor, snap->keyno); + cursor->set_key(cursor, keyno); break; case ROW: - key_gen(key, snap->keyno); + key_gen(key, keyno); cursor->set_key(cursor, key); break; } @@ -125,12 +175,11 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap) } /* Check for simple matches. */ - if (ret == 0 && - snap->op != REMOVE && snap->op != TRUNCATE && + if (ret == 0 && snap->op != REMOVE && value->size == snap->vsize && memcmp(value->data, snap->vdata, value->size) == 0) return (0); - if (ret == WT_NOTFOUND && (snap->op == REMOVE || snap->op == TRUNCATE)) + if (ret == WT_NOTFOUND && snap->op == REMOVE) return (0); /* @@ -142,18 +191,23 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap) if (ret == WT_NOTFOUND && snap->vsize == 1 && *(uint8_t *)snap->vdata == 0) return (0); - if ((snap->op == REMOVE || snap->op == TRUNCATE) && + if (snap->op == REMOVE && value->size == 1 && *(uint8_t *)value->data == 0) return (0); } /* Things went pear-shaped. */ +#ifdef HAVE_DIAGNOSTIC + fprintf(stderr, + "snapshot-isolation error: Dumping page to %s\n", g.home_pagedump); + testutil_check(__wt_debug_cursor_page(cursor, g.home_pagedump)); +#endif switch (g.type) { case FIX: testutil_die(ret, "snapshot-isolation: %" PRIu64 " search: " "expected {0x%02x}, found {0x%02x}", - snap->keyno, + keyno, snap->op == REMOVE ? 0 : *(uint8_t *)snap->vdata, ret == WT_NOTFOUND ? 0 : *(uint8_t *)value->data); /* NOTREACHED */ @@ -177,8 +231,7 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap) /* NOTREACHED */ case VAR: fprintf(stderr, - "snapshot-isolation %" PRIu64 " search mismatch\n", - snap->keyno); + "snapshot-isolation %" PRIu64 " search mismatch\n", keyno); if (snap->op == REMOVE) fprintf(stderr, "expected {deleted}\n"); @@ -190,8 +243,7 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap) print_item_data(" found", value->data, value->size); testutil_die(ret, - "snapshot-isolation: %" PRIu64 " search mismatch", - snap->keyno); + "snapshot-isolation: %" PRIu64 " search mismatch", keyno); /* NOTREACHED */ } @@ -209,7 +261,7 @@ snap_ts_clear(TINFO *tinfo, uint64_t ts) SNAP_OPS *snap; int count; - /* Check from the first operation to the last. */ + /* Check from the first slot to the last. */ for (snap = tinfo->snap_list, count = WT_ELEMENTS(tinfo->snap_list); count > 0; --count, ++snap) if (snap->repeatable && snap->ts <= ts) @@ -253,12 +305,18 @@ snap_repeat_ok_match(SNAP_OPS *current, SNAP_OPS *a) * committed successfully. */ static bool -snap_repeat_ok_commit( - TINFO *tinfo, SNAP_OPS *current, SNAP_OPS *first, SNAP_OPS *last) +snap_repeat_ok_commit(TINFO *tinfo, SNAP_OPS *current) { SNAP_OPS *p; /* + * Truncates can't be repeated, we don't know the exact range of records + * that were removed (if any). + */ + if (current->op == TRUNCATE) + return (false); + + /* * For updates, check for subsequent changes to the record and don't * repeat the read. For reads, check for either subsequent or previous * changes to the record and don't repeat the read. (The reads are @@ -266,13 +324,10 @@ snap_repeat_ok_commit( * do the repeatable read in that case.) */ for (p = current;;) { - /* - * Wrap at the end of the circular buffer; "last" is the element - * after the last element we want to test. - */ + /* Wrap at the end of the circular buffer. */ if (++p >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) p = tinfo->snap_list; - if (p == last) + if (p->opid != tinfo->opid) break; if (!snap_repeat_ok_match(current, p)) @@ -282,21 +337,18 @@ snap_repeat_ok_commit( if (current->op != READ) return (true); for (p = current;;) { - /* - * Wrap at the beginning of the circular buffer; "first" is the - * last element we want to test. - */ - if (p == first) - return (true); + /* Wrap at the beginning of the circular buffer. */ if (--p < tinfo->snap_list) p = &tinfo->snap_list[ WT_ELEMENTS(tinfo->snap_list) - 1]; + if (p->opid != tinfo->opid) + break; if (!snap_repeat_ok_match(current, p)) return (false); } - /* NOTREACHED */ + return (true); } /* @@ -305,7 +357,7 @@ snap_repeat_ok_commit( * transaction has rolled back. */ static bool -snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current, SNAP_OPS *first) +snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current) { SNAP_OPS *p; @@ -318,21 +370,18 @@ snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current, SNAP_OPS *first) * the read in that case. */ for (p = current;;) { - /* - * Wrap at the beginning of the circular buffer; "first" is the - * last element we want to test. - */ - if (p == first) - return (true); + /* Wrap at the beginning of the circular buffer. */ if (--p < tinfo->snap_list) p = &tinfo->snap_list[ WT_ELEMENTS(tinfo->snap_list) - 1]; + if (p->opid != tinfo->opid) + break; if (!snap_repeat_ok_match(current, p)) return (false); } - /* NOTREACHED */ + return (true); } /* @@ -342,31 +391,21 @@ snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current, SNAP_OPS *first) int snap_repeat_txn(WT_CURSOR *cursor, TINFO *tinfo) { - SNAP_OPS *current, *stop; + SNAP_OPS *current; + + /* If we wrapped the buffer, we can't repeat operations. */ + if (tinfo->repeatable_wrap) + return (0); /* Check from the first operation we saved to the last. */ - for (current = tinfo->snap_first, stop = tinfo->snap;; ++current) { + for (current = tinfo->snap_first;; ++current) { /* Wrap at the end of the circular buffer. */ if (current >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) current = tinfo->snap_list; - if (current == stop) + if (current->opid != tinfo->opid) break; - /* - * We don't test all of the records in a truncate range, only - * the first because that matches the rest of the isolation - * checks. If a truncate range was from the start of the table, - * switch to the record at the end. This is done in the first - * routine that considers if operations are repeatable, and the - * rest of those functions depend on it already being done. - */ - if (current->op == TRUNCATE && current->keyno == 0) { - current->keyno = current->last; - testutil_assert(current->keyno != 0); - } - - if (snap_repeat_ok_commit( - tinfo, current, tinfo->snap_first, stop)) + if (snap_repeat_ok_commit(tinfo, current)) WT_RET(snap_verify(cursor, tinfo, current)); } @@ -381,19 +420,18 @@ snap_repeat_txn(WT_CURSOR *cursor, TINFO *tinfo) void snap_repeat_update(TINFO *tinfo, bool committed) { - SNAP_OPS *start, *stop; + SNAP_OPS *current; - /* - * Check from the first operation we saved to the last. It's possible - * to update none at all if we did exactly the number of operations - * in the circular buffer, it will look like we didn't do any. That's - * OK, it's a big enough buffer that it's not going to matter. - */ - for (start = tinfo->snap_first, stop = tinfo->snap;; ++start) { + /* If we wrapped the buffer, we can't repeat operations. */ + if (tinfo->repeatable_wrap) + return; + + /* Check from the first operation we saved to the last. */ + for (current = tinfo->snap_first;; ++current) { /* Wrap at the end of the circular buffer. */ - if (start >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) - start = tinfo->snap_list; - if (start == stop) + if (current >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) + current = tinfo->snap_list; + if (current->opid != tinfo->opid) break; /* @@ -401,23 +439,23 @@ snap_repeat_update(TINFO *tinfo, bool committed) * timestamp chosen wasn't older than all concurrently running * uncommitted updates. */ - if (!tinfo->repeatable_reads && start->op == READ) + if (!tinfo->repeatable_reads && current->op == READ) continue; /* * Second, check based on the transaction resolution (the rules * are different if the transaction committed or rolled back). */ - start->repeatable = committed ? snap_repeat_ok_commit( - tinfo, start, tinfo->snap_first, stop) : - snap_repeat_ok_rollback(tinfo, start, tinfo->snap_first); + current->repeatable = committed ? + snap_repeat_ok_commit(tinfo, current) : + snap_repeat_ok_rollback(tinfo, current); /* * Repeat reads at the transaction's read timestamp and updates * at the commit timestamp. */ - if (start->repeatable) - start->ts = start->op == READ ? + if (current->repeatable) + current->ts = current->op == READ ? tinfo->read_ts : tinfo->commit_ts; } } diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c index 6f7783a3a32..91d9bf51697 100644 --- a/src/third_party/wiredtiger/test/format/util.c +++ b/src/third_party/wiredtiger/test/format/util.c @@ -330,6 +330,12 @@ path_setup(const char *home) g.home_log = dmalloc(len); testutil_check(__wt_snprintf(g.home_log, len, "%s/%s", g.home, "log")); + /* Page dump file. */ + len = strlen(g.home) + strlen("pagedump") + 2; + g.home_pagedump = dmalloc(len); + testutil_check(__wt_snprintf( + g.home_pagedump, len, "%s/%s", g.home, "pagedump")); + /* RNG log file. */ len = strlen(g.home) + strlen("rand") + 2; g.home_rand = dmalloc(len); @@ -675,39 +681,3 @@ alter(void *arg) testutil_check(session->close(session, NULL)); return (WT_THREAD_RET_VALUE); } - -/* - * print_item_data -- - * Display a single data/size pair, with a tag. - */ -void -print_item_data(const char *tag, const uint8_t *data, size_t size) -{ - static const char hex[] = "0123456789abcdef"; - u_char ch; - - fprintf(stderr, "\t%s {", tag); - if (g.type == FIX) - fprintf(stderr, "0x%02x", data[0]); - else - for (; size > 0; --size, ++data) { - ch = data[0]; - if (__wt_isprint(ch)) - fprintf(stderr, "%c", (int)ch); - else - fprintf(stderr, "%x%x", - (u_int)hex[(data[0] & 0xf0) >> 4], - (u_int)hex[data[0] & 0x0f]); - } - fprintf(stderr, "}\n"); -} - -/* - * print_item -- - * Display a single data/size pair, with a tag. - */ -void -print_item(const char *tag, WT_ITEM *item) -{ - print_item_data(tag, item->data, item->size); -} diff --git a/src/third_party/wiredtiger/test/suite/test_compat02.py b/src/third_party/wiredtiger/test/suite/test_compat02.py index de5862513a8..a92c3f54300 100644 --- a/src/third_party/wiredtiger/test/suite/test_compat02.py +++ b/src/third_party/wiredtiger/test/suite/test_compat02.py @@ -133,6 +133,13 @@ class test_compat02(wttest.WiredTigerTestCase, suite_subprocess): # version. That configuration needs an existing database for it to be # useful. Test for success or failure based on the relative versions # configured. + + # Turn on checkpoint verbose to debug a rare occurence of a test + # hanging, most likely during the checkpoint of conn.close. + self.pr("Closing connection") + self.conn.reconfigure('verbose=(checkpoint)') + with self.expectedStdoutPattern('.'): + self.conn.close() compat_str = '' if (self.max_req != 'none'): compat_str += 'compatibility=(require_max="%s"),' % self.max_req @@ -140,7 +147,6 @@ class test_compat02(wttest.WiredTigerTestCase, suite_subprocess): compat_str += 'compatibility=(require_min="%s"),' % self.min_req if (self.rel != 'none'): compat_str += 'compatibility=(release="%s"),' % self.rel - self.conn.close() log_str = 'log=(enabled,file_max=%s,archive=false),' % self.logmax restart_config = log_str + compat_str self.pr("Restart conn " + restart_config) diff --git a/src/third_party/wiredtiger/test/suite/test_txn19.py b/src/third_party/wiredtiger/test/suite/test_txn19.py index 604d8bed8bb..c63e9f9c6e9 100755 --- a/src/third_party/wiredtiger/test/suite/test_txn19.py +++ b/src/third_party/wiredtiger/test/suite/test_txn19.py @@ -53,6 +53,20 @@ def corrupt(fname, truncate, offset, writeit): if writeit: log.write(writeit) +def copy_for_crash_restart(olddir, newdir): + ''' Simulate a crash from olddir and restart in newdir. ''' + # with the connection still open, copy files to new directory + shutil.rmtree(newdir, ignore_errors=True) + os.mkdir(newdir) + for fname in os.listdir(olddir): + fullname = os.path.join(olddir, fname) + # Skip lock file on Windows since it is locked + if os.path.isfile(fullname) and \ + "WiredTiger.lock" not in fullname and \ + "Tmplog" not in fullname and \ + "Preplog" not in fullname: + shutil.copy(fullname, newdir) + class test_txn19(wttest.WiredTigerTestCase, suite_subprocess): base_config = 'log=(archive=false,enabled,file_max=100K),' + \ 'transaction_sync=(enabled,method=none)' @@ -158,20 +172,6 @@ class test_txn19(wttest.WiredTigerTestCase, suite_subprocess): self.tty('LOGS ' + msg + ': ' + str(i) + ' is empty') self.tty('LOGS ' + msg + ': ' + str(loglist)) - def copy_for_crash_restart(self, olddir, newdir): - ''' Simulate a crash from olddir and restart in newdir. ''' - # with the connection still open, copy files to new directory - shutil.rmtree(newdir, ignore_errors=True) - os.mkdir(newdir) - for fname in os.listdir(olddir): - fullname = os.path.join(olddir, fname) - # Skip lock file on Windows since it is locked - if os.path.isfile(fullname) and \ - "WiredTiger.lock" not in fullname and \ - "Tmplog" not in fullname and \ - "Preplog" not in fullname: - shutil.copy(fullname, newdir) - # Generate a value that is a bit over half the size of the log file. def valuegen(self, i): return str(i) + 'A' * (1024 * 60) # ~60K @@ -280,7 +280,7 @@ class test_txn19(wttest.WiredTigerTestCase, suite_subprocess): self.session.create(self.uri, self.create_params) self.inserts([x for x in range(0, self.nrecords)]) newdir = "RESTART" - self.copy_for_crash_restart(self.home, newdir) + copy_for_crash_restart(self.home, newdir) self.close_conn() #self.show_logs(newdir, 'before corruption') self.corrupt_log(newdir) @@ -346,12 +346,210 @@ class test_txn19(wttest.WiredTigerTestCase, suite_subprocess): newdir2 = "RESTART2" self.inserts([self.nrecords, self.nrecords + 1]) expect.extend([self.nrecords, self.nrecords + 1]) - self.copy_for_crash_restart(newdir, newdir2) + copy_for_crash_restart(newdir, newdir2) self.checks(expect) self.reopen_conn(newdir) self.checks(expect) self.reopen_conn(newdir2, self.conn_config) self.checks(expect) +class test_txn19_meta(wttest.WiredTigerTestCase, suite_subprocess): + base_config = 'log=(archive=false,enabled,file_max=100K),' + \ + 'transaction_sync=(enabled,method=none)' + conn_config = base_config + + # The type of corruption to be applied + corruption_scenarios = [ + ('removal', dict(kind='removal', f=lambda fname: + os.remove(fname))), + ('truncate', dict(kind='truncate', f=lambda fname: + corrupt(fname, True, 0, None))), + ('truncate-middle', dict(kind='truncate-middle', f=lambda fname: + corrupt(fname, True, 1024 * 25, None))), + ('zero-begin', dict(kind='zero', f=lambda fname: + corrupt(fname, False, 0, '\0' * 4096))), + ('zero-trunc', dict(kind='zero', f=lambda fname: + corrupt(fname, True, 0, '\0' * 4096))), + ('zero-end', dict(kind='zero-end', f=lambda fname: + corrupt(fname, False, -1, '\0' * 4096))), + ('garbage-begin', dict(kind='garbage-begin', f=lambda fname: + corrupt(fname, False, 0, 'Bad!' * 1024))), + ('garbage-middle', dict(kind='garbage-middle', f=lambda fname: + corrupt(fname, False, 1024 * 25, 'Bad!' * 1024))), + ('garbage-end', dict(kind='garbage-end', f=lambda fname: + corrupt(fname, False, -1, 'Bad!' * 1024))), + ] + # File to be corrupted + filename_scenarios = [ + ('WiredTiger', dict(filename='WiredTiger')), + ('WiredTiger.basecfg', dict(filename='WiredTiger.basecfg')), + ('WiredTiger.turtle', dict(filename='WiredTiger.turtle')), + ('WiredTiger.wt', dict(filename='WiredTiger.wt')), + ('WiredTigerLAS.wt', dict(filename='WiredTigerLAS.wt')), + ] + + # In many cases, wiredtiger_open without any salvage options will + # just work. We list those cases here. + openable = [ + "removal:WiredTiger.basecfg", + "removal:WiredTiger.turtle", + "removal:WiredTigerLAS.wt", + "truncate:WiredTiger", + "truncate:WiredTiger.basecfg", + "truncate:WiredTigerLAS.wt", + "truncate-middle:WiredTiger", + "truncate-middle:WiredTiger.basecfg", + "truncate-middle:WiredTiger.turtle", + "truncate-middle:WiredTiger.wt", + "truncate-middle:WiredTigerLAS.wt", + "zero:WiredTiger", + "zero:WiredTiger.basecfg", + "zero:WiredTigerLAS.wt", + "zero-end:WiredTiger", + "zero-end:WiredTiger.basecfg", + "zero-end:WiredTiger.turtle", + "zero-end:WiredTiger.wt", + "zero-end:WiredTigerLAS.wt", + "garbage-begin:WiredTiger", + "garbage-begin:WiredTigerLAS.wt", + "garbage-middle:WiredTiger", + "garbage-middle:WiredTiger.basecfg", + "garbage-middle:WiredTiger.turtle", + "garbage-middle:WiredTiger.wt", + "garbage-middle:WiredTigerLAS.wt", + "garbage-end:WiredTiger", + "garbage-end:WiredTiger.turtle", + "garbage-end:WiredTiger.wt", + "garbage-end:WiredTigerLAS.wt", + ] + + # The cases for which salvage will not work, represented in the + # form (self.kind + ':' + self.filename) + not_salvageable = [ + "removal:WiredTiger.turtle", + "removal:WiredTiger.wt", + "truncate:WiredTiger.wt", + "zero:WiredTiger.wt", + "garbage-begin:WiredTiger.basecfg", + "garbage-begin:WiredTiger.wt", + "garbage-end:WiredTiger.basecfg", + ] + + scenarios = make_scenarios(corruption_scenarios, filename_scenarios) + uri = 'table:test_txn19_meta_' + ntables = 5 + create_params = 'key_format=i,value_format=S' + nrecords = 1000 # records per table. + suffixes = [ str(x) for x in range(0, ntables)] # [ '0', '1', ... ] + + def valuegen(self, i): + return str(i) + 'A' * 1024 + + # Insert a list of keys + def inserts(self, keylist): + for suffix in self.suffixes: + c = self.session.open_cursor(self.uri + suffix) + for i in keylist: + c[i] = self.valuegen(i) + c.close() + + def checks(self, expectlist): + for suffix in self.suffixes: + c = self.session.open_cursor(self.uri + suffix, None, None) + gotlist = [] + for key, value in c: + gotlist.append(key) + self.assertEqual(self.valuegen(key), value) + self.assertEqual(expectlist, gotlist) + c.close() + + def corrupt_meta(self, homedir): + # Mark this test has having corrupted files + self.databaseCorrupted() + filename = os.path.join(homedir, self.filename) + self.f(filename) + + def is_openable(self): + key = self.kind + ':' + self.filename + return key in self.openable + + def is_salvageable(self): + key = self.kind + ':' + self.filename + return key not in self.not_salvageable + + def test_corrupt_meta(self): + errfile = 'list.err' + outfile = 'list.out' + newdir = "RESTART" + newdir2 = "RESTART2" + expect = list(range(0, self.nrecords)) + salvage_config = self.base_config + ',salvage=true' + + for suffix in self.suffixes: + self.session.create(self.uri + suffix, self.create_params) + self.inserts(expect) + + # Simulate a crash by copying the contents of the directory + # before closing. After we corrupt the copy, make another + # copy of the corrupted directory. + # + # The first corrupted copy will be used to run: + # wiredtiger_open without salvage flag, followed by: + # wiredtiger_open with salvage flag. + # The second directory will be used to run: + # wiredtiger_open with salvage flag first. + + copy_for_crash_restart(self.home, newdir) + self.close_conn() + self.corrupt_meta(newdir) + copy_for_crash_restart(newdir, newdir2) + + # In cases of corruption, we cannot always call wiredtiger_open + # directly, because there may be a panic, and abort() is called + # in diagnostic mode which terminates the Python interpreter. + # + # Running any wt command externally to Python allows + # us to observe the failure or success safely. + # Use -R to force recover=on, which is the default for + # wiredtiger_open, (wt utilities normally have recover=error) + + expect_fail = not self.is_openable() + self.runWt(['-h', newdir, '-C', self.base_config, '-R', 'list'], + errfilename=errfile, outfilename=outfile, failure=expect_fail, + closeconn=False) + + if expect_fail: + self.check_file_contains_one_of(errfile, + ['/unknown configuration key/', + '/handle-open:/', + '/turtle file read error: WT_NOTFOUND: item not found/', + 'WT_ERROR: non-specific WiredTiger error', + 'WT_TRY_SALVAGE: database corruption detected']) + + for salvagedir in [ newdir, newdir2 ]: + # Removing the 'WiredTiger.turtle' file has weird behavior: + # Immediately doing wiredtiger_open (without salvage) succeeds. + # Following that, wiredtiger_open w/ salvage also succeeds. + # + # But, immediately after the corruption, if we run + # wiredtiger_open with salvage, it will fail. + # This anomoly should be fixed or explained. + if salvagedir == newdir and self.kind == 'removal' and \ + self.filename == 'WiredTiger.turtle': + continue + + if self.is_salvageable(): + self.reopen_conn(salvagedir, salvage_config) + self.checks(expect) + else: + # Certain cases are not currently salvageable, they result in + # an error during the wiredtiger_open. But the nature of the + # messages produced during the error is variable by which case + # it is, and even variable from system to system. + with self.expectedStdoutPattern('.'): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.reopen_conn(salvagedir, salvage_config), + '/.*/') + if __name__ == '__main__': wttest.run() |