diff options
author | Luke Chen <luke.chen@mongodb.com> | 2018-04-06 16:06:32 +1000 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2018-04-06 16:23:17 +1000 |
commit | b180ec4ad34b40b499cb4c7a2f01508ed639b44b (patch) | |
tree | 87c8fe8e92a19bb0beb69c1c2696e2b3d98aca07 /src/third_party/wiredtiger | |
parent | f728898d2be6b231175c9d64b39d0f072f5d8d18 (diff) | |
download | mongo-b180ec4ad34b40b499cb4c7a2f01508ed639b44b.tar.gz |
Import wiredtiger: ea986ede145b8c2e3da8f8d11ef25813770c0b39 from branch mongodb-3.8
ref: 875e91581c..ea986ede14
for: 3.7.4
WT-3724 Log an error if flushing with F_FULLSYNC fails
WT-3849 Add timestamp validation to WT_SESSION::prepare_transaction
WT-3870 Bi-weekly WT codebase lint
WT-3922 Allow truncate operations to be prepared
WT-3931 cursor.prev split race
WT-3971 Make cursor duplication use cursor caching
WT-3973 Allow alter to modify app_metadata
WT-3981 Make snapshot consistent with read_timestamp
WT-3984 Fix race conditions around prepare state transitions
WT-3996 Test truncate with timestamps and lookaside
WT-3997 The cursor walk code can spin without sleeping on restart/split.
WT-4002 Allow duplicates in api_data.py
WT-4005 AddressSanitizer in __wt_timestamp_iszero().
WT-4007 eviction instantiates pages from dead trees.
WT-4008 Add ARM NEON support for row search operations
WT-4011 Checkpoint should not read truncated pages
WT-4022 Avoid WT_RESTART error return during eviction walk
WT-4025 Allow debug dumping of internal pages
Diffstat (limited to 'src/third_party/wiredtiger')
52 files changed, 1763 insertions, 777 deletions
diff --git a/src/third_party/wiredtiger/dist/api_config.py b/src/third_party/wiredtiger/dist/api_config.py index d83a632321e..0471bde51fd 100644 --- a/src/third_party/wiredtiger/dist/api_config.py +++ b/src/third_party/wiredtiger/dist/api_config.py @@ -128,18 +128,7 @@ for line in open(f, 'r'): break_on_hyphens=False, replace_whitespace=False, fix_sentence_endings=True) - lastname = None - for c in sorted(api_data.methods[config_name].config): - name = c.name - if '.' in name: - print >>sys.stderr, "Bad config key " + name - - # Deal with duplicates: with complex configurations (like - # WT_SESSION::create), it's simpler to deal with duplicates here than - # manually in api_data.py. - if name == lastname: - continue - lastname = name + for c in api_data.methods[config_name].config: if 'undoc' in c.flags: continue output = parseconfig(c, config_name) @@ -244,8 +233,8 @@ def getsubconfigstr(c): # Write structures of arrays of allowable configuration options, including a # NULL as a terminator for iteration. for name in sorted(api_data.methods.keys()): - ctype = api_data.methods[name].config - if ctype: + config = api_data.methods[name].config + if config: tfile.write(''' static const WT_CONFIG_CHECK confchk_%(name)s[] = { \t%(check)s @@ -253,7 +242,7 @@ static const WT_CONFIG_CHECK confchk_%(name)s[] = { }; ''' % { 'name' : name.replace('.', '_'), - 'check' : '\n\t'.join(getconfcheck(c) for c in sorted(ctype)), + 'check' : '\n\t'.join(getconfcheck(c) for c in config), }) # Write the initialized list of configuration entry structures. 
@@ -263,7 +252,7 @@ tfile.write('static const WT_CONFIG_ENTRY config_entries[] = {') slot=-1 config_defines = '' for name in sorted(api_data.methods.keys()): - ctype = api_data.methods[name].config + config = api_data.methods[name].config slot += 1 # Build a list of #defines that reference specific slots in the list (the @@ -279,15 +268,15 @@ for name in sorted(api_data.methods.keys()): %(config)s,''' % { 'config' : '\n'.join('\t "%s"' % line for line in w.wrap(','.join('%s=%s' % (c.name, get_default(c)) - for c in sorted(ctype))) or [""]), + for c in config)) or [""]), 'name' : name }) # Write the checks reference, or NULL if no related checks structure. tfile.write('\n\t ') - if ctype: + if config: tfile.write( - 'confchk_' + name.replace('.', '_') + ', ' + str(len(ctype))) + 'confchk_' + name.replace('.', '_') + ', ' + str(len(config))) else: tfile.write('NULL, 0') diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index f5e0b4a67a3..f54d2e1fe5b 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -2,7 +2,18 @@ class Method: def __init__(self, config): - self.config = config + # Deal with duplicates: with complex configurations (like + # WT_SESSION::create), it's simpler to deal with duplicates once than + # manually as configurations are defined + self.config = [] + lastname = None + for c in sorted(config): + if '.' 
in c.name: + raise "Bad config key '%s'" % c.name + if c.name == lastname: + continue + lastname = c.name + self.config.append(c) class Config: def __init__(self, name, default, desc, subconfig=None, **flags): @@ -15,10 +26,13 @@ class Config: def __cmp__(self, other): return cmp(self.name, other.name) -# Metadata shared by all schema objects -common_meta = [ +common_runtime_config = [ Config('app_metadata', '', r''' application-owned metadata for this object'''), +] + +# Metadata shared by all schema objects +common_meta = common_runtime_config + [ Config('collator', 'none', r''' configure custom collation for keys. Permitted values are \c "none" or a custom collator name created with WT_CONNECTION::add_collator'''), @@ -130,7 +144,7 @@ lsm_config = [ ]), ] -file_runtime_config = [ +file_runtime_config = common_runtime_config + [ Config('access_pattern_hint', 'none', r''' It is recommended that workloads that consist primarily of updates and/or point queries specify \c random. Workloads that diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index cfae3106fcf..0cee432d8b1 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -41,6 +41,7 @@ WT_OPTRACK_MAXRECS WT_PACKED_STRUCT_BEGIN WT_PACKED_STRUCT_END WT_PADDING_CHECK +WT_PREPARE_INIT WT_READ_BARRIER WT_REF_SIZE WT_SESSION_LOCKED_CHECKPOINT diff --git a/src/third_party/wiredtiger/dist/s_funcs.list b/src/third_party/wiredtiger/dist/s_funcs.list index a7653e5b497..95c568a19ff 100644 --- a/src/third_party/wiredtiger/dist/s_funcs.list +++ b/src/third_party/wiredtiger/dist/s_funcs.list @@ -16,6 +16,7 @@ __wt_config_getone __wt_cursor_get_raw_value __wt_debug_addr __wt_debug_addr_print +__wt_debug_cursor_page __wt_debug_offset __wt_debug_set_verbose __wt_debug_tree diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 7330f560eb6..3b4c5eb8883 100644 
--- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -176,6 +176,7 @@ INCR INIT INITIALIZER INMEM +INPROGRESS INSN INTL INULL @@ -685,6 +686,7 @@ enqueue enqueued env eof +epi eq equalp errhandler @@ -947,6 +949,7 @@ mkdir mmap mmrand mnt +movemask msecs msg msvc diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 1441187812e..8b79d2daed8 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -271,6 +271,7 @@ connection_stats = [ CacheStat('cache_read', 'pages read into cache'), CacheStat('cache_read_app_count', 'application threads page read from disk to cache count'), CacheStat('cache_read_app_time', 'application threads page read from disk to cache time (usecs)'), + CacheStat('cache_read_deleted', 'pages read into cache after truncate'), CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'), CacheStat('cache_read_lookaside_delay', 'pages read into cache with skipped lookaside entries needed later'), CacheStat('cache_read_lookaside_skipped', 'pages read into cache skipping older lookaside entries'), @@ -549,7 +550,6 @@ connection_stats = [ YieldStat('page_locked_blocked', 'page acquire locked blocked'), YieldStat('page_read_blocked', 'page acquire read blocked'), YieldStat('page_sleep', 'page acquire time sleeping (usecs)'), - YieldStat('tree_descend_blocked', 'tree descend one level yielded for split page index update'), YieldStat('txn_release_blocked', 'connection close blocked waiting for transaction state stabilization'), ] @@ -629,6 +629,7 @@ dsrc_stats = [ CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'), CacheStat('cache_pages_requested', 'pages requested from the cache'), CacheStat('cache_read', 'pages read into cache'), + CacheStat('cache_read_deleted', 'pages read into cache after truncate'), CacheStat('cache_read_lookaside', 
'pages read into cache requiring lookaside entries'), CacheStat('cache_read_overflow', 'overflow pages read into cache'), CacheStat('cache_write', 'pages written from cache'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 699ff2083f7..dc6689f6bf1 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "875e91581c63e1e4d47c547291f0a582f30eddae", + "commit": "ea986ede145b8c2e3da8f8d11ef25813770c0b39", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.8" diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 6575080c858..2d6f8623059 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -29,7 +29,7 @@ struct __wt_dbg { const char *key_format; const char *value_format; - WT_ITEM *tmp; /* Temporary space */ + WT_ITEM *t1, *t2; /* Temporary space */ }; static const /* Output separator */ @@ -51,8 +51,7 @@ static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t); static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *); static int __debug_ref(WT_DBG *, WT_REF *); static int __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *); -static int __debug_tree( - WT_SESSION_IMPL *, WT_BTREE *, WT_REF *, const char *, uint32_t); +static int __debug_tree(WT_SESSION_IMPL *, WT_REF *, const char *, uint32_t); static int __debug_update(WT_DBG *, WT_UPDATE *, bool); static int __dmsg_wrapup(WT_DBG *); @@ -124,10 +123,25 @@ __debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size) static int __debug_item_key(WT_DBG *ds, const char *tag, const void *data_arg, size_t size) { + WT_SESSION_IMPL *session; + + session = ds->session; + + /* + * If the format is 'S', it's a string and our version of it may + * not yet be nul-terminated. 
+ */ + if (WT_STREQ(ds->key_format, "S") && + ((char *)data_arg)[size - 1] != '\0') { + WT_RET(__wt_buf_fmt( + session, ds->t2, "%.*s", (int)size, (char *)data_arg)); + data_arg = ds->t2->data; + size = (size_t)ds->t2->size + 1; + } return (ds->f(ds, "\t%s%s{%s}\n", tag == NULL ? "" : tag, tag == NULL ? "" : " ", __wt_buf_set_printable_format( - ds->session, data_arg, size, ds->key_format, ds->tmp))); + ds->session, data_arg, size, ds->key_format, ds->t1))); } /* @@ -138,10 +152,25 @@ static int __debug_item_value( WT_DBG *ds, const char *tag, const void *data_arg, size_t size) { + WT_SESSION_IMPL *session; + + session = ds->session; + + /* + * If the format is 'S', it's a string and our version of it may + * not yet be nul-terminated. + */ + if (WT_STREQ(ds->value_format, "S") && + ((char *)data_arg)[size - 1] != '\0') { + WT_RET(__wt_buf_fmt( + session, ds->t2, "%.*s", (int)size, (char *)data_arg)); + data_arg = ds->t2->data; + size = (size_t)ds->t2->size + 1; + } return (ds->f(ds, "\t%s%s{%s}\n", tag == NULL ? "" : tag, tag == NULL ? 
"" : " ", __wt_buf_set_printable_format( - ds->session, data_arg, size, ds->value_format, ds->tmp))); + ds->session, data_arg, size, ds->value_format, ds->t1))); } /* @@ -229,7 +258,8 @@ __debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile) ds->session = session; - WT_RET(__wt_scr_alloc(session, 512, &ds->tmp)); + WT_RET(__wt_scr_alloc(session, 512, &ds->t1)); + WT_RET(__wt_scr_alloc(session, 512, &ds->t2)); /* * If we weren't given a file, we use the default event handler, and @@ -245,7 +275,7 @@ __debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile) ds->f = __dmsg_file; } - btree = S2BT_SAFE(session); + btree = S2BT(session); ds->key_format = btree->key_format; ds->value_format = btree->value_format; return (0); @@ -264,7 +294,8 @@ __dmsg_wrapup(WT_DBG *ds) session = ds->session; msg = ds->msg; - __wt_scr_free(session, &ds->tmp); + __wt_scr_free(session, &ds->t1); + __wt_scr_free(session, &ds->t2); /* * Discard the buffer -- it shouldn't have anything in it, but might @@ -608,10 +639,18 @@ __wt_debug_tree_shape( */ int __wt_debug_tree_all( - WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile) + void *session_arg, WT_BTREE *btree, WT_REF *ref, const char *ofile) { - return (__debug_tree(session, - btree, ref, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK)); + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)session_arg; + if (btree == NULL) + btree = S2BT(session); + + WT_WITH_BTREE(session, btree, ret = __debug_tree( + session, ref, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK)); + return (ret); } /* @@ -623,9 +662,18 @@ __wt_debug_tree_all( */ int __wt_debug_tree( - WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile) + void *session_arg, WT_BTREE *btree, WT_REF *ref, const char *ofile) { - return (__debug_tree(session, btree, ref, ofile, WT_DEBUG_TREE_WALK)); + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)session_arg; + if 
(btree == NULL) + btree = S2BT(session); + + WT_WITH_BTREE(session, btree, + ret = __debug_tree(session, ref, ofile, WT_DEBUG_TREE_WALK)); + return (ret); } /* @@ -633,18 +681,41 @@ __wt_debug_tree( * Dump the in-memory information for a page. */ int -__wt_debug_page(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile) +__wt_debug_page( + void *session_arg, WT_BTREE *btree, WT_REF *ref, const char *ofile) { WT_DBG *ds, _ds; + WT_DECL_RET; + WT_SESSION_IMPL *session; - WT_ASSERT(session, S2BT_SAFE(session) != NULL); + session = (WT_SESSION_IMPL *)session_arg; + if (btree == NULL) + btree = S2BT(session); ds = &_ds; - WT_RET(__debug_config(session, ds, ofile)); + WT_WITH_BTREE(session, btree, ret = __debug_config(session, ds, ofile)); + WT_RET(ret); - WT_RET(__debug_page(ds, ref, WT_DEBUG_TREE_LEAF)); + WT_WITH_BTREE(session, btree, + ret = __debug_page(ds, ref, WT_DEBUG_TREE_LEAF)); - return (__dmsg_wrapup(ds)); + WT_TRET(__dmsg_wrapup(ds)); + return (ret); +} + +/* + * __wt_debug_cursor_page -- + * Dump the in-memory information for a cursor-referenced page. + */ +int +__wt_debug_cursor_page(void *cursor_arg, const char *ofile) +{ + WT_CURSOR *cursor; + WT_CURSOR_BTREE *cbt; + + cursor = cursor_arg; + cbt = cursor_arg; + return (__wt_debug_page(cursor->session, cbt->btree, cbt->ref, ofile)); } /* @@ -656,8 +727,8 @@ __wt_debug_page(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile) * in this function */ static int -__debug_tree(WT_SESSION_IMPL *session, - WT_BTREE *btree, WT_REF *ref, const char *ofile, uint32_t flags) +__debug_tree( + WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile, uint32_t flags) { WT_DBG *ds, _ds; WT_DECL_RET; @@ -667,12 +738,12 @@ __debug_tree(WT_SESSION_IMPL *session, /* A NULL page starts at the top of the tree -- it's a convenience. 
*/ if (ref == NULL) - ref = &btree->root; + ref = &S2BT(session)->root; - WT_WITH_BTREE(session, btree, ret = __debug_page(ds, ref, flags)); - WT_RET(ret); + ret = __debug_page(ds, ref, flags); - return (__dmsg_wrapup(ds)); + WT_TRET(__dmsg_wrapup(ds)); + return (ret); } /* @@ -1189,7 +1260,7 @@ __debug_ref(WT_DBG *ds, WT_REF *ref) __wt_ref_info(ref, &addr, &addr_size, NULL); return (ds->f(ds, "\t" "%p %s %s\n", (void *)ref, - state, __wt_addr_string(session, addr, addr_size, ds->tmp))); + state, __wt_addr_string(session, addr, addr_size, ds->t1))); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index cb50bfbcf61..a10c82d2cf2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -71,9 +71,13 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) *skipp = false; - /* If we have a clean page in memory, attempt to evict it. */ + /* + * If we have a clean page in memory, attempt to evict it. Do a fast + * check for a dirty page, and then repeat the test once we're locked. + */ previous_state = ref->state; if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) && + !__wt_page_is_modified(ref->page) && __wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { ref->state = previous_state; @@ -222,7 +226,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) * and if we've yielded enough times, start sleeping so we * don't burn CPU to no purpose. 
*/ - __wt_ref_state_yield_sleep(&yield_count, &sleep_count); + __wt_state_yield_sleep(&yield_count, &sleep_count); WT_STAT_CONN_INCRV(session, page_del_rollback_blocked, sleep_count); } @@ -287,11 +291,7 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) return (false); - skip = ref->page_del == NULL || (visible_all ? - __wt_txn_visible_all(session, ref->page_del->txnid, - WT_TIMESTAMP_NULL(&ref->page_del->timestamp)): - __wt_txn_visible(session, ref->page_del->txnid, - WT_TIMESTAMP_NULL(&ref->page_del->timestamp))); + skip = !__wt_page_del_active(session, ref, visible_all); /* * The page_del structure can be freed as soon as the delete is stable: @@ -330,6 +330,7 @@ __tombstone_update_alloc(WT_SESSION_IMPL *session, if (page_del != NULL) { upd->txnid = page_del->txnid; __wt_timestamp_set(&upd->timestamp, &page_del->timestamp); + upd->prepare_state = page_del->prepare_state; } *updp = upd; return (0); @@ -356,6 +357,9 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) btree = S2BT(session); page = ref->page; + WT_STAT_CONN_INCR(session, cache_read_deleted); + WT_STAT_DATA_INCR(session, cache_read_deleted); + /* * Give the page a modify structure. * @@ -390,8 +394,8 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * needs to be resolved, otherwise, there may not be one (and, if the * transaction has resolved, we can ignore the page-deleted structure). */ - page_del = - __wt_btree_truncate_active(session, ref) ? ref->page_del : NULL; + page_del = __wt_page_del_active(session, ref, true) ? + ref->page_del : NULL; /* * Allocate the per-page update array if one doesn't already exist. 
(It diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index 8eb120f06ec..17497561248 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -261,8 +261,8 @@ restart: /* * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ -descend: if ((ret = - __wt_page_swap(session, current, descent, flags)) == 0) { +descend: if ((ret = __wt_page_swap( + session, current, descent, false, flags)) == 0) { current = descent; continue; } diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 450fd6cf563..345556c4c41 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -562,6 +562,13 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags if (F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) LF_SET(WT_READ_IGNORE_CACHE_SIZE); + /* Sanity check flag combinations. */ + WT_ASSERT(session, !LF_ISSET( + WT_READ_DELETED_SKIP | WT_READ_NO_WAIT | WT_READ_LOOKASIDE) || + LF_ISSET(WT_READ_CACHE)); + WT_ASSERT(session, !LF_ISSET(WT_READ_DELETED_CHECK) || + !LF_ISSET(WT_READ_DELETED_SKIP)); + /* * Ignore reads of pages already known to be in cache, otherwise the * eviction server can dominate these statistics. 
@@ -575,7 +582,9 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { switch (current_state = ref->state) { case WT_REF_DELETED: - if (LF_ISSET(WT_READ_NO_EMPTY) && + if (LF_ISSET(WT_READ_DELETED_SKIP | WT_READ_NO_WAIT)) + return (WT_NOTFOUND); + if (LF_ISSET(WT_READ_DELETED_CHECK) && __wt_delete_page_skip(session, ref, false)) return (WT_NOTFOUND); goto read; @@ -799,7 +808,7 @@ skip_evict: /* if (cache_work) continue; } - __wt_ref_state_yield_sleep(&wait_cnt, &sleep_cnt); + __wt_state_yield_sleep(&wait_cnt, &sleep_cnt); WT_STAT_CONN_INCRV(session, page_sleep, sleep_cnt); } } diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 8600c7d6555..ad7d7d9fcab 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -132,7 +132,20 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) tried_eviction = false; time_start = time_stop = 0; + /* Only visit pages in cache and don't bump page read generations. */ flags = WT_READ_CACHE | WT_READ_NO_GEN; + + /* + * Skip all deleted pages. For a page to be marked deleted, it must + * have been evicted from cache and marked clean. Checkpoint should + * never instantiate deleted pages: if a truncate is not visible to the + * checkpoint, the on-disk version is correct. If the truncate is + * visible, we skip over the child page when writing its parent. We + * check whether a truncate is visible in the checkpoint as part of + * reconciling internal pages (specifically in __rec_child_modify). 
+ */ + LF_SET(WT_READ_DELETED_SKIP); + internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index c10a9256769..7f711be3480 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -384,7 +384,7 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs) if (vs->dump_blocks) WT_RET(__wt_debug_disk(session, page->dsk, NULL)); if (vs->dump_pages) - WT_RET(__wt_debug_page(session, ref, NULL)); + WT_RET(__wt_debug_page(session, NULL, ref, NULL)); #endif /* diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index 535e804d6a8..a800d896023 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -70,7 +70,7 @@ __ref_index_slot(WT_SESSION_IMPL *session, * before retrying, and if we've yielded enough times, start * sleeping so we don't burn CPU to no purpose. */ - __wt_ref_state_yield_sleep(&yield_count, &sleep_count); + __wt_state_yield_sleep(&yield_count, &sleep_count); WT_STAT_CONN_INCRV(session, page_index_slot_ref_blocked, sleep_count); } @@ -176,84 +176,6 @@ __ref_ascend(WT_SESSION_IMPL *session, } /* - * __ref_descend_prev -- - * Descend the tree one level, during a previous-cursor walk. - */ -static inline void -__ref_descend_prev( - WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) -{ - WT_PAGE_INDEX *pindex; - uint64_t yield_count; - - /* - * We're passed a child page into which we're descending, and on which - * we have a hazard pointer. - */ - for (yield_count = 0;; yield_count++, __wt_yield()) { - /* - * There's a split race when a cursor moving backwards through - * the tree descends the tree. 
If we're splitting an internal - * page into its parent, we move the WT_REF structures and - * update the parent's page index before updating the split - * page's page index, and it's not an atomic update. A thread - * can read the parent page's replacement page index and then - * read the split page's original index. - * - * This can create a race for previous-cursor movements. - * - * For example, imagine an internal page with 3 child pages, - * with the namespaces a-f, g-h and i-j; the first child page - * splits. The parent starts out with the following page-index: - * - * | ... | a | g | i | ... | - * - * The split page starts out with the following page-index: - * - * | a | b | c | d | e | f | - * - * The first step is to move the c-f ranges into a new subtree, - * so, for example we might have two new internal pages 'c' and - * 'e', where the new 'c' page references the c-d namespace and - * the new 'e' page references the e-f namespace. The top of the - * subtree references the parent page, but until the parent's - * page index is updated, any threads in the subtree won't be - * able to ascend out of the subtree. However, once the parent - * page's page index is updated to this: - * - * | ... | a | c | e | g | i | ... | - * - * threads in the subtree can ascend into the parent. Imagine a - * cursor in the c-d part of the namespace that ascends to the - * parent's 'c' slot. It would then decrement to the slot before - * the 'c' slot, the 'a' slot. - * - * The previous-cursor movement selects the last slot in the 'a' - * page; if the split page's page-index hasn't been updated yet, - * it will select the 'f' slot, which is incorrect. Once the - * split page's page index is updated to this: - * - * | a | b | - * - * the previous-cursor movement will select the 'b' slot, which - * is correct. - * - * This function takes an argument which is the internal page - * from which we're descending. 
If the last slot on the page no - * longer points to the current page as its "home", the page is - * being split and part of its namespace moved. We have the - * correct page and we don't have to move, all we have to do is - * wait until the split page's page index is updated. - */ - WT_INTL_INDEX_GET(session, ref->page, pindex); - if (pindex->index[pindex->entries - 1]->home == ref->page) - break; - } - *pindexp = pindex; - WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count); -} - -/* * __ref_initial_descent_prev -- * Descend the tree one level, when setting up the initial cursor position * for a previous-cursor walk. @@ -265,6 +187,21 @@ __ref_initial_descent_prev( WT_PAGE_INDEX *pindex; /* + * When splitting an internal page into its parent, we move the WT_REF + * structures and update the parent's page index before updating the + * split page's page index, and it's not an atomic update. A thread can + * read the parent page's replacement page index, then read the split + * page's original index, or the parent page's original and the split + * page's replacement. + * + * This isn't a problem for a cursor setting up at the start of the tree + * because we do right-hand splits on internal pages and the initial + * part of the split page's namespace won't change as part of a split. + * A thread reading the parent page's and split page's indexes will move + * to the same slot no matter what order of indexes are read. + * + * Handle a cursor setting up at the end of the tree. + * * We're passed a child page into which we're descending, and on which * we have a hazard pointer. 
* @@ -293,11 +230,13 @@ __tree_walk_internal(WT_SESSION_IMPL *session, WT_DECL_RET; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; + uint64_t sleep_count, yield_count; uint32_t current_state, slot; bool empty_internal, initial_descent, prev, skip; btree = S2BT(session); pindex = NULL; + sleep_count = yield_count = 0; empty_internal = initial_descent = false; /* @@ -307,8 +246,9 @@ __tree_walk_internal(WT_SESSION_IMPL *session, */ WT_ENTER_PAGE_INDEX(session); - /* Walk should never instantiate deleted pages. */ - LF_SET(WT_READ_NO_EMPTY); + /* Check whether deleted pages can be skipped. */ + if (!LF_ISSET(WT_READ_DELETED_SKIP)) + LF_SET(WT_READ_DELETED_CHECK); /* * !!! @@ -427,11 +367,14 @@ restart: /* * handle restart or not-found returns, it would require * additional complexity and is not a possible return: * we're moving to the parent of the current child page, - * the parent can't have been evicted. + * the parent can't have been evicted. (This is why we + * don't pass "prev" to the page-swap function, we can't + * handle the restart error returned if the parent page + * is currently splitting.) */ if (!LF_ISSET(WT_READ_SKIP_INTL)) { WT_ERR(__wt_page_swap( - session, couple, ref, flags)); + session, couple, ref, false, flags)); *refp = ref; goto done; } @@ -509,7 +452,7 @@ restart: /* break; } - ret = __wt_page_swap(session, couple, ref, + ret = __wt_page_swap(session, couple, ref, prev, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); /* @@ -529,6 +472,14 @@ restart: /* ret = 0; /* + * Yield before retrying, and if we've yielded + * enough times, start sleeping so we don't burn + * CPU to no purpose. 
+ */ + __wt_state_yield_sleep( + &yield_count, &sleep_count); + + /* * If a cursor is setting up at the end of the * tree, we can't use our parent page's index, * because it may have already split; restart @@ -576,44 +527,16 @@ descend: empty_internal = true; /* * There's a split race when a cursor is setting - * up at the end of the tree or moving backwards - * through the tree and descending a level. When - * splitting an internal page into its parent, - * we move the WT_REF structures and update the - * parent's page index before updating the split - * page's page index, and it's not an atomic - * update. A thread can read the parent page's - * replacement page index, then read the split - * page's original index, or the parent page's - * original and the split page's replacement. - * - * This isn't a problem for a cursor setting up - * at the start of the tree or moving forwards - * through the tree because we do right-hand - * splits on internal pages and the initial part - * of the split page's namespace won't change as - * part of a split. A thread reading the parent - * page's and split page's indexes will move to - * the same slot no matter what order of indexes - * are read. - * - * Handle a cursor setting up at the end of the - * tree or moving backwards through the tree. + * up at the end of the tree. */ - if (!prev) { - WT_INTL_INDEX_GET( - session, ref->page, pindex); - slot = 0; - } else if (initial_descent) { + if (prev && initial_descent) { if (!__ref_initial_descent_prev( session, ref, &pindex)) goto restart; - slot = pindex->entries - 1; - } else { - __ref_descend_prev( - session, ref, &pindex); - slot = pindex->entries - 1; - } + } else + WT_INTL_INDEX_GET( + session, ref->page, pindex); + slot = prev ? 
pindex->entries - 1 : 0; continue; } diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index 5c0e066647a..8cc6630599b 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -191,8 +191,8 @@ descend: /* * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ - if ((ret = __wt_page_swap( - session, current, descent, WT_READ_RESTART_OK)) == 0) { + if ((ret = __wt_page_swap(session, + current, descent, false, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c index ca8e2418857..39eddb6e0cc 100644 --- a/src/third_party/wiredtiger/src/btree/row_key.c +++ b/src/third_party/wiredtiger/src/btree/row_key.c @@ -408,7 +408,7 @@ switch_and_jump: /* Switching to a forward roll. */ } next: switch (direction) { - case BACKWARD: + case BACKWARD: --rip; ++slot_offset; break; diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index e75b307812c..20acda8a1ab 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -443,8 +443,8 @@ descend: /* * On other error, simply return, the swap call ensures we're * holding nothing on failure. 
*/ - if ((ret = __wt_page_swap( - session, current, descent, WT_READ_RESTART_OK)) == 0) { + if ((ret = __wt_page_swap(session, + current, descent, false, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index ffcb2139330..bd68a8b0937 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -238,6 +238,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_alter[] = { { "access_pattern_hint", "string", NULL, "choices=[\"none\",\"random\",\"sequential\"]", NULL, 0 }, + { "app_metadata", "string", NULL, NULL, NULL, 0 }, { "assert", "category", NULL, NULL, confchk_assert_subconfigs, 2 }, @@ -1274,9 +1275,10 @@ static const WT_CONFIG_ENTRY config_entries[] = { confchk_WT_CURSOR_reconfigure, 2 }, { "WT_SESSION.alter", - "access_pattern_hint=none,assert=(commit_timestamp=none," - "read_timestamp=none),cache_resident=false,log=(enabled=true)", - confchk_WT_SESSION_alter, 4 + "access_pattern_hint=none,app_metadata=," + "assert=(commit_timestamp=none,read_timestamp=none)," + "cache_resident=false,log=(enabled=true)", + confchk_WT_SESSION_alter, 5 }, { "WT_SESSION.begin_transaction", "ignore_prepare=false,isolation=,name=,priority=0,read_timestamp=" diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index 1f5c5f25c57..00a6bc4645d 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -704,7 +704,7 @@ err: WT_TRET(cursor->reopen(cursor, false)); */ int __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri, - const char *cfg[], WT_CURSOR **cursorp) + WT_CURSOR *to_dup, const char *cfg[], WT_CURSOR **cursorp) { WT_CONFIG_ITEM cval; WT_CURSOR *cursor; @@ -752,10 +752,22 @@ __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri, } /* + * Caller 
guarantees that exactly one of the URI and the + * duplicate cursor is non-NULL. + */ + if (to_dup != NULL) { + WT_ASSERT(session, uri == NULL); + uri = to_dup->uri; + hash_value = to_dup->uri_hash; + } else { + WT_ASSERT(session, uri != NULL); + hash_value = __wt_hash_city64(uri, strlen(uri)); + } + + /* * Walk through all cursors, if there is a cached * cursor that matches uri and configuration, use it. */ - hash_value = __wt_hash_city64(uri, strlen(uri)); bucket = hash_value % WT_HASH_ARRAY_SIZE; TAILQ_FOREACH(cursor, &session->cursor_cache[bucket], q) { if (cursor->uri_hash == hash_value && diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 5c478654585..719fa7e8c5f 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -441,7 +441,7 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) * control can be running below our locked internal * page. 
*/ - if (__wt_btree_truncate_active(session, child)) + if (__wt_page_del_active(session, child, true)) return (EBUSY); break; default: diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 893f51aa022..9752737ef41 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -10,18 +10,19 @@ /* AUTOMATIC FLAG VALUE GENERATION START */ #define WT_READ_CACHE 0x0001u -#define WT_READ_IGNORE_CACHE_SIZE 0x0002u -#define WT_READ_LOOKASIDE 0x0004u -#define WT_READ_NOTFOUND_OK 0x0008u -#define WT_READ_NO_EMPTY 0x0010u -#define WT_READ_NO_GEN 0x0020u -#define WT_READ_NO_SPLIT 0x0040u -#define WT_READ_NO_WAIT 0x0080u -#define WT_READ_PREV 0x0100u -#define WT_READ_RESTART_OK 0x0200u -#define WT_READ_SKIP_INTL 0x0400u -#define WT_READ_TRUNCATE 0x0800u -#define WT_READ_WONT_NEED 0x1000u +#define WT_READ_DELETED_CHECK 0x0002u +#define WT_READ_DELETED_SKIP 0x0004u +#define WT_READ_IGNORE_CACHE_SIZE 0x0008u +#define WT_READ_LOOKASIDE 0x0010u +#define WT_READ_NOTFOUND_OK 0x0020u +#define WT_READ_NO_GEN 0x0040u +#define WT_READ_NO_SPLIT 0x0080u +#define WT_READ_NO_WAIT 0x0100u +#define WT_READ_PREV 0x0200u +#define WT_READ_RESTART_OK 0x0400u +#define WT_READ_SKIP_INTL 0x0800u +#define WT_READ_TRUNCATE 0x1000u +#define WT_READ_WONT_NEED 0x2000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ /* AUTOMATIC FLAG VALUE GENERATION START */ @@ -506,11 +507,7 @@ struct __wt_page { * Internal pages (both column- and row-store). * * In-memory internal pages have an array of pointers to child - * structures, maintained in collated order. When a page is - * read into memory, the initial list of children is stored in - * the "orig_index" field, and it and the collated order are - * the same. After a page splits, the collated order and the - * original order will differ. + * structures, maintained in collated order. 
* * Multiple threads of control may be searching the in-memory * internal page and a child page of the internal page may @@ -707,6 +704,45 @@ struct __wt_page { ((void *)((uint8_t *)((page)->dsk) + (o))) /* + * Prepare update states. + * + * Prepare update synchronization is based on the state field, which has the + * following possible states: + * + * WT_PREPARE_INIT: + * The initial prepare state of either an update or a page_del structure, + * indicating a prepare phase has not started yet. + * This state has no impact on the visibility of the update's data. + * + * WT_PREPARE_INPROGRESS: + * Update is in prepared phase. + * + * WT_PREPARE_LOCKED: + * State is locked as state transition is in progress from INPROGRESS to + * RESOLVED. Any reader of the state needs to wait for state transition to + * complete. + * + * WT_PREPARE_RESOLVED: + * Represents the commit state of the prepared update. + * + * State Transition: + * From uncommitted -> prepare -> commit: + * INIT --> INPROGRESS --> LOCKED --> RESOLVED + * LOCKED will be a momentary phase during timestamp update. + * + * From uncommitted -> prepare -> rollback: + * INIT --> INPROGRESS + * Prepare state will not be updated during rollback and will continue to + * have the state as INPROGRESS. + */ +#define WT_PREPARE_INIT 0 /* Must be 0, as structures + will be default initialized + with 0. */ +#define WT_PREPARE_INPROGRESS 1 +#define WT_PREPARE_LOCKED 2 +#define WT_PREPARE_RESOLVED 3 + +/* * Page state. * * Synchronization is based on the WT_REF->state field, which has a number of @@ -779,6 +815,12 @@ struct __wt_page_deleted { volatile uint64_t txnid; /* Transaction ID */ WT_DECL_TIMESTAMP(timestamp) + /* + * The state is used for transaction prepare to manage visibility + * and inheriting prepare state to update_list. + */ + volatile uint8_t prepare_state; /* Prepare state. 
*/ + uint32_t previous_state; /* Previous state */ WT_UPDATE **update_list; /* List of updates for abort */ @@ -989,16 +1031,6 @@ struct __wt_update { #define WT_UPDATE_TOMBSTONE 5 /* deleted */ uint8_t type; /* type (one byte to conserve memory) */ - /* - * The update state is used for transaction prepare to manage - * visibility and transitioning update structure state safely. - */ -#define WT_UPDATE_STATE_READY 0 /* Must be 0. Default or - finalized prepare */ -#define WT_UPDATE_STATE_LOCKED 1 /* locked */ -#define WT_UPDATE_STATE_PREPARED 2 /* prepared */ - volatile uint8_t state; - /* If the update includes a complete value. */ #define WT_UPDATE_DATA_VALUE(upd) \ ((upd)->type == WT_UPDATE_STANDARD || \ @@ -1009,6 +1041,12 @@ struct __wt_update { #endif /* + * The update state is used for transaction prepare to manage + * visibility and transitioning update structure state safely. + */ + volatile uint8_t prepare_state; /* Prepare state. */ + + /* * Zero or more bytes of value (the payload) immediately follows the * WT_UPDATE structure. We use a C99 flexible array member which has * the semantics we want. diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index de28eb7232f..149f4304692 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1151,19 +1151,28 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * __wt_btree_truncate_active -- + * __wt_page_del_active -- * Return if a truncate operation is active. 
*/ static inline bool -__wt_btree_truncate_active(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_page_del_active( + WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) { WT_PAGE_DELETED *page_del; + uint8_t prepare_state; if ((page_del = ref->page_del) == NULL) return (false); if (page_del->txnid == WT_TXN_ABORTED) return (false); - return (!__wt_txn_visible_all(session, + WT_ORDERED_READ(prepare_state, page_del->prepare_state); + if (prepare_state == WT_PREPARE_INPROGRESS || + prepare_state == WT_PREPARE_LOCKED) + return (true); + return (visible_all ? + !__wt_txn_visible_all(session, + page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp)) : + !__wt_txn_visible(session, page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp))); } @@ -1354,7 +1363,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) mod = page->modify; /* A truncated page can't be evicted until the truncate completes. */ - if (__wt_btree_truncate_active(session, ref)) + if (__wt_page_del_active(session, ref, true)) return (false); /* Otherwise, never modified pages can always be evicted. */ @@ -1485,81 +1494,6 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) } /* - * __wt_page_swap_func -- - * Swap one page's hazard pointer for another one when hazard pointer - * coupling up/down the tree. - */ -static inline int -__wt_page_swap_func( - WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32_t flags -#ifdef HAVE_DIAGNOSTIC - , const char *file, int line -#endif - ) -{ - WT_DECL_RET; - bool acquired; - - /* - * This function is here to simplify the error handling during hazard - * pointer coupling so we never leave a hazard pointer dangling. The - * assumption is we're holding a hazard pointer on "held", and want to - * acquire a hazard pointer on "want", releasing the hazard pointer on - * "held" when we're done. - * - * When walking the tree, we sometimes swap to the same page. 
Fast-path - * that to avoid thinking about error handling. - */ - if (held == want) - return (0); - - /* Get the wanted page. */ - ret = __wt_page_in_func(session, want, flags -#ifdef HAVE_DIAGNOSTIC - , file, line -#endif - ); - - /* - * Expected failures: page not found or restart. Our callers list the - * errors they're expecting to handle. - */ - if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND) - return (WT_NOTFOUND); - if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART) - return (WT_RESTART); - - /* Discard the original held page on either success or error. */ - acquired = ret == 0; - WT_TRET(__wt_page_release(session, held, flags)); - - /* Fast-path expected success. */ - if (ret == 0) - return (0); - - /* - * If there was an error at any point that our caller isn't prepared to - * handle, discard any page we acquired. - */ - if (acquired) - WT_TRET(__wt_page_release(session, want, flags)); - - /* - * If we're returning an error, don't let it be one our caller expects - * to handle as returned by page-in: the expectation includes the held - * page not having been released, and that's not the case. - */ - if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND) - WT_RET_MSG(session, - EINVAL, "page-release WT_NOTFOUND error mapped to EINVAL"); - if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART) - WT_RET_MSG(session, - EINVAL, "page-release WT_RESTART error mapped to EINVAL"); - - return (ret); -} - -/* * __wt_skip_choose_depth -- * Randomly choose a depth for a skiplist insert. */ @@ -1693,22 +1627,152 @@ __wt_split_descent_race( } /* - * __wt_ref_state_yield_sleep -- - * sleep while waiting for the wt_ref state after THOUSAND yields. + * __wt_split_prev_race -- + * Return if we raced with an internal page split when moving backwards + * through the tree. 
*/ -static inline void -__wt_ref_state_yield_sleep(uint64_t *yield_count, uint64_t *sleep_count) +static inline bool +__wt_split_prev_race(WT_SESSION_IMPL *session, WT_REF *ref) { + WT_PAGE_INDEX *pindex; + /* - * We yield before retrying, and if we've yielded enough times, start - * sleeping so we don't burn CPU to no purpose. + * There's a split race when a cursor moving backwards through the tree + * descends the tree. If we're splitting an internal page into its + * parent, we move the WT_REF structures and update the parent's page + * index before updating the split page's page index, and it's not an + * atomic update. A thread can read the parent and split page's original + * indexes during a split, or read the parent page's replacement page + * index and then read the split page's original index, either of which + * can lead to skipping pages. + * + * For example, imagine an internal page with 3 child pages, with the + * namespaces a-f, g-h and i-j; the first child page splits. The parent + * starts out with the following page-index: + * + * | ... | a | g | i | ... | + * + * The split page starts out with the following page-index: + * + * | a | b | c | d | e | f | + * + * The first step is to move the c-f ranges into a new subtree, so, for + * example we might have two new internal pages 'c' and 'e', where the + * new 'c' page references the c-d namespace and the new 'e' page + * references the e-f namespace. The top of the subtree references the + * parent page, but until the parent's page index is updated, threads in + * the subtree won't be able to ascend out of the subtree. However, once + * the parent page's page index is updated to this: + * + * | ... | a | c | e | g | i | ... | + * + * threads in the subtree can ascend into the parent. Imagine a cursor + * in the c-d part of the namespace that ascends to the parent's 'c' + * slot. It would then decrement to the slot before the 'c' slot, the + * 'a' slot. 
+ * + * The previous-cursor movement selects the last slot in the 'a' page; + * if the split page's page-index hasn't been updated yet, it selects + * the 'f' slot, which is incorrect. Once the split page's page index is + * updated to this: + * + * | a | b | + * + * the previous-cursor movement will select the 'b' slot, which is + * correct. + * + * This function takes an argument which is the internal page into which + * we're coupling. If the last slot on the page no longer points to + * the current page as its "home", the page is being split and part of + * its namespace moved, we have to restart. */ - if ((*yield_count) < WT_THOUSAND) { - (*yield_count)++; - __wt_yield(); - return; + WT_INTL_INDEX_GET(session, ref->page, pindex); + return (pindex->index[pindex->entries - 1]->home != ref->page); +} + +/* + * __wt_page_swap_func -- + * Swap one page's hazard pointer for another one when hazard pointer + * coupling up/down the tree. + */ +static inline int +__wt_page_swap_func(WT_SESSION_IMPL *session, + WT_REF *held, WT_REF *want, bool prev_race, uint32_t flags +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + WT_DECL_RET; + bool acquired; + + /* + * This function is here to simplify the error handling during hazard + * pointer coupling so we never leave a hazard pointer dangling. The + * assumption is we're holding a hazard pointer on "held", and want to + * acquire a hazard pointer on "want", releasing the hazard pointer on + * "held" when we're done. + * + * When walking the tree, we sometimes swap to the same page. Fast-path + * that to avoid thinking about error handling. + */ + if (held == want) + return (0); + + /* Get the wanted page. 
*/ + ret = __wt_page_in_func(session, want, flags +#ifdef HAVE_DIAGNOSTIC + , file, line +#endif + ); + + /* + * We can race when descending into an internal page as part of moving + * backwards through the tree, and we have to detect that race before + * releasing the page from which we are coupling, else we can't restart + * the movement. + */ + if (ret == 0 && prev_race && WT_PAGE_IS_INTERNAL(want->page) && + __wt_split_prev_race(session, want)) { + ret = WT_RESTART; + WT_TRET(__wt_page_release(session, want, flags)); } - (*sleep_count) = WT_MIN((*sleep_count) + 100, WT_THOUSAND); - __wt_sleep(0, (*sleep_count)); + /* + * Expected failures: page not found or restart. Our callers list the + * errors they're expecting to handle. + */ + if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND) + return (WT_NOTFOUND); + if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART) + return (WT_RESTART); + + /* Discard the original held page on either success or error. */ + acquired = ret == 0; + WT_TRET(__wt_page_release(session, held, flags)); + + /* Fast-path expected success. */ + if (ret == 0) + return (0); + + /* + * If there was an error at any point that our caller isn't prepared to + * handle, discard any page we acquired. + */ + if (acquired) + WT_TRET(__wt_page_release(session, want, flags)); + + /* + * If we're returning an error, don't let it be one our caller expects + * to handle as returned by page-in: the expectation includes the held + * page not having been released, and that's not the case. 
+ */ + if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND) + WT_RET_MSG(session, + EINVAL, "page-release WT_NOTFOUND error mapped to EINVAL"); + if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART) + WT_RET_MSG(session, + EINVAL, "page-release WT_RESTART error mapped to EINVAL"); + + return (ret); } diff --git a/src/third_party/wiredtiger/src/include/btree_cmp.i b/src/third_party/wiredtiger/src/include/btree_cmp.i index 8f8e0e83717..f8679933210 100644 --- a/src/third_party/wiredtiger/src/include/btree_cmp.i +++ b/src/third_party/wiredtiger/src/include/btree_cmp.i @@ -10,9 +10,33 @@ #if !defined(_MSC_VER) && !defined(_lint) #include <x86intrin.h> #endif +#endif /* 16B alignment */ #define WT_ALIGNED_16(p) (((uintptr_t)(p) & 0x0f) == 0) #define WT_VECTOR_SIZE 16 /* chunk size */ + +#if defined(HAVE_ARM_NEON_INTRIN_H) +#include <arm_neon.h> +/* + * _mm_movemask_epi8_neon -- + * Creates a 16-bit mask from the most significant bits of the 16 signed + * or unsigned 8-bit integers. + */ +static inline uint16_t +_mm_movemask_epi8_neon(const uint8x16_t data) +{ + uint64x1_t p; + p = vset_lane_u64(0x8040201008040201, p, 0); + uint8x16_t powers = vcombine_u8(p, p); + uint8x16_t zero8x16 = vdupq_n_s8(0); + int8x16_t input = vcltq_s8((int8x16_t)data, (int8x16_t)zero8x16); + uint64x2_t mask = vpaddlq_u32( + vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)input, powers)))); + uint16_t output; + output = + ((vgetq_lane_u8(mask, 8) << 8) | (vgetq_lane_u8(mask, 0) << 0)); + return (output); +} #endif /* @@ -70,6 +94,24 @@ __wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item) } len += remain; } +#elif defined(HAVE_ARM_NEON_INTRIN_H) + /* Use vector instructions if we'll execute at least 1 of them. 
*/ + if (len >= WT_VECTOR_SIZE) { + size_t remain; + uint8x16_t res_eq, u, t; + remain = len % WT_VECTOR_SIZE; + len -= remain; + for (; len > 0; + len -= WT_VECTOR_SIZE, + userp += WT_VECTOR_SIZE, treep += WT_VECTOR_SIZE) { + u = vld1q_u8(userp); + t = vld1q_u8(treep); + res_eq = vceqq_u8(u, t); + if (_mm_movemask_epi8_neon(res_eq) != 65535) + break; + } + len += remain; + } #endif /* * Use the non-vectorized version for the remaining bytes and for the @@ -158,6 +200,26 @@ __wt_lex_compare_skip( } len += remain; } +#elif defined(HAVE_ARM_NEON_INTRIN_H) + /* Use vector instructions if we'll execute at least 1 of them. */ + if (len >= WT_VECTOR_SIZE) { + size_t remain; + uint8x16_t res_eq, u, t; + remain = len % WT_VECTOR_SIZE; + len -= remain; + if (WT_ALIGNED_16(userp) && WT_ALIGNED_16(treep)) + for (; len > 0; + len -= WT_VECTOR_SIZE, + userp += WT_VECTOR_SIZE, treep += WT_VECTOR_SIZE, + *matchp += WT_VECTOR_SIZE) { + u = vld1q_u8(userp); + t = vld1q_u8(treep); + res_eq = vceqq_u8(u, t); + if (_mm_movemask_epi8_neon(res_eq) != 65535) + break; + } + len += remain; + } #endif /* * Use the non-vectorized version for the remaining bytes and for the diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index d884401feb2..7b932f3ec49 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -123,9 +123,10 @@ extern int __wt_debug_offset_blind(WT_SESSION_IMPL *session, wt_off_t offset, co extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, uint32_t checksum, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_disk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_tree_shape(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern 
int __wt_debug_tree_all(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_debug_tree_all(void *session_arg, WT_BTREE *btree, WT_REF *ref, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_debug_tree(void *session_arg, WT_BTREE *btree, WT_REF *ref, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_debug_page(void *session_arg, WT_BTREE *btree, WT_REF *ref, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_debug_cursor_page(void *cursor_arg, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -353,7 +354,7 @@ extern void __wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap); extern int __wt_cursor_cache(WT_CURSOR *cursor, WT_DATA_HANDLE *dhandle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_cursor_reopen(WT_CURSOR *cursor, WT_DATA_HANDLE *dhandle); extern int __wt_cursor_cache_release(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool *released) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); 
+extern int __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *to_dup, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_close(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_equals(WT_CURSOR *cursor, WT_CURSOR *other, int *equalp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -602,7 +603,7 @@ extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_alter(WT_SESSION_IMPL *session, const char *newcfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_schema_alter(WT_SESSION_IMPL *session, const char *uri, const char *newcfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_schema_index_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -641,6 +642,7 @@ extern int __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) extern 
WT_DATA_SOURCE *__wt_schema_get_source(WT_SESSION_IMPL *session, const char *name); extern int __wt_str_name_check(WT_SESSION_IMPL *session, const char *str) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_exclusive_handle_operation(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[], uint32_t open_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[], uint32_t open_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_session_notsup(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -846,8 +848,9 @@ extern int __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, extern int __wt_txn_global_query_timestamp(WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *ts, WT_CONFIG_ITEM *cval, bool cmp_oldest, bool cmp_stable, bool cmp_commit) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *ts, WT_CONFIG_ITEM *cval) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_parse_prepare_timestamp(WT_SESSION_IMPL *session, const char *cfg[], wt_timestamp_t *timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session); extern void __wt_txn_clear_commit_timestamp(WT_SESSION_IMPL *session); diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index c4d7def85c0..6b30e63d2a3 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -292,15 +292,16 @@ typedef void wt_timestamp_t; __wt_scr_alloc_func(session, size, scratchp, __func__, __LINE__) #define __wt_page_in(session, ref, flags) \ __wt_page_in_func(session, ref, flags, __func__, __LINE__) -#define __wt_page_swap(session, held, want, flags) \ - __wt_page_swap_func(session, held, want, flags, __func__, __LINE__) +#define __wt_page_swap(session, held, want, prev_race, flags) \ + __wt_page_swap_func( \ + session, held, want, prev_race, flags, __func__, __LINE__) #else #define __wt_scr_alloc(session, size, scratchp) \ __wt_scr_alloc_func(session, size, scratchp) #define __wt_page_in(session, ref, flags) \ __wt_page_in_func(session, ref, flags) -#define __wt_page_swap(session, held, want, flags) \ - __wt_page_swap_func(session, held, want, flags) +#define __wt_page_swap(session, held, want, prev_race, flags) \ + __wt_page_swap_func(session, held, want, prev_race, flags) #endif /* Called on unexpected code path: locate the failure. 
*/ diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index acbbbcaff83..102d4f0cce0 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -223,3 +223,24 @@ __wt_txn_context_check(WT_SESSION_IMPL *session, bool requires_txn) session->name); return (0); } + +/* + * __wt_state_yield_sleep -- + * Sleep while waiting, after a thousand yields. + */ +static inline void +__wt_state_yield_sleep(uint64_t *yield_count, uint64_t *sleep_count) +{ + /* + * We yield before retrying, and if we've yielded enough times, start + * sleeping so we don't burn CPU to no purpose. + */ + if ((*yield_count) < WT_THOUSAND) { + (*yield_count)++; + __wt_yield(); + return; + } + + (*sleep_count) = WT_MIN((*sleep_count) + 100, WT_THOUSAND); + __wt_sleep(0, (*sleep_count)); +} diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 01a982b8602..616ca59b57e 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -419,6 +419,7 @@ struct __wt_connection_stats { int64_t cache_eviction_pages_queued_urgent; int64_t cache_eviction_pages_queued_oldest; int64_t cache_read; + int64_t cache_read_deleted; int64_t cache_read_lookaside; int64_t cache_read_lookaside_skipped; int64_t cache_read_lookaside_delay; @@ -615,7 +616,6 @@ struct __wt_connection_stats { int64_t page_sleep; int64_t page_del_rollback_blocked; int64_t child_modify_blocked_page; - int64_t tree_descend_blocked; int64_t txn_commit_queue_empty; int64_t txn_commit_queue_tail; int64_t txn_commit_queue_inserts; @@ -736,6 +736,7 @@ struct __wt_dsrc_stats { int64_t cache_eviction_deepen; int64_t cache_write_lookaside; int64_t cache_read; + int64_t cache_read_deleted; int64_t cache_read_lookaside; int64_t cache_pages_requested; int64_t cache_eviction_pages_seen; diff --git a/src/third_party/wiredtiger/src/include/txn.i 
b/src/third_party/wiredtiger/src/include/txn.i index 9061157ff5a..f077ef164e9 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -251,8 +251,20 @@ static inline bool __wt_txn_update_needs_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op) { WT_TXN *txn; + wt_timestamp_t *timestamp; txn = &session->txn; + + /* + * The timestamp is in the page deleted structure for truncates, or + * in the update for other operations. + */ + if (op->type == WT_TXN_OP_REF_DELETE) + timestamp = op->u.ref == NULL || op->u.ref->page_del == NULL ? + NULL : &op->u.ref->page_del->timestamp; + else + timestamp = op->u.upd == NULL ? NULL : &op->u.upd->timestamp; + /* * Updates in the metadata never get timestamps (either now or at * commit): metadata cannot be read at a point in time, only the most @@ -260,8 +272,7 @@ __wt_txn_update_needs_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op) */ return (op->fileid != WT_METAFILE_ID && F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && - (op->u.upd == NULL || - __wt_timestamp_iszero(&(op->u.upd->timestamp)) || + (timestamp == NULL || __wt_timestamp_iszero(timestamp) || F_ISSET(txn, WT_TXN_PREPARE))); } #endif @@ -550,12 +561,13 @@ __wt_txn_visible( static inline WT_VISIBLE_TYPE __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) { - uint8_t upd_state; + uint8_t prepare_state, previous_state; bool upd_visible; for (;;__wt_yield()) { - /* Commit is in progress, yield and try again. */ - if ((upd_state = upd->state) == WT_UPDATE_STATE_LOCKED) + /* Prepare state change is in progress, yield and try again. */ + WT_ORDERED_READ(prepare_state, upd->prepare_state); + if (prepare_state == WT_PREPARE_LOCKED) continue; upd_visible = __wt_txn_visible( @@ -565,14 +577,17 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) * The visibility check is only valid if the update does not * change state. If the state does change, recheck visibility. 
*/ - if (upd->state == upd_state) + previous_state = prepare_state; + WT_ORDERED_READ(prepare_state, upd->prepare_state); + if (previous_state == prepare_state) break; } if (!upd_visible) return (WT_VISIBLE_FALSE); - if (upd_state == WT_UPDATE_STATE_PREPARED) + /* Ignore the prepared update, if transaction configuration says so. */ + if (prepare_state == WT_PREPARE_INPROGRESS) return (F_ISSET(&session->txn, WT_TXN_IGNORE_PREPARE) ? WT_VISIBLE_FALSE : WT_VISIBLE_PREPARE); @@ -620,7 +635,7 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **updp) if (upd == NULL && skipped_birthmark) upd = &tombstone; - *updp = (upd == NULL || upd->type == WT_UPDATE_BIRTHMARK ? NULL : upd); + *updp = upd == NULL || upd->type == WT_UPDATE_BIRTHMARK ? NULL : upd; return (0); } diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 1f2a438b8e9..1c3b75ec6ae 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -1139,6 +1139,8 @@ struct __wt_session { * option leads to an advisory call to an appropriate operating system * API where available., a string\, chosen from the following options: * \c "none"\, \c "random"\, \c "sequential"; default \c none.} + * @config{app_metadata, application-owned metadata for this object., a + * string; default empty.} * @config{cache_resident, do not ever evict the object's pages from * cache. Not compatible with LSM tables; see @ref * tuning_cache_resident for more information., a boolean flag; default @@ -5102,445 +5104,442 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1104 /*! cache: pages read into cache */ #define WT_STAT_CONN_CACHE_READ 1105 +/*! cache: pages read into cache after truncate */ +#define WT_STAT_CONN_CACHE_READ_DELETED 1106 /*! 
cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1106 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1107 /*! cache: pages read into cache skipping older lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1107 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1108 /*! * cache: pages read into cache with skipped lookaside entries needed * later */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1108 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1109 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1109 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1110 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1110 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1111 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1111 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1112 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1112 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1113 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1113 +#define WT_STAT_CONN_CACHE_WRITE 1114 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1114 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1115 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1115 +#define WT_STAT_CONN_CACHE_OVERHEAD 1116 /*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1116 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1117 /*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1117 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1118 /*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1118 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1119 /*! 
cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1119 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1120 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1120 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1121 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1121 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1122 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1122 +#define WT_STAT_CONN_COND_AUTO_WAIT 1123 /*! connection: detected system time went backwards */ -#define WT_STAT_CONN_TIME_TRAVEL 1123 +#define WT_STAT_CONN_TIME_TRAVEL 1124 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1124 +#define WT_STAT_CONN_FILE_OPEN 1125 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1125 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1126 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1126 +#define WT_STAT_CONN_MEMORY_FREE 1127 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1127 +#define WT_STAT_CONN_MEMORY_GROW 1128 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1128 +#define WT_STAT_CONN_COND_WAIT 1129 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1129 +#define WT_STAT_CONN_RWLOCK_READ 1130 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1130 +#define WT_STAT_CONN_RWLOCK_WRITE 1131 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1131 +#define WT_STAT_CONN_FSYNC_IO 1132 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1132 +#define WT_STAT_CONN_READ_IO 1133 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1133 +#define WT_STAT_CONN_WRITE_IO 1134 /*! 
cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1134 +#define WT_STAT_CONN_CURSOR_CREATE 1135 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1135 +#define WT_STAT_CONN_CURSOR_INSERT 1136 /*! cursor: cursor modify calls */ -#define WT_STAT_CONN_CURSOR_MODIFY 1136 +#define WT_STAT_CONN_CURSOR_MODIFY 1137 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1137 +#define WT_STAT_CONN_CURSOR_NEXT 1138 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1138 +#define WT_STAT_CONN_CURSOR_PREV 1139 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1139 +#define WT_STAT_CONN_CURSOR_REMOVE 1140 /*! cursor: cursor reserve calls */ -#define WT_STAT_CONN_CURSOR_RESERVE 1140 +#define WT_STAT_CONN_CURSOR_RESERVE 1141 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1141 +#define WT_STAT_CONN_CURSOR_RESET 1142 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1142 +#define WT_STAT_CONN_CURSOR_RESTART 1143 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1143 +#define WT_STAT_CONN_CURSOR_SEARCH 1144 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1144 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1145 /*! cursor: cursor sweep buckets */ -#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1145 +#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1146 /*! cursor: cursor sweep cursors closed */ -#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1146 +#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1147 /*! cursor: cursor sweep cursors examined */ -#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1147 +#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1148 /*! cursor: cursor sweeps */ -#define WT_STAT_CONN_CURSOR_SWEEP 1148 +#define WT_STAT_CONN_CURSOR_SWEEP 1149 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1149 +#define WT_STAT_CONN_CURSOR_UPDATE 1150 /*! 
cursor: cursors cached on close */ -#define WT_STAT_CONN_CURSOR_CACHE 1150 +#define WT_STAT_CONN_CURSOR_CACHE 1151 /*! cursor: cursors reused from cache */ -#define WT_STAT_CONN_CURSOR_REOPEN 1151 +#define WT_STAT_CONN_CURSOR_REOPEN 1152 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1152 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1153 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1153 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1154 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1154 +#define WT_STAT_CONN_DH_SWEEP_REF 1155 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1155 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1156 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1156 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1157 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1157 +#define WT_STAT_CONN_DH_SWEEP_TOD 1158 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1158 +#define WT_STAT_CONN_DH_SWEEPS 1159 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1159 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1160 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1160 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1161 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1161 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1162 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1162 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1163 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1163 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1164 /*! 
* lock: commit timestamp queue lock application thread time waiting for * the dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1164 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1165 /*! * lock: commit timestamp queue lock internal thread time waiting for the * dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1165 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1166 /*! lock: commit timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1166 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1167 /*! lock: commit timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1167 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1168 /*! * lock: dhandle lock application thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1168 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1169 /*! * lock: dhandle lock internal thread time waiting for the dhandle lock * (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1169 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1170 /*! lock: dhandle read lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1170 +#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1171 /*! lock: dhandle write lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1171 +#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1172 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1172 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1173 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1173 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1174 /*! 
lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1174 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1175 /*! * lock: read timestamp queue lock application thread time waiting for * the dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1175 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1176 /*! * lock: read timestamp queue lock internal thread time waiting for the * dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1176 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1177 /*! lock: read timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1177 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1178 /*! lock: read timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1178 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1179 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1179 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1180 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1180 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1181 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1181 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1182 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1182 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1183 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1183 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1184 /*! lock: table read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1184 +#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1185 /*! 
lock: table write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1185 +#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1186 /*! * lock: txn global lock application thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1186 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1187 /*! * lock: txn global lock internal thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1187 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1188 /*! lock: txn global read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1188 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1189 /*! lock: txn global write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1189 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1190 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1190 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1191 /*! log: force checkpoint calls slept */ -#define WT_STAT_CONN_LOG_FORCE_CKPT_SLEEP 1191 +#define WT_STAT_CONN_LOG_FORCE_CKPT_SLEEP 1192 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1192 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1193 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1193 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1194 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1194 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1195 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1195 +#define WT_STAT_CONN_LOG_FLUSH 1196 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1196 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1197 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1197 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1198 /*! 
log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1198 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1199 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1199 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1200 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1200 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1201 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1201 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1202 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1202 +#define WT_STAT_CONN_LOG_SCANS 1203 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1203 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1204 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1204 +#define WT_STAT_CONN_LOG_WRITE_LSN 1205 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1205 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1206 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1206 +#define WT_STAT_CONN_LOG_SYNC 1207 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1207 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1208 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1208 +#define WT_STAT_CONN_LOG_SYNC_DIR 1209 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1209 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1210 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1210 +#define WT_STAT_CONN_LOG_WRITES 1211 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1211 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1212 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1212 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1213 /*! 
log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1213 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1214 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1214 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1215 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1215 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1216 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1216 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1217 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1217 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1218 /*! log: slot close lost race */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1218 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1219 /*! log: slot close unbuffered waits */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1219 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1220 /*! log: slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1220 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1221 /*! log: slot join atomic update races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1221 +#define WT_STAT_CONN_LOG_SLOT_RACES 1222 /*! log: slot join calls atomic updates raced */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1222 +#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1223 /*! log: slot join calls did not yield */ -#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1223 +#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1224 /*! log: slot join calls found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1224 +#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1225 /*! log: slot join calls slept */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1225 +#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1226 /*! log: slot join calls yielded */ -#define WT_STAT_CONN_LOG_SLOT_YIELD 1226 +#define WT_STAT_CONN_LOG_SLOT_YIELD 1227 /*! 
log: slot join found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1227 +#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1228 /*! log: slot joins yield time (usecs) */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1228 +#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1229 /*! log: slot transitions unable to find free slot */ -#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1229 +#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1230 /*! log: slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1230 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1231 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1231 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1232 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1232 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1233 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1233 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1234 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1234 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1235 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1235 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1236 /*! perf: file system read latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1236 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1237 /*! perf: file system read latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1237 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1238 /*! perf: file system read latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1238 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1239 /*! perf: file system read latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1239 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1240 /*! 
perf: file system read latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1240 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1241 /*! perf: file system read latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1241 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1242 /*! perf: file system write latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1242 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1243 /*! perf: file system write latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1243 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1244 /*! perf: file system write latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1244 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1245 /*! perf: file system write latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1245 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1246 /*! perf: file system write latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1246 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1247 /*! perf: file system write latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1247 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1248 /*! perf: operation read latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1248 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1249 /*! perf: operation read latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1249 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1250 /*! 
perf: operation read latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1250 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1251 /*! perf: operation read latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1251 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1252 /*! perf: operation read latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1252 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1253 /*! perf: operation write latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1253 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1254 /*! perf: operation write latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1254 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1255 /*! perf: operation write latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1255 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1256 /*! perf: operation write latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1256 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1257 /*! perf: operation write latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1257 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1258 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1258 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1259 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1259 +#define WT_STAT_CONN_REC_PAGES 1260 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1260 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1261 /*! 
reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1261 +#define WT_STAT_CONN_REC_PAGE_DELETE 1262 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1262 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1263 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1263 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1264 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1264 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1265 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1265 +#define WT_STAT_CONN_SESSION_OPEN 1266 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1266 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1267 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1267 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1268 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1268 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1269 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1269 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1270 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1270 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1271 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1271 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1272 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1272 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1273 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1273 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1274 /*! 
session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1274 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1275 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1275 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1276 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1276 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1277 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1277 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1278 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1278 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1279 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1279 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1280 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1280 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1281 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1281 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1282 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1282 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1283 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1283 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1284 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1284 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1285 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1285 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1286 /*! 
thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1286 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1287 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1287 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1288 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1288 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1289 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1289 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1290 /*! * thread-yield: connection close blocked waiting for transaction state * stabilization */ -#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1290 +#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1291 /*! thread-yield: connection close yielded for lsm manager shutdown */ -#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1291 +#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1292 /*! thread-yield: data handle lock yielded */ -#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1292 +#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1293 /*! * thread-yield: get reference for page index and slot time sleeping * (usecs) */ -#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1293 +#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1294 /*! thread-yield: log server sync yielded for log write */ -#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1294 +#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1295 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1295 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1296 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1296 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1297 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1297 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1298 /*! 
thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1298 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1299 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1299 +#define WT_STAT_CONN_PAGE_SLEEP 1300 /*! * thread-yield: page delete rollback time sleeping for state change * (usecs) */ -#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1300 +#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1301 /*! thread-yield: page reconciliation yielded due to child modification */ -#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1301 -/*! - * thread-yield: tree descend one level yielded for split page index - * update - */ -#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1302 +#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1302 /*! transaction: commit timestamp queue insert to empty */ #define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1303 /*! transaction: commit timestamp queue inserts to tail */ @@ -5825,220 +5824,222 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2067 /*! cache: pages read into cache */ #define WT_STAT_DSRC_CACHE_READ 2068 +/*! cache: pages read into cache after truncate */ +#define WT_STAT_DSRC_CACHE_READ_DELETED 2069 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2069 +#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2070 /*! cache: pages requested from the cache */ -#define WT_STAT_DSRC_CACHE_PAGES_REQUESTED 2070 +#define WT_STAT_DSRC_CACHE_PAGES_REQUESTED 2071 /*! cache: pages seen by eviction walk */ -#define WT_STAT_DSRC_CACHE_EVICTION_PAGES_SEEN 2071 +#define WT_STAT_DSRC_CACHE_EVICTION_PAGES_SEEN 2072 /*! cache: pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 2072 +#define WT_STAT_DSRC_CACHE_WRITE 2073 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2073 +#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2074 /*! 
cache: tracked dirty bytes in the cache */ -#define WT_STAT_DSRC_CACHE_BYTES_DIRTY 2074 +#define WT_STAT_DSRC_CACHE_BYTES_DIRTY 2075 /*! cache: unmodified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2075 +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2076 /*! * cache_walk: Average difference between current eviction generation * when the page was last considered, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_AVG_GAP 2076 +#define WT_STAT_DSRC_CACHE_STATE_GEN_AVG_GAP 2077 /*! * cache_walk: Average on-disk page image size seen, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_AVG_WRITTEN_SIZE 2077 +#define WT_STAT_DSRC_CACHE_STATE_AVG_WRITTEN_SIZE 2078 /*! * cache_walk: Average time in cache for pages that have been visited by * the eviction server, only reported if cache_walk or all statistics are * enabled */ -#define WT_STAT_DSRC_CACHE_STATE_AVG_VISITED_AGE 2078 +#define WT_STAT_DSRC_CACHE_STATE_AVG_VISITED_AGE 2079 /*! * cache_walk: Average time in cache for pages that have not been visited * by the eviction server, only reported if cache_walk or all statistics * are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_AVG_UNVISITED_AGE 2079 +#define WT_STAT_DSRC_CACHE_STATE_AVG_UNVISITED_AGE 2080 /*! * cache_walk: Clean pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_CLEAN 2080 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_CLEAN 2081 /*! * cache_walk: Current eviction generation, only reported if cache_walk * or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_CURRENT 2081 +#define WT_STAT_DSRC_CACHE_STATE_GEN_CURRENT 2082 /*! * cache_walk: Dirty pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_DIRTY 2082 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_DIRTY 2083 /*! 
* cache_walk: Entries in the root page, only reported if cache_walk or * all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_ROOT_ENTRIES 2083 +#define WT_STAT_DSRC_CACHE_STATE_ROOT_ENTRIES 2084 /*! * cache_walk: Internal pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_INTERNAL 2084 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_INTERNAL 2085 /*! * cache_walk: Leaf pages currently in cache, only reported if cache_walk * or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_LEAF 2085 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_LEAF 2086 /*! * cache_walk: Maximum difference between current eviction generation * when the page was last considered, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_MAX_GAP 2086 +#define WT_STAT_DSRC_CACHE_STATE_GEN_MAX_GAP 2087 /*! * cache_walk: Maximum page size seen, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MAX_PAGESIZE 2087 +#define WT_STAT_DSRC_CACHE_STATE_MAX_PAGESIZE 2088 /*! * cache_walk: Minimum on-disk page image size seen, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MIN_WRITTEN_SIZE 2088 +#define WT_STAT_DSRC_CACHE_STATE_MIN_WRITTEN_SIZE 2089 /*! * cache_walk: Number of pages never visited by eviction server, only * reported if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_UNVISITED_COUNT 2089 +#define WT_STAT_DSRC_CACHE_STATE_UNVISITED_COUNT 2090 /*! * cache_walk: On-disk page image sizes smaller than a single allocation * unit, only reported if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_SMALLER_ALLOC_SIZE 2090 +#define WT_STAT_DSRC_CACHE_STATE_SMALLER_ALLOC_SIZE 2091 /*! 
* cache_walk: Pages created in memory and never written, only reported * if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MEMORY 2091 +#define WT_STAT_DSRC_CACHE_STATE_MEMORY 2092 /*! * cache_walk: Pages currently queued for eviction, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_QUEUED 2092 +#define WT_STAT_DSRC_CACHE_STATE_QUEUED 2093 /*! * cache_walk: Pages that could not be queued for eviction, only reported * if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_NOT_QUEUEABLE 2093 +#define WT_STAT_DSRC_CACHE_STATE_NOT_QUEUEABLE 2094 /*! * cache_walk: Refs skipped during cache traversal, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_REFS_SKIPPED 2094 +#define WT_STAT_DSRC_CACHE_STATE_REFS_SKIPPED 2095 /*! * cache_walk: Size of the root page, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_ROOT_SIZE 2095 +#define WT_STAT_DSRC_CACHE_STATE_ROOT_SIZE 2096 /*! * cache_walk: Total number of pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES 2096 +#define WT_STAT_DSRC_CACHE_STATE_PAGES 2097 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2097 +#define WT_STAT_DSRC_COMPRESS_READ 2098 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2098 +#define WT_STAT_DSRC_COMPRESS_WRITE 2099 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2099 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2100 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2100 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2101 /*! 
compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2101 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2102 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2102 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2103 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2103 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2104 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2104 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2105 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2105 +#define WT_STAT_DSRC_CURSOR_CREATE 2106 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2106 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2107 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2107 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2108 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2108 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2109 /*! cursor: cursors cached on close */ -#define WT_STAT_DSRC_CURSOR_CACHE 2109 +#define WT_STAT_DSRC_CURSOR_CACHE 2110 /*! cursor: cursors reused from cache */ -#define WT_STAT_DSRC_CURSOR_REOPEN 2110 +#define WT_STAT_DSRC_CURSOR_REOPEN 2111 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2111 +#define WT_STAT_DSRC_CURSOR_INSERT 2112 /*! cursor: modify calls */ -#define WT_STAT_DSRC_CURSOR_MODIFY 2112 +#define WT_STAT_DSRC_CURSOR_MODIFY 2113 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2113 +#define WT_STAT_DSRC_CURSOR_NEXT 2114 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2114 +#define WT_STAT_DSRC_CURSOR_PREV 2115 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2115 +#define WT_STAT_DSRC_CURSOR_REMOVE 2116 /*! 
cursor: reserve calls */ -#define WT_STAT_DSRC_CURSOR_RESERVE 2116 +#define WT_STAT_DSRC_CURSOR_RESERVE 2117 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2117 +#define WT_STAT_DSRC_CURSOR_RESET 2118 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2118 +#define WT_STAT_DSRC_CURSOR_RESTART 2119 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2119 +#define WT_STAT_DSRC_CURSOR_SEARCH 2120 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2120 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2121 /*! cursor: truncate calls */ -#define WT_STAT_DSRC_CURSOR_TRUNCATE 2121 +#define WT_STAT_DSRC_CURSOR_TRUNCATE 2122 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2122 +#define WT_STAT_DSRC_CURSOR_UPDATE 2123 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2123 +#define WT_STAT_DSRC_REC_DICTIONARY 2124 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2124 +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2125 /*! * reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2125 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2126 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2126 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2127 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2127 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2128 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2128 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2129 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2129 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2130 /*! 
reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2130 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2131 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2131 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2132 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2132 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2133 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2133 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2134 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2134 +#define WT_STAT_DSRC_REC_PAGES 2135 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2135 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2136 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2136 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2137 /*! session: cached cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_CACHED 2137 +#define WT_STAT_DSRC_SESSION_CURSOR_CACHED 2138 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2138 +#define WT_STAT_DSRC_SESSION_COMPACT 2139 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2139 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2140 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2140 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2141 /*! 
* @} diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c index 811c0576eef..7875a6be028 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fs.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c @@ -39,6 +39,8 @@ __posix_sync( WT_DECL_RET; #if defined(F_FULLFSYNC) + static bool fullfsync_error_logged = false; + /* * OS X fsync documentation: * "Note that while fsync() will flush all data from the host to the @@ -56,10 +58,16 @@ __posix_sync( WT_SYSCALL_RETRY(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret); if (ret == 0) return (0); + /* * Assume F_FULLFSYNC failed because the file system doesn't support it * and fallback to fsync. */ + if (!fullfsync_error_logged) { + fullfsync_error_logged = true; + __wt_err(session, ret, + "fcntl(F_FULLFSYNC) failed, falling back to fsync"); + } #endif #if defined(HAVE_FDATASYNC) WT_SYSCALL_RETRY(fdatasync(fd), ret); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 1c46da9be10..8bc022cd3e3 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -1345,7 +1345,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * globally visible, need to check the update state as well. */ if (F_ISSET(r, WT_REC_EVICT) && - (upd->state != WT_UPDATE_STATE_READY || + (upd->prepare_state == WT_PREPARE_LOCKED || + upd->prepare_state == WT_PREPARE_INPROGRESS || (F_ISSET(r, WT_REC_VISIBLE_ALL) ? WT_TXNID_LE(r->last_running, txnid) : !__txn_visible_id(session, txnid)))) { @@ -1631,10 +1632,12 @@ __rec_child_deleted(WT_SESSION_IMPL *session, * it holds the transaction ID we care about. * * In some cases, there had better not be any updates we can't see. + * + * A visible update to be in READY state (i.e. not in LOCKED or + * PREPARED state), for truly visible to others. 
*/ if (F_ISSET(r, WT_REC_VISIBILITY_ERR) && page_del != NULL && - !__wt_txn_visible(session, - page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp))) + __wt_page_del_active(session, ref, false)) WT_PANIC_RET(session, EINVAL, "reconciliation illegally skipped an update"); @@ -1662,9 +1665,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, * read into this part of the name space again, the cache read function * instantiates an entirely new page.) */ - if (ref->addr != NULL && - (page_del == NULL || __wt_txn_visible_all( - session, page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp)))) + if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) WT_RET(__wt_ref_block_free(session, ref)); /* @@ -1709,10 +1710,11 @@ __rec_child_deleted(WT_SESSION_IMPL *session, * page to reference it from the parent page. * * If the delete is not visible in this checkpoint, write the original - * address normally. Otherwise, we have to write a proxy record. + * address normally. Otherwise, we have to write a proxy record. + * If the delete state is not ready, then delete is not visible as it + * is in prepared state. */ - if (__wt_txn_visible( - session, page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp))) + if (!__wt_page_del_active(session, ref, false)) *statep = WT_CHILD_PROXY; return (0); @@ -1838,6 +1840,11 @@ __rec_child_modify(WT_SESSION_IMPL *session, * * This call cannot return split/restart, we have a lock * on the parent which prevents a child page split. + * + * Set WT_READ_NO_WAIT because we're only interested in + * the WT_REF's final state. Pages in transition might + * change WT_REF state during our read, and then return + * WT_NOTFOUND to us. In that case, loop and look again. 
*/ ret = __wt_page_in(session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | diff --git a/src/third_party/wiredtiger/src/schema/schema_alter.c b/src/third_party/wiredtiger/src/schema/schema_alter.c index a957969e6cc..2ebfcfc5d9d 100644 --- a/src/third_party/wiredtiger/src/schema/schema_alter.c +++ b/src/third_party/wiredtiger/src/schema/schema_alter.c @@ -7,41 +7,33 @@ */ #include "wt_internal.h" +static int __schema_alter(WT_SESSION_IMPL *, const char *, const char *[]); /* - * __wt_alter -- - * Alter a file. + * __alter_apply -- + * Alter an object */ -int -__wt_alter(WT_SESSION_IMPL *session, const char *newcfg[]) +static int +__alter_apply(WT_SESSION_IMPL *session, + const char *uri, const char *newcfg[], const char *base_config) { WT_DECL_RET; - const char *cfg[4], *filename, *uri; + const char *cfg[4]; char *config, *newconfig; - uri = session->dhandle->name; - WT_RET(__wt_meta_track_on(session)); - - /* - * We know that we have exclusive access to the file. So it will be - * closed after we're done with it and the next open will see the - * updated metadata. - */ - filename = uri; newconfig = NULL; - if (!WT_PREFIX_SKIP(filename, "file:")) - return (__wt_unexpected_object_type(session, uri, "file:")); /* Find the URI */ WT_RET(__wt_metadata_search(session, uri, &config)); WT_ASSERT(session, newcfg[0] != NULL); + /* * Start with the base configuration because collapse is like * a projection and if we are reading older metadata, it may not * have all the components. */ - cfg[0] = WT_CONFIG_BASE(session, file_meta); + cfg[0] = base_config; cfg[1] = config; cfg[2] = newcfg[0]; cfg[3] = NULL; @@ -63,7 +55,176 @@ err: __wt_free(session, config); if (ret == WT_NOTFOUND) ret = ENOENT; - WT_TRET(__wt_meta_track_off(session, true, ret != 0)); + return (ret); +} + +/* + * __alter_file -- + * Alter a file. + */ +static int +__alter_file(WT_SESSION_IMPL *session, const char *newcfg[]) +{ + const char *uri; + + /* + * We know that we have exclusive access to the file. 
So it will be + * closed after we're done with it and the next open will see the + * updated metadata. + */ + uri = session->dhandle->name; + if (!WT_PREFIX_MATCH(uri, "file:")) + return (__wt_unexpected_object_type(session, uri, "file:")); + + return (__alter_apply(session, + uri, newcfg, WT_CONFIG_BASE(session, file_meta))); +} + +/* + * __alter_tree -- + * Alter an index or colgroup reference. + */ +static int +__alter_tree(WT_SESSION_IMPL *session, const char *name, const char *newcfg[]) +{ + WT_CONFIG_ITEM cval; + WT_DECL_ITEM(data_source); + WT_DECL_RET; + char *value; + bool is_colgroup; + + value = NULL; + + is_colgroup = WT_PREFIX_MATCH(name, "colgroup:"); + if (!is_colgroup && !WT_PREFIX_MATCH(name, "index:")) + return (__wt_unexpected_object_type( + session, name, "'colgroup:' or 'index:'")); + + /* Read the schema value. */ + WT_ERR(__wt_metadata_search(session, name, &value)); + + /* Get the data source URI. */ + if ((ret = __wt_config_getones(session, value, "source", &cval)) != 0) + WT_ERR_MSG(session, EINVAL, + "index or column group has no data source: %s", value); + WT_ERR(__wt_scr_alloc(session, 0, &data_source)); + WT_ERR(__wt_buf_fmt(session, + data_source, "%.*s", (int)cval.len, cval.str)); + + /* Alter the data source */ + WT_ERR(__schema_alter(session, data_source->data, newcfg)); + + /* Alter the index or colgroup */ + if (is_colgroup) + WT_ERR(__alter_apply(session, + name, newcfg, WT_CONFIG_BASE(session, colgroup_meta))); + else + WT_ERR(__alter_apply(session, + name, newcfg, WT_CONFIG_BASE(session, index_meta))); + +err: __wt_scr_free(session, &data_source); + __wt_free(session, value); + return (ret); +} + +/* + * __alter_table -- + * Alter a table. 
+ */ +static int +__alter_table(WT_SESSION_IMPL *session, const char *uri, const char *newcfg[]) +{ + WT_COLGROUP *colgroup; + WT_DECL_RET; + WT_INDEX *idx; + WT_TABLE *table; + u_int i; + const char *name; + + colgroup = NULL; + table = NULL; + name = uri; + WT_PREFIX_SKIP_REQUIRED(session, name, "table:"); + + /* + * Open the table so we can alter its column groups and indexes, keeping + * the table locked exclusive across the alter. + */ + WT_RET(__wt_schema_get_table_uri(session, uri, true, + WT_DHANDLE_EXCLUSIVE, &table)); + /* Meta tracking needs to be used because alter needs to be atomic. */ + WT_ASSERT(session, WT_META_TRACKING(session)); + WT_WITH_DHANDLE(session, &table->iface, + ret = __wt_meta_track_handle_lock(session, false)); + WT_RET(ret); + + /* Alter the column groups. */ + for (i = 0; i < WT_COLGROUPS(table); i++) { + if ((colgroup = table->cgroups[i]) == NULL) + continue; + WT_RET(__alter_tree(session, colgroup->name, newcfg)); + } + + /* Alter the indices. */ + WT_RET(__wt_schema_open_indices(session, table)); + for (i = 0; i < table->nindices; i++) { + if ((idx = table->indices[i]) == NULL) + continue; + WT_RET(__alter_tree(session, idx->name, newcfg)); + } + + /* Alter the table */ + WT_RET(__alter_apply(session, + uri, newcfg, WT_CONFIG_BASE(session, table_meta))); + + return (ret); +} + +/* + * __schema_alter -- + * Alter an object. + */ +static int +__schema_alter(WT_SESSION_IMPL *session, const char *uri, const char *newcfg[]) +{ + uint32_t flags; + + /* + * The alter flag is used so LSM can apply some special logic, the + * exclusive flag avoids conflicts with other operations and the lock + * only flag is required because we don't need to have a handle to + * update the metadata and opening the handle causes problems when + * meta tracking is enabled. 
+ */ + flags = WT_BTREE_ALTER | WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY; + if (WT_PREFIX_MATCH(uri, "file:")) + return (__wt_exclusive_handle_operation( + session, uri, __alter_file, newcfg, flags)); + if (WT_PREFIX_MATCH(uri, "colgroup:") || + WT_PREFIX_MATCH(uri, "index:")) + return (__alter_tree(session, uri, newcfg)); + if (WT_PREFIX_MATCH(uri, "lsm:")) + return (__wt_lsm_tree_worker(session, uri, __alter_file, + NULL, newcfg, flags)); + if (WT_PREFIX_MATCH(uri, "table:")) + return (__alter_table(session, uri, newcfg)); + + return (__wt_bad_object_type(session, uri)); +} + +/* + * __wt_schema_alter -- + * Alter an object. + */ +int +__wt_schema_alter(WT_SESSION_IMPL *session, + const char *uri, const char *newcfg[]) +{ + WT_DECL_RET; + + WT_RET(__wt_meta_track_on(session)); + ret = __schema_alter(session, uri, newcfg); + WT_TRET(__wt_meta_track_off(session, true, ret != 0)); return (ret); } diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c index 407550bfdba..aa38ad79bee 100644 --- a/src/third_party/wiredtiger/src/schema/schema_worker.c +++ b/src/third_party/wiredtiger/src/schema/schema_worker.c @@ -9,6 +9,36 @@ #include "wt_internal.h" /* + * __wt_exclusive_handle_operation -- + * Get exclusive access to a file and apply a function. + */ +int +__wt_exclusive_handle_operation(WT_SESSION_IMPL *session, + const char *uri, + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + const char *cfg[], uint32_t open_flags) +{ + WT_DECL_RET; + + /* + * If the operation requires exclusive access, close + * any open file handles, including checkpoints. 
+ */ + if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) { + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __wt_conn_dhandle_close_all( + session, uri, false, false)); + WT_RET(ret); + } + + WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, open_flags)); + WT_SAVE_DHANDLE(session, ret = file_func(session, cfg)); + WT_TRET(__wt_session_release_dhandle(session)); + + return (ret); +} + +/* * __wt_schema_worker -- * Get Btree handles for the object and cycle through calls to an * underlying worker function with each handle. @@ -41,25 +71,9 @@ __wt_schema_worker(WT_SESSION_IMPL *session, /* Get the btree handle(s) and call the underlying function. */ if (WT_PREFIX_MATCH(uri, "file:")) { - if (file_func != NULL) { - /* - * If the operation requires exclusive access, close - * any open file handles, including checkpoints. - */ - if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) { - WT_WITH_HANDLE_LIST_WRITE_LOCK(session, - ret = __wt_conn_dhandle_close_all( - session, uri, false, false)); - WT_ERR(ret); - } - - WT_ERR(__wt_session_get_btree_ckpt( - session, uri, cfg, open_flags)); - WT_SAVE_DHANDLE(session, - ret = file_func(session, cfg)); - WT_TRET(__wt_session_release_dhandle(session)); - WT_ERR(ret); - } + if (file_func != NULL) + WT_ERR(__wt_exclusive_handle_operation(session, + uri, file_func, cfg, open_flags)); } else if (WT_PREFIX_MATCH(uri, "colgroup:")) { WT_ERR(__wt_schema_get_colgroup( session, uri, false, NULL, &colgroup)); diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 8db4c5a7615..cd06073a120 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -578,7 +578,7 @@ __wt_open_cursor(WT_SESSION_IMPL *session, /* We do not cache any subordinate tables/files cursors. 
*/ if (owner == NULL) { if ((ret = __wt_cursor_cache_get( - session, uri, cfg, cursorp)) == 0) + session, uri, NULL, cfg, cursorp)) == 0) return (0); WT_RET_NOTFOUND_OK(ret); } @@ -605,35 +605,37 @@ __session_open_cursor(WT_SESSION *wt_session, session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, open_cursor, config, cfg); - if (to_dup == NULL) { + statjoin = (to_dup != NULL && uri != NULL && + WT_STREQ(uri, "statistics:join")); + if (!statjoin) { + if ((to_dup == NULL && uri == NULL) || + (to_dup != NULL && uri != NULL)) + WT_ERR_MSG(session, EINVAL, + "should be passed either a URI or a cursor to " + "duplicate, but not both"); + if ((ret = __wt_cursor_cache_get( - session, uri, cfg, cursorp)) == 0) + session, uri, to_dup, cfg, &cursor)) == 0) goto done; - WT_RET_NOTFOUND_OK(ret); - } + WT_ERR_NOTFOUND_OK(ret); - statjoin = (to_dup != NULL && uri != NULL && - WT_STREQ(uri, "statistics:join")); - if ((to_dup == NULL && uri == NULL) || - (to_dup != NULL && uri != NULL && !statjoin)) - WT_ERR_MSG(session, EINVAL, - "should be passed either a URI or a cursor to duplicate, " - "but not both"); - - if (to_dup != NULL && !statjoin) { - uri = to_dup->uri; - if (!WT_PREFIX_MATCH(uri, "colgroup:") && - !WT_PREFIX_MATCH(uri, "index:") && - !WT_PREFIX_MATCH(uri, "file:") && - !WT_PREFIX_MATCH(uri, "lsm:") && - !WT_PREFIX_MATCH(uri, WT_METADATA_URI) && - !WT_PREFIX_MATCH(uri, "table:") && - __wt_schema_get_source(session, uri) == NULL) - WT_ERR(__wt_bad_object_type(session, uri)); + if (to_dup != NULL) { + uri = to_dup->uri; + if (!WT_PREFIX_MATCH(uri, "colgroup:") && + !WT_PREFIX_MATCH(uri, "index:") && + !WT_PREFIX_MATCH(uri, "file:") && + !WT_PREFIX_MATCH(uri, "lsm:") && + !WT_PREFIX_MATCH(uri, WT_METADATA_URI) && + !WT_PREFIX_MATCH(uri, "table:") && + __wt_schema_get_source(session, uri) == NULL) + WT_ERR(__wt_bad_object_type(session, uri)); + } } WT_ERR(__session_open_cursor_int(session, uri, NULL, statjoin ? 
to_dup : NULL, cfg, &cursor)); + +done: if (to_dup != NULL && !statjoin) WT_ERR(__wt_cursor_dup_position(to_dup, cursor)); @@ -643,7 +645,6 @@ __session_open_cursor(WT_SESSION *wt_session, err: if (cursor != NULL) WT_TRET(cursor->close(cursor)); } -done: /* * Opening a cursor on a non-existent data source will set ret to * either of ENOENT or WT_NOTFOUND at this point. However, @@ -687,8 +688,7 @@ __session_alter(WT_SESSION *wt_session, const char *uri, const char *config) cfg[1] = NULL; WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_worker(session, uri, __wt_alter, NULL, cfg, - WT_BTREE_ALTER | WT_DHANDLE_EXCLUSIVE))); + ret = __wt_schema_alter(session, uri, cfg))); err: if (ret != 0) diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index ae13f7d8abe..c418591d294 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -72,6 +72,7 @@ static const char * const __stats_dsrc_desc[] = { "cache: page split during eviction deepened the tree", "cache: page written requiring lookaside records", "cache: pages read into cache", + "cache: pages read into cache after truncate", "cache: pages read into cache requiring lookaside entries", "cache: pages requested from the cache", "cache: pages seen by eviction walk", @@ -255,6 +256,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_eviction_deepen = 0; stats->cache_write_lookaside = 0; stats->cache_read = 0; + stats->cache_read_deleted = 0; stats->cache_read_lookaside = 0; stats->cache_pages_requested = 0; stats->cache_eviction_pages_seen = 0; @@ -435,6 +437,7 @@ __wt_stat_dsrc_aggregate_single( to->cache_eviction_deepen += from->cache_eviction_deepen; to->cache_write_lookaside += from->cache_write_lookaside; to->cache_read += from->cache_read; + to->cache_read_deleted += from->cache_read_deleted; to->cache_read_lookaside += from->cache_read_lookaside; 
to->cache_pages_requested += from->cache_pages_requested; to->cache_eviction_pages_seen += from->cache_eviction_pages_seen; @@ -633,6 +636,7 @@ __wt_stat_dsrc_aggregate( to->cache_write_lookaside += WT_STAT_READ(from, cache_write_lookaside); to->cache_read += WT_STAT_READ(from, cache_read); + to->cache_read_deleted += WT_STAT_READ(from, cache_read_deleted); to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); to->cache_pages_requested += WT_STAT_READ(from, cache_pages_requested); @@ -844,6 +848,7 @@ static const char * const __stats_connection_desc[] = { "cache: pages queued for urgent eviction", "cache: pages queued for urgent eviction during walk", "cache: pages read into cache", + "cache: pages read into cache after truncate", "cache: pages read into cache requiring lookaside entries", "cache: pages read into cache skipping older lookaside entries", "cache: pages read into cache with skipped lookaside entries needed later", @@ -1040,7 +1045,6 @@ static const char * const __stats_connection_desc[] = { "thread-yield: page acquire time sleeping (usecs)", "thread-yield: page delete rollback time sleeping for state change (usecs)", "thread-yield: page reconciliation yielded due to child modification", - "thread-yield: tree descend one level yielded for split page index update", "transaction: commit timestamp queue insert to empty", "transaction: commit timestamp queue inserts to tail", "transaction: commit timestamp queue inserts total", @@ -1233,6 +1237,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_eviction_pages_queued_urgent = 0; stats->cache_eviction_pages_queued_oldest = 0; stats->cache_read = 0; + stats->cache_read_deleted = 0; stats->cache_read_lookaside = 0; stats->cache_read_lookaside_skipped = 0; stats->cache_read_lookaside_delay = 0; @@ -1429,7 +1434,6 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->page_sleep = 0; stats->page_del_rollback_blocked = 0; 
stats->child_modify_blocked_page = 0; - stats->tree_descend_blocked = 0; stats->txn_commit_queue_empty = 0; stats->txn_commit_queue_tail = 0; stats->txn_commit_queue_inserts = 0; @@ -1658,6 +1662,7 @@ __wt_stat_connection_aggregate( to->cache_eviction_pages_queued_oldest += WT_STAT_READ(from, cache_eviction_pages_queued_oldest); to->cache_read += WT_STAT_READ(from, cache_read); + to->cache_read_deleted += WT_STAT_READ(from, cache_read_deleted); to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); to->cache_read_lookaside_skipped += WT_STAT_READ(from, cache_read_lookaside_skipped); @@ -1944,7 +1949,6 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, page_del_rollback_blocked); to->child_modify_blocked_page += WT_STAT_READ(from, child_modify_blocked_page); - to->tree_descend_blocked += WT_STAT_READ(from, tree_descend_blocked); to->txn_commit_queue_empty += WT_STAT_READ(from, txn_commit_queue_empty); to->txn_commit_queue_tail += diff --git a/src/third_party/wiredtiger/src/support/thread_group.c b/src/third_party/wiredtiger/src/support/thread_group.c index 50abe64bbe6..4597d26496d 100644 --- a/src/third_party/wiredtiger/src/support/thread_group.c +++ b/src/third_party/wiredtiger/src/support/thread_group.c @@ -81,8 +81,8 @@ __thread_group_shrink( WT_ASSERT(session, thread->tid.created); __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Stopping utility thread: %p:%" PRIu32, - (void *)group, thread->id); + "Stopping utility thread: %s:%" PRIu32, + group->name, thread->id); if (F_ISSET(thread, WT_THREAD_ACTIVE)) --group->current_threads; F_CLR(thread, WT_THREAD_ACTIVE | WT_THREAD_RUN); @@ -143,9 +143,9 @@ __thread_group_resize( thread = NULL; __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Resize thread group: %p, from min: %" PRIu32 " -> %" PRIu32 + "Resize thread group: %s, from min: %" PRIu32 " -> %" PRIu32 " from max: %" PRIu32 " -> %" PRIu32, - (void *)group, group->min, new_min, group->max, new_max); + group->name, group->min, new_min, group->max, 
new_max); WT_ASSERT(session, group->current_threads <= group->alloc && @@ -155,7 +155,10 @@ __thread_group_resize( return (0); if (new_min > new_max) - return (EINVAL); + WT_RET_MSG(session, EINVAL, + "Illegal thread group resize: %s, from min: %" PRIu32 + " -> %" PRIu32 " from max: %" PRIu32 " -> %" PRIu32, + group->name, group->min, new_min, group->max, new_max); /* * Call shrink to reduce the number of thread structures and running @@ -205,8 +208,8 @@ __thread_group_resize( * number later. */ __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Starting utility thread: %p:%" PRIu32, - (void *)group, thread->id); + "Starting utility thread: %s:%" PRIu32, + group->name, thread->id); F_SET(thread, WT_THREAD_RUN); WT_ERR(__wt_thread_create(thread->session, &thread->tid, __thread_run, thread)); @@ -285,8 +288,8 @@ __wt_thread_group_create( cond_alloced = false; - __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Creating thread group: %p", (void *)group); + __wt_verbose(session, + WT_VERB_THREAD_GROUP, "Creating thread group: %s", name); WT_RET(__wt_rwlock_init(session, &group->lock)); WT_ERR(__wt_cond_alloc( @@ -321,7 +324,7 @@ __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) WT_DECL_RET; __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Destroying thread group: %p", (void *)group); + "Destroying thread group: %s", group->name); WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); @@ -364,8 +367,8 @@ __wt_thread_group_start_one( thread = group->threads[group->current_threads++]; WT_ASSERT(session, thread != NULL); __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Activating utility thread: %p:%" PRIu32, - (void *)group, thread->id); + "Activating utility thread: %s:%" PRIu32, + group->name, thread->id); WT_ASSERT(session, !F_ISSET(thread, WT_THREAD_ACTIVE)); F_SET(thread, WT_THREAD_ACTIVE); __wt_cond_signal(session, thread->pause_cond); @@ -391,8 +394,8 @@ __wt_thread_group_stop_one(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) if 
(group->current_threads > group->min) { thread = group->threads[--group->current_threads]; __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Pausing utility thread: %p:%" PRIu32, - (void *)group, thread->id); + "Pausing utility thread: %s:%" PRIu32, + group->name, thread->id); WT_ASSERT(session, F_ISSET(thread, WT_THREAD_ACTIVE)); F_CLR(thread, WT_THREAD_ACTIVE); __wt_cond_signal(session, thread->pause_cond); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 3a9b3755ff5..b2952cbec46 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -677,7 +677,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_UPDATE **updp; wt_timestamp_t prev_commit_timestamp, ts; uint32_t previous_state; - bool update_timestamp; + bool prepared_transaction, update_timestamp; #endif txn = &session->txn; @@ -698,8 +698,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) if (cval.len != 0) { #ifdef HAVE_TIMESTAMPS WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval)); - WT_ERR(__wt_timestamp_validate(session, - "commit", &ts, &cval, true, true, true)); + WT_ERR(__wt_timestamp_validate(session, "commit", &ts, &cval)); __wt_timestamp_set(&txn->commit_timestamp, &ts); __wt_txn_set_commit_timestamp(session); #else @@ -794,6 +793,9 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) /* Note: we're going to commit: nothing can fail after this point. */ +#ifdef HAVE_TIMESTAMPS + prepared_transaction = F_ISSET(txn, WT_TXN_PREPARE); +#endif /* Process and free updates. 
*/ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { switch (op->type) { @@ -827,7 +829,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) if (!__wt_txn_update_needs_timestamp(session, op)) break; - if (F_ISSET(txn, WT_TXN_PREPARE)) { + if (prepared_transaction) { /* * In case of a prepared transaction, the order * of modification of the prepare timestamp to @@ -839,10 +841,12 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) * As updating timestamp might not be an atomic * operation, we will manage using state. */ - upd->state = WT_UPDATE_STATE_LOCKED; + upd->prepare_state = WT_PREPARE_LOCKED; + WT_WRITE_BARRIER(); __wt_timestamp_set( &upd->timestamp, &txn->commit_timestamp); - upd->state = WT_UPDATE_STATE_READY; + WT_PUBLISH(upd->prepare_state, + WT_PREPARE_RESOLVED); } else __wt_timestamp_set( &upd->timestamp, &txn->commit_timestamp); @@ -855,8 +859,21 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) break; ref = op->u.ref; - __wt_timestamp_set( - &ref->page_del->timestamp, &txn->commit_timestamp); + if (prepared_transaction) { + /* + * As updating timestamp might not be an atomic + * operation, we will manage using state. + */ + ref->page_del->prepare_state = + WT_PREPARE_LOCKED; + WT_WRITE_BARRIER(); + __wt_timestamp_set(&ref->page_del->timestamp, + &txn->commit_timestamp); + WT_PUBLISH(ref->page_del->prepare_state, + WT_PREPARE_RESOLVED); + } else + __wt_timestamp_set(&ref->page_del->timestamp, + &txn->commit_timestamp); /* * The page-deleted list can be discarded by eviction, @@ -872,11 +889,35 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) break; } - if ((updp = ref->page_del->update_list) != NULL) - for (; *updp != NULL; ++updp) + if ((updp = ref->page_del->update_list) == NULL) { + /* + * Publish to ensure we don't let the page be + * evicted and the updates discarded before + * being written. 
+ */ + WT_PUBLISH(ref->state, previous_state); + break; + } + + for (; *updp != NULL; ++updp) { + if (prepared_transaction) { + /* + * As ref state is LOCKED, timestamp + * and prepare state are updated in + * exclusive access, hence no need for + * temporary state WT_PREPARE_LOCKED + * and BARRIER. + */ __wt_timestamp_set( &(*updp)->timestamp, &txn->commit_timestamp); + (*updp)->prepare_state = + WT_PREPARE_RESOLVED; + } else + __wt_timestamp_set( + &(*updp)->timestamp, + &txn->commit_timestamp); + } /* * Publish to ensure we don't let the page be evicted @@ -980,7 +1021,6 @@ int __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) { #ifdef HAVE_TIMESTAMPS - WT_CONFIG_ITEM cval; WT_TXN *txn; WT_TXN_OP *op; WT_UPDATE *upd; @@ -990,22 +1030,14 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) txn = &session->txn; WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); - /* Transaction should not have a commit timestamp set. */ - WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)); WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); /* Transaction should not have updated any of the logged tables. */ WT_ASSERT(session, txn->logrec == NULL); WT_RET(__wt_txn_context_check(session, true)); - /* Look for a prepare timestamp. */ - WT_RET( - __wt_config_gets_def(session, cfg, "prepare_timestamp", 0, &cval)); - if (cval.len == 0) - WT_RET_MSG(session, EINVAL, "prepare timestamp is required"); - - /* TODO : Validate prepare timestamp. */ - WT_RET(__wt_txn_parse_timestamp(session, "prepare", &ts, &cval)); + /* Parse and validate the prepare timestamp. */ + WT_RET(__wt_txn_parse_prepare_timestamp(session, cfg, &ts)); __wt_timestamp_set(&txn->prepare_timestamp, &ts); /* @@ -1051,11 +1083,13 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) /* Set prepare timestamp. 
*/ __wt_timestamp_set(&upd->timestamp, &ts); - upd->state = WT_UPDATE_STATE_PREPARED; + WT_PUBLISH(upd->prepare_state, WT_PREPARE_INPROGRESS); break; case WT_TXN_OP_REF_DELETE: __wt_timestamp_set( &op->u.ref->page_del->timestamp, &ts); + WT_PUBLISH(op->u.ref->page_del->prepare_state, + WT_PREPARE_INPROGRESS); break; case WT_TXN_OP_TRUNCATE_COL: case WT_TXN_OP_TRUNCATE_ROW: diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 2266a9cd6f5..6fd82db5917 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -567,13 +567,13 @@ set: __wt_writelock(session, &txn_global->rwlock); #ifdef HAVE_TIMESTAMPS /* * __wt_timestamp_validate -- - * Validate a timestamp to be not older than the global oldest and/or - * global stable and/or running transaction commit timestamp. + * Validate a timestamp to be not older than the global oldest and global + * stable and running transaction commit timestamp and running transaction + * prepare timestamp. 
*/ int __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, - wt_timestamp_t *ts, WT_CONFIG_ITEM *cval, - bool cmp_oldest, bool cmp_stable, bool cmp_commit) + wt_timestamp_t *ts, WT_CONFIG_ITEM *cval) { WT_TXN *txn = &session->txn; WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; @@ -596,16 +596,14 @@ __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, if ((has_stable_ts = txn_global->has_stable_timestamp)) __wt_timestamp_set(&stable_ts, &txn_global->stable_timestamp)); - if (cmp_oldest && has_oldest_ts && - __wt_timestamp_cmp(ts, &oldest_ts) < 0) { + if (has_oldest_ts && __wt_timestamp_cmp(ts, &oldest_ts) < 0) { WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp, &oldest_ts)); WT_RET_MSG(session, EINVAL, "%s timestamp %.*s older than oldest timestamp %s", name, (int)cval->len, cval->str, hex_timestamp); } - if (cmp_stable && has_stable_ts && - __wt_timestamp_cmp(ts, &stable_ts) < 0) { + if (has_stable_ts && __wt_timestamp_cmp(ts, &stable_ts) < 0) { WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp, &stable_ts)); WT_RET_MSG(session, EINVAL, @@ -618,7 +616,7 @@ __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, * Return an error if the given timestamp is older than the first * commit timestamp. */ - if (cmp_commit && F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && + if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && __wt_timestamp_cmp(ts, &txn->first_commit_timestamp) < 0) { WT_RET(__wt_timestamp_to_hex_string( session, hex_timestamp, &txn->first_commit_timestamp)); @@ -628,6 +626,21 @@ __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, name, (int)cval->len, cval->str, hex_timestamp); } + /* + * Compare against the prepare timestamp of the current transaction. + * Return an error if the given timestamp is older than the prepare + * timestamp. 
+ */ + if (F_ISSET(txn, WT_TXN_PREPARE) && + __wt_timestamp_cmp(ts, &txn->prepare_timestamp) < 0) { + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp, &txn->prepare_timestamp)); + WT_RET_MSG(session, EINVAL, + "%s timestamp %.*s older than the prepare timestamp %s " + "for this transaction", + name, (int)cval->len, cval->str, hex_timestamp); + } + return (0); } #endif @@ -651,8 +664,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_TRET(__wt_txn_context_check(session, true)); WT_RET(__wt_txn_parse_timestamp(session, "commit", &ts, &cval)); - WT_RET(__wt_timestamp_validate(session, - "commit", &ts, &cval, true, true, true)); + WT_RET(__wt_timestamp_validate(session, "commit", &ts, &cval)); __wt_timestamp_set(&txn->commit_timestamp, &ts); __wt_txn_set_commit_timestamp(session); #else @@ -669,6 +681,83 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) } /* + * __wt_txn_parse_prepare_timestamp -- + * Parse a request to set a transaction's prepare_timestamp. + */ +int +__wt_txn_parse_prepare_timestamp( + WT_SESSION_IMPL *session, const char *cfg[], wt_timestamp_t *timestamp) +{ + WT_CONFIG_ITEM cval; + + WT_RET(__wt_config_gets_def(session, + cfg, "prepare_timestamp", 0, &cval)); + if (cval.len > 0) { +#ifdef HAVE_TIMESTAMPS + WT_TXN *prev; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t oldest_ts; + char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 1]; + + txn_global = &S2C(session)->txn_global; + + if (F_ISSET(&session->txn, WT_TXN_HAS_TS_COMMIT)) + WT_RET_MSG(session, EINVAL, + "commit timestamp should not have been set before " + "prepare transaction"); + + WT_RET(__wt_txn_parse_timestamp( + session, "prepare", timestamp, &cval)); + + /* + * Prepare timestamp must be later/greater than latest active + * read timestamp. 
+ */ + __wt_readlock(session, &txn_global->read_timestamp_rwlock); + prev = TAILQ_LAST(&txn_global->read_timestamph, + __wt_txn_rts_qh); + if (prev != NULL && + __wt_timestamp_cmp(&prev->read_timestamp, timestamp) >= 0) { + __wt_readunlock(session, + &txn_global->read_timestamp_rwlock); + WT_RET(__wt_timestamp_to_hex_string(session, + hex_timestamp, &prev->read_timestamp)); + WT_RET_MSG(session, EINVAL, + "prepare timestamp %.*s not later than an active " + "read timestamp %s ", (int)cval.len, cval.str, + hex_timestamp); + } + __wt_readunlock(session, &txn_global->read_timestamp_rwlock); + + /* + * If there are no active readers, prepare timestamp must not + * be older than oldest timestamp. + */ + if (prev == NULL) { + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set(&oldest_ts, + &txn_global->oldest_timestamp)); + + if (__wt_timestamp_cmp(timestamp, &oldest_ts) < 0) { + WT_RET(__wt_timestamp_to_hex_string(session, + hex_timestamp, &oldest_ts)); + WT_RET_MSG(session, EINVAL, + "prepare timestamp %.*s is older than the " + "oldest timestamp %s ", (int)cval.len, + cval.str, hex_timestamp); + } + } +#else + WT_UNUSED(timestamp); + WT_RET_MSG(session, EINVAL, "prepare_timestamp requires a " + "version of WiredTiger built with timestamp support"); +#endif + } else + WT_RET_MSG(session, EINVAL, "prepare timestamp is required"); + + return (0); +} +/* * __wt_txn_parse_read_timestamp -- * Parse a request to set a transaction's read_timestamp. */ @@ -691,10 +780,17 @@ __wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) txn_global = &S2C(session)->txn_global; WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval)); - /* Read timestamps imply / require snapshot isolation. */ + /* + * Read timestamps imply / require snapshot isolation. + * + * If we already have a snapshot, it may be too early + * to match the timestamp. Get a new one. 
+ */ if (!F_ISSET(txn, WT_TXN_RUNNING)) txn->isolation = WT_ISO_SNAPSHOT; - else if (txn->isolation != WT_ISO_SNAPSHOT) + else if (txn->isolation == WT_ISO_SNAPSHOT) + __wt_txn_get_snapshot(session); + else WT_RET_MSG(session, EINVAL, "setting a read_timestamp" " requires a transaction running at snapshot" " isolation"); diff --git a/src/third_party/wiredtiger/src/utilities/util_load.c b/src/third_party/wiredtiger/src/utilities/util_load.c index dab24930fe6..2b210419c78 100644 --- a/src/third_party/wiredtiger/src/utilities/util_load.c +++ b/src/third_party/wiredtiger/src/utilities/util_load.c @@ -486,7 +486,7 @@ config_rename(WT_SESSION *session, char **urip, const char *name) return (util_err(session, errno, NULL)); /* - * Find the separating colon characters, but not the trailing one may + * Find the separating colon characters, but note the trailing one may * not be there. */ if ((p = strchr(*urip, ':')) == NULL) { diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index d46b0868887..0677b3b753c 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -159,9 +159,8 @@ config_setup(void) /* * Periodically, run single-threaded so we can compare the results to * a Berkeley DB copy, as long as the thread-count isn't nailed down. - * Don't do it on the first run, all our smoke tests would hit it. */ - if (!g.replay && g.run_cnt % 20 == 19 && !config_is_perm("threads")) + if (!config_is_perm("threads") && mmrand(NULL, 1, 20) == 1) g.c_threads = 1; config_checkpoint(); @@ -191,12 +190,8 @@ config_setup(void) /* * Turn off truncate for LSM runs (some configurations with truncate * always results in a timeout). - * - * WiredTiger doesn't currently support truncate and prepare at the - * same time, see WT-3922. For now, pick one on each run. 
*/ - if (!config_is_perm("truncate")) - if (DATASOURCE("lsm") || mmrand(NULL, 0, 1) == 1) + if (!config_is_perm("truncate") && DATASOURCE("lsm")) config_single("truncate=off", 0); /* Give Helium configuration a final review. */ @@ -629,10 +624,10 @@ config_pct(void) /* * If the delete percentage isn't nailed down, periodically set it to - * 0 so salvage gets run. Don't do it on the first run, all our smoke - * tests would hit it. + * 0 so salvage gets run and so we can perform stricter sanity checks + * on key ordering. */ - if (!config_is_perm("delete_pct") && !g.replay && g.run_cnt % 10 == 9) { + if (!config_is_perm("delete_pct") && mmrand(NULL, 1, 10) == 1) { list[CONFIG_DELETE_ENTRY].order = 0; *list[CONFIG_DELETE_ENTRY].vp = 0; } diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index d277fb1a915..4eac7a5eb8e 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -122,6 +122,8 @@ typedef struct { WT_RAND_STATE rnd; /* Global RNG state */ + pthread_rwlock_t prepare_lock; /* Prepare running */ + uint64_t timestamp; /* Counter for timestamps */ uint64_t truncate_cnt; /* Counter for truncation */ @@ -290,6 +292,8 @@ typedef struct { uint64_t last; /* truncate range */ WT_ITEM *lastkey, _lastkey; + WT_ITEM *tbuf, _tbuf; /* temporary buffer */ + #define TINFO_RUNNING 1 /* Running */ #define TINFO_COMPLETE 2 /* Finished */ #define TINFO_JOINED 3 /* Resolved */ diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 596d952dcc6..54aa6d2b766 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -512,6 +512,12 @@ begin_transaction(TINFO *tinfo, WT_SESSION *session, u_int *iso_configp) config = "isolation=snapshot"; if (g.c_txn_timestamps) { /* + * Avoid starting a new reader when a prepare is in + * progress. 
+ */ + (void)pthread_rwlock_rdlock(&g.prepare_lock); + + /* * Set the thread's read timestamp to the current value * before allocating a new read timestamp. This * guarantees the oldest timestamp won't move past the @@ -531,6 +537,9 @@ begin_transaction(TINFO *tinfo, WT_SESSION *session, u_int *iso_configp) testutil_check(session->begin_transaction(session, config)); + if (v == ISOLATION_SNAPSHOT && g.c_txn_timestamps) + (void)pthread_rwlock_unlock(&g.prepare_lock); + /* * It's OK for the oldest timestamp to move past a running query, clear * the thread's read timestamp, it no longer needs to be pinned. @@ -617,6 +626,7 @@ rollback_transaction(TINFO *tinfo, WT_SESSION *session) static int prepare_transaction(TINFO *tinfo, WT_SESSION *session) { + WT_DECL_RET; uint64_t ts; char config_buf[64]; @@ -635,10 +645,23 @@ prepare_transaction(TINFO *tinfo, WT_SESSION *session) */ ++tinfo->prepare; + /* + * Synchronize prepare call with begin transaction to prevent a new + * reader creeping in. + * + * Prepare will return error if prepare timestamp is less than any + * active read timestamp. + */ + (void)pthread_rwlock_wrlock(&g.prepare_lock); + ts = set_commit_timestamp(tinfo); testutil_check(__wt_snprintf( config_buf, sizeof(config_buf), "prepare_timestamp=%" PRIx64, ts)); - return (session->prepare_transaction(session, config_buf)); + ret = session->prepare_transaction(session, config_buf); + + (void)pthread_rwlock_unlock(&g.prepare_lock); + + return (ret); } /* @@ -690,6 +713,7 @@ ops(void *arg) val_gen_init(tinfo->value); tinfo->lastkey = &tinfo->_lastkey; key_gen_init(tinfo->lastkey); + tinfo->tbuf = &tinfo->_tbuf; /* Set the first operation where we'll create sessions and cursors. */ cursor = NULL; @@ -1072,9 +1096,8 @@ update_instead_of_chosen_op: /* * Prepare the transaction 10% of the time. - * Currently doesn't work with truncation, see WT-3922. 
*/ - if (g.c_truncate == 0 && mmrand(&tinfo->rnd, 1, 10) == 1) { + if (mmrand(&tinfo->rnd, 1, 10) == 1) { ret = prepare_transaction(tinfo, session); testutil_assert(ret == 0 || ret == WT_PREPARE_CONFLICT); if (ret == WT_PREPARE_CONFLICT) @@ -1113,6 +1136,7 @@ deadlock: ++tinfo->deadlock; key_gen_teardown(tinfo->key); val_gen_teardown(tinfo->value); key_gen_teardown(tinfo->lastkey); + free(tinfo->tbuf->mem); tinfo->state = TINFO_COMPLETE; return (WT_THREAD_RET_VALUE); @@ -1291,11 +1315,11 @@ nextprev(TINFO *tinfo, WT_CURSOR *cursor, bool next) { WT_DECL_RET; WT_ITEM key, value; - uint64_t keyno; + uint64_t keyno, keyno_prev; uint8_t bitfield; int cmp; const char *which; - bool incrementing; + bool incrementing, record_gaps; keyno = 0; which = next ? "WT_CURSOR.next" : "WT_CURSOR.prev"; @@ -1332,41 +1356,85 @@ nextprev(TINFO *tinfo, WT_CURSOR *cursor, bool next) if (DATASOURCE("lsm")) break; + /* + * Compare the returned key with the previously returned key, + * and assert the order is correct. If not deleting keys, and + * the rows aren't in the column-store insert name space, also + * assert we don't skip groups of records (that's a page-split + * bug symptom). 
+ */ + record_gaps = g.c_delete_pct != 0; switch (g.type) { case FIX: case VAR: - testutil_assertfmt( - !next || tinfo->keyno < keyno, - "%s returned %" PRIu64 " then %" PRIu64, - which, tinfo->keyno, keyno); - testutil_assertfmt( - next || tinfo->keyno > keyno, - "%s returned %" PRIu64 " then %" PRIu64, - which, tinfo->keyno, keyno); + if (tinfo->keyno > g.c_rows || keyno > g.c_rows) + record_gaps = true; + if (!next) { + if (tinfo->keyno < keyno || + (!record_gaps && keyno != tinfo->keyno - 1)) + goto order_error_col; + } else + if (tinfo->keyno > keyno || + (!record_gaps && keyno != tinfo->keyno + 1)) + goto order_error_col; + if (0) { +order_error_col: + testutil_die(0, + "%s returned %" PRIu64 " then %" PRIu64, + which, tinfo->keyno, keyno); + } tinfo->keyno = keyno; break; case ROW: - cmp = memcmp(tinfo->key->data, key.data, - WT_MIN(tinfo->key->size, key.size)); incrementing = (next && !g.c_reverse) || (!next && g.c_reverse); - testutil_assertfmt( - !incrementing || - cmp < 0 || - (cmp == 0 && tinfo->key->size < key.size), - "%s returned {%.*s} then {%.*s}", - which, - (int)tinfo->key->size, tinfo->key->data, - (int)key.size, key.data); - testutil_assertfmt( - incrementing || - cmp > 0 || - (cmp == 0 && tinfo->key->size > key.size), - "%s returned {%.*s} then {%.*s}", - which, - (int)tinfo->key->size, tinfo->key->data, - (int)key.size, key.data); + cmp = memcmp(tinfo->key->data, key.data, + WT_MIN(tinfo->key->size, key.size)); + if (incrementing) { + if (cmp > 0 || + (cmp == 0 && tinfo->key->size < key.size)) + goto order_error_row; + } else + if (cmp < 0 || + (cmp == 0 && tinfo->key->size > key.size)) + goto order_error_row; + if (!record_gaps) { + /* + * Convert the keys to record numbers and then + * compare less-than-or-equal. (Not less-than, + * row-store inserts new rows in-between rows + * by append a new suffix to the row's key.) 
+ */ + testutil_check(__wt_buf_fmt( + (WT_SESSION_IMPL *)cursor->session, + tinfo->tbuf, "%.*s", + (int)tinfo->key->size, + (char *)tinfo->key->data)); + keyno_prev = + strtoul(tinfo->tbuf->data, NULL, 10); + testutil_check(__wt_buf_fmt( + (WT_SESSION_IMPL *)cursor->session, + tinfo->tbuf, "%.*s", + (int)key.size, (char *)key.data)); + keyno = strtoul(tinfo->tbuf->data, NULL, 10); + if (incrementing) { + if (keyno_prev != keyno && + keyno_prev + 1 != keyno) + goto order_error_row; + } else + if (keyno_prev != keyno && + keyno_prev - 1 != keyno) + goto order_error_row; + } + if (0) { +order_error_row: + testutil_die(0, + "%s returned {%.*s} then {%.*s}", + which, + (int)tinfo->key->size, tinfo->key->data, + (int)key.size, key.data); + } testutil_check(__wt_buf_set((WT_SESSION_IMPL *) cursor->session, tinfo->key, key.data, key.size)); diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index 6077a67a541..d7b9add1f14 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -170,6 +170,7 @@ main(int argc, char *argv[]) testutil_check(pthread_rwlock_init(&g.append_lock, NULL)); testutil_check(pthread_rwlock_init(&g.backup_lock, NULL)); testutil_check(pthread_rwlock_init(&g.death_lock, NULL)); + testutil_check(pthread_rwlock_init(&g.prepare_lock, NULL)); printf("%s: process %" PRIdMAX "\n", progname, (intmax_t)getpid()); while (++g.run_cnt <= g.c_runs || g.c_runs == 0 ) { @@ -267,6 +268,7 @@ main(int argc, char *argv[]) testutil_check(pthread_rwlock_destroy(&g.append_lock)); testutil_check(pthread_rwlock_destroy(&g.backup_lock)); testutil_check(pthread_rwlock_destroy(&g.death_lock)); + testutil_check(pthread_rwlock_destroy(&g.prepare_lock)); config_clear(); diff --git a/src/third_party/wiredtiger/test/suite/test_alter03.py b/src/third_party/wiredtiger/test/suite/test_alter03.py new file mode 100644 index 00000000000..78d1481f778 --- /dev/null +++ 
b/src/third_party/wiredtiger/test/suite/test_alter03.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtscenario import make_scenarios + +# test_alter03.py +# Check if app_metadata can be altered. +class test_alter03(wttest.WiredTigerTestCase): + name = "alter03" + + def verify_metadata(self, metastr): + if metastr == '': + return + cursor = self.session.open_cursor('metadata:', None, None) + # + # Walk through all the metadata looking for the entries that are + # the URIs for the named object. 
+ # + found = False + while True: + ret = cursor.next() + if ret != 0: + break + key = cursor.get_key() + if key.find(self.name) != -1: + value = cursor[key] + found = True + self.assertTrue(value.find(metastr) != -1) + cursor.close() + self.assertTrue(found == True) + + # Alter: Change the app_metadata and verify + def test_alter03_app_metadata(self): + uri = "table:" + self.name + entries = 100 + create_params = 'key_format=i,value_format=i,' + app_meta_orig = 'app_metadata="meta_data_1",' + app_meta_new = 'app_metadata="meta_data_2",' + + self.session.create(uri, create_params + app_meta_orig) + + # Put some data in table. + c = self.session.open_cursor(uri, None) + for k in range(entries): + c[k+1] = 1 + c.close() + + # Verify the string in the metadata + self.verify_metadata(app_meta_orig) + + # Alter app metadata and verify + self.session.alter(uri, app_meta_new) + self.verify_metadata(app_meta_new) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_cursor13.py b/src/third_party/wiredtiger/test/suite/test_cursor13.py index 35a841ed78d..9392f7a99d8 100644 --- a/src/third_party/wiredtiger/test/suite/test_cursor13.py +++ b/src/third_party/wiredtiger/test/suite/test_cursor13.py @@ -41,6 +41,7 @@ class test_cursor13_base(wttest.WiredTigerTestCase): stat_cursor_cache = 0 stat_cursor_reopen = 0 + # Returns a list: [cursor_cached, cursor_reopened] def caching_stats(self): stat_cursor = self.session.open_cursor('statistics:', None, None) cache = stat_cursor[stat.conn.cursor_cache][2] @@ -48,6 +49,8 @@ class test_cursor13_base(wttest.WiredTigerTestCase): stat_cursor.close() return [cache, reopen] + # Returns a list: [cursor_sweep, cursor_sweep_buckets, + # cursor_sweep_examined, cursor_sweep_closed] def sweep_stats(self): stat_cursor = self.session.open_cursor('statistics:', None, None) sweep = stat_cursor[stat.conn.cursor_sweep][2] @@ -527,3 +530,25 @@ class test_cursor13_sweep(test_cursor13_big_base): # by 
approximately the number of swept cursors, but it's less # predictable. self.assertGreater(end_stats[1] - begin_stats[1], 0) + +class test_cursor13_dup(test_cursor13_base): + def test_dup(self): + self.cursor_stats_init() + uri = 'table:test_cursor13_dup' + self.session.create(uri, 'key_format=S,value_format=S') + cursor = self.session.open_cursor(uri) + cursor['A'] = 'B' + cursor.close() + + # Get a cursor and position it. + # An unpositioned cursor cannot be duplicated. + c1 = self.session.open_cursor(uri, None) + c1.next() + + for notused in range(0, 100): + self.session.breakpoint() + c2 = self.session.open_cursor(None, c1, None) + c2.close() + stats = self.caching_stats() + self.assertGreaterEqual(stats[0], 100) # cursor_cached > 100 + self.assertGreaterEqual(stats[1], 100) # cursor_reopened > 100 diff --git a/src/third_party/wiredtiger/test/suite/test_las.py b/src/third_party/wiredtiger/test/suite/test_las01.py index f38b11138d2..fd4dea87c35 100644 --- a/src/third_party/wiredtiger/test/suite/test_las.py +++ b/src/third_party/wiredtiger/test/suite/test_las01.py @@ -33,9 +33,9 @@ from wtdataset import SimpleDataSet def timestamp_str(t): return '%x' % t -# test_las.py +# test_las01.py # Smoke tests to ensure lookaside tables are working. -class test_las(wttest.WiredTigerTestCase): +class test_las01(wttest.WiredTigerTestCase): # Force a small cache. def conn_config(self): return 'cache_size=50MB' @@ -93,7 +93,7 @@ class test_las(wttest.WiredTigerTestCase): def test_las(self): # Create a small table. 
- uri = "table:test_las" + uri = "table:test_las01" nrows = 100 ds = SimpleDataSet(self, uri, nrows, key_format="S", value_format='u') ds.populate() diff --git a/src/third_party/wiredtiger/test/suite/test_las02.py b/src/third_party/wiredtiger/test/suite/test_las02.py new file mode 100644 index 00000000000..af089d6c19e --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_las02.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +from helper import copy_wiredtiger_home +import wiredtiger, wttest +from wtdataset import SimpleDataSet + +def timestamp_str(t): + return '%x' % t + +# test_las02.py +# Test that truncate with lookaside entries and timestamps gives expected results. 
+class test_las02(wttest.WiredTigerTestCase): + # Force a small cache. + def conn_config(self): + return 'cache_size=50MB,log=(enabled)' + + def large_updates(self, uri, value, ds, nrows, commit_ts): + # Update a large number of records, we'll hang if the lookaside table isn't working. + session = self.session + cursor = session.open_cursor(uri) + for i in range(1, nrows + 1): + session.begin_transaction() + cursor[ds.key(i)] = value + session.commit_transaction('commit_timestamp=' + timestamp_str(commit_ts)) + cursor.close() + + def check(self, check_value, uri, nrows, read_ts): + session = self.session + session.begin_transaction('read_timestamp=' + timestamp_str(read_ts)) + cursor = session.open_cursor(uri) + count = 0 + for k, v in cursor: + self.assertEqual(v, check_value) + count += 1 + session.rollback_transaction() + self.assertEqual(count, nrows) + + def test_las(self): + nrows = 10000 + + # Create a table without logging to ensure we get "skew_newest" lookaside eviction behavior. + uri = "table:las02_main" + ds = SimpleDataSet( + self, uri, 0, key_format="S", value_format="S", config='log=(enabled=false)') + ds.populate() + + uri2 = "table:las02_extra" + ds2 = SimpleDataSet(self, uri2, 0, key_format="S", value_format="S") + ds2.populate() + + # Pin oldest and stable to timestamp 1. 
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1) + + ',stable_timestamp=' + timestamp_str(1)) + + bigvalue = "aaaaa" * 100 + self.large_updates(uri, bigvalue, ds, nrows / 3, 1) + + # Check that all updates are seen + self.check(bigvalue, uri, nrows / 3, 1) + + # Check to see lookaside working with old timestamp + bigvalue2 = "ddddd" * 100 + self.large_updates(uri, bigvalue2, ds, nrows, 100) + + # Check that the new updates are only seen after the update timestamp + self.check(bigvalue, uri, nrows / 3, 1) + self.check(bigvalue2, uri, nrows, 100) + + # Force out most of the pages by updating a different tree + self.large_updates(uri2, bigvalue, ds2, nrows, 100) + + # Now truncate half of the records + self.session.begin_transaction() + end = self.session.open_cursor(uri) + end.set_key(ds.key(nrows / 2)) + self.session.truncate(None, None, end) + end.close() + self.session.commit_transaction('commit_timestamp=' + timestamp_str(200)) + + # Check that the truncate is visible after commit + self.check(bigvalue2, uri, nrows / 2, 200) + + # Repeat earlier checks + self.check(bigvalue, uri, nrows / 3, 1) + self.check(bigvalue2, uri, nrows, 100) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_prepare01.py b/src/third_party/wiredtiger/test/suite/test_prepare01.py index 0039e9106f0..20615ab836c 100644 --- a/src/third_party/wiredtiger/test/suite/test_prepare01.py +++ b/src/third_party/wiredtiger/test/suite/test_prepare01.py @@ -111,7 +111,6 @@ class test_prepare01(wttest.WiredTigerTestCase): cursor = self.session.open_cursor(self.uri, None) self.check(cursor, 0, 0) - # Currently ignore_prepare is not realized yet, hence no effect. 
self.session.begin_transaction("ignore_prepare=false") for i in xrange(self.nentries): if i > 0 and i % (self.nentries / 37) == 0: @@ -165,11 +164,13 @@ class test_read_committed_default(wttest.WiredTigerTestCase): self.assertEqual(self.cursor_count(cursor), 1) s.prepare_transaction("prepare_timestamp=4a") - s.commit_transaction("commit_timestamp=5a") + # commit timestamp can be same as prepare timestamp + s.commit_transaction("commit_timestamp=4a") s.begin_transaction() self.assertEqual(self.cursor_count(cursor), 1) s.prepare_transaction("prepare_timestamp=7a") + # commit timestamp can be greater than prepare timestamp s.commit_transaction("commit_timestamp=8a") s.close() diff --git a/src/third_party/wiredtiger/test/suite/test_prepare02.py b/src/third_party/wiredtiger/test/suite/test_prepare02.py index e2971ee4ca5..b44362ca951 100644 --- a/src/third_party/wiredtiger/test/suite/test_prepare02.py +++ b/src/third_party/wiredtiger/test/suite/test_prepare02.py @@ -27,7 +27,7 @@ # OTHER DEALINGS IN THE SOFTWARE. # # test_prepare02.py -# Prepare : check post conditions to prepare operation +# Prepare : Session API usage generates expected error in prepared state. # from suite_subprocess import suite_subprocess diff --git a/src/third_party/wiredtiger/test/suite/test_prepare03.py b/src/third_party/wiredtiger/test/suite/test_prepare03.py index d9838ae7f82..143d1766bda 100644 --- a/src/third_party/wiredtiger/test/suite/test_prepare03.py +++ b/src/third_party/wiredtiger/test/suite/test_prepare03.py @@ -30,7 +30,7 @@ import wiredtiger, wttest from wtscenario import make_scenarios # test_prepre03.py -# Prepare transaction check post conditions for cursor operations +# Prepare: Cursor API usage generates expected error in prepared state. 
# Pattern of test script is to invoke cursor operations in prepared transaction # state to ensure they fail and to repeat same operations in non-prepared state diff --git a/src/third_party/wiredtiger/test/suite/test_prepare04.py b/src/third_party/wiredtiger/test/suite/test_prepare04.py index af5dd12b1e5..cd4fe924293 100644 --- a/src/third_party/wiredtiger/test/suite/test_prepare04.py +++ b/src/third_party/wiredtiger/test/suite/test_prepare04.py @@ -27,7 +27,7 @@ # OTHER DEALINGS IN THE SOFTWARE. # # test_prepare04.py -# Prepare: prepare conflict with update and read operations +# Prepare: Update and read operations generate prepared conflict error. # import random diff --git a/src/third_party/wiredtiger/test/suite/test_prepare05.py b/src/third_party/wiredtiger/test/suite/test_prepare05.py new file mode 100644 index 00000000000..3b283dd8102 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_prepare05.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# test_prepare05.py
#   Prepare: Timestamp validation for the prepare APIs
#

from suite_subprocess import suite_subprocess
import wiredtiger, wttest

def timestamp_str(t):
    """Return the integer timestamp `t` as a lowercase hexadecimal string."""
    return '%x' % t

class test_prepare05(wttest.WiredTigerTestCase, suite_subprocess):
    """Check which prepare/commit timestamp combinations are accepted or rejected."""

    tablename = 'test_prepare05'
    uri = 'table:' + tablename

    def test_timestamp_api(self):
        """Walk through the legal and illegal prepare/commit timestamp cases in order.

        Each illegal case asserts that a WiredTigerError is raised whose
        message matches the given pattern.
        """
        if not wiredtiger.timestamp_build():
            self.skipTest('requires a timestamp build')

        self.session.create(self.uri, 'key_format=i,value_format=i')
        c = self.session.open_cursor(self.uri)

        # It is illegal to set a prepare timestamp older than oldest timestamp.
        self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(2))
        self.session.begin_transaction()
        self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
            lambda: self.session.prepare_transaction(
            'prepare_timestamp=' + timestamp_str(1)),
            "/older than the oldest timestamp/")
        self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))

        # Check setting the prepare timestamp same as oldest timestamp is valid.
        self.session.begin_transaction()
        self.session.prepare_transaction('prepare_timestamp=' + timestamp_str(2))
        self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))

        # In a single transaction it is illegal to set a commit timestamp
        # before invoking prepare for this transaction.
        # Note: Values are not important, setting commit timestamp before
        # prepare itself is illegal.
        self.session.begin_transaction()
        self.session.timestamp_transaction(
            'commit_timestamp=' + timestamp_str(3))
        self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
            lambda: self.session.prepare_transaction(
            'prepare_timestamp=' + timestamp_str(2)),
            "/should not have been set before/")
        self.session.commit_transaction('commit_timestamp=' + timestamp_str(3))

        # It is illegal to set a prepare timestamp same as or earlier than an
        # active read timestamp.
        # Start a new reader to have an active read timestamp.
        s_reader = self.conn.open_session()
        s_reader.begin_transaction('read_timestamp=' + timestamp_str(4))
        self.session.begin_transaction()
        self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
            lambda: self.session.prepare_transaction(
            'prepare_timestamp=' + timestamp_str(4)),
            "/not later than an active read timestamp/")
        self.session.rollback_transaction()

        # Check setting the prepare timestamp as later than active read
        # timestamp is valid.
        self.session.begin_transaction()
        c[1] = 1
        self.session.prepare_transaction(
            'prepare_timestamp=' + timestamp_str(5))
        # Resolve the reader transaction started earlier.
        s_reader.rollback_transaction()
        # The reader session is not used again; close it rather than leaving
        # it open for the rest of the test.
        s_reader.close()
        self.session.rollback_transaction()

        # It is illegal to set a commit timestamp older than prepare
        # timestamp of a transaction.
        self.session.begin_transaction()
        c[1] = 1
        self.session.prepare_transaction(
            'prepare_timestamp=' + timestamp_str(5))
        self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
            lambda: self.session.commit_transaction(
            'commit_timestamp=' + timestamp_str(4)),
            "/older than the prepare timestamp/")

        # It is legal to set a commit timestamp as same as prepare
        # timestamp.
        self.session.begin_transaction()
        c[1] = 1
        self.session.prepare_transaction(
            'prepare_timestamp=' + timestamp_str(5))
        self.session.commit_transaction('commit_timestamp=' + timestamp_str(5))

if __name__ == '__main__':
    wttest.run()