diff options
81 files changed, 2265 insertions, 719 deletions
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/read_write_heavy.wtperf b/src/third_party/wiredtiger/bench/workgen/runner/read_write_heavy.wtperf new file mode 100644 index 00000000000..f05ed62c5b4 --- /dev/null +++ b/src/third_party/wiredtiger/bench/workgen/runner/read_write_heavy.wtperf @@ -0,0 +1,20 @@ +# Warning: This config includes unwritten, implicit configuration defaults. +# Changes to those values may cause differences in behavior. +conn_config="cache_size=2GB,eviction=(threads_max=8),log=(enabled=true),session_max=250,statistics=(fast),statistics_log=(wait=1,json),io_capacity=(total=30M)" +checkpoint_interval=60 +checkpoint_threads=1 +compression="snappy" +create=true +close_conn=false +icount=4000000 +log_like_table=true +populate_threads=4 +report_interval=1 +run_time=900 +sample_interval=1 +sample_rate=1 +table_count=100 +threads=((count=80,updates=1,throttle=11,throttle_burst=0),(count=80,reads=1,throttle=60,throttle_burst=0)) +value_sz=7000 +warmup=0 +table_config="memory_page_max=10m,leaf_value_max=64MB,checksum=on,split_pct=90,type=file,log=(enabled=false),leaf_page_max=32k,block_compressor=snappy" diff --git a/src/third_party/wiredtiger/bench/workgen/runner/read_write_storms.py b/src/third_party/wiredtiger/bench/workgen/runner/read_write_storms.py new file mode 100644 index 00000000000..2f774d0c902 --- /dev/null +++ b/src/third_party/wiredtiger/bench/workgen/runner/read_write_storms.py @@ -0,0 +1,115 @@ +#/usr/bin/env python +# generated from runner/read_write_heavy.wtperf originally, then hand edited. 
+ +from runner import * +from wiredtiger import * +from workgen import * + +context = Context() +conn_config = "" +conn_config += ",cache_size=2GB,eviction=(threads_max=8),log=(enabled=true),session_max=250,statistics=(fast),statistics_log=(wait=1,json),io_capacity=(total=30M)" # explicitly added +conn = wiredtiger_open("WT_TEST", "create," + conn_config) +s = conn.open_session("") + +wtperf_table_config = "key_format=S,value_format=S,type=lsm," +\ + "exclusive=true,allocation_size=4kb," +\ + "internal_page_max=64kb,leaf_page_max=4kb,split_pct=100," +compress_table_config = "block_compressor=snappy," +table_config = "memory_page_max=10m,leaf_value_max=64MB,checksum=on,split_pct=90,type=file,log=(enabled=false),leaf_page_max=32k,block_compressor=snappy" +tables = [] +table_count = 100 +for i in range(0, table_count): + tname = "table:test" + str(i) + table = Table(tname) + s.create(tname, wtperf_table_config +\ + compress_table_config + table_config) + table.options.key_size = 20 + table.options.value_size = 7000 + tables.append(table) + +populate_threads = 4 +icount = 4000000 +# There are multiple tables to be filled during populate, +# the icount is split between them all. +pop_ops = Operation(Operation.OP_INSERT, tables[0]) +pop_ops = op_multi_table(pop_ops, tables) +nops_per_thread = icount / (populate_threads * table_count) +pop_thread = Thread(pop_ops * nops_per_thread) +pop_workload = Workload(context, populate_threads * pop_thread) +pop_workload.run(conn) +print('populate complete') + +# Log like file, requires that logging be enabled in the connection config. 
+log_name = "table:log" +s.create(log_name, wtperf_table_config + "key_format=S,value_format=S," + compress_table_config + table_config + ",log=(enabled=true)") +log_table = Table(log_name) + +ops = Operation(Operation.OP_UPDATE, tables[0]) +ops = op_multi_table(ops, tables, False) +ops = op_log_like(ops, log_table, 0) +thread0 = Thread(ops) +# These operations include log_like operations, which will increase the number +# of insert/update operations by a factor of 2.0. This may cause the +# actual operations performed to be above the throttle. +thread0.options.throttle=11 +thread0.options.throttle_burst=0 + +ops = Operation(Operation.OP_SEARCH, tables[0]) +ops = op_multi_table(ops, tables, False) +ops = op_log_like(ops, log_table, 0) +thread1 = Thread(ops) +thread1.options.throttle=60 +thread1.options.throttle_burst=0 + +ops = Operation(Operation.OP_SLEEP, "60") + \ + Operation(Operation.OP_CHECKPOINT, "") +checkpoint_thread = Thread(ops) + +ops = Operation(Operation.OP_SLEEP, "0.1") + \ + Operation(Operation.OP_LOG_FLUSH, "") +logging_thread = Thread(ops) + +############################################################################ +# This part was added to the generated file. +# Add threads that do a bunch of operations and sleep, all in a loop. +# At the beginning of the run the threads will tend to be synchronized, +# but that effect will dissipate over time. 
+ +ops = Operation(Operation.OP_UPDATE, tables[0]) +ops = op_multi_table(ops, tables, False) +ops = op_log_like(ops, log_table, 0) +ops = ops * 10000 + Operation(Operation.OP_SLEEP, "10") +thread_big_10 = Thread(ops) + +ops = Operation(Operation.OP_UPDATE, tables[0]) +ops = op_multi_table(ops, tables, False) +ops = op_log_like(ops, log_table, 0) +ops = ops * 80000 + Operation(Operation.OP_SLEEP, "20") +thread_big_20 = Thread(ops) + +ops = Operation(Operation.OP_SEARCH, tables[0]) +ops = op_multi_table(ops, tables, False) +ops = op_log_like(ops, log_table, 0) +ops = ops * 10000 + Operation(Operation.OP_SLEEP, "8") +thread_bigread_8 = Thread(ops) + +ops = Operation(Operation.OP_SEARCH, tables[0]) +ops = op_multi_table(ops, tables, False) +ops = op_log_like(ops, log_table, 0) +ops = ops * 80000 + Operation(Operation.OP_SLEEP, "16") +thread_bigread_16 = Thread(ops) + +# End of added section. +# The new threads will also be added to the workload below. +############################################################################ + +workload = Workload(context, 80 * thread0 + 80 * thread1 + checkpoint_thread + logging_thread + 10 * thread_big_10 + 10 * thread_big_20 + 10 * thread_bigread_8 + 10 * thread_bigread_16) +workload.options.report_interval=1 +workload.options.run_time=900 +workload.options.sample_rate=1 +workload.options.warmup=0 +workload.options.sample_interval_ms = 1000 +workload.run(conn) + +latency_filename = "WT_TEST/latency.out" +latency.workload_latency(workload, latency_filename) diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.cxx b/src/third_party/wiredtiger/bench/workgen/workgen.cxx index 776e7cf6098..9bfa29e3136 100644 --- a/src/third_party/wiredtiger/bench/workgen/workgen.cxx +++ b/src/third_party/wiredtiger/bench/workgen/workgen.cxx @@ -948,7 +948,8 @@ int Throttle::throttle(uint64_t op_count, uint64_t *op_limit) { _next_div = ts_add_ms(now, _ms_per_div); _started = true; } else { - _ops_delta += (op_count - _ops_prev); + if (_burst 
!= 0.0) + _ops_delta += (op_count - _ops_prev); // Sleep until the next division, but potentially with some randomness. if (now < _next_div) { @@ -961,7 +962,12 @@ int Throttle::throttle(uint64_t op_count, uint64_t *op_limit) { } _next_div = ts_add_ms(_next_div, _ms_per_div); } - ops = _ops_per_div; + + if (_burst == 0.0) + ops = _ops_left_this_second; + else + ops = _ops_per_div; + if (_ops_delta < (int64_t)ops) { ops -= _ops_delta; _ops_delta = 0; diff --git a/src/third_party/wiredtiger/bench/workgen/wtperf.py b/src/third_party/wiredtiger/bench/workgen/wtperf.py index 9ce1b84a663..e4ce0393276 100755 --- a/src/third_party/wiredtiger/bench/workgen/wtperf.py +++ b/src/third_party/wiredtiger/bench/workgen/wtperf.py @@ -34,7 +34,7 @@ # See also the usage() function. # from __future__ import print_function -import os, shutil, sys, tempfile +import os, shutil, sys, subprocess, tempfile def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) @@ -155,8 +155,6 @@ class Translator: # "(abc=123,def=234,ghi=(hi=1,bye=2))" would return 3 items. def split_config_parens(self, s): if s[0:1] != '(': - import pdb - pdb.set_trace() self.fatal_error('missing left paren', 'config parse error') if s[-1:] != ')': self.fatal_error('missing right paren', 'config parse error') @@ -201,16 +199,21 @@ class Translator: result += ' ' return result + def copy_file(self, srcname, destdir, destname): + dest_fullname = os.path.join(destdir, destname) + suffix = 0 + while os.path.exists(dest_fullname): + suffix += 1 + dest_fullname = os.path.join(destdir, destname + str(suffix)) + shutil.copyfile(srcname, dest_fullname) + def copy_config(self): # Note: If we add the capability of setting options on the command # line, we won't be able to do a simple copy. - config_save = os.path.join(self.homedir, 'CONFIG.wtperf') - suffix = 0 - while os.path.exists(config_save): - suffix += 1 - config_save = os.path.join(self.homedir, \ - 'CONFIG.wtperf.' 
+ str(suffix)) - shutil.copyfile(self.filename, config_save) + self.copy_file(self.filename, self.homedir, 'CONFIG.wtperf') + + def copy_python_source(self, srcname): + self.copy_file(srcname, self.homedir, 'RUN.py') # Wtperf's throttle is based on the number of regular operations, # not including log_like operations. Workgen counts all operations, @@ -271,6 +274,9 @@ class Translator: topts.read = 0 topts.reads = 0 topts.throttle = 0 + # Workgen's throttle_burst variable has a default of 1.0 . Since we + # are always explicitly setting it, set our own value to the same. + topts.throttle_burst = 1.0 topts.update = 0 topts.updates = 0 topts.random_range = 0 @@ -333,8 +339,11 @@ class Translator: if topts.throttle > 0: (throttle, comment) = self.calc_throttle(topts, log_like_table) tdecls += comment - tdecls += self.assign_str(thread_name + '.options.throttle', - throttle) + tdecls += self.assign_str( + thread_name + '.options.throttle', throttle) + tdecls += self.assign_str( + thread_name + '.options.throttle_burst', + topts.throttle_burst) tdecls += '\n' if topts.count > 1: tnames += str(topts.count) + ' * ' @@ -504,9 +513,11 @@ class Translator: def translate_inner(self): workloadopts = '' + input_as_string = '' with open(self.filename) as fin: for line in fin: self.linenum += 1 + input_as_string += line commentpos = line.find('#') if commentpos >= 0: line = line[0:commentpos] @@ -563,6 +574,11 @@ class Translator: s += 'from wiredtiger import *\n' s += 'from workgen import *\n' s += '\n' + s += '\'\'\' The original wtperf input file follows:\n' + s += input_as_string + if not input_as_string.endswith('\n'): + s += '\n' + s += '\'\'\'\n\n' async_config = '' if opts.compact and opts.async_threads == 0: opts.async_threads = 2; @@ -585,6 +601,7 @@ class Translator: s += ' return op_ret\n' s += '\n' s += 'context = Context()\n' + s += 'homedir = "' + self.homedir + '"\n' extra_config = '' s += 'conn_config = ""\n' @@ -599,8 +616,7 @@ class Translator: s += 
'conn_config += extensions_config(["compressors/' + \ compression + '"])\n' compression = 'block_compressor=' + compression + ',' - s += 'conn = wiredtiger_open("' + self.homedir + \ - '", "create," + conn_config)\n' + s += 'conn = wiredtiger_open(homedir, "create," + conn_config)\n' s += 's = conn.open_session("' + sess_config + '")\n' s += '\n' s += self.translate_table_create() @@ -618,8 +634,8 @@ class Translator: s += 'conn.close()\n' if readonly: 'conn_config += ",readonly=true"\n' - s += 'conn = wiredtiger_open(' + \ - '"' + self.homedir + '", "create," + conn_config)\n' + s += 'conn = wiredtiger_open(homedir, ' + \ + '"create," + conn_config)\n' s += '\n' s += 'workload = Workload(context, ' + t_var + ')\n' s += workloadopts @@ -627,7 +643,7 @@ class Translator: if self.verbose > 0: s += 'print("workload:")\n' s += 'workload.run(conn)\n\n' - s += 'latency_filename = "' + self.homedir + '/latency.out"\n' + s += 'latency_filename = homedir + "/latency.out"\n' s += 'latency.workload_latency(workload, latency_filename)\n' if close_conn: @@ -684,16 +700,22 @@ for arg in sys.argv[1:]: # directory after the run, because the wiredtiger_open # in the generated code will clean out the directory first. 
raised = None + ret = 0 try: - execfile(tmpfile) - except Exception, exception: + # Run python on the generated script + ret = subprocess.call([sys.executable, tmpfile]) + except (KeyboardInterrupt, Exception), exception: raised = exception if not os.path.isdir(homedir): os.makedirs(homedir) translator.copy_config() + translator.copy_python_source(tmpfile) os.remove(tmpfile) if raised != None: raise raised + if ret != 0: + raise Exception('Running generated program returned ' + + str(ret)) else: usage() sys.exit(1) diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 44d7d8f9886..4db94e19cf3 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -519,6 +519,17 @@ connection_runtime_config = [ interval in seconds at which to check for files that are inactive and close them''', min=1, max=100000), ]), + Config('io_capacity', '', r''' + control how many bytes per second are written and read. Exceeding + the capacity results in throttling.''', + type='category', subconfig=[ + Config('total', '0', r''' + number of bytes per second available to all subsystems in total. + When set, decisions about what subsystems are throttled, and in + what proportion, are made internally. The minimum non-zero setting + is 1MB.''', + min='0', max='1TB'), + ]), Config('lsm_manager', '', r''' configure database wide options for LSM tree management. The LSM manager is started automatically the first time an LSM tree is opened. 
@@ -597,8 +608,9 @@ connection_runtime_config = [ intended for use with internal stress testing of WiredTiger.''', type='list', undoc=True, choices=[ - 'checkpoint_slow', 'lookaside_sweep_race', 'split_1', 'split_2', - 'split_3', 'split_4', 'split_5', 'split_6', 'split_7', 'split_8']), + 'aggressive_sweep', 'checkpoint_slow', 'lookaside_sweep_race', + 'split_1', 'split_2', 'split_3', 'split_4', 'split_5', 'split_6', + 'split_7', 'split_8']), Config('verbose', '', r''' enable messages for various events. Options are given as a list, such as <code>"verbose=[evictserver,read]"</code>''', @@ -1471,13 +1483,15 @@ methods = { Config('get', 'all_committed', r''' specify which timestamp to query: \c all_committed returns the largest timestamp such that all timestamps up to that value have committed, - \c oldest returns the most recent \c oldest_timestamp set with - WT_CONNECTION::set_timestamp, \c oldest_reader returns the + \c last_checkpoint returns the timestamp of the most recent stable + checkpoint, \c oldest returns the most recent \c oldest_timestamp set + with WT_CONNECTION::set_timestamp, \c oldest_reader returns the minimum of the read timestamps of all active readers \c pinned returns - the minimum of the\c oldest_timestamp and the read timestamps of all - active readers, and \c stable returns the most recent - \c stable_timestamp set with WT_CONNECTION::set_timestamp. See - @ref transaction_timestamps''', + the minimum of the \c oldest_timestamp and the read timestamps of all + active readers, \c recovery returns the timestamp of the most recent + stable checkpoint taken prior to a shutdown and \c stable returns the + most recent \c stable_timestamp set with WT_CONNECTION::set_timestamp. 
+ See @ref transaction_timestamps''', choices=['all_committed','last_checkpoint', 'oldest','oldest_reader','pinned','recovery','stable']), ]), diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 1bbeeb3c7a3..73fa6819e94 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -68,6 +68,7 @@ src/conn/api_version.c src/conn/conn_api.c src/conn/conn_cache.c src/conn/conn_cache_pool.c +src/conn/conn_capacity.c src/conn/conn_ckpt.c src/conn/conn_dhandle.c src/conn/conn_handle.c diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index b93c99a9f99..c93229014c8 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -1175,6 +1175,7 @@ scalability sched scr sd +second's secretkey sed sessionp @@ -1230,6 +1231,7 @@ subinit sublicense subone suboptimal +subsystem's subtest subtree sunique diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 5184bbcb9a2..9bca52f402a 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -52,6 +52,10 @@ class CacheWalkStat(Stat): def __init__(self, name, desc, flags=''): flags += ',cache_walk' Stat.__init__(self, name, CacheWalkStat.prefix, desc, flags) +class CapacityStat(Stat): + prefix = 'capacity' + def __init__(self, name, desc, flags=''): + Stat.__init__(self, name, CapacityStat.prefix, desc, flags) class CompressStat(Stat): prefix = 'compression' def __init__(self, name, desc, flags=''): @@ -134,6 +138,7 @@ groups['memory'] = [ ConnStat.prefix, RecStat.prefix] groups['system'] = [ + CapacityStat.prefix, ConnStat.prefix, DhandleStat.prefix, PerfHistStat.prefix, @@ -294,6 +299,24 @@ connection_stats = [ CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'), ########################################## 
+ # Capacity statistics + ########################################## + CapacityStat('capacity_bytes_ckpt', 'throttled bytes written for checkpoint'), + CapacityStat('capacity_bytes_evict', 'throttled bytes written for eviction'), + CapacityStat('capacity_bytes_log', 'throttled bytes written for log'), + CapacityStat('capacity_bytes_read', 'throttled bytes read'), + CapacityStat('capacity_bytes_written', 'throttled bytes written total'), + CapacityStat('capacity_threshold', 'threshold to call fsync'), + CapacityStat('capacity_time_ckpt', 'time waiting during checkpoint (usecs)'), + CapacityStat('capacity_time_evict', 'time waiting during eviction (usecs)'), + CapacityStat('capacity_time_log', 'time waiting during logging (usecs)'), + CapacityStat('capacity_time_read', 'time waiting during read (usecs)'), + CapacityStat('capacity_time_total', 'time waiting due to total capacity (usecs)'), + CapacityStat('fsync_all_fh', 'background fsync file handles synced'), + CapacityStat('fsync_all_fh_total', 'background fsync file handles considered'), + CapacityStat('fsync_all_time', 'background fsync time (msecs)', 'no_clear,no_scale'), + + ########################################## # Cursor operations ########################################## CursorStat('cursor_open_count', 'open cursor count', 'no_clear,no_scale'), @@ -333,6 +356,7 @@ connection_stats = [ # Dhandle statistics ########################################## DhandleStat('dh_conn_handle_count', 'connection data handles currently active', 'no_clear,no_scale'), + DhandleStat('dh_conn_handle_size', 'connection data handle size', 'no_clear,no_scale,size'), DhandleStat('dh_session_handles', 'session dhandles swept'), DhandleStat('dh_session_sweeps', 'session sweep attempts'), DhandleStat('dh_sweep_close', 'connection sweep dhandles closed'), diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c index 445a92ba5f8..3bf66a876fd 100644 --- 
a/src/third_party/wiredtiger/examples/c/ex_all.c +++ b/src/third_party/wiredtiger/examples/c/ex_all.c @@ -1284,6 +1284,12 @@ main(int argc, char *argv[]) /*! [Configure file_extend] */ error_check(conn->close(conn, NULL)); + /*! [Configure capacity] */ + error_check(wiredtiger_open( + home, NULL, "create,io_capacity=(total=40MB)", &conn)); + /*! [Configure capacity] */ + error_check(conn->close(conn, NULL)); + /*! [Eviction configuration] */ /* * Configure eviction to begin at 90% full, and run until the cache diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index b45b085a227..b522dcbe4b9 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "92719d6bc9a83ce45c337db6a67adcc1354cca32", + "commit": "0c6ba8d8be02dd34a46c3e9533971f1739b6ad8e", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.2" diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c index 2107fd85a7f..7211e5cfa24 100644 --- a/src/third_party/wiredtiger/src/block/block_mgr.c +++ b/src/third_party/wiredtiger/src/block/block_mgr.c @@ -505,6 +505,8 @@ static int __bm_write(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, bool data_checksum, bool checkpoint_io) { + __wt_capacity_throttle(session, buf->size, + checkpoint_io ? 
WT_THROTTLE_CKPT : WT_THROTTLE_EVICT); return (__wt_block_write(session, bm->block, buf, addr, addr_sizep, data_checksum, checkpoint_io)); } diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c index 977fb165b84..9614e1c2810 100644 --- a/src/third_party/wiredtiger/src/block/block_read.c +++ b/src/third_party/wiredtiger/src/block/block_read.c @@ -98,6 +98,7 @@ __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, block, "read", offset, size, bm->is_live, __func__, __LINE__)); #endif /* Read the block. */ + __wt_capacity_throttle(session, size, WT_THROTTLE_READ); WT_RET( __wt_block_read_off(session, block, buf, offset, size, checksum)); diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c index 4de128494d0..9edc4e0108b 100644 --- a/src/third_party/wiredtiger/src/block/block_write.c +++ b/src/third_party/wiredtiger/src/block/block_write.c @@ -351,9 +351,9 @@ __block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, * cache, but only if the current session can wait. */ if (block->os_cache_dirty_max != 0 && - (block->os_cache_dirty += align_size) > block->os_cache_dirty_max && + fh->written > block->os_cache_dirty_max && __wt_session_can_wait(session)) { - block->os_cache_dirty = 0; + fh->written = 0; if ((ret = __wt_fsync(session, fh, false)) != 0) { /* * Ignore ENOTSUP, but don't try again. diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c index d861276a843..37ee36634ff 100644 --- a/src/third_party/wiredtiger/src/btree/bt_compact.c +++ b/src/third_party/wiredtiger/src/btree/bt_compact.c @@ -30,7 +30,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) /* If the page is clean, test the original addresses. 
*/ if (__wt_page_evict_clean(page)) { - __wt_ref_info(ref, &addr, &addr_size, NULL); + __wt_ref_info(session, ref, &addr, &addr_size, NULL); if (addr == NULL) return (0); return ( @@ -249,7 +249,7 @@ __wt_compact_page_skip( * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite * won't help. */ - __wt_ref_info(ref, &addr, &addr_size, &type); + __wt_ref_info(session, ref, &addr, &addr_size, &type); WT_ASSERT(session, addr != NULL); if (addr != NULL && type != WT_CELL_ADDR_INT) { bm = S2BT(session)->bm; diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index b8f99a03df9..d12548b008e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -229,7 +229,7 @@ new_page: /* Find the matching WT_COL slot. */ if (cbt->cip_saved != cip) { if ((cell = WT_COL_PTR(page, cip)) == NULL) continue; - __wt_cell_unpack(page, cell, &unpack); + __wt_cell_unpack(session, page, cell, &unpack); if (unpack.type == WT_CELL_DEL) { if ((rle = __wt_cell_rle(&unpack)) == 1) continue; diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 7ecebc0f9d4..32310b8a341 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -376,7 +376,7 @@ new_page: if (cbt->recno < cbt->ref->ref_recno) if (cbt->cip_saved != cip) { if ((cell = WT_COL_PTR(page, cip)) == NULL) continue; - __wt_cell_unpack(page, cell, &unpack); + __wt_cell_unpack(session, page, cell, &unpack); if (unpack.type == WT_CELL_DEL) { if (__wt_cell_rle(&unpack) == 1) continue; diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 9395490b165..1ce403dba7f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -141,7 +141,7 @@ __debug_item_key(WT_DBG 
*ds, const char *tag, const void *data_arg, size_t size) return (ds->f(ds, "\t%s%s{%s}\n", tag == NULL ? "" : tag, tag == NULL ? "" : " ", __wt_buf_set_printable_format( - ds->session, data_arg, size, ds->key_format, ds->t1))); + session, data_arg, size, ds->key_format, ds->t1))); } /* @@ -170,7 +170,7 @@ __debug_item_value( return (ds->f(ds, "\t%s%s{%s}\n", tag == NULL ? "" : tag, tag == NULL ? "" : " ", __wt_buf_set_printable_format( - ds->session, data_arg, size, ds->value_format, ds->t1))); + session, data_arg, size, ds->value_format, ds->t1))); } /* @@ -527,7 +527,7 @@ __debug_dsk_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk) btree = S2BT(ds->session); - WT_CELL_FOREACH_BEGIN(btree, dsk, unpack, false) { + WT_CELL_FOREACH_BEGIN(ds->session, btree, dsk, unpack, false) { WT_RET(__debug_cell(ds, dsk, &unpack)); } WT_CELL_FOREACH_END; return (0); @@ -997,7 +997,7 @@ __debug_page_col_var(WT_DBG *ds, WT_REF *ref) unpack = NULL; rle = 1; } else { - __wt_cell_unpack(page, cell, unpack); + __wt_cell_unpack(ds->session, page, cell, unpack); rle = __wt_cell_rle(unpack); } WT_RET(__wt_snprintf( @@ -1081,7 +1081,7 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); WT_ERR(__debug_item_key(ds, "K", key->data, key->size)); - __wt_row_leaf_value_cell(page, rip, NULL, unpack); + __wt_row_leaf_value_cell(session, page, rip, NULL, unpack); WT_ERR(__debug_cell_data( ds, page, WT_PAGE_ROW_LEAF, "V", unpack)); @@ -1205,8 +1205,11 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte) else WT_RET(ds->f(ds, "\t" "txn id %" PRIu64, upd->txnid)); __wt_timestamp_to_string( - upd->timestamp, ts_string, sizeof(ts_string)); - WT_RET(ds->f(ds, ", ts %s", ts_string)); + upd->start_ts, ts_string, sizeof(ts_string)); + WT_RET(ds->f(ds, ", start_ts %s", ts_string)); + __wt_timestamp_to_string( + upd->stop_ts, ts_string, sizeof(ts_string)); + WT_RET(ds->f(ds, ", stop_ts %s", ts_string)); WT_RET(ds->f(ds, "\n")); } return (0); @@ 
-1253,7 +1256,7 @@ __debug_ref(WT_DBG *ds, WT_REF *ref) break; } - __wt_ref_info(ref, &addr, &addr_size, NULL); + __wt_ref_info(session, ref, &addr, &addr_size, NULL); return (ds->f(ds, "\t" "%p %s %s\n", (void *)ref, state, __wt_addr_string(session, addr, addr_size, ds->t1))); } diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 7168c8475da..405f00a7259 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -323,8 +323,8 @@ __tombstone_update_alloc(WT_SESSION_IMPL *session, */ if (page_del != NULL) { upd->txnid = page_del->txnid; - upd->timestamp = page_del->timestamp; - upd->durable_timestamp = page_del->timestamp; + upd->start_ts = page_del->timestamp; + upd->durable_ts = page_del->timestamp; upd->prepare_state = page_del->prepare_state; } *updp = upd; diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 38e11837e2c..6d96c2537b3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -488,15 +488,12 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) * Don't do compression adjustment for fixed-size column store, the * leaf page sizes don't change. (We could adjust internal pages but not * internal pages, but that seems an unlikely use case.) - * XXX - * Don't do compression adjustment of snappy-compressed blocks. 
*/ btree->intlpage_compadjust = false; btree->maxintlpage_precomp = btree->maxintlpage; btree->leafpage_compadjust = false; btree->maxleafpage_precomp = btree->maxleafpage; if (btree->compressor != NULL && btree->compressor->compress != NULL && - !WT_STRING_MATCH("snappy", cval.str, cval.len) && btree->type != BTREE_COL_FIX) { /* * Don't do compression adjustment when on-disk page sizes are @@ -611,6 +608,12 @@ __wt_btree_tree_open( F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); if ((ret = __wt_bt_read(session, &dsk, addr, addr_size)) == 0) ret = __wt_verify_dsk(session, tmp->data, &dsk); + /* + * Flag any failed read or verification: if we're in startup, it may + * be fatal. + */ + if (ret != 0) + F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE); if (ret != 0) __wt_err(session, ret, @@ -783,7 +786,7 @@ __btree_preload(WT_SESSION_IMPL *session) /* Pre-load the second-level internal pages. */ WT_INTL_FOREACH_BEGIN(session, btree->root.page, ref) { - __wt_ref_info(ref, &addr, &addr_size, NULL); + __wt_ref_info(session, ref, &addr, &addr_size, NULL); if (addr != NULL) WT_RET(bm->preload(bm, session, addr, addr_size)); } WT_INTL_FOREACH_END; diff --git a/src/third_party/wiredtiger/src/btree/bt_misc.c b/src/third_party/wiredtiger/src/btree/bt_misc.c index 51eb68aa51f..434dd579c5f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_misc.c +++ b/src/third_party/wiredtiger/src/btree/bt_misc.c @@ -102,7 +102,7 @@ __wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf) return (buf->data); } - __wt_ref_info(ref, &addr, &addr_size, NULL); + __wt_ref_info(session, ref, &addr, &addr_size, NULL); return (__wt_addr_string(session, addr, addr_size, buf)); } diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index 9e608114672..e254a9acf7d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -217,7 
+217,7 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell) bm = btree->bm; unpack = &_unpack; - __wt_cell_unpack(page, cell, unpack); + __wt_cell_unpack(session, page, cell, unpack); /* * Finally remove overflow key/value objects, called when reconciliation diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index db096ab12c9..bd30d3218c2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -286,7 +286,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; hint = 0; - WT_CELL_FOREACH_BEGIN(btree, page->dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, page->dsk, unpack, true) { ref = *refp++; ref->home = page; ref->pindex_hint = hint++; @@ -310,7 +310,7 @@ __inmem_col_var_repeats(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t *np) btree = S2BT(session); /* Walk the page, counting entries for the repeats array. */ - WT_CELL_FOREACH_BEGIN(btree, page->dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, page->dsk, unpack, true) { if (__wt_cell_rle(&unpack) > 1) ++*np; } WT_CELL_FOREACH_END; @@ -346,7 +346,7 @@ __inmem_col_var( */ indx = 0; cip = page->pg_var; - WT_CELL_FOREACH_BEGIN(btree, page->dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, page->dsk, unpack, true) { WT_COL_PTR_SET(cip, WT_PAGE_DISK_OFFSET(page, unpack.cell)); cip++; @@ -409,7 +409,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) refp = pindex->index; overflow_keys = false; hint = 0; - WT_CELL_FOREACH_BEGIN(btree, page->dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, page->dsk, unpack, true) { ref = *refp; ref->home = page; ref->pindex_hint = hint++; @@ -522,7 +522,7 @@ __inmem_row_leaf_entries( * single on-page (WT_CELL_VALUE) or overflow (WT_CELL_VALUE_OVFL) item. 
*/ nindx = 0; - WT_CELL_FOREACH_BEGIN(btree, dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, dsk, unpack, true) { switch (unpack.type) { case WT_CELL_KEY: case WT_CELL_KEY_OVFL: @@ -554,7 +554,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) /* Walk the page, building indices. */ rip = page->pg_row; - WT_CELL_FOREACH_BEGIN(btree, page->dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, page->dsk, unpack, true) { switch (unpack.type) { case WT_CELL_KEY_OVFL: __wt_row_leaf_key_set_cell(page, rip, unpack.cell); diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 413e94377d3..c0933d4c4f8 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -188,9 +188,9 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) session, &las_value, &upd, &incr, upd_type)); total_incr += incr; upd->txnid = las_txnid; - upd->timestamp = las_timestamp; + upd->start_ts = las_timestamp; + upd->durable_ts = durable_timestamp; upd->prepare_state = prepare_state; - upd->durable_timestamp = durable_timestamp; switch (page->type) { case WT_PAGE_COL_FIX: @@ -478,7 +478,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * only lookaside entries, and a subsequent search or insert is forcing * re-creation of the name space. */ - __wt_ref_info(ref, &addr, &addr_size, NULL); + __wt_ref_info(session, ref, &addr, &addr_size, NULL); if (addr == NULL) { WT_ASSERT(session, previous_state != WT_REF_DISK); diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c index 45dac75a56a..46dc96aedce 100644 --- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c +++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c @@ -215,7 +215,7 @@ __rebalance_col_walk( * location cookie pairs. Keys are on-page/overflow items and location * cookies are WT_CELL_ADDR_XXX items. 
*/ - WT_CELL_FOREACH_BEGIN(btree, dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, dsk, unpack, true) { switch (unpack.type) { case WT_CELL_ADDR_INT: /* An internal page: read it and recursively walk it. */ @@ -301,7 +301,7 @@ __rebalance_row_walk( * cookies are WT_CELL_ADDR_XXX items. */ first_cell = true; - WT_CELL_FOREACH_BEGIN(btree, dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, dsk, unpack, true) { switch (unpack.type) { case WT_CELL_KEY: key = unpack; diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index b25a5932284..61351c26e36 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -101,7 +101,7 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) return (0); /* Take the value from the original page cell. */ - __wt_row_leaf_value_cell(page, rip, NULL, &unpack); + __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); return (__wt_page_cell_data_ref( session, page, &unpack, &cursor->value)); @@ -110,7 +110,7 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) if (page->type == WT_PAGE_COL_VAR) { /* Take the value from the original page cell. */ cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]); - __wt_cell_unpack(page, cell, &unpack); + __wt_cell_unpack(session, page, cell, &unpack); return (__wt_page_cell_data_ref( session, page, &unpack, &cursor->value)); } diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index e98835c2eb3..a03cfb6405d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -571,7 +571,7 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, * the page. 
*/ stop_recno = dsk->recno; - WT_CELL_FOREACH_BEGIN(btree, dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, dsk, unpack, true) { stop_recno += __wt_cell_rle(&unpack); } WT_CELL_FOREACH_END; @@ -687,7 +687,7 @@ __slvg_trk_leaf_walk( /* Determine page min/max timestamps, count page overflow items. */ ovfl_cnt = 0; - WT_CELL_FOREACH_BEGIN(btree, dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, dsk, unpack, true) { if (unpack.ovfl) ++ovfl_cnt; __slvg_trk_leaf_ts(trk, &unpack); @@ -703,7 +703,7 @@ __slvg_trk_leaf_walk( trk->trk_ovfl_cnt = ovfl_cnt; ovfl_cnt = 0; - WT_CELL_FOREACH_BEGIN(btree, dsk, unpack, true) { + WT_CELL_FOREACH_BEGIN(session, btree, dsk, unpack, true) { if (unpack.ovfl) { WT_RET(__wt_memdup(session, unpack.data, unpack.size, &trk->trk_ovfl_addr[ovfl_cnt].addr)); @@ -1390,7 +1390,7 @@ __slvg_col_ovfl(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_COL_FOREACH(page, cip, i) { cell = WT_COL_PTR(page, cip); - __wt_cell_unpack(page, cell, &unpack); + __wt_cell_unpack(session, page, cell, &unpack); recno += __wt_cell_rle(&unpack); /* @@ -2127,10 +2127,10 @@ __slvg_row_ovfl(WT_SESSION_IMPL *session, (void)__wt_row_leaf_key_info( page, copy, NULL, &cell, NULL, NULL); if (cell != NULL) { - __wt_cell_unpack(page, cell, &unpack); + __wt_cell_unpack(session, page, cell, &unpack); WT_RET(__slvg_row_ovfl_single(session, trk, &unpack)); } - __wt_row_leaf_value_cell(page, rip, NULL, &unpack); + __wt_row_leaf_value_cell(session, page, rip, NULL, &unpack); WT_RET(__slvg_row_ovfl_single(session, trk, &unpack)); } return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 62d96d79ba8..f0407ce71b1 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -185,7 +185,7 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref) ikey->cell_offset = 0; cell = WT_PAGE_REF_OFFSET(page, cell_offset); - 
__wt_cell_unpack(page, cell, &kpack); + __wt_cell_unpack(session, page, cell, &kpack); if (kpack.ovfl && kpack.raw != WT_CELL_KEY_OVFL_RM) WT_RET(__wt_ovfl_discard(session, page, cell)); @@ -260,7 +260,8 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, */ WT_ORDERED_READ(ref_addr, ref->addr); if (ref_addr != NULL && !__wt_off_page(from_home, ref_addr)) { - __wt_cell_unpack(from_home, (WT_CELL *)ref_addr, &unpack); + __wt_cell_unpack( + session, from_home, (WT_CELL *)ref_addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); addr->oldest_start_ts = unpack.oldest_start_ts; addr->newest_start_ts = unpack.newest_start_ts; diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index fd73a84da5d..c201d9af73a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -165,7 +165,7 @@ __stat_page_col_var( ++deleted_cnt; } else { orig_deleted = false; - __wt_cell_unpack(page, cell, unpack); + __wt_cell_unpack(session, page, cell, unpack); if (unpack->type == WT_CELL_DEL) orig_deleted = true; else { @@ -244,7 +244,8 @@ __stat_page_row_int( * a reference to the original cell. 
*/ if (page->dsk != NULL) - WT_CELL_FOREACH_BEGIN(btree, page->dsk, unpack, false) { + WT_CELL_FOREACH_BEGIN( + session, btree, page->dsk, unpack, false) { if (__wt_cell_type(unpack.cell) == WT_CELL_KEY_OVFL) ++ovfl_cnt; } WT_CELL_FOREACH_END; @@ -293,7 +294,8 @@ __stat_page_row_leaf( upd->type != WT_UPDATE_TOMBSTONE)) ++entry_cnt; if (upd == NULL) { - __wt_row_leaf_value_cell(page, rip, NULL, &unpack); + __wt_row_leaf_value_cell( + session, page, rip, NULL, &unpack); if (unpack.type == WT_CELL_VALUE_OVFL) ++ovfl_cnt; } @@ -316,7 +318,8 @@ __stat_page_row_leaf( */ if (page->dsk != NULL) { key = false; - WT_CELL_FOREACH_BEGIN(btree, page->dsk, unpack, false) { + WT_CELL_FOREACH_BEGIN( + session, btree, page->dsk, unpack, false) { switch (__wt_cell_type(unpack.cell)) { case WT_CELL_KEY_OVFL: ++ovfl_cnt; diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index c238669efd4..e47d210cc93 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -236,9 +236,9 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) * includes it. */ memset(&addr_unpack, 0, sizeof(addr_unpack)); - addr_unpack.oldest_start_ts = - addr_unpack.newest_start_ts = WT_TS_NONE; - addr_unpack.newest_stop_ts = WT_TS_MAX; + addr_unpack.oldest_start_ts = WT_TS_NONE; + addr_unpack.newest_start_ts = + addr_unpack.newest_stop_ts = WT_TS_MAX; addr_unpack.raw = WT_CELL_ADDR_INT; /* Verify the tree. 
*/ @@ -326,6 +326,11 @@ static int __verify_addr_ts(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *unpack, WT_VSTUFF *vs) { + if (unpack->newest_stop_ts == WT_TS_NONE) + WT_RET_MSG(session, WT_ERROR, + "internal page reference at %s has a newest stop " + "timestamp of 0", + __wt_page_addr_string(session, ref, vs->tmp1)); if (unpack->oldest_start_ts > unpack->newest_start_ts) WT_RET_MSG(session, WT_ERROR, "internal page reference at %s has an oldest start " @@ -447,7 +452,7 @@ recno_chk: if (recno != vs->record_total + 1) if ((cell = WT_COL_PTR(page, cip)) == NULL) ++recno; else { - __wt_cell_unpack(page, cell, unpack); + __wt_cell_unpack(session, page, cell, unpack); recno += __wt_cell_rle(unpack); } vs->record_total += recno; @@ -534,7 +539,7 @@ celltype_err: WT_RET_MSG(session, WT_ERROR, /* Unpack the address block and check timestamps */ __wt_cell_unpack( - child_ref->home, child_ref->addr, unpack); + session, child_ref->home, child_ref->addr, unpack); WT_RET(__verify_addr_ts( session, child_ref, unpack, vs)); @@ -569,7 +574,7 @@ celltype_err: WT_RET_MSG(session, WT_ERROR, /* Unpack the address block and check timestamps */ __wt_cell_unpack( - child_ref->home, child_ref->addr, unpack); + session, child_ref->home, child_ref->addr, unpack); WT_RET(__verify_addr_ts( session, child_ref, unpack, vs)); @@ -810,7 +815,7 @@ __verify_page_cell(WT_SESSION_IMPL *session, /* Walk the page, tracking timestamps and verifying overflow pages. 
*/ cell_num = 0; - WT_CELL_FOREACH_BEGIN(btree, dsk, unpack, false) { + WT_CELL_FOREACH_BEGIN(session, btree, dsk, unpack, false) { ++cell_num; switch (unpack.type) { case WT_CELL_KEY_OVFL: @@ -839,20 +844,29 @@ __verify_page_cell(WT_SESSION_IMPL *session, case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: + if (unpack.newest_stop_ts == WT_TS_NONE) + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 " on page at %s has a " + "newest stop timestamp of 0", + cell_num - 1, + __wt_page_addr_string( + session, ref, vs->tmp1)); if (unpack.oldest_start_ts > unpack.newest_start_ts) WT_RET_MSG(session, WT_ERROR, - "cell %" PRIu32 " on page at %s has an oldest " - "start timestamp newer than its newest start " - "timestamp", - cell_num - 1, - __wt_page_addr_string(session, ref, vs->tmp1)); + "cell %" PRIu32 " on page at %s has an " + "oldest start timestamp newer than its " + "newest start timestamp", + cell_num - 1, + __wt_page_addr_string( + session, ref, vs->tmp1)); if (unpack.newest_start_ts > unpack.newest_stop_ts) WT_RET_MSG(session, WT_ERROR, - "cell %" PRIu32 " on page at %s has a newest " - "start timestamp newer than its newest stop " - "timestamp", - cell_num - 1, - __wt_page_addr_string(session, ref, vs->tmp1)); + "cell %" PRIu32 " on page at %s has a " + "newest start timestamp newer than its " + "newest stop timestamp", + cell_num - 1, + __wt_page_addr_string( + session, ref, vs->tmp1)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "oldest start", unpack.oldest_start_ts, @@ -872,12 +886,21 @@ __verify_page_cell(WT_SESSION_IMPL *session, case WT_CELL_VALUE_COPY: case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_SHORT: + if (unpack.stop_ts == WT_TS_NONE) + WT_RET_MSG(session, WT_ERROR, + "cell %" PRIu32 " on page at %s has a stop " + "timestamp of 0", + cell_num - 1, + __wt_page_addr_string( + session, ref, vs->tmp1)); if (unpack.start_ts > unpack.stop_ts) WT_RET_MSG(session, WT_ERROR, - "cell %" PRIu32 " on page at %s has a start " - 
"timestamp newer than its stop timestamp ", - cell_num - 1, - __wt_page_addr_string(session, ref, vs->tmp1)); + "cell %" PRIu32 " on page at %s has a " + "start timestamp newer than its stop " + "timestamp ", + cell_num - 1, + __wt_page_addr_string( + session, ref, vs->tmp1)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.start_ts, diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c index e6bd252d67f..5896852c1bf 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -34,7 +34,6 @@ static int __verify_dsk_row( #define WT_RET_VRFY(session, ...) do { \ if (!(F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))) \ __wt_errx(session, __VA_ARGS__); \ - F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); \ return (WT_ERROR); \ } while (0) @@ -279,16 +278,22 @@ __verify_dsk_ts(WT_SESSION_IMPL *session, case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: + if (unpack->newest_stop_ts == WT_TS_NONE) + WT_RET_VRFY(session, + "cell %" PRIu32 " on page at %s has a newest stop " + "timestamp of 0", + cell_num - 1, tag); if (unpack->oldest_start_ts > unpack->newest_start_ts) WT_RET_VRFY(session, - "cell %" PRIu32 " on page at %s has an oldest start " - "timestamp newer than its newest start timestamp", - cell_num - 1, tag); + "cell %" PRIu32 " on page at %s has an oldest " + "start timestamp newer than its newest start " + "timestamp", + cell_num - 1, tag); if (unpack->newest_start_ts > unpack->newest_stop_ts) WT_RET_VRFY(session, - "cell %" PRIu32 " on page at %s has a newest start " - "timestamp newer than its newest stop timestamp", - cell_num - 1, tag); + "cell %" PRIu32 " on page at %s has a newest start " + "timestamp newer than its newest stop timestamp", + cell_num - 1, tag); if (addr == NULL) break; @@ -311,11 +316,16 @@ __verify_dsk_ts(WT_SESSION_IMPL *session, case WT_CELL_VALUE_OVFL: case 
WT_CELL_VALUE_OVFL_RM: case WT_CELL_VALUE_SHORT: + if (unpack->stop_ts == WT_TS_NONE) + WT_RET_VRFY(session, + "cell %" PRIu32 " on page at %s has a stop " + "timestamp of 0", + cell_num - 1, tag); if (unpack->start_ts > unpack->stop_ts) WT_RET_VRFY(session, - "cell %" PRIu32 " on page at %s has a start timestamp " - "newer than its stop timestamp ", - cell_num - 1, tag); + "cell %" PRIu32 " on page at %s has a start " + "timestamp newer than its stop timestamp ", + cell_num - 1, tag); if (addr == NULL) break; @@ -384,7 +394,8 @@ __verify_dsk_row(WT_SESSION_IMPL *session, ++cell_num; /* Carefully unpack the cell. */ - if (__wt_cell_unpack_safe(dsk, cell, unpack, end) != 0) { + if (__wt_cell_unpack_safe( + session, dsk, cell, unpack, end) != 0) { ret = __err_cell_corrupt(session, cell_num, tag); goto err; } @@ -660,7 +671,7 @@ __verify_dsk_col_int(WT_SESSION_IMPL *session, ++cell_num; /* Carefully unpack the cell. */ - if (__wt_cell_unpack_safe(dsk, cell, unpack, end) != 0) + if (__wt_cell_unpack_safe(session, dsk, cell, unpack, end) != 0) return (__err_cell_corrupt(session, cell_num, tag)); /* Check the raw and collapsed cell types. */ @@ -709,32 +720,36 @@ static int __verify_dsk_col_var(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, WT_ADDR *addr) { + struct { + const void *data; + size_t size; + wt_timestamp_t start_ts, stop_ts; + bool deleted; + } last; WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_DECL_RET; - size_t last_size; uint32_t cell_num, cell_type, i; uint8_t *end; - const uint8_t *last_data; - bool last_deleted; btree = S2BT(session); bm = btree->bm; unpack = &_unpack; end = (uint8_t *)dsk + dsk->mem_size; - last_data = NULL; - last_size = 0; - last_deleted = false; + last.data = NULL; + last.size = 0; + last.start_ts = last.stop_ts = WT_TS_NONE; + last.deleted = false; cell_num = 0; WT_CELL_FOREACH_VRFY(btree, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. 
*/ - if (__wt_cell_unpack_safe(dsk, cell, unpack, end) != 0) + if (__wt_cell_unpack_safe(session, dsk, cell, unpack, end) != 0) return (__err_cell_corrupt(session, cell_num, tag)); /* Check the raw and collapsed cell types. */ @@ -762,33 +777,38 @@ __verify_dsk_col_var(WT_SESSION_IMPL *session, * a chance for RLE encoding. We don't have to care about data * encoding or anything else, a byte comparison is enough. */ - if (last_deleted) { + if (unpack->start_ts != last.start_ts || + unpack->stop_ts != last.stop_ts) + ; + else if (last.deleted) { if (cell_type == WT_CELL_DEL) goto match_err; } else if (cell_type == WT_CELL_VALUE && - last_data != NULL && - last_size == unpack->size && - memcmp(last_data, unpack->data, last_size) == 0) + last.data != NULL && + last.size == unpack->size && + memcmp(last.data, unpack->data, last.size) == 0) match_err: WT_RET_VRFY(session, "data entries %" PRIu32 " and %" PRIu32 " on page at %s are identical and should " "have been run-length encoded", cell_num - 1, cell_num, tag); + last.start_ts = unpack->start_ts; + last.stop_ts = unpack->stop_ts; switch (cell_type) { case WT_CELL_DEL: - last_deleted = true; - last_data = NULL; + last.data = NULL; + last.deleted = true; break; case WT_CELL_VALUE_OVFL: - last_deleted = false; - last_data = NULL; + last.data = NULL; + last.deleted = false; break; case WT_CELL_VALUE: - last_deleted = false; - last_data = unpack->data; - last_size = unpack->size; + last.data = unpack->data; + last.size = unpack->size; + last.deleted = false; break; } } @@ -863,7 +883,6 @@ static int __err_cell_corrupt( WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag) { - F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); WT_RET_VRFY(session, "item %" PRIu32 " on page at %s is a corrupted cell", entry_num, tag); diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index acf6643bcc5..a7cb433b56a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ 
b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -85,7 +85,7 @@ found: WT_ASSERT(session, pindex->index[slot] == ref); * Check if a reference is for a leaf page. */ static inline bool -__ref_is_leaf(WT_REF *ref) +__ref_is_leaf(WT_SESSION_IMPL *session, WT_REF *ref) { size_t addr_size; const uint8_t *addr; @@ -96,7 +96,7 @@ __ref_is_leaf(WT_REF *ref) * this page is a leaf page or not. If there's no address, the page * isn't on disk and we don't know the page type. */ - __wt_ref_info(ref, &addr, &addr_size, &type); + __wt_ref_info(session, ref, &addr, &addr_size, &type); return (addr == NULL ? false : type == WT_CELL_ADDR_LEAF || type == WT_CELL_ADDR_LEAF_NO); } @@ -650,7 +650,7 @@ __tree_walk_skip_count_callback( if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) *skipp = true; - else if (*skipleafcntp > 0 && __ref_is_leaf(ref)) { + else if (*skipleafcntp > 0 && __ref_is_leaf(session, ref)) { --*skipleafcntp; *skipp = true; } else diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c index e67c36e6661..38aea173e8c 100644 --- a/src/third_party/wiredtiger/src/btree/row_key.c +++ b/src/third_party/wiredtiger/src/btree/row_key.c @@ -262,7 +262,7 @@ switch_and_jump: /* Switching to a forward roll. */ /* * It must be an on-page cell, unpack it. */ - __wt_cell_unpack(page, cell, unpack); + __wt_cell_unpack(session, page, cell, unpack); /* 3: the test for an on-page reference to an overflow key. */ if (unpack->type == WT_CELL_KEY_OVFL) { @@ -286,7 +286,8 @@ switch_and_jump: /* Switching to a forward roll. 
*/ copy = WT_ROW_KEY_COPY(rip); if (!__wt_row_leaf_key_info(page, copy, NULL, &cell, &keyb->data, &keyb->size)) { - __wt_cell_unpack(page, cell, unpack); + __wt_cell_unpack( + session, page, cell, unpack); ret = __wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb); } diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index cd185cc75cc..a68c706ad95 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -727,12 +727,12 @@ __wt_las_insert_block(WT_CURSOR *cursor, upd->type == WT_UPDATE_MODIFY)) { las_value.size = 0; cursor->set_value(cursor, upd->txnid, - upd->timestamp, upd->durable_timestamp, + upd->start_ts, upd->durable_ts, upd->prepare_state, WT_UPDATE_BIRTHMARK, &las_value); } else cursor->set_value(cursor, upd->txnid, - upd->timestamp, upd->durable_timestamp, + upd->start_ts, upd->durable_ts, upd->prepare_state, upd->type, &las_value); /* diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 32939a97c72..90b1dc023ec 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -93,6 +93,12 @@ static const WT_CONFIG_CHECK }; static const WT_CONFIG_CHECK + confchk_wiredtiger_open_io_capacity_subconfigs[] = { + { "total", "int", NULL, "min=0,max=1TB", NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + +static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure_log_subconfigs[] = { { "archive", "boolean", NULL, NULL, NULL, 0 }, { "os_cache_dirty_pct", "int", @@ -170,6 +176,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { { "file_manager", "category", NULL, NULL, confchk_wiredtiger_open_file_manager_subconfigs, 3 }, + { "io_capacity", "category", + NULL, NULL, + confchk_wiredtiger_open_io_capacity_subconfigs, 1 }, { "log", "category", NULL, NULL, 
confchk_WT_CONNECTION_reconfigure_log_subconfigs, 4 }, @@ -191,9 +200,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { NULL, NULL, confchk_WT_CONNECTION_reconfigure_statistics_log_subconfigs, 5 }, { "timing_stress_for_test", "list", - NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\"," - "\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\"," - "\"split_6\",\"split_7\",\"split_8\"]", + NULL, "choices=[\"aggressive_sweep\",\"checkpoint_slow\"," + "\"lookaside_sweep_race\",\"split_1\",\"split_2\",\"split_3\"," + "\"split_4\",\"split_5\",\"split_6\",\"split_7\",\"split_8\"]", NULL, 0 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\"," @@ -876,6 +885,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { confchk_wiredtiger_open_file_manager_subconfigs, 3 }, { "hazard_max", "int", NULL, "min=15", NULL, 0 }, { "in_memory", "boolean", NULL, NULL, NULL, 0 }, + { "io_capacity", "category", + NULL, NULL, + confchk_wiredtiger_open_io_capacity_subconfigs, 1 }, { "log", "category", NULL, NULL, confchk_wiredtiger_open_log_subconfigs, 9 }, @@ -904,9 +916,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { NULL, NULL, confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "timing_stress_for_test", "list", - NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\"," - "\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\"," - "\"split_6\",\"split_7\",\"split_8\"]", + NULL, "choices=[\"aggressive_sweep\",\"checkpoint_slow\"," + "\"lookaside_sweep_race\",\"split_1\",\"split_2\",\"split_3\"," + "\"split_4\",\"split_5\",\"split_6\",\"split_7\",\"split_8\"]", NULL, 0 }, { "transaction_sync", "category", NULL, NULL, @@ -982,6 +994,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { confchk_wiredtiger_open_file_manager_subconfigs, 3 }, { "hazard_max", "int", NULL, "min=15", NULL, 0 }, { "in_memory", "boolean", NULL, NULL, NULL, 0 }, + { "io_capacity", "category", + NULL, 
NULL, + confchk_wiredtiger_open_io_capacity_subconfigs, 1 }, { "log", "category", NULL, NULL, confchk_wiredtiger_open_log_subconfigs, 9 }, @@ -1010,9 +1025,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { NULL, NULL, confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "timing_stress_for_test", "list", - NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\"," - "\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\"," - "\"split_6\",\"split_7\",\"split_8\"]", + NULL, "choices=[\"aggressive_sweep\",\"checkpoint_slow\"," + "\"lookaside_sweep_race\",\"split_1\",\"split_2\",\"split_3\"," + "\"split_4\",\"split_5\",\"split_6\",\"split_7\",\"split_8\"]", NULL, 0 }, { "transaction_sync", "category", NULL, NULL, @@ -1085,6 +1100,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { NULL, NULL, confchk_wiredtiger_open_file_manager_subconfigs, 3 }, { "hazard_max", "int", NULL, "min=15", NULL, 0 }, + { "io_capacity", "category", + NULL, NULL, + confchk_wiredtiger_open_io_capacity_subconfigs, 1 }, { "log", "category", NULL, NULL, confchk_wiredtiger_open_log_subconfigs, 9 }, @@ -1113,9 +1131,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { NULL, NULL, confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "timing_stress_for_test", "list", - NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\"," - "\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\"," - "\"split_6\",\"split_7\",\"split_8\"]", + NULL, "choices=[\"aggressive_sweep\",\"checkpoint_slow\"," + "\"lookaside_sweep_race\",\"split_1\",\"split_2\",\"split_3\"," + "\"split_4\",\"split_5\",\"split_6\",\"split_7\",\"split_8\"]", NULL, 0 }, { "transaction_sync", "category", NULL, NULL, @@ -1186,6 +1204,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { NULL, NULL, confchk_wiredtiger_open_file_manager_subconfigs, 3 }, { "hazard_max", "int", NULL, "min=15", NULL, 0 }, + { "io_capacity", "category", + NULL, NULL, + 
confchk_wiredtiger_open_io_capacity_subconfigs, 1 }, { "log", "category", NULL, NULL, confchk_wiredtiger_open_log_subconfigs, 9 }, @@ -1214,9 +1235,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { NULL, NULL, confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "timing_stress_for_test", "list", - NULL, "choices=[\"checkpoint_slow\",\"lookaside_sweep_race\"," - "\"split_1\",\"split_2\",\"split_3\",\"split_4\",\"split_5\"," - "\"split_6\",\"split_7\",\"split_8\"]", + NULL, "choices=[\"aggressive_sweep\",\"checkpoint_slow\"," + "\"lookaside_sweep_race\",\"split_1\",\"split_2\",\"split_3\"," + "\"split_4\",\"split_5\",\"split_6\",\"split_7\",\"split_8\"]", NULL, 0 }, { "transaction_sync", "category", NULL, NULL, @@ -1294,15 +1315,15 @@ static const WT_CONFIG_ENTRY config_entries[] = { "eviction_checkpoint_target=1,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",file_manager=(close_handle_minimum=250,close_idle_time=30," - "close_scan_interval=10),log=(archive=true,os_cache_dirty_pct=0," - "prealloc=true,zero_fill=false),lsm_manager=(merge=true," - "worker_thread_max=4),lsm_merge=true," + "close_scan_interval=10),io_capacity=(total=0),log=(archive=true," + "os_cache_dirty_pct=0,prealloc=true,zero_fill=false)," + "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," "operation_tracking=(enabled=false,path=\".\")," "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," "statistics=none,statistics_log=(json=false,on_close=false," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "timing_stress_for_test=,verbose=", - confchk_WT_CONNECTION_reconfigure, 23 + confchk_WT_CONNECTION_reconfigure, 24 }, { "WT_CONNECTION.rollback_to_stable", "", @@ -1548,19 +1569,20 @@ static const WT_CONFIG_ENTRY config_entries[] = { "eviction_target=80,eviction_trigger=95,exclusive=false," "extensions=,file_extend=,file_manager=(close_handle_minimum=250," 
"close_idle_time=30,close_scan_interval=10),hazard_max=1000," - "in_memory=false,log=(archive=true,compressor=,enabled=false," - "file_max=100MB,os_cache_dirty_pct=0,path=\".\",prealloc=true," - "recover=on,zero_fill=false),lsm_manager=(merge=true," - "worker_thread_max=4),lsm_merge=true,mmap=true,multiprocess=false" - ",operation_tracking=(enabled=false,path=\".\"),readonly=false," - "salvage=false,session_max=100,session_scratch_max=2MB," - "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0," - "reserve=0,size=500MB),statistics=none,statistics_log=(json=false" - ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" - ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false" - ",method=fsync),use_environment=true,use_environment_priv=false," + "in_memory=false,io_capacity=(total=0),log=(archive=true," + "compressor=,enabled=false,file_max=100MB,os_cache_dirty_pct=0," + "path=\".\",prealloc=true,recover=on,zero_fill=false)," + "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," + "mmap=true,multiprocess=false,operation_tracking=(enabled=false," + "path=\".\"),readonly=false,salvage=false,session_max=100," + "session_scratch_max=2MB,session_table_cache=true," + "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," + "statistics=none,statistics_log=(json=false,on_close=false," + "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," + "timing_stress_for_test=,transaction_sync=(enabled=false," + "method=fsync),use_environment=true,use_environment_priv=false," "verbose=,write_through=", - confchk_wiredtiger_open, 47 + confchk_wiredtiger_open, 48 }, { "wiredtiger_open_all", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" @@ -1575,19 +1597,20 @@ static const WT_CONFIG_ENTRY config_entries[] = { "eviction_target=80,eviction_trigger=95,exclusive=false," "extensions=,file_extend=,file_manager=(close_handle_minimum=250," "close_idle_time=30,close_scan_interval=10),hazard_max=1000," - 
"in_memory=false,log=(archive=true,compressor=,enabled=false," - "file_max=100MB,os_cache_dirty_pct=0,path=\".\",prealloc=true," - "recover=on,zero_fill=false),lsm_manager=(merge=true," - "worker_thread_max=4),lsm_merge=true,mmap=true,multiprocess=false" - ",operation_tracking=(enabled=false,path=\".\"),readonly=false," - "salvage=false,session_max=100,session_scratch_max=2MB," - "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0," - "reserve=0,size=500MB),statistics=none,statistics_log=(json=false" - ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" - ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false" - ",method=fsync),use_environment=true,use_environment_priv=false," + "in_memory=false,io_capacity=(total=0),log=(archive=true," + "compressor=,enabled=false,file_max=100MB,os_cache_dirty_pct=0," + "path=\".\",prealloc=true,recover=on,zero_fill=false)," + "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," + "mmap=true,multiprocess=false,operation_tracking=(enabled=false," + "path=\".\"),readonly=false,salvage=false,session_max=100," + "session_scratch_max=2MB,session_table_cache=true," + "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," + "statistics=none,statistics_log=(json=false,on_close=false," + "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," + "timing_stress_for_test=,transaction_sync=(enabled=false," + "method=fsync),use_environment=true,use_environment_priv=false," "verbose=,version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_all, 48 + confchk_wiredtiger_open_all, 49 }, { "wiredtiger_open_basecfg", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" @@ -1601,18 +1624,19 @@ static const WT_CONFIG_ENTRY config_entries[] = { "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," - 
"log=(archive=true,compressor=,enabled=false,file_max=100MB," - "os_cache_dirty_pct=0,path=\".\",prealloc=true,recover=on," - "zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4)," - "lsm_merge=true,mmap=true,multiprocess=false," - "operation_tracking=(enabled=false,path=\".\"),readonly=false," - "salvage=false,session_max=100,session_scratch_max=2MB," - "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0," - "reserve=0,size=500MB),statistics=none,statistics_log=(json=false" - ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" - ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false" - ",method=fsync),verbose=,version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_basecfg, 42 + "io_capacity=(total=0),log=(archive=true,compressor=," + "enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\"," + "prealloc=true,recover=on,zero_fill=false)," + "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," + "mmap=true,multiprocess=false,operation_tracking=(enabled=false," + "path=\".\"),readonly=false,salvage=false,session_max=100," + "session_scratch_max=2MB,session_table_cache=true," + "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," + "statistics=none,statistics_log=(json=false,on_close=false," + "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," + "timing_stress_for_test=,transaction_sync=(enabled=false," + "method=fsync),verbose=,version=(major=0,minor=0),write_through=", + confchk_wiredtiger_open_basecfg, 43 }, { "wiredtiger_open_usercfg", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" @@ -1626,18 +1650,19 @@ static const WT_CONFIG_ENTRY config_entries[] = { "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," - "log=(archive=true,compressor=,enabled=false,file_max=100MB," - 
"os_cache_dirty_pct=0,path=\".\",prealloc=true,recover=on," - "zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4)," - "lsm_merge=true,mmap=true,multiprocess=false," - "operation_tracking=(enabled=false,path=\".\"),readonly=false," - "salvage=false,session_max=100,session_scratch_max=2MB," - "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0," - "reserve=0,size=500MB),statistics=none,statistics_log=(json=false" - ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" - ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false" - ",method=fsync),verbose=,write_through=", - confchk_wiredtiger_open_usercfg, 41 + "io_capacity=(total=0),log=(archive=true,compressor=," + "enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\"," + "prealloc=true,recover=on,zero_fill=false)," + "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," + "mmap=true,multiprocess=false,operation_tracking=(enabled=false," + "path=\".\"),readonly=false,salvage=false,session_max=100," + "session_scratch_max=2MB,session_table_cache=true," + "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," + "statistics=none,statistics_log=(json=false,on_close=false," + "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," + "timing_stress_for_test=,transaction_sync=(enabled=false," + "method=fsync),verbose=,write_through=", + confchk_wiredtiger_open_usercfg, 42 }, { NULL, NULL, NULL, 0 } }; diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 43d2ee47afd..0630bdb3711 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -2019,6 +2019,7 @@ __wt_timing_stress_config(WT_SESSION_IMPL *session, const char *cfg[]) * conditions aren't encountered. 
*/ static const WT_NAME_FLAG stress_types[] = { + { "aggressive_sweep", WT_TIMING_STRESS_AGGRESSIVE_SWEEP }, { "checkpoint_slow", WT_TIMING_STRESS_CHECKPOINT_SLOW }, { "lookaside_sweep_race",WT_TIMING_STRESS_LOOKASIDE_SWEEP }, { "split_1", WT_TIMING_STRESS_SPLIT_1 }, @@ -2138,6 +2139,7 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[]) "config_base=," "create=," "encryption=(secretkey=)," + "error_prefix=," "exclusive=," "in_memory=," "log=(recover=)," diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c index ed01390955b..0e15841c59a 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -52,10 +52,10 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) WT_DECL_RET; uint64_t chunk, quota, reserve, size, used_cache; char *pool_name; - bool created, updating; + bool cp_locked, created, updating; conn = S2C(session); - created = updating = false; + cp_locked = created = updating = false; pool_name = NULL; cp = NULL; @@ -117,7 +117,16 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) "Attempting to join a cache pool that does not exist: %s", pool_name); + /* + * At this point we have a cache pool to use. We need to take its + * lock. We need to drop the process lock first to avoid deadlock + * and acquire in the proper order. 
+ */ + __wt_spin_unlock(session, &__wt_process.spinlock); cp = __wt_process.cache_pool; + __wt_spin_lock(session, &cp->cache_pool_lock); + cp_locked = true; + __wt_spin_lock(session, &__wt_process.spinlock); /* * The cache pool requires a reference count to avoid a race between @@ -209,6 +218,8 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) conn->cache->cp_reserved = reserve; conn->cache->cp_quota = quota; + __wt_spin_unlock(session, &cp->cache_pool_lock); + cp_locked = false; /* Wake up the cache pool server so any changes are noticed. */ if (updating) @@ -221,6 +232,8 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) F_SET(conn, WT_CONN_CACHE_POOL); err: __wt_spin_unlock(session, &__wt_process.spinlock); + if (cp_locked) + __wt_spin_unlock(session, &cp->cache_pool_lock); __wt_free(session, pool_name); if (ret != 0 && created) { __wt_free(session, cp->name); diff --git a/src/third_party/wiredtiger/src/conn/conn_capacity.c b/src/third_party/wiredtiger/src/conn/conn_capacity.c new file mode 100644 index 00000000000..0dd6a8c3c6d --- /dev/null +++ b/src/third_party/wiredtiger/src/conn/conn_capacity.c @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2014-2019 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * Compute the time in nanoseconds that must be reserved to represent + * a number of bytes in a subsystem with a particular capacity per second. + */ +#define WT_RESERVATION_NS(bytes, capacity) \ + (((bytes) * WT_BILLION) / (capacity)) + +/* + * The fraction of a second's worth of capacity that will be stolen at a + * time. The number of bytes this represents may be different for different + * subsystems, since each subsystem has its own capacity per second. + */ +#define WT_STEAL_FRACTION(x) ((x) / 16) + +/* + * __capacity_config -- + * Set I/O capacity configuration. 
+ */ +static int +__capacity_config(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CAPACITY *cap; + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + uint64_t total; + + conn = S2C(session); + + WT_RET(__wt_config_gets(session, cfg, "io_capacity.total", &cval)); + if (cval.val != 0 && cval.val < WT_THROTTLE_MIN) + WT_RET_MSG(session, EINVAL, + "total I/O capacity value %" PRId64 " below minimum %d", + cval.val, WT_THROTTLE_MIN); + + cap = &conn->capacity; + cap->total = total = (uint64_t)cval.val; + if (cval.val != 0) { + /* + * We've been given a total capacity, set the + * capacity of all the subsystems. + */ + cap->ckpt = WT_CAPACITY_SYS(total, WT_CAP_CKPT); + cap->evict = WT_CAPACITY_SYS(total, WT_CAP_EVICT); + cap->log = WT_CAPACITY_SYS(total, WT_CAP_LOG); + cap->read = WT_CAPACITY_SYS(total, WT_CAP_READ); + + /* + * Set the threshold to the percent of our capacity to + * periodically asynchronously flush what we've written. + */ + cap->threshold = ((cap->ckpt + cap->evict + cap->log) / + 100) * WT_CAPACITY_PCT; + if (cap->threshold < WT_CAPACITY_MIN_THRESHOLD) + cap->threshold = WT_CAPACITY_MIN_THRESHOLD; + WT_STAT_CONN_SET(session, capacity_threshold, cap->threshold); + } else + WT_STAT_CONN_SET(session, capacity_threshold, 0); + + return (0); +} + +/* + * __capacity_server_run_chk -- + * Check to decide if the capacity server should continue running. + */ +static bool +__capacity_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_CAPACITY)); +} + +/* + * __capacity_server -- + * The capacity server thread. + */ +static WT_THREAD_RET +__capacity_server(void *arg) +{ + WT_CAPACITY *cap; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION_IMPL *session; + uint64_t start, stop, time_ms; + + session = arg; + conn = S2C(session); + cap = &conn->capacity; + for (;;) { + /* + * Wait until signalled but check once per second in case + * the signal was missed. 
+ */ + __wt_cond_wait(session, + conn->capacity_cond, WT_MILLION, __capacity_server_run_chk); + + /* Check if we're quitting or being reconfigured. */ + if (!__capacity_server_run_chk(session)) + break; + + cap->signalled = false; + if (cap->written < cap->threshold) + continue; + + start = __wt_clock(session); + WT_ERR(__wt_fsync_background(session)); + stop = __wt_clock(session); + time_ms = WT_CLOCKDIFF_MS(stop, start); + WT_STAT_CONN_SET(session, fsync_all_time, time_ms); + cap->written = 0; + } + + if (0) { +err: WT_PANIC_MSG(session, ret, "capacity server error"); + } + return (WT_THREAD_RET_VALUE); +} + +/* + * __capacity_server_start -- + * Start the capacity server thread. + */ +static int +__capacity_server_start(WT_CONNECTION_IMPL *conn) +{ + WT_SESSION_IMPL *session; + + F_SET(conn, WT_CONN_SERVER_CAPACITY); + + /* + * The capacity server gets its own session. + */ + WT_RET(__wt_open_internal_session(conn, + "capacity-server", false, 0, &conn->capacity_session)); + session = conn->capacity_session; + + WT_RET(__wt_cond_alloc(session, + "capacity server", &conn->capacity_cond)); + + /* + * Start the thread. + */ + WT_RET(__wt_thread_create( + session, &conn->capacity_tid, __capacity_server, session)); + conn->capacity_tid_set = true; + + return (0); +} + +/* + * __wt_capacity_server_create -- + * Configure and start the capacity server. + */ +int +__wt_capacity_server_create(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + /* + * Stop any server that is already running. This means that each time + * reconfigure is called we'll bounce the server even if there are no + * configuration changes. 
This makes our life easier as the underlying + * configuration routine doesn't have to worry about freeing objects + * in the connection structure (it's guaranteed to always start with a + * blank slate), and we don't have to worry about races where a running + * server is reading configuration information that we're updating, and + * it's not expected that reconfiguration will happen a lot. + */ + if (conn->capacity_session != NULL) + WT_RET(__wt_capacity_server_destroy(session)); + WT_RET(__capacity_config(session, cfg)); + + /* + * If it is a read only connection or if background fsync is not + * supported, then there is nothing to do. + */ + if (F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY) || + !__wt_fsync_background_chk(session)) + return (0); + + if (conn->capacity.total != 0) + WT_RET(__capacity_server_start(conn)); + + return (0); +} + +/* + * __wt_capacity_server_destroy -- + * Destroy the capacity server thread. + */ +int +__wt_capacity_server_destroy(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION *wt_session; + + conn = S2C(session); + + F_CLR(conn, WT_CONN_SERVER_CAPACITY); + if (conn->capacity_tid_set) { + __wt_cond_signal(session, conn->capacity_cond); + WT_TRET(__wt_thread_join(session, &conn->capacity_tid)); + conn->capacity_tid_set = false; + } + __wt_cond_destroy(session, &conn->capacity_cond); + + /* Close the server thread's session. */ + if (conn->capacity_session != NULL) { + wt_session = &conn->capacity_session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + } + + /* + * Ensure capacity settings are cleared - so that reconfigure doesn't + * get confused. + */ + conn->capacity_session = NULL; + conn->capacity_tid_set = false; + conn->capacity_cond = NULL; + + return (ret); +} + +/* + * __capacity_signal -- + * Signal the capacity thread if sufficient data has been written. 
+ */ +static void +__capacity_signal(WT_SESSION_IMPL *session) +{ + WT_CAPACITY *cap; + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + cap = &conn->capacity; + if (cap->written >= cap->threshold && !cap->signalled) { + __wt_cond_signal(session, conn->capacity_cond); + cap->signalled = true; + } +} + +/* + * __capacity_reserve -- + * Make a reservation for the given number of bytes against + * the capacity of the subsystem. + */ +static void +__capacity_reserve(uint64_t *reservation, uint64_t bytes, uint64_t capacity, + uint64_t now_ns, uint64_t *result) +{ + uint64_t res_len, res_value; + + if (capacity != 0) { + res_len = WT_RESERVATION_NS(bytes, capacity); + res_value = __wt_atomic_add64(reservation, res_len); + if (now_ns > res_value && now_ns - res_value > WT_BILLION) + /* + * If the reservation clock is out of date, bring it + * to within a second of a current time. + */ + (void)__wt_atomic_store64(reservation, + (now_ns - WT_BILLION) + res_len); + } else + res_value = now_ns; + + *result = res_value; +} + +/* + * __wt_capacity_throttle -- + * Reserve a time to perform a write operation for the subsystem, + * and wait until that time. + * + * The concept is that each write to a subsystem reserves a time slot + * to do its write, and atomically adjusts the reservation marker to + * point past the reserved slot. The size of the adjustment (i.e. the + * length of time represented by the slot in nanoseconds) is chosen to + * be proportional to the number of bytes to be written, and the + * proportion is a simple calculation so that we can fit reservations for + * exactly the configured capacity in a second. Reservation times are + * in nanoseconds since the epoch. 
+ */ +void +__wt_capacity_throttle(WT_SESSION_IMPL *session, uint64_t bytes, + WT_THROTTLE_TYPE type) +{ + struct timespec now; + WT_CAPACITY *cap; + WT_CONNECTION_IMPL *conn; + uint64_t best_res, capacity, new_res, now_ns, sleep_us, res_total_value; + uint64_t res_value, steal_capacity, stolen_bytes, this_res; + uint64_t *reservation, *steal; + uint64_t total_capacity; + + conn = S2C(session); + cap = &conn->capacity; + /* If not using capacity there's nothing to do. */ + if (cap->total == 0) + return; + + capacity = steal_capacity = 0; + reservation = steal = NULL; + switch (type) { + case WT_THROTTLE_CKPT: + capacity = cap->ckpt; + reservation = &cap->reservation_ckpt; + WT_STAT_CONN_INCRV(session, capacity_bytes_ckpt, bytes); + break; + case WT_THROTTLE_EVICT: + capacity = cap->evict; + reservation = &cap->reservation_evict; + WT_STAT_CONN_INCRV(session, capacity_bytes_evict, bytes); + break; + case WT_THROTTLE_LOG: + capacity = cap->log; + reservation = &cap->reservation_log; + WT_STAT_CONN_INCRV(session, capacity_bytes_log, bytes); + break; + case WT_THROTTLE_READ: + capacity = cap->read; + reservation = &cap->reservation_read; + WT_STAT_CONN_INCRV(session, capacity_bytes_read, bytes); + break; + } + total_capacity = cap->total; + + /* + * Right now no subsystem can be individually turned off, but it is + * certainly a possibility to consider one subsystem may be turned off + * at some point in the future. If this subsystem is not throttled + * there's nothing to do. + */ + if (capacity == 0 || F_ISSET(conn, WT_CONN_RECOVERING)) + return; + + /* + * There may in fact be some reads done under the umbrella of log + * I/O, but they are mostly done under recovery. And if we are + * recovering, we don't reach this code. 
+ */ + if (type != WT_THROTTLE_READ) { + (void)__wt_atomic_addv64(&cap->written, bytes); + WT_STAT_CONN_INCRV(session, capacity_bytes_written, bytes); + __capacity_signal(session); + } + + /* If we get sizes larger than this, later calculations may overflow. */ + WT_ASSERT(session, bytes < 16 * (uint64_t)WT_GIGABYTE); + WT_ASSERT(session, capacity != 0); + + /* Get the current time in nanoseconds since the epoch. */ + __wt_epoch(session, &now); + now_ns = (uint64_t)now.tv_sec * WT_BILLION + (uint64_t)now.tv_nsec; + +again: + /* Take a reservation for the subsystem, and for the total */ + __capacity_reserve(reservation, bytes, capacity, now_ns, &res_value); + __capacity_reserve(&cap->reservation_total, bytes, total_capacity, + now_ns, &res_total_value); + + /* + * If we ended up with a future reservation, and we aren't constricted + * by the total capacity, then we may be able to reallocate some + * unused reservation time from another subsystem. + */ + if (res_value > now_ns && res_total_value < now_ns && steal == NULL && + total_capacity != 0) { + best_res = now_ns - WT_BILLION / 2; + if (type != WT_THROTTLE_CKPT && + (this_res = cap->reservation_ckpt) < best_res) { + steal = &cap->reservation_ckpt; + steal_capacity = cap->ckpt; + best_res = this_res; + } + if (type != WT_THROTTLE_EVICT && + (this_res = cap->reservation_evict) < best_res) { + steal = &cap->reservation_evict; + steal_capacity = cap->evict; + best_res = this_res; + } + if (type != WT_THROTTLE_LOG && + (this_res = cap->reservation_log) < best_res) { + steal = &cap->reservation_log; + steal_capacity = cap->log; + best_res = this_res; + } + if (type != WT_THROTTLE_READ && + (this_res = cap->reservation_read) < best_res) { + steal = &cap->reservation_read; + steal_capacity = cap->read; + best_res = this_res; + } + + if (steal != NULL) { + /* + * We have a subsystem that has enough spare capacity + * to steal. We'll take a small slice (a fraction + * of a second worth) and add it to our own subsystem. 
+ */ + if (best_res < now_ns - WT_BILLION && + now_ns > WT_BILLION) + new_res = now_ns - WT_BILLION; + else + new_res = best_res; + WT_ASSERT(session, steal_capacity != 0); + new_res += WT_STEAL_FRACTION(WT_BILLION) + + WT_RESERVATION_NS(bytes, steal_capacity); + if (!__wt_atomic_casv64(steal, best_res, new_res)) { + /* + * Give up our reservations and try again. + * We won't try to steal the next time. + */ + (void)__wt_atomic_sub64(reservation, + WT_RESERVATION_NS(bytes, capacity)); + (void)__wt_atomic_sub64(&cap->reservation_total, + WT_RESERVATION_NS(bytes, total_capacity)); + goto again; + } + + /* + * We've stolen a fraction of a second of capacity. + * Figure out how many bytes that is, before adding + * that many bytes to the acquiring subsystem's + * capacity. + */ + stolen_bytes = WT_STEAL_FRACTION(steal_capacity); + res_value = __wt_atomic_sub64(reservation, + WT_RESERVATION_NS(stolen_bytes, capacity)); + } + } + if (res_value < res_total_value) + res_value = res_total_value; + + if (res_value > now_ns) { + sleep_us = (res_value - now_ns) / WT_THOUSAND; + if (res_value == res_total_value) + WT_STAT_CONN_INCRV(session, + capacity_time_total, sleep_us); + else + switch (type) { + case WT_THROTTLE_CKPT: + WT_STAT_CONN_INCRV(session, + capacity_time_ckpt, sleep_us); + break; + case WT_THROTTLE_EVICT: + WT_STAT_CONN_INCRV(session, + capacity_time_evict, sleep_us); + break; + case WT_THROTTLE_LOG: + WT_STAT_CONN_INCRV(session, + capacity_time_log, sleep_us); + break; + case WT_THROTTLE_READ: + WT_STAT_CONN_INCRV(session, + capacity_time_read, sleep_us); + break; + } + if (sleep_us > WT_CAPACITY_SLEEP_CUTOFF_US) + /* Sleep handles large usec values. 
*/ + __wt_sleep(0, sleep_us); + } +} diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index 28ad155ff53..27d1e6a620d 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -56,6 +56,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) /* Initialize transaction support. */ WT_RET(__wt_txn_global_init(session, cfg)); + WT_STAT_CONN_SET(session, dh_conn_handle_size, sizeof(WT_DATA_HANDLE)); return (0); } @@ -101,6 +102,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS); WT_FULL_BARRIER(); + WT_TRET(__wt_capacity_server_destroy(session)); WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, true)); WT_TRET(__wt_sweep_destroy(session)); @@ -251,6 +253,9 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) /* Start the optional async threads. */ WT_RET(__wt_async_create(session, cfg)); + /* Start the optional capacity thread. */ + WT_RET(__wt_capacity_server_create(session, cfg)); + /* Start the optional checkpoint thread. 
*/ WT_RET(__wt_checkpoint_server_create(session, cfg)); diff --git a/src/third_party/wiredtiger/src/conn/conn_reconfig.c b/src/third_party/wiredtiger/src/conn/conn_reconfig.c index e56e76c8fd6..c6d7203f08e 100644 --- a/src/third_party/wiredtiger/src/conn/conn_reconfig.c +++ b/src/third_party/wiredtiger/src/conn/conn_reconfig.c @@ -475,6 +475,7 @@ __wt_conn_reconfig(WT_SESSION_IMPL *session, const char **cfg) WT_ERR(__wt_conn_statistics_config(session, cfg)); WT_ERR(__wt_async_reconfig(session, cfg)); WT_ERR(__wt_cache_config(session, true, cfg)); + WT_ERR(__wt_capacity_server_create(session, cfg)); WT_ERR(__wt_checkpoint_server_create(session, cfg)); WT_ERR(__wt_logmgr_reconfig(session, cfg)); WT_ERR(__wt_lsm_manager_reconfig(session, cfg)); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index c8681c13427..673f7c81399 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -278,13 +278,18 @@ __sweep_server(void *arg) WT_DECL_RET; WT_SESSION_IMPL *session; time_t last, now; - uint64_t last_las_sweep_id, min_sleep, oldest_id; + uint64_t last_las_sweep_id, min_sleep, oldest_id, sweep_interval; u_int dead_handles; session = arg; conn = S2C(session); last_las_sweep_id = WT_TXN_NONE; min_sleep = WT_MIN(WT_LAS_SWEEP_SEC, conn->sweep_interval); + if (FLD_ISSET(conn->timing_stress_flags, + WT_TIMING_STRESS_AGGRESSIVE_SWEEP)) + sweep_interval = conn->sweep_interval / 10; + else + sweep_interval = conn->sweep_interval; /* * Sweep for dead and excess handles. @@ -292,8 +297,14 @@ __sweep_server(void *arg) __wt_seconds(session, &last); for (;;) { /* Wait until the next event. 
*/ - __wt_cond_wait(session, conn->sweep_cond, - min_sleep * WT_MILLION, __sweep_server_run_chk); + if (FLD_ISSET(conn->timing_stress_flags, + WT_TIMING_STRESS_AGGRESSIVE_SWEEP)) + __wt_cond_wait(session, conn->sweep_cond, + min_sleep * 100 * WT_THOUSAND, + __sweep_server_run_chk); + else + __wt_cond_wait(session, conn->sweep_cond, + min_sleep * WT_MILLION, __sweep_server_run_chk); /* Check if we're quitting or being reconfigured. */ if (!__sweep_server_run_chk(session)) @@ -312,7 +323,9 @@ __sweep_server(void *arg) * bringing in and evicting pages from the lookaside table, * which will stop the cache from moving into the stuck state. */ - if (now - last >= WT_LAS_SWEEP_SEC && + if ((FLD_ISSET(conn->timing_stress_flags, + WT_TIMING_STRESS_AGGRESSIVE_SWEEP) || + now - last >= WT_LAS_SWEEP_SEC) && !__wt_las_empty(session) && !__wt_cache_stuck(session)) { oldest_id = __wt_txn_oldest_id(session); @@ -327,7 +340,7 @@ __sweep_server(void *arg) * less frequently than the lookaside table by default and the * frequency is controlled by a user setting. */ - if ((uint64_t)(now - last) < conn->sweep_interval) + if ((uint64_t)(now - last) < sweep_interval) continue; WT_STAT_CONN_INCR(session, dh_sweeps); /* @@ -350,6 +363,9 @@ __sweep_server(void *arg) if (dead_handles > 0) WT_ERR(__sweep_remove_handles(session)); + + /* Remember the last sweep time. */ + last = now; } if (0) { diff --git a/src/third_party/wiredtiger/src/docs/programming.dox b/src/third_party/wiredtiger/src/docs/programming.dox index 3ddb0c376c5..960babfc146 100644 --- a/src/third_party/wiredtiger/src/docs/programming.dox +++ b/src/third_party/wiredtiger/src/docs/programming.dox @@ -68,6 +68,7 @@ each of which is ordered by one or more columns. 
- @subpage_single tune_build_options - @subpage_single tune_bulk_load - @subpage_single tune_cache +- @subpage_single tune_capacity - @subpage_single tune_checksum - @subpage_single tune_close - @subpage_single tune_cursor_persist diff --git a/src/third_party/wiredtiger/src/docs/tune-capacity.dox b/src/third_party/wiredtiger/src/docs/tune-capacity.dox new file mode 100644 index 00000000000..3aad4997576 --- /dev/null +++ b/src/third_party/wiredtiger/src/docs/tune-capacity.dox @@ -0,0 +1,38 @@ +/*! @page tune_capacity Capacity tuning + +In some cases, it can be helpful to constrain the overall I/O bandwidth +generated by the database. This can be beneficial when resources are shared, +for example, in cloud or virtual environments. + +The total bandwidth capacity is configured by setting the +\c io_capacity configuration string when calling the ::wiredtiger_open +function. The capacity can be adjusted with WT_CONNECTION::reconfigure. + +An example of setting a capacity limit to 40MB per second: + +@snippet ex_all.c Configure capacity + +When a total capacity is set the volume of system reads and writes totalled +will not exceed the given I/O capacity. +If a read or write is scheduled and would overflow the capacity, the issuing +thread will sleep to guarantee the capacity ceiling. The policy used is +fair to all threads, and gives some weight to both readers and writers to +try to ensure that each session can make progress when bandwidth +resources are limited. + +System reads and writes do not directly translate to disk I/O +operations. These operations go through the operating system cache. To ensure +the steady flow of data to the disk, setting a capacity also enables an +additional thread that monitors the writes performed for each file. For each +file that has sufficient data written to it, a call to an +asynchronous \c fsync will be made. 
This call normally queues the flush +in the operating system, though there is no guarantee about when it will +actually occur. On Windows, there is no equivalent support for asynchronously +scheduling writes to disk, so this extra "sync" thread is not active. + +When a total capacity is not set, or equivalently, when it is set to 0, +there are no capacity constraints on the database, and pauses will never +be inserted before I/O is done, nor are extra asynchronous \c fsync calls +performed. + + */ diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h index 584149d4379..8efaf10dd2b 100644 --- a/src/third_party/wiredtiger/src/include/block.h +++ b/src/third_party/wiredtiger/src/include/block.h @@ -234,7 +234,6 @@ struct __wt_block { uint32_t allocsize; /* Allocation size */ size_t os_cache; /* System buffer cache flush max */ size_t os_cache_max; - size_t os_cache_dirty; /* System buffer cache write max */ size_t os_cache_dirty_max; u_int block_header; /* Header length */ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 2450f90a3a6..14d5a04b096 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -827,8 +827,9 @@ struct __wt_page { */ struct __wt_page_deleted { volatile uint64_t txnid; /* Transaction ID */ - wt_timestamp_t timestamp; - wt_timestamp_t durable_timestamp; /* aligned uint64_t timestamp */ + + wt_timestamp_t timestamp; /* Timestamps */ + wt_timestamp_t durable_timestamp; /* * The state is used for transaction prepare to manage visibility @@ -1058,8 +1059,9 @@ struct __wt_ikey { */ struct __wt_update { volatile uint64_t txnid; /* transaction ID */ - wt_timestamp_t timestamp; /* aligned uint64_t timestamp */ - wt_timestamp_t durable_timestamp; /* aligned uint64_t timestamp */ + + wt_timestamp_t durable_ts; /* timestamps */ + wt_timestamp_t start_ts, stop_ts; WT_UPDATE *next; 
/* forward-linked list */ @@ -1082,7 +1084,7 @@ struct __wt_update { * The update state is used for transaction prepare to manage * visibility and transitioning update structure state safely. */ - volatile uint8_t prepare_state; /* Prepare state. */ + volatile uint8_t prepare_state; /* prepare state */ /* * Zero or more bytes of value (the payload) immediately follows the @@ -1096,7 +1098,7 @@ struct __wt_update { * WT_UPDATE_SIZE is the expected structure size excluding the payload data -- * we verify the build to ensure the compiler hasn't inserted padding. */ -#define WT_UPDATE_SIZE 38 +#define WT_UPDATE_SIZE 46 /* * The memory size of an update: include some padding because this is such a diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index b17bfcc2595..f0c072615b8 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1018,7 +1018,7 @@ __wt_row_leaf_key(WT_SESSION_IMPL *session, * Return the unpacked value for a row-store leaf page key. */ static inline void -__wt_row_leaf_value_cell( +__wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack, WT_CELL_UNPACK *vpack) { WT_CELL *kcell, *vcell; @@ -1048,13 +1048,14 @@ __wt_row_leaf_value_cell( page, copy, NULL, &kcell, &key, &size) && kcell == NULL) vcell = (WT_CELL *)((uint8_t *)key + size); else { - __wt_cell_unpack(page, kcell, &unpack); + __wt_cell_unpack(session, page, kcell, &unpack); vcell = (WT_CELL *)((uint8_t *) unpack.cell + __wt_cell_total_len(&unpack)); } } - __wt_cell_unpack(page, __wt_cell_leaf_value_parse(page, vcell), vpack); + __wt_cell_unpack(session, + page, __wt_cell_leaf_value_parse(page, vcell), vpack); } /* @@ -1087,7 +1088,8 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) * Return the addr/size and type triplet for a reference. 
*/ static inline void -__wt_ref_info(WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep) +__wt_ref_info(WT_SESSION_IMPL *session, + WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep) { WT_ADDR *addr; WT_CELL_UNPACK *unpack, _unpack; @@ -1128,7 +1130,7 @@ __wt_ref_info(WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep) break; } } else { - __wt_cell_unpack(page, (WT_CELL *)addr, unpack); + __wt_cell_unpack(session, page, (WT_CELL *)addr, unpack); *addrp = unpack->data; *sizep = unpack->size; if (typep != NULL) @@ -1149,7 +1151,7 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref) if (ref->addr == NULL) return (0); - __wt_ref_info(ref, &addr, &addr_size, NULL); + __wt_ref_info(session, ref, &addr, &addr_size, NULL); WT_RET(__wt_btree_block_free(session, addr, addr_size)); /* Clear the address (so we don't free it twice). */ diff --git a/src/third_party/wiredtiger/src/include/capacity.h b/src/third_party/wiredtiger/src/include/capacity.h new file mode 100644 index 00000000000..1fb42f5b435 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/capacity.h @@ -0,0 +1,74 @@ +/*- + * Copyright (c) 2014-2019 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +typedef enum { + WT_THROTTLE_CKPT, /* Checkpoint throttle */ + WT_THROTTLE_EVICT, /* Eviction throttle */ + WT_THROTTLE_LOG, /* Logging throttle */ + WT_THROTTLE_READ /* Read throttle */ +} WT_THROTTLE_TYPE; + +#define WT_THROTTLE_MIN WT_MEGABYTE /* Config minimum size */ + +/* + * The per-file threshold means we won't start the background fsync on a file + * until it crosses the per-file threshold of data written. The other minimum + * threshold defines a minimum threshold for the background thread. Otherwise + * we compute a percentage of the given capacity. 
+ */ +#define WT_CAPACITY_FILE_THRESHOLD (WT_MEGABYTE / 2) +#define WT_CAPACITY_MIN_THRESHOLD (10 * WT_MEGABYTE) +#define WT_CAPACITY_PCT 10 + +/* + * If we're being asked to sleep a short amount of time, ignore it. + * A non-zero value means there may be a temporary violation of the + * capacity limitation, but one that would even out. That is, possibly + * fewer sleeps with the risk of more choppy behavior as this number + * is larger. + */ +#define WT_CAPACITY_SLEEP_CUTOFF_US 100 + +/* + * When given a total capacity, divide it up for each subsystem. These defines + * represent the percentage of the total capacity that we allow for each + * subsystem capacity. We allow and expect the sum of the subsystems to + * exceed 100, as often they are not at their maximum at the same time. In any + * event, we track the total capacity separately, so it is never exceeded. + */ +#define WT_CAPACITY_SYS(total, pct) ((total) * (pct) / 100) +#define WT_CAP_CKPT 5 +#define WT_CAP_EVICT 50 +#define WT_CAP_LOG 30 +#define WT_CAP_READ 55 + +struct __wt_capacity { + uint64_t ckpt; /* Bytes/sec checkpoint capacity */ + uint64_t evict; /* Bytes/sec eviction capacity */ + uint64_t log; /* Bytes/sec logging capacity */ + uint64_t read; /* Bytes/sec read capacity */ + uint64_t total; /* Bytes/sec total capacity */ + uint64_t threshold; /* Capacity size period */ + + volatile uint64_t written; /* Written this period */ + volatile bool signalled; /* Capacity signalled */ + + /* + * A reservation is a point in time when a read or write for a subsystem + * can be scheduled, so as not to overrun the given capacity. These + * values hold the next available reservation, in nanoseconds since + * the epoch. Getting a reservation with a future time implies sleeping + * until that time; getting a reservation with a past time implies that + * the operation can be done immediately. 
+ */ + uint64_t reservation_ckpt; /* Atomic: next checkpoint write */ + uint64_t reservation_evict; /* Atomic: next eviction write */ + uint64_t reservation_log; /* Atomic: next logging write */ + uint64_t reservation_read; /* Atomic: next read */ + uint64_t reservation_total; /* Atomic: next operation of any kind */ +}; diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i index 654a409e627..8ec25508a7c 100644 --- a/src/third_party/wiredtiger/src/include/cell.i +++ b/src/third_party/wiredtiger/src/include/cell.i @@ -176,10 +176,11 @@ struct __wt_cell_unpack { * Pack a start, stop timestamp pair for a value. */ static inline void -__cell_pack_timestamp_value( +__cell_pack_timestamp_value(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_t start_ts, wt_timestamp_t stop_ts) { - WT_ASSERT(NULL, start_ts <= stop_ts); + WT_ASSERT(session, stop_ts != WT_TS_NONE); + WT_ASSERT(session, start_ts <= stop_ts); if (__wt_process.page_version_ts) { /* Start timestamp, stop timestamp difference. */ @@ -194,11 +195,13 @@ __cell_pack_timestamp_value( * address. */ static inline void -__cell_pack_timestamp_addr(uint8_t **pp, wt_timestamp_t oldest_start_ts, +__cell_pack_timestamp_addr(WT_SESSION_IMPL *session, + uint8_t **pp, wt_timestamp_t oldest_start_ts, wt_timestamp_t newest_start_ts, wt_timestamp_t newest_stop_ts) { - WT_ASSERT(NULL, oldest_start_ts <= newest_start_ts); - WT_ASSERT(NULL, newest_start_ts <= newest_stop_ts); + WT_ASSERT(session, newest_stop_ts != WT_TS_NONE); + WT_ASSERT(session, oldest_start_ts <= newest_start_ts); + WT_ASSERT(session, newest_start_ts <= newest_stop_ts); if (__wt_process.page_version_ts) { (void)__wt_vpack_uint(pp, 0, oldest_start_ts); @@ -214,14 +217,15 @@ __cell_pack_timestamp_addr(uint8_t **pp, wt_timestamp_t oldest_start_ts, * Pack an address cell. 
*/ static inline size_t -__wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, +__wt_cell_pack_addr(WT_SESSION_IMPL *session, + WT_CELL *cell, u_int cell_type, uint64_t recno, wt_timestamp_t oldest_start_ts, wt_timestamp_t newest_start_ts, wt_timestamp_t newest_stop_ts, size_t size) { uint8_t *p; p = cell->__chunk + 1; - __cell_pack_timestamp_addr( + __cell_pack_timestamp_addr(session, &p, oldest_start_ts, newest_start_ts, newest_stop_ts); if (recno == WT_RECNO_OOB) @@ -239,13 +243,13 @@ __wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, * Set a data item's WT_CELL contents. */ static inline size_t -__wt_cell_pack_data(WT_CELL *cell, +__wt_cell_pack_data(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle, size_t size) { uint8_t byte, *p; p = cell->__chunk + 1; - __cell_pack_timestamp_value(&p, start_ts, stop_ts); + __cell_pack_timestamp_value(session, &p, start_ts, stop_ts); /* * Short data cells without run-length encoding have 6 bits of data @@ -343,13 +347,13 @@ __wt_cell_pack_data_match(WT_CELL *page_cell, * Write a copy value cell. */ static inline size_t -__wt_cell_pack_copy(WT_CELL *cell, +__wt_cell_pack_copy(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle, uint64_t v) { uint8_t *p; p = cell->__chunk + 1; - __cell_pack_timestamp_value(&p, start_ts, stop_ts); + __cell_pack_timestamp_value(session, &p, start_ts, stop_ts); if (rle < 2) cell->__chunk[0] = WT_CELL_VALUE_COPY; /* Type */ @@ -367,13 +371,13 @@ __wt_cell_pack_copy(WT_CELL *cell, * Write a deleted value cell. 
*/ static inline size_t -__wt_cell_pack_del(WT_CELL *cell, +__wt_cell_pack_del(WT_SESSION_IMPL *session, WT_CELL *cell, wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle) { uint8_t *p; p = cell->__chunk + 1; - __cell_pack_timestamp_value(&p, start_ts, stop_ts); + __cell_pack_timestamp_value(session, &p, start_ts, stop_ts); if (rle < 2) cell->__chunk[0] = WT_CELL_DEL; /* Type */ @@ -453,7 +457,7 @@ __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size) * Pack an overflow cell. */ static inline size_t -__wt_cell_pack_ovfl(WT_CELL *cell, uint8_t type, +__wt_cell_pack_ovfl(WT_SESSION_IMPL *session, WT_CELL *cell, uint8_t type, wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle, size_t size) { uint8_t *p; @@ -465,7 +469,7 @@ __wt_cell_pack_ovfl(WT_CELL *cell, uint8_t type, break; case WT_CELL_VALUE_OVFL: case WT_CELL_VALUE_OVFL_RM: - __cell_pack_timestamp_value(&p, start_ts, stop_ts); + __cell_pack_timestamp_value(session, &p, start_ts, stop_ts); break; } @@ -621,7 +625,7 @@ __wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell) * Unpack a WT_CELL into a structure, with optional boundary checks. 
*/ static inline int -__wt_cell_unpack_safe(const WT_PAGE_HEADER *dsk, +__wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell, WT_CELL_UNPACK *unpack, const void *end) { struct { @@ -729,9 +733,11 @@ restart: WT_PTRDIFF(end, p), &unpack->newest_stop_ts)); unpack->newest_stop_ts += unpack->newest_start_ts; - WT_ASSERT(NULL, + WT_ASSERT(session, + unpack->newest_stop_ts != WT_TS_NONE); + WT_ASSERT(session, unpack->oldest_start_ts <= unpack->newest_start_ts); - WT_ASSERT(NULL, + WT_ASSERT(session, unpack->newest_start_ts <= unpack->newest_stop_ts); break; case WT_CELL_DEL: @@ -746,7 +752,8 @@ restart: 0 : WT_PTRDIFF(end, p), &unpack->stop_ts)); unpack->stop_ts += unpack->start_ts; - WT_ASSERT(NULL, unpack->start_ts <= unpack->stop_ts); + WT_ASSERT(session, unpack->stop_ts != WT_TS_NONE); + WT_ASSERT(session, unpack->start_ts <= unpack->stop_ts); break; } @@ -851,7 +858,7 @@ done: WT_CELL_LEN_CHK(cell, unpack->__len); * Unpack a WT_CELL into a structure. */ static inline void -__wt_cell_unpack_dsk( +__wt_cell_unpack_dsk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell, WT_CELL_UNPACK *unpack) { /* @@ -885,7 +892,7 @@ __wt_cell_unpack_dsk( return; } - (void)__wt_cell_unpack_safe(dsk, cell, unpack, NULL); + (void)__wt_cell_unpack_safe(session, dsk, cell, unpack, NULL); } /* @@ -893,9 +900,10 @@ __wt_cell_unpack_dsk( * Unpack a WT_CELL into a structure. */ static inline void -__wt_cell_unpack(WT_PAGE *page, WT_CELL *cell, WT_CELL_UNPACK *unpack) +__wt_cell_unpack(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_CELL *cell, WT_CELL_UNPACK *unpack) { - __wt_cell_unpack_dsk(page->dsk, cell, unpack); + __wt_cell_unpack_dsk(session, page->dsk, cell, unpack); } /* @@ -985,13 +993,14 @@ __wt_page_cell_data_ref(WT_SESSION_IMPL *session, * WT_CELL_FOREACH -- * Walk the cells on a page. 
*/ -#define WT_CELL_FOREACH_BEGIN(btree, dsk, unpack, skip_ts) do { \ +#define WT_CELL_FOREACH_BEGIN(session, btree, dsk, unpack, skip_ts) do {\ uint32_t __i; \ uint8_t *__cell; \ for (__cell = WT_PAGE_HEADER_BYTE(btree, dsk), \ __i = (dsk)->u.entries; \ __i > 0; __cell += (unpack).__len, --__i) { \ - __wt_cell_unpack_dsk(dsk, (WT_CELL *)__cell, &(unpack));\ + __wt_cell_unpack_dsk( \ + session, dsk, (WT_CELL *)__cell, &(unpack)); \ /* \ * Optionally skip unstable page entries after downgrade\ * to a release without page timestamps. Check for cells\ diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 2c639fc7b8a..280d7e32f7d 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -293,6 +293,12 @@ struct __wt_connection_impl { uint32_t async_size; /* Async op array size */ uint32_t async_workers; /* Number of async workers */ + WT_CAPACITY capacity; /* Capacity structure */ + WT_SESSION_IMPL *capacity_session; /* Capacity thread session */ + wt_thread_t capacity_tid; /* Capacity thread */ + bool capacity_tid_set; /* Capacity thread set */ + WT_CONDVAR *capacity_cond; /* Capacity wait mutex */ + WT_LSM_MANAGER lsm_manager; /* LSM worker thread information */ WT_KEYED_ENCRYPTOR *kencryptor; /* Encryptor for metadata and log */ @@ -456,16 +462,17 @@ struct __wt_connection_impl { * delays have been requested. 
*/ /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_TIMING_STRESS_CHECKPOINT_SLOW 0x001u -#define WT_TIMING_STRESS_LOOKASIDE_SWEEP 0x002u -#define WT_TIMING_STRESS_SPLIT_1 0x004u -#define WT_TIMING_STRESS_SPLIT_2 0x008u -#define WT_TIMING_STRESS_SPLIT_3 0x010u -#define WT_TIMING_STRESS_SPLIT_4 0x020u -#define WT_TIMING_STRESS_SPLIT_5 0x040u -#define WT_TIMING_STRESS_SPLIT_6 0x080u -#define WT_TIMING_STRESS_SPLIT_7 0x100u -#define WT_TIMING_STRESS_SPLIT_8 0x200u +#define WT_TIMING_STRESS_AGGRESSIVE_SWEEP 0x001u +#define WT_TIMING_STRESS_CHECKPOINT_SLOW 0x002u +#define WT_TIMING_STRESS_LOOKASIDE_SWEEP 0x004u +#define WT_TIMING_STRESS_SPLIT_1 0x008u +#define WT_TIMING_STRESS_SPLIT_2 0x010u +#define WT_TIMING_STRESS_SPLIT_3 0x020u +#define WT_TIMING_STRESS_SPLIT_4 0x040u +#define WT_TIMING_STRESS_SPLIT_5 0x080u +#define WT_TIMING_STRESS_SPLIT_6 0x100u +#define WT_TIMING_STRESS_SPLIT_7 0x200u +#define WT_TIMING_STRESS_SPLIT_8 0x400u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint64_t timing_stress_flags; @@ -500,12 +507,13 @@ struct __wt_connection_impl { #define WT_CONN_RECOVERING 0x0020000u #define WT_CONN_SALVAGE 0x0040000u #define WT_CONN_SERVER_ASYNC 0x0080000u -#define WT_CONN_SERVER_CHECKPOINT 0x0100000u -#define WT_CONN_SERVER_LOG 0x0200000u -#define WT_CONN_SERVER_LSM 0x0400000u -#define WT_CONN_SERVER_STATISTICS 0x0800000u -#define WT_CONN_SERVER_SWEEP 0x1000000u -#define WT_CONN_WAS_BACKUP 0x2000000u +#define WT_CONN_SERVER_CAPACITY 0x0100000u +#define WT_CONN_SERVER_CHECKPOINT 0x0200000u +#define WT_CONN_SERVER_LOG 0x0400000u +#define WT_CONN_SERVER_LSM 0x0800000u +#define WT_CONN_SERVER_STATISTICS 0x1000000u +#define WT_CONN_SERVER_SWEEP 0x2000000u +#define WT_CONN_WAS_BACKUP 0x4000000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 4dcf31a1dc0..351a5cd7abe 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ 
b/src/third_party/wiredtiger/src/include/cursor.i @@ -425,7 +425,7 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd) */ kpack = &_kpack; memset(kpack, 0, sizeof(*kpack)); - __wt_cell_unpack(page, cell, kpack); + __wt_cell_unpack(session, page, cell, kpack); if (kpack->type == WT_CELL_KEY && cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) { WT_ASSERT(session, cbt->row_key->size >= kpack->prefix); @@ -470,7 +470,7 @@ value: return (0); /* Else, take the value from the original page cell. */ - __wt_row_leaf_value_cell(page, rip, kpack, vpack); + __wt_row_leaf_value_cell(session, page, rip, kpack, vpack); return (__wt_page_cell_data_ref(session, cbt->ref->page, vpack, vb)); } /* diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 196028b5297..681c6c242ae 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -265,6 +265,9 @@ extern int __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) WT extern int __wt_conn_cache_pool_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern WT_THREAD_RET __wt_cache_pool_server(void *arg); +extern int __wt_capacity_server_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_capacity_server_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_capacity_throttle(WT_SESSION_IMPL *session, uint64_t bytes, WT_THROTTLE_TYPE type); extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void 
__wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize); @@ -566,6 +569,8 @@ extern int __wt_ext_map_windows_error(WT_EXTENSION_API *wt_api, WT_SESSION *wt_s extern bool __wt_handle_is_open(WT_SESSION_IMPL *session, const char *name) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_open(WT_SESSION_IMPL *session, const char *name, WT_FS_OPEN_FILE_TYPE file_type, u_int flags, WT_FH **fhp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_fsync_background_chk(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_fsync_background(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_close_connection_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_os_inmemory(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_fopen(WT_SESSION_IMPL *session, const char *name, uint32_t open_flags, uint32_t flags, WT_FSTREAM **fstrp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h index ff50fff0081..37e0799ef16 100644 --- a/src/third_party/wiredtiger/src/include/os.h +++ b/src/third_party/wiredtiger/src/include/os.h @@ -109,9 +109,12 @@ struct __wt_fh { const char *name; /* File name */ uint64_t name_hash; /* hash of name */ + uint64_t last_sync; /* time of background fsync */ + volatile uint64_t written; /* written since fsync */ TAILQ_ENTRY(__wt_fh) q; /* internal queue */ TAILQ_ENTRY(__wt_fh) hashq; /* internal hash queue */ u_int ref; /* reference count */ + WT_FS_OPEN_FILE_TYPE file_type; /* file type */ WT_FILE_HANDLE *handle; }; diff --git a/src/third_party/wiredtiger/src/include/os_fhandle.i b/src/third_party/wiredtiger/src/include/os_fhandle.i index 
1aab749a2ac..a0573ee3cba 100644 --- a/src/third_party/wiredtiger/src/include/os_fhandle.i +++ b/src/third_party/wiredtiger/src/include/os_fhandle.i @@ -114,6 +114,10 @@ __wt_read( ret = fh->handle->fh_read( fh->handle, (WT_SESSION *)session, offset, len, buf); + /* Flag any failed read: if we're in startup, it may be fatal. */ + if (ret != 0) + F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); + time_stop = __wt_clock(session); __wt_stat_msecs_hist_incr_fsread(session, WT_CLOCKDIFF_MS(time_stop, time_start)); @@ -196,6 +200,7 @@ __wt_write(WT_SESSION_IMPL *session, time_stop = __wt_clock(session); __wt_stat_msecs_hist_incr_fswrite(session, WT_CLOCKDIFF_MS(time_stop, time_start)); + (void)__wt_atomic_addv64(&fh->written, len); WT_STAT_CONN_DECR_ATOMIC(session, thread_write_active); return (ret); } diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 75be6c5147a..40dc8cf695e 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -453,6 +453,20 @@ struct __wt_connection_stats { int64_t cache_bytes_dirty; int64_t cache_pages_dirty; int64_t cache_eviction_clean; + int64_t fsync_all_fh_total; + int64_t fsync_all_fh; + int64_t fsync_all_time; + int64_t capacity_threshold; + int64_t capacity_bytes_read; + int64_t capacity_bytes_ckpt; + int64_t capacity_bytes_evict; + int64_t capacity_bytes_log; + int64_t capacity_bytes_written; + int64_t capacity_time_total; + int64_t capacity_time_ckpt; + int64_t capacity_time_evict; + int64_t capacity_time_log; + int64_t capacity_time_read; int64_t cond_auto_wait_reset; int64_t cond_auto_wait; int64_t time_travel; @@ -494,6 +508,7 @@ struct __wt_connection_stats { int64_t cursor_update_bytes_changed; int64_t cursor_reopen; int64_t cursor_open_count; + int64_t dh_conn_handle_size; int64_t dh_conn_handle_count; int64_t dh_sweep_ref; int64_t dh_sweep_close; diff --git a/src/third_party/wiredtiger/src/include/txn.i 
b/src/third_party/wiredtiger/src/include/txn.i index 360262e68fe..7ba90887513 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -129,8 +129,8 @@ __txn_resolve_prepared_update(WT_SESSION_IMPL *session, WT_UPDATE *upd) */ upd->prepare_state = WT_PREPARE_LOCKED; WT_WRITE_BARRIER(); - upd->timestamp = txn->commit_timestamp; - upd->durable_timestamp = txn->durable_timestamp; + upd->start_ts = txn->commit_timestamp; + upd->durable_ts = txn->durable_timestamp; WT_PUBLISH(upd->prepare_state, WT_PREPARE_RESOLVED); } @@ -382,7 +382,7 @@ __wt_txn_op_apply_prepare_state( } for (updp = ref->page_del->update_list; updp != NULL && *updp != NULL; ++updp) { - (*updp)->timestamp = ts; + (*updp)->start_ts = ts; /* * Holding the ref locked means we have exclusive access, so if * we are committing we don't need to use the prepare locked @@ -390,7 +390,7 @@ __wt_txn_op_apply_prepare_state( */ (*updp)->prepare_state = prepare_state; if (commit) - (*updp)->durable_timestamp = txn->durable_timestamp; + (*updp)->durable_ts = txn->durable_timestamp; } ref->page_del->timestamp = ts; if (commit) @@ -446,13 +446,13 @@ __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op) * commit and durable timestamps need to be updated. */ timestamp = op->type == WT_TXN_OP_REF_DELETE ? - &op->u.ref->page_del->timestamp : &op->u.op_upd->timestamp; + &op->u.ref->page_del->timestamp : &op->u.op_upd->start_ts; if (*timestamp == WT_TS_NONE) { *timestamp = txn->commit_timestamp; timestamp = op->type == WT_TXN_OP_REF_DELETE ? 
&op->u.ref->page_del->durable_timestamp : - &op->u.op_upd->durable_timestamp; + &op->u.op_upd->durable_ts; *timestamp = txn->durable_timestamp; } } @@ -684,7 +684,7 @@ __wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd) upd->prepare_state == WT_PREPARE_INPROGRESS) return (false); - return (__wt_txn_visible_all(session, upd->txnid, upd->timestamp)); + return (__wt_txn_visible_all(session, upd->txnid, upd->start_ts)); } /* @@ -782,8 +782,8 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) if (prepare_state == WT_PREPARE_LOCKED) continue; - upd_visible = __wt_txn_visible( - session, upd->txnid, upd->timestamp); + upd_visible = + __wt_txn_visible(session, upd->txnid, upd->start_ts); /* * The visibility check is only valid if the update does not @@ -817,7 +817,7 @@ __wt_txn_upd_durable(WT_SESSION_IMPL *session, WT_UPDATE *upd) /* If update is visible then check if it is durable. */ if (__wt_txn_upd_visible_type(session, upd) != WT_VISIBLE_TRUE) return (false); - return (__wt_txn_visible(session, upd->txnid, upd->durable_timestamp)); + return (__wt_txn_visible(session, upd->txnid, upd->durable_ts)); } /* diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 2875102571f..1ac4de23044 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -2296,6 +2296,15 @@ struct __wt_connection { * seconds at which to check for files that are inactive and close * them., an integer between 1 and 100000; default \c 10.} * @config{ ),,} + * @config{io_capacity = (, control how many bytes per second are + * written and read. Exceeding the capacity results in throttling., a + * set of related configuration options defined below.} + * @config{ total, number of bytes per second + * available to all subsystems in total. 
When set\, decisions about + * what subsystems are throttled\, and in what proportion\, are made + * internally. The minimum non-zero setting is 1MB., an integer between + * 0 and 1TB; default \c 0.} + * @config{ ),,} * @config{log = (, enable logging. Enabling logging uses three * sessions from the configured session_max., a set of related * configuration options defined below.} @@ -2510,16 +2519,19 @@ struct __wt_connection { * @configstart{WT_CONNECTION.query_timestamp, see dist/api_data.py} * @config{get, specify which timestamp to query: \c all_committed * returns the largest timestamp such that all timestamps up to that - * value have committed\, \c oldest returns the most recent \c - * oldest_timestamp set with WT_CONNECTION::set_timestamp\, \c + * value have committed\, \c last_checkpoint returns the timestamp of + * the most recent stable checkpoint\, \c oldest returns the most recent + * \c oldest_timestamp set with WT_CONNECTION::set_timestamp\, \c * oldest_reader returns the minimum of the read timestamps of all - * active readers \c pinned returns the minimum of the\c - * oldest_timestamp and the read timestamps of all active readers\, and - * \c stable returns the most recent \c stable_timestamp set with - * WT_CONNECTION::set_timestamp. See @ref transaction_timestamps., a - * string\, chosen from the following options: \c "all_committed"\, \c - * "last_checkpoint"\, \c "oldest"\, \c "oldest_reader"\, \c "pinned"\, - * \c "recovery"\, \c "stable"; default \c all_committed.} + * active readers\, \c pinned returns the minimum of the \c + * oldest_timestamp and the read timestamps of all active readers\, \c + * recovery returns the timestamp of the most recent stable checkpoint + * taken prior to a shutdown\, and \c stable returns the most recent \c + * stable_timestamp set with WT_CONNECTION::set_timestamp. 
See @ref + * transaction_timestamps., a string\, chosen from the following + * options: \c "all_committed"\, \c "last_checkpoint"\, \c "oldest"\, \c + * "oldest_reader"\, \c "pinned"\, \c "recovery"\, \c "stable"; default + * \c all_committed.} * @configend * @errors * If there is no matching timestamp (e.g., if this method is called @@ -2949,6 +2961,15 @@ struct __wt_connection { * @config{ ),,} * @config{in_memory, keep data in-memory only. See @ref in_memory for more * information., a boolean flag; default \c false.} + * @config{io_capacity = (, control how many bytes per second are written and + * read. Exceeding the capacity results in throttling., a set of related + * configuration options defined below.} + * @config{ total, + * number of bytes per second available to all subsystems in total. When set\, + * decisions about what subsystems are throttled\, and in what proportion\, are + * made internally. The minimum non-zero setting is 1MB., an integer between 0 + * and 1TB; default \c 0.} + * @config{ ),,} * @config{log = (, enable logging. Enabling logging uses three sessions from * the configured session_max., a set of related configuration options defined * below.} @@ -5217,532 +5238,562 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_PAGES_DIRTY 1127 /*! cache: unmodified pages evicted */ #define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1128 +/*! capacity: background fsync file handles considered */ +#define WT_STAT_CONN_FSYNC_ALL_FH_TOTAL 1129 +/*! capacity: background fsync file handles synced */ +#define WT_STAT_CONN_FSYNC_ALL_FH 1130 +/*! capacity: background fsync time (msecs) */ +#define WT_STAT_CONN_FSYNC_ALL_TIME 1131 +/*! capacity: threshold to call fsync */ +#define WT_STAT_CONN_CAPACITY_THRESHOLD 1132 +/*! capacity: throttled bytes read */ +#define WT_STAT_CONN_CAPACITY_BYTES_READ 1133 +/*! capacity: throttled bytes written for checkpoint */ +#define WT_STAT_CONN_CAPACITY_BYTES_CKPT 1134 +/*! 
capacity: throttled bytes written for eviction */ +#define WT_STAT_CONN_CAPACITY_BYTES_EVICT 1135 +/*! capacity: throttled bytes written for log */ +#define WT_STAT_CONN_CAPACITY_BYTES_LOG 1136 +/*! capacity: throttled bytes written total */ +#define WT_STAT_CONN_CAPACITY_BYTES_WRITTEN 1137 +/*! capacity: time waiting due to total capacity (usecs) */ +#define WT_STAT_CONN_CAPACITY_TIME_TOTAL 1138 +/*! capacity: time waiting during checkpoint (usecs) */ +#define WT_STAT_CONN_CAPACITY_TIME_CKPT 1139 +/*! capacity: time waiting during eviction (usecs) */ +#define WT_STAT_CONN_CAPACITY_TIME_EVICT 1140 +/*! capacity: time waiting during logging (usecs) */ +#define WT_STAT_CONN_CAPACITY_TIME_LOG 1141 +/*! capacity: time waiting during read (usecs) */ +#define WT_STAT_CONN_CAPACITY_TIME_READ 1142 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1129 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1143 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1130 +#define WT_STAT_CONN_COND_AUTO_WAIT 1144 /*! connection: detected system time went backwards */ -#define WT_STAT_CONN_TIME_TRAVEL 1131 +#define WT_STAT_CONN_TIME_TRAVEL 1145 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1132 +#define WT_STAT_CONN_FILE_OPEN 1146 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1133 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1147 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1134 +#define WT_STAT_CONN_MEMORY_FREE 1148 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1135 +#define WT_STAT_CONN_MEMORY_GROW 1149 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1136 +#define WT_STAT_CONN_COND_WAIT 1150 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1137 +#define WT_STAT_CONN_RWLOCK_READ 1151 /*! 
connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1138 +#define WT_STAT_CONN_RWLOCK_WRITE 1152 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1139 +#define WT_STAT_CONN_FSYNC_IO 1153 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1140 +#define WT_STAT_CONN_READ_IO 1154 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1141 +#define WT_STAT_CONN_WRITE_IO 1155 /*! cursor: cached cursor count */ -#define WT_STAT_CONN_CURSOR_CACHED_COUNT 1142 +#define WT_STAT_CONN_CURSOR_CACHED_COUNT 1156 /*! cursor: cursor bulk loaded cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT_BULK 1143 +#define WT_STAT_CONN_CURSOR_INSERT_BULK 1157 /*! cursor: cursor close calls that result in cache */ -#define WT_STAT_CONN_CURSOR_CACHE 1144 +#define WT_STAT_CONN_CURSOR_CACHE 1158 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1145 +#define WT_STAT_CONN_CURSOR_CREATE 1159 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1146 +#define WT_STAT_CONN_CURSOR_INSERT 1160 /*! cursor: cursor insert key and value bytes */ -#define WT_STAT_CONN_CURSOR_INSERT_BYTES 1147 +#define WT_STAT_CONN_CURSOR_INSERT_BYTES 1161 /*! cursor: cursor modify calls */ -#define WT_STAT_CONN_CURSOR_MODIFY 1148 +#define WT_STAT_CONN_CURSOR_MODIFY 1162 /*! cursor: cursor modify key and value bytes affected */ -#define WT_STAT_CONN_CURSOR_MODIFY_BYTES 1149 +#define WT_STAT_CONN_CURSOR_MODIFY_BYTES 1163 /*! cursor: cursor modify value bytes modified */ -#define WT_STAT_CONN_CURSOR_MODIFY_BYTES_TOUCH 1150 +#define WT_STAT_CONN_CURSOR_MODIFY_BYTES_TOUCH 1164 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1151 +#define WT_STAT_CONN_CURSOR_NEXT 1165 /*! cursor: cursor operation restarted */ -#define WT_STAT_CONN_CURSOR_RESTART 1152 +#define WT_STAT_CONN_CURSOR_RESTART 1166 /*! 
cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1153 +#define WT_STAT_CONN_CURSOR_PREV 1167 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1154 +#define WT_STAT_CONN_CURSOR_REMOVE 1168 /*! cursor: cursor remove key bytes removed */ -#define WT_STAT_CONN_CURSOR_REMOVE_BYTES 1155 +#define WT_STAT_CONN_CURSOR_REMOVE_BYTES 1169 /*! cursor: cursor reserve calls */ -#define WT_STAT_CONN_CURSOR_RESERVE 1156 +#define WT_STAT_CONN_CURSOR_RESERVE 1170 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1157 +#define WT_STAT_CONN_CURSOR_RESET 1171 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1158 +#define WT_STAT_CONN_CURSOR_SEARCH 1172 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1159 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1173 /*! cursor: cursor sweep buckets */ -#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1160 +#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1174 /*! cursor: cursor sweep cursors closed */ -#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1161 +#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1175 /*! cursor: cursor sweep cursors examined */ -#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1162 +#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1176 /*! cursor: cursor sweeps */ -#define WT_STAT_CONN_CURSOR_SWEEP 1163 +#define WT_STAT_CONN_CURSOR_SWEEP 1177 /*! cursor: cursor truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1164 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1178 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1165 +#define WT_STAT_CONN_CURSOR_UPDATE 1179 /*! cursor: cursor update key and value bytes */ -#define WT_STAT_CONN_CURSOR_UPDATE_BYTES 1166 +#define WT_STAT_CONN_CURSOR_UPDATE_BYTES 1180 /*! cursor: cursor update value size change */ -#define WT_STAT_CONN_CURSOR_UPDATE_BYTES_CHANGED 1167 +#define WT_STAT_CONN_CURSOR_UPDATE_BYTES_CHANGED 1181 /*! 
cursor: cursors reused from cache */ -#define WT_STAT_CONN_CURSOR_REOPEN 1168 +#define WT_STAT_CONN_CURSOR_REOPEN 1182 /*! cursor: open cursor count */ -#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1169 +#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1183 +/*! data-handle: connection data handle size */ +#define WT_STAT_CONN_DH_CONN_HANDLE_SIZE 1184 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1170 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1185 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1171 +#define WT_STAT_CONN_DH_SWEEP_REF 1186 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1172 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1187 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1173 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1188 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1174 +#define WT_STAT_CONN_DH_SWEEP_TOD 1189 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1175 +#define WT_STAT_CONN_DH_SWEEPS 1190 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1176 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1191 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1177 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1192 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1178 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1193 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1179 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1194 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1180 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1195 /*! 
* lock: commit timestamp queue lock application thread time waiting * (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1181 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1196 /*! lock: commit timestamp queue lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1182 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1197 /*! lock: commit timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1183 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1198 /*! lock: commit timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1184 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1199 /*! lock: dhandle lock application thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1185 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1200 /*! lock: dhandle lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1186 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1201 /*! lock: dhandle read lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1187 +#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1202 /*! lock: dhandle write lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1188 +#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1203 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1189 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1204 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1190 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1205 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1191 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1206 /*! 
* lock: read timestamp queue lock application thread time waiting * (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1192 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1207 /*! lock: read timestamp queue lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1193 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1208 /*! lock: read timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1194 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1209 /*! lock: read timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1195 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1210 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1196 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1211 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1197 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1212 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1198 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1213 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1199 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1214 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1200 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1215 /*! lock: table read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1201 +#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1216 /*! lock: table write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1202 +#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1217 /*! 
lock: txn global lock application thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1203 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1218 /*! lock: txn global lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1204 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1219 /*! lock: txn global read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1205 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1220 /*! lock: txn global write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1206 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1221 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1207 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1222 /*! log: force archive time sleeping (usecs) */ -#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1208 +#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1223 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1209 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1224 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1210 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1225 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1211 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1226 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1212 +#define WT_STAT_CONN_LOG_FLUSH 1227 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1213 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1228 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1214 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1229 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1215 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1230 /*! 
log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1216 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1231 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1217 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1232 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1218 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1233 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1219 +#define WT_STAT_CONN_LOG_SCANS 1234 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1220 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1235 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1221 +#define WT_STAT_CONN_LOG_WRITE_LSN 1236 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1222 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1237 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1223 +#define WT_STAT_CONN_LOG_SYNC 1238 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1224 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1239 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1225 +#define WT_STAT_CONN_LOG_SYNC_DIR 1240 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1226 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1241 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1227 +#define WT_STAT_CONN_LOG_WRITES 1242 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1228 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1243 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1229 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1244 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1230 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1245 /*! 
log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1231 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1246 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1232 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1247 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1233 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1248 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1234 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1249 /*! log: slot close lost race */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1235 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1250 /*! log: slot close unbuffered waits */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1236 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1251 /*! log: slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1237 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1252 /*! log: slot join atomic update races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1238 +#define WT_STAT_CONN_LOG_SLOT_RACES 1253 /*! log: slot join calls atomic updates raced */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1239 +#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1254 /*! log: slot join calls did not yield */ -#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1240 +#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1255 /*! log: slot join calls found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1241 +#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1256 /*! log: slot join calls slept */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1242 +#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1257 /*! log: slot join calls yielded */ -#define WT_STAT_CONN_LOG_SLOT_YIELD 1243 +#define WT_STAT_CONN_LOG_SLOT_YIELD 1258 /*! log: slot join found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1244 +#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1259 /*! 
log: slot joins yield time (usecs) */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1245 +#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1260 /*! log: slot transitions unable to find free slot */ -#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1246 +#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1261 /*! log: slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1247 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1262 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1248 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1263 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1249 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1264 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1250 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1265 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1251 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1266 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1252 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1267 /*! perf: file system read latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1253 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1268 /*! perf: file system read latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1254 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1269 /*! perf: file system read latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1255 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1270 /*! perf: file system read latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1256 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1271 /*! 
perf: file system read latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1257 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1272 /*! perf: file system read latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1258 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1273 /*! perf: file system write latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1259 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1274 /*! perf: file system write latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1260 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1275 /*! perf: file system write latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1261 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1276 /*! perf: file system write latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1262 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1277 /*! perf: file system write latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1263 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1278 /*! perf: file system write latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1264 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1279 /*! perf: operation read latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1265 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1280 /*! perf: operation read latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1266 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1281 /*! 
perf: operation read latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1267 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1282 /*! perf: operation read latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1268 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1283 /*! perf: operation read latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1269 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1284 /*! perf: operation write latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1270 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1285 /*! perf: operation write latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1271 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1286 /*! perf: operation write latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1272 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1287 /*! perf: operation write latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1273 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1288 /*! perf: operation write latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1274 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1289 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1275 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1290 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1276 +#define WT_STAT_CONN_REC_PAGES 1291 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1277 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1292 /*! 
reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1278 +#define WT_STAT_CONN_REC_PAGE_DELETE 1293 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1279 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1294 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1280 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1295 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1281 +#define WT_STAT_CONN_SESSION_OPEN 1296 /*! session: session query timestamp calls */ -#define WT_STAT_CONN_SESSION_QUERY_TS 1282 +#define WT_STAT_CONN_SESSION_QUERY_TS 1297 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1283 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1298 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1284 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1299 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1285 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1300 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1286 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1301 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1287 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1302 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1288 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1303 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1289 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1304 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1290 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1305 /*! 
session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1291 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1306 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1292 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1307 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1293 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1308 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1294 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1309 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1295 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1310 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1296 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1311 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1297 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1312 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1298 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1313 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1299 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1314 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1300 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1315 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1301 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1316 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1302 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1317 /*! 
thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1303 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1318 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1304 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1319 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1305 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1320 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1306 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1321 /*! * thread-yield: connection close blocked waiting for transaction state * stabilization */ -#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1307 +#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1322 /*! thread-yield: connection close yielded for lsm manager shutdown */ -#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1308 +#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1323 /*! thread-yield: data handle lock yielded */ -#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1309 +#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1324 /*! * thread-yield: get reference for page index and slot time sleeping * (usecs) */ -#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1310 +#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1325 /*! thread-yield: log server sync yielded for log write */ -#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1311 +#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1326 /*! thread-yield: page access yielded due to prepare state change */ -#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1312 +#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1327 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1313 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1328 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1314 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1329 /*! 
thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1315 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1330 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1316 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1331 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1317 +#define WT_STAT_CONN_PAGE_SLEEP 1332 /*! * thread-yield: page delete rollback time sleeping for state change * (usecs) */ -#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1318 +#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1333 /*! thread-yield: page reconciliation yielded due to child modification */ -#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1319 +#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1334 /*! transaction: Number of prepared updates */ -#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COUNT 1320 +#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COUNT 1335 /*! transaction: Number of prepared updates added to cache overflow */ -#define WT_STAT_CONN_TXN_PREPARED_UPDATES_LOOKASIDE_INSERTS 1321 +#define WT_STAT_CONN_TXN_PREPARED_UPDATES_LOOKASIDE_INSERTS 1336 /*! transaction: Number of prepared updates resolved */ -#define WT_STAT_CONN_TXN_PREPARED_UPDATES_RESOLVED 1322 +#define WT_STAT_CONN_TXN_PREPARED_UPDATES_RESOLVED 1337 /*! transaction: commit timestamp queue entries walked */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_WALKED 1323 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_WALKED 1338 /*! transaction: commit timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1324 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1339 /*! transaction: commit timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1325 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1340 /*! transaction: commit timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1326 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1341 /*! 
transaction: commit timestamp queue length */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1327 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1342 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1328 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1343 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1329 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1344 /*! transaction: prepared transactions */ -#define WT_STAT_CONN_TXN_PREPARE 1330 +#define WT_STAT_CONN_TXN_PREPARE 1345 /*! transaction: prepared transactions committed */ -#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1331 +#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1346 /*! transaction: prepared transactions currently active */ -#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1332 +#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1347 /*! transaction: prepared transactions rolled back */ -#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1333 +#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1348 /*! transaction: query timestamp calls */ -#define WT_STAT_CONN_TXN_QUERY_TS 1334 +#define WT_STAT_CONN_TXN_QUERY_TS 1349 /*! transaction: read timestamp queue entries walked */ -#define WT_STAT_CONN_TXN_READ_QUEUE_WALKED 1335 +#define WT_STAT_CONN_TXN_READ_QUEUE_WALKED 1350 /*! transaction: read timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1336 +#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1351 /*! transaction: read timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1337 +#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1352 /*! transaction: read timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1338 +#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1353 /*! transaction: read timestamp queue length */ -#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1339 +#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1354 /*! 
transaction: rollback to stable calls */ -#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1340 +#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1355 /*! transaction: rollback to stable updates aborted */ -#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1341 +#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1356 /*! transaction: rollback to stable updates removed from cache overflow */ -#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1342 +#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1357 /*! transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1343 +#define WT_STAT_CONN_TXN_SET_TS 1358 /*! transaction: set timestamp commit calls */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1344 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1359 /*! transaction: set timestamp commit updates */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1345 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1360 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1346 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1361 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1347 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1362 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1348 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1363 /*! transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1349 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1364 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1350 +#define WT_STAT_CONN_TXN_BEGIN 1365 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1351 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1366 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1352 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1367 /*! 
transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1353 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1368 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1354 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1369 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1355 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1370 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1356 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1371 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1357 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1372 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1358 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1373 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1359 +#define WT_STAT_CONN_TXN_CHECKPOINT 1374 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1360 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1375 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1361 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1376 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1362 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1377 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1363 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1378 /*! 
transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1364 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1379 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1365 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1380 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1366 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1381 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1367 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1382 /*! transaction: transaction range of timestamps pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1368 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1383 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1369 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1384 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1370 +#define WT_STAT_CONN_TXN_SYNC 1385 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1371 +#define WT_STAT_CONN_TXN_COMMIT 1386 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1372 +#define WT_STAT_CONN_TXN_ROLLBACK 1387 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1373 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1388 /*! 
* @} diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 75801ceb48b..d93f6a3be7f 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -97,6 +97,8 @@ struct __wt_cache; typedef struct __wt_cache WT_CACHE; struct __wt_cache_pool; typedef struct __wt_cache_pool WT_CACHE_POOL; +struct __wt_capacity; + typedef struct __wt_capacity WT_CAPACITY; struct __wt_cell; typedef struct __wt_cell WT_CELL; struct __wt_cell_unpack; @@ -359,6 +361,7 @@ typedef uint64_t wt_timestamp_t; #include "btmem.h" #include "btree.h" #include "cache.h" +#include "capacity.h" #include "compact.h" #include "config.h" #include "cursor.h" diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index aff145be512..9e27a996251 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -182,6 +182,22 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) } /* + * __log_fs_read -- + * Wrapper when reading from a log file. + */ +static int +__log_fs_read(WT_SESSION_IMPL *session, + WT_FH *fh, wt_off_t offset, size_t len, void *buf) +{ + WT_DECL_RET; + + __wt_capacity_throttle(session, len, WT_THROTTLE_LOG); + if ((ret = __wt_read(session, fh, offset, len, buf)) != 0) + WT_RET_MSG(session, ret, "%s: log read failure", fh->name); + return (ret); +} + +/* * __log_fs_write -- * Wrapper when writing to a log file. If we're writing to a new log * file for the first time wait for writes to the previous log file. 
@@ -207,6 +223,7 @@ __log_fs_write(WT_SESSION_IMPL *session, __log_wait_for_earlier_slot(session, slot); WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn)); } + __wt_capacity_throttle(session, len, WT_THROTTLE_LOG); if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0) WT_PANIC_RET(session, ret, "%s: fatal log failure", slot->slot_fh->name); @@ -663,6 +680,7 @@ __log_zero(WT_SESSION_IMPL *session, */ if ((uint32_t)len - off < bufsz) wrlen = (uint32_t)len - off; + __wt_capacity_throttle(session, wrlen, WT_THROTTLE_LOG); WT_ERR(__wt_write(session, fh, (wt_off_t)off, wrlen, zerobuf->mem)); off += wrlen; @@ -989,7 +1007,7 @@ __log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp, * Read in the log file header and verify it. */ WT_ERR(__log_openfile(session, id, 0, &fh)); - WT_ERR(__wt_read(session, fh, 0, allocsize, buf->mem)); + WT_ERR(__log_fs_read(session, fh, 0, allocsize, buf->mem)); logrec = (WT_LOG_RECORD *)buf->mem; __wt_log_record_byteswap(logrec); desc = (WT_LOG_DESC *)logrec->record; @@ -1053,7 +1071,7 @@ __log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp, goto err; memset(buf->mem, 0, allocsize); - WT_ERR(__wt_read(session, fh, allocsize, allocsize, buf->mem)); + WT_ERR(__log_fs_read(session, fh, allocsize, allocsize, buf->mem)); logrec = (WT_LOG_RECORD *)buf->mem; /* * We have a valid header but the system record is not there. @@ -1932,7 +1950,7 @@ __log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t log_size, for (off = offset; remainder > 0; remainder -= (wt_off_t)rdlen, off += (wt_off_t)rdlen) { rdlen = WT_MIN(bufsz, (size_t)remainder); - WT_ERR(__wt_read(session, fh, off, rdlen, buf)); + WT_ERR(__log_fs_read(session, fh, off, rdlen, buf)); allocsize = (log == NULL ? 
WT_LOG_ALIGN : log->allocsize); if (memcmp(buf, zerobuf, rdlen) != 0) { /* @@ -2450,7 +2468,7 @@ advance: */ WT_ASSERT(session, buf->memsize >= allocsize); need_salvage = F_ISSET(conn, WT_CONN_SALVAGE); - WT_ERR(__wt_read(session, + WT_ERR(__log_fs_read(session, log_fh, rd_lsn.l.offset, (size_t)allocsize, buf->mem)); need_salvage = false; /* @@ -2504,7 +2522,7 @@ advance: * record, especially for direct I/O. */ WT_ERR(__wt_buf_grow(session, buf, rdup_len)); - WT_ERR(__wt_read(session, log_fh, + WT_ERR(__log_fs_read(session, log_fh, rd_lsn.l.offset, (size_t)rdup_len, buf->mem)); WT_STAT_CONN_INCR(session, log_scan_rereads); } diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index acff9771f62..40f37b961e8 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -535,6 +535,7 @@ __wt_log_slot_destroy(WT_SESSION_IMPL *session) (uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) { rel = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state); if (rel != 0) + /* Writes are not throttled. */ WT_RET(__wt_write(session, slot->slot_fh, slot->slot_start_offset, (size_t)rel, slot->slot_buf.mem)); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 1da4169d234..5b91aa09db2 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -697,11 +697,11 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) } /* - * __wt_lsm_free_chunks -- + * __lsm_free_chunks -- * Try to drop chunks from the tree that are no longer required. 
*/ -int -__wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +static int +__lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; WT_LSM_CHUNK *chunk; @@ -712,15 +712,6 @@ __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) flush_metadata = false; - if (lsm_tree->nold_chunks == 0) - return (0); - - /* - * Make sure only a single thread is freeing the old chunk array - * at any time. - */ - if (!__wt_atomic_cas32(&lsm_tree->freeing_old_chunks, 0, 1)) - return (0); /* * Take a copy of the current state of the LSM tree and look for chunks * to drop. We do it this way to avoid holding the LSM tree lock while @@ -743,16 +734,6 @@ __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) } /* - * Don't remove files if a hot backup is in progress. - * - * The schema lock protects the set of live files, this check - * prevents us from removing a file that hot backup already - * knows about. - */ - if (S2C(session)->hot_backup) - break; - - /* * Drop any bloom filters and chunks we can. Don't try to drop * a chunk if the bloom filter drop fails. * An EBUSY return indicates that a cursor is still open in @@ -822,7 +803,6 @@ err: /* Flush the metadata unless the system is in panic */ } __lsm_unpin_chunks(session, &cookie); __wt_free(session, cookie.chunk_array); - lsm_tree->freeing_old_chunks = 0; /* Returning non-zero means there is no work to do. */ if (!flush_metadata) @@ -830,3 +810,42 @@ err: /* Flush the metadata unless the system is in panic */ return (ret); } + +/* + * __wt_lsm_free_chunks -- + * Try to drop chunks from the tree that are no longer required. + */ +int +__wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + conn = S2C(session); + + if (lsm_tree->nold_chunks == 0) + return (0); + + /* + * Make sure only a single thread is freeing the old chunk array + * at any time. 
+ */ + if (!__wt_atomic_cas32(&lsm_tree->freeing_old_chunks, 0, 1)) + return (0); + + /* + * Don't remove files if a hot backup is in progress. + * + * The schema lock protects the set of live files, this check prevents + * us from removing a file that hot backup already knows about. + */ + if (!conn->hot_backup) { + __wt_readlock(session, &conn->hot_backup_lock); + if (!conn->hot_backup) + ret = __lsm_free_chunks(session, lsm_tree); + __wt_readunlock(session, &conn->hot_backup_lock); + } + + lsm_tree->freeing_old_chunks = 0; + return (ret); +} diff --git a/src/third_party/wiredtiger/src/os_common/os_fhandle.c b/src/third_party/wiredtiger/src/os_common/os_fhandle.c index 3100817e650..df67508c4fe 100644 --- a/src/third_party/wiredtiger/src/os_common/os_fhandle.c +++ b/src/third_party/wiredtiger/src/os_common/os_fhandle.c @@ -240,6 +240,8 @@ __wt_open(WT_SESSION_IMPL *session, WT_ERR(__wt_calloc_one(session, &fh)); WT_ERR(__wt_strdup(session, name, &fh->name)); + fh->file_type = file_type; + /* * If this is a read-only connection, open all files read-only except * the lock file. @@ -356,6 +358,134 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp) } /* + * __wt_fsync_background_chk -- + * Return if background fsync is supported. + */ +bool +__wt_fsync_background_chk(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_FH *fh; + WT_FILE_HANDLE *handle; + bool supported; + + conn = S2C(session); + supported = true; + __wt_spin_lock(session, &conn->fh_lock); + /* + * Look for the first data file handle and see if + * the fsync nowait function is supported. + */ + TAILQ_FOREACH(fh, &conn->fhqh, q) { + handle = fh->handle; + if (fh->file_type != WT_FS_OPEN_FILE_TYPE_DATA) + continue; + /* + * If we don't have a function, return false, otherwise + * return true. In any case, we are done with the loop. 
+ */ + if (handle->fh_sync_nowait == NULL) + supported = false; + break; + } + __wt_spin_unlock(session, &conn->fh_lock); + return (supported); +} + +/* + * __fsync_background -- + * Background fsync for a single dirty file handle. + */ +static int +__fsync_background(WT_SESSION_IMPL *session, WT_FH *fh) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_FILE_HANDLE *handle; + uint64_t now; + + conn = S2C(session); + WT_STAT_CONN_INCR(session, fsync_all_fh_total); + + handle = fh->handle; + if (handle->fh_sync_nowait == NULL || + fh->written < WT_CAPACITY_FILE_THRESHOLD) + return (0); + + /* Only sync data files. */ + if (fh->file_type != WT_FS_OPEN_FILE_TYPE_DATA) + return (0); + + now = __wt_clock(session); + if (fh->last_sync == 0 || WT_CLOCKDIFF_SEC(now, fh->last_sync) > 0) { + __wt_spin_unlock(session, &conn->fh_lock); + + /* + * We set the false flag to indicate a non-blocking background + * fsync, but there is no guarantee that it doesn't block. If + * we wanted to detect if it is blocking, adding a clock call + * and checking the time would be done here. + */ + ret = __wt_fsync(session, fh, false); + if (ret == 0) { + WT_STAT_CONN_INCR(session, fsync_all_fh); + fh->last_sync = now; + fh->written = 0; + } + + __wt_spin_lock(session, &conn->fh_lock); + } + return (ret); +} + +/* + * __wt_fsync_background -- + * Background fsync for all dirty file handles. + */ +int +__wt_fsync_background(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_FH *fh, *fhnext; + + conn = S2C(session); + __wt_spin_lock(session, &conn->fh_lock); + TAILQ_FOREACH_SAFE(fh, &conn->fhqh, q, fhnext) { + /* + * The worker routine will unlock the list to avoid holding it + * locked over an fsync. Increment the count on the current and + * next handles to guarantee their validity. 
+ */ + if (fhnext != NULL) + ++fhnext->ref; + ++fh->ref; + + WT_TRET(__fsync_background(session, fh)); + + /* + * The file handle reference may have gone to 0, in which case + * we're responsible for the close. Configure the close routine + * to drop the lock, which means we must re-acquire it. + */ + if (--fh->ref == 0) { + WT_TRET(__handle_close(session, fh, true)); + __wt_spin_lock(session, &conn->fh_lock); + } + + /* + * Decrement the next element's reference count. It might have + * gone to 0 as well, in which case we'll close it in the next + * loop iteration. + */ + if (fhnext != NULL) + --fhnext->ref; + } + __wt_spin_unlock(session, &conn->fh_lock); + return (ret); +} + +/* * __wt_close_connection_close -- * Close any open file handles at connection close. */ diff --git a/src/third_party/wiredtiger/src/os_common/os_fs_inmemory.c b/src/third_party/wiredtiger/src/os_common/os_fs_inmemory.c index 94db8806305..304a745efb2 100644 --- a/src/third_party/wiredtiger/src/os_common/os_fs_inmemory.c +++ b/src/third_party/wiredtiger/src/os_common/os_fs_inmemory.c @@ -349,7 +349,6 @@ __im_file_read(WT_FILE_HANDLE *file_handle, __wt_spin_unlock(session, &im_fs->lock); if (ret == 0) return (0); - F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); WT_RET_MSG(session, WT_ERROR, "%s: handle-read: failed to read %" WT_SIZET_FMT " bytes at " "offset %" WT_SIZET_FMT, diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c index f9771fb3860..438af2eb58d 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fs.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c @@ -439,15 +439,12 @@ __posix_file_read(WT_FILE_HANDLE *file_handle, /* Break reads larger than 1GB into 1GB chunks. 
*/ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { chunk = WT_MIN(len, WT_GIGABYTE); - if ((nr = pread(pfh->fd, addr, chunk, offset)) <= 0) { - if (nr == 0) - F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); + if ((nr = pread(pfh->fd, addr, chunk, offset)) <= 0) WT_RET_MSG(session, nr == 0 ? WT_ERROR : __wt_errno(), "%s: handle-read: pread: failed to read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, file_handle->name, chunk, (uintmax_t)offset); - } } return (0); } diff --git a/src/third_party/wiredtiger/src/os_win/os_fs.c b/src/third_party/wiredtiger/src/os_win/os_fs.c index 684a28bfd51..b2f90233a4c 100644 --- a/src/third_party/wiredtiger/src/os_win/os_fs.c +++ b/src/third_party/wiredtiger/src/os_win/os_fs.c @@ -299,8 +299,6 @@ __win_file_read(WT_FILE_HANDLE *file_handle, win_fh->filehandle, addr, chunk, &nr, &overlapped)) { windows_error = __wt_getlasterror(); ret = __wt_map_windows_error(windows_error); - if (ret == WT_ERROR) - F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); __wt_err(session, ret, "%s: handle-read: ReadFile: failed to read %lu " "bytes at offset %" PRIuMAX ": %s", diff --git a/src/third_party/wiredtiger/src/reconcile/rec_track.c b/src/third_party/wiredtiger/src/reconcile/rec_track.c index 6508db6df8f..d84d5524df3 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_track.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_track.c @@ -39,7 +39,7 @@ __ovfl_discard_verbose( WT_RET(__wt_scr_alloc(session, 512, &tmp)); unpack = &_unpack; - __wt_cell_unpack(page, cell, unpack); + __wt_cell_unpack(session, page, cell, unpack); __wt_verbose(session, WT_VERB_OVERFLOW, "discard: %s%s%p %s", diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 647712093a8..786f2bdec81 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -12,8 +12,6 @@ struct __rec_chunk; typedef struct __rec_chunk 
WT_CHUNK; struct __rec_dictionary; typedef struct __rec_dictionary WT_DICTIONARY; struct __rec_kv; typedef struct __rec_kv WT_KV; -#define WT_TS_FIXME 37 /* Fake timestamp */ - /* * Reconciliation is the process of taking an in-memory page, walking each entry * in the page, building a backing disk image in a temporary buffer representing @@ -273,6 +271,16 @@ typedef struct { WT_CURSOR_BTREE update_modify_cbt; } WT_RECONCILE; +typedef struct { + WT_UPDATE *upd; /* Update to write (or NULL) */ + + uint64_t txnid; /* Transaction ID, timestamps */ + wt_timestamp_t start_ts, stop_ts; + + bool upd_saved; /* Updates saved to list */ + +} WT_UPDATE_SELECT; + #define WT_CROSSING_MIN_BND(r, next_len) \ ((r)->cur_ptr->min_offset == 0 && \ (next_len) > (r)->min_space_avail) @@ -1157,8 +1165,8 @@ __rec_append_orig_value(WT_SESSION_IMPL *session, */ if (upd->type == WT_UPDATE_BIRTHMARK) { append->txnid = upd->txnid; - append->timestamp = upd->timestamp; - append->durable_timestamp = upd->durable_timestamp; + append->start_ts = upd->start_ts; + append->durable_ts = upd->durable_ts; append->next = upd->next; } @@ -1176,14 +1184,13 @@ err: __wt_scr_free(session, &tmp); } /* - * __rec_txn_read -- + * __rec_upd_select -- * Return the update in a list that should be written (or NULL if none can * be written). 
*/ static int -__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, - WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, - bool *upd_savedp, WT_UPDATE **updp) +__rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, + void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE_SELECT *upd_select) { WT_PAGE *page; WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd; @@ -1192,9 +1199,12 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t max_txn, txnid; bool all_visible, prepared, skipped_birthmark, uncommitted; - if (upd_savedp != NULL) - *upd_savedp = false; - *updp = NULL; + /* + * The "saved updates" return value is used independently of returning + * an update we can write, both must be initialized. + */ + upd_select->upd = NULL; + upd_select->upd_saved = false; page = r->page; first_ts_upd = first_txn_upd = NULL; @@ -1252,7 +1262,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, continue; /* Consider a non durable update as uncommitted. */ - if (upd->timestamp != WT_TS_NONE && + if (upd->start_ts != WT_TS_NONE && !__wt_txn_upd_durable(session, upd)) { uncommitted = r->update_uncommitted = true; continue; @@ -1261,7 +1271,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } /* Track the first update with non-zero timestamp. */ - if (first_ts_upd == NULL && upd->timestamp != WT_TS_NONE) + if (first_ts_upd == NULL && upd->start_ts != WT_TS_NONE) first_ts_upd = upd; /* @@ -1282,8 +1292,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * (but we save enough information that checkpoint can fix * things up if we choose an update that is too new). */ - if (*updp == NULL && r->las_skew_newest) - *updp = upd; + if (upd_select->upd == NULL && r->las_skew_newest) + upd_select->upd = upd; /* Consider non durable updates as uncommitted. */ if ((F_ISSET(r, WT_REC_VISIBLE_ALL) ? @@ -1302,7 +1312,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * discard an uncommitted update. 
*/ if (F_ISSET(r, WT_REC_UPDATE_RESTORE) && - *updp != NULL && (uncommitted || prepared)) { + upd_select->upd != NULL && + (uncommitted || prepared)) { r->leave_dirty = true; return (__wt_set_return(session, EBUSY)); } @@ -1318,17 +1329,26 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * (set to the first uncommitted transaction). Lookaside with * stable timestamp always takes the first stable update. */ - if (*updp == NULL) - *updp = upd; + if (upd_select->upd == NULL) + upd_select->upd = upd; } /* Keep track of the selected update. */ - upd = *updp; + upd = upd_select->upd; /* Reconciliation should never see an aborted or reserved update. */ WT_ASSERT(session, upd == NULL || (upd->txnid != WT_TXN_ABORTED && upd->type != WT_UPDATE_RESERVE)); + /* + * The checkpoint transaction is special. Make sure we never write + * metadata updates from a checkpoint in a concurrent session. + */ + WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || + upd == NULL || upd->txnid == WT_TXN_NONE || + upd->txnid != S2C(session)->txn_global.checkpoint_state.id || + WT_SESSION_IS_CHECKPOINT(session)); + /* If all of the updates were aborted, quit. */ if (first_txn_upd == NULL) { WT_ASSERT(session, upd == NULL); @@ -1340,13 +1360,29 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, r->update_used = true; /* - * The checkpoint transaction is special. Make sure we never write - * metadata updates from a checkpoint in a concurrent session. + * The start timestamp is determined by the commit timestamp when the + * key is first inserted (or last updated). The end timestamp is set + * when a key/value pair becomes invalid, either because of a remove + * or a modify/update operation on the same key. 
*/ - WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || - upd == NULL || upd->txnid == WT_TXN_NONE || - upd->txnid != S2C(session)->txn_global.checkpoint_state.id || - WT_SESSION_IS_CHECKPOINT(session)); + if (upd != NULL) { + upd_select->txnid = upd->txnid; + + /* + * TIMESTAMP-FIXME + * This is waiting on the WT_UPDATE structure's start/stop + * timestamp work. For now, if we don't have a timestamp, + * just pretend it's durable, otherwise pretend the start + * and stop timestamps are the same. + * + */ + if (upd_select->upd->start_ts == WT_TS_NONE) { + upd_select->start_ts = WT_TS_NONE; + upd_select->stop_ts = WT_TS_MAX; + } else + upd_select->start_ts = + upd_select->stop_ts = upd_select->upd->start_ts; + } /* * Track the most recent transaction in the page. We store this in the @@ -1358,8 +1394,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, r->max_txn = max_txn; /* Update the maximum timestamp. */ - if (first_ts_upd != NULL && r->max_timestamp < first_ts_upd->timestamp) - r->max_timestamp = first_ts_upd->timestamp; + if (first_ts_upd != NULL && r->max_timestamp < first_ts_upd->start_ts) + r->max_timestamp = first_ts_upd->start_ts; /* * If the update we chose was a birthmark, or we are doing @@ -1369,7 +1405,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (upd != NULL && (upd->type == WT_UPDATE_BIRTHMARK || (F_ISSET(r, WT_REC_UPDATE_RESTORE) && skipped_birthmark))) - *updp = NULL; + upd_select->upd = NULL; /* * Check if all updates on the page are visible. If not, it must stay @@ -1379,7 +1415,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * order), so we track the maximum transaction ID and the newest update * with a timestamp (if any). */ - timestamp = first_ts_upd == NULL ? 0 : first_ts_upd->timestamp; + timestamp = first_ts_upd == NULL ? 0 : first_ts_upd->start_ts; all_visible = upd == first_txn_upd && !(uncommitted || prepared) && (F_ISSET(r, WT_REC_VISIBLE_ALL) ? 
__wt_txn_visible_all(session, max_txn, timestamp) : @@ -1427,9 +1463,9 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * The order of the updates on the list matters, we can't move only the * unresolved updates, move the entire update list. */ - WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize)); - if (upd_savedp != NULL) - *upd_savedp = true; + WT_RET(__rec_update_save( + session, r, ins, ripcip, upd_select->upd, upd_memsize)); + upd_select->upd_saved = true; /* * Track the first off-page update when saving history in the lookaside @@ -1441,18 +1477,18 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (WT_TXNID_LT(r->unstable_txn, first_upd->txnid)) r->unstable_txn = first_upd->txnid; if (first_ts_upd != NULL && - r->unstable_timestamp < first_ts_upd->timestamp) - r->unstable_timestamp = first_ts_upd->timestamp; + r->unstable_timestamp < first_ts_upd->start_ts) + r->unstable_timestamp = first_ts_upd->start_ts; } else if (F_ISSET(r, WT_REC_LOOKASIDE)) { - for (upd = first_upd; upd != *updp; upd = upd->next) { + for (upd = first_upd; upd != upd_select->upd; upd = upd->next) { if (upd->txnid == WT_TXN_ABORTED) continue; if (upd->txnid != WT_TXN_NONE && WT_TXNID_LT(upd->txnid, r->unstable_txn)) r->unstable_txn = upd->txnid; - if (upd->timestamp < r->unstable_timestamp) - r->unstable_timestamp = upd->timestamp; + if (upd->start_ts < r->unstable_timestamp) + r->unstable_timestamp = upd->start_ts; } } @@ -1461,7 +1497,9 @@ check_original_value: * Paranoia: check that we didn't choose an update that has since been * rolled back. */ - WT_ASSERT(session, *updp == NULL || (*updp)->txnid != WT_TXN_ABORTED); + WT_ASSERT(session, + upd_select->upd == NULL || + upd_select->upd->txnid != WT_TXN_ABORTED); /* * Returning an update means the original on-page value might be lost, @@ -1474,7 +1512,8 @@ check_original_value: * - or any reconciliation of a backing overflow record that will be * physically removed once it's no longer needed. 
*/ - if (*updp != NULL && (!WT_UPDATE_DATA_VALUE(*updp) || + if (upd_select->upd != NULL && + (!WT_UPDATE_DATA_VALUE(upd_select->upd) || F_ISSET(r, WT_REC_LOOKASIDE) || (vpack != NULL && vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM))) WT_RET( @@ -1937,7 +1976,7 @@ __rec_dict_replace(WT_SESSION_IMPL *session, WT_RECONCILE *r, offset = (uint64_t)WT_PTRDIFF(r->first_free, (uint8_t *)r->cur_ptr->image.mem + dp->offset); val->len = val->cell_len = __wt_cell_pack_copy( - &val->cell, start_ts, stop_ts, rle, offset); + session, &val->cell, start_ts, stop_ts, rle, offset); val->buf.data = NULL; val->buf.size = 0; } @@ -3600,7 +3639,7 @@ __wt_bulk_insert_var( val = &r->v; if (deleted) { val->cell_len = __wt_cell_pack_del( - &val->cell, WT_TS_NONE, WT_TS_MAX, cbulk->rle); + session, &val->cell, WT_TS_NONE, WT_TS_MAX, cbulk->rle); val->buf.data = NULL; val->buf.size = 0; val->len = val->cell_len; @@ -3729,7 +3768,7 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) if (addr == NULL && __wt_off_page(page, ref->addr)) addr = ref->addr; if (addr == NULL) { - __wt_cell_unpack(page, ref->addr, vpack); + __wt_cell_unpack(session, page, ref->addr, vpack); val->buf.data = ref->addr; val->buf.size = __wt_cell_total_len(vpack); val->cell_len = 0; @@ -3813,6 +3852,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_INSERT *ins; WT_PAGE *page; WT_UPDATE *upd; + WT_UPDATE_SELECT upd_select; uint64_t recno; uint32_t entry, nrecs; @@ -3828,7 +3868,9 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) /* Update any changes to the original on-page data items. 
*/ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) { - WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, NULL, &upd)); + WT_RET(__rec_upd_select( + session, r, ins, NULL, NULL, &upd_select)); + upd = upd_select.upd; if (upd != NULL) __bit_setv(r->first_free, WT_INSERT_RECNO(ins) - pageref->ref_recno, @@ -3872,8 +3914,9 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) break; upd = NULL; } else { - WT_RET(__rec_txn_read( - session, r, ins, NULL, NULL, NULL, &upd)); + WT_RET(__rec_upd_select( + session, r, ins, NULL, NULL, &upd_select)); + upd = upd_select.upd; recno = WT_INSERT_RECNO(ins); } for (;;) { @@ -4004,15 +4047,14 @@ __rec_col_fix_slvg(WT_SESSION_IMPL *session, */ static int __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, - WT_SALVAGE_COOKIE *salvage, - WT_ITEM *value, bool deleted, uint8_t overflow_type, - wt_timestamp_t start_ts, wt_timestamp_t stop_ts, uint64_t rle) + WT_SALVAGE_COOKIE *salvage, WT_ITEM *value, + wt_timestamp_t start_ts, wt_timestamp_t stop_ts, + uint64_t rle, bool deleted, bool overflow_type) { WT_BTREE *btree; WT_KV *val; btree = S2BT(session); - val = &r->v; /* @@ -4047,14 +4089,14 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, } if (deleted) { - val->cell_len = - __wt_cell_pack_del(&val->cell, start_ts, stop_ts, rle); + val->cell_len = __wt_cell_pack_del( + session, &val->cell, start_ts, stop_ts, rle); val->buf.data = NULL; val->buf.size = 0; val->len = val->cell_len; } else if (overflow_type) { - val->cell_len = __wt_cell_pack_ovfl(&val->cell, - overflow_type, start_ts, stop_ts, rle, value->size); + val->cell_len = __wt_cell_pack_ovfl(session, &val->cell, + WT_CELL_VALUE_OVFL, start_ts, stop_ts, rle, value->size); val->buf.data = value->data; val->buf.size = value->size; val->len = val->cell_len + value->size; @@ -4088,6 +4130,11 @@ __rec_col_var(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage) { enum { OVFL_IGNORE, OVFL_UNUSED, 
OVFL_USED } ovfl_state; + struct { + WT_ITEM *value; /* Value */ + wt_timestamp_t start_ts, stop_ts; /* Timestamps */ + bool deleted; /* If deleted */ + } last; WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK *vpack, _vpack; @@ -4096,30 +4143,38 @@ __rec_col_var(WT_SESSION_IMPL *session, WT_DECL_ITEM(orig); WT_DECL_RET; WT_INSERT *ins; - WT_ITEM *last; WT_PAGE *page; WT_UPDATE *upd; + WT_UPDATE_SELECT upd_select; wt_timestamp_t start_ts, stop_ts; uint64_t n, nrepeat, repeat_count, rle, skip, src_recno; uint32_t i, size; - bool deleted, last_deleted, orig_deleted, update_no_copy; + bool deleted, orig_deleted, update_no_copy; const void *data; btree = S2BT(session); - page = pageref->page; - last = r->last; vpack = &_vpack; cbt = &r->update_modify_cbt; + page = pageref->page; + upd = NULL; + size = 0; + data = NULL; + + /* Set the "last" values to cause failure if they're not set. */ + last.value = r->last; + last.start_ts = last.stop_ts = WT_TS_NONE; + last.deleted = false; + + /* + * Set the start/stop values to cause failure if they're not set. + * [-Werror=maybe-uninitialized] + */ + start_ts = stop_ts = WT_TS_NONE; WT_RET(__rec_split_init(session, r, page, pageref->ref_recno, btree->maxleafpage_precomp)); WT_RET(__wt_scr_alloc(session, 0, &orig)); - data = NULL; - size = 0; - upd = NULL; - - start_ts = stop_ts = WT_TS_FIXME; /* * The salvage code may be calling us to reconcile a page where there @@ -4133,11 +4188,12 @@ __rec_col_var(WT_SESSION_IMPL *session, * helper function's assistance.) 
*/ rle = 0; - last_deleted = false; if (salvage != NULL && salvage->missing != 0) { if (salvage->skip == 0) { rle = salvage->missing; - last_deleted = true; + last.start_ts = WT_TS_NONE; + last.stop_ts = WT_TS_MAX; + last.deleted = true; /* * Correct the number of records we're going to "take", @@ -4145,9 +4201,9 @@ __rec_col_var(WT_SESSION_IMPL *session, */ salvage->take += salvage->missing; } else - WT_ERR(__rec_col_var_helper(session, - r, NULL, NULL, true, false, - WT_TS_NONE, WT_TS_MAX, salvage->missing)); + WT_ERR(__rec_col_var_helper(session, r, + NULL, NULL, WT_TS_NONE, WT_TS_MAX, + salvage->missing, true, false)); } /* @@ -4167,11 +4223,15 @@ __rec_col_var(WT_SESSION_IMPL *session, WT_COL_FOREACH(page, cip, i) { ovfl_state = OVFL_IGNORE; if ((cell = WT_COL_PTR(page, cip)) == NULL) { + start_ts = WT_TS_NONE; + stop_ts = WT_TS_MAX; nrepeat = 1; ins = NULL; orig_deleted = true; } else { - __wt_cell_unpack(page, cell, vpack); + __wt_cell_unpack(session, page, cell, vpack); + start_ts = vpack->start_ts; + stop_ts = vpack->stop_ts; nrepeat = __wt_cell_rle(vpack); ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip)); @@ -4227,8 +4287,23 @@ record_loop: /* n < nrepeat; n += repeat_count, src_recno += repeat_count) { upd = NULL; if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) { - WT_ERR(__rec_txn_read( - session, r, ins, cip, vpack, NULL, &upd)); + WT_ERR(__rec_upd_select( + session, r, ins, cip, vpack, &upd_select)); + upd = upd_select.upd; + if (upd == NULL) { + /* + * TIMESTAMP-FIXME + * I'm pretty sure this is wrong: a NULL + * update means an item was deleted, and + * I think that requires a tombstone on + * the page. 
+ */ + start_ts = WT_TS_NONE; + stop_ts = WT_TS_MAX; + } else { + start_ts = upd_select.start_ts; + stop_ts = upd_select.stop_ts; + } ins = WT_SKIP_NEXT(ins); } @@ -4316,18 +4391,19 @@ record_loop: /* */ if (rle != 0) { WT_ERR(__rec_col_var_helper( - session, r, salvage, last, - last_deleted, 0, - start_ts, stop_ts, rle)); + session, r, salvage, + last.value, + last.start_ts, last.stop_ts, + rle, last.deleted, false)); rle = 0; } - last->data = vpack->data; - last->size = vpack->size; - WT_ERR(__rec_col_var_helper( - session, r, salvage, last, false, - WT_CELL_VALUE_OVFL, - start_ts, stop_ts, repeat_count)); + last.value->data = vpack->data; + last.value->size = vpack->size; + WT_ERR(__rec_col_var_helper(session, r, + salvage, + last.value, start_ts, stop_ts, + repeat_count, false, true)); /* Track if page has overflow items. */ r->ovfl_items = true; @@ -4367,16 +4443,20 @@ compare: /* * we've been doing that all along. */ if (rle != 0) { - if ((deleted && last_deleted) || - (!last_deleted && !deleted && - last->size == size && - memcmp(last->data, data, size) == 0)) { + if ((!__wt_process.page_version_ts || + (last.start_ts == start_ts && + last.stop_ts == stop_ts)) && + ((deleted && last.deleted) || + (!deleted && !last.deleted && + last.value->size == size && + memcmp( + last.value->data, data, size) == 0))) { rle += repeat_count; continue; } - WT_ERR(__rec_col_var_helper(session, r, - salvage, last, last_deleted, 0, - start_ts, stop_ts, rle)); + WT_ERR(__rec_col_var_helper(session, r, salvage, + last.value, last.start_ts, last.stop_ts, + rle, last.deleted, false)); } /* @@ -4399,13 +4479,15 @@ compare: /* * the pointers, they're not moving. 
*/ if (data == vpack->data || update_no_copy) { - last->data = data; - last->size = size; + last.value->data = data; + last.value->size = size; } else WT_ERR(__wt_buf_set( - session, last, data, size)); + session, last.value, data, size)); } - last_deleted = deleted; + last.start_ts = start_ts; + last.stop_ts = stop_ts; + last.deleted = deleted; rle = repeat_count; } @@ -4449,10 +4531,24 @@ compare: /* upd = NULL; } else { - WT_ERR(__rec_txn_read( - session, r, ins, NULL, NULL, NULL, &upd)); + WT_ERR(__rec_upd_select( + session, r, ins, NULL, NULL, &upd_select)); + upd = upd_select.upd; n = WT_INSERT_RECNO(ins); } + if (upd == NULL) { + /* + * TIMESTAMP-FIXME + * I'm pretty sure this is wrong: a NULL update means + * an item was deleted, and I think that requires a + * tombstone on the page. + */ + start_ts = WT_TS_NONE; + stop_ts = WT_TS_MAX; + } else { + start_ts = upd_select.start_ts; + stop_ts = upd_select.stop_ts; + } while (src_recno <= n) { deleted = false; update_no_copy = true; @@ -4465,7 +4561,10 @@ compare: /* */ if (src_recno < n) { deleted = true; - if (last_deleted) { + if (last.deleted && + (!__wt_process.page_version_ts || + (last.start_ts == start_ts && + last.stop_ts == stop_ts))) { /* * The record adjustment is decremented * by one so we can naturally fall into @@ -4478,9 +4577,22 @@ compare: /* rle += skip; src_recno += skip; } - } else if (upd == NULL) + } else if (upd == NULL) { + /* + * TIMESTAMP-FIXME + * I'm pretty sure this is wrong: a NULL + * update means an item was deleted, and + * I think that requires a tombstone on + * the page. + */ + start_ts = WT_TS_NONE; + stop_ts = WT_TS_MAX; + deleted = true; - else + } else { + start_ts = upd_select.start_ts; + stop_ts = upd_select.stop_ts; + switch (upd->type) { case WT_UPDATE_MODIFY: /* @@ -4504,22 +4616,27 @@ compare: /* break; WT_ILLEGAL_VALUE_ERR(session, upd->type); } + } /* * Handle RLE accounting and comparisons -- see comment * above, this code fragment does the same thing. 
*/ if (rle != 0) { - if ((deleted && last_deleted) || - (!last_deleted && !deleted && - last->size == size && - memcmp(last->data, data, size) == 0)) { + if ((!__wt_process.page_version_ts || + (last.start_ts == start_ts && + last.stop_ts == stop_ts)) && + ((deleted && last.deleted) || + (!deleted && !last.deleted && + last.value->size == size && + memcmp( + last.value->data, data, size) == 0))) { ++rle; goto next; } - WT_ERR(__rec_col_var_helper(session, r, - salvage, last, last_deleted, 0, - start_ts, stop_ts, rle)); + WT_ERR(__rec_col_var_helper(session, r, salvage, + last.value, last.start_ts, last.stop_ts, + rle, last.deleted, false)); } /* @@ -4533,15 +4650,17 @@ compare: /* */ if (!deleted) { if (update_no_copy) { - last->data = data; - last->size = size; + last.value->data = data; + last.value->size = size; } else WT_ERR(__wt_buf_set( - session, last, data, size)); + session, last.value, data, size)); } /* Ready for the next loop, reset the RLE counter. */ - last_deleted = deleted; + last.start_ts = start_ts; + last.stop_ts = stop_ts; + last.deleted = deleted; rle = 1; /* @@ -4564,8 +4683,8 @@ next: if (src_recno == UINT64_MAX) /* If we were tracking a record, write it. */ if (rle != 0) - WT_ERR(__rec_col_var_helper(session, r, salvage, - last, last_deleted, 0, start_ts, stop_ts, rle)); + WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, + last.start_ts, last.stop_ts, rle, last.deleted, false)); /* Write the remnant page. 
*/ ret = __rec_split_finish(session, r); @@ -4651,7 +4770,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) if (ikey != NULL && ikey->cell_offset != 0) { cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset); - __wt_cell_unpack(page, cell, kpack); + __wt_cell_unpack(session, page, cell, kpack); key_onpage_ovfl = kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM; } @@ -4748,7 +4867,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) newest_start_ts = addr->newest_start_ts; newest_stop_ts = addr->newest_stop_ts; } else { - __wt_cell_unpack(page, ref->addr, vpack); + __wt_cell_unpack(session, page, ref->addr, vpack); if (state == WT_CHILD_PROXY) { WT_ERR(__wt_buf_set(session, &val->buf, ref->addr, __wt_cell_total_len(vpack))); @@ -4886,9 +5005,10 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_KV *key, *val; WT_ROW *rip; WT_UPDATE *upd; + WT_UPDATE_SELECT upd_select; wt_timestamp_t start_ts, stop_ts; size_t size; - uint64_t slvg_skip; + uint64_t slvg_skip, txnid; uint32_t i; bool dictionary, key_onpage_ovfl, ovfl_key; void *copy; @@ -4947,17 +5067,23 @@ __rec_row_leaf(WT_SESSION_IMPL *session, kpack = NULL; else { kpack = &_kpack; - __wt_cell_unpack(page, cell, kpack); + __wt_cell_unpack(session, page, cell, kpack); } /* Unpack the on-page value cell, set the default timestamps. */ - __wt_row_leaf_value_cell(page, rip, NULL, vpack); + __wt_row_leaf_value_cell(session, page, rip, NULL, vpack); start_ts = vpack->start_ts; stop_ts = vpack->stop_ts; + txnid = WT_TXN_NONE; /* Look for an update. */ - WT_ERR(__rec_txn_read( - session, r, NULL, rip, vpack, NULL, &upd)); + WT_ERR(__rec_upd_select( + session, r, NULL, rip, vpack, &upd_select)); + if ((upd = upd_select.upd) != NULL) { + txnid = upd_select.txnid; + start_ts = upd_select.start_ts; + stop_ts = upd_select.stop_ts; + } /* Build value cell. 
*/ dictionary = false; @@ -5147,7 +5273,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, goto build; kpack = &_kpack; - __wt_cell_unpack(page, cell, kpack); + __wt_cell_unpack(session, page, cell, kpack); if (btree->huffman_key == NULL && kpack->type == WT_CELL_KEY && tmpkey->size >= kpack->prefix) { @@ -5215,13 +5341,11 @@ build: /* * Copy the key/value pair onto the page. Zero-length items must * be globally visible as we're writing nothing to the page. - * - * WT_TS_FIXME: NONE-MAX is too pessimistic a test, and we may - * want to adjust start_ts/stop_ts. */ __rec_image_copy(session, r, key); if (val->len == 0 && - start_ts == WT_TS_NONE && stop_ts == WT_TS_MAX) + (!__wt_process.page_version_ts || + __wt_txn_visible_all(session, txnid, stop_ts))) r->any_empty_value = true; else { r->all_empty_value = false; @@ -5259,7 +5383,9 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_CURSOR_BTREE *cbt; WT_KV *key, *val; WT_UPDATE *upd; + WT_UPDATE_SELECT upd_select; wt_timestamp_t start_ts, stop_ts; + uint64_t txnid; bool ovfl_key, upd_saved; btree = S2BT(session); @@ -5269,9 +5395,13 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) val = &r->v; for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) { - WT_RET(__rec_txn_read( - session, r, ins, NULL, NULL, &upd_saved, &upd)); - start_ts = stop_ts = WT_TS_FIXME; + WT_RET(__rec_upd_select( + session, r, ins, NULL, NULL, &upd_select)); + upd = upd_select.upd; + txnid = upd_select.txnid; + start_ts = upd_select.start_ts; + stop_ts = upd_select.stop_ts; + upd_saved = upd_select.upd_saved; if (upd == NULL) { /* @@ -5345,13 +5475,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) /* * Copy the key/value pair onto the page. Zero-length items must * be globally visible as we're writing nothing to the page. - * - * WT_TS_FIXME: NONE-MAX is too pessimistic a test, and we may - * want to adjust start_ts/stop_ts. 
*/ __rec_image_copy(session, r, key); if (val->len == 0 && - start_ts == WT_TS_NONE && stop_ts == WT_TS_MAX) + (!__wt_process.page_version_ts || + __wt_txn_visible_all(session, txnid, stop_ts))) r->any_empty_value = true; else { r->all_empty_value = false; @@ -5949,7 +6077,8 @@ __rec_cell_build_addr(WT_SESSION_IMPL *session, */ val->buf.data = addr->addr; val->buf.size = addr->size; - val->cell_len = __wt_cell_pack_addr(&val->cell, cell_type, recno, + val->cell_len = __wt_cell_pack_addr(session, + &val->cell, cell_type, recno, addr->oldest_start_ts, addr->newest_start_ts, addr->newest_stop_ts, val->buf.size); val->len = val->cell_len + val->buf.size; @@ -5996,7 +6125,7 @@ __rec_cell_build_val(WT_SESSION_IMPL *session, WT_RECONCILE *r, } } val->cell_len = __wt_cell_pack_data( - &val->cell, start_ts, stop_ts, rle, val->buf.size); + session, &val->cell, start_ts, stop_ts, rle, val->buf.size); val->len = val->cell_len + val->buf.size; return (0); @@ -6069,7 +6198,7 @@ __rec_cell_build_ovfl(WT_SESSION_IMPL *session, /* Build the cell and return. 
*/ kv->cell_len = __wt_cell_pack_ovfl( - &kv->cell, type, start_ts, stop_ts, rle, kv->buf.size); + session, &kv->cell, type, start_ts, stop_ts, rle, kv->buf.size); kv->len = kv->cell_len + kv->buf.size; err: __wt_scr_free(session, &tmp); diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index d47b6291fe1..3d5ca2d6a16 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -882,6 +882,20 @@ static const char * const __stats_connection_desc[] = { "cache: tracked dirty bytes in the cache", "cache: tracked dirty pages in the cache", "cache: unmodified pages evicted", + "capacity: background fsync file handles considered", + "capacity: background fsync file handles synced", + "capacity: background fsync time (msecs)", + "capacity: threshold to call fsync", + "capacity: throttled bytes read", + "capacity: throttled bytes written for checkpoint", + "capacity: throttled bytes written for eviction", + "capacity: throttled bytes written for log", + "capacity: throttled bytes written total", + "capacity: time waiting due to total capacity (usecs)", + "capacity: time waiting during checkpoint (usecs)", + "capacity: time waiting during eviction (usecs)", + "capacity: time waiting during logging (usecs)", + "capacity: time waiting during read (usecs)", "connection: auto adjusting condition resets", "connection: auto adjusting condition wait calls", "connection: detected system time went backwards", @@ -923,6 +937,7 @@ static const char * const __stats_connection_desc[] = { "cursor: cursor update value size change", "cursor: cursors reused from cache", "cursor: open cursor count", + "data-handle: connection data handle size", "data-handle: connection data handles currently active", "data-handle: connection sweep candidate became referenced", "data-handle: connection sweep dhandles closed", @@ -1298,6 +1313,20 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS 
*stats) /* not clearing cache_bytes_dirty */ /* not clearing cache_pages_dirty */ stats->cache_eviction_clean = 0; + stats->fsync_all_fh_total = 0; + stats->fsync_all_fh = 0; + /* not clearing fsync_all_time */ + stats->capacity_threshold = 0; + stats->capacity_bytes_read = 0; + stats->capacity_bytes_ckpt = 0; + stats->capacity_bytes_evict = 0; + stats->capacity_bytes_log = 0; + stats->capacity_bytes_written = 0; + stats->capacity_time_total = 0; + stats->capacity_time_ckpt = 0; + stats->capacity_time_evict = 0; + stats->capacity_time_log = 0; + stats->capacity_time_read = 0; stats->cond_auto_wait_reset = 0; stats->cond_auto_wait = 0; stats->time_travel = 0; @@ -1339,6 +1368,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cursor_update_bytes_changed = 0; stats->cursor_reopen = 0; /* not clearing cursor_open_count */ + /* not clearing dh_conn_handle_size */ /* not clearing dh_conn_handle_count */ stats->dh_sweep_ref = 0; stats->dh_sweep_close = 0; @@ -1760,6 +1790,21 @@ __wt_stat_connection_aggregate( to->cache_bytes_dirty += WT_STAT_READ(from, cache_bytes_dirty); to->cache_pages_dirty += WT_STAT_READ(from, cache_pages_dirty); to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); + to->fsync_all_fh_total += WT_STAT_READ(from, fsync_all_fh_total); + to->fsync_all_fh += WT_STAT_READ(from, fsync_all_fh); + to->fsync_all_time += WT_STAT_READ(from, fsync_all_time); + to->capacity_threshold += WT_STAT_READ(from, capacity_threshold); + to->capacity_bytes_read += WT_STAT_READ(from, capacity_bytes_read); + to->capacity_bytes_ckpt += WT_STAT_READ(from, capacity_bytes_ckpt); + to->capacity_bytes_evict += WT_STAT_READ(from, capacity_bytes_evict); + to->capacity_bytes_log += WT_STAT_READ(from, capacity_bytes_log); + to->capacity_bytes_written += + WT_STAT_READ(from, capacity_bytes_written); + to->capacity_time_total += WT_STAT_READ(from, capacity_time_total); + to->capacity_time_ckpt += WT_STAT_READ(from, capacity_time_ckpt); + 
to->capacity_time_evict += WT_STAT_READ(from, capacity_time_evict); + to->capacity_time_log += WT_STAT_READ(from, capacity_time_log); + to->capacity_time_read += WT_STAT_READ(from, capacity_time_read); to->cond_auto_wait_reset += WT_STAT_READ(from, cond_auto_wait_reset); to->cond_auto_wait += WT_STAT_READ(from, cond_auto_wait); to->time_travel += WT_STAT_READ(from, time_travel); @@ -1804,6 +1849,7 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, cursor_update_bytes_changed); to->cursor_reopen += WT_STAT_READ(from, cursor_reopen); to->cursor_open_count += WT_STAT_READ(from, cursor_open_count); + to->dh_conn_handle_size += WT_STAT_READ(from, dh_conn_handle_size); to->dh_conn_handle_count += WT_STAT_READ(from, dh_conn_handle_count); to->dh_sweep_ref += WT_STAT_READ(from, dh_sweep_ref); to->dh_sweep_close += WT_STAT_READ(from, dh_sweep_close); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 60fb6aee8c5..13fd1ee1233 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -643,7 +643,7 @@ __txn_commit_timestamp_validate(WT_SESSION_IMPL *session) * Check timestamps are used in order. */ op_zero_ts = !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT); - upd_zero_ts = upd->timestamp == WT_TS_NONE; + upd_zero_ts = upd->start_ts == WT_TS_NONE; if (op_zero_ts != upd_zero_ts) WT_RET_MSG(session, EINVAL, "per-key timestamps used inconsistently"); @@ -655,14 +655,14 @@ __txn_commit_timestamp_validate(WT_SESSION_IMPL *session) if (op_zero_ts) continue; - op_timestamp = op->u.op_upd->timestamp; + op_timestamp = op->u.op_upd->start_ts; /* * Only if the update structure doesn't have a timestamp * then use the one in the transaction structure. 
*/ if (op_timestamp == WT_TS_NONE) op_timestamp = txn->commit_timestamp; - if (op_timestamp < upd->timestamp) + if (op_timestamp < upd->start_ts) WT_RET_MSG(session, EINVAL, "out of order timestamps"); } @@ -1016,7 +1016,7 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) } /* Set prepare timestamp. */ - upd->timestamp = ts; + upd->start_ts = ts; WT_PUBLISH(upd->prepare_state, WT_PREPARE_INPROGRESS); op->u.op_upd = NULL; diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index dfcd5eaa7c8..2ff42b7d220 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -106,10 +106,10 @@ __txn_abort_newer_update(WT_SESSION_IMPL *session, * updates were also rolled back. */ if (upd->txnid == WT_TXN_ABORTED || - upd->timestamp == WT_TS_NONE) { + upd->start_ts == WT_TS_NONE) { if (upd == first_upd) first_upd = upd->next; - } else if (rollback_timestamp < upd->durable_timestamp) { + } else if (rollback_timestamp < upd->durable_ts) { /* * If any updates are aborted, all newer updates * better be aborted as well. @@ -127,8 +127,8 @@ __txn_abort_newer_update(WT_SESSION_IMPL *session, upd->txnid = WT_TXN_ABORTED; WT_STAT_CONN_INCR(session, txn_rollback_upd_aborted); - upd->timestamp = 0; - upd->durable_timestamp = 0; + upd->durable_ts = 0; + upd->start_ts = 0; } } } diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c index a1ad6097e70..de4f31fcf23 100644 --- a/src/third_party/wiredtiger/src/utilities/util_main.c +++ b/src/third_party/wiredtiger/src/utilities/util_main.c @@ -256,6 +256,7 @@ main(int argc, char *argv[]) /* Build the configuration string. 
*/ len = 10; /* some slop */ p1 = p2 = p3 = ""; + len += strlen("error_prefix=wt"); if (config != NULL) len += strlen(config); if (cmd_config != NULL) @@ -271,7 +272,7 @@ main(int argc, char *argv[]) (void)util_err(NULL, errno, NULL); goto err; } - if ((ret = __wt_snprintf(p, len, "%s,%s,%s%s%s%s", + if ((ret = __wt_snprintf(p, len, "error_prefix=wt,%s,%s,%s%s%s%s", config == NULL ? "" : config, cmd_config == NULL ? "" : cmd_config, rec_config, p1, p2, p3)) != 0) { diff --git a/src/third_party/wiredtiger/test/csuite/random_directio/main.c b/src/third_party/wiredtiger/test/csuite/random_directio/main.c index eafbe9754be..894d704a7cf 100644 --- a/src/third_party/wiredtiger/test/csuite/random_directio/main.c +++ b/src/third_party/wiredtiger/test/csuite/random_directio/main.c @@ -904,12 +904,11 @@ check_db(uint32_t nth, uint32_t datasize, bool directio, uint32_t flags) printf("starting full scan at %" PRIu64 "\n", id); gen_kv(keybuf, kvsize, id, 0, large_arr[0], true); cursor->set_key(cursor, keybuf); - testutil_check(cursor->search(cursor)); th = 0; /* Keep bitmap of "active" threads. */ threadmap = (0x1U << nth) - 1; - for (ret = 0; ret != WT_NOTFOUND && threadmap != 0; + for (ret = cursor->search(cursor); ret != WT_NOTFOUND && threadmap != 0; ret = cursor->next(cursor)) { testutil_check(ret); testutil_check(cursor->get_key(cursor, &gotkey)); diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index 00e16ed43ac..9189a993732 100644 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -965,6 +965,18 @@ tasks: ulimit -c unlimited largescale/run-million-collection-test.sh . 
+ - name: compatibility-test-for-mongodb-releases + commands: + - func: "fetch source" + - command: shell.exec + params: + working_dir: "wiredtiger" + script: | + set -o errexit + set -o verbose + test/evergreen/compatibility_test_for_mongodb_releases.sh + + buildvariants: - name: ubuntu1404 display_name: Ubuntu 14.04 @@ -972,9 +984,10 @@ buildvariants: - ubuntu1404-test expansions: # It's ugly, but we need the absolute path here, not the relative - test_env_vars: PATH=/opt/mongodbtoolchain/v2/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.libs + test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.libs smp_command: -j $(grep -c ^processor /proc/cpuinfo) - configure_env_vars: CC=/opt/mongodbtoolchain/bin/gcc CXX=/opt/mongodbtoolchain/bin/g++ PATH=/opt/mongodbtoolchain/v2/bin:$PATH + configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/gcc CXX=/opt/mongodbtoolchain/v3/bin/g++ PATH=/opt/mongodbtoolchain/v3/bin:$PATH + make_command: PATH=/opt/mongodbtoolchain/v3/bin:$PATH make tasks: - name: compile - name: lang-python-test @@ -1037,10 +1050,18 @@ buildvariants: run_on: - rhel62-large expansions: - configure_env_vars: CC=/opt/mongodbtoolchain/bin/gcc CXX=/opt/mongodbtoolchain/bin/g++ + configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/gcc CXX=/opt/mongodbtoolchain/v3/bin/g++ tasks: - name: million-collection-test +- name: compatibility-tests + display_name: Compatibility tests + batchtime: 10080 # 7 days + run_on: + - ubuntu1404-test + tasks: + - name: compatibility-test-for-mongodb-releases + - name: windows-64 display_name: Windows 64-bit run_on: @@ -1058,9 +1079,9 @@ buildvariants: - macos-1012 expansions: smp_command: -j $(sysctl -n hw.logicalcpu) - configure_env_vars: PATH=/opt/mongodbtoolchain/v2/bin:$PATH - make_command: PATH=/opt/mongodbtoolchain/v2/bin:$PATH ARCHFLAGS=-Wno-error=unused-command-line-argument-hard-error-in-future make - test_env_vars: PATH=/opt/mongodbtoolchain/v2/bin:$PATH DYLD_LIBRARY_PATH=$(pwd)/.libs + configure_env_vars: 
PATH=/opt/mongodbtoolchain/v3/bin:$PATH + make_command: PATH=/opt/mongodbtoolchain/v3/bin:$PATH ARCHFLAGS=-Wno-error=unused-command-line-argument-hard-error-in-future make + test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH DYLD_LIBRARY_PATH=$(pwd)/.libs tasks: - name: compile - name: make-check-test diff --git a/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_mongodb_releases.sh b/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_mongodb_releases.sh new file mode 100755 index 00000000000..1207c479c59 --- /dev/null +++ b/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_mongodb_releases.sh @@ -0,0 +1,138 @@ +#!/usr/bin/env bash +############################################################################################## +# Check releases to ensure forward and backward compatibility. +############################################################################################## + +########################################################################### +# Return the most recent version of the tagged release. +########################################################################### +get_release() +{ + echo "$(git tag | grep "^mongodb-$1.[0-9]" | sort -V | sed -e '$p' -e d)" +} + +############################################################# +# This function will +# - checkout git tree of the desired release and build it, +# - generate test objects. +# +# arg1: MongoDB tagged release number or develop branch identifier. 
+############################################################# +build_rel() +{ + echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" + echo "Building release: \"$1\"" + echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" + + git clone --quiet https://github.com/wiredtiger/wiredtiger.git "wt.$1" > /dev/null || return 1 + cd "wt.$1" || return 1 + + config="" + config+="--enable-snappy " + + case "$1" in + # Please note 'develop' here is planned as the future MongoDB release 4.2 - the only release that supports + # both enabling and disabling of timestamps in data format. Once 4.2 is released, we need to update this script. + "develop") + branch="develop";; + "develop-timestamps") + branch="develop" + config+="--enable-page-version-ts";; + *) + branch=$(get_release "$1");; + esac + + git checkout --quiet "$branch" || return 1 + + (sh build_posix/reconf && ./configure $config && make -j $(grep -c ^processor /proc/cpuinfo)) > /dev/null || return 1 + + cd test/format || return 1 + + # Run a configuration and generate some on-disk files. + args="" + args+="cache=80 " # Medium cache so there's eviction + args+="checkpoints=1 " # Force periodic writes + args+="compression=snappy " # We only built with snappy, force the choice + args+="data_source=table " + args+="in_memory=0 " # Interested in the on-disk format + args+="leak_memory=1 " # Faster runs + args+="logging_compression=snappy " # We only built with snappy, force the choice + args+="quiet=1 " + args+="rebalance=0 " # Faster runs + args+="rows=1000000 " + args+="salvage=0 " # Faster runs + args+="timer=4 " + args+="verify=0 " # Faster runs + for am in fix row var; do + ./t -h "RUNDIR.$am" -1 "file_type=$am" $args || return 1 + done + + return 0 +} + +############################################################# +# This function will +# - verify a pair of releases can verify each other's objects. 
+# +# arg1: release #1 +# arg2: release #2 +############################################################# +verify() +{ + echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" + echo "Verifying release \"$1\" and \"$2\"" + echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" + a="wt.$1" + b="wt.$2" + + EXT="extensions=[" + EXT+="ext/compressors/snappy/.libs/libwiredtiger_snappy.so," + EXT+="ext/collators/reverse/.libs/libwiredtiger_reverse_collator.so, " + EXT+="ext/encryptors/rotn/.libs/libwiredtiger_rotn.so, " + EXT+="]" + + cd $a || return 1 + for am in fix row var; do + echo "$a/wt verifying $b/test/format/RUNDIR.$am..." + WIREDTIGER_CONFIG="$EXT" \ + ./wt -h ../$b/test/format/RUNDIR.$am verify table:wt || return 1 + done + + cd ../$b || return 1 + for am in fix row var; do + echo "$b/wt verifying $a/test/format/RUNDIR.$am..." + WIREDTIGER_CONFIG="$EXT" \ + ./wt -h ../$a/test/format/RUNDIR.$am verify table:wt || return 1 + done + + return 0 +} + +run() +{ + # Build test files from each release. + (build_rel 3.4) || return 1 + (build_rel 3.6) || return 1 + (build_rel 4.0) || return 1 + (build_rel develop) || return 1 + (build_rel develop-timestamps) || return 1 + + # Verify forward/backward compatibility. + (verify 3.4 3.6) || return 1 + (verify 3.6 4.0) || return 1 + (verify 4.0 develop) || return 1 + (verify 4.0 develop-timestamps) || return 1 + (verify develop develop-timestamps) || return 1 + + return 0 +} + +# Create a directory in which to do the work. +top="test-compatibility-run" +rm -rf $top && mkdir $top && cd $top || { + echo "$0: unable to create $top working directory" + exit 1 +} + +run +exit $? 
diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index f4354588a6b..d638781197e 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -334,6 +334,10 @@ static CONFIG c[] = { "maximum time to run in minutes", C_IGNORE, 0, 0, UINT_MAX, &g.c_timer, NULL }, + { "timing_stress_aggressive_sweep", + "stress aggressive sweep", /* 2% */ + C_BOOL, 2, 0, 0, &g.c_timing_stress_aggressive_sweep, NULL }, + { "timing_stress_checkpoint", "stress checkpoints", /* 2% */ C_BOOL, 2, 0, 0, &g.c_timing_stress_checkpoint, NULL }, diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index aab40ef3174..8ef34ec226f 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -220,6 +220,7 @@ typedef struct { uint32_t c_statistics_server; uint32_t c_threads; uint32_t c_timer; + uint32_t c_timing_stress_aggressive_sweep; uint32_t c_timing_stress_checkpoint; uint32_t c_timing_stress_lookaside_sweep; uint32_t c_timing_stress_split_1; diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c index e52640d9c46..fc12c381a23 100644 --- a/src/third_party/wiredtiger/test/format/wts.c +++ b/src/third_party/wiredtiger/test/format/wts.c @@ -235,6 +235,8 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) /* Optionally stress operations. 
*/ CONFIG_APPEND(p, ",timing_stress_for_test=["); + if (g.c_timing_stress_aggressive_sweep) + CONFIG_APPEND(p, ",aggressive_sweep"); if (g.c_timing_stress_checkpoint) CONFIG_APPEND(p, ",checkpoint_slow"); if (g.c_timing_stress_lookaside_sweep) diff --git a/src/third_party/wiredtiger/test/suite/test_baseconfig.py b/src/third_party/wiredtiger/test/suite/test_baseconfig.py index 4a97a3599a4..717f49dece2 100644..100755 --- a/src/third_party/wiredtiger/test/suite/test_baseconfig.py +++ b/src/third_party/wiredtiger/test/suite/test_baseconfig.py @@ -36,6 +36,8 @@ class test_baseconfig(wttest.WiredTigerTestCase): # Open up another database and modify the baseconfig os.mkdir("A") conn = self.wiredtiger_open("A", 'create') + # Mark the new directory as corrupted + self.databaseCorrupted("A") self.assertTrue(os.path.exists("A/WiredTiger.basecfg")) with open("A/WiredTiger.basecfg", "a") as basecfg_file: basecfg_file.write("foo!") diff --git a/src/third_party/wiredtiger/test/suite/test_dictionary.py b/src/third_party/wiredtiger/test/suite/test_dictionary.py index 2b49b46e7f2..0990cdfb525 100644 --- a/src/third_party/wiredtiger/test/suite/test_dictionary.py +++ b/src/third_party/wiredtiger/test/suite/test_dictionary.py @@ -38,8 +38,8 @@ import wiredtiger, wttest class test_dictionary(wttest.WiredTigerTestCase): conn_config = 'statistics=(all)' scenarios = make_scenarios([ - ('row', dict(key_format='S', value_format='S')), - ('var', dict(key_format='r', value_format='S')), + ('row', dict(key_format='S')), + ('var', dict(key_format='r')), ]) # Smoke test dictionary compression. @@ -48,8 +48,10 @@ class test_dictionary(wttest.WiredTigerTestCase): uri = 'file:test_dictionary' # This is a btree layer test. # Create the object, open the cursor, insert some records with identical values. Use - # alternating values, otherwise column-store will RLE compress them into a single item. 
- self.session.create(uri, 'dictionary=100,value_format=S,key_format=' + self.key_format) + # a reasonably large page size so most of the items fit on a page. Use alternating + # values, otherwise column-store will RLE compress them into a single item. + config='leaf_page_max=64K,dictionary=100,value_format=S,key_format=' + self.session.create(uri, config + self.key_format) cursor = self.session.open_cursor(uri, None) i = 0 while i < nentries: diff --git a/src/third_party/wiredtiger/test/suite/test_reconfig01.py b/src/third_party/wiredtiger/test/suite/test_reconfig01.py index 8957e1d4e43..84f96150768 100644 --- a/src/third_party/wiredtiger/test/suite/test_reconfig01.py +++ b/src/third_party/wiredtiger/test/suite/test_reconfig01.py @@ -106,6 +106,13 @@ class test_reconfig01(wttest.WiredTigerTestCase): self.conn.reconfigure("statistics=(fast)") self.conn.reconfigure("statistics=(none)") + def test_reconfig_capacity(self): + self.conn.reconfigure("io_capacity=(total=80M)") + self.conn.reconfigure("io_capacity=(total=100M)") + msg = '/below minimum/' + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.conn.reconfigure("io_capacity=(total=16K)"), msg) + def test_reconfig_checkpoints(self): self.conn.reconfigure("checkpoint=(wait=0)") self.conn.reconfigure("checkpoint=(wait=5)") diff --git a/src/third_party/wiredtiger/test/suite/test_split.py b/src/third_party/wiredtiger/test/suite/test_split.py index 11f0f115e1d..cde840fb055 100644 --- a/src/third_party/wiredtiger/test/suite/test_split.py +++ b/src/third_party/wiredtiger/test/suite/test_split.py @@ -45,9 +45,13 @@ class test_split(wttest.WiredTigerTestCase): 'allocation_size=4KB,leaf_page_max=4KB,split_pct=75') cursor = self.session.open_cursor(self.uri, None) + # THIS TEST IS DEPENDENT ON THE PAGE SIZES CREATED BY RECONCILIATION. + # IF IT FAILS, IT MAY BE RECONCILIATION ISN'T CREATING THE SAME SIZE + # PAGES AS BEFORE. 
+ # Create a 4KB page (more than 3KB): 40 records w / 10 byte keys # and 81 byte values. - for i in range(40): + for i in range(35): cursor['%09d' % i] = 8 * ('%010d' % i) # Stabilize @@ -59,7 +63,7 @@ class test_split(wttest.WiredTigerTestCase): # Now append a few records so we're definitely (a little) over 4KB cursor = self.session.open_cursor(self.uri, None) - for i in range(50,55): + for i in range(50,60): cursor['%09d' % i] = 8 * ('%010d' % i) # Stabilize diff --git a/src/third_party/wiredtiger/test/suite/test_txn19.py b/src/third_party/wiredtiger/test/suite/test_txn19.py index a6061ffbb9a..98053a5c7a8 100755 --- a/src/third_party/wiredtiger/test/suite/test_txn19.py +++ b/src/third_party/wiredtiger/test/suite/test_txn19.py @@ -201,6 +201,8 @@ class test_txn19(wttest.WiredTigerTestCase, suite_subprocess): def corrupt_log(self, homedir): if not self.corrupted(): return + # Mark this test as having corrupted files + self.databaseCorrupted() self.f(self.log_number_to_file_name(homedir, self.corruptpos)) # Corrupt a second log file if needed diff --git a/src/third_party/wiredtiger/test/suite/wttest.py b/src/third_party/wiredtiger/test/suite/wttest.py index c0b755d2230..ca4a8295373 100644..100755 --- a/src/third_party/wiredtiger/test/suite/wttest.py +++ b/src/third_party/wiredtiger/test/suite/wttest.py @@ -585,6 +585,15 @@ class WiredTigerTestCase(unittest.TestCase): msg = '**** ' + myname + ' HAS A KNOWN LIMITATION: ' + name + ' ****' self.printOnce(msg) + def databaseCorrupted(self, directory = None): + """ + Mark a database as corrupted by creating a DATABASE_CORRUPTED + file in the given directory (default: the test's home directory). + """ + if directory is None: + directory = self.home + open(os.path.join(directory, "DATABASE_CORRUPTED"), "a").close() + @staticmethod def printVerbose(level, message): if level <= WiredTigerTestCase._verbose: |