diff options
author | Matt Kangas <matt.kangas@mongodb.com> | 2014-12-02 09:02:22 -0500 |
---|---|---|
committer | Matt Kangas <matt.kangas@mongodb.com> | 2014-12-02 14:44:03 -0500 |
commit | e2a08e2cd6fb0ebd226c22cae6ba73425613e477 (patch) | |
tree | 9d935b6d12d9444138676ae444d3e35f516b2e92 | |
parent | 15a2b7ca39ce472e43d8d37c6127eb4c3958456c (diff) | |
download | mongo-e2a08e2cd6fb0ebd226c22cae6ba73425613e477.tar.gz |
Import wiredtiger-wiredtiger-2.4.0-494-ge498f55.tar.gz from wiredtiger branch mongodb-2.8
34 files changed, 1148 insertions, 498 deletions
diff --git a/src/third_party/wiredtiger/SConstruct b/src/third_party/wiredtiger/SConstruct index 1ec3496d565..248e967872e 100644 --- a/src/third_party/wiredtiger/SConstruct +++ b/src/third_party/wiredtiger/SConstruct @@ -199,11 +199,19 @@ env.Program("wt", [ LIBS=[wtlib] + wtlibs) if GetOption("swig"): - env.SharedLibrary('_wiredtiger', + swiglib = env.SharedLibrary('_wiredtiger', [ 'lang\python\wiredtiger.i'], SHLIBSUFFIX=".pyd", LIBS=[wtlib]) + copySwig = env.Command( + 'lang/python/wiredtiger/__init__.py', + 'lang/python/wiredtiger.py', + Copy('$TARGET', '$SOURCE')) + env.Depends(copySwig, swiglib) + + env.Install('lang/python/wiredtiger/', swiglib) + # Shim library of functions to emulate POSIX on Windows shim = env.Library("window_shim", ["test/windows/windows_shim.c"]) diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/log-append.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/log-append.wtperf new file mode 100644 index 00000000000..9d0a78e3c61 --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/log-append.wtperf @@ -0,0 +1,8 @@ +# wtperf options file: Test a log file with a multi-threaded +# append workload. +conn_config="cache_size=1G,log=(enabled=true,file_max=20MB),checkpoint=(log_size=1G)" +table_config="type=file" +icount=50000000 +report_interval=5 +run_time=0 +populate_threads=8 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/log-nockpt.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/log-nockpt.wtperf new file mode 100644 index 00000000000..a078cead740 --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/log-nockpt.wtperf @@ -0,0 +1,12 @@ +# wtperf options file: Test performance with a log file enabled. +# Set the log file reasonably small to catch log-switch bottle +# necks. 
+conn_config="cache_size=1G,log=(enabled=true,file_max=20MB)" +table_config="type=file" +icount=50000 +report_interval=5 +run_time=40 +populate_threads=1 +random_range=50000000 +threads=((count=8,inserts=1)) + diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/log.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/log.wtperf new file mode 100644 index 00000000000..c336c9d8a5f --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/log.wtperf @@ -0,0 +1,11 @@ +# wtperf options file: Test performance with a log file enabled. +# Set the log file reasonably small to catch log-switch bottle +# necks. +conn_config="cache_size=1G,log=(enabled=true,file_max=20MB),checkpoint=(log_size=1G)" +table_config="type=file" +icount=50000 +report_interval=5 +run_time=120 +populate_threads=1 +random_range=50000000 +threads=((count=8,inserts=1)) diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm-noprefix.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm-noprefix.wtperf index 413c16075d3..091c4e69ad1 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm-noprefix.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm-noprefix.wtperf @@ -1,6 +1,6 @@ # wtperf options file: medium lsm configuration, with multiple tables. 
conn_config="cache_size=1G,lsm_manager=(worker_thread_max=8)" -table_config="lsm=(chunk_size=100MB,chunk_max=1TB),type=lsm,prefix_compression=false,os_cache_dirty_max=16MB" +table_config="lsm=(chunk_max=1TB),type=lsm,prefix_compression=false,os_cache_dirty_max=16MB" icount=50000000 populate_threads=1 compact=true diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm.wtperf index 99b7b49aebd..d8433352311 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm.wtperf @@ -1,6 +1,6 @@ # wtperf options file: medium lsm configuration, with multiple tables. conn_config="cache_size=1G,lsm_manager=(worker_thread_max=8)" -table_config="lsm=(chunk_size=100MB,chunk_max=1TB),type=lsm,os_cache_dirty_max=16MB" +table_config="lsm=(chunk_max=1TB),type=lsm,os_cache_dirty_max=16MB" icount=50000000 populate_threads=1 compact=true diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh index 3296a4072b5..d5de7c4abdb 100755 --- a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh +++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh @@ -10,19 +10,20 @@ # build_posix/bench/wtperf). # # This script should be invoked with the pathname of the wtperf test -# config to run. +# config to run and the number of runs. # -if test "$#" -ne "1"; then - echo "Must specify wtperf test to run" +if test "$#" -ne "2"; then + echo "Must specify wtperf test to run and number of runs" exit 1 fi wttest=$1 +runmax=$2 + home=./WT_TEST outfile=./wtperf.out rm -f $outfile -runmax=5 -run=1 +# Each of these has an entry for each op in ops below. 
avg=(0 0 0) max=(0 0 0) min=(0 0 0) @@ -72,6 +73,7 @@ isstable() getmin=0 getmax=1 +run=1 while test "$run" -le "$runmax"; do rm -rf $home mkdir $home @@ -144,21 +146,36 @@ while test "$run" -le "$runmax"; do run=`expr $run + 1` done -if test "$run" -le "$runmax"; then +skipminmax=0 +if test "$runmax" -le "2"; then + numruns=$(getval $getmin $run $runmax) + skipminmax=1 +elif test "$run" -le "$runmax"; then numruns=`expr $run - 2` else numruns=`expr $runmax - 2` fi +if test "$numruns" -eq "0"; then + $numruns=1 +fi # # The sum contains all runs. Subtract out the min/max values. # Average the remaining and write it out to the file. # for i in ${!min[*]}; do if test "$i" -eq "$loadindex"; then - s=`echo "scale=3; ${sum[$i]} - ${min[$i]} - ${max[$i]}" | bc` + if test "$skipminmax" -eq "0"; then + s=`echo "scale=3; ${sum[$i]} - ${min[$i]} - ${max[$i]}" | bc` + else + s=${sum[$i]} + fi avg[$i]=`echo "scale=3; $s / $numruns" | bc` else - s=`expr ${sum[$i]} - ${min[$i]} - ${max[$i]}` + if test "$skipminmax" -eq "0"; then + s=`expr ${sum[$i]} - ${min[$i]} - ${max[$i]}` + else + s=${sum[$i]} + fi avg[$i]=`expr $s / $numruns` fi done diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i index 3b4ddb6b3ad..3fec9bddac9 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i @@ -105,7 +105,7 @@ DEF_OPT_AS_UINT32(database_count, 1, " threads") DEF_OPT_AS_UINT32(icount, 5000, "number of records to initially populate. 
If multiple tables are " - "configured, each table has this many items inserted.") + "configured the count is spread evenly across all tables.") DEF_OPT_AS_BOOL(insert_rmw, 0, "execute a read prior to each insert in workload phase") DEF_OPT_AS_UINT32(key_sz, 20, "key size") diff --git a/src/third_party/wiredtiger/dist/stat.py b/src/third_party/wiredtiger/dist/stat.py index 6a3a1b74db3..5ffcd07e66c 100644 --- a/src/third_party/wiredtiger/dist/stat.py +++ b/src/third_party/wiredtiger/dist/stat.py @@ -5,7 +5,7 @@ import re, string, sys, textwrap from dist import compare_srcfile # Read the source files. -from stat_data import dsrc_stats, connection_stats +from stat_data import groups, dsrc_stats, connection_stats def print_struct(title, name, base, stats): '''Print the structures for the stat.h file.''' @@ -161,23 +161,34 @@ compare_srcfile(tmp_file, '../src/support/stat.c') # Update the statlog file with the entries we can scale per second. scale_info = 'no_scale_per_second_list = [\n' clear_info = 'no_clear_list = [\n' +prefix_list = [] for l in sorted(connection_stats): + prefix_list.append(l.prefix) if 'no_scale' in l.flags: scale_info += ' \'' + l.desc + '\',\n' if 'no_clear' in l.flags: clear_info += ' \'' + l.desc + '\',\n' for l in sorted(dsrc_stats): + prefix_list.append(l.prefix) if 'no_scale' in l.flags: scale_info += ' \'' + l.desc + '\',\n' if 'no_clear' in l.flags: clear_info += ' \'' + l.desc + '\',\n' scale_info += ']\n' clear_info += ']\n' +prefix_info = 'prefix_list = [\n' +# Remove the duplicates and print out the list +for l in list(set(prefix_list)): + prefix_info += ' \'' + l + '\',\n' +prefix_info += ']\n' +group_info = 'groups = ' + str(groups) tmp_file = '__tmp' f = open(tmp_file, 'w') f.write('# DO NOT EDIT: automatically built by dist/stat.py. 
*/\n\n') f.write(scale_info) f.write(clear_info) +f.write(prefix_info) +f.write(group_info) f.close() compare_srcfile(tmp_file, '../tools/stat_data.py') diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index df6c919f808..1e4f1b41cb9 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -26,44 +26,69 @@ class Stat: return cmp(self.desc.lower(), other.desc.lower()) class AsyncStat(Stat): + prefix = 'async' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'async', desc, flags) + Stat.__init__(self, name, AsyncStat.prefix, desc, flags) class BlockStat(Stat): + prefix = 'block-manager' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'block-manager', desc, flags) + Stat.__init__(self, name, BlockStat.prefix, desc, flags) class BtreeStat(Stat): + prefix = 'btree' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'btree', desc, flags) + Stat.__init__(self, name, BtreeStat.prefix, desc, flags) class CacheStat(Stat): + prefix = 'cache' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'cache', desc, flags) + Stat.__init__(self, name, CacheStat.prefix, desc, flags) class CompressStat(Stat): + prefix = 'compression' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'compression', desc, flags) -class CursorStat(Stat): - def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'cursor', desc, flags) + Stat.__init__(self, name, CompressStat.prefix, desc, flags) class ConnStat(Stat): + prefix = 'connection' + def __init__(self, name, desc, flags=''): + Stat.__init__(self, name, ConnStat.prefix, desc, flags) +class CursorStat(Stat): + prefix = 'cursor' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'connection', desc, flags) + Stat.__init__(self, name, CursorStat.prefix, desc, flags) class DhandleStat(Stat): + prefix = 'data-handle' 
def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'data-handle', desc, flags) + Stat.__init__(self, name, DhandleStat.prefix, desc, flags) class LogStat(Stat): + prefix = 'log' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'log', desc, flags) + Stat.__init__(self, name, LogStat.prefix, desc, flags) class LSMStat(Stat): + prefix = 'LSM' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'LSM', desc, flags) + Stat.__init__(self, name, LSMStat.prefix, desc, flags) class RecStat(Stat): + prefix = 'reconciliation' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'reconciliation', desc, flags) + Stat.__init__(self, name, RecStat.prefix, desc, flags) class SessionStat(Stat): + prefix = 'session' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'session', desc, flags) + Stat.__init__(self, name, SessionStat.prefix, desc, flags) class TxnStat(Stat): + prefix = 'transaction' def __init__(self, name, desc, flags=''): - Stat.__init__(self, name, 'transaction', desc, flags) + Stat.__init__(self, name, TxnStat.prefix, desc, flags) + +########################################## +# Groupings of useful statistics: +# A pre-defined dictionary containing the group name as the key and the +# list of prefix tags that comprise that group. 
+########################################## +groups = {} +groups['cursor'] = [CursorStat.prefix, SessionStat.prefix] +groups['evict'] = [CacheStat.prefix, ConnStat.prefix, BlockStat.prefix] +groups['lsm'] = [LSMStat.prefix, TxnStat.prefix] +groups['memory'] = [CacheStat.prefix, ConnStat.prefix, RecStat.prefix] +groups['system'] = [ConnStat.prefix, DhandleStat.prefix, SessionStat.prefix] ########################################## # CONNECTION statistics @@ -123,6 +148,7 @@ connection_stats = [ 'maximum bytes configured', 'no_clear,no_scale'), CacheStat('cache_bytes_read', 'bytes read into cache'), CacheStat('cache_bytes_write', 'bytes written from cache'), + CacheStat('cache_eviction_app', 'pages evicted by application threads'), CacheStat('cache_eviction_clean', 'unmodified pages evicted'), CacheStat('cache_eviction_deepen', 'page split during eviction deepened the tree'), @@ -148,6 +174,7 @@ connection_stats = [ 'eviction server unable to reach eviction goal'), CacheStat('cache_eviction_split', 'pages split during eviction'), CacheStat('cache_eviction_walk', 'pages walked for eviction'), + CacheStat('cache_inmem_split', 'in-memory page splits'), CacheStat('cache_pages_dirty', 'tracked dirty pages in the cache', 'no_scale'), CacheStat('cache_pages_inuse', @@ -251,17 +278,17 @@ connection_stats = [ SessionStat('session_open', 'open session count', 'no_clear,no_scale'), ########################################## - # Total Btree cursor operations - ########################################## - BtreeStat('cursor_create', 'cursor create calls'), - BtreeStat('cursor_insert', 'cursor insert calls'), - BtreeStat('cursor_next', 'cursor next calls'), - BtreeStat('cursor_prev', 'cursor prev calls'), - BtreeStat('cursor_remove', 'cursor remove calls'), - BtreeStat('cursor_reset', 'cursor reset calls'), - BtreeStat('cursor_search', 'cursor search calls'), - BtreeStat('cursor_search_near', 'cursor search near calls'), - BtreeStat('cursor_update', 'cursor update calls'), + # Total 
cursor operations + ########################################## + CursorStat('cursor_create', 'cursor create calls'), + CursorStat('cursor_insert', 'cursor insert calls'), + CursorStat('cursor_next', 'cursor next calls'), + CursorStat('cursor_prev', 'cursor prev calls'), + CursorStat('cursor_remove', 'cursor remove calls'), + CursorStat('cursor_reset', 'cursor reset calls'), + CursorStat('cursor_search', 'cursor search calls'), + CursorStat('cursor_search_near', 'cursor search near calls'), + CursorStat('cursor_update', 'cursor update calls'), ] connection_stats = sorted(connection_stats, key=attrgetter('name')) @@ -371,6 +398,7 @@ dsrc_stats = [ 'data source pages selected for eviction unable to be evicted'), CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'), CacheStat('cache_eviction_internal', 'internal pages evicted'), + CacheStat('cache_inmem_split', 'in-memory page splits'), CacheStat('cache_overflow_value', 'overflow values cached in memory', 'no_scale'), CacheStat('cache_read', 'pages read into cache'), diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c index d500f93817a..28c39c71a08 100644 --- a/src/third_party/wiredtiger/src/block/block_ext.c +++ b/src/third_party/wiredtiger/src/block/block_ext.c @@ -1167,7 +1167,7 @@ __wt_block_extlist_read(WT_SESSION_IMPL *session, * merged). Second, the "available" list is sorted by size as well as * by offset, and the fast-path append code doesn't support that, it's * limited to offset. The test of "track size" is short-hand for "are - * we reading the "available" list. + * we reading the available-blocks list". */ func = el->track_size == 0 ? 
__block_append : __block_merge; for (;;) { diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c index 148b4fa9743..db4e42b79e7 100644 --- a/src/third_party/wiredtiger/src/block/block_vrfy.c +++ b/src/third_party/wiredtiger/src/block/block_vrfy.c @@ -17,7 +17,7 @@ static int __verify_last_avail(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *); static int __verify_last_truncate(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *); /* The bit list ignores the first block: convert to/from a frag/offset. */ -#define WT_wt_off_tO_FRAG(block, off) \ +#define WT_wt_off_TO_FRAG(block, off) \ ((off) / (block)->allocsize - 1) #define WT_FRAG_TO_OFF(block, frag) \ (((wt_off_t)(frag + 1)) * (block)->allocsize) @@ -81,7 +81,7 @@ __wt_block_verify_start( * verify many non-contiguous blocks creating too many entries on the * list to fit into memory. */ - block->frags = (uint64_t)WT_wt_off_tO_FRAG(block, size); + block->frags = (uint64_t)WT_wt_off_TO_FRAG(block, size); WT_RET(__bit_alloc(session, block->frags, &block->fragfile)); /* @@ -232,6 +232,22 @@ __wt_verify_ckpt_load( } /* + * We don't need the list of blocks on a checkpoint's avail list, but we + * read it to ensure it wasn't corrupted. We could confirm correctness + * of intermediate avail lists (that is, if they're logically the result + * of the allocations and discards to this point). We don't because the + * only avail list ever used is the one for the last checkpoint, which + * is separately verified by checking it against all of the blocks found + * in the file. + */ + el = &ci->avail; + if (el->offset != WT_BLOCK_INVALID_OFFSET) { + WT_RET(__wt_block_extlist_read( + session, block, el, ci->file_size)); + __wt_block_extlist_free(session, el); + } + + /* * The root page of the checkpoint appears on the alloc list, but not, * at least until the checkpoint is deleted, on a discard list. 
To * handle this case, remove the root page from the accumulated list of @@ -252,7 +268,7 @@ __wt_verify_ckpt_load( WT_RET(__bit_alloc(session, block->frags, &block->fragckpt)); el = &block->verify_alloc; WT_EXT_FOREACH(ext, el->off) { - frag = (uint64_t)WT_wt_off_tO_FRAG(block, ext->off); + frag = (uint64_t)WT_wt_off_TO_FRAG(block, ext->off); frags = (uint64_t)(ext->size / block->allocsize); __bit_nset(block->fragckpt, frag, frag + (frags - 1)); } @@ -342,7 +358,7 @@ __verify_filefrag_add(WT_SESSION_IMPL *session, WT_BLOCK *block, "non-existent file blocks", (uintmax_t)offset, (uintmax_t)(offset + size)); - frag = (uint64_t)WT_wt_off_tO_FRAG(block, offset); + frag = (uint64_t)WT_wt_off_TO_FRAG(block, offset); frags = (uint64_t)(size / block->allocsize); /* It may be illegal to reference a particular chunk more than once. */ @@ -445,7 +461,7 @@ __verify_ckptfrag_add( "file blocks outside the checkpoint", (uintmax_t)offset, (uintmax_t)(offset + size)); - frag = (uint64_t)WT_wt_off_tO_FRAG(block, offset); + frag = (uint64_t)WT_wt_off_TO_FRAG(block, offset); frags = (uint64_t)(size / block->allocsize); /* It is illegal to reference a particular chunk more than once. */ diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 0cc79776634..6a07969d684 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -120,7 +120,7 @@ new_page: * Return the next variable-length entry on the append list. 
*/ static inline int -__cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage) +__cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage, int *skipped) { WT_ITEM *val; WT_SESSION_IMPL *session; @@ -141,8 +141,10 @@ new_page: if (cbt->ins == NULL) __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL || - WT_UPDATE_DELETED_ISSET(upd)) + WT_UPDATE_DELETED_ISSET(upd)) { + *skipped = 1; continue; + } val->data = WT_UPDATE_DATA(upd); val->size = upd->size; break; @@ -155,7 +157,7 @@ new_page: if (cbt->ins == NULL) * Move to the next, variable-length column-store item. */ static inline int -__cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage) +__cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage, int *skipped) { WT_CELL *cell; WT_CELL_UNPACK unpack; @@ -195,8 +197,10 @@ new_page: /* Find the matching WT_COL slot. */ upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd != NULL) { - if (WT_UPDATE_DELETED_ISSET(upd)) + if (WT_UPDATE_DELETED_ISSET(upd)) { + *skipped = 1; continue; + } val->data = WT_UPDATE_DATA(upd); val->size = upd->size; @@ -233,7 +237,7 @@ new_page: /* Find the matching WT_COL slot. */ * Move to the next row-store item. 
*/ static inline int -__cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage) +__cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage, int *skipped) { WT_INSERT *ins; WT_ITEM *key, *val; @@ -277,8 +281,10 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage) new_insert: if ((ins = cbt->ins) != NULL) { if ((upd = __wt_txn_read(session, ins->upd)) == NULL || - WT_UPDATE_DELETED_ISSET(upd)) + WT_UPDATE_DELETED_ISSET(upd)) { + *skipped = 1; continue; + } key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); val->data = WT_UPDATE_DATA(upd); @@ -307,8 +313,10 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row_d[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); - if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) + if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { + *skipped = 1; continue; + } return (__cursor_row_slot_return(cbt, rip, upd)); } @@ -385,7 +393,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating) WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; - int newpage; + int skipped, newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; @@ -410,15 +418,18 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating) * found. Then, move to the next page, until we reach the end of the * file. */ - page = cbt->ref == NULL ? NULL : cbt->ref->page; - for (newpage = 0;; newpage = 1) { + for (skipped = newpage = 0;; skipped = 0, newpage = 1) { + page = cbt->ref == NULL ? 
NULL : cbt->ref->page; + WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); + if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_next(cbt, newpage); break; case WT_PAGE_COL_VAR: - ret = __cursor_var_append_next(cbt, newpage); + ret = __cursor_var_append_next( + cbt, newpage, &skipped); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -433,10 +444,10 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating) ret = __cursor_fix_next(cbt, newpage); break; case WT_PAGE_COL_VAR: - ret = __cursor_var_next(cbt, newpage); + ret = __cursor_var_next(cbt, newpage, &skipped); break; case WT_PAGE_ROW_LEAF: - ret = __cursor_row_next(cbt, newpage); + ret = __cursor_row_next(cbt, newpage, &skipped); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -455,11 +466,17 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating) } } + /* + * If we scanned all the way through a page and only saw + * deleted records, try to evict the page as we release it. + * Otherwise repeatedly deleting from the beginning of a tree + * can have quadratic performance. + */ + if (newpage && skipped) + page->read_gen = WT_READGEN_OLDEST; + WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); - - page = cbt->ref->page; - WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page)); } err: if (ret != 0) diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 8de784d1f1d..a6be8271ea5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -257,7 +257,7 @@ new_page: * Return the previous variable-length entry on the append list. 
*/ static inline int -__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage) +__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage, int *skipped) { WT_ITEM *val; WT_SESSION_IMPL *session; @@ -278,8 +278,10 @@ new_page: if (cbt->ins == NULL) __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL || - WT_UPDATE_DELETED_ISSET(upd)) + WT_UPDATE_DELETED_ISSET(upd)) { + *skipped = 1; continue; + } val->data = WT_UPDATE_DATA(upd); val->size = upd->size; break; @@ -292,7 +294,7 @@ new_page: if (cbt->ins == NULL) * Move to the previous, variable-length column-store item. */ static inline int -__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage) +__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage, int *skipped) { WT_CELL *cell; WT_CELL_UNPACK unpack; @@ -333,8 +335,10 @@ new_page: if (cbt->recno < page->pg_var_recno) upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd != NULL) { - if (WT_UPDATE_DELETED_ISSET(upd)) + if (WT_UPDATE_DELETED_ISSET(upd)) { + *skipped = 1; continue; + } val->data = WT_UPDATE_DATA(upd); val->size = upd->size; @@ -352,8 +356,10 @@ new_page: if (cbt->recno < page->pg_var_recno) if ((cell = WT_COL_PTR(page, cip)) == NULL) continue; __wt_cell_unpack(cell, &unpack); - if (unpack.type == WT_CELL_DEL) + if (unpack.type == WT_CELL_DEL) { + *skipped = 1; continue; + } WT_RET(__wt_page_cell_data_ref( session, page, &unpack, &cbt->tmp)); @@ -371,7 +377,7 @@ new_page: if (cbt->recno < page->pg_var_recno) * Move to the previous row-store item. 
*/ static inline int -__cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage) +__cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage, int *skipped) { WT_INSERT *ins; WT_ITEM *key, *val; @@ -426,8 +432,10 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage) new_insert: if ((ins = cbt->ins) != NULL) { if ((upd = __wt_txn_read(session, ins->upd)) == NULL || - WT_UPDATE_DELETED_ISSET(upd)) + WT_UPDATE_DELETED_ISSET(upd)) { + *skipped = 1; continue; + } key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); val->data = WT_UPDATE_DATA(upd); @@ -458,8 +466,10 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row_d[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); - if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) + if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { + *skipped = 1; continue; + } return (__cursor_row_slot_return(cbt, rip, upd)); } @@ -477,7 +487,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating) WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; - int newpage; + int skipped, newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; @@ -502,15 +512,27 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating) * found. Then, move to the previous page, until we reach the start * of the file. */ - page = cbt->ref == NULL ? NULL : cbt->ref->page; - for (newpage = 0;; newpage = 1) { + for (skipped = newpage = 0;; skipped = 0, newpage = 1) { + page = cbt->ref == NULL ? NULL : cbt->ref->page; + WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); + + /* + * The last page in a column-store has appended entries. + * We handle it separately from the usual cursor code: + * it's only that one page and it's in a simple format. 
+ */ + if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF && + (cbt->ins_head = WT_COL_APPEND(page)) != NULL) + F_SET(cbt, WT_CBT_ITERATE_APPEND); + if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: - ret = __cursor_var_append_prev(cbt, newpage); + ret = __cursor_var_append_prev( + cbt, newpage, &skipped); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -527,10 +549,10 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating) ret = __cursor_fix_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: - ret = __cursor_var_prev(cbt, newpage); + ret = __cursor_var_prev(cbt, newpage, &skipped); break; case WT_PAGE_ROW_LEAF: - ret = __cursor_row_prev(cbt, newpage); + ret = __cursor_row_prev(cbt, newpage, &skipped); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -538,20 +560,11 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating) break; } + if (newpage && skipped) + page->read_gen = WT_READGEN_OLDEST; + WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); - - page = cbt->ref->page; - WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page)); - - /* - * The last page in a column-store has appended entries. - * We handle it separately from the usual cursor code: - * it's only that one page and it's in a simple format. 
- */ - if (page->type != WT_PAGE_ROW_LEAF && - (cbt->ins_head = WT_COL_APPEND(page)) != NULL) - F_SET(cbt, WT_CBT_ITERATE_APPEND); } err: if (ret != 0) diff --git a/src/third_party/wiredtiger/src/btree/bt_evict.c b/src/third_party/wiredtiger/src/btree/bt_evict.c index 91a7885e63f..2af9f0024db 100644 --- a/src/third_party/wiredtiger/src/btree/bt_evict.c +++ b/src/third_party/wiredtiger/src/btree/bt_evict.c @@ -615,7 +615,7 @@ __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref) WT_ASSERT(session, !F_ISSET(txn, TXN_HAS_ID) || !__wt_txn_visible(session, txn->id)); - ret = __wt_rec_evict(session, ref, 0); + ret = __wt_evict(session, ref, 0); txn->isolation = saved_iso; return (ret); @@ -1266,6 +1266,9 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app) WT_PAGE *page; WT_REF *ref; + if (is_app) + WT_STAT_FAST_CONN_INCR(session, cache_eviction_app); + WT_RET(__evict_get_ref(session, is_app, &btree, &ref)); WT_ASSERT(session, ref->state == WT_REF_LOCKED); diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index c9cda548d43..b5af14ab376 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -511,7 +511,7 @@ err: if (leaf != NULL) /* * __wt_btree_new_leaf_page -- - * Create an empty leaf page and link it into a reference in its parent. + * Create an empty leaf page. 
*/ int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) @@ -523,15 +523,15 @@ __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) switch (btree->type) { case BTREE_COL_FIX: WT_RET( - __wt_page_alloc(session, WT_PAGE_COL_FIX, 1, 0, 1, pagep)); + __wt_page_alloc(session, WT_PAGE_COL_FIX, 1, 0, 0, pagep)); break; case BTREE_COL_VAR: WT_RET( - __wt_page_alloc(session, WT_PAGE_COL_VAR, 1, 0, 1, pagep)); + __wt_page_alloc(session, WT_PAGE_COL_VAR, 1, 0, 0, pagep)); break; case BTREE_ROW: WT_RET( - __wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, 1, pagep)); + __wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, 0, pagep)); break; WT_ILLEGAL_VALUE(session); } diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 10366e91a0e..6e70c9ea2b6 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -324,7 +324,7 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) */ if (ss->root_ref.page != NULL) { btree->ckpt = ckptbase; - ret = __wt_rec_evict(session, &ss->root_ref, 1); + ret = __wt_evict(session, &ss->root_ref, 1); ss->root_ref.page = NULL; btree->ckpt = NULL; } @@ -1302,7 +1302,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_rec_write(session, ref, cookie, WT_SKIP_UPDATE_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR)); /* Reset the page. 
*/ page->pg_var_d = save_col_var; @@ -1310,7 +1310,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) ret = __wt_page_release(session, ref, 0); if (ret == 0) - ret = __wt_rec_evict(session, ref, 1); + ret = __wt_evict(session, ref, 1); if (0) { err: WT_TRET(__wt_page_release(session, ref, 0)); @@ -2009,7 +2009,7 @@ __slvg_row_build_leaf( /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_rec_write(session, ref, cookie, WT_SKIP_UPDATE_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR)); /* Reset the page. */ page->pg_row_entries += skip_stop; @@ -2020,7 +2020,7 @@ __slvg_row_build_leaf( */ ret = __wt_page_release(session, ref, 0); if (ret == 0) - ret = __wt_rec_evict(session, ref, 1); + ret = __wt_evict(session, ref, 1); if (0) { err: WT_TRET(__wt_page_release(session, ref, 0)); diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 0c4064dfdd1..cccd2e5bede 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -66,7 +66,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) __wt_txn_refresh(session, 1); leaf_bytes += page->memory_footprint; ++leaf_pages; - WT_ERR(__wt_rec_write(session, walk, NULL, 0)); + WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } } break; @@ -132,7 +132,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) leaf_bytes += page->memory_footprint; ++leaf_pages; } - WT_ERR(__wt_rec_write(session, walk, NULL, 0)); + WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } } break; @@ -244,7 +244,7 @@ __evict_file(WT_SESSION_IMPL *session, int syncop) * error, retrying later. 
*/ if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page)) - WT_ERR(__wt_rec_write(session, ref, NULL, WT_EVICTING)); + WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING)); /* * We can't evict the page just returned to us (it marks our @@ -269,7 +269,7 @@ __evict_file(WT_SESSION_IMPL *session, int syncop) if (__wt_ref_is_root(ref) || page->modify == NULL || !F_ISSET(page->modify, WT_PM_REC_EMPTY)) - WT_ERR(__wt_rec_evict(session, ref, 1)); + WT_ERR(__wt_evict(session, ref, 1)); break; case WT_SYNC_DISCARD: case WT_SYNC_DISCARD_FORCE: @@ -295,15 +295,10 @@ __evict_file(WT_SESSION_IMPL *session, int syncop) !__wt_txn_visible_all(session, page->modify->rec_max_txn)) WT_ERR(EBUSY); + if (syncop == WT_SYNC_DISCARD_FORCE) F_SET(session, WT_SESSION_DISCARD_FORCE); - __wt_ref_out(session, ref); - /* - * In case we don't discard the whole tree, make sure - * that future readers know that the page is no longer - * in cache. - */ - ref->state = WT_REF_DISK; + __wt_rec_page_clean_update(session, ref); F_CLR(session, WT_SESSION_DISCARD_FORCE); break; WT_ILLEGAL_VALUE_ERR(session); diff --git a/src/third_party/wiredtiger/src/btree/rec_evict.c b/src/third_party/wiredtiger/src/btree/rec_evict.c index 4696e78059e..f8dd4708ffd 100644 --- a/src/third_party/wiredtiger/src/btree/rec_evict.c +++ b/src/third_party/wiredtiger/src/btree/rec_evict.c @@ -10,25 +10,24 @@ static int __hazard_exclusive(WT_SESSION_IMPL *, WT_REF *, int); static void __rec_discard_tree(WT_SESSION_IMPL *, WT_REF *, int, int); static void __rec_excl_clear(WT_SESSION_IMPL *); -static void __rec_page_clean_update(WT_SESSION_IMPL *, WT_REF *); static int __rec_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, int); -static int __rec_review(WT_SESSION_IMPL *, WT_REF *, int, int, int *); +static int __rec_review(WT_SESSION_IMPL *, WT_REF *, int, int, int *, int *); /* - * __wt_rec_evict -- - * Reconciliation plus eviction. + * __wt_evict -- + * Eviction. 
*/ int -__wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) +__wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) { WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_TXN_STATE *txn_state; - int istree; + int inmem_split, istree; page = ref->page; - istree = 0; + inmem_split = istree = 0; WT_RET(__wt_verbose(session, WT_VERB_EVICT, "page %p (%s)", page, __wt_page_type_string(page->type))); @@ -51,7 +50,14 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) * unlikely eviction would choose an internal page with children, it's * not disallowed anywhere. */ - WT_ERR(__rec_review(session, ref, exclusive, 1, &istree)); + WT_ERR(__rec_review(session, ref, exclusive, 1, &inmem_split, &istree)); + + /* + * If there was an in-memory split, the tree has been left in the state + * we want: there is nothing more to do. + */ + if (inmem_split) + goto done; /* * Update the page's modification reference, reconciliation might have @@ -77,7 +83,7 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) if (__wt_ref_is_root(ref)) __wt_ref_out(session, ref); else - __rec_page_clean_update(session, ref); + __wt_rec_page_clean_update(session, ref); WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean); WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean); @@ -103,7 +109,7 @@ err: /* WT_STAT_FAST_CONN_INCR(session, cache_eviction_fail); WT_STAT_FAST_DATA_INCR(session, cache_eviction_fail); } - session->excl_next = 0; +done: session->excl_next = 0; if (txn_state != NULL) txn_state->snap_min = WT_TXN_NONE; @@ -112,11 +118,11 @@ err: /* } /* - * __rec_page_clean_update -- + * __wt_rec_page_clean_update -- * Update a clean page's reference on eviction. 
*/ -static void -__rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref) +void +__wt_rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref) { /* * Discard the page and update the reference structure; if the page has @@ -166,8 +172,26 @@ __rec_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) WT_PUBLISH(ref->state, WT_REF_DELETED); break; case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ - /* Split the page in memory. */ - WT_RET(__wt_split_evict(session, ref, exclusive)); + /* + * There are two cases in this code. + * + * First, an in-memory page that got too large, we forcibly + * evicted it, and there wasn't anything to write. (Imagine two + * threads updating a small set keys on a leaf page. The page is + * too large so we try to evict it, but after reconciliation + * there's only a small amount of data (so it's a single page we + * can't split), and because there are two threads, there's some + * data we can't write (so we can't evict it). In that case, we + * take advantage of the fact we have exclusive access to the + * page and rewrite it in memory.) + * + * Second, a real split where we reconciled a page and it turned + * into a lot of pages. + */ + if (mod->mod_multi_entries == 1) + WT_RET(__wt_split_rewrite(session, ref)); + else + WT_RET(__wt_split_multi(session, ref, exclusive)); break; case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) { @@ -233,8 +257,8 @@ __rec_discard_tree( * for conditions that would block its eviction. */ static int -__rec_review( - WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int top, int *istree) +__rec_review(WT_SESSION_IMPL *session, WT_REF *ref, + int exclusive, int top, int *inmem_splitp, int *istreep) { WT_BTREE *btree; WT_PAGE *page; @@ -260,6 +284,24 @@ __rec_review( * valid memory. */ __wt_evict_list_clear_page(session, ref); + + /* + * Check for an append-only workload needing an in-memory split. 
+ * + * We can't do this earlier because in-memory splits require + * exclusive access. If an in-memory split completes, the page + * stays in memory and the tree is left in the desired state: + * avoid the usual cleanup. + * + * Attempt the split before checking whether a checkpoint is + * running - that's not a problem here because we aren't + * evicting any dirty pages. + */ + if (top) { + WT_RET(__wt_split_insert(session, ref, inmem_splitp)); + if (*inmem_splitp) + return (0); + } } /* @@ -279,9 +321,9 @@ __rec_review( * know to do a full walk when discarding the * page. */ - *istree = 1; - WT_RET(__rec_review( - session, child, exclusive, 0, istree)); + *istreep = 1; + WT_RET(__rec_review(session, child, exclusive, + 0, inmem_splitp, istreep)); break; case WT_REF_LOCKED: /* Being evicted */ case WT_REF_READING: /* Being read */ @@ -387,7 +429,7 @@ __rec_review( else if (top && !WT_PAGE_IS_INTERNAL(page) && page->memory_footprint > 10 * btree->maxleafpage) LF_SET(WT_SKIP_UPDATE_RESTORE); - WT_RET(__wt_rec_write(session, ref, NULL, flags)); + WT_RET(__wt_reconcile(session, ref, NULL, flags)); WT_ASSERT(session, !__wt_page_is_modified(page) || LF_ISSET(WT_SKIP_UPDATE_RESTORE)); diff --git a/src/third_party/wiredtiger/src/btree/rec_split.c b/src/third_party/wiredtiger/src/btree/rec_split.c index babec2cc295..dea44503c55 100644 --- a/src/third_party/wiredtiger/src/btree/rec_split.c +++ b/src/third_party/wiredtiger/src/btree/rec_split.c @@ -631,12 +631,12 @@ err: __wt_free_ref_index(session, parent, alloc_index, 1); } /* - * __split_inmem_build -- + * __split_multi_inmem -- * Instantiate a page in a multi-block set, when an update couldn't be * written. 
*/ static int -__split_inmem_build( +__split_multi_inmem( WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref, WT_MULTI *multi) { WT_CURSOR_BTREE cbt; @@ -722,9 +722,10 @@ __split_inmem_build( */ page->modify->first_dirty_txn = S2C(session)->txn_global.oldest_id; -err: __wt_scr_free(&key); - /* Free any resources that may have been cached in the cursor. */ +err: /* Free any resources that may have been cached in the cursor. */ WT_TRET(__wt_btcur_close(&cbt)); + + __wt_scr_free(&key); return (ret); } @@ -774,10 +775,9 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, addr->type = multi->addr.type; WT_RET(__wt_strndup(session, multi->addr.addr, addr->size, &addr->addr)); - /* Need a cast to avoid an implicit conversion warning. */ WT_MEMSIZE_ADD(incr, addr->size); } else - WT_RET(__split_inmem_build(session, page, ref, multi)); + WT_RET(__split_multi_inmem(session, page, ref, multi)); switch (page->type) { case WT_PAGE_ROW_INT: @@ -804,42 +804,27 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, } /* - * __split_evict_multi -- + * __split_parent -- * Resolve a multi-page split, inserting new information into the parent. 
*/ static int -__split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) +__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, + uint32_t new_entries, size_t parent_decr, size_t parent_incr, + int exclusive, int ref_discard) { WT_DECL_RET; - WT_IKEY *ikey; - WT_PAGE *parent, *child; + WT_PAGE *parent; WT_PAGE_INDEX *alloc_index, *pindex; - WT_PAGE_MODIFY *mod; - WT_REF **alloc_refp, *parent_ref, ref_copy, **ref_tmp; - size_t parent_decr, parent_incr, size; - uint32_t i, j, parent_entries, result_entries, split_entries; + WT_REF **alloc_refp, *parent_ref; + size_t size; + uint32_t i, j, parent_entries, result_entries; int complete, hazard, locked; parent = NULL; /* -Wconditional-uninitialized */ alloc_index = NULL; parent_ref = NULL; - ref_tmp = NULL; - parent_decr = parent_incr = 0; complete = hazard = locked = 0; - child = ref->page; - mod = child->modify; - - /* - * Convert the split page's multiblock reconciliation information into - * an array of page reference structures. - */ - split_entries = mod->mod_multi_entries; - WT_RET(__wt_calloc_def(session, split_entries, &ref_tmp)); - for (i = 0; i < split_entries; ++i) - WT_ERR(__wt_multi_to_ref(session, - child, &mod->mod_multi[i], &ref_tmp[i], &parent_incr)); - /* * Get a page-level lock on the parent to single-thread splits into the * page because we need to single-thread sizing/growing the page index. 
@@ -879,7 +864,7 @@ __split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) pindex = WT_INTL_INDEX_COPY(parent); parent_entries = pindex->entries; - result_entries = (parent_entries - 1) + split_entries; + result_entries = (parent_entries - 1) + new_entries; /* * Allocate and initialize a new page index array for the parent, then @@ -893,20 +878,19 @@ __split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) alloc_index->entries = result_entries; for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) if (pindex->index[i] == ref) - for (j = 0; j < split_entries; ++j) { - ref_tmp[j]->home = parent; - *alloc_refp++ = ref_tmp[j]; + for (j = 0; j < new_entries; ++j) { + ref_new[j]->home = parent; + *alloc_refp++ = ref_new[j]; /* * Clear the split reference as it moves to the * allocated page index, so it never appears on * both after an error. */ - ref_tmp[j] = NULL; + ref_new[j] = NULL; } else *alloc_refp++ = pindex->index[i]; - __wt_free(session, ref_tmp); /* * Update the parent page's index: this update makes the split visible @@ -930,19 +914,26 @@ __split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) /* * A note on error handling: failures before we swapped the new page - * index into the parent can be resolved by simply freeing allocated - * memory because the original page is unchanged, we can continue to - * use it and we have not yet modified the parent. (See below for an - * exception, we cannot discard pages referencing unresolved changes.) - * Failures after we swap the new page index into the parent are also - * relatively benign because the split is OK and complete and the page - * is reset so it will be discarded by eviction. For that reason, we - * mostly ignore further errors unless there's a panic. + * index into the parent can be resolved by freeing allocated memory + * because the original page is unchanged, we can continue to use it + * and we have not yet modified the parent. 
Failures after we swap + * the new page index into the parent are also relatively benign, the + * split is OK and complete. For those reasons, we ignore errors past + * this point unless there's a panic. */ complete = 1; /* - * The previous parent page's key for this child page may have been an + * We can't free the previous page index, there may be threads using it. + * Add it to the session discard list, to be freed when it's safe. + */ + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, exclusive, pindex, size)); + WT_MEMSIZE_ADD(parent_decr, size); + + /* + * Row-store trees where the old version of the page is being discarded: + * the previous parent page's key for this child page may have been an * on-page overflow key. In that case, if the key hasn't been deleted, * delete it now, including its backing blocks. We are exchanging the * WT_REF that referenced it for the split page WT_REFs and their keys, @@ -950,32 +941,10 @@ __split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) * split (if we failed, we'd leak the underlying blocks, but the parent * page would be unaffected). */ - if (parent->type == WT_PAGE_ROW_INT) + if (ref_discard && parent->type == WT_PAGE_ROW_INT) WT_TRET(__split_ovfl_key_cleanup(session, parent, ref)); /* - * We can't free the previous page index, or the page's original WT_REF - * structure and instantiated key, there may be threads using them. Add - * them to the session discard list, to be freed once we know it's safe. 
- */ - size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, exclusive, pindex, size)); - WT_MEMSIZE_ADD(parent_decr, size); - if (parent->type == WT_PAGE_ROW_INT && - (ikey = __wt_ref_key_instantiated(ref)) != NULL) { - size = sizeof(WT_IKEY) + ikey->size; - WT_TRET(__split_safe_free(session, exclusive, ikey, size)); - WT_MEMSIZE_ADD(parent_decr, size); - } - /* - * Take a copy of the ref in case we can free it immediately: we still - * need to discard the page. - */ - ref_copy = *ref; - WT_TRET(__split_safe_free(session, exclusive, ref, sizeof(WT_REF))); - WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF)); - - /* * Adjust the parent's memory footprint. This may look odd, but we * have already taken the allocation overhead into account, and an * increment followed by a decrement will cancel out the normal @@ -986,9 +955,9 @@ __split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) WT_STAT_FAST_CONN_INCR(session, cache_eviction_split); WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, - "%p: %s split into parent %p %" PRIu32 " -> %" PRIu32 + "%s split into parent %" PRIu32 " -> %" PRIu32 " (%" PRIu32 ")", - child, __wt_page_type_string(child->type), parent, parent_entries, + __wt_page_type_string(ref->page->type), parent_entries, result_entries, result_entries - parent_entries)); /* @@ -1016,55 +985,348 @@ err: if (locked) if (hazard) WT_TRET(__wt_hazard_clear(session, parent)); + __wt_free_ref_index(session, NULL, alloc_index, 0); + /* - * Discard the child; test for split completion instead of errors, there - * might be a relatively innocuous error, and if we split the parent, we - * want to discard the child. + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened, and our caller has to proceed + * with the split. 
*/ - if (complete) { + if (ret != 0 && ret != WT_PANIC) + __wt_err(session, ret, + "ignoring not-fatal error during parent page split"); + return (ret == WT_PANIC || !complete ? ret : 0); +} + +/* + * __wt_split_insert -- + * Check for pages with append-only workloads and split their last insert + * list into a separate page. + */ +int +__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_IKEY *ikey; + WT_DECL_ITEM(key); + WT_INSERT *ins, **insp, *moved_ins, *prev_ins; + WT_INSERT_HEAD *ins_head; + WT_PAGE *page, *right; + WT_REF *child, *split_ref[2] = { NULL, NULL }; + WT_UPDATE *upd; + size_t page_decr, parent_incr, right_incr, size; + int i; + + *splitp = 0; + + btree = S2BT(session); + page = ref->page; + right = NULL; + page_decr = parent_incr = right_incr = 0; + + /* + * Check for pages with append-only workloads. A common application + * pattern is to have multiple threads frantically appending to the + * tree. We want to reconcile and evict this page, but we'd like to + * do it without making the appending threads wait. If we're not + * discarding the tree, check and see if it's worth doing a split to + * let the threads continue before doing eviction. + * + * Ignore anything other than row-store leaf pages. + * Ignore small pages. + * + * XXX KEITH + * Need a better test for append-only workloads. + */ + if (page->type != WT_PAGE_ROW_LEAF) + return (0); + if (page->memory_footprint < 10 * btree->maxleafpage) + return (0); + + /* + * There is no point splitting if the list is small, no deep items is + * our heuristic for that. (A 1/4 probability of adding a new skiplist + * level means there will be a new 6th level for roughly each 4KB of + * entries in the list. If we have at least two 6th level entries, the + * list is at least large enough to work with.) + * + * The following code requires at least two items on the insert list, + * this test serves the additional purpose of confirming that. 
+ */ +#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1) + ins_head = page->pg_row_entries == 0 ? + WT_ROW_INSERT_SMALLEST(page) : + WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + if (ins_head == NULL || + ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL || + ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == + ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH]) + return (0); + + /* Find the last item in the insert list. */ + moved_ins = WT_SKIP_LAST(ins_head); + + /* + * Only split a page once, otherwise workloads that update in the middle + * of the page could continually split without benefit. + */ + if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) + return (0); + F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); + + /* + * The first page in the split is the current page, but we still need to + * create a replacement WT_REF and make a copy of the key (the original + * WT_REF is set to split-status and eventually freed). + * + * The new reference is visible to readers once the split completes. + */ + WT_ERR(__wt_calloc_def(session, 1, &split_ref[0])); + child = split_ref[0]; + *child = *ref; + child->state = WT_REF_MEM; + + /* + * Copy the first key from the original page into first ref in the new + * parent. Pages created in memory always have a "smallest" insert + * list, so look there first. If we don't find one, get the first key + * from the disk image. + * + * We can't just use the key from the original ref: it may have been + * suffix-compressed, and after the split the truncated key may not be + * valid. + */ + WT_ERR(__wt_scr_alloc(session, 0, &key)); + if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) { + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + } else + WT_ERR(__wt_row_leaf_key( + session, page, &page->pg_row_d[0], key, 1)); + + WT_ERR(__wt_row_ikey( + session, 0, key->data, key->size, &child->key.ikey)); + __wt_scr_free(&key); + + /* + * The second page in the split is a new WT_REF/page pair. 
+ */ + WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, 0, &right)); + WT_ERR(__wt_calloc_def(session, 1, &right->pg_row_ins)); + WT_ERR(__wt_calloc_def(session, 1, &right->pg_row_ins[0])); + WT_MEMSIZE_ADD(right_incr, sizeof(WT_INSERT_HEAD)); + WT_MEMSIZE_ADD(right_incr, sizeof(WT_INSERT_HEAD *)); + + WT_ERR(__wt_calloc_def(session, 1, &split_ref[1])); + child = split_ref[1]; + child->page = right; + child->state = WT_REF_MEM; + WT_ERR(__wt_row_ikey(session, 0, + WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), + &child->key.ikey)); + WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF)); + WT_MEMSIZE_ADD(parent_incr, sizeof(WT_IKEY)); + WT_MEMSIZE_ADD(parent_incr, WT_INSERT_KEY_SIZE(moved_ins)); + + /* The new page is dirty by definition. */ + WT_ERR(__wt_page_modify_init(session, right)); + __wt_page_only_modify_set(session, right); + + /* + * We modified the page above, which will have set the first dirty + * transaction to the last transaction current running. However, the + * updates we are moving may be older than that: inherit the original + * page's transaction ID. + */ + right->modify->first_dirty_txn = page->modify->first_dirty_txn; + + /* + * Calculate how much memory we're moving: figure out how deep the skip + * list stack is for the element we are moving, and the memory used by + * the item's list of updates. + */ + for (i = 0; i < WT_SKIP_MAXDEPTH && ins_head->tail[i] == moved_ins; ++i) + ; + size = ((size_t)i - 1) * sizeof(WT_INSERT *); + size += sizeof(WT_INSERT) + WT_INSERT_KEY_SIZE(moved_ins); + for (upd = moved_ins->upd; upd != NULL; upd = upd->next) + size += sizeof(WT_UPDATE) + upd->size; + WT_MEMSIZE_ADD(right_incr, size); + WT_MEMSIZE_ADD(page_decr, size); + __wt_cache_page_inmem_decr(session, page, page_decr); + __wt_cache_page_inmem_incr(session, right, right_incr); + + /* + * Allocation operations completed, move the last insert list item from + * the original page to the new page. 
+ * + * First, update the item to the new child page. (Just append the entry + * for simplicity, the previous skip list pointers originally allocated + * can be ignored.) + */ + right->pg_row_ins[0]->head[0] = + right->pg_row_ins[0]->tail[0] = moved_ins; + + /* + * Remove the entry from the orig page (i.e truncate the skip list). + * Following is an example skip list that might help. + * + * __ + * |c3| + * | + * __ __ __ + * |a2|--------|c2|--|d2| + * | | | + * __ __ __ __ + * |a1|--------|c1|--|d1|--------|f1| + * | | | | + * __ __ __ __ __ __ + * |a0|--|b0|--|c0|--|d0|--|e0|--|f0| + * + * From the above picture. + * The head array will be: a0, a1, a2, c3, NULL + * The tail array will be: f0, f1, d2, c3, NULL + * We are looking for: e1, d2, NULL + * If there were no f1, we'd be looking for: e0, NULL + * If there were an f2, we'd be looking for: e0, d1, d2, NULL + * + * The algorithm does: + * 1) Start at the top of the head list. + * 2) Step down until we find a level that contains more than one + * element. + * 3) Step across until we reach the tail of the level. + * 4) If the tail is the item being moved, remove it. + * 5) Drop down a level, and go to step 3 until at level 0. + */ + prev_ins = NULL; /* -Wconditional-uninitialized */ + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; + i >= 0; + i--, insp--) { + /* Level empty, or a single element. */ + if (ins_head->head[i] == NULL || + ins_head->head[i] == ins_head->tail[i]) { + /* Remove if it is the element being moved. */ + if (ins_head->head[i] == moved_ins) + ins_head->head[i] = ins_head->tail[i] = NULL; + continue; + } + + for (ins = *insp; ins != ins_head->tail[i]; ins = ins->next[i]) + prev_ins = ins; + /* - * Pages with unresolved changes are not marked clean during - * reconciliation, do it now. + * Update the stack head so that we step down as far to the + * right as possible. We know that prev_ins is valid since + * levels must contain at least two items to be here. 
*/ - if (__wt_page_is_modified(child)) { - mod->write_gen = 0; - __wt_cache_dirty_decr(session, child); + insp = &prev_ins->next[i]; + if (ins == moved_ins) { + /* Remove the item being moved. */ + WT_ASSERT(session, ins_head->head[i] != moved_ins); + WT_ASSERT(session, prev_ins->next[i] == moved_ins); + *insp = NULL; + ins_head->tail[i] = prev_ins; } - __wt_ref_out(session, &ref_copy); } +#ifdef HAVE_DIAGNOSTIC /* - * A note on error handling: in the case of evicting a page that has - * unresolved changes, we just instantiated some in-memory pages that - * reflect those unresolved changes. The problem is those pages - * reference the same WT_UPDATE chains as the page we're splitting, - * that is, we simply copied references into the new pages. If the - * split fails, the original page is fine, but discarding the created - * page would free those update chains, and that's wrong. There isn't - * an easy solution, there's a lot of small memory allocations in some - * common code paths, and unwinding those changes will be difficult. - * For now, leak the memory by not discarding the instantiated pages. + * Verify the moved insert item appears nowhere on the skip list. */ - __wt_free_ref_index(session, NULL, alloc_index, 0); - if (ref_tmp != NULL) { - for (i = 0; i < split_entries; ++i) - __wt_free_ref(session, child, ref_tmp[i], 0); - __wt_free(session, ref_tmp); + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; + i >= 0; + i--, insp--) + for (ins = *insp; ins != NULL; ins = ins->next[i]) + WT_ASSERT(session, ins != moved_ins); +#endif + + /* + * Split into the parent. + */ + if ((ret = __split_parent( + session, ref, split_ref, 2, 0, parent_incr, 0, 0)) != 0) { + /* + * Move the insert list element back to the original page list. + * For simplicity, the previous skip list pointers originally + * allocated can be ignored, just append the entry to the end of + * the level 0 list. 
As before, we depend on the list having + * multiple elements and ignore the edge cases small lists have. + */ + right->pg_row_ins[0]->head[0] = + right->pg_row_ins[0]->tail[0] = NULL; + ins_head->tail[0]->next[0] = moved_ins; + ins_head->tail[0] = moved_ins; + + /* + * We marked the new page dirty; we're going to discard it, but + * first mark it clean and fix up the cache statistics. + */ + right->modify->write_gen = 0; + __wt_cache_dirty_decr(session, right); + + WT_ERR(ret); } /* + * Save the transaction ID when the split happened. Application + * threads will not try to forcibly evict the page again until + * all concurrent transactions commit. + */ + page->modify->inmem_split_txn = __wt_txn_new_id(session); + + /* Let our caller know that we split. */ + *splitp = 1; + + WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); + WT_STAT_FAST_DATA_INCR(session, cache_inmem_split); + + /* + * We may not be able to immediately free the page's original WT_REF + * structure and instantiated key, there may be threads using them. + * Add them to the session discard list, to be freed once we know it's + * safe. + * + * After the split, we're going to discard the WT_REF, account for the + * change in memory footprint. Row store pages have keys that may be + * instantiated, check for that. + */ + if ((page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) && + (ikey = __wt_ref_key_instantiated(ref)) != NULL) + WT_TRET(__split_safe_free( + session, 0, ikey, sizeof(WT_IKEY) + ikey->size)); + WT_TRET(__split_safe_free(session, 0, ref, sizeof(WT_REF))); + + /* * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened. + * nothing really bad can have happened, and our caller has to proceed + * with the split. */ - return (ret == WT_PANIC || !complete ? ret : 0); + if (ret != 0 && ret != WT_PANIC) + __wt_err(session, ret, + "ignoring not-fatal error during insert page split"); + return (ret == WT_PANIC ? 
WT_PANIC : 0); + +err: if (split_ref[0] != NULL) { + __wt_free(session, split_ref[0]->key.ikey); + __wt_free(session, split_ref[0]); + } + if (split_ref[1] != NULL) { + __wt_free(session, split_ref[1]->key.ikey); + __wt_free(session, split_ref[1]); + } + if (right != NULL) + __wt_page_out(session, &right); + __wt_scr_free(&key); + return (ret); } /* - * __split_evict_single -- - * Resolve a single page split, replacing a page with a new version. + * __wt_split_rewrite -- + * Resolve a failed reconciliation by replacing a page with a new version. */ -static int -__split_evict_single(WT_SESSION_IMPL *session, WT_REF *ref) +int +__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) { WT_PAGE *page; WT_PAGE_MODIFY *mod; @@ -1073,17 +1335,28 @@ __split_evict_single(WT_SESSION_IMPL *session, WT_REF *ref) page = ref->page; mod = page->modify; - /* Build the new page. */ + /* + * This isn't a split: a reconciliation failed because we couldn't write + * something, and in the case of forced eviction, we need to stop this + * page from being such a problem. We have exclusive access, rewrite the + * page in memory. The code lives here because the split code knows how + * to re-create a page in memory after it's been reconciled, and that's + * exactly what we want to do. + * + * Build the new page. + */ memset(&new, 0, sizeof(new)); - WT_RET(__split_inmem_build(session, page, &new, &mod->mod_multi[0])); + WT_RET(__split_multi_inmem(session, page, &new, &mod->mod_multi[0])); /* - * Discard the original page. Pages with unresolved changes are not - * marked clean during reconciliation, do it now. + * Discard the original page. + * + * Pages with unresolved changes are not marked clean during + * reconciliation, do it now. */ mod->write_gen = 0; __wt_cache_dirty_decr(session, page); - __wt_page_out(session, &page); + __wt_ref_out(session, ref); /* Swap the new page into place. 
*/ ref->page = new.page; @@ -1093,29 +1366,100 @@ __split_evict_single(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * __wt_split_evict -- + * __wt_split_multi -- * Resolve a page split. */ int -__wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) +__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) { - uint32_t split_entries; + WT_IKEY *ikey; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_MODIFY *mod; + WT_REF **ref_new; + size_t ikey_size, parent_decr, parent_incr; + uint32_t i, new_entries; + + page = ref->page; + mod = page->modify; + new_entries = mod->mod_multi_entries; + + ikey = NULL; + ikey_size = parent_decr = parent_incr = 0; + + /* + * Convert the split page's multiblock reconciliation information into + * an array of page reference structures. + */ + WT_RET(__wt_calloc_def(session, new_entries, &ref_new)); + for (i = 0; i < new_entries; ++i) + WT_ERR(__wt_multi_to_ref(session, + page, &mod->mod_multi[i], &ref_new[i], &parent_incr)); + + /* + * After the split, we're going to discard the WT_REF, account for the + * change in memory footprint. Row store pages have keys that may be + * instantiated, check for that. + */ + if ((page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) && + (ikey = __wt_ref_key_instantiated(ref)) != NULL) { + ikey_size = sizeof(WT_IKEY) + ikey->size; + WT_MEMSIZE_ADD(parent_decr, ikey_size); + } + WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF)); + + /* Split into the parent. */ + WT_ERR(__split_parent(session, + ref, ref_new, new_entries, parent_decr, parent_incr, exclusive, 1)); + + __wt_free(session, ref_new); /* - * There are two cases entering this code. First, an in-memory page that - * got too large, we forcibly evicted it, and there wasn't anything to - * write. (Imagine two threads updating a small set keys on a leaf page. 
- * The page is too large so we try to evict it, but after reconciliation - * there's only a small amount of data (so it's a single page we can't - * split), and because there are two threads, there's some data we can't - * write (so we can't evict it). In that case, we take advantage of the - * fact we have exclusive access to the page and rewrite it in memory.) + * The split succeeded, discard the page. * - * Second, a real split where we reconciled a page and it turned into a - * lot of pages. + * Pages with unresolved changes are not marked clean during + * reconciliation, do it now. + */ + if (__wt_page_is_modified(page)) { + mod->write_gen = 0; + __wt_cache_dirty_decr(session, page); + } + __wt_ref_out(session, ref); + + /* + * We may not be able to immediately free the page's original WT_REF + * structure and instantiated key, there may be threads using them. + * Add them to the session discard list, to be freed once we know it's + * safe. + */ + if (ikey != NULL) + WT_TRET(__split_safe_free(session, exclusive, ikey, ikey_size)); + WT_TRET(__split_safe_free(session, exclusive, ref, sizeof(WT_REF))); + + /* + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened, and our caller has to proceed + * with the split. */ - split_entries = ref->page->modify->mod_multi_entries; - return (split_entries == 1 ? - __split_evict_single(session, ref) : - __split_evict_multi(session, ref, exclusive)); + if (ret != 0 && ret != WT_PANIC) + __wt_err(session, ret, + "ignoring not-fatal error during multi-page split"); + return (ret == WT_PANIC ? WT_PANIC : 0); + +err: /* + * A note on error handling: in the case of evicting a page that has + * unresolved changes, we just instantiated some in-memory pages that + * reflect those unresolved changes. The problem is those pages + * reference the same WT_UPDATE chains as the page we're splitting, + * that is, we simply copied references into the new pages. 
If the + * split fails, the original page is fine, but discarding the created + * page would free those update chains, and that's wrong. There isn't + * an easy solution, there's a lot of small memory allocations in some + * common code paths, and unwinding those changes will be difficult. + * For now, leak the memory by not discarding the instantiated pages. + */ + for (i = 0; i < new_entries; ++i) + __wt_free_ref(session, page, ref_new[i], 0); + __wt_free(session, ref_new); + return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/rec_write.c b/src/third_party/wiredtiger/src/btree/rec_write.c index 46f6ed92aae..c72447ae841 100644 --- a/src/third_party/wiredtiger/src/btree/rec_write.c +++ b/src/third_party/wiredtiger/src/btree/rec_write.c @@ -331,11 +331,11 @@ static int __rec_dictionary_lookup( static void __rec_dictionary_reset(WT_RECONCILE *); /* - * __wt_rec_write -- + * __wt_reconcile -- * Reconcile an in-memory page into its on-disk format, and write it. */ int -__wt_rec_write(WT_SESSION_IMPL *session, +__wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags) { WT_CONNECTION_IMPL *conn; @@ -523,7 +523,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) * Fake up a reference structure, and write the next root page. 
*/ __wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT); - return (__wt_rec_write(session, &fake_ref, NULL, flags)); + return (__wt_reconcile(session, &fake_ref, NULL, flags)); err: __wt_page_out(session, &next); return (ret); diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index caa2a938954..e8428b76691 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -465,8 +465,7 @@ __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, if (bulk) __wt_spin_lock( session, &S2C(session)->checkpoint_lock); - ret = __wt_session_get_btree_ckpt( - session, uri, cfg, flags); + ret = __wt_session_get_btree_ckpt(session, uri, cfg, flags); if (bulk) __wt_spin_unlock( session, &S2C(session)->checkpoint_lock); diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 0c4fe876e5e..082bf9fa9d0 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -187,6 +187,9 @@ struct __wt_page_modify { /* The largest update transaction ID (approximate). */ uint64_t update_txn; + /* In-memory split transaction ID. */ + uint64_t inmem_split_txn; + /* Dirty bytes added to the cache. */ uint64_t bytes_dirty; @@ -549,7 +552,8 @@ struct __wt_page { #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ #define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ -#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing. 
*/ +#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing */ +#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ }; diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index b7957e6647f..464b491c480 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -937,13 +937,15 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * Skip this if eviction is disabled for this operation or this tree, * or if there is no chance of eviction succeeding for dirty pages due * to a checkpoint or because we've already tried writing this page and - * it contains an update that isn't stable. + * it contains an update that isn't stable. Also skip forced eviction + * if we just did an in-memory split. */ if (LF_ISSET(WT_READ_NO_EVICT) || page->read_gen != WT_READGEN_OLDEST || F_ISSET(btree, WT_BTREE_NO_EVICTION) || (__wt_page_is_modified(page) && (btree->checkpointing || - !__wt_txn_visible_all(session, page->modify->first_dirty_txn)))) + !__wt_txn_visible_all(session, page->modify->first_dirty_txn) || + !__wt_txn_visible_all(session, page->modify->inmem_split_txn)))) return (__wt_hazard_clear(session, page)); /* diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index d6b952a3841..8ecb81d638a 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -163,11 +163,14 @@ extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM * extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags); extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove); extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF 
*leaf, WT_CURSOR_BTREE *cbt); -extern int __wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive); +extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive); +extern void __wt_rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_split_stash_discard(WT_SESSION_IMPL *session); extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp); -extern int __wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive); +extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp); +extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive); extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell); extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size); @@ -178,7 +181,7 @@ extern int __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uin extern void __wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page); -extern int __wt_rec_write(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags); +extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags); extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); diff --git 
a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index ee2baa9037b..879394e2cc5 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -156,6 +156,7 @@ struct __wt_connection_stats { WT_STATS cache_bytes_max; WT_STATS cache_bytes_read; WT_STATS cache_bytes_write; + WT_STATS cache_eviction_app; WT_STATS cache_eviction_checkpoint; WT_STATS cache_eviction_clean; WT_STATS cache_eviction_deepen; @@ -172,6 +173,7 @@ struct __wt_connection_stats { WT_STATS cache_eviction_slow; WT_STATS cache_eviction_split; WT_STATS cache_eviction_walk; + WT_STATS cache_inmem_split; WT_STATS cache_pages_dirty; WT_STATS cache_pages_inuse; WT_STATS cache_read; @@ -290,6 +292,7 @@ struct __wt_dsrc_stats { WT_STATS cache_eviction_fail; WT_STATS cache_eviction_hazard; WT_STATS cache_eviction_internal; + WT_STATS cache_inmem_split; WT_STATS cache_overflow_value; WT_STATS cache_read; WT_STATS cache_read_overflow; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index a911ec1acdb..6edbe55197e 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -3104,180 +3104,184 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_BYTES_READ 1023 /*! cache: bytes written from cache */ #define WT_STAT_CONN_CACHE_BYTES_WRITE 1024 +/*! cache: pages evicted by application threads */ +#define WT_STAT_CONN_CACHE_EVICTION_APP 1025 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1025 +#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1026 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1026 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1027 /*! 
cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1027 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1028 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1028 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1029 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1029 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1030 /*! cache: pages evicted because they exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1030 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1031 /*! cache: failed eviction of pages that exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1031 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1032 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1032 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1033 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1033 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1034 /*! cache: eviction server candidate queue empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1034 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1035 /*! cache: eviction server candidate queue not empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1035 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1036 /*! cache: eviction server evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1036 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1037 /*! cache: eviction server populating queue, but not evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1037 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1038 /*! cache: eviction server unable to reach eviction goal */ -#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1038 +#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1039 /*! 
cache: pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1039 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1040 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1040 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1041 +/*! cache: in-memory page splits */ +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1042 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1041 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1043 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1042 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1044 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1043 +#define WT_STAT_CONN_CACHE_READ 1045 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1044 +#define WT_STAT_CONN_CACHE_WRITE 1046 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1045 -/*! btree: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1046 -/*! btree: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1047 -/*! btree: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1048 -/*! btree: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1049 -/*! btree: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1050 -/*! btree: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1051 -/*! btree: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1052 -/*! btree: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1053 -/*! btree: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1054 +#define WT_STAT_CONN_COND_WAIT 1047 +/*! cursor: cursor create calls */ +#define WT_STAT_CONN_CURSOR_CREATE 1048 +/*! cursor: cursor insert calls */ +#define WT_STAT_CONN_CURSOR_INSERT 1049 +/*! cursor: cursor next calls */ +#define WT_STAT_CONN_CURSOR_NEXT 1050 +/*! cursor: cursor prev calls */ +#define WT_STAT_CONN_CURSOR_PREV 1051 +/*! 
cursor: cursor remove calls */ +#define WT_STAT_CONN_CURSOR_REMOVE 1052 +/*! cursor: cursor reset calls */ +#define WT_STAT_CONN_CURSOR_RESET 1053 +/*! cursor: cursor search calls */ +#define WT_STAT_CONN_CURSOR_SEARCH 1054 +/*! cursor: cursor search near calls */ +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1055 +/*! cursor: cursor update calls */ +#define WT_STAT_CONN_CURSOR_UPDATE 1056 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1055 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1057 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1056 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1058 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1057 +#define WT_STAT_CONN_FILE_OPEN 1059 /*! log: log buffer size increases */ -#define WT_STAT_CONN_LOG_BUFFER_GROW 1058 +#define WT_STAT_CONN_LOG_BUFFER_GROW 1060 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1059 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1061 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1060 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1062 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1061 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1063 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1062 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1064 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1063 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1065 /*! log: log read operations */ -#define WT_STAT_CONN_LOG_READS 1064 +#define WT_STAT_CONN_LOG_READS 1066 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1065 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1067 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1066 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1068 /*! 
log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1067 +#define WT_STAT_CONN_LOG_SCANS 1069 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1068 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1070 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1069 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1071 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1070 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1072 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1071 +#define WT_STAT_CONN_LOG_SLOT_RACES 1073 /*! log: slots selected for switching that were unavailable */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1072 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1074 /*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1073 +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1075 /*! log: failed to find a slot large enough for record */ -#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1074 +#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1076 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1075 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1077 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1076 +#define WT_STAT_CONN_LOG_SYNC 1078 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1077 +#define WT_STAT_CONN_LOG_WRITES 1079 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1078 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1080 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1079 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1081 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1080 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1082 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1081 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1083 /*! 
LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1082 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1084 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1083 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1085 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1084 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1086 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1085 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1087 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1086 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1088 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1087 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1089 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1088 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1090 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1089 +#define WT_STAT_CONN_MEMORY_FREE 1091 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1090 +#define WT_STAT_CONN_MEMORY_GROW 1092 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1091 +#define WT_STAT_CONN_READ_IO 1093 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1092 +#define WT_STAT_CONN_REC_PAGES 1094 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1093 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1095 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1094 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1096 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1095 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1097 /*! 
connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1096 +#define WT_STAT_CONN_RWLOCK_READ 1098 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1097 +#define WT_STAT_CONN_RWLOCK_WRITE 1099 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1098 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1100 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1099 +#define WT_STAT_CONN_SESSION_OPEN 1101 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1100 +#define WT_STAT_CONN_TXN_BEGIN 1102 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1101 +#define WT_STAT_CONN_TXN_CHECKPOINT 1103 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1102 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1104 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1103 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1105 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1104 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1106 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1105 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1107 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1106 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1108 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1107 +#define WT_STAT_CONN_TXN_COMMIT 1109 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1108 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1110 /*! 
transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1109 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1111 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1110 +#define WT_STAT_CONN_TXN_ROLLBACK 1112 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1111 +#define WT_STAT_CONN_WRITE_IO 1113 /*! * @} @@ -3365,98 +3369,100 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2038 /*! cache: internal pages evicted */ #define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2039 +/*! cache: in-memory page splits */ +#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2040 /*! cache: overflow values cached in memory */ -#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2040 +#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2041 /*! cache: pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ 2041 +#define WT_STAT_DSRC_CACHE_READ 2042 /*! cache: overflow pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2042 +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2043 /*! cache: pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 2043 +#define WT_STAT_DSRC_CACHE_WRITE 2044 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2044 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2045 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2045 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2046 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2046 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2047 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2047 +#define WT_STAT_DSRC_COMPRESS_READ 2048 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2048 +#define WT_STAT_DSRC_COMPRESS_WRITE 2049 /*! 
compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2049 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2050 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2050 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2051 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2051 +#define WT_STAT_DSRC_CURSOR_CREATE 2052 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2052 +#define WT_STAT_DSRC_CURSOR_INSERT 2053 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2053 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2054 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2054 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2055 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2055 +#define WT_STAT_DSRC_CURSOR_NEXT 2056 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2056 +#define WT_STAT_DSRC_CURSOR_PREV 2057 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2057 +#define WT_STAT_DSRC_CURSOR_REMOVE 2058 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2058 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2059 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2059 +#define WT_STAT_DSRC_CURSOR_RESET 2060 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2060 +#define WT_STAT_DSRC_CURSOR_SEARCH 2061 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2061 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2062 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2062 +#define WT_STAT_DSRC_CURSOR_UPDATE 2063 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2063 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2064 /*! 
LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2064 +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2065 /*! LSM: chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2065 +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2066 /*! LSM: highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2066 +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2067 /*! LSM: queries that could have benefited from a Bloom filter that did * not exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2067 +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2068 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2068 +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2069 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2069 +#define WT_STAT_DSRC_REC_DICTIONARY 2070 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2070 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2071 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2071 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2072 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2072 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2073 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2073 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2074 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2074 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2075 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2075 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2076 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2076 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2077 /*! 
reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2077 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2078 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2078 +#define WT_STAT_DSRC_REC_PAGES 2079 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2079 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2080 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2080 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2081 /*! reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2081 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2082 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2082 +#define WT_STAT_DSRC_SESSION_COMPACT 2083 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2083 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2084 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2084 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2085 /*! 
@} */ /* * Statistics section: END diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index 71e4724f91c..5cdc4e0783e 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -406,23 +406,28 @@ static int __lsm_manager_run_server(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_LSM_TREE *lsm_tree; struct timespec now; uint64_t fillms, pushms; + int dhandle_locked; conn = S2C(session); + dhandle_locked = 0; + while (F_ISSET(conn, WT_CONN_SERVER_RUN)) { - if (TAILQ_EMPTY(&conn->lsmqh)) { - __wt_sleep(0, 10000); - continue; - } __wt_sleep(0, 10000); + if (TAILQ_EMPTY(&conn->lsmqh)) + continue; + __wt_spin_lock(session, &conn->dhandle_lock); + F_SET(session, WT_SESSION_HANDLE_LIST_LOCKED); + dhandle_locked = 1; TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) continue; - WT_RET(__lsm_manager_aggressive_update( + WT_ERR(__lsm_manager_aggressive_update( session, lsm_tree)); - WT_RET(__wt_epoch(session, &now)); + WT_ERR(__wt_epoch(session, &now)); pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 
0 : WT_TIMEDIFF( now, lsm_tree->work_push_ts) / WT_MILLION; @@ -453,15 +458,15 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) (lsm_tree->merge_aggressiveness > 3 && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) || pushms > fillms) { - WT_RET(__wt_lsm_manager_push_entry( + WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); - WT_RET(__wt_lsm_manager_push_entry( + WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_DROP, 0, lsm_tree)); - WT_RET(__wt_lsm_manager_push_entry( + WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_FLUSH, 0, lsm_tree)); - WT_RET(__wt_lsm_manager_push_entry( + WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); - WT_RET(__wt_verbose(session, WT_VERB_LSM, + WT_ERR(__wt_verbose(session, WT_VERB_LSM, "MGR %s: queue %d mod %d nchunks %d" " flags 0x%x aggressive %d pushms %" PRIu64 " fillms %" PRIu64, @@ -470,13 +475,20 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) lsm_tree->flags, lsm_tree->merge_aggressiveness, pushms, fillms)); - WT_RET(__wt_lsm_manager_push_entry( + WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_MERGE, 0, lsm_tree)); } } + __wt_spin_unlock(session, &conn->dhandle_lock); + F_CLR(session, WT_SESSION_HANDLE_LIST_LOCKED); + dhandle_locked = 0; } - return (0); +err: if (dhandle_locked) { + __wt_spin_unlock(session, &conn->dhandle_lock); + F_CLR(session, WT_SESSION_HANDLE_LIST_LOCKED); + } + return (ret); } /* diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index 077b5564276..888f12bdd94 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -786,7 +786,7 @@ int __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; - WT_LSM_CHUNK *chunk; + WT_LSM_CHUNK *chunk, *last_chunk; uint32_t nchunks, new_id; int first_switch; @@ -795,21 +795,18 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE 
*lsm_tree) nchunks = lsm_tree->nchunks; first_switch = nchunks == 0 ? 1 : 0; + /* * Check if a switch is still needed: we may have raced while waiting * for a lock. */ - chunk = NULL; + last_chunk = NULL; if (!first_switch && - (chunk = lsm_tree->chunk[nchunks - 1]) != NULL && - !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && + (last_chunk = lsm_tree->chunk[nchunks - 1]) != NULL && + !F_ISSET(last_chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) goto err; - /* Set the switch transaction in the previous chunk, if necessary. */ - if (chunk != NULL && chunk->switch_txn == WT_TXN_NONE) - chunk->switch_txn = __wt_txn_new_id(session); - /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree, 0); @@ -835,6 +832,10 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) lsm_tree->modified = 1; + /* Set the switch transaction in the previous chunk, if necessary. */ + if (last_chunk != NULL && last_chunk->switch_txn == WT_TXN_NONE) + last_chunk->switch_txn = __wt_txn_new_id(session); + err: WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); /* * Errors that happen during a tree switch leave the tree in a state diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 60c28a3cc06..c27b7edb234 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -69,9 +69,12 @@ int __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int force, WT_LSM_CHUNK **chunkp) { + WT_DECL_RET; + WT_LSM_CHUNK *chunk; u_int i, end; *chunkp = NULL; + chunk = NULL; WT_ASSERT(session, lsm_tree->queue_ref > 0); WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); @@ -86,29 +89,43 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, end = force ? 
lsm_tree->nchunks : lsm_tree->nchunks - 1; for (i = 0; i < end; i++) { if (!F_ISSET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK) || - (*chunkp == NULL && + (chunk == NULL && !F_ISSET(lsm_tree->chunk[i], WT_LSM_CHUNK_STABLE) && !lsm_tree->chunk[i]->evicted)) { - (void)WT_ATOMIC_ADD4(lsm_tree->chunk[i]->refcnt, 1); - WT_RET(__wt_verbose(session, WT_VERB_LSM, + chunk = lsm_tree->chunk[i]; + (void)WT_ATOMIC_ADD4(chunk->refcnt, 1); + WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Flush%s: return chunk %u of %u: %s", - force ? " w/ force" : "", i, end - 1, - lsm_tree->chunk[i]->uri)); - *chunkp = lsm_tree->chunk[i]; + force ? " w/ force" : "", i, end - 1, chunk->uri)); + /* - * Discards are opportunistic, flip a coin to decide - * whether to try, but take the first real flush we - * find. + * If retrying a discard push an additional work unit + * so there are enough to trigger checkpoints. */ - if (!F_ISSET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK) || - __wt_random(session->rnd) & 1) - break; + if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { + /* + * Don't be overly zealous about pushing old + * chunks from cache. Attempting too many drops + * can interfere with checkpoints. 
+ */ + if (__wt_random(session->rnd) & 1) { + (void)WT_ATOMIC_SUB4(chunk->refcnt, 1); + chunk = NULL; + continue; + } + WT_ERR(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_FLUSH, 0, lsm_tree)); + } + break; } } +err: if (ret != 0 && chunk != NULL) + (void)WT_ATOMIC_SUB4(chunk->refcnt, 1); WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree)); - return (0); + *chunkp = chunk; + return (ret); } /* diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 2b53746bb46..ac0b854c9df 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -50,6 +50,7 @@ __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats) "cache: data source pages selected for eviction unable to be evicted"; stats->cache_eviction_hazard.desc = "cache: hazard pointer blocked page eviction"; + stats->cache_inmem_split.desc = "cache: in-memory page splits"; stats->cache_eviction_internal.desc = "cache: internal pages evicted"; stats->cache_eviction_dirty.desc = "cache: modified pages evicted"; stats->cache_read_overflow.desc = @@ -168,6 +169,7 @@ __wt_stat_refresh_dsrc_stats(void *stats_arg) stats->cache_eviction_checkpoint.v = 0; stats->cache_eviction_fail.v = 0; stats->cache_eviction_hazard.v = 0; + stats->cache_inmem_split.v = 0; stats->cache_eviction_internal.v = 0; stats->cache_eviction_dirty.v = 0; stats->cache_read_overflow.v = 0; @@ -253,6 +255,7 @@ __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent) p->cache_eviction_checkpoint.v += c->cache_eviction_checkpoint.v; p->cache_eviction_fail.v += c->cache_eviction_fail.v; p->cache_eviction_hazard.v += c->cache_eviction_hazard.v; + p->cache_inmem_split.v += c->cache_inmem_split.v; p->cache_eviction_internal.v += c->cache_eviction_internal.v; p->cache_eviction_dirty.v += c->cache_eviction_dirty.v; p->cache_read_overflow.v += c->cache_read_overflow.v; @@ -341,15 +344,6 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS 
*stats) stats->block_byte_write.desc = "block-manager: bytes written"; stats->block_map_read.desc = "block-manager: mapped blocks read"; stats->block_byte_map_read.desc = "block-manager: mapped bytes read"; - stats->cursor_create.desc = "btree: cursor create calls"; - stats->cursor_insert.desc = "btree: cursor insert calls"; - stats->cursor_next.desc = "btree: cursor next calls"; - stats->cursor_prev.desc = "btree: cursor prev calls"; - stats->cursor_remove.desc = "btree: cursor remove calls"; - stats->cursor_reset.desc = "btree: cursor reset calls"; - stats->cursor_search.desc = "btree: cursor search calls"; - stats->cursor_search_near.desc = "btree: cursor search near calls"; - stats->cursor_update.desc = "btree: cursor update calls"; stats->cache_bytes_inuse.desc = "cache: bytes currently in the cache"; stats->cache_bytes_read.desc = "cache: bytes read into cache"; stats->cache_bytes_write.desc = "cache: bytes written from cache"; @@ -369,6 +363,7 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) "cache: failed eviction of pages that exceeded the in-memory maximum"; stats->cache_eviction_hazard.desc = "cache: hazard pointer blocked page eviction"; + stats->cache_inmem_split.desc = "cache: in-memory page splits"; stats->cache_eviction_internal.desc = "cache: internal pages evicted"; stats->cache_bytes_max.desc = "cache: maximum bytes configured"; stats->cache_eviction_dirty.desc = "cache: modified pages evicted"; @@ -378,6 +373,8 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) "cache: pages currently held in the cache"; stats->cache_eviction_force.desc = "cache: pages evicted because they exceeded the in-memory maximum"; + stats->cache_eviction_app.desc = + "cache: pages evicted by application threads"; stats->cache_read.desc = "cache: pages read into cache"; stats->cache_eviction_fail.desc = "cache: pages selected for eviction unable to be evicted"; @@ -402,6 +399,15 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) 
"connection: pthread mutex shared lock write-lock calls"; stats->read_io.desc = "connection: total read I/Os"; stats->write_io.desc = "connection: total write I/Os"; + stats->cursor_create.desc = "cursor: cursor create calls"; + stats->cursor_insert.desc = "cursor: cursor insert calls"; + stats->cursor_next.desc = "cursor: cursor next calls"; + stats->cursor_prev.desc = "cursor: cursor prev calls"; + stats->cursor_remove.desc = "cursor: cursor remove calls"; + stats->cursor_reset.desc = "cursor: cursor reset calls"; + stats->cursor_search.desc = "cursor: cursor search calls"; + stats->cursor_search_near.desc = "cursor: cursor search near calls"; + stats->cursor_update.desc = "cursor: cursor update calls"; stats->dh_session_handles.desc = "data-handle: session dhandles swept"; stats->dh_session_sweeps.desc = "data-handle: session sweep attempts"; stats->log_slot_closes.desc = "log: consolidated slot closures"; @@ -501,15 +507,6 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->block_byte_write.v = 0; stats->block_map_read.v = 0; stats->block_byte_map_read.v = 0; - stats->cursor_create.v = 0; - stats->cursor_insert.v = 0; - stats->cursor_next.v = 0; - stats->cursor_prev.v = 0; - stats->cursor_remove.v = 0; - stats->cursor_reset.v = 0; - stats->cursor_search.v = 0; - stats->cursor_search_near.v = 0; - stats->cursor_update.v = 0; stats->cache_bytes_read.v = 0; stats->cache_bytes_write.v = 0; stats->cache_eviction_checkpoint.v = 0; @@ -520,10 +517,12 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->cache_eviction_slow.v = 0; stats->cache_eviction_force_fail.v = 0; stats->cache_eviction_hazard.v = 0; + stats->cache_inmem_split.v = 0; stats->cache_eviction_internal.v = 0; stats->cache_eviction_dirty.v = 0; stats->cache_eviction_deepen.v = 0; stats->cache_eviction_force.v = 0; + stats->cache_eviction_app.v = 0; stats->cache_read.v = 0; stats->cache_eviction_fail.v = 0; stats->cache_eviction_split.v = 0; @@ -540,6 +539,15 @@ 
__wt_stat_refresh_connection_stats(void *stats_arg) stats->rwlock_write.v = 0; stats->read_io.v = 0; stats->write_io.v = 0; + stats->cursor_create.v = 0; + stats->cursor_insert.v = 0; + stats->cursor_next.v = 0; + stats->cursor_prev.v = 0; + stats->cursor_remove.v = 0; + stats->cursor_reset.v = 0; + stats->cursor_search.v = 0; + stats->cursor_search_near.v = 0; + stats->cursor_update.v = 0; stats->dh_session_handles.v = 0; stats->dh_session_sweeps.v = 0; stats->log_slot_closes.v = 0; diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 228b2919a39..9254692ea93 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -706,17 +706,18 @@ __checkpoint_worker( if (F_ISSET(ckpt, WT_CKPT_DELETE)) ++deleted; /* - * Complicated test: if we only deleted a single checkpoint, and - * it was the last checkpoint in the object, and it has the same - * name as the checkpoint we're taking (correcting for internal - * checkpoint names with their generational suffix numbers), we - * can skip the checkpoint, there's nothing to do. + * Complicated test: if the last checkpoint in the object has + * the same name as the checkpoint we're taking (correcting for + * internal checkpoint names with their generational suffix + * numbers), we can skip the checkpoint, there's nothing to do. + * The exception is if we're deleting two or more checkpoints: + * then we may save space. 
*/ - if (deleted == 1 && - F_ISSET(ckpt - 1, WT_CKPT_DELETE) && + if (ckpt > ckptbase && (strcmp(name, (ckpt - 1)->name) == 0 || (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && - WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT)))) + WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) && + deleted < 2) goto done; } diff --git a/src/third_party/wiredtiger/tools/stat_data.py b/src/third_party/wiredtiger/tools/stat_data.py index 56218f497b7..e596fc014e7 100644 --- a/src/third_party/wiredtiger/tools/stat_data.py +++ b/src/third_party/wiredtiger/tools/stat_data.py @@ -72,3 +72,19 @@ no_clear_list = [ 'transaction: transaction range of IDs currently pinned', 'session: open cursor count', ] +prefix_list = [ + 'data-handle', + 'reconciliation', + 'LSM', + 'log', + 'cache', + 'transaction', + 'cursor', + 'connection', + 'session', + 'block-manager', + 'async', + 'btree', + 'compression', +] +groups = {'cursor': ['cursor', 'session'], 'lsm': ['LSM', 'transaction'], 'system': ['connection', 'data-handle', 'session'], 'evict': ['cache', 'connection', 'block-manager'], 'memory': ['cache', 'connection', 'reconciliation']}
\ No newline at end of file diff --git a/src/third_party/wiredtiger/tools/wtstats.py b/src/third_party/wiredtiger/tools/wtstats.py index 882a6fb5be2..cc2ebd80877 100644 --- a/src/third_party/wiredtiger/tools/wtstats.py +++ b/src/third_party/wiredtiger/tools/wtstats.py @@ -28,6 +28,7 @@ import fileinput, os, re, shutil, sys, textwrap from collections import defaultdict +from glob import glob from time import mktime from subprocess import call @@ -39,7 +40,8 @@ tool_dir = os.path.split(sys.argv[0])[0] sys.path = [ os.path.join(tool_dir, "3rdparty") ] + sys.path try: - from stat_data import no_scale_per_second_list, no_clear_list + from stat_data \ + import groups, no_scale_per_second_list, no_clear_list, prefix_list except ImportError: print >>sys.stderr, "Could not import stat_data.py, it should be\ in the same directory as %s" % sys.argv[0] @@ -118,6 +120,8 @@ import argparse parser = argparse.ArgumentParser(description='Create graphs from WiredTiger statistics.') parser.add_argument('--abstime', action='store_true', help='use absolute time on the x axis') +parser.add_argument('--all', '-A', action='store_true', + help='generate all series as separate HTML output files by category') parser.add_argument('--clear', action='store_true', help='WiredTiger stats gathered with clear set') parser.add_argument('--focus', action='store_true', @@ -127,15 +131,15 @@ parser.add_argument('--include', '-I', metavar='regexp', help='include series with titles matching the specifed regexp') parser.add_argument('--list', action='store_true', help='list the series that would be displayed') -parser.add_argument('--output', '-o', metavar='file', default='wtstats.html', - help='HTML output file') +parser.add_argument('--output', '-o', metavar='file', default='wtstats', + help='HTML output file prefix') parser.add_argument('--right', '-R', metavar='regexp', type=re.compile, action='append', help='use the right axis for series with titles matching the specifed regexp') 
parser.add_argument('--wtperf', '-w', action='store_true', help='Plot wtperf statistics on the same graph') parser.add_argument('files', metavar='file', nargs='+', - help='input files generated by WiredTiger statistics logging') + help='input files or directories generated by WiredTiger statistics logging') args = parser.parse_args() # Don't require users to specify regexps twice for right axis @@ -148,8 +152,17 @@ if args.include and args.right: args.include += args.right # Read the input file(s) into a dictionary of lists. +def getfiles(l): + for f in l: + if os.path.isfile(f): + yield f + elif os.path.isdir(f): + for s in glob(os.path.join(f, 'WiredTigerStat*')): + print 'Processing ' + s + yield s + d = defaultdict(list) -for f in args.files: +for f in getfiles(args.files): for line in open(f, 'rU'): month, day, time, v, title = line.strip('\n').split(" ", 4) d[title].append((month + " " + day + " " + time, v)) @@ -179,13 +192,75 @@ def common_suffix(a, b): b = b[1:] return b +def output_series(results, prefix=None, grouplist=[]): + # open the output file based on prefix + if prefix == None: + outputname = args.output + '.html' + elif len(grouplist) == 0: + outputname = args.output +'.' + prefix + '.html' + else: + outputname = args.output +'.group.' + prefix + '.html' + + if prefix != None and len(grouplist) == 0: + this_series = [] + for title, yaxis, ydata in results: + if not prefix in title: + continue + #print 'Appending to dataset: ' + title + this_series.append((title, yaxis, ydata)) + elif prefix != None and len(grouplist) > 0: + this_series = [] + for title, yaxis, ydata in results: + for subgroup in grouplist: + if not subgroup in title: + continue + # print 'Appending to dataset: ' + title + this_series.append((title, yaxis, ydata)) + else: + this_series = results + + if len(this_series) == 0: + print 'Output: ' + outputname + ' has no data. Do not create.' 
+ return + + #--------------------------------------- + if args.right: + charttype = multiChart + elif args.focus: + charttype = lineWithFocusChart + else: + charttype = lineChart + + chart_extra = {} + # Add in the x axis if the user wants time. + if args.abstime: + chart_extra['x_axis_format'] = '%H:%M:%S' + + # Create the chart, add the series + chart = charttype(name='statlog', height=450+10*len(this_series), resize=True, x_is_date=args.abstime, y_axis_format='g', assets_directory='http://source.wiredtiger.com/graphs/', **chart_extra) + + for title, yaxis, ydata in this_series: + chart.add_serie(x=xdata, y=(ydata.get(x, 0) for x in xdata), name=title, + type="line", yaxis="2" if yaxis else "1") + + if args.wtperf: + addPlotsToStatsChart(chart, os.path.dirname(args.files[0]), args.abstime) + + chart.buildhtml() + output_file = open(outputname, 'w') + output_file.write(chart.htmlcontent) + + #close Html file + output_file.close() + + # Split out the data, convert timestamps results = [] for title, values in sorted(d.iteritems()): title, ydata = munge(title, values) # Ignore entries if a list of regular expressions was given if args.include and not [r for r in args.include if r.search(title)]: - continue + continue yaxis = args.right and [r for r in args.right if r.search(title)] prefix = title if prefix is None else common_prefix(prefix, title) suffix = title if suffix is None else common_suffix(title, suffix) @@ -215,33 +290,11 @@ if args.list: # Figure out the full set of x axis values xdata = sorted(set(k for k in ydata.iterkeys() for ydata in results)) -# open the output file -output_file = open(args.output, 'w') -#--------------------------------------- -if args.right: - charttype = multiChart -elif args.focus: - charttype = lineWithFocusChart -else: - charttype = lineChart - -chart_extra = {} -# Add in the x axis if the user wants time. 
-if args.abstime: - chart_extra['x_axis_format'] = '%H:%M:%S' - -# Create the chart, add the series -chart = charttype(name='statlog', height=450+10*len(results), resize=True, x_is_date=args.abstime, y_axis_format='g', assets_directory='http://source.wiredtiger.com/graphs/', **chart_extra) - -for title, yaxis, ydata in results: - chart.add_serie(x=xdata, y=(ydata.get(x, 0) for x in xdata), name=title, - type="line", yaxis="2" if yaxis else "1") - -if args.wtperf: - addPlotsToStatsChart(chart, os.path.dirname(args.files[0]), args.abstime) - -chart.buildhtml() -output_file.write(chart.htmlcontent) +output_series(results) -#close Html file -output_file.close() +# If the user wants the stats split up by prefix type do so. +if args.all: + for prefix in prefix_list: + output_series(results, prefix) + for group in groups.keys(): + output_series(results, group, groups[group]) |