diff options
author | Ramon Fernandez <rfmnyc@gmail.com> | 2015-11-19 09:37:38 -0500 |
---|---|---|
committer | Ramon Fernandez <rfmnyc@gmail.com> | 2015-11-19 09:41:39 -0500 |
commit | a0771ea5ec1b44537d3c409e3d712db24fd8e6bb (patch) | |
tree | 62517780ad0982ec80b8a6d968a72cf0474df617 /src/third_party/wiredtiger | |
parent | 042d8fa2d252142489c5fa3009927bad20d77efb (diff) | |
download | mongo-a0771ea5ec1b44537d3c409e3d712db24fd8e6bb.tar.gz |
Import wiredtiger-wiredtiger-mongodb-3.2.0-rc3-177-g9d375e3.tar.gz from wiredtiger branch mongodb-3.2
ref: d9ec1ff..9d375e3
16c0a1a WT-1315 Fix some leaks with join cursors.
59857f9 WT-2222 Add statistics for named snapshots.
4368d39 WT-1315 Cursor join implementation
a72ddb7 WT-2218 Add truncate stats
fb9cebe WT-2224 Track which deleted refs are discarded by a split.
e2f1130 WT-2220 Split WT_TIMEDIFF macro into unit specific macros.
be412b5 WT-2182 when internal pages grow large enough, split them into their parents
ce8c091 WT-2219 Enhancements to in-memory testing
347d922 WT-2220 time_t cleanup.
08c0fcd WT-2217 change WT_CURSOR.insert to clear "set" key/value on return
d1b5e7f WT-2135 Fix log_only setting for backup cursor. Fix initialization.
78bd4ac WT-2210 raw compression fails if row-store recovery precedes column-store recovery
c1b2634 WT-2182 fixes for splitting up the tree.
0a1ee34 WT-2199 Fix transaction sync inconsistency.
ee31bb2 WT-2182 Simplify the split deepen logic.
c360d53 WT-2212 Add a "use_environment" config to "wiredtiger_open"
3f132a4 WT-2182 detect internal page split races.
Diffstat (limited to 'src/third_party/wiredtiger')
84 files changed, 3779 insertions, 1326 deletions
diff --git a/src/third_party/wiredtiger/README b/src/third_party/wiredtiger/README index c30b210029d..5056431c95b 100644 --- a/src/third_party/wiredtiger/README +++ b/src/third_party/wiredtiger/README @@ -1,6 +1,6 @@ -WiredTiger 2.6.2: (June 4, 2015) +WiredTiger 2.7.0: (November 19, 2015) -This is version 2.6.2 of WiredTiger. +This is version 2.7.0 of WiredTiger. WiredTiger release packages and documentation can be found at: @@ -8,7 +8,7 @@ WiredTiger release packages and documentation can be found at: The documentation for this specific release can be found at: - http://source.wiredtiger.com/2.6.2/index.html + http://source.wiredtiger.com/2.7.0/index.html The WiredTiger source code can be found at: diff --git a/src/third_party/wiredtiger/RELEASE_INFO b/src/third_party/wiredtiger/RELEASE_INFO index a178c2e40fb..1204e262af2 100644 --- a/src/third_party/wiredtiger/RELEASE_INFO +++ b/src/third_party/wiredtiger/RELEASE_INFO @@ -1,6 +1,6 @@ WIREDTIGER_VERSION_MAJOR=2 -WIREDTIGER_VERSION_MINOR=6 -WIREDTIGER_VERSION_PATCH=2 +WIREDTIGER_VERSION_MINOR=7 +WIREDTIGER_VERSION_PATCH=0 WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH" WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"` diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c index 44aff59963c..9ac96862fa1 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c @@ -653,7 +653,7 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { goto err; } ++trk->latency_ops; - usecs = ns_to_us(WT_TIMEDIFF(stop, start)); + usecs = WT_TIMEDIFF_US(stop, start); track_operation(trk, usecs); } /* Increment operation count */ @@ -936,7 +936,7 @@ populate_thread(void *arg) goto err; } ++trk->latency_ops; - usecs = ns_to_us(WT_TIMEDIFF(stop, start)); + usecs = WT_TIMEDIFF_US(stop, start); track_operation(trk, usecs); } ++thread->insert.ops; /* Same as trk->ops */ @@ -1068,7 +1068,7 @@ populate_async(void *arg) goto err; } ++trk->latency_ops; - usecs = ns_to_us(WT_TIMEDIFF(stop, start)); + usecs = WT_TIMEDIFF_US(stop, start); track_operation(trk, usecs); } if ((ret = session->close(session, NULL)) != 0) { @@ -1386,7 +1386,7 @@ execute_populate(CONFIG *cfg) } lprintf(cfg, 0, 1, "Finished load of %" PRIu32 " items", cfg->icount); - msecs = ns_to_ms(WT_TIMEDIFF(stop, start)); + msecs = WT_TIMEDIFF_MS(stop, start); /* * This is needed as the divisions will fail if the insert takes no time @@ -1444,7 +1444,7 @@ execute_populate(CONFIG *cfg) } lprintf(cfg, 0, 1, "Compact completed in %" PRIu64 " seconds", - (uint64_t)(ns_to_sec(WT_TIMEDIFF(stop, start)))); + (uint64_t)(WT_TIMEDIFF_SEC(stop, start))); assert(tables == 0); } return (0); @@ -2423,7 +2423,7 @@ worker_throttle(int64_t throttle, int64_t *ops, struct timespec *interval) * If we did enough operations in less than a second, sleep for * the rest of the second. */ - usecs_to_complete = ns_to_us(WT_TIMEDIFF(now, *interval)); + usecs_to_complete = WT_TIMEDIFF_US(now, *interval); if (usecs_to_complete < USEC_PER_SEC) (void)usleep((useconds_t)(USEC_PER_SEC - usecs_to_complete)); @@ -2457,7 +2457,7 @@ drop_all_tables(CONFIG *cfg) } } (void)__wt_epoch(NULL, &stop); - msecs = ns_to_ms(WT_TIMEDIFF(stop, start)); + msecs = WT_TIMEDIFF_MS(stop, start); lprintf(cfg, 0, 1, "Executed %" PRIu32 " drop operations average time %" PRIu64 "ms", cfg->table_count, msecs / cfg->table_count); diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 index cec54f5e842..c8b89b7842b 100644 --- a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 +++ b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 @@ -1,14 +1,14 @@ dnl build by dist/s_version VERSION_MAJOR=2 -VERSION_MINOR=6 -VERSION_PATCH=2 -VERSION_STRING='"WiredTiger 2.6.2: (June 4, 2015)"' +VERSION_MINOR=7 +VERSION_PATCH=0 +VERSION_STRING='"WiredTiger 2.7.0: (November 19, 2015)"' AC_SUBST(VERSION_MAJOR) AC_SUBST(VERSION_MINOR) AC_SUBST(VERSION_PATCH) AC_SUBST(VERSION_STRING) -VERSION_NOPATCH=2.6 +VERSION_NOPATCH=2.7 AC_SUBST(VERSION_NOPATCH) diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version.m4 index 4a4f3427df7..2ebe4516695 100644 --- a/src/third_party/wiredtiger/build_posix/aclocal/version.m4 +++ b/src/third_party/wiredtiger/build_posix/aclocal/version.m4 @@ -1,2 +1,2 @@ dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version -2.6.2 +2.7.0 diff --git a/src/third_party/wiredtiger/build_win/filelist.win b/src/third_party/wiredtiger/build_win/filelist.win index 9d0ee10d305..af6ddf98da9 100644 --- a/src/third_party/wiredtiger/build_win/filelist.win +++ b/src/third_party/wiredtiger/build_win/filelist.win @@ -72,6 +72,7 @@ src/cursor/cur_ds.c src/cursor/cur_dump.c src/cursor/cur_file.c src/cursor/cur_index.c +src/cursor/cur_join.c src/cursor/cur_json.c src/cursor/cur_log.c src/cursor/cur_metadata.c diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 9afff74ca71..f58a48b4a0b 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -705,10 +705,15 @@ wiredtiger_open = wiredtiger_open_common + [ Config('in_memory', 'false', r''' keep data in-memory only, minimize disk I/O''', type='boolean', undoc=True), + Config('use_environment', 'true', r''' + use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment + variables if the process is not running with special privileges. + See @ref home for more information''', + type='boolean'), Config('use_environment_priv', 'false', r''' use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment - variables regardless of whether or not the process is running - with special privileges. See @ref home for more information''', + variables even if the process is running with special privileges. + See @ref home for more information''', type='boolean'), ] @@ -767,6 +772,33 @@ methods = { type='boolean'), ]), +'WT_SESSION.join' : Method([ + Config('compare', '"eq"', r''' + modifies the set of items to be returned so that the index key + satisfies the given comparison relative to the key set in this + cursor''', + choices=['eq', 'ge', 'gt', 'le', 'lt']), + Config('count', '', r''' + set an approximate count of the elements that would be included in + the join. This is used in sizing the bloom filter, and also influences + evaluation order for cursors in the join. When the count is equal + for multiple bloom filters in a composition of joins, the bloom + filter may be shared''', + type='int'), + Config('bloom_bit_count', '16', r''' + the number of bits used per item for the bloom filter''', + min='2', max='1000'), + Config('bloom_hash_count', '8', r''' + the number of hash values per item for the bloom filter''', + min='2', max='100'), + Config('strategy', '', r''' + when set to bloom, a bloom filter is created and populated for + this index. This has an up front cost but may reduce the number + of accesses to the main table when iterating the joined cursor. + The bloom setting requires that count be set''', + choices=['bloom', 'default']), +]), + 'WT_SESSION.log_flush' : Method([ Config('sync', 'on', r''' forcibly flush the log and wait for it to achieve the synchronization diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index f33f0e9a962..52af87c2a68 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -72,6 +72,7 @@ src/cursor/cur_ds.c src/cursor/cur_dump.c src/cursor/cur_file.c src/cursor/cur_index.c +src/cursor/cur_join.c src/cursor/cur_json.c src/cursor/cur_log.c src/cursor/cur_metadata.c diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index 0e2bad0910c..1965dfb7dbe 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -26,6 +26,7 @@ flags = { 'LOG_DSYNC', 'LOG_FLUSH', 'LOG_FSYNC', + 'LOG_SYNC_ENABLED', ], 'page_read' : [ 'READ_CACHE', diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index d204a11835b..8b0d9a0bdcd 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -4,6 +4,7 @@ API_CALL API_CALL_NOCONF API_SESSION_INIT FLD_MASK +JOINABLE_CURSOR_CALL_CHECK LF_MASK LLONG_MAX LLONG_MIN @@ -36,6 +37,8 @@ WT_READ_BARRIER WT_REF_SIZE WT_SESSION_LOCKED_CHECKPOINT WT_SESSION_LOCKED_TURTLE +WT_STATS_FIELD_TO_SLOT +WT_STATS_SLOT_ID WT_STAT_DECR WT_STAT_DECRV WT_STAT_FAST_CONN_DECRV @@ -45,9 +48,8 @@ WT_STAT_FAST_DECRV WT_STAT_FAST_INCR WT_STAT_FAST_INCRV WT_STAT_FAST_SET -WT_STATS_FIELD_TO_SLOT -WT_STATS_SLOT_ID WT_STAT_WRITE +WT_TIMEDIFF_US WT_TRET_ERROR_OK WT_WITH_LOCK __F diff --git a/src/third_party/wiredtiger/dist/s_funcs.list b/src/third_party/wiredtiger/dist/s_funcs.list index 3b5690a4bc2..ed6cf43bb2f 100644 --- a/src/third_party/wiredtiger/dist/s_funcs.list +++ b/src/third_party/wiredtiger/dist/s_funcs.list @@ -27,6 +27,8 @@ __wt_log_scan __wt_nlpo2 __wt_nlpo2_round __wt_print_huffman_code +__wt_stat_join_aggregate +__wt_stat_join_clear_all __wt_try_readlock wiredtiger_config_parser_open wiredtiger_config_validate diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index c14f4c961e6..7de139f6a40 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -221,6 +221,7 @@ OUTBUFF OVFL ObWgfvgw Obama +Outfmt PARAM POSIX PREDEFINE @@ -351,6 +352,7 @@ allocfile allocsize amd ao +ap api arg argc @@ -421,6 +423,7 @@ checksums chk chongo cip +cjoin ckpt ckptfrag ckptlist @@ -464,6 +467,7 @@ curdump curextract curfile curindex +curjoin curlog curmetadata cursoring @@ -507,8 +511,10 @@ dev dh dhandle dhandles +difftime dir dirlist +disjunction dlclose dlh dll @@ -540,6 +546,7 @@ enqueue enqueued env eof +eq equalp errhandler errno @@ -592,6 +599,7 @@ ftruncate func gcc gdb +ge getenv getline getone @@ -607,6 +615,7 @@ goesc gostring gostruct goutf +gt hashval havesize hdr @@ -632,6 +641,7 @@ indirects indx infeasible inflateInit +infmt init initn initsize @@ -650,6 +660,7 @@ io ip islocked ispo +iter iteratively jnr jrx @@ -668,6 +679,7 @@ latencies lbrace lbracket ld +le len lenp level's @@ -714,6 +726,7 @@ mem memalign membar memcpy +memget memmove memset memsize diff --git a/src/third_party/wiredtiger/dist/stat.py b/src/third_party/wiredtiger/dist/stat.py index c9684665a53..d62fda3fcb9 100644 --- a/src/third_party/wiredtiger/dist/stat.py +++ b/src/third_party/wiredtiger/dist/stat.py @@ -5,7 +5,7 @@ import re, string, sys, textwrap from dist import compare_srcfile # Read the source files. -from stat_data import groups, dsrc_stats, connection_stats +from stat_data import groups, dsrc_stats, connection_stats, join_stats def print_struct(title, name, base, stats): '''Print the structures for the stat.h file.''' @@ -35,9 +35,17 @@ for line in open('../src/include/stat.h', 'r'): print_struct( 'connections', 'connection', 1000, connection_stats) print_struct('data sources', 'dsrc', 2000, dsrc_stats) + print_struct('join cursors', 'join', 3000, join_stats) f.close() compare_srcfile(tmp_file, '../src/include/stat.h') +def print_defines_one(capname, base, stats): + for v, l in enumerate(stats, base): + f.write('/*! %s */\n' % '\n * '.join(textwrap.wrap(l.desc, 70))) + f.write('#define\tWT_STAT_' + capname + '_' + l.name.upper() + "\t" * + max(1, 6 - int((len('WT_STAT_' + capname + '_' + l.name)) / 8)) + + str(v) + '\n') + def print_defines(): '''Print the #defines for the wiredtiger.in file.''' f.write(''' @@ -51,11 +59,7 @@ def print_defines(): * @{ */ ''') - for v, l in enumerate(connection_stats, 1000): - f.write('/*! %s */\n' % '\n * '.join(textwrap.wrap(l.desc, 70))) - f.write('#define\tWT_STAT_CONN_' + l.name.upper() + "\t" * - max(1, 6 - int((len('WT_STAT_CONN_' + l.name)) / 8)) + - str(v) + '\n') + print_defines_one('CONN', 1000, connection_stats) f.write(''' /*! * @} @@ -64,11 +68,16 @@ def print_defines(): * @{ */ ''') - for v, l in enumerate(dsrc_stats, 2000): - f.write('/*! %s */\n' % '\n * '.join(textwrap.wrap(l.desc, 70))) - f.write('#define\tWT_STAT_DSRC_' + l.name.upper() + "\t" * - max(1, 6 - int((len('WT_STAT_DSRC_' + l.name)) / 8)) + - str(v) + '\n') + print_defines_one('DSRC', 2000, dsrc_stats) + f.write(''' +/*! + * @} + * @name Statistics for join cursors + * @anchor statistics_join + * @{ + */ +''') + print_defines_one('JOIN', 3000, join_stats) f.write('/*! @} */\n') # Update the #defines in the wiredtiger.in file. @@ -98,10 +107,12 @@ def print_func(name, handle, list): f.write('};\n') f.write(''' -const char * -__wt_stat_''' + name + '''_desc(int slot) +int +__wt_stat_''' + name + '''_desc(WT_CURSOR_STAT *cst, int slot, const char **p) { -\treturn (__stats_''' + name + '''_desc[slot]); +\tWT_UNUSED(cst); +\t*p = __stats_''' + name + '''_desc[slot]; +\treturn (0); } ''') @@ -113,7 +124,8 @@ __wt_stat_''' + name + '_init_single(WT_' + name.upper() + '''_STATS *stats) } ''') - f.write(''' + if handle != None: + f.write(''' void __wt_stat_''' + name + '_init(' + handle + ''' *handle) { @@ -205,6 +217,7 @@ f.write('#include "wt_internal.h"\n') print_func('dsrc', 'WT_DATA_HANDLE', dsrc_stats) print_func('connection', 'WT_CONNECTION_IMPL', connection_stats) +print_func('join', None, join_stats) f.close() compare_srcfile(tmp_file, '../src/support/stat.c') @@ -224,6 +237,7 @@ for l in sorted(dsrc_stats): scale_info += ' \'' + l.desc + '\',\n' if 'no_clear' in l.flags: clear_info += ' \'' + l.desc + '\',\n' +# No join statistics can be captured in wtstats scale_info += ']\n' clear_info += ']\n' prefix_info = 'prefix_list = [\n' diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 76fdf185137..3a23071a3f2 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -67,6 +67,10 @@ class DhandleStat(Stat): prefix = 'data-handle' def __init__(self, name, desc, flags=''): Stat.__init__(self, name, DhandleStat.prefix, desc, flags) +class JoinStat(Stat): + prefix = '' # prefix is inserted dynamically + def __init__(self, name, desc, flags=''): + Stat.__init__(self, name, JoinStat.prefix, desc, flags) class LogStat(Stat): prefix = 'log' def __init__(self, name, desc, flags=''): @@ -199,7 +203,9 @@ connection_stats = [ 'eviction server populating queue, but not evicting pages'), CacheStat('cache_eviction_slow', 'eviction server unable to reach eviction goal'), - CacheStat('cache_eviction_split', 'pages split during eviction'), + CacheStat('cache_eviction_split_internal', + 'internal pages split during eviction'), + CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'), CacheStat('cache_eviction_walk', 'pages walked for eviction'), CacheStat('cache_eviction_worker_evicting', 'eviction worker thread evicting pages'), @@ -278,6 +284,8 @@ connection_stats = [ # Reconciliation statistics ########################################## RecStat('rec_pages', 'page reconciliation calls'), + RecStat('rec_page_delete', 'pages deleted'), + RecStat('rec_page_delete_fast', 'fast-path pages deleted'), RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), RecStat('rec_split_stashed_bytes', 'split bytes currently awaiting free', 'no_clear,no_scale'), @@ -309,6 +317,11 @@ connection_stats = [ 'no_clear,no_scale'), TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), + TxnStat('txn_pinned_snapshot_range', + 'transaction range of IDs currently pinned by named snapshots', + 'no_clear,no_scale'), + TxnStat('txn_snapshots_created', 'number of named snapshots created'), + TxnStat('txn_snapshots_dropped', 'number of named snapshots dropped'), TxnStat('txn_rollback', 'transactions rolled back'), TxnStat('txn_sync', 'transaction sync calls'), @@ -349,6 +362,7 @@ connection_stats = [ CursorStat('cursor_restart', 'cursor restarted searches'), CursorStat('cursor_search', 'cursor search calls'), CursorStat('cursor_search_near', 'cursor search near calls'), + CursorStat('cursor_truncate', 'truncate calls'), CursorStat('cursor_update', 'cursor update calls'), ########################################## @@ -390,6 +404,7 @@ dsrc_stats = [ CursorStat('cursor_restart', 'restarted searches'), CursorStat('cursor_search', 'search calls'), CursorStat('cursor_search_near', 'search near calls'), + CursorStat('cursor_truncate', 'truncate calls'), CursorStat('cursor_update', 'update calls'), CursorStat('cursor_update_bytes', 'cursor-update value bytes updated'), @@ -476,7 +491,9 @@ dsrc_stats = [ 'data source pages selected for eviction unable to be evicted'), CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'), CacheStat('cache_eviction_internal', 'internal pages evicted'), - CacheStat('cache_eviction_split', 'pages split during eviction'), + CacheStat('cache_eviction_split_internal', + 'internal pages split during eviction'), + CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'), CacheStat('cache_inmem_split', 'in-memory page splits'), CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'), @@ -518,6 +535,7 @@ dsrc_stats = [ RecStat('rec_overflow_key_leaf', 'leaf-page overflow keys'), RecStat('rec_overflow_value', 'overflow values written'), RecStat('rec_page_delete', 'pages deleted'), + RecStat('rec_page_delete_fast', 'fast-path pages deleted'), RecStat('rec_page_match', 'page checksum matches'), RecStat('rec_pages', 'page reconciliation calls'), RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), @@ -533,3 +551,14 @@ dsrc_stats = [ ] dsrc_stats = sorted(dsrc_stats, key=attrgetter('name')) + +########################################## +# Cursor Join statistics +########################################## +join_stats = [ + JoinStat('accesses', 'accesses'), + JoinStat('actual_count', 'actual count of items'), + JoinStat('bloom_false_positive', 'bloom filter false positives'), +] + +join_stats = sorted(join_stats, key=attrgetter('name')) diff --git a/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c b/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c index 34b8d7c7c64..8d50cc7ec5d 100644 --- a/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c +++ b/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c @@ -49,7 +49,8 @@ typedef struct { WT_EXTRACTOR extractor; /* Must come first */ WT_EXTENSION_API *wt_api; /* Extension API */ - int field_num; /* Field to extract */ + int field; /* Field to extract */ + int format_isnum; /* Field contents are numeric */ } CSV_EXTRACTOR; /* @@ -61,15 +62,15 @@ csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session, const WT_ITEM *key, const WT_ITEM *value, WT_CURSOR *result_cursor) { char *copy, *p, *pend, *valstr; - const CSV_EXTRACTOR *cvs_extractor; - int i, ret; + const CSV_EXTRACTOR *csv_extractor; + int i, ret, val; size_t len; WT_EXTENSION_API *wtapi; (void)key; /* Unused parameters */ - cvs_extractor = (const CSV_EXTRACTOR *)extractor; - wtapi = cvs_extractor->wt_api; + csv_extractor = (const CSV_EXTRACTOR *)extractor; + wtapi = csv_extractor->wt_api; /* Unpack the value. */ if ((ret = wtapi->struct_unpack(wtapi, @@ -78,11 +79,11 @@ csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session, p = valstr; pend = strchr(p, ','); - for (i = 0; i < cvs_extractor->field_num && pend != NULL; i++) { + for (i = 0; i < csv_extractor->field && pend != NULL; i++) { p = pend + 1; pend = strchr(p, ','); } - if (i == cvs_extractor->field_num) { + if (i == csv_extractor->field) { if (pend == NULL) pend = p + strlen(p); /* @@ -95,7 +96,12 @@ csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session, return (errno); strncpy(copy, p, len); copy[len] = '\0'; - result_cursor->set_key(result_cursor, copy); + if (csv_extractor->format_isnum) { + if ((val = atoi(copy)) < 0) + return (EINVAL); + result_cursor->set_key(result_cursor, val); + } else + result_cursor->set_key(result_cursor, copy); ret = result_cursor->insert(result_cursor); free(copy); if (ret != 0) @@ -107,7 +113,7 @@ csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session, /* * csv_customize -- * The customize function creates a customized extractor, - * needed to save the field number. + * needed to save the field number and format. */ static int csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session, @@ -115,20 +121,37 @@ csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session, { const CSV_EXTRACTOR *orig; CSV_EXTRACTOR *csv_extractor; + WT_CONFIG_ITEM field, format; + WT_CONFIG_PARSER *parser; + WT_EXTENSION_API *wtapi; + int ret; long field_num; (void)session; /* Unused parameters */ (void)uri; /* Unused parameters */ orig = (const CSV_EXTRACTOR *)extractor; - field_num = strtol(appcfg->str, NULL, 10); + wtapi = orig->wt_api; + if ((ret = wtapi->config_parser_open(wtapi, session, appcfg->str, + appcfg->len, &parser)) != 0) + return (ret); + if ((ret = parser->get(parser, "field", &field)) != 0 || + (ret = parser->get(parser, "format", &format)) != 0) { + if (ret == WT_NOTFOUND) + return (EINVAL); + return (ret); + } + field_num = strtol(field.str, NULL, 10); if (field_num < 0 || field_num > INT_MAX) return (EINVAL); + if (format.len != 1 || (format.str[0] != 'S' && format.str[0] != 'i')) + return (EINVAL); if ((csv_extractor = calloc(1, sizeof(CSV_EXTRACTOR))) == NULL) return (errno); *csv_extractor = *orig; - csv_extractor->field_num = (int)field_num; + csv_extractor->field = field_num; + csv_extractor->format_isnum = (format.str[0] == 'i'); *customp = (WT_EXTRACTOR *)csv_extractor; return (0); } diff --git a/src/third_party/wiredtiger/lang/java/java_doc.i b/src/third_party/wiredtiger/lang/java/java_doc.i index 75c14dbfe8f..17317ab875b 100644 --- a/src/third_party/wiredtiger/lang/java/java_doc.i +++ b/src/third_party/wiredtiger/lang/java/java_doc.i @@ -33,6 +33,7 @@ COPYDOC(__wt_session, WT_SESSION, open_cursor) COPYDOC(__wt_session, WT_SESSION, create) COPYDOC(__wt_session, WT_SESSION, compact) COPYDOC(__wt_session, WT_SESSION, drop) +COPYDOC(__wt_session, WT_SESSION, join) COPYDOC(__wt_session, WT_SESSION, log_flush) COPYDOC(__wt_session, WT_SESSION, log_printf) COPYDOC(__wt_session, WT_SESSION, rename) diff --git a/src/third_party/wiredtiger/src/bloom/bloom.c b/src/third_party/wiredtiger/src/bloom/bloom.c index 9225b9fe3b5..e3a21f25dc1 100644 --- a/src/third_party/wiredtiger/src/bloom/bloom.c +++ b/src/third_party/wiredtiger/src/bloom/bloom.c @@ -314,6 +314,47 @@ __wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key) } /* + * __wt_bloom_inmem_get -- + * Tests whether the given key is in the Bloom filter. + * This can be used in place of __wt_bloom_get + * for Bloom filters that are memory only. + */ +int +__wt_bloom_inmem_get(WT_BLOOM *bloom, WT_ITEM *key) +{ + uint64_t h1, h2; + uint32_t i; + + h1 = __wt_hash_fnv64(key->data, key->size); + h2 = __wt_hash_city64(key->data, key->size); + for (i = 0; i < bloom->k; i++, h1 += h2) { + if (!__bit_test(bloom->bitstring, h1 % bloom->m)) + return (WT_NOTFOUND); + } + return (0); +} + +/* + * __wt_bloom_intersection -- + * Modify the Bloom filter to contain the intersection of this + * filter with another. + */ +int +__wt_bloom_intersection(WT_BLOOM *bloom, WT_BLOOM *other) +{ + uint64_t i, nbytes; + + if (bloom->k != other->k || bloom->factor != other->factor || + bloom->m != other->m || bloom->n != other->n) + return (EINVAL); + + nbytes = __bitstr_size(bloom->m); + for (i = 0; i < nbytes; i++) + bloom->bitstring[i] &= other->bitstring[i]; + return (0); +} + +/* * __wt_bloom_close -- * Close the Bloom filter, release any resources. */ diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 3290fd6374c..69512f45933 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -1093,6 +1093,7 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) cbt = (start != NULL) ? start : stop; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = cbt->btree; + WT_STAT_FAST_DATA_INCR(session, cursor_truncate); /* * We always delete in a forward direction because it's faster, assert diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 8edc40794e2..0f47c060daf 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -566,7 +566,7 @@ __debug_tree( /* A NULL page starts at the top of the tree -- it's a convenience. */ if (page == NULL) - page = S2BT(session)->root.page; + page = btree->root.page; WT_WITH_BTREE(session, btree, ret = __debug_page(ds, page, flags)); diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 757b7b51cdd..98c6390e0f4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -138,6 +138,8 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) WT_ERR(__wt_txn_modify_ref(session, ref)); *skipp = true; + WT_STAT_FAST_CONN_INCR(session, rec_page_delete_fast); + WT_STAT_FAST_DATA_INCR(session, rec_page_delete_fast); WT_PUBLISH(ref->state, WT_REF_DELETED); return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 3e611a107ab..dbdf94fc1b6 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -643,11 +643,13 @@ __btree_page_sizes(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; uint64_t cache_size; uint32_t intl_split_size, leaf_split_size; const char **cfg; btree = S2BT(session); + conn = S2C(session); cfg = btree->dhandle->cfg; /* @@ -688,8 +690,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session) WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval)); btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage); - if (!F_ISSET(S2C(session), WT_CONN_CACHE_POOL)) { - if ((cache_size = S2C(session)->cache_size) > 0) + if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) { + if ((cache_size = conn->cache_size) > 0) btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 4); } @@ -723,6 +725,17 @@ __btree_page_sizes(WT_SESSION_IMPL *session) /* * Get the maximum internal/leaf page key/value sizes. * + * In-memory configuration overrides any key/value sizes, there's no + * such thing as an overflow item in an in-memory configuration. + */ + if (F_ISSET(conn, WT_CONN_IN_MEMORY)) { + btree->maxintlkey = WT_BTREE_MAX_OBJECT_SIZE; + btree->maxleafkey = WT_BTREE_MAX_OBJECT_SIZE; + btree->maxleafvalue = WT_BTREE_MAX_OBJECT_SIZE; + return (0); + } + + /* * In historic versions of WiredTiger, the maximum internal/leaf page * key/value sizes were set by the internal_item_max and leaf_item_max * configuration strings. Look for those strings if we don't find the diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index e60f7b3fb02..389ac761c5b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -586,8 +586,8 @@ skip_evict: * CPU to no purpose. */ if (stalled) - wait_cnt += 1000; - else if (++wait_cnt < 1000) { + wait_cnt += WT_THOUSAND; + else if (++wait_cnt < WT_THOUSAND) { __wt_yield(); continue; } @@ -603,7 +603,7 @@ skip_evict: if (cache_work) continue; } - sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000); + sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000); WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 9e45bf10a5c..caba12b78f1 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -169,54 +169,58 @@ __split_safe_free(WT_SESSION_IMPL *session, return (__split_stash_add(session, split_gen, p, s)); } +#ifdef HAVE_DIAGNOSTIC /* - * __split_should_deepen -- - * Return if we should deepen the tree. + * __split_verify_intl_key_order -- + * Verify the key order on an internal page after a split, diagnostic only. */ -static bool -__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) +static void +__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; - WT_PAGE *page; - WT_PAGE_INDEX *pindex; + WT_ITEM *next, _next, *last, _last, *tmp; + WT_REF *ref; + uint64_t recno; + int cmp; + bool first; btree = S2BT(session); - page = ref->page; - - /* - * Our caller is holding the parent page locked to single-thread splits, - * which means we can safely look at the page's index without setting a - * split generation. - */ - pindex = WT_INTL_INDEX_GET_SAFE(page); - - /* - * Sanity check for a reasonable number of keys on-page keys. Splitting - * with too few keys leads to excessively deep trees. - */ - if (pindex->entries < 100) - return (false); - - /* - * Deepen the tree if the page's memory footprint is larger than the - * maximum size for a page in memory (presumably putting eviction - * pressure on the cache). - */ - if (page->memory_footprint > btree->maxmempage) - return (true); - /* - * Check if the page has enough keys to make it worth splitting. If - * the number of keys is allowed to grow too large, the cost of - * splitting into parent pages can become large enough to result - * in slow operations. - */ - if (!__wt_ref_is_root(ref) && - pindex->entries > btree->split_deepen_min_child) - return (true); + switch (page->type) { + case WT_PAGE_COL_INT: + recno = 0; /* Less than any valid record number. */ + WT_INTL_FOREACH_BEGIN(session, page, ref) { + WT_ASSERT(session, ref->key.recno > recno); + recno = ref->key.recno; + } WT_INTL_FOREACH_END; + break; + case WT_PAGE_ROW_INT: + next = &_next; + WT_CLEAR(_next); + last = &_last; + WT_CLEAR(_last); - return (false); + first = true; + WT_INTL_FOREACH_BEGIN(session, page, ref) { + __wt_ref_key(page, ref, &next->data, &next->size); + if (last->size == 0) { + if (first) + first = false; + else { + WT_ASSERT(session, __wt_compare( + session, btree->collator, last, + next, &cmp) == 0); + WT_ASSERT(session, cmp < 0); + } + } + tmp = last; + last = next; + next = tmp; + } WT_INTL_FOREACH_END; + break; + } } +#endif /* * __split_ovfl_key_cleanup -- @@ -267,47 +271,58 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref) } /* - * __split_ref_deepen_move -- - * Move a WT_REF from a parent to a child in service of a split to deepen - * the tree, including updating the accounting information. + * __split_ref_move -- + * Move a WT_REF from one page to another, including updating accounting + * information. */ static int -__split_ref_deepen_move(WT_SESSION_IMPL *session, - WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp) +__split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, + WT_REF **from_refp, size_t *decrp, WT_REF **to_refp, size_t *incrp) { WT_ADDR *addr; WT_CELL_UNPACK unpack; WT_DECL_RET; WT_IKEY *ikey; + WT_REF *ref; size_t size; void *key; + ref = *from_refp; + /* + * The from-home argument is the page into which the "from" WT_REF may + * point, for example, if there's an on-page key the "from" WT_REF + * references, it will be on the page "from-home". + * * Instantiate row-store keys, and column- and row-store addresses in - * the WT_REF structures referenced by a page that's being split (and - * deepening the tree). The WT_REF structures aren't moving, but the - * index references are moving from the page we're splitting to a set - * of child pages, and so we can no longer reference the block image - * that remains with the page being split. + * the WT_REF structures referenced by a page that's being split. The + * WT_REF structures aren't moving, but the index references are moving + * from the page we're splitting to a set of new pages, and so we can + * no longer reference the block image that remains with the page being + * split. * * No locking is required to update the WT_REF structure because we're - * the only thread splitting the parent page, and there's no way for - * readers to race with our updates of single pointers. The changes - * have to be written before the page goes away, of course, our caller - * owns that problem. - * - * Row-store keys, first. + * the only thread splitting the page, and there's no way for readers + * to race with our updates of single pointers. The changes have to be + * written before the page goes away, of course, our caller owns that + * problem. */ - if (parent->type == WT_PAGE_ROW_INT) { + if (from_home->type == WT_PAGE_ROW_INT) { + /* + * Row-store keys: if it's not yet instantiated, instantiate it. + * If already instantiated, check for overflow cleanup (overflow + * keys are always instantiated). + */ if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) { - __wt_ref_key(parent, ref, &key, &size); + __wt_ref_key(from_home, ref, &key, &size); WT_RET(__wt_row_ikey(session, 0, key, size, ref)); ikey = ref->key.ikey; } else { - WT_RET(__split_ovfl_key_cleanup(session, parent, ref)); - *parent_decrp += sizeof(WT_IKEY) + ikey->size; + WT_RET( + __split_ovfl_key_cleanup(session, from_home, ref)); + *decrp += sizeof(WT_IKEY) + ikey->size; } - *child_incrp += sizeof(WT_IKEY) + ikey->size; + *incrp += sizeof(WT_IKEY) + ikey->size; } /* @@ -316,7 +331,7 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, * get the address from the on-page cell. */ addr = ref->addr; - if (addr != NULL && !__wt_off_page(parent, addr)) { + if (addr != NULL && !__wt_off_page(from_home, addr)) { __wt_cell_unpack((WT_CELL *)ref->addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); if ((ret = __wt_strndup( @@ -330,364 +345,1048 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, ref->addr = addr; } - /* And finally, the WT_REF itself. */ - WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF)); + /* And finally, copy the WT_REF pointer itself. */ + *to_refp = ref; + WT_MEM_TRANSFER(*decrp, *incrp, sizeof(WT_REF)); return (0); } -#ifdef HAVE_DIAGNOSTIC /* - * __split_verify_intl_key_order -- - * Verify the key order on an internal page after a split, diagnostic only. + * __split_child_block_evict_and_split -- + * Ensure the newly created child isn't evicted or split for now. */ static void -__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) +__split_child_block_evict_and_split(WT_PAGE *child) { - WT_BTREE *btree; - WT_ITEM *next, _next, *last, _last, *tmp; - WT_REF *ref; - uint64_t recno; - int cmp; - bool first; + /* + * Once the split is live, newly created internal pages might be evicted + * and their WT_REF structures freed. If that happens before all threads + * exit the index of the page which previously "owned" the WT_REF, a + * thread might see a freed WT_REF. To ensure that doesn't happen, the + * newly created page's modify structure has a field with a transaction + * ID that's checked before any internal page is evicted. Unfortunately, + * we don't know the correct value until we update the original page's + * index (we need a transaction ID from after that update), but the act + * of updating the original page's index is what allows the eviction to + * happen. + * + * Once the split is live, newly created internal pages might themselves + * split. The split itself is not the problem: if a page splits before + * we fix up its WT_REF (in other words, a WT_REF we move is then moved + * again, before we reset the underlying page's parent reference), it's + * OK because the test we use to find a WT_REF and WT_PAGE that require + * fixing up is only that the WT_REF points to the wrong parent, not it + * points to a specific wrong parent. The problem is our fix up of the + * WT_REFs in the created page could race with the subsequent fix of the + * same WT_REFs (in a different created page), we'd have to acquire some + * lock to prevent that race, and that's going to be difficult at best. + * + * For now, block eviction and splits in newly created pages until they + * have been fixed up. + */ + F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); +} - btree = S2BT(session); +/* + * __split_ref_move_final -- + * Finalize the moved WT_REF structures after the split succeeds. + */ +static int +__split_ref_move_final( + WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries) +{ + WT_DECL_RET; + WT_PAGE *child; + WT_REF *ref, *child_ref; + uint64_t txn_new_id; + uint32_t i; - switch (page->type) { - case WT_PAGE_COL_INT: - recno = 0; /* Less than any valid record number. */ - WT_INTL_FOREACH_BEGIN(session, page, ref) { - WT_ASSERT(session, ref->key.recno > recno); - recno = ref->key.recno; - } WT_INTL_FOREACH_END; - break; - case WT_PAGE_ROW_INT: - next = &_next; - WT_CLEAR(_next); - last = &_last; - WT_CLEAR(_last); + /* + * When creating new internal pages as part of a split, we set a field + * in those pages modify structure to prevent them from being evicted + * until all threads are known to have exited the index of the page that + * previously "owned" the WT_REF. Set that field to a safe value. + */ + txn_new_id = __wt_txn_new_id(session); - first = true; - WT_INTL_FOREACH_BEGIN(session, page, ref) { - __wt_ref_key(page, ref, &next->data, &next->size); - if (last->size == 0) { - if (first) - first = false; - else { - WT_ASSERT(session, __wt_compare( - session, btree->collator, last, - next, &cmp) == 0); - WT_ASSERT(session, cmp < 0); - } + /* + * The WT_REF structures moved to newly allocated child pages reference + * the wrong parent page and we have to fix that up. The problem is + * revealed when a thread of control searches for the child page's + * reference structure slot, and fails to find it because the parent + * page being searched no longer references the child. When that failure + * happens the thread waits for the reference's home page to be updated, + * which we do here: walk the children and fix them up. + */ + for (i = 0; i < entries; ++i, ++refp) { + ref = *refp; + + /* + * We don't hold hazard pointers on created pages, they cannot + * be evicted because the page-modify transaction value set as + * they were created prevents eviction. (See above, we reset + * that value as part of fixing up the page.) But, an eviction + * thread might be attempting to evict the page (the WT_REF may + * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF + * may be WT_REF_READING), or it may be in some other state. + * Acquire a hazard pointer for any in-memory pages so we know + * the state of the page. Ignore pages not in-memory (deleted, + * on-disk, being read), there's no in-memory structure to fix. + */ + if ((ret = __wt_page_in(session, + ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) + continue; + WT_ERR(ret); + + child = ref->page; +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, child)); +#endif + /* + * We use a page flag to prevent the child from splitting from + * underneath us, but the split-generation error checks don't + * know about that flag; use the standard macros to ensure that + * reading the child's page index structure is safe. + */ + WT_ENTER_PAGE_INDEX(session); + WT_INTL_FOREACH_BEGIN(session, child, child_ref) { + /* + * The page's home reference may not be wrong, as we + * opened up access from the top of the tree already, + * disk pages may have been read in since then, and + * those pages would have correct parent references. + */ + if (child_ref->home != child) { + child_ref->home = child; + child_ref->pindex_hint = 0; + + child->modify->mod_split_txn = txn_new_id; } - tmp = last; - last = next; - next = tmp; } WT_INTL_FOREACH_END; - break; + WT_LEAVE_PAGE_INDEX(session); + + /* The child can now be evicted or split. */ + F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + + WT_ERR(__wt_hazard_clear(session, child)); } + + /* + * Push out the changes: not required for correctness, but don't let + * threads spin on incorrect page references longer than necessary. + */ + WT_FULL_BARRIER(); + return (0); + +err: /* Something really bad just happened. */ + WT_PANIC_RET(session, ret, "fatal error resolving a split"); } -#endif /* - * __split_deepen -- - * Split an internal page in-memory, deepening the tree. + * __split_root -- + * Split the root page in-memory, deepening the tree. */ static int -__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) +__split_root(WT_SESSION_IMPL *session, WT_PAGE *root) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *child; WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; WT_REF **alloc_refp; - WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref; - size_t child_incr, parent_decr, parent_incr, size; + WT_REF **child_refp, *ref, **root_refp; + size_t child_incr, root_decr, root_incr, size; uint64_t split_gen; - uint32_t children, chunk, i, j, moved_entries, new_entries, remain; - uint32_t skip_leading, slots; + uint32_t children, chunk, i, j, remain; + uint32_t slots; bool complete; void *p; WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); btree = S2BT(session); alloc_index = NULL; - parent_incr = parent_decr = 0; + root_decr = root_incr = 0; complete = false; + /* The root page will be marked dirty, make sure that will succeed. */ + WT_RET(__wt_page_modify_init(session, root)); + /* - * Our caller is holding the parent page locked to single-thread splits, + * Our caller is holding the root page locked to single-thread splits, * which means we can safely look at the page's index without setting a * split generation. */ - pindex = WT_INTL_INDEX_GET_SAFE(parent); + pindex = WT_INTL_INDEX_GET_SAFE(root); /* - * A prepending/appending workload will repeatedly deepen parts of the - * tree that aren't changing, and appending workloads are not uncommon. - * First, keep the first/last pages of the tree at their current level, - * to catch simple workloads. Second, track the number of entries which - * resulted from the last time we deepened this page, and if we refilled - * this page without splitting into those slots, ignore them for this - * split. It's not exact because an eviction might split into any part - * of the page: if 80% of the splits are at the end of the page, assume - * an append-style workload. Of course, the plan eventually fails: when - * repeatedly deepening this page for an append-only workload, we will - * progressively ignore more and more of the slots. When ignoring 90% of - * the slots, deepen the entire page again. - * - * Figure out how many slots we're leaving at this level and how many - * child pages we're creating. + * Decide how many child pages to create, then calculate the standard + * chunk and whatever remains. Sanity check the number of children: + * the decision to split matched to the deepen-per-child configuration + * might get it wrong. */ -#undef skip_trailing -#define skip_trailing 1 - skip_leading = 1; - new_entries = pindex->entries - parent->pg_intl_deepen_split_last; - if (parent->pg_intl_deepen_split_append > (new_entries * 8) / 10) - skip_leading = parent->pg_intl_deepen_split_last; - if (skip_leading > (pindex->entries * 9) * 10) - skip_leading = 1; - - /* - * In a few (rare) cases we split pages with only a few entries, and in - * those cases we keep it simple, 10 children, skip only first and last - * entries. Otherwise, split into a lot of child pages. - */ - moved_entries = pindex->entries - (skip_leading + skip_trailing); - children = moved_entries / btree->split_deepen_per_child; + children = pindex->entries / btree->split_deepen_per_child; if (children < 10) { + if (pindex->entries < 100) + return (EBUSY); children = 10; - skip_leading = 1; - moved_entries = - pindex->entries - (skip_leading + skip_trailing); } + chunk = pindex->entries / children; + remain = pindex->entries - chunk * (children - 1); WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, - "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children", - parent, pindex->entries, children)); + "%p: %" PRIu32 " root page elements, splitting into %" PRIu32 + " children", + root, pindex->entries, children)); /* - * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize - * the slots of the allocated WT_PAGE_INDEX to point to the pages we're - * keeping at the current level, and the rest of the slots to point to - * new WT_REF objects. + * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted + * into the root page, replacing the root's page-index. */ - size = sizeof(WT_PAGE_INDEX) + - (children + skip_leading + skip_trailing) * sizeof(WT_REF *); + size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); - parent_incr += size; + root_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); - alloc_index->entries = children + skip_leading + skip_trailing; - for (alloc_refp = alloc_index->index, - i = 0; i < skip_leading; ++alloc_refp, ++i) - alloc_index->index[i] = pindex->index[i]; - for (i = 0; i < children; ++alloc_refp, ++i) + alloc_index->entries = children; + alloc_refp = alloc_index->index; + for (i = 0; i < children; alloc_refp++, ++i) WT_ERR(__wt_calloc_one(session, alloc_refp)); - parent_incr += children * sizeof(WT_REF); - alloc_index->index[alloc_index->entries - 1] = - pindex->index[pindex->entries - 1]; + root_incr += children * sizeof(WT_REF); /* Allocate child pages, and connect them into the new page index. */ - chunk = moved_entries / children; - remain = moved_entries - chunk * (children - 1); - for (parent_refp = pindex->index + skip_leading, - alloc_refp = alloc_index->index + skip_leading, - i = 0; i < children; ++i) { + for (root_refp = pindex->index, + alloc_refp = alloc_index->index, i = 0; i < children; ++i) { slots = i == children - 1 ? remain : chunk; WT_ERR(__wt_page_alloc( - session, parent->type, 0, slots, false, &child)); + session, root->type, 0, slots, false, &child)); /* - * Initialize the parent page's child reference; we need a copy - * of the page's key. + * Initialize the page's child reference; we need a copy of the + * page's key. */ ref = *alloc_refp++; - ref->home = parent; + ref->home = root; ref->page = child; ref->addr = NULL; - if (parent->type == WT_PAGE_ROW_INT) { - __wt_ref_key(parent, *parent_refp, &p, &size); + if (root->type == WT_PAGE_ROW_INT) { + __wt_ref_key(root, *root_refp, &p, &size); WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); - parent_incr += sizeof(WT_IKEY) + size; + root_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = (*parent_refp)->key.recno; + ref->key.recno = (*root_refp)->key.recno; ref->state = WT_REF_MEM; /* Initialize the child page. */ - if (parent->type == WT_PAGE_COL_INT) - child->pg_intl_recno = (*parent_refp)->key.recno; + if (root->type == WT_PAGE_COL_INT) + child->pg_intl_recno = (*root_refp)->key.recno; child->pg_intl_parent_ref = ref; /* Mark it dirty. */ WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); - /* - * Once the split goes live, the newly created internal pages - * might be evicted and their WT_REF structures freed. If those - * pages are evicted before threads exit the previous page index - * array, a thread might see a freed WT_REF. Set the eviction - * transaction requirement for the newly created internal pages. - */ - child->modify->mod_split_txn = __wt_txn_new_id(session); + /* Ensure the page isn't evicted or split for now. */ + __split_child_block_evict_and_split(child); /* * The newly allocated child's page index references the same - * structures as the parent. (We cannot move WT_REF structures, + * structures as the root. (We cannot move WT_REF structures, * threads may be underneath us right now changing the structure * state.) However, if the WT_REF structures reference on-page * information, we have to fix that, because the disk image for * the page that has an page index entry for the WT_REF is about * to change. */ - child_incr = 0; child_pindex = WT_INTL_INDEX_GET_SAFE(child); - for (child_refp = child_pindex->index, j = 0; j < slots; ++j) { - WT_ERR(__split_ref_deepen_move(session, - parent, *parent_refp, &parent_decr, &child_incr)); - *child_refp++ = *parent_refp++; - } + child_incr = 0; + for (child_refp = child_pindex->index, + j = 0; j < slots; ++child_refp, ++root_refp, ++j) + WT_ERR(__split_ref_move(session, root, + root_refp, &root_decr, child_refp, &child_incr)); + __wt_cache_page_inmem_incr(session, child, child_incr); } WT_ASSERT(session, - alloc_refp - alloc_index->index == - (ptrdiff_t)(alloc_index->entries - skip_trailing)); - WT_ASSERT(session, parent_refp - pindex->index == - (ptrdiff_t)(pindex->entries - skip_trailing)); + alloc_refp - alloc_index->index == (ptrdiff_t)alloc_index->entries); + WT_ASSERT(session, + root_refp - pindex->index == (ptrdiff_t)pindex->entries); /* - * Confirm the parent page's index hasn't moved, then update it, which + * Confirm the root page's index hasn't moved, then update it, which * makes the split visible to threads descending the tree. From this * point on, we're committed to the split. * * A note on error handling: until this point, there's no problem with * unwinding on error. We allocated a new page index, a new set of * WT_REFs and a new set of child pages -- if an error occurred, the - * parent remained unchanged, although it may have an incorrect memory - * footprint. From now on we've modified the parent page, attention + * root remained unchanged, although it may have an incorrect memory + * footprint. From now on we've modified the root page, attention * needs to be paid. However, subsequent failures are relatively benign, * the split is OK and complete. For that reason, we ignore errors past * this point unless there's a panic. */ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); + WT_INTL_INDEX_SET(root, alloc_index); + complete = true; + +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, root)); +#endif + /* Fix up the moved WT_REF structures. */ + WT_ERR(__split_ref_move_final( + session, alloc_index->index, alloc_index->entries)); + + /* We've installed the allocated page-index, ensure error handling. */ + alloc_index = NULL; + + /* + * We can't free the previous root's index, there may be threads using + * it. Add to the session's discard list, to be freed once we know no + * threads can still be using it. + * + * This change requires care with error handling: we have already + * updated the page with a new index. Even if stashing the old value + * fails, we don't roll back that change, because threads may already + * be using the new index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); + root_decr += size; + + /* Adjust the root's memory footprint and mark it dirty. */ + __wt_cache_page_inmem_incr(session, root, root_incr); + __wt_cache_page_inmem_decr(session, root, root_decr); + __wt_page_modify_set(session, root); + +err: /* + * If complete is true, we saw an error after opening up the tree to + * descent through the root page's new index. There is nothing we + * can do, there are threads potentially active in both versions of + * the tree. + * + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened, and our caller has to proceed + * with the split. + */ + if (!complete) + __wt_free_ref_index(session, root, alloc_index, true); + + if (ret != 0 && ret != WT_PANIC) + __wt_err(session, ret, + "ignoring not-fatal error during root page split to " + "deepen the tree"); + return (ret == WT_PANIC || !complete ? ret : 0); +} + +/* + * __split_parent -- + * Resolve a multi-page split, inserting new information into the parent. + */ +static int +__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, + uint32_t new_entries, size_t parent_incr, bool exclusive, bool discard) +{ + WT_DECL_ITEM(scr); + WT_DECL_RET; + WT_IKEY *ikey; + WT_PAGE *parent; + WT_PAGE_INDEX *alloc_index, *pindex; + WT_REF **alloc_refp, *next_ref; + size_t parent_decr, size; + uint64_t split_gen; + uint32_t i, j; + uint32_t deleted_entries, parent_entries, result_entries; + uint32_t *deleted_refs; + bool complete, empty_parent; + + parent = ref->home; + + alloc_index = pindex = NULL; + parent_decr = 0; + parent_entries = 0; + complete = empty_parent = false; + + /* The parent page will be marked dirty, make sure that will succeed. */ + WT_RET(__wt_page_modify_init(session, parent)); + + /* + * We've locked the parent, which means it cannot split (which is the + * only reason to worry about split generation values). + */ + pindex = WT_INTL_INDEX_GET_SAFE(parent); + parent_entries = pindex->entries; + + /* + * Remove any refs to deleted pages while we are splitting, we have + * the internal page locked down, and are copying the refs into a new + * array anyway. Switch them to the special split state, so that any + * reading thread will restart. + */ + WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); + for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); + if ((discard && next_ref == ref) || + (next_ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, next_ref, true) && + __wt_atomic_casv32( + &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))) { + WT_ERR(__wt_buf_grow(session, scr, + (deleted_entries + 1) * sizeof(uint32_t))); + deleted_refs = scr->mem; + deleted_refs[deleted_entries++] = i; + } + } + + /* + * The final entry count consists of the original count, plus any new + * pages, less any WT_REFs we're removing (deleted entries plus the + * entry we're replacing). + */ + result_entries = (parent_entries + new_entries) - deleted_entries; + if (!discard) + --result_entries; + + /* + * If there are no remaining entries on the parent, give up, we can't + * leave an empty internal page. Mark it to be evicted soon and clean + * up any references that have changed state. + */ + if (result_entries == 0) { + empty_parent = true; + __wt_page_evict_soon(parent); + goto err; + } + + /* + * Allocate and initialize a new page index array for the parent, then + * copy references from the original index array, plus references from + * the newly created split array, into place. + */ + size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); + parent_incr += size; + alloc_index->index = (WT_REF **)(alloc_index + 1); + alloc_index->entries = result_entries; + for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref == ref) + for (j = 0; j < new_entries; ++j) { + ref_new[j]->home = parent; + *alloc_refp++ = ref_new[j]; + } + else if (next_ref->state != WT_REF_SPLIT) + /* Skip refs we have marked for deletion. */ + *alloc_refp++ = next_ref; + } + + /* Check that we filled in all the entries. */ + WT_ASSERT(session, + alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); + + /* + * Confirm the parent page's index hasn't moved then update it, which + * makes the split visible to threads descending the tree. + */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); WT_INTL_INDEX_SET(parent, alloc_index); - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - complete = true; + alloc_index = NULL; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent)); #endif + /* - * Save the number of entries created by deepening the tree and reset - * the count of splits into this page after that point. + * If discarding the page's original WT_REF field, reset it to split. + * Threads cursoring through the tree were blocked because that WT_REF + * state was set to locked. Changing the locked state to split unblocks + * those threads and causes them to re-calculate their position based + * on the just-updated parent page's index. */ - parent->pg_intl_deepen_split_append = 0; - parent->pg_intl_deepen_split_last = alloc_index->entries; + if (discard) + WT_PUBLISH(ref->state, WT_REF_SPLIT); /* - * The moved reference structures now reference the wrong parent page, - * and we have to fix that up. The problem is revealed when a thread - * of control searches for a page's reference structure slot, and fails - * to find it because the page it's searching no longer references it. - * When that failure happens, the thread waits for the reference's home - * page to be updated, which we do here: walk the children and fix them - * up. + * Push out the changes: not required for correctness, but don't let + * threads spin on incorrect page references longer than necessary. + */ + WT_FULL_BARRIER(); + + /* + * A note on error handling: failures before we swapped the new page + * index into the parent can be resolved by freeing allocated memory + * because the original page is unchanged, we can continue to use it + * and we have not yet modified the parent. Failures after we swap + * the new page index into the parent are also relatively benign, the + * split is OK and complete. For those reasons, we ignore errors past + * this point unless there's a panic. + */ + complete = true; + + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32 + " (%s%" PRIu32 ")", + ref->page, ref->page == NULL ? + "unknown page type" : __wt_page_type_string(ref->page->type), + ref->page == NULL ? "reverse " : "", parent, + parent_entries, result_entries, + ref->page == NULL ? "-" : "+", + ref->page == NULL ? + parent_entries - result_entries : result_entries - parent_entries)); + + /* + * The new page index is in place, free the WT_REF we were splitting and + * any deleted WT_REFs we found, modulo the usual safe free semantics. * - * We're not acquiring hazard pointers on these pages, they cannot be - * evicted because of the eviction transaction value set above. - */ - for (parent_refp = alloc_index->index, - i = alloc_index->entries; i > 0; ++parent_refp, --i) { - parent_ref = *parent_refp; - WT_ASSERT(session, parent_ref->home == parent); - if (parent_ref->state != WT_REF_MEM) - continue; + * Acquire a new split generation. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) { + next_ref = pindex->index[deleted_refs[i]]; + WT_ASSERT(session, next_ref->state == WT_REF_SPLIT); /* - * We left the first/last children of the parent at the current - * level to avoid bad split patterns, they might be leaf pages; - * check the page type before we continue. - */ - child = parent_ref->page; - if (!WT_PAGE_IS_INTERNAL(child)) - continue; -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, child)); -#endif - /* - * We have the parent locked, but there's nothing to prevent - * this child from splitting beneath us; ensure that reading - * the child's page index structure is safe. + * We set the WT_REF to split, discard it, freeing any resources + * it holds. + * + * Row-store trees where the old version of the page is being + * discarded: the previous parent page's key for this child page + * may have been an on-page overflow key. In that case, if the + * key hasn't been deleted, delete it now, including its backing + * blocks. We are exchanging the WT_REF that referenced it for + * the split page WT_REFs and their keys, and there's no longer + * any reference to it. Done after completing the split (if we + * failed, we'd leak the underlying blocks, but the parent page + * would be unaffected). */ - WT_ENTER_PAGE_INDEX(session); - WT_INTL_FOREACH_BEGIN(session, child, child_ref) { + if (parent->type == WT_PAGE_ROW_INT) { + WT_TRET(__split_ovfl_key_cleanup( + session, parent, next_ref)); + ikey = __wt_ref_key_instantiated(next_ref); + if (ikey != NULL) { + size = sizeof(WT_IKEY) + ikey->size; + WT_TRET(__split_safe_free( + session, split_gen, exclusive, ikey, size)); + parent_decr += size; + } /* - * The page's parent reference may not be wrong, as we - * opened up access from the top of the tree already, - * pages may have been read in since then. Check and - * only update pages that reference the original page, - * they must be wrong. + * The page_del structure can be freed immediately: it + * is only read when the ref state is WT_REF_DELETED. + * The size of the structure wasn't added to the parent, + * don't decrement. */ - if (child_ref->home == parent) { - child_ref->home = child; - child_ref->pindex_hint = 0; + if (next_ref->page_del != NULL) { + __wt_free(session, + next_ref->page_del->update_list); + __wt_free(session, next_ref->page_del); } - } WT_INTL_FOREACH_END; - WT_LEAVE_PAGE_INDEX(session); + } + + WT_TRET(__split_safe_free( + session, split_gen, exclusive, next_ref, sizeof(WT_REF))); + parent_decr += sizeof(WT_REF); } + /* We freed the reference that was split in the loop above. */ + ref = NULL; + /* - * Push out the changes: not required for correctness, but don't let - * threads spin on incorrect page references longer than necessary. + * We can't free the previous page index, there may be threads using it. + * Add it to the session discard list, to be freed when it's safe. */ - WT_FULL_BARRIER(); - alloc_index = NULL; + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); + parent_decr += size; + /* Adjust the parent's memory footprint and mark it dirty. */ + __wt_cache_page_inmem_incr(session, parent, parent_incr); + __wt_cache_page_inmem_decr(session, parent, parent_decr); + __wt_page_modify_set(session, parent); + +err: __wt_scr_free(session, &scr); /* - * We can't free the previous parent's index, there may be threads using - * it. Add to the session's discard list, to be freed once we know no - * threads can still be using it. + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened, and our caller has to proceed + * with the split. + */ + if (!complete) { + for (i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref->state == WT_REF_SPLIT) + next_ref->state = WT_REF_DELETED; + } + + __wt_free_ref_index(session, NULL, alloc_index, false); + + /* + * The split couldn't proceed because the parent would be empty, + * return EBUSY so our caller knows to unlock the WT_REF that's + * being deleted, but don't be noisy, there's nothing wrong. + */ + if (empty_parent) + return (EBUSY); + } + + if (ret != 0 && ret != WT_PANIC) + __wt_err(session, ret, + "ignoring not-fatal error during parent page split"); + return (ret == WT_PANIC || !complete ? ret : 0); +} + +/* + * __split_internal -- + * Split an internal page into its parent. + */ +static int +__split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *child; + WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index; + WT_REF **alloc_refp; + WT_REF **child_refp, *page_ref, **page_refp, *ref; + size_t child_incr, page_decr, page_incr, parent_incr, size; + uint64_t split_gen; + uint32_t children, chunk, i, j, remain; + uint32_t slots; + bool complete; + void *p; + + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); + + /* The page will be marked dirty, make sure that will succeed. */ + WT_RET(__wt_page_modify_init(session, page)); + + btree = S2BT(session); + alloc_index = replace_index = NULL; + page_ref = page->pg_intl_parent_ref; + page_decr = page_incr = parent_incr = 0; + complete = false; + + /* + * Our caller is holding the page locked to single-thread splits, which + * means we can safely look at the page's index without setting a split + * generation. + */ + pindex = WT_INTL_INDEX_GET_SAFE(page); + + /* + * Decide how many child pages to create, then calculate the standard + * chunk and whatever remains. Sanity check the number of children: + * the decision to split matched to the deepen-per-child configuration + * might get it wrong. + */ + children = pindex->entries / btree->split_deepen_per_child; + if (children < 10) { + if (pindex->entries < 100) + return (EBUSY); + children = 10; + } + chunk = pindex->entries / children; + remain = pindex->entries - chunk * (children - 1); + + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "%p: %" PRIu32 " internal page elements, splitting %" PRIu32 + " children into parent %p", + page, pindex->entries, children, parent)); + + /* + * Ideally, we'd discard the original page, but that's hard since other + * threads of control are using it (for example, if eviction is walking + * the tree and looking at the page.) Instead, perform a right-split, + * moving all except the first chunk of the page's WT_REF objects to new + * pages. * - * This change requires care with error handling: we have already - * updated the page with a new index. Even if stashing the old value - * fails, we don't roll back that change, because threads may already - * be using the new index. + * Create and initialize a replacement WT_PAGE_INDEX for the original + * page. */ - size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, split_gen, 0, pindex, size)); - parent_decr += size; + size = sizeof(WT_PAGE_INDEX) + chunk * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &replace_index)); + page_incr += size; + replace_index->index = (WT_REF **)(replace_index + 1); + replace_index->entries = chunk; + for (page_refp = pindex->index, i = 0; i < chunk; ++i) + replace_index->index[i] = *page_refp++; /* - * Adjust the parent's memory footprint. + * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted + * into the page's parent, replacing the page's page-index. + * + * The first slot of the new WT_PAGE_INDEX is the original page WT_REF. + * The remainder of the slots are allocated WT_REFs. */ - __wt_cache_page_inmem_incr(session, parent, parent_incr); - __wt_cache_page_inmem_decr(session, parent, parent_decr); + size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); + parent_incr += size; + alloc_index->index = (WT_REF **)(alloc_index + 1); + alloc_index->entries = children; + alloc_refp = alloc_index->index; + *alloc_refp++ = page_ref; + for (i = 1; i < children; ++alloc_refp, ++i) + WT_ERR(__wt_calloc_one(session, alloc_refp)); + parent_incr += children * sizeof(WT_REF); + + /* Allocate child pages, and connect them into the new page index. */ + WT_ASSERT(session, page_refp == pindex->index + chunk); + for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) { + slots = i == children - 1 ? remain : chunk; + WT_ERR(__wt_page_alloc( + session, page->type, 0, slots, false, &child)); + + /* + * Initialize the page's child reference; we need a copy of the + * page's key. + */ + ref = *alloc_refp++; + ref->home = parent; + ref->page = child; + ref->addr = NULL; + if (page->type == WT_PAGE_ROW_INT) { + __wt_ref_key(page, *page_refp, &p, &size); + WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); + parent_incr += sizeof(WT_IKEY) + size; + } else + ref->key.recno = (*page_refp)->key.recno; + ref->state = WT_REF_MEM; + + /* Initialize the child page. */ + if (page->type == WT_PAGE_COL_INT) + child->pg_intl_recno = (*page_refp)->key.recno; + child->pg_intl_parent_ref = ref; + + /* Mark it dirty. */ + WT_ERR(__wt_page_modify_init(session, child)); + __wt_page_modify_set(session, child); + + /* Ensure the page isn't evicted or split for now. */ + __split_child_block_evict_and_split(child); + + /* + * The newly allocated child's page index references the same + * structures as the parent. (We cannot move WT_REF structures, + * threads may be underneath us right now changing the structure + * state.) However, if the WT_REF structures reference on-page + * information, we have to fix that, because the disk image for + * the page that has an page index entry for the WT_REF is about + * to be discarded. + */ + child_pindex = WT_INTL_INDEX_GET_SAFE(child); + child_incr = 0; + for (child_refp = child_pindex->index, + j = 0; j < slots; ++child_refp, ++page_refp, ++j) + WT_ERR(__split_ref_move(session, page, + page_refp, &page_decr, child_refp, &child_incr)); + + __wt_cache_page_inmem_incr(session, child, child_incr); + } + WT_ASSERT(session, alloc_refp - + alloc_index->index == (ptrdiff_t)alloc_index->entries); + WT_ASSERT(session, + page_refp - pindex->index == (ptrdiff_t)pindex->entries); + + /* Split into the parent. */ + WT_ERR(__split_parent(session, page_ref, alloc_index->index, + alloc_index->entries, parent_incr, false, false)); + + /* + * A note on error handling: until this point, there's no problem with + * unwinding on error. We allocated a new page index, a new set of + * WT_REFs and a new set of child pages -- if an error occurred, the + * page remained unchanged, although it may have an incorrect memory + * footprint. From now on we've modified the parent page, attention + * needs to be paid. However, subsequent failures are relatively benign, + * the split is OK and complete. For that reason, we ignore errors past + * this point unless there's a panic. + */ + complete = true; + + /* Confirm the page's index hasn't moved, then update it. */ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); + WT_INTL_INDEX_SET(page, replace_index); + +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, parent)); + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, page)); +#endif + + /* Fix up the moved WT_REF structures. */ + WT_ERR(__split_ref_move_final( + session, alloc_index->index + 1, alloc_index->entries - 1)); + + /* + * We don't care about the page-index we allocated, all we needed was + * the array of WT_REF structures, which has now been split into the + * parent page. + */ + __wt_free(session, alloc_index); + + /* + * We can't free the previous page's index, there may be threads using + * it. Add to the session's discard list, to be freed once we know no + * threads can still be using it. + * + * This change requires care with error handling, we've already updated + * the parent page. Even if stashing the old value fails, we don't roll + * back that change, because threads may already be using the new parent + * page. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); + page_decr += size; + + /* Adjust the page's memory footprint, and mark it dirty. */ + __wt_cache_page_inmem_incr(session, page, page_incr); + __wt_cache_page_inmem_decr(session, page, page_decr); + __wt_page_modify_set(session, page); err: /* * If complete is true, we saw an error after opening up the tree to - * descent through the parent page's new index. There is nothing we - * can do, there are threads potentially active in both versions of - * the tree. + * descent through the page's new index. There is nothing we can do, + * there are threads potentially active in both versions of the tree. * * A note on error handling: if we completed the split, return success, * nothing really bad can have happened, and our caller has to proceed * with the split. */ - if (!complete) - __wt_free_ref_index(session, parent, alloc_index, true); + if (!complete) { + __wt_free_ref_index(session, page, alloc_index, true); + __wt_free_ref_index(session, page, replace_index, false); + } if (ret != 0 && ret != WT_PANIC) __wt_err(session, ret, - "ignoring not-fatal error during parent page split to " - "deepen the tree"); + "ignoring not-fatal error during internal page split"); return (ret == WT_PANIC || !complete ? ret : 0); } /* + * __split_internal_lock -- + * Lock an internal page. + */ +static int +__split_internal_lock( + WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp) +{ + WT_DECL_RET; + WT_PAGE *parent; + WT_REF *parent_ref; + + *hazardp = false; + *parentp = NULL; + + /* + * A checkpoint reconciling this parent page can deadlock with + * our split. We have an exclusive page lock on the child before + * we acquire the page's reconciliation lock, and reconciliation + * acquires the page's reconciliation lock before it encounters + * the child's exclusive lock (which causes reconciliation to + * loop until the exclusive lock is resolved). If we want to split + * the parent, give up to avoid that deadlock. + */ + if (S2BT(session)->checkpointing != WT_CKPT_OFF) + return (EBUSY); + + /* + * Get a page-level lock on the parent to single-thread splits into the + * page because we need to single-thread sizing/growing the page index. + * It's OK to queue up multiple splits as the child pages split, but the + * actual split into the parent has to be serialized. Note we allocate + * memory inside of the lock and may want to invest effort in making the + * locked period shorter. + * + * We use the reconciliation lock here because not only do we have to + * single-thread the split, we have to lock out reconciliation of the + * parent because reconciliation of the parent can't deal with finding + * a split child during internal page traversal. Basically, there's no + * reason to use a different lock if we have to block reconciliation + * anyway. + */ + for (;;) { + parent = ref->home; + + /* Skip pages that aren't ready to split. */ + if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK)) + return (EBUSY); + + WT_RET(__wt_fair_lock(session, &parent->page_lock)); + if (parent == ref->home) + break; + WT_RET(__wt_fair_unlock(session, &parent->page_lock)); + } + + /* + * We have exclusive access to split the parent, and at this point, the + * child prevents the parent from being evicted. However, once we + * update the parent's index, it may no longer refer to the child, and + * could conceivably be evicted. Get a hazard pointer on the parent + * now, so that we can safely access it after updating the index. + * + * Take care getting the page doesn't trigger eviction work: we could + * block trying to split a different child of our parent and deadlock + * or we could be the eviction server relied upon by other threads to + * populate the eviction queue. + */ + if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) { + WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT)); + *hazardp = true; + } + + *parentp = parent; + return (0); + +err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); + return (ret); +} + +/* + * __split_internal_unlock -- + * Unlock the parent page. + */ +static int +__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) +{ + WT_DECL_RET; + + if (hazard) + ret = __wt_hazard_clear(session, parent); + + WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); + return (ret); +} + +/* + * __split_internal_should_split -- + * Return if we should split an internal page. + */ +static bool +__split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_BTREE *btree; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + + btree = S2BT(session); + page = ref->page; + + /* + * Our caller is holding the parent page locked to single-thread splits, + * which means we can safely look at the page's index without setting a + * split generation. + */ + pindex = WT_INTL_INDEX_GET_SAFE(page); + + /* Sanity check for a reasonable number of on-page keys. */ + if (pindex->entries < 100) + return (false); + + /* + * Deepen the tree if the page's memory footprint is larger than the + * maximum size for a page in memory (presumably putting eviction + * pressure on the cache). + */ + if (page->memory_footprint > btree->maxmempage) + return (true); + + /* + * Check if the page has enough keys to make it worth splitting. If + * the number of keys is allowed to grow too large, the cost of + * splitting into parent pages can become large enough to result + * in slow operations. + */ + if (pindex->entries > btree->split_deepen_min_child) + return (true); + + return (false); +} + +/* + * __split_parent_climb -- + * Check if we should split up the tree. + */ +static int +__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) +{ + WT_DECL_RET; + WT_PAGE *parent; + WT_REF *ref; + bool parent_hazard; + + /* + * Page splits trickle up the tree, that is, as leaf pages grow large + * enough and are evicted, they'll split into their parent. And, as + * that parent page grows large enough and is evicted, it splits into + * its parent and so on. When the page split wave reaches the root, + * the tree will permanently deepen as multiple root pages are written. + * + * However, this only helps if internal pages are evicted (and we resist + * evicting internal pages for obvious reasons), or if the tree were to + * be closed and re-opened from a disk image, which may be a rare event. + * + * To avoid internal pages becoming too large absent eviction, check + * parent pages each time pages are split into them. If the page is big + * enough, either split the page into its parent or, in the case of the + * root, deepen the tree. + * + * Split up the tree. + */ + for (;;) { + parent = NULL; + parent_hazard = false; + ref = page->pg_intl_parent_ref; + + /* If we don't need to split the page, we're done. */ + if (!__split_internal_should_split(session, ref)) + break; + + /* + * If we've reached the root page, there are no subsequent pages + * to review, deepen the tree and quit. + */ + if (__wt_ref_is_root(ref)) { + ret = __split_root(session, page); + break; + } + + /* + * Lock the parent and split into it, then swap the parent/page + * locks, lock-coupling up the tree. + */ + WT_ERR(__split_internal_lock( + session, ref, &parent, &parent_hazard)); + ret = __split_internal(session, parent, page); + WT_TRET(__split_internal_unlock(session, page, page_hazard)); + + page = parent; + page_hazard = parent_hazard; + parent = NULL; + parent_hazard = false; + WT_ERR(ret); + } + +err: if (parent != NULL) + WT_TRET( + __split_internal_unlock(session, parent, parent_hazard)); + WT_TRET(__split_internal_unlock(session, page, page_hazard)); + + /* A page may have been busy, in which case return without error. */ + WT_RET_BUSY_OK(ret); + return (0); +} + +/* * __split_multi_inmem -- * Instantiate a page in a multi-block set. */ @@ -901,369 +1600,6 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, } /* - * __split_parent_lock -- - * Lock the parent page. - */ -static int -__split_parent_lock( - WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp) -{ - WT_DECL_RET; - WT_PAGE *parent; - WT_REF *parent_ref; - - *hazardp = false; - *parentp = NULL; - - /* - * A checkpoint reconciling this parent page can deadlock with - * our split. We have an exclusive page lock on the child before - * we acquire the page's reconciliation lock, and reconciliation - * acquires the page's reconciliation lock before it encounters - * the child's exclusive lock (which causes reconciliation to - * loop until the exclusive lock is resolved). If we want to split - * the parent, give up to avoid that deadlock. - */ - if (S2BT(session)->checkpointing != WT_CKPT_OFF) - return (EBUSY); - - /* - * Get a page-level lock on the parent to single-thread splits into the - * page because we need to single-thread sizing/growing the page index. - * It's OK to queue up multiple splits as the child pages split, but the - * actual split into the parent has to be serialized. Note we allocate - * memory inside of the lock and may want to invest effort in making the - * locked period shorter. - * - * We use the reconciliation lock here because not only do we have to - * single-thread the split, we have to lock out reconciliation of the - * parent because reconciliation of the parent can't deal with finding - * a split child during internal page traversal. Basically, there's no - * reason to use a different lock if we have to block reconciliation - * anyway. - */ - for (;;) { - parent = ref->home; - WT_RET(__wt_fair_lock(session, &parent->page_lock)); - if (parent == ref->home) - break; - /* Try again if the page deepened while we were waiting */ - WT_RET(__wt_fair_unlock(session, &parent->page_lock)); - } - - /* - * We have exclusive access to split the parent, and at this point, the - * child prevents the parent from being evicted. However, once we - * update the parent's index, it will no longer refer to the child, and - * could conceivably be evicted. Get a hazard pointer on the parent - * now, so that we can safely access it after updating the index. - * - * Take care getting the page doesn't trigger eviction work: we could - * block trying to split a different child of our parent and deadlock - * or we could be the eviction server relied upon by other threads to - * populate the eviction queue. - */ - if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) { - WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT)); - *hazardp = true; - } - - *parentp = parent; - return (0); - -err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); - return (ret); -} - -/* - * __split_parent_unlock -- - * Unlock the parent page. - */ -static int -__split_parent_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) -{ - WT_DECL_RET; - - if (hazard) - ret = __wt_hazard_clear(session, parent); - - WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); - return (ret); -} - -/* - * __split_parent -- - * Resolve a multi-page split, inserting new information into the parent. - */ -static int -__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, - WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive) -{ - WT_DECL_RET; - WT_IKEY *ikey; - WT_PAGE *parent; - WT_PAGE_INDEX *alloc_index, *pindex; - WT_REF **alloc_refp, *next_ref, *parent_ref; - size_t parent_decr, size; - uint64_t split_gen; - uint32_t i, j; - uint32_t deleted_entries, parent_entries, result_entries; - bool complete; - - parent = ref->home; - parent_ref = parent->pg_intl_parent_ref; - - alloc_index = pindex = NULL; - parent_decr = 0; - parent_entries = 0; - complete = false; - - /* - * We've locked the parent, which means it cannot split (which is the - * only reason to worry about split generation values). - */ - pindex = WT_INTL_INDEX_GET_SAFE(parent); - parent_entries = pindex->entries; - - /* - * Remove any refs to deleted pages while we are splitting, we have - * the internal page locked down, and are copying the refs into a new - * array anyway. Switch them to the special split state, so that any - * reading thread will restart. Include the ref we are splitting in - * the count to be deleted. - */ - for (deleted_entries = 1, i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); - if (next_ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, next_ref, true) && - __wt_atomic_casv32( - &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT)) - deleted_entries++; - } - - /* - * The final entry count consists of the original count, plus any new - * pages, less any WT_REFs we're removing. - */ - result_entries = (parent_entries + new_entries) - deleted_entries; - - /* - * If the entire (sub)tree is empty, give up: we can't leave an empty - * internal page. Mark it to be evicted soon and clean up any - * references that have changed state. - */ - if (result_entries == 0) { - __wt_page_evict_soon(parent); - goto err; - } - - /* - * Allocate and initialize a new page index array for the parent, then - * copy references from the original index array, plus references from - * the newly created split array, into place. - */ - size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); - WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); - parent_incr += size; - alloc_index->index = (WT_REF **)(alloc_index + 1); - alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - if (next_ref == ref) { - for (j = 0; j < new_entries; ++j) { - ref_new[j]->home = parent; - *alloc_refp++ = ref_new[j]; - - /* - * Clear the split reference as it moves to the - * allocated page index, so it never appears on - * both after an error. - */ - ref_new[j] = NULL; - } - - /* - * We detect append-style workloads to avoid repeatedly - * deepening parts of the tree where no work is being - * done by tracking if we're splitting after the slots - * created by the last split to deepen this parent. - * - * Note the calculation: i is a 0-based array offset and - * split-last is a count of entries, also either or both - * i and split-last might be unsigned 0, don't decrement - * either one. - */ - if (i > parent->pg_intl_deepen_split_last) - parent-> - pg_intl_deepen_split_append += new_entries; - } else if (next_ref->state != WT_REF_SPLIT) - /* Skip refs we have marked for deletion. */ - *alloc_refp++ = next_ref; - } - - /* Check that we filled in all the entries. */ - WT_ASSERT(session, - alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); - - /* - * Confirm the parent page's index hasn't moved then update it, which - * makes the split visible to threads descending the tree. - */ - WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); - WT_INTL_INDEX_SET(parent, alloc_index); - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - alloc_index = NULL; - -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, parent)); -#endif - - /* - * Reset the page's original WT_REF field to split. Threads cursoring - * through the tree were blocked because that WT_REF state was set to - * locked. This update changes the locked state to split, unblocking - * those threads and causing them to re-calculate their position based - * on the updated parent page's index. - */ - WT_PUBLISH(ref->state, WT_REF_SPLIT); - - /* - * A note on error handling: failures before we swapped the new page - * index into the parent can be resolved by freeing allocated memory - * because the original page is unchanged, we can continue to use it - * and we have not yet modified the parent. Failures after we swap - * the new page index into the parent are also relatively benign, the - * split is OK and complete. For those reasons, we ignore errors past - * this point unless there's a panic. - */ - complete = true; - - WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, - "%s split into parent %" PRIu32 " -> %" PRIu32 - " (%" PRIu32 ")", ref->page == NULL ? - "reverse" : __wt_page_type_string(ref->page->type), - parent_entries, result_entries, result_entries - parent_entries)); - - /* - * The new page index is in place, free the WT_REF we were splitting - * and any deleted WT_REFs we found, modulo the usual safe free - * semantics. - */ - for (i = 0; deleted_entries > 0 && i < parent_entries; ++i) { - next_ref = pindex->index[i]; - if (next_ref->state != WT_REF_SPLIT) - continue; - --deleted_entries; - - /* - * We set the WT_REF to split, discard it, freeing any resources - * it holds. - * - * Row-store trees where the old version of the page is being - * discarded: the previous parent page's key for this child page - * may have been an on-page overflow key. In that case, if the - * key hasn't been deleted, delete it now, including its backing - * blocks. We are exchanging the WT_REF that referenced it for - * the split page WT_REFs and their keys, and there's no longer - * any reference to it. Done after completing the split (if we - * failed, we'd leak the underlying blocks, but the parent page - * would be unaffected). - */ - if (parent->type == WT_PAGE_ROW_INT) { - WT_TRET(__split_ovfl_key_cleanup( - session, parent, next_ref)); - ikey = __wt_ref_key_instantiated(next_ref); - if (ikey != NULL) { - size = sizeof(WT_IKEY) + ikey->size; - WT_TRET(__split_safe_free( - session, split_gen, 0, ikey, size)); - parent_decr += size; - } - /* - * The page_del structure can be freed immediately: it - * is only read when the ref state is WT_REF_DELETED. - * The size of the structure wasn't added to the parent, - * don't decrement. - */ - if (next_ref->page_del != NULL) { - __wt_free(session, - next_ref->page_del->update_list); - __wt_free(session, next_ref->page_del); - } - } - - WT_TRET(__split_safe_free( - session, split_gen, 0, next_ref, sizeof(WT_REF))); - parent_decr += sizeof(WT_REF); - } - - /* We freed the reference that was split in the loop above. */ - ref = NULL; - - /* - * We can't free the previous page index, there may be threads using it. - * Add it to the session discard list, to be freed when it's safe. - */ - size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); - parent_decr += size; - - /* - * Adjust the parent's memory footprint. - */ - __wt_cache_page_inmem_incr(session, parent, parent_incr); - __wt_cache_page_inmem_decr(session, parent, parent_decr); - - /* - * Simple page splits trickle up the tree, that is, as leaf pages grow - * large enough and are evicted, they'll split into their parent. And, - * as that parent grows large enough and is evicted, it will split into - * its parent and so on. When the page split wave reaches the root, - * the tree will permanently deepen as multiple root pages are written. - * However, this only helps if first, the pages are evicted (and - * we resist evicting internal pages for obvious reasons), and second, - * if the tree is closed and re-opened from a disk image, which may be - * a rare event. - * To avoid the case of internal pages becoming too large when they - * aren't being evicted, check internal pages each time a leaf page is - * split into them. If it's big enough, deepen the tree at that point. - * Do the check here because we've just grown the parent page and - * are holding it locked. - */ - if (ret == 0 && !exclusive && - __split_should_deepen(session, parent_ref)) - ret = __split_deepen(session, parent); - -err: /* - * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened, and our caller has to proceed - * with the split. - */ - if (!complete) { - for (i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - if (next_ref->state == WT_REF_SPLIT) - next_ref->state = WT_REF_DELETED; - } - - /* If we gave up on a reverse split, unlock the child. */ - if (ref_new == NULL) { - WT_ASSERT(session, ref->state == WT_REF_LOCKED); - ref->state = WT_REF_DELETED; - } - - __wt_free_ref_index(session, NULL, alloc_index, false); - } - - if (ret != 0 && ret != WT_PANIC) - __wt_err(session, ret, - "ignoring not-fatal error during parent page split"); - return (ret == WT_PANIC || !complete ? ret : 0); -} - -/* * __split_insert -- * Split a page's last insert list entries into a separate page. */ @@ -1279,6 +1615,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) size_t page_decr, parent_incr, right_incr; int i; + WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); + WT_STAT_FAST_DATA_INCR(session, cache_inmem_split); + page = ref->page; right = NULL; page_decr = parent_incr = right_incr = 0; @@ -1491,7 +1830,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) */ page = NULL; if ((ret = __split_parent( - session, ref, split_ref, 2, parent_incr, false)) != 0) { + session, ref, split_ref, 2, parent_incr, false, true)) != 0) { /* * Move the insert list element back to the original page list. * For simplicity, the previous skip list pointers originally @@ -1513,9 +1852,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_ERR(ret); } - WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); - WT_STAT_FAST_DATA_INCR(session, cache_inmem_split); - return (0); err: if (split_ref[0] != NULL) { @@ -1543,83 +1879,21 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); - ret = __split_insert(session, ref); - WT_TRET(__split_parent_unlock(session, parent, hazard)); - return (ret); -} + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: split-insert", ref->page)); -/* - * __wt_split_reverse -- - * We have a locked ref that is empty and we want to rewrite the index in - * its parent. - */ -int -__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_DECL_RET; - WT_PAGE *parent; - bool hazard; - - WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); - ret = __split_parent(session, ref, NULL, 0, 0, 0); - WT_TRET(__split_parent_unlock(session, parent, hazard)); - return (ret); -} - -/* - * __wt_split_rewrite -- - * Rewrite an in-memory page with a new version. - */ -int -__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_DECL_RET; - WT_PAGE *page; - WT_PAGE_MODIFY *mod; - WT_REF new; - - page = ref->page; - mod = page->modify; - - /* - * This isn't a split: a reconciliation failed because we couldn't write - * something, and in the case of forced eviction, we need to stop this - * page from being such a problem. We have exclusive access, rewrite the - * page in memory. The code lives here because the split code knows how - * to re-create a page in memory after it's been reconciled, and that's - * exactly what we want to do. - * - * Build the new page. - */ - memset(&new, 0, sizeof(new)); - WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0])); - - /* - * The rewrite succeeded, we can no longer fail. - * - * Finalize the move, discarding moved update lists from the original - * page. - */ - __split_multi_inmem_final(page, &mod->mod_multi[0]); + WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + if ((ret = __split_insert(session, ref)) != 0) { + WT_TRET(__split_internal_unlock(session, parent, hazard)); + return (ret); + } /* - * Discard the original page. - * - * Pages with unresolved changes are not marked clean during - * reconciliation, do it now. + * Split up through the tree as necessary; we're holding the original + * parent page locked, note the functions we call are responsible for + * releasing that lock. */ - __wt_page_modify_clear(session, page); - __wt_ref_out(session, ref); - - /* Swap the new page into place. */ - ref->page = new.page; - WT_PUBLISH(ref->state, WT_REF_MEM); - - return (0); - -err: __split_multi_inmem_fail(session, &new); - return (ret); + return (__split_parent_climb(session, parent, hazard)); } /* @@ -1636,6 +1910,9 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) size_t parent_incr; uint32_t i, new_entries; + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_leaf); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_leaf); + page = ref->page; mod = page->modify; new_entries = mod->mod_multi_entries; @@ -1656,10 +1933,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * exclusively. */ WT_ERR(__split_parent( - session, ref, ref_new, new_entries, parent_incr, closing)); - - WT_STAT_FAST_CONN_INCR(session, cache_eviction_split); - WT_STAT_FAST_DATA_INCR(session, cache_eviction_split); + session, ref, ref_new, new_entries, parent_incr, closing, true)); /* * The split succeeded, we can no longer fail. @@ -1697,8 +1971,98 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_PAGE *parent; bool hazard; - WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); - ret = __split_multi(session, ref, closing); - WT_TRET(__split_parent_unlock(session, parent, hazard)); + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: split-multi", ref->page)); + + WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + if ((ret = __split_multi(session, ref, closing)) != 0 || closing) { + WT_TRET(__split_internal_unlock(session, parent, hazard)); + return (ret); + } + + /* + * Split up through the tree as necessary; we're holding the original + * parent page locked, note the functions we call are responsible for + * releasing that lock. + */ + return (__split_parent_climb(session, parent, hazard)); +} + +/* + * __wt_split_reverse -- + * We have a locked ref that is empty and we want to rewrite the index in + * its parent. + */ +int +__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + WT_PAGE *parent; + bool hazard; + + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: reverse-split", ref->page)); + + WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + ret = __split_parent(session, ref, NULL, 0, 0, false, true); + WT_TRET(__split_internal_unlock(session, parent, hazard)); + return (ret); +} + +/* + * __wt_split_rewrite -- + * Rewrite an in-memory page with a new version. + */ +int +__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_MODIFY *mod; + WT_REF new; + + page = ref->page; + mod = page->modify; + + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: split-rewrite", ref->page)); + + /* + * This isn't a split: a reconciliation failed because we couldn't write + * something, and in the case of forced eviction, we need to stop this + * page from being such a problem. We have exclusive access, rewrite the + * page in memory. The code lives here because the split code knows how + * to re-create a page in memory after it's been reconciled, and that's + * exactly what we want to do. + * + * Build the new page. + */ + memset(&new, 0, sizeof(new)); + WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0])); + + /* + * The rewrite succeeded, we can no longer fail. + * + * Finalize the move, discarding moved update lists from the original + * page. + */ + __split_multi_inmem_final(page, &mod->mod_multi[0]); + + /* + * Discard the original page. + * + * Pages with unresolved changes are not marked clean during + * reconciliation, do it now. + */ + __wt_page_modify_clear(session, page); + __wt_ref_out(session, ref); + + /* Swap the new page into place. */ + ref->page = new.page; + WT_PUBLISH(ref->state, WT_REF_MEM); + + return (0); + +err: __split_multi_inmem_fail(session, &new); return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 7395cce11e1..07bb2eb3a01 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -191,7 +191,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_bytes, leaf_pages, internal_bytes, internal_pages, - WT_TIMEDIFF(end, start) / WT_MILLION)); + WT_TIMEDIFF_MS(end, start))); } err: /* On error, clear any left-over tree walk. */ diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index d02f23ed164..e9fa570f97b 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -22,7 +22,7 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_INSERT *ins; WT_INSERT_HEAD *ins_head; WT_PAGE *page; - WT_PAGE_INDEX *pindex; + WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; uint32_t base, indx, limit; int depth; @@ -37,10 +37,12 @@ __wt_col_search(WT_SESSION_IMPL *session, goto leaf_only; } +restart_root: /* Search the internal pages of the tree. */ current = &btree->root; - for (depth = 2;; ++depth) { -restart: page = current->page; + for (depth = 2, pindex = NULL;; ++depth) { + parent_pindex = pindex; +restart_page: page = current->page; if (page->type != WT_PAGE_COL_INT) break; @@ -51,8 +53,19 @@ restart: page = current->page; descent = pindex->index[base - 1]; /* Fast path appends. */ - if (recno >= descent->key.recno) + if (recno >= descent->key.recno) { + /* + * If on the last slot (the key is larger than any key + * on the page), check for an internal page split race. + */ + if (parent_pindex != NULL && + __wt_split_intl_race( + session, current->home, parent_pindex)) { + WT_RET(__wt_page_release(session, current, 0)); + goto restart_root; + } goto descend; + } /* Binary search of internal pages. */ for (base = 0, @@ -90,15 +103,13 @@ descend: /* * page; otherwise return on error, the swap call ensures we're * holding nothing on failure. */ - switch (ret = __wt_page_swap(session, current, descent, 0)) { - case 0: + if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { current = descent; - break; - case WT_RESTART: - goto restart; - default: - return (ret); + continue; } + if (ret == WT_RESTART) + goto restart_page; + return (ret); } /* Track how deep the tree gets. */ diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 7b21f1e40bb..d2d8a4640ca 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -144,7 +144,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_DECL_RET; WT_ITEM *item; WT_PAGE *page; - WT_PAGE_INDEX *pindex; + WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; WT_ROW *rip; size_t match, skiphigh, skiplow; @@ -155,16 +155,16 @@ __wt_row_search(WT_SESSION_IMPL *session, btree = S2BT(session); collator = btree->collator; item = cbt->tmp; + current = NULL; __cursor_pos_clear(cbt); /* - * The row-store search routine uses a different comparison API. - * The assumption is we're comparing more than a few keys with - * matching prefixes, and it's a win to avoid the memory fetches - * by skipping over those prefixes. That's done by tracking the - * length of the prefix match for the lowest and highest keys we - * compare as we descend the tree. + * In some cases we expect we're comparing more than a few keys with + * matching prefixes, so it's faster to avoid the memory fetches by + * skipping over those prefixes. That's done by tracking the length of + * the prefix match for the lowest and highest keys we compare as we + * descend the tree. */ skiphigh = skiplow = 0; @@ -186,10 +186,11 @@ __wt_row_search(WT_SESSION_IMPL *session, } /* Search the internal pages of the tree. */ - cmp = -1; +restart_root: current = &btree->root; - for (depth = 2;; ++depth) { -restart: page = current->page; + for (depth = 2, pindex = NULL;; ++depth) { + parent_pindex = pindex; +restart_page: page = current->page; if (page->type != WT_PAGE_ROW_INT) break; @@ -211,7 +212,7 @@ restart: page = current->page; WT_ERR(__wt_compare( session, collator, srch_key, item, &cmp)); if (cmp >= 0) - goto descend; + goto append; /* A failed append check turns off append checks. */ append_check = false; @@ -252,7 +253,26 @@ restart: page = current->page; } else if (cmp == 0) goto descend; } - else if (collator == NULL) + else if (collator == NULL) { + /* + * Reset the skipped prefix counts; we'd normally expect + * the parent's skipped prefix values to be larger than + * the child's values and so we'd only increase them as + * we walk down the tree (in other words, if we can skip + * N bytes on the parent, we can skip at least N bytes + * on the child). However, if a child internal page was + * split up into the parent, the child page's key space + * will have been truncated, and the values from the + * parent's search may be wrong for the child. We only + * need to reset the high count because the split-page + * algorithm truncates the end of the internal page's + * key space, the low count is still correct. We also + * don't need to clear either count when transitioning + * to a leaf page, a leaf page's key space can't change + * in flight. + */ + skiphigh = 0; + for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); descent = pindex->index[indx]; @@ -271,7 +291,7 @@ restart: page = current->page; else goto descend; } - else + } else for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); descent = pindex->index[indx]; @@ -288,9 +308,10 @@ restart: page = current->page; } /* - * Set the slot to descend the tree: descent is already set if - * there was an exact match on the page, otherwise, base is - * the smallest index greater than key, possibly (last + 1). + * Set the slot to descend the tree: descent was already set if + * there was an exact match on the page, otherwise, base is the + * smallest index greater than key, possibly one past the last + * slot. */ descent = pindex->index[base - 1]; @@ -298,25 +319,41 @@ restart: page = current->page; * If we end up somewhere other than the last slot, it's not a * right-side descent. */ - if (pindex->entries != base - 1) + if (pindex->entries != base) descend_right = false; + /* + * If on the last slot (the key is larger than any key on the + * page), check for an internal page split race. + */ + if (pindex->entries == base) { +append: if (parent_pindex != NULL && + __wt_split_intl_race( + session, current->home, parent_pindex)) { + if ((ret = __wt_page_release( + session, current, 0)) != 0) + return (ret); + + skiplow = skiphigh = 0; + goto restart_root; + } + } + descend: /* * Swap the current page for the child page. If the page splits * while we're retrieving it, restart the search in the current * page; otherwise return on error, the swap call ensures we're * holding nothing on failure. */ - switch (ret = __wt_page_swap(session, current, descent, 0)) { - case 0: + if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { current = descent; - break; - case WT_RESTART: + continue; + } + if (ret == WT_RESTART) { skiphigh = skiplow = 0; - goto restart; - default: - return (ret); + goto restart_page; } + return (ret); } /* Track how deep the tree gets. */ @@ -517,7 +554,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) __cursor_pos_clear(cbt); -restart: +restart_root: /* Walk the internal pages of the tree. */ current = &btree->root; for (;;) { @@ -544,7 +581,7 @@ restart: */ if (ret == WT_RESTART && (ret = __wt_page_release(session, current, 0)) == 0) - goto restart; + goto restart_root; return (ret); } diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 311ddd56b7a..d79ce6853e6 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -295,6 +295,19 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_drop[] = { { NULL, NULL, NULL, NULL, NULL, 0 } }; +static const WT_CONFIG_CHECK confchk_WT_SESSION_join[] = { + { "bloom_bit_count", "int", NULL, "min=2,max=1000", NULL, 0 }, + { "bloom_hash_count", "int", NULL, "min=2,max=100", NULL, 0 }, + { "compare", "string", + NULL, "choices=[\"eq\",\"ge\",\"gt\",\"le\",\"lt\"]", + NULL, 0 }, + { "count", "int", NULL, NULL, NULL, 0 }, + { "strategy", "string", + NULL, "choices=[\"bloom\",\"default\"]", + NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + static const WT_CONFIG_CHECK confchk_WT_SESSION_log_flush[] = { { "sync", "string", NULL, "choices=[\"background\",\"off\",\"on\"]", @@ -543,6 +556,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "transaction_sync", "category", NULL, NULL, confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, + { "use_environment", "boolean", NULL, NULL, NULL, 0 }, { "use_environment_priv", "boolean", NULL, NULL, NULL, 0 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," @@ -622,6 +636,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "transaction_sync", "category", NULL, NULL, confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, + { "use_environment", "boolean", NULL, NULL, NULL, 0 }, { "use_environment_priv", "boolean", NULL, NULL, NULL, 0 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," @@ -891,6 +906,11 @@ static const WT_CONFIG_ENTRY config_entries[] = { "force=0,remove_files=", confchk_WT_SESSION_drop, 2 }, + { "WT_SESSION.join", + "bloom_bit_count=16,bloom_hash_count=8,compare=\"eq\",count=," + "strategy=", + confchk_WT_SESSION_join, 5 + }, { "WT_SESSION.log_flush", "sync=on", confchk_WT_SESSION_log_flush, 1 @@ -995,9 +1015,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",name=,quota=0,reserve=0,size=500MB),statistics=none," "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," - "transaction_sync=(enabled=0,method=fsync),use_environment_priv=0" - ",verbose=,write_through=", - confchk_wiredtiger_open, 36 + "transaction_sync=(enabled=0,method=fsync),use_environment=," + "use_environment_priv=0,verbose=,write_through=", + confchk_wiredtiger_open, 37 }, { "wiredtiger_open_all", "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," @@ -1016,9 +1036,10 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",name=,quota=0,reserve=0,size=500MB),statistics=none," "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," - "transaction_sync=(enabled=0,method=fsync),use_environment_priv=0" - ",verbose=,version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_all, 37 + "transaction_sync=(enabled=0,method=fsync),use_environment=," + "use_environment_priv=0,verbose=,version=(major=0,minor=0)," + "write_through=", + confchk_wiredtiger_open_all, 38 }, { "wiredtiger_open_basecfg", "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index c65b74e4e4e..bd14e1bf4fd 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1286,6 +1286,11 @@ __conn_config_env(WT_SESSION_IMPL *session, const char *cfg[], WT_ITEM *cbuf) const char *env_config; size_t len; + /* Only use the environment variable if configured. */ + WT_RET(__wt_config_gets(session, cfg, "use_environment", &cval)); + if (cval.val == 0) + return (0); + ret = __wt_getenv(session, "WIREDTIGER_CONFIG", &env_config); if (ret == WT_NOTFOUND) return (0); @@ -1333,15 +1338,16 @@ err: __wt_free(session, env_config); static int __conn_home(WT_SESSION_IMPL *session, const char *home, const char *cfg[]) { - WT_DECL_RET; WT_CONFIG_ITEM cval; /* If the application specifies a home directory, use it. */ if (home != NULL) goto copy; - ret = __wt_getenv(session, "WIREDTIGER_HOME", &S2C(session)->home); - if (ret == 0) + /* Only use the environment variable if configured. */ + WT_RET(__wt_config_gets(session, cfg, "use_environment", &cval)); + if (cval.val != 0 && + __wt_getenv(session, "WIREDTIGER_HOME", &S2C(session)->home) == 0) return (0); /* If there's no WIREDTIGER_HOME environment variable, use ".". */ diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c index aa14e9aadde..8d16f94c092 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -734,7 +734,7 @@ __wt_cache_pool_server(void *arg) F_ISSET(cache, WT_CACHE_POOL_RUN)) { if (cp->currently_used <= cp->size) WT_ERR(__wt_cond_wait(session, - cp->cache_pool_cond, 1000000)); + cp->cache_pool_cond, WT_MILLION)); /* * Re-check pool run flag - since we want to avoid getting the diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c index 8f039e61654..b47e2550b23 100644 --- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c +++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c @@ -31,7 +31,7 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp) * Checkpoints based on log size also require logging be enabled. */ WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval)); - conn->ckpt_usecs = (uint64_t)cval.val * 1000000; + conn->ckpt_usecs = (uint64_t)cval.val * WT_MILLION; WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval)); conn->ckpt_logsize = (wt_off_t)cval.val; diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 527b756ee1a..1d44d816467 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -23,17 +23,19 @@ __logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg) WT_RET( __wt_config_gets(session, cfg, "transaction_sync.enabled", &cval)); if (cval.val) - FLD_SET(conn->txn_logsync, WT_LOG_FLUSH); + FLD_SET(conn->txn_logsync, WT_LOG_SYNC_ENABLED); else - FLD_CLR(conn->txn_logsync, WT_LOG_FLUSH); + FLD_CLR(conn->txn_logsync, WT_LOG_SYNC_ENABLED); WT_RET( __wt_config_gets(session, cfg, "transaction_sync.method", &cval)); - FLD_CLR(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FSYNC); + FLD_CLR(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FLUSH | WT_LOG_FSYNC); if (WT_STRING_MATCH("dsync", cval.str, cval.len)) - FLD_SET(conn->txn_logsync, WT_LOG_DSYNC); + FLD_SET(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FLUSH); else if (WT_STRING_MATCH("fsync", cval.str, cval.len)) FLD_SET(conn->txn_logsync, WT_LOG_FSYNC); + else if (WT_STRING_MATCH("none", cval.str, cval.len)) + FLD_SET(conn->txn_logsync, WT_LOG_FLUSH); return (0); } @@ -536,8 +538,8 @@ restart: while (i < WT_SLOT_POOL) { save_i = i; slot = &log->slot_pool[i++]; - WT_ASSERT(session, slot->slot_state != 0 || - slot->slot_release_lsn.file >= log->write_lsn.file); + WT_ASSERT(session, slot->slot_state != 0 || + slot->slot_release_lsn.file >= log->write_lsn.file); if (slot->slot_state != WT_LOG_SLOT_WRITTEN) continue; written[written_i].slot_index = save_i; diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c index ec3a630581a..31438e10606 100644 --- a/src/third_party/wiredtiger/src/conn/conn_stat.c +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -83,7 +83,7 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval)); /* Only start the server if wait time is non-zero */ *runp = cval.val != 0; - conn->stat_usecs = (uint64_t)cval.val * 1000000; + conn->stat_usecs = (uint64_t)cval.val * WT_MILLION; WT_RET(__wt_config_gets( session, cfg, "statistics_log.on_close", &cval)); @@ -154,7 +154,7 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats) WT_DECL_RET; int64_t *stats; int i; - const char *uri; + const char *desc, *uri; const char *cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; @@ -175,16 +175,19 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats) * If we don't find an underlying object, silently ignore it, the object * may exist only intermittently. */ - switch (ret = __wt_curstat_open(session, uri, cfg, &cursor)) { + switch (ret = __wt_curstat_open(session, uri, NULL, cfg, &cursor)) { case 0: cst = (WT_CURSOR_STAT *)cursor; - for (stats = cst->stats, i = 0; i < cst->stats_count; ++i) + for (stats = cst->stats, i = 0; i < cst->stats_count; ++i) { + if (conn_stats) + WT_ERR(__wt_stat_connection_desc(cst, i, + &desc)); + else + WT_ERR(__wt_stat_dsrc_desc(cst, i, &desc)); WT_ERR(__wt_fprintf(conn->stat_fp, "%s %" PRId64 " %s %s\n", - conn->stat_stamp, stats[i], - name, conn_stats ? - __wt_stat_connection_desc(i) : - __wt_stat_dsrc_desc(i))); + conn->stat_stamp, stats[i], name, desc)); + } WT_ERR(cursor->close(cursor)); break; case EBUSY: diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index a8620ebaa99..b9b46f3211c 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -136,7 +136,8 @@ __sweep_expire(WT_SESSION_IMPL *session, time_t now) !F_ISSET(dhandle, WT_DHANDLE_OPEN) || dhandle->session_inuse != 0 || dhandle->timeofdeath == 0 || - now <= dhandle->timeofdeath + conn->sweep_idle_time) + difftime(now, dhandle->timeofdeath) <= + conn->sweep_idle_time) continue; WT_WITH_DHANDLE(session, dhandle, @@ -276,8 +277,8 @@ __sweep_server(void *arg) while (F_ISSET(conn, WT_CONN_SERVER_RUN) && F_ISSET(conn, WT_CONN_SERVER_SWEEP)) { /* Wait until the next event. */ - WT_ERR(__wt_cond_wait(session, conn->sweep_cond, - (uint64_t)conn->sweep_interval * WT_MILLION)); + WT_ERR(__wt_cond_wait(session, + conn->sweep_cond, conn->sweep_interval * WT_MILLION)); WT_ERR(__wt_seconds(session, &now)); WT_STAT_FAST_CONN_INCR(session, dh_sweeps); @@ -329,27 +330,25 @@ __wt_sweep_config(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); - /* Pull out the sweep configurations. */ - WT_RET(__wt_config_gets(session, - cfg, "file_manager.close_idle_time", &cval)); - conn->sweep_idle_time = (time_t)cval.val; - - /* Non-zero sweep idle time is incompatible with in-memory */ - if (conn->sweep_idle_time != 0) { - WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval)); - if (cval.val != 0) - WT_RET_MSG(session, EINVAL, - "In memory configuration incompatible with " - "non zero file_manager=(close_idle_time)"); + /* + * A non-zero idle time is incompatible with in-memory, and the default + * is non-zero; set the in-memory configuration idle time to zero. + */ + conn->sweep_idle_time = 0; + WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval)); + if (cval.val == 0) { + WT_RET(__wt_config_gets(session, + cfg, "file_manager.close_idle_time", &cval)); + conn->sweep_idle_time = (uint64_t)cval.val; } WT_RET(__wt_config_gets(session, cfg, "file_manager.close_scan_interval", &cval)); - conn->sweep_interval = (time_t)cval.val; + conn->sweep_interval = (uint64_t)cval.val; WT_RET(__wt_config_gets(session, cfg, "file_manager.close_handle_minimum", &cval)); - conn->sweep_handles_min = (u_int)cval.val; + conn->sweep_handles_min = (uint64_t)cval.val; return (0); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c index 6f7d492327b..62ac2203b97 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -17,8 +17,7 @@ static int __backup_list_append( static int __backup_start( WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[]); static int __backup_stop(WT_SESSION_IMPL *); -static int __backup_uri( - WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[], bool *, bool *); +static int __backup_uri(WT_SESSION_IMPL *, const char *[], bool *, bool *); /* * __curbackup_next -- @@ -197,6 +196,7 @@ __backup_start( cb->next = 0; cb->list = NULL; + cb->list_next = 0; /* * Single thread hot backups: we're holding the schema lock, so we @@ -235,7 +235,7 @@ __backup_start( * a checkpoint that completes during the backup. */ target_list = false; - WT_ERR(__backup_uri(session, cb, cfg, &target_list, &log_only)); + WT_ERR(__backup_uri(session, cfg, &target_list, &log_only)); if (!target_list) { WT_ERR(__backup_log_append(session, cb, true)); @@ -391,7 +391,7 @@ err: if (cursor != NULL) */ static int __backup_uri(WT_SESSION_IMPL *session, - WT_CURSOR_BACKUP *cb, const char *cfg[], bool *foundp, bool *log_only) + const char *cfg[], bool *foundp, bool *log_only) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; @@ -408,7 +408,7 @@ __backup_uri(WT_SESSION_IMPL *session, */ WT_RET(__wt_config_gets(session, cfg, "target", &cval)); WT_RET(__wt_config_subinit(session, &targetconf, &cval)); - for (cb->list_next = 0, target_list = false; + for (target_list = false; (ret = __wt_config_next(&targetconf, &k, &v)) == 0; target_list = true) { /* If it is our first time through, allocate. */ @@ -432,9 +432,11 @@ __backup_uri(WT_SESSION_IMPL *session, if (WT_PREFIX_MATCH(uri, "log:")) { *log_only = !target_list; WT_ERR(__wt_backup_list_uri_append(session, uri, NULL)); - } else + } else { + *log_only = false; WT_ERR(__wt_schema_worker(session, uri, NULL, __wt_backup_list_uri_append, cfg, 0)); + } } WT_ERR_NOTFOUND_OK(ret); diff --git a/src/third_party/wiredtiger/src/cursor/cur_dump.c b/src/third_party/wiredtiger/src/cursor/cur_dump.c index 6c11c4b407e..e5799fbad05 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_dump.c +++ b/src/third_party/wiredtiger/src/cursor/cur_dump.c @@ -329,7 +329,7 @@ __curdump_close(WT_CURSOR *cursor) cdump = (WT_CURSOR_DUMP *)cursor; child = cdump->child; - CURSOR_API_CALL(cursor, session, get_key, NULL); + CURSOR_API_CALL(cursor, session, close, NULL); if (child != NULL) WT_TRET(child->close(child)); /* We shared the child's URI. */ diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index 1db819b8b40..7c18b59fded 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -246,17 +246,17 @@ __curfile_insert(WT_CURSOR *cursor) /* * Insert is the one cursor operation that doesn't end with the cursor - * pointing to an on-page item. The standard macro handles errors - * correctly, but we need to leave the application cursor unchanged in - * the case of success, except for column-store appends, where we are - * returning a key. + * pointing to an on-page item (except for column-store appends, where + * we are returning a key). That is, the application's cursor continues + * to reference the application's memory after a successful cursor call, + * which isn't true anywhere else. We don't want to have to explain that + * scoping corner case, so we reset the application's cursor so it can + * free the referenced memory and continue on without risking subsequent + * core dumps. */ if (ret == 0) { - if (!F_ISSET(cursor, WT_CURSTD_APPEND)) { - F_SET(cursor, WT_CURSTD_KEY_EXT); + if (!F_ISSET(cursor, WT_CURSTD_APPEND)) F_CLR(cursor, WT_CURSTD_KEY_INT); - } - F_SET(cursor, WT_CURSTD_VALUE_EXT); F_CLR(cursor, WT_CURSTD_VALUE_INT); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c index fd2a6cd7480..a909eaece99 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_index.c +++ b/src/third_party/wiredtiger/src/cursor/cur_index.c @@ -8,6 +8,20 @@ #include "wt_internal.h" + /* + * __wt_curindex_joined -- + * Produce an error that this cursor is being used in a join call. + */ +int +__wt_curindex_joined(WT_CURSOR *cursor) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cursor->session; + __wt_errx(session, "index cursor is being used in a join"); + return (ENOTSUP); +} + /* * __curindex_get_value -- * WT_CURSOR->get_value implementation for index cursors. @@ -15,32 +29,16 @@ static int __curindex_get_value(WT_CURSOR *cursor, ...) { - WT_CURSOR_INDEX *cindex; WT_DECL_RET; - WT_ITEM *item; WT_SESSION_IMPL *session; va_list ap; - cindex = (WT_CURSOR_INDEX *)cursor; - CURSOR_API_CALL(cursor, session, get_value, NULL); - WT_CURSOR_NEEDVALUE(cursor); - va_start(ap, cursor); - if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { - ret = __wt_schema_project_merge(session, - cindex->cg_cursors, cindex->value_plan, - cursor->value_format, &cursor->value); - if (ret == 0) { - item = va_arg(ap, WT_ITEM *); - item->data = cursor->value.data; - item->size = cursor->value.size; - } - } else - ret = __wt_schema_project_out(session, - cindex->cg_cursors, cindex->value_plan, ap); - va_end(ap); + JOINABLE_CURSOR_API_CALL(cursor, session, get_value, NULL); + WT_ERR(__wt_curindex_get_valuev(cursor, ap)); -err: API_END_RET(session, ret); +err: va_end(ap); + API_END_RET(session, ret); } /* @@ -53,7 +51,7 @@ __curindex_set_value(WT_CURSOR *cursor, ...) WT_DECL_RET; WT_SESSION_IMPL *session; - CURSOR_API_CALL(cursor, session, set_value, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, set_value, NULL); ret = ENOTSUP; err: cursor->saved_err = ret; F_CLR(cursor, WT_CURSTD_VALUE_SET); @@ -72,7 +70,7 @@ __curindex_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) WT_SESSION_IMPL *session; cindex = (WT_CURSOR_INDEX *)a; - CURSOR_API_CALL(a, session, compare, NULL); + JOINABLE_CURSOR_API_CALL(a, session, compare, NULL); /* Check both cursors are "index:" type. */ if (!WT_PREFIX_MATCH(a->uri, "index:") || @@ -150,7 +148,7 @@ __curindex_next(WT_CURSOR *cursor) WT_SESSION_IMPL *session; cindex = (WT_CURSOR_INDEX *)cursor; - CURSOR_API_CALL(cursor, session, next, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, next, NULL); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); if ((ret = cindex->child->next(cindex->child)) == 0) @@ -171,7 +169,7 @@ __curindex_prev(WT_CURSOR *cursor) WT_SESSION_IMPL *session; cindex = (WT_CURSOR_INDEX *)cursor; - CURSOR_API_CALL(cursor, session, prev, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, prev, NULL); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); if ((ret = cindex->child->prev(cindex->child)) == 0) @@ -194,7 +192,7 @@ __curindex_reset(WT_CURSOR *cursor) u_int i; cindex = (WT_CURSOR_INDEX *)cursor; - CURSOR_API_CALL(cursor, session, reset, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, reset, NULL); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_TRET(cindex->child->reset(cindex->child)); @@ -225,7 +223,7 @@ __curindex_search(WT_CURSOR *cursor) cindex = (WT_CURSOR_INDEX *)cursor; child = cindex->child; - CURSOR_API_CALL(cursor, session, search, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, search, NULL); /* * We are searching using the application-specified key, which @@ -284,7 +282,7 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact) WT_SESSION_IMPL *session; cindex = (WT_CURSOR_INDEX *)cursor; - CURSOR_API_CALL(cursor, session, search_near, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, search_near, NULL); __wt_cursor_set_raw_key(cindex->child, &cursor->key); if ((ret = cindex->child->search_near(cindex->child, exact)) == 0) ret = __curindex_move(cindex); @@ -311,7 +309,7 @@ __curindex_close(WT_CURSOR *cursor) cindex = (WT_CURSOR_INDEX *)cursor; idx = cindex->index; - CURSOR_API_CALL(cursor, session, close, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, close, NULL); if ((cp = cindex->cg_cursors) != NULL) for (i = 0, cp = cindex->cg_cursors; diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c new file mode 100644 index 00000000000..c5155c75a0c --- /dev/null +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -0,0 +1,1054 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __curjoin_entry_iter_init -- + * Initialize an iteration for the index managed by a join entry. + * + */ +static int +__curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, + WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ITER **iterp) +{ + WT_CURSOR *newcur; + WT_CURSOR *to_dup; + WT_DECL_RET; + const char *raw_cfg[] = { WT_CONFIG_BASE( + session, WT_SESSION_open_cursor), "raw", NULL }; + const char *def_cfg[] = { WT_CONFIG_BASE( + session, WT_SESSION_open_cursor), NULL }; + const char *uri, **config; + char *uribuf; + WT_CURSOR_JOIN_ITER *iter; + size_t size; + + iter = NULL; + uribuf = NULL; + to_dup = entry->ends[0].cursor; + + uri = to_dup->uri; + if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW)) + config = &raw_cfg[0]; + else + config = &def_cfg[0]; + + if (cjoin->projection != NULL) { + size = strlen(uri) + strlen(cjoin->projection) + 1; + WT_ERR(__wt_calloc(session, size, 1, &uribuf)); + snprintf(uribuf, size, "%s%s", uri, cjoin->projection); + uri = uribuf; + } + WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config, + &newcur)); + WT_ERR(__wt_cursor_dup_position(to_dup, newcur)); + WT_ERR(__wt_calloc_one(session, &iter)); + iter->cjoin = cjoin; + iter->session = session; + iter->entry = entry; + iter->cursor = newcur; + iter->advance = false; + *iterp = iter; + + if (0) { +err: __wt_free(session, iter); + } + __wt_free(session, uribuf); + return (ret); +} + +/* + * __curjoin_pack_recno -- + * Pack the given recno into a buffer; prepare an item referencing it. + * + */ +static int +__curjoin_pack_recno(WT_SESSION_IMPL *session, uint64_t r, uint8_t *buf, + size_t bufsize, WT_ITEM *item) +{ + WT_DECL_RET; + WT_SESSION *wtsession; + size_t sz; + + wtsession = (WT_SESSION *)session; + WT_ERR(wiredtiger_struct_size(wtsession, &sz, "r", r)); + WT_ASSERT(session, sz < bufsize); + WT_ERR(wiredtiger_struct_pack(wtsession, buf, bufsize, "r", r)); + item->size = sz; + item->data = buf; + +err: return (ret); +} + +/* + * __curjoin_entry_iter_next -- + * Get the next item in an iteration. + * + */ +static int +__curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_ITEM *primkey, + uint64_t *rp) +{ + WT_CURSOR *firstcg_cur; + WT_CURSOR_JOIN *cjoin; + WT_DECL_RET; + WT_SESSION_IMPL *session; + uint64_t r; + + if (iter->advance) + WT_ERR(iter->cursor->next(iter->cursor)); + else + iter->advance = true; + + session = iter->session; + cjoin = iter->cjoin; + + /* + * Set our key to the primary key, we'll also need this + * to check membership. + */ + if (iter->entry->index != NULL) + firstcg_cur = ((WT_CURSOR_INDEX *)iter->cursor)->cg_cursors[0]; + else + firstcg_cur = ((WT_CURSOR_TABLE *)iter->cursor)->cg_cursors[0]; + if (WT_CURSOR_RECNO(&cjoin->iface)) { + r = *(uint64_t *)firstcg_cur->key.data; + WT_ERR(__curjoin_pack_recno(session, r, cjoin->recno_buf, + sizeof(cjoin->recno_buf), primkey)); + *rp = r; + } else { + WT_ITEM_SET(*primkey, firstcg_cur->key); + *rp = 0; + } + iter->curkey = primkey; + iter->entry->stats.actual_count++; + iter->entry->stats.accesses++; + +err: return (ret); +} + +/* + * __curjoin_entry_iter_reset -- + * Reset an iteration to the starting point. + * + */ +static int +__curjoin_entry_iter_reset(WT_CURSOR_JOIN_ITER *iter) +{ + WT_DECL_RET; + + if (iter->advance) { + WT_ERR(iter->cursor->reset(iter->cursor)); + WT_ERR(__wt_cursor_dup_position( + iter->cjoin->entries[0].ends[0].cursor, iter->cursor)); + iter->advance = false; + iter->entry->stats.actual_count = 0; + } + +err: return (ret); +} + +/* + * __curjoin_entry_iter_ready -- + * The iterator is positioned. + * + */ +static bool +__curjoin_entry_iter_ready(WT_CURSOR_JOIN_ITER *iter) +{ + return (iter->advance); +} + +/* + * __curjoin_entry_iter_close -- + * Close the iteration, release resources. + * + */ +static int +__curjoin_entry_iter_close(WT_CURSOR_JOIN_ITER *iter) +{ + WT_DECL_RET; + + if (iter->cursor != NULL) + WT_TRET(iter->cursor->close(iter->cursor)); + __wt_free(iter->session, iter); + + return (ret); +} + +/* + * __curjoin_get_key -- + * WT_CURSOR->get_key for join cursors. + */ +static int +__curjoin_get_key(WT_CURSOR *cursor, ...) +{ + WT_CURSOR_JOIN *cjoin; + WT_DECL_RET; + WT_SESSION_IMPL *session; + va_list ap; + + cjoin = (WT_CURSOR_JOIN *)cursor; + + va_start(ap, cursor); + CURSOR_API_CALL(cursor, session, get_key, NULL); + + if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED) || + !__curjoin_entry_iter_ready(cjoin->iter)) { + __wt_errx(session, "join cursor must be advanced with next()"); + WT_ERR(EINVAL); + } + WT_ERR(__wt_cursor_get_keyv(cursor, cursor->flags, ap)); + +err: va_end(ap); + API_END_RET(session, ret); +} + +/* + * __curjoin_get_value -- + * WT_CURSOR->get_value for join cursors. + */ +static int +__curjoin_get_value(WT_CURSOR *cursor, ...) +{ + WT_CURSOR_JOIN *cjoin; + WT_CURSOR_JOIN_ITER *iter; + WT_DECL_RET; + WT_SESSION_IMPL *session; + va_list ap; + + cjoin = (WT_CURSOR_JOIN *)cursor; + iter = cjoin->iter; + + va_start(ap, cursor); + CURSOR_API_CALL(cursor, session, get_value, NULL); + + if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED) || + !__curjoin_entry_iter_ready(iter)) { + __wt_errx(session, "join cursor must be advanced with next()"); + WT_ERR(EINVAL); + } + if (iter->entry->index != NULL) + WT_ERR(__wt_curindex_get_valuev(iter->cursor, ap)); + else + WT_ERR(__wt_curtable_get_valuev(iter->cursor, ap)); + +err: va_end(ap); + API_END_RET(session, ret); +} + +/* + * __curjoin_init_bloom -- + * Populate Bloom filters + */ +static int +__curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, + WT_CURSOR_JOIN_ENTRY *entry, WT_BLOOM *bloom) +{ + WT_COLLATOR *collator; + WT_CURSOR *c; + WT_CURSOR_INDEX *cindex; + WT_CURSOR_JOIN_ENDPOINT *end, *endmax; + WT_DECL_RET; + WT_DECL_ITEM(uribuf); + WT_ITEM curkey, curvalue, *k; + WT_TABLE *maintable; + const char *raw_cfg[] = { WT_CONFIG_BASE( + session, WT_SESSION_open_cursor), "raw", NULL }; + const char *mainkey_str, *p; + void *allocbuf; + size_t mainkey_len, size; + u_int i; + int cmp, skip; + + c = NULL; + allocbuf = NULL; + skip = 0; + + if (entry->index != NULL) { + /* + * Open a cursor having a projection of the keys of the + * index we're comparing against. Open it raw, we're + * going to compare it to the raw keys of the + * reference cursors. + */ + maintable = ((WT_CURSOR_TABLE *)entry->main)->table; + mainkey_str = maintable->colconf.str + 1; + for (p = mainkey_str, i = 0; + p != NULL && i < maintable->nkey_columns; i++) + p = strchr(p + 1, ','); + WT_ASSERT(session, p != 0); + mainkey_len = WT_PTRDIFF(p, mainkey_str); + size = strlen(entry->index->name) + mainkey_len + 3; + WT_ERR(__wt_scr_alloc(session, size, &uribuf)); + WT_ERR(__wt_buf_fmt(session, uribuf, "%s(%.*s)", + entry->index->name, (int)mainkey_len, mainkey_str)); + } else { + /* + * For joins on the main table, we just need the primary + * key for comparison, we don't need any values. + */ + size = strlen(cjoin->table->name) + 3; + WT_ERR(__wt_scr_alloc(session, size, &uribuf)); + WT_ERR(__wt_buf_fmt(session, uribuf, "%s()", + cjoin->table->name)); + } + WT_ERR(__wt_open_cursor( + session, uribuf->data, &cjoin->iface, raw_cfg, &c)); + + /* Initially position the cursor if necessary. */ + endmax = &entry->ends[entry->ends_next]; + if ((end = &entry->ends[0]) < endmax && + F_ISSET(end, WT_CURJOIN_END_GE)) { + WT_ERR(__wt_cursor_dup_position(end->cursor, c)); + if (end->flags == WT_CURJOIN_END_GE) + skip = 1; + } + collator = (entry->index == NULL) ? NULL : entry->index->collator; + while (ret == 0) { + c->get_key(c, &curkey); + if (entry->index != NULL) { + cindex = (WT_CURSOR_INDEX *)c; + if (cindex->index->extractor == NULL) { + /* + * Repack so it's comparable to the + * reference endpoints. + */ + k = &cindex->child->key; + WT_ERR(__wt_struct_repack(session, + cindex->child->key_format, + entry->main->value_format, k, &curkey, + &allocbuf)); + } else + curkey = cindex->child->key; + } + for (end = &entry->ends[skip]; end < endmax; end++) { + WT_ERR(__wt_compare(session, collator, &curkey, + &end->key, &cmp)); + if (!F_ISSET(end, WT_CURJOIN_END_LT)) { + if (cmp < 0 || (cmp == 0 && + !F_ISSET(end, WT_CURJOIN_END_EQ))) + goto advance; + if (cmp > 0) { + if (F_ISSET(end, WT_CURJOIN_END_GT)) + skip = 1; + else + goto done; + } + } else { + if (cmp > 0 || (cmp == 0 && + !F_ISSET(end, WT_CURJOIN_END_EQ))) + goto done; + } + } + if (entry->index != NULL) + c->get_value(c, &curvalue); + else + c->get_key(c, &curvalue); + WT_ERR(__wt_bloom_insert(bloom, &curvalue)); + entry->stats.actual_count++; +advance: + if ((ret = c->next(c)) == WT_NOTFOUND) + break; + } +done: + WT_ERR_NOTFOUND_OK(ret); + +err: if (c != NULL) + WT_TRET(c->close(c)); + __wt_scr_free(session, &uribuf); + __wt_free(session, allocbuf); + return (ret); +} + +/* + * __curjoin_endpoint_init_key -- + * Set the key in the reference endpoint. + */ +static int +__curjoin_endpoint_init_key(WT_SESSION_IMPL *session, + WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ENDPOINT *endpoint) +{ + WT_CURSOR *cursor; + WT_CURSOR_INDEX *cindex; + WT_DECL_RET; + WT_ITEM *k; + uint64_t r; + void *allocbuf; + + allocbuf = NULL; + if ((cursor = endpoint->cursor) != NULL) { + if (entry->index != NULL) { + cindex = (WT_CURSOR_INDEX *)endpoint->cursor; + if (cindex->index->extractor == NULL) { + WT_ERR(__wt_struct_repack(session, + cindex->child->key_format, + entry->main->value_format, + &cindex->child->key, &endpoint->key, + &allocbuf)); + if (allocbuf != NULL) + F_SET(endpoint, WT_CURJOIN_END_OWN_KEY); + } else + endpoint->key = cindex->child->key; + } else { + k = &((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]->key; + if (WT_CURSOR_RECNO(cursor)) { + r = *(uint64_t *)k->data; + WT_ERR(__curjoin_pack_recno(session, r, + endpoint->recno_buf, + sizeof(endpoint->recno_buf), + &endpoint->key)); + } + else + endpoint->key = *k; + } + } + if (0) { +err: __wt_free(session, allocbuf); + } + return (ret); +} + +/* + * __curjoin_init_iter -- + * Initialize before any iteration. + */ +static int +__curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin) +{ + WT_BLOOM *bloom; + WT_DECL_RET; + WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2; + WT_CURSOR_JOIN_ENDPOINT *end; + uint64_t k, m; + + if (cjoin->entries_next == 0) { + __wt_errx(session, "join cursor has not yet been joined " + "with any other cursors"); + return (EINVAL); + } + + je = &cjoin->entries[0]; + WT_RET(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter)); + + jeend = &cjoin->entries[cjoin->entries_next]; + for (je = cjoin->entries; je < jeend; je++) { + __wt_stat_join_init_single(&je->stats); + for (end = &je->ends[0]; end < &je->ends[je->ends_next]; + end++) + WT_RET(__curjoin_endpoint_init_key(session, je, end)); + + /* + * The first entry is iterated as the 'outermost' cursor. + * For the common GE case, we don't have to test against + * the left reference key, we know it will be true since + * the btree is ordered. + */ + if (je == cjoin->entries && je->ends[0].flags == + (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ)) + F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT); + + if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) { + if (je->bloom == NULL) { + /* + * Look for compatible filters to be shared, + * pick compatible numbers for bit counts + * and number of hashes. + */ + m = je->bloom_bit_count; + k = je->bloom_hash_count; + for (je2 = je + 1; je2 < jeend; je2++) + if (F_ISSET(je2, + WT_CURJOIN_ENTRY_BLOOM) && + je2->count == je->count) { + m = WT_MAX( + je2->bloom_bit_count, m); + k = WT_MAX( + je2->bloom_hash_count, k); + } + je->bloom_bit_count = m; + je->bloom_hash_count = k; + WT_RET(__wt_bloom_create(session, NULL, + NULL, je->count, m, k, &je->bloom)); + F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM); + WT_RET(__curjoin_init_bloom(session, cjoin, + je, je->bloom)); + /* + * Share the Bloom filter, making all + * config info consistent. + */ + for (je2 = je + 1; je2 < jeend; je2++) + if (F_ISSET(je2, + WT_CURJOIN_ENTRY_BLOOM) && + je2->count == je->count) { + WT_ASSERT(session, + je2->bloom == NULL); + je2->bloom = je->bloom; + je2->bloom_bit_count = m; + je2->bloom_hash_count = k; + } + } else { + /* + * Create a temporary filter that we'll + * merge into the shared one. The Bloom + * parameters of the two filters must match. + */ + WT_RET(__wt_bloom_create(session, NULL, + NULL, je->count, je->bloom_bit_count, + je->bloom_hash_count, &bloom)); + WT_RET(__curjoin_init_bloom(session, cjoin, + je, bloom)); + WT_RET(__wt_bloom_intersection(je->bloom, + bloom)); + WT_RET(__wt_bloom_close(bloom)); + } + } + } + + F_SET(cjoin, WT_CURJOIN_INITIALIZED); + return (ret); +} + +/* + * __curjoin_entry_in_range -- + * Check if a key is in the range specified by the entry, returning + * WT_NOTFOUND if not. + */ +static int +__curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, + WT_ITEM *curkey, bool skip_left) +{ + WT_COLLATOR *collator; + WT_CURSOR_JOIN_ENDPOINT *end, *endmax; + WT_DECL_RET; + int cmp; + + collator = (entry->index != NULL) ? entry->index->collator : NULL; + endmax = &entry->ends[entry->ends_next]; + for (end = &entry->ends[skip_left ? 1 : 0]; end < endmax; end++) { + WT_ERR(__wt_compare(session, collator, curkey, &end->key, + &cmp)); + if (!F_ISSET(end, WT_CURJOIN_END_LT)) { + if (cmp < 0 || + (cmp == 0 && + !F_ISSET(end, WT_CURJOIN_END_EQ)) || + (cmp > 0 && !F_ISSET(end, WT_CURJOIN_END_GT))) + WT_ERR(WT_NOTFOUND); + } else { + if (cmp > 0 || + (cmp == 0 && + !F_ISSET(end, WT_CURJOIN_END_EQ)) || + (cmp < 0 && !F_ISSET(end, WT_CURJOIN_END_LT))) + WT_ERR(WT_NOTFOUND); + } + } +err: return (ret); +} + +typedef struct { + WT_CURSOR iface; + WT_CURSOR_JOIN_ENTRY *entry; + int ismember; +} WT_CURJOIN_EXTRACTOR; + +/* + * __curjoin_extract_insert -- + * Handle a key produced by a custom extractor. + */ +static int +__curjoin_extract_insert(WT_CURSOR *cursor) { + WT_CURJOIN_EXTRACTOR *cextract; + WT_DECL_RET; + WT_ITEM ikey; + WT_SESSION_IMPL *session; + + cextract = (WT_CURJOIN_EXTRACTOR *)cursor; + /* + * This insert method may be called multiple times during a single + * extraction. If we already have a definitive answer to the + * membership question, exit early. + */ + if (cextract->ismember) + return (0); + + session = (WT_SESSION_IMPL *)cursor->session; + + WT_ITEM_SET(ikey, cursor->key); + /* + * We appended a padding byte to the key to avoid rewriting the last + * column. Strip that away here. + */ + WT_ASSERT(session, ikey.size > 0); + --ikey.size; + + ret = __curjoin_entry_in_range(session, cextract->entry, &ikey, false); + if (ret == WT_NOTFOUND) + ret = 0; + else + cextract->ismember = 1; + + return (ret); +} + +/* + * __curjoin_entry_member -- + * Do a membership check for a particular index that was joined, + * if not a member, returns WT_NOTFOUND. + */ +static int +__curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, + WT_CURSOR_JOIN_ENTRY *entry, bool skip_left) +{ + WT_CURJOIN_EXTRACTOR extract_cursor; + WT_CURSOR *c; + WT_CURSOR_STATIC_INIT(iface, + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __wt_cursor_notsup, /* compare */ + __wt_cursor_notsup, /* equals */ + __wt_cursor_notsup, /* next */ + __wt_cursor_notsup, /* prev */ + __wt_cursor_notsup, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_notsup, /* search-near */ + __curjoin_extract_insert, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* reconfigure */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_notsup); /* close */ + WT_DECL_RET; + WT_INDEX *idx; + WT_ITEM *key, v; + bool bloom_found; + + key = cjoin->iter->curkey; + entry->stats.accesses++; + bloom_found = false; + + if (entry->bloom != NULL) { + /* + * If we don't own the Bloom filter, we must be sharing one + * in a previous entry. So the shared filter has already + * been checked and passed. + */ + if (!F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM)) + return (0); + + /* + * If the item is not in the Bloom filter, we return + * immediately, otherwise, we still need to check the + * long way. + */ + WT_ERR(__wt_bloom_inmem_get(entry->bloom, key)); + bloom_found = true; + } + if (entry->index != NULL) { + c = entry->main; + c->set_key(c, key); + if ((ret = c->search(c)) == 0) + ret = c->get_value(c, &v); + else if (ret == WT_NOTFOUND) + WT_ERR_MSG(session, WT_ERROR, + "main table for join is missing entry."); + c->reset(c); + WT_ERR(ret); + } else + v = *key; + + if ((idx = entry->index) != NULL && idx->extractor != NULL) { + extract_cursor.iface = iface; + extract_cursor.iface.session = &session->iface; + extract_cursor.iface.key_format = idx->exkey_format; + extract_cursor.ismember = 0; + extract_cursor.entry = entry; + WT_ERR(idx->extractor->extract(idx->extractor, + &session->iface, key, &v, &extract_cursor.iface)); + if (!extract_cursor.ismember) + WT_ERR(WT_NOTFOUND); + } else + WT_ERR(__curjoin_entry_in_range(session, entry, &v, skip_left)); + + if (0) { +err: if (ret == WT_NOTFOUND && bloom_found) + entry->stats.bloom_false_positive++; + } + return (ret); +} + +/* + * __curjoin_next -- + * WT_CURSOR::next for join cursors. + */ +static int +__curjoin_next(WT_CURSOR *cursor) +{ + WT_CURSOR_JOIN *cjoin; + WT_DECL_RET; + WT_SESSION_IMPL *session; + bool skip_left; + u_int i; + + cjoin = (WT_CURSOR_JOIN *)cursor; + + CURSOR_API_CALL(cursor, session, next, NULL); + + if (F_ISSET(cjoin, WT_CURJOIN_ERROR)) { + __wt_errx(session, "join cursor encountered previous error"); + WT_ERR(WT_ERROR); + } + if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED)) + WT_ERR(__curjoin_init_iter(session, cjoin)); + +nextkey: + if ((ret = __curjoin_entry_iter_next(cjoin->iter, &cursor->key, + &cursor->recno)) == 0) { + F_SET(cursor, WT_CURSTD_KEY_EXT); + + /* + * We may have already established membership for the + * 'left' case for the first entry, since we're + * using that in our iteration. + */ + skip_left = F_ISSET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT); + for (i = 0; i < cjoin->entries_next; i++) { + ret = __curjoin_entry_member(session, cjoin, + &cjoin->entries[i], skip_left); + if (ret == WT_NOTFOUND) + goto nextkey; + skip_left = false; + WT_ERR(ret); + } + } + + if (0) { +err: F_SET(cjoin, WT_CURJOIN_ERROR); + } + API_END_RET(session, ret); +} + +/* + * __curjoin_reset -- + * WT_CURSOR::reset for join cursors. + */ +static int +__curjoin_reset(WT_CURSOR *cursor) +{ + WT_CURSOR_JOIN *cjoin; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cjoin = (WT_CURSOR_JOIN *)cursor; + + CURSOR_API_CALL(cursor, session, reset, NULL); + + if (F_ISSET(cjoin, WT_CURJOIN_INITIALIZED)) + WT_ERR(__curjoin_entry_iter_reset(cjoin->iter)); + +err: API_END_RET(session, ret); +} + +/* + * __curjoin_close -- + * WT_CURSOR::close for join cursors. + */ +static int +__curjoin_close(WT_CURSOR *cursor) +{ + WT_CURSOR_JOIN *cjoin; + WT_CURSOR_JOIN_ENDPOINT *end; + WT_CURSOR_JOIN_ENTRY *entry; + WT_DECL_RET; + WT_SESSION_IMPL *session; + u_int i; + + cjoin = (WT_CURSOR_JOIN *)cursor; + + CURSOR_API_CALL(cursor, session, close, NULL); + + __wt_schema_release_table(session, cjoin->table); + /* These are owned by the table */ + cursor->internal_uri = NULL; + cursor->key_format = NULL; + if (cjoin->projection != NULL) { + __wt_free(session, cjoin->projection); + __wt_free(session, cursor->value_format); + } + + for (entry = cjoin->entries, i = 0; i < cjoin->entries_next; + entry++, i++) { + if (entry->main != NULL) + WT_TRET(entry->main->close(entry->main)); + if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM)) + WT_TRET(__wt_bloom_close(entry->bloom)); + for (end = &entry->ends[0]; + end < &entry->ends[entry->ends_next]; end++) { + F_CLR(end->cursor, WT_CURSTD_JOINED); + if (F_ISSET(end, WT_CURJOIN_END_OWN_KEY)) + __wt_free(session, end->key.data); + } + __wt_free(session, entry->ends); + } + + if (cjoin->iter != NULL) + WT_TRET(__curjoin_entry_iter_close(cjoin->iter)); + __wt_free(session, cjoin->entries); + WT_TRET(__wt_cursor_close(cursor)); + +err: API_END_RET(session, ret); +} + +/* + * __wt_curjoin_open -- + * Initialize a join cursor. + * + * Join cursors are read-only. + */ +int +__wt_curjoin_open(WT_SESSION_IMPL *session, + const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) +{ + WT_CURSOR_STATIC_INIT(iface, + __curjoin_get_key, /* get-key */ + __curjoin_get_value, /* get-value */ + __wt_cursor_notsup, /* set-key */ + __wt_cursor_notsup, /* set-value */ + __wt_cursor_notsup, /* compare */ + __wt_cursor_notsup, /* equals */ + __curjoin_next, /* next */ + __wt_cursor_notsup, /* prev */ + __curjoin_reset, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_notsup, /* reconfigure */ + __curjoin_close); /* close */ + WT_CURSOR *cursor; + WT_CURSOR_JOIN *cjoin; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_TABLE *table; + size_t size; + const char *tablename, *columns; + + WT_STATIC_ASSERT(offsetof(WT_CURSOR_JOIN, iface) == 0); + + if (!WT_PREFIX_SKIP(uri, "join:")) + return (EINVAL); + tablename = uri; + if (!WT_PREFIX_SKIP(tablename, "table:")) + return (EINVAL); + + columns = strchr(tablename, '('); + if (columns == NULL) + size = strlen(tablename); + else + size = WT_PTRDIFF(columns, tablename); + WT_RET(__wt_schema_get_table(session, tablename, size, 0, &table)); + + WT_RET(__wt_calloc_one(session, &cjoin)); + cursor = &cjoin->iface; + *cursor = iface; + cursor->session = &session->iface; + cursor->internal_uri = table->name; + cursor->key_format = table->key_format; + cursor->value_format = table->value_format; + cjoin->table = table; + + /* Handle projections. */ + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + if (columns != NULL) { + WT_ERR(__wt_struct_reformat(session, table, + columns, strlen(columns), NULL, 1, tmp)); + WT_ERR(__wt_strndup( + session, tmp->data, tmp->size, &cursor->value_format)); + WT_ERR(__wt_strdup(session, columns, &cjoin->projection)); + } + + if (owner != NULL) + WT_ERR(EINVAL); + + WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp)); + + if (0) { +err: WT_TRET(__curjoin_close(cursor)); + *cursorp = NULL; + } + + __wt_scr_free(session, &tmp); + return (ret); +} + +/* + * __wt_curjoin_join -- + * Add a new join to a join cursor. + */ +int +__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, + WT_INDEX *idx, WT_CURSOR *ref_cursor, uint32_t flags, uint32_t range, + uint64_t count, uint64_t bloom_bit_count, uint64_t bloom_hash_count) +{ + WT_CURSOR_JOIN_ENTRY *entry; + WT_DECL_RET; + WT_CURSOR_JOIN_ENDPOINT *end, *newend; + bool hasins, needbloom, range_eq; + u_int i, ins, nonbloom; + const char *raw_cfg[] = { WT_CONFIG_BASE( + session, WT_SESSION_open_cursor), "raw", NULL }; + char *main_uri; + size_t namesize, newsize; + + entry = NULL; + hasins = needbloom = false; + ins = 0; /* -Wuninitialized */ + main_uri = NULL; + nonbloom = 0; /* -Wuninitialized */ + namesize = strlen(cjoin->table->name); + + for (i = 0; i < cjoin->entries_next; i++) { + if (cjoin->entries[i].index == idx) { + entry = &cjoin->entries[i]; + break; + } + if (!needbloom && i > 0 && + !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) { + needbloom = true; + nonbloom = i; + } + } + if (entry == NULL) { + WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated, + cjoin->entries_next + 1, &cjoin->entries)); + if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) { + /* + * Reorder the list so that after the first entry, + * the Bloom filtered entries come next, followed by + * the non-Bloom entries. Once the Bloom filters + * are built, determining membership via Bloom is + * faster than without Bloom, so we can answer + * membership questions more quickly, and with less + * I/O, with the Bloom entries first. + */ + entry = &cjoin->entries[nonbloom]; + memmove(entry + 1, entry, + (cjoin->entries_next - nonbloom) * + sizeof(WT_CURSOR_JOIN_ENTRY)); + memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY)); + } + else + entry = &cjoin->entries[cjoin->entries_next]; + entry->index = idx; + entry->flags = flags; + entry->count = count; + entry->bloom_bit_count = bloom_bit_count; + entry->bloom_hash_count = bloom_hash_count; + ++cjoin->entries_next; + } else { + /* Merge the join into an existing entry for this index */ + if (count != 0 && entry->count != 0 && entry->count != count) { + __wt_errx(session, "count=%" PRIu64 " does not match " + "previous count=%" PRIu64 " for this index", + count, entry->count); + WT_ERR(EINVAL); + } + if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) != + F_ISSET(entry, WT_CURJOIN_ENTRY_BLOOM)) { + __wt_errx(session, "join has incompatible strategy " + "values for the same index"); + WT_ERR(EINVAL); + } + /* + * Check against other comparisons (we call them endpoints) + * already set up for this index. + * We allow either: + * - one or more "eq" (with disjunction) + * - exactly one "eq" (with conjunction) + * - exactly one of "gt" or "ge" (conjunction or disjunction) + * - exactly one of "lt" or "le" (conjunction or disjunction) + * - one of "gt"/"ge" along with one of "lt"/"le" + * (currently restricted to conjunction). + * + * Some other combinations, although expressible either do + * not make sense (X == 3 AND X == 5) or are reducible (X < + * 7 AND X < 9). Other specific cases of (X < 7 OR X > 15) + * or (X == 4 OR X > 15) make sense but we don't handle yet. + */ + for (i = 0; i < entry->ends_next; i++) { + end = &entry->ends[i]; + range_eq = (range == WT_CURJOIN_END_EQ); + if ((F_ISSET(end, WT_CURJOIN_END_GT) && + ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) || + (F_ISSET(end, WT_CURJOIN_END_LT) && + ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) || + (end->flags == WT_CURJOIN_END_EQ && + (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT)) + != 0)) { + __wt_errx(session, + "join has overlapping ranges"); + WT_ERR(EINVAL); + } + if (range == WT_CURJOIN_END_EQ && + end->flags == WT_CURJOIN_END_EQ && + !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) { + __wt_errx(session, + "compare=eq can only be combined " + "using operation=or"); + WT_ERR(EINVAL); + } + + /* + * Sort "gt"/"ge" to the front, followed by any number + * of "eq", and finally "lt"/"le". + */ + if (!hasins && + ((range & WT_CURJOIN_END_GT) != 0 || + (range == WT_CURJOIN_END_EQ && + !F_ISSET(end, WT_CURJOIN_END_GT)))) { + ins = i; + hasins = true; + } + } + /* All checks completed, merge any new configuration now */ + entry->count = count; + entry->bloom_bit_count = + WT_MAX(entry->bloom_bit_count, bloom_bit_count); + entry->bloom_hash_count = + WT_MAX(entry->bloom_hash_count, bloom_hash_count); + } + WT_ERR(__wt_realloc_def(session, &entry->ends_allocated, + entry->ends_next + 1, &entry->ends)); + if (!hasins) + ins = entry->ends_next; + newend = &entry->ends[ins]; + memmove(newend + 1, newend, + (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT)); + memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT)); + entry->ends_next++; + newend->cursor = ref_cursor; + F_SET(newend, range); + + /* Open the main file with a projection of the indexed columns. */ + if (entry->main == NULL && entry->index != NULL) { + namesize = strlen(cjoin->table->name); + newsize = namesize + entry->index->colconf.len + 1; + WT_ERR(__wt_calloc(session, 1, newsize, &main_uri)); + snprintf(main_uri, newsize, "%s%.*s", + cjoin->table->name, (int)entry->index->colconf.len, + entry->index->colconf.str); + WT_ERR(__wt_open_cursor(session, main_uri, + (WT_CURSOR *)cjoin, raw_cfg, &entry->main)); + } + +err: if (main_uri != NULL) + __wt_free(session, main_uri); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c index 81d028c165a..65d2dc81406 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_stat.c +++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c @@ -103,7 +103,7 @@ __curstat_get_value(WT_CURSOR *cursor, ...) va_list ap; size_t size; uint64_t *v; - const char **p; + const char *desc, **p; cst = (WT_CURSOR_STAT *)cursor; va_start(ap, cursor); @@ -111,15 +111,13 @@ __curstat_get_value(WT_CURSOR *cursor, ...) WT_CURSOR_NEEDVALUE(cursor); + WT_ERR(cst->stats_desc(cst, WT_STAT_KEY_OFFSET(cst), &desc)); if (F_ISSET(cursor, WT_CURSTD_RAW)) { WT_ERR(__wt_struct_size(session, &size, cursor->value_format, - cst->stats_desc(WT_STAT_KEY_OFFSET(cst)), - cst->pv.data, cst->v)); + desc, cst->pv.data, cst->v)); WT_ERR(__wt_buf_initsize(session, &cursor->value, size)); WT_ERR(__wt_struct_pack(session, cursor->value.mem, size, - cursor->value_format, - cst->stats_desc(WT_STAT_KEY_OFFSET(cst)), - cst->pv.data, cst->v)); + cursor->value_format, desc, cst->pv.data, cst->v)); item = va_arg(ap, WT_ITEM *); item->data = cursor->value.data; @@ -130,7 +128,7 @@ __curstat_get_value(WT_CURSOR *cursor, ...) * pointer support isn't documented, but it's a cheap test. */ if ((p = va_arg(ap, const char **)) != NULL) - *p = cst->stats_desc(WT_STAT_KEY_OFFSET(cst)); + *p = desc; if ((p = va_arg(ap, const char **)) != NULL) *p = cst->pv.data; if ((v = va_arg(ap, uint64_t *)) != NULL) @@ -201,7 +199,9 @@ __curstat_next(WT_CURSOR *cursor) /* Initialize on demand. */ if (cst->notinitialized) { WT_ERR(__wt_curstat_init( - session, cursor->internal_uri, cst->cfg, cst)); + session, cursor->internal_uri, NULL, cst->cfg, cst)); + if (cst->next_set != NULL) + WT_ERR((*cst->next_set)(session, cst, true, true)); cst->notinitialized = false; } @@ -211,15 +211,19 @@ __curstat_next(WT_CURSOR *cursor) cst->key = WT_STAT_KEY_MIN(cst); } else if (cst->key < WT_STAT_KEY_MAX(cst)) ++cst->key; - else { - F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + else if (cst->next_set != NULL) + WT_ERR((*cst->next_set)(session, cst, true, false)); + else WT_ERR(WT_NOTFOUND); - } + cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)]; WT_ERR(__curstat_print_value(session, cst->v, &cst->pv)); F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); -err: API_END_RET(session, ret); + if (0) { +err: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + } + API_END_RET(session, ret); } /* @@ -239,7 +243,9 @@ __curstat_prev(WT_CURSOR *cursor) /* Initialize on demand. */ if (cst->notinitialized) { WT_ERR(__wt_curstat_init( - session, cursor->internal_uri, cst->cfg, cst)); + session, cursor->internal_uri, NULL, cst->cfg, cst)); + if (cst->next_set != NULL) + WT_ERR((*cst->next_set)(session, cst, false, true)); cst->notinitialized = false; } @@ -249,16 +255,19 @@ __curstat_prev(WT_CURSOR *cursor) cst->key = WT_STAT_KEY_MAX(cst); } else if (cst->key > WT_STAT_KEY_MIN(cst)) --cst->key; - else { - F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + else if (cst->next_set != NULL) + WT_ERR((*cst->next_set)(session, cst, false, false)); + else WT_ERR(WT_NOTFOUND); - } cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)]; WT_ERR(__curstat_print_value(session, cst->v, &cst->pv)); F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); -err: API_END_RET(session, ret); + if (0) { +err: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + } + API_END_RET(session, ret); } /* @@ -301,7 +310,7 @@ __curstat_search(WT_CURSOR *cursor) /* Initialize on demand. */ if (cst->notinitialized) { WT_ERR(__wt_curstat_init( - session, cursor->internal_uri, cst->cfg, cst)); + session, cursor->internal_uri, NULL, cst->cfg, cst)); cst->notinitialized = false; } @@ -332,6 +341,7 @@ __curstat_close(WT_CURSOR *cursor) __curstat_free_config(session, cst); __wt_buf_free(session, &cst->pv); + __wt_free(session, cst->desc_buf); WT_ERR(__wt_cursor_close(cursor)); @@ -426,12 +436,102 @@ __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst) } /* + * __curstat_join_next_set -- + * Advance to another index used in a join to give another set of + * statistics. + */ +static int +__curstat_join_next_set(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst, + bool forw, bool init) +{ + WT_CURSOR_JOIN *cjoin; + WT_JOIN_STATS_GROUP *join_group; + ssize_t pos; + + WT_ASSERT(session, WT_STREQ(cst->iface.uri, "statistics:join")); + join_group = &cst->u.join_stats_group; + cjoin = join_group->join_cursor; + if (init) + pos = forw ? 0 : cjoin->entries_next - 1; + else + pos = join_group->join_cursor_entry + (forw ? 1 : -1); + if (pos < 0 || (size_t)pos >= cjoin->entries_next) + return (WT_NOTFOUND); + + join_group->join_cursor_entry = pos; + if (cjoin->entries[pos].index == NULL) { + WT_ASSERT(session, WT_PREFIX_MATCH(cjoin->iface.uri, "join:")); + join_group->desc_prefix = cjoin->iface.uri + 5; + } else + join_group->desc_prefix = cjoin->entries[pos].index->name; + join_group->join_stats = cjoin->entries[pos].stats; + if (!init) + cst->key = forw ? WT_STAT_KEY_MIN(cst) : WT_STAT_KEY_MAX(cst); + return (0); +} + +/* + * __curstat_join_desc -- + * Assemble the description field based on current index and statistic. + */ +static int +__curstat_join_desc(WT_CURSOR_STAT *cst, int slot, const char **resultp) +{ + size_t len; + const char *static_desc; + WT_JOIN_STATS_GROUP *sgrp; + WT_SESSION_IMPL *session; + + sgrp = &cst->u.join_stats_group; + session = (WT_SESSION_IMPL *)sgrp->join_cursor->iface.session; + WT_RET(__wt_stat_join_desc(cst, slot, &static_desc)); + len = strlen("join: ") + strlen(sgrp->desc_prefix) + + strlen(static_desc) + 1; + WT_RET(__wt_realloc(session, NULL, len, &cst->desc_buf)); + snprintf(cst->desc_buf, len, "join: %s%s", sgrp->desc_prefix, + static_desc); + *resultp = cst->desc_buf; + return (0); +} + +/* + * __curstat_join_init -- + * Initialize the statistics for a joined cursor. + */ +static int +__curstat_join_init(WT_SESSION_IMPL *session, + WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst) +{ + WT_CURSOR_JOIN *cjoin; + WT_DECL_RET; + + WT_UNUSED(cfg); + + if (curjoin == NULL && cst->u.join_stats_group.join_cursor != NULL) + curjoin = &cst->u.join_stats_group.join_cursor->iface; + if (curjoin == NULL || !WT_PREFIX_MATCH(curjoin->uri, "join:")) + WT_ERR_MSG(session, EINVAL, + "join cursor must be used with statistics:join"); + cjoin = (WT_CURSOR_JOIN *)curjoin; + memset(&cst->u.join_stats_group, 0, sizeof(WT_JOIN_STATS_GROUP)); + cst->u.join_stats_group.join_cursor = cjoin; + + cst->stats = (int64_t *)&cst->u.join_stats_group.join_stats; + cst->stats_base = WT_JOIN_STATS_BASE; + cst->stats_count = sizeof(WT_JOIN_STATS) / sizeof(int64_t); + cst->stats_desc = __curstat_join_desc; + cst->next_set = __curstat_join_next_set; + +err: return (ret); +} + +/* * __wt_curstat_init -- * Initialize a statistics cursor. */ int __wt_curstat_init(WT_SESSION_IMPL *session, - const char *uri, const char *cfg[], WT_CURSOR_STAT *cst) + const char *uri, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst) { const char *dsrc_uri; @@ -442,6 +542,10 @@ __wt_curstat_init(WT_SESSION_IMPL *session, dsrc_uri = uri + strlen("statistics:"); + if (WT_STREQ(dsrc_uri, "join")) + return ( + __curstat_join_init(session, curjoin, cfg, cst)); + if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:")) return ( __wt_curstat_colgroup_init(session, dsrc_uri, cfg, cst)); @@ -467,7 +571,7 @@ __wt_curstat_init(WT_SESSION_IMPL *session, */ int __wt_curstat_open(WT_SESSION_IMPL *session, - const char *uri, const char *cfg[], WT_CURSOR **cursorp) + const char *uri, WT_CURSOR *other, const char *cfg[], WT_CURSOR **cursorp) { WT_CONNECTION_IMPL *conn; WT_CURSOR_STATIC_INIT(iface, @@ -581,7 +685,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session, * objects like tables, we need to a valid set of statistics when before * the open returns. */ - WT_ERR(__wt_curstat_init(session, uri, cst->cfg, cst)); + WT_ERR(__wt_curstat_init(session, uri, other, cst->cfg, cst)); cst->notinitialized = false; /* The cursor isn't yet positioned. */ diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index 38359236b27..dca72a16ee5 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -186,34 +186,16 @@ __wt_curtable_get_key(WT_CURSOR *cursor, ...) int __wt_curtable_get_value(WT_CURSOR *cursor, ...) { - WT_CURSOR *primary; - WT_CURSOR_TABLE *ctable; WT_DECL_RET; - WT_ITEM *item; WT_SESSION_IMPL *session; va_list ap; - ctable = (WT_CURSOR_TABLE *)cursor; - primary = *ctable->cg_cursors; - CURSOR_API_CALL(cursor, session, get_value, NULL); - WT_CURSOR_NEEDVALUE(primary); - va_start(ap, cursor); - if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { - ret = __wt_schema_project_merge(session, - ctable->cg_cursors, ctable->plan, - cursor->value_format, &cursor->value); - if (ret == 0) { - item = va_arg(ap, WT_ITEM *); - item->data = cursor->value.data; - item->size = cursor->value.size; - } - } else - ret = __wt_schema_project_out(session, - ctable->cg_cursors, ctable->plan, ap); - va_end(ap); + JOINABLE_CURSOR_API_CALL(cursor, session, get_value, NULL); + WT_ERR(__wt_curtable_get_valuev(cursor, ap)); -err: API_END_RET(session, ret); +err: va_end(ap); + API_END_RET(session, ret); } /* @@ -264,7 +246,7 @@ __wt_curtable_set_value(WT_CURSOR *cursor, ...) u_int i; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_API_CALL(cursor, session, set_value, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, set_value, NULL); va_start(ap, cursor); if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON)) { @@ -332,7 +314,7 @@ __curtable_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) WT_DECL_RET; WT_SESSION_IMPL *session; - CURSOR_API_CALL(a, session, compare, NULL); + JOINABLE_CURSOR_API_CALL(a, session, compare, NULL); /* * Confirm both cursors refer to the same source and have keys, then @@ -362,7 +344,7 @@ __curtable_next(WT_CURSOR *cursor) WT_SESSION_IMPL *session; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_API_CALL(cursor, session, next, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, next, NULL); APPLY_CG(ctable, next); err: API_END_RET(session, ret); @@ -383,7 +365,7 @@ __curtable_next_random(WT_CURSOR *cursor) u_int i; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_API_CALL(cursor, session, next, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, next, NULL); cp = ctable->cg_cursors; /* Split out the first next, it retrieves the random record. */ @@ -414,7 +396,7 @@ __curtable_prev(WT_CURSOR *cursor) WT_SESSION_IMPL *session; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_API_CALL(cursor, session, prev, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, prev, NULL); APPLY_CG(ctable, prev); err: API_END_RET(session, ret); @@ -432,7 +414,7 @@ __curtable_reset(WT_CURSOR *cursor) WT_SESSION_IMPL *session; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_API_CALL(cursor, session, reset, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, reset, NULL); APPLY_CG(ctable, reset); err: API_END_RET(session, ret); @@ -450,7 +432,7 @@ __curtable_search(WT_CURSOR *cursor) WT_SESSION_IMPL *session; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_API_CALL(cursor, session, search, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, search, NULL); APPLY_CG(ctable, search); err: API_END_RET(session, ret); @@ -470,7 +452,7 @@ __curtable_search_near(WT_CURSOR *cursor, int *exact) u_int i; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_API_CALL(cursor, session, search_near, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, search_near, NULL); cp = ctable->cg_cursors; primary = *cp; WT_ERR(primary->search_near(primary, exact)); @@ -501,7 +483,7 @@ __curtable_insert(WT_CURSOR *cursor) u_int i; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL); + JOINABLE_CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL); WT_ERR(__curtable_open_indices(ctable)); /* @@ -520,29 +502,38 @@ __curtable_insert(WT_CURSOR *cursor) if (ctable->table->nindices > 0) F_CLR(primary, WT_CURSTD_OVERWRITE); ret = primary->insert(primary); - F_SET(primary, flag_orig); - if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { - /* - * !!! - * The insert failure clears these flags, but does not touch the - * items. We could make a copy each time for overwrite cursors, - * but for now we just reset the flags. - */ - F_SET(primary, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); - ret = __curtable_update(cursor); - goto err; - } - WT_ERR(ret); + /* + * !!! + * WT_CURSOR.insert clears the set internally/externally flags + * but doesn't touch the items. We could make a copy each time + * for overwrite cursors, but for now we just reset the flags. + */ + F_SET(primary, flag_orig | WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); - for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) { - (*cp)->recno = primary->recno; - WT_ERR((*cp)->insert(*cp)); + if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) + WT_ERR(__curtable_update(cursor)); + else { + WT_ERR(ret); + + for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) { + (*cp)->recno = primary->recno; + WT_ERR((*cp)->insert(*cp)); + } + + WT_ERR(__apply_idx(ctable, offsetof(WT_CURSOR, insert), false)); } - WT_ERR(__apply_idx(ctable, offsetof(WT_CURSOR, insert), false)); + /* + * WT_CURSOR.insert doesn't leave the cursor positioned, and the + * application may want to free the memory used to configure the + * insert; don't read that memory again (matching the underlying + * file object cursor insert semantics). + */ + F_CLR(primary, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); err: CURSOR_UPDATE_API_END(session, ret); + return (ret); } @@ -559,7 +550,7 @@ __curtable_update(WT_CURSOR *cursor) WT_SESSION_IMPL *session; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_UPDATE_API_CALL(cursor, session, update, NULL); + JOINABLE_CURSOR_UPDATE_API_CALL(cursor, session, update, NULL); WT_ERR(__curtable_open_indices(ctable)); /* @@ -610,7 +601,7 @@ __curtable_remove(WT_CURSOR *cursor) WT_SESSION_IMPL *session; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_REMOVE_API_CALL(cursor, session, NULL); + JOINABLE_CURSOR_REMOVE_API_CALL(cursor, session, NULL); WT_ERR(__curtable_open_indices(ctable)); /* Find the old record so it can be removed from indices */ @@ -650,6 +641,7 @@ __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop) /* Open any indices. */ WT_RET(__curtable_open_indices(ctable)); WT_RET(__wt_scr_alloc(session, 128, &key)); + WT_STAT_FAST_DATA_INCR(session, cursor_truncate); /* * Step through the cursor range, removing the index entries. @@ -721,7 +713,7 @@ __curtable_close(WT_CURSOR *cursor) u_int i; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_API_CALL(cursor, session, close, NULL); + JOINABLE_CURSOR_API_CALL(cursor, session, close, NULL); if (ctable->cg_cursors != NULL) for (i = 0, cp = ctable->cg_cursors; @@ -844,7 +836,7 @@ __curtable_open_indices(WT_CURSOR_TABLE *ctable) */ int __wt_curtable_open(WT_SESSION_IMPL *session, - const char *uri, const char *cfg[], WT_CURSOR **cursorp) + const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, __wt_curtable_get_key, /* get-key */ @@ -935,7 +927,7 @@ __wt_curtable_open(WT_SESSION_IMPL *session, } WT_ERR(__wt_cursor_init( - cursor, cursor->internal_uri, NULL, cfg, cursorp)); + cursor, cursor->internal_uri, owner, cfg, cursorp)); if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) WT_ERR(__wt_json_column_init(cursor, table->key_format, diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 18335d6fb5e..fa6c4f4313f 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -183,10 +183,10 @@ __evict_server(void *arg) session, &conn->dhandle_lock)) == EBUSY && !F_ISSET(cache, WT_CACHE_CLEAR_WALKS); spins++) { - if (spins < 1000) + if (spins < WT_THOUSAND) __wt_yield(); else - __wt_sleep(0, 1000); + __wt_sleep(0, WT_THOUSAND); } /* * If we gave up acquiring the lock, that indicates a @@ -210,7 +210,7 @@ __evict_server(void *arg) else { /* After being stuck for 5 minutes, give up. */ WT_ERR(__wt_epoch(session, &now)); - if (WT_TIMEDIFF(now, stuck_ts) / WT_BILLION > 300) { + if (WT_TIMEDIFF_SEC(now, stuck_ts) > 300) { __wt_errx(session, "Cache stuck for too long, giving up"); (void)__wt_cache_dump(session, NULL); @@ -601,7 +601,7 @@ __evict_pass(WT_SESSION_IMPL *session) * that can free space in cache, such as LSM discarding * handles. */ - __wt_sleep(0, 1000 * (uint64_t)loop); + __wt_sleep(0, WT_THOUSAND * (uint64_t)loop); if (loop == 100) { /* * Mark the cache as stuck if we need space @@ -992,10 +992,10 @@ retry: while (slot < max_entries && ret == 0) { session, &conn->dhandle_lock)) == EBUSY && !F_ISSET(cache, WT_CACHE_CLEAR_WALKS); spins++) { - if (spins < 1000) + if (spins < WT_THOUSAND) __wt_yield(); else - __wt_sleep(0, 1000); + __wt_sleep(0, WT_THOUSAND); } if (ret != 0) break; diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index e49098e90db..94c969fa5bb 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -179,9 +179,17 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * something is busy, be sure that the page still ends up * marked deleted. */ - if (ndeleted > pindex->entries / 10 && pindex->entries > 1 && - (ret = __wt_split_reverse(session, ref)) != EBUSY) - return (ret); + if (ndeleted > pindex->entries / 10 && pindex->entries > 1) { + if ((ret = __wt_split_reverse(session, ref)) == 0) + return (0); + WT_RET_BUSY_OK(ret); + + /* + * The child must be locked after a failed reverse + * split. + */ + WT_ASSERT(session, ref->state == WT_REF_LOCKED); + } } WT_PUBLISH(ref->state, WT_REF_DELETED); diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index 74c58845c43..4821b450f9e 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -116,11 +116,23 @@ API_CALL_NOCONF(s, WT_CURSOR, n, cur, \ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle) +#define JOINABLE_CURSOR_CALL_CHECK(cur) \ + if (F_ISSET(cur, WT_CURSTD_JOINED)) \ + WT_ERR(__wt_curindex_joined(cur)) + +#define JOINABLE_CURSOR_API_CALL(cur, s, n, bt) \ + CURSOR_API_CALL(cur, s, n, bt); \ + JOINABLE_CURSOR_CALL_CHECK(cur) + #define CURSOR_REMOVE_API_CALL(cur, s, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ TXN_API_CALL_NOCONF(s, WT_CURSOR, remove, cur, \ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); +#define JOINABLE_CURSOR_REMOVE_API_CALL(cur, s, bt) \ + CURSOR_REMOVE_API_CALL(cur, s, bt); \ + JOINABLE_CURSOR_CALL_CHECK(cur) + #define CURSOR_UPDATE_API_CALL(cur, s, n, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ TXN_API_CALL_NOCONF(s, WT_CURSOR, n, cur, \ @@ -128,6 +140,10 @@ if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && __wt_cache_full(s)) \ WT_ERR(WT_CACHE_FULL); +#define JOINABLE_CURSOR_UPDATE_API_CALL(cur, s, n, bt) \ + CURSOR_UPDATE_API_CALL(cur, s, n, bt); \ + JOINABLE_CURSOR_CALL_CHECK(cur) + #define CURSOR_UPDATE_API_END(s, ret) \ TXN_API_END(s, ret) diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 02819237c13..ae29dc68003 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -437,24 +437,10 @@ struct __wt_page { uint32_t deleted_entries; WT_REF **index; } * volatile __index; /* Collated children */ - - /* - * When splitting to deepen the tree, track the number - * of entries in the newly created parent, and how many - * subsequent splits follow the initial set of entries. - * If future splits into the page are generally after - * the initial set of items, perform future deepening - * splits in this page to optimize for an append-style - * workload. - */ - uint32_t deepen_split_append; - uint32_t deepen_split_last; } intl; #undef pg_intl_recno #define pg_intl_recno u.intl.recno #define pg_intl_parent_ref u.intl.parent_ref -#define pg_intl_deepen_split_append u.intl.deepen_split_append -#define pg_intl_deepen_split_last u.intl.deepen_split_last /* * Macros to copy/set the index because the name is obscured to ensure @@ -581,7 +567,8 @@ struct __wt_page { #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ #define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */ #define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ -#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */ +#define WT_PAGE_SPLIT_BLOCK 0x40 /* Split blocking eviction and splits */ +#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ uint8_t unused[2]; /* Unused padding */ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 23e212eb772..a92d52e784a 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1101,16 +1101,17 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, return (false); /* - * If the tree was deepened, there's a requirement that newly created - * internal pages not be evicted until all threads are known to have - * exited the original page index array, because evicting an internal - * page discards its WT_REF array, and a thread traversing the original - * page index array might see a freed WT_REF. During the split we set - * a transaction value, once that's globally visible, we know we can - * evict the created page. + * If a split created new internal pages, those newly created internal + * pages cannot be evicted until all threads are known to have exited + * the original parent page's index, because evicting an internal page + * discards its WT_REF array, and a thread traversing the original + * parent page index might see a freed WT_REF. During the split we set + * a transaction value, we can evict the created page as soon as that + * transaction value is globally visible. */ if (check_splits && WT_PAGE_IS_INTERNAL(page) && - !__wt_txn_visible_all(session, mod->mod_split_txn)) + (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK) || + !__wt_txn_visible_all(session, mod->mod_split_txn))) return (false); /* @@ -1374,3 +1375,34 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) return (child->memory_footprint > maxsize); } + +/* + * __wt_split_intl_race -- + * Return if we raced with an internal page split when descending the tree. + */ +static inline bool +__wt_split_intl_race( + WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE_INDEX *saved_pindex) +{ + WT_PAGE_INDEX *pindex; + + /* + * A place to hang this comment... + * + * There's a page-split race when we walk the tree: if we're splitting + * an internal page into its parent, we update the parent's page index + * and then update the page being split, and it's not an atomic update. + * A thread could read the parent page's original page index, and then + * read the page's replacement index. Because internal page splits work + * by replacing the original page with the initial part of the original + * page, the result of this race is we will have a key that's past the + * end of the current page, and the parent's page index will have moved. + * + * It's also possible a thread could read the parent page's replacement + * page index, and then read the page's original index. Because internal + * splits work by truncating the original page, the original page's old + * content is compatible, this isn't a problem and we ignore this race. + */ + WT_INTL_INDEX_GET(session, parent, pindex); + return (pindex != saved_pindex); +} diff --git a/src/third_party/wiredtiger/src/include/config.h b/src/third_party/wiredtiger/src/include/config.h index 408639ab2a9..e836abaccba 100644 --- a/src/third_party/wiredtiger/src/include/config.h +++ b/src/third_party/wiredtiger/src/include/config.h @@ -68,28 +68,29 @@ struct __wt_config_parser_impl { #define WT_CONFIG_ENTRY_WT_SESSION_compact 16 #define WT_CONFIG_ENTRY_WT_SESSION_create 17 #define WT_CONFIG_ENTRY_WT_SESSION_drop 18 -#define WT_CONFIG_ENTRY_WT_SESSION_log_flush 19 -#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 20 -#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 21 -#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 22 -#define WT_CONFIG_ENTRY_WT_SESSION_rename 23 -#define WT_CONFIG_ENTRY_WT_SESSION_reset 24 -#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 25 -#define WT_CONFIG_ENTRY_WT_SESSION_salvage 26 -#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 27 -#define WT_CONFIG_ENTRY_WT_SESSION_strerror 28 -#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 29 -#define WT_CONFIG_ENTRY_WT_SESSION_truncate 30 -#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 31 -#define WT_CONFIG_ENTRY_WT_SESSION_verify 32 -#define WT_CONFIG_ENTRY_colgroup_meta 33 -#define WT_CONFIG_ENTRY_file_meta 34 -#define WT_CONFIG_ENTRY_index_meta 35 -#define WT_CONFIG_ENTRY_table_meta 36 -#define WT_CONFIG_ENTRY_wiredtiger_open 37 -#define WT_CONFIG_ENTRY_wiredtiger_open_all 38 -#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 39 -#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 40 +#define WT_CONFIG_ENTRY_WT_SESSION_join 19 +#define WT_CONFIG_ENTRY_WT_SESSION_log_flush 20 +#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 21 +#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 22 +#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 23 +#define WT_CONFIG_ENTRY_WT_SESSION_rename 24 +#define WT_CONFIG_ENTRY_WT_SESSION_reset 25 +#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 26 +#define WT_CONFIG_ENTRY_WT_SESSION_salvage 27 +#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 28 +#define WT_CONFIG_ENTRY_WT_SESSION_strerror 29 +#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 30 +#define WT_CONFIG_ENTRY_WT_SESSION_truncate 31 +#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 32 +#define WT_CONFIG_ENTRY_WT_SESSION_verify 33 +#define WT_CONFIG_ENTRY_colgroup_meta 34 +#define WT_CONFIG_ENTRY_file_meta 35 +#define WT_CONFIG_ENTRY_index_meta 36 +#define WT_CONFIG_ENTRY_table_meta 37 +#define WT_CONFIG_ENTRY_wiredtiger_open 38 +#define WT_CONFIG_ENTRY_wiredtiger_open_all 39 +#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 40 +#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 41 /* * configuration section: END * DO NOT EDIT: automatically built by dist/flags.py. diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 35a83d7c50f..3e8d3705373 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -365,13 +365,13 @@ struct __wt_connection_impl { WT_SESSION_IMPL *meta_ckpt_session;/* Metadata checkpoint session */ - WT_SESSION_IMPL *sweep_session; /* Handle sweep session */ - wt_thread_t sweep_tid; /* Handle sweep thread */ - int sweep_tid_set; /* Handle sweep thread set */ - WT_CONDVAR *sweep_cond; /* Handle sweep wait mutex */ - time_t sweep_idle_time;/* Handle sweep idle time */ - time_t sweep_interval;/* Handle sweep interval */ - u_int sweep_handles_min;/* Handle sweep minimum open */ + WT_SESSION_IMPL *sweep_session; /* Handle sweep session */ + wt_thread_t sweep_tid; /* Handle sweep thread */ + int sweep_tid_set; /* Handle sweep thread set */ + WT_CONDVAR *sweep_cond; /* Handle sweep wait mutex */ + uint64_t sweep_idle_time; /* Handle sweep idle time */ + uint64_t sweep_interval; /* Handle sweep interval */ + uint64_t sweep_handles_min;/* Handle sweep minimum open */ /* * Shared lookaside lock, session and cursor, used by threads accessing diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 1cbe76216b1..23d3f3745db 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -264,6 +264,66 @@ struct __wt_cursor_index { uint8_t *cg_needvalue; }; +struct __wt_cursor_join_iter { + WT_SESSION_IMPL *session; + WT_CURSOR_JOIN *cjoin; + WT_CURSOR_JOIN_ENTRY *entry; + WT_CURSOR *cursor; + WT_ITEM *curkey; + bool advance; +}; + +struct __wt_cursor_join_endpoint { + WT_ITEM key; + uint8_t recno_buf[10]; /* holds packed recno */ + WT_CURSOR *cursor; + +#define WT_CURJOIN_END_LT 0x01 /* include values < cursor */ +#define WT_CURJOIN_END_EQ 0x02 /* include values == cursor */ +#define WT_CURJOIN_END_GT 0x04 /* include values > cursor */ +#define WT_CURJOIN_END_GE (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ) +#define WT_CURJOIN_END_LE (WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ) +#define WT_CURJOIN_END_OWN_KEY 0x08 /* must free key's data */ + uint8_t flags; /* range for this endpoint */ +}; + +struct __wt_cursor_join_entry { + WT_INDEX *index; + WT_CURSOR *main; /* raw main table cursor */ + WT_BLOOM *bloom; /* Bloom filter handle */ + uint64_t bloom_bit_count; /* bits per item in bloom */ + uint64_t bloom_hash_count; /* hash functions in bloom */ + uint64_t count; /* approx number of matches */ + +#define WT_CURJOIN_ENTRY_BLOOM 0x01 /* use a bloom filter */ +#define WT_CURJOIN_ENTRY_DISJUNCTION 0x02 /* endpoints are or-ed */ +#define WT_CURJOIN_ENTRY_OWN_BLOOM 0x04 /* this entry owns the bloom */ + uint8_t flags; + + WT_CURSOR_JOIN_ENDPOINT *ends; /* reference endpoints */ + size_t ends_allocated; + size_t ends_next; + + WT_JOIN_STATS stats; /* Join statistics */ +}; + +struct __wt_cursor_join { + WT_CURSOR iface; + + WT_TABLE *table; + const char *projection; + WT_CURSOR_JOIN_ITER *iter; + WT_CURSOR_JOIN_ENTRY *entries; + size_t entries_allocated; + u_int entries_next; + uint8_t recno_buf[10]; /* holds packed recno */ + +#define WT_CURJOIN_ERROR 0x01 /* Error in initialization */ +#define WT_CURJOIN_INITIALIZED 0x02 /* Successful initialization */ +#define WT_CURJOIN_SKIP_FIRST_LEFT 0x04 /* First check not needed */ + uint8_t flags; +}; + struct __wt_cursor_json { char *key_buf; /* JSON formatted string */ char *value_buf; /* JSON formatted string */ @@ -298,6 +358,13 @@ struct __wt_cursor_metadata { uint32_t flags; }; +struct __wt_join_stats_group { + const char *desc_prefix; /* Prefix appears before description */ + WT_CURSOR_JOIN *join_cursor; + ssize_t join_cursor_entry; /* Position in entries */ + WT_JOIN_STATS join_stats; +}; + struct __wt_cursor_stat { WT_CURSOR iface; @@ -307,14 +374,19 @@ struct __wt_cursor_stat { int64_t *stats; /* Statistics */ int stats_base; /* Base statistics value */ int stats_count; /* Count of statistics values */ - const char *(*stats_desc)(int); /* Statistics descriptions */ + int (*stats_desc)(WT_CURSOR_STAT *, int, const char **); + /* Statistics descriptions */ + int (*next_set)(WT_SESSION_IMPL *, WT_CURSOR_STAT *, bool, + bool); /* Advance to next set */ union { /* Copies of the statistics */ WT_DSRC_STATS dsrc_stats; WT_CONNECTION_STATS conn_stats; + WT_JOIN_STATS_GROUP join_stats_group; } u; const char **cfg; /* Original cursor configuration */ + char *desc_buf; /* Saved description string */ int key; /* Current stats key */ uint64_t v; /* Current stats value */ diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index c6ce04cab6f..9dd280534b4 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -139,6 +139,70 @@ __curfile_leave(WT_CURSOR_BTREE *cbt) } /* + * __wt_curindex_get_valuev -- + * Internal implementation of WT_CURSOR->get_value for index cursors + */ +static inline int +__wt_curindex_get_valuev(WT_CURSOR *cursor, va_list ap) +{ + WT_CURSOR_INDEX *cindex; + WT_DECL_RET; + WT_ITEM *item; + WT_SESSION_IMPL *session; + + cindex = (WT_CURSOR_INDEX *)cursor; + session = (WT_SESSION_IMPL *)cursor->session; + WT_CURSOR_NEEDVALUE(cursor); + + if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { + ret = __wt_schema_project_merge(session, + cindex->cg_cursors, cindex->value_plan, + cursor->value_format, &cursor->value); + if (ret == 0) { + item = va_arg(ap, WT_ITEM *); + item->data = cursor->value.data; + item->size = cursor->value.size; + } + } else + ret = __wt_schema_project_out(session, + cindex->cg_cursors, cindex->value_plan, ap); +err: return (ret); +} + +/* + * __wt_curtable_get_valuev -- + * Internal implementation of WT_CURSOR->get_value for table cursors. + */ +static inline int +__wt_curtable_get_valuev(WT_CURSOR *cursor, va_list ap) +{ + WT_CURSOR *primary; + WT_CURSOR_TABLE *ctable; + WT_DECL_RET; + WT_ITEM *item; + WT_SESSION_IMPL *session; + + ctable = (WT_CURSOR_TABLE *)cursor; + session = (WT_SESSION_IMPL *)cursor->session; + primary = *ctable->cg_cursors; + WT_CURSOR_NEEDVALUE(primary); + + if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { + ret = __wt_schema_project_merge(session, + ctable->cg_cursors, ctable->plan, + cursor->value_format, &cursor->value); + if (ret == 0) { + item = va_arg(ap, WT_ITEM *); + item->data = cursor->value.data; + item->size = cursor->value.size; + } + } else + ret = __wt_schema_project_out(session, + ctable->cg_cursors, ctable->plan, ap); +err: return (ret); +} + +/* * __wt_cursor_dhandle_incr_use -- * Increment the in-use counter in cursor's data source. */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 032b94b7040..743a3c3ac31 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -83,6 +83,8 @@ extern int __wt_bloom_finalize(WT_BLOOM *bloom); extern int __wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash); extern int __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash); extern int __wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key); +extern int __wt_bloom_inmem_get(WT_BLOOM *bloom, WT_ITEM *key); +extern int __wt_bloom_intersection(WT_BLOOM *bloom, WT_BLOOM *other); extern int __wt_bloom_close(WT_BLOOM *bloom); extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config); extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]); @@ -155,9 +157,9 @@ extern void __wt_split_stash_discard(WT_SESSION_IMPL *session); extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp); extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref); -extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst); extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op); extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]); @@ -274,7 +276,10 @@ extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **c extern int __wt_curfile_update_check(WT_CURSOR *cursor); extern int __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], bool bulk, bool bitmap, WT_CURSOR **cursorp); extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_curindex_joined(WT_CURSOR *cursor); extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_curjoin_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx, WT_CURSOR *ref_cursor, uint32_t flags, uint32_t range, uint64_t count, uint64_t bloom_bit_count, uint64_t bloom_hash_count); extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, bool iskey, va_list ap); extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor); extern size_t __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode); @@ -287,8 +292,8 @@ extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst); -extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst); -extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst); +extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *other, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_cursor_notsup(WT_CURSOR *cursor); extern int __wt_cursor_noop(WT_CURSOR *cursor); extern void __wt_cursor_set_notsup(WT_CURSOR *cursor); @@ -316,7 +321,7 @@ extern int __wt_curtable_get_value(WT_CURSOR *cursor, ...); extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...); extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...); extern int __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop); -extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop); extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); @@ -523,7 +528,6 @@ extern uint64_t __wt_strtouq(const char *nptr, char **endptr, int base); extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid); extern void __wt_thread_id(char *buf, size_t buflen); -extern int __wt_seconds(WT_SESSION_IMPL *session, time_t *timep); extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp); extern void __wt_yield(void); extern int __wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *buffer, size_t size, const char *fmt, ...); @@ -534,6 +538,8 @@ extern int __wt_struct_confchk(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *v); extern int __wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...); extern int __wt_struct_pack(WT_SESSION_IMPL *session, void *buffer, size_t size, const char *fmt, ...); extern int __wt_struct_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, ...); +extern int __wt_struct_unpack_size(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, size_t *resultp); +extern int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf, void **reallocp); extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell); extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size); @@ -675,19 +681,24 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp extern void __wt_scr_discard(WT_SESSION_IMPL *session); extern void *__wt_ext_scr_alloc( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size); extern void __wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p); -extern const char *__wt_stat_dsrc_desc(int slot); +extern int __wt_stat_dsrc_desc(WT_CURSOR_STAT *cst, int slot, const char **p); extern void __wt_stat_dsrc_init_single(WT_DSRC_STATS *stats); extern void __wt_stat_dsrc_init(WT_DATA_HANDLE *handle); extern void __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats); extern void __wt_stat_dsrc_clear_all(WT_DSRC_STATS **stats); extern void __wt_stat_dsrc_aggregate_single( WT_DSRC_STATS *from, WT_DSRC_STATS *to); extern void __wt_stat_dsrc_aggregate( WT_DSRC_STATS **from, WT_DSRC_STATS *to); -extern const char *__wt_stat_connection_desc(int slot); +extern int __wt_stat_connection_desc(WT_CURSOR_STAT *cst, int slot, const char **p); extern void __wt_stat_connection_init_single(WT_CONNECTION_STATS *stats); extern void __wt_stat_connection_init(WT_CONNECTION_IMPL *handle); extern void __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats); extern void __wt_stat_connection_clear_all(WT_CONNECTION_STATS **stats); extern void __wt_stat_connection_aggregate( WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *to); +extern int __wt_stat_join_desc(WT_CURSOR_STAT *cst, int slot, const char **p); +extern void __wt_stat_join_init_single(WT_JOIN_STATS *stats); +extern void __wt_stat_join_clear_single(WT_JOIN_STATS *stats); +extern void __wt_stat_join_clear_all(WT_JOIN_STATS **stats); +extern void __wt_stat_join_aggregate( WT_JOIN_STATS **from, WT_JOIN_STATS *to); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session); extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 95fe18b9ecb..064349125cc 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -36,6 +36,7 @@ #define WT_LOG_DSYNC 0x00000002 #define WT_LOG_FLUSH 0x00000004 #define WT_LOG_FSYNC 0x00000008 +#define WT_LOG_SYNC_ENABLED 0x00000010 #define WT_READ_CACHE 0x00000001 #define WT_READ_COMPACT 0x00000002 #define WT_READ_NO_EMPTY 0x00000004 diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index eca77214b47..e542baec642 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -13,6 +13,7 @@ #define WT_UNUSED(var) (void)(var) /* Basic constants. */ +#define WT_THOUSAND (1000) #define WT_MILLION (1000000) #define WT_BILLION (1000000000) diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index 80096d0cf72..75068706b70 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -30,6 +30,22 @@ __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp) } /* + * __wt_seconds -- + * Return the seconds since the Epoch. + */ +static inline int +__wt_seconds(WT_SESSION_IMPL *session, time_t *timep) +{ + struct timespec t; + + WT_RET(__wt_epoch(session, &t)); + + *timep = t.tv_sec; + + return (0); +} + +/* * __wt_verbose -- * Verbose message. */ diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i index 54a9cc6f9fd..7eb042dd79f 100644 --- a/src/third_party/wiredtiger/src/include/mutex.i +++ b/src/third_party/wiredtiger/src/include/mutex.i @@ -18,7 +18,7 @@ /* Default to spinning 1000 times before yielding. */ #ifndef WT_SPIN_COUNT -#define WT_SPIN_COUNT 1000 +#define WT_SPIN_COUNT WT_THOUSAND #endif /* @@ -300,7 +300,7 @@ __wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) * situation happens if there are more threads than cores in the * system and we're thrashing on shared resources. */ - if (++pause_cnt < 1000) + if (++pause_cnt < WT_THOUSAND) WT_PAUSE(); else __wt_sleep(0, 10); @@ -329,7 +329,7 @@ __wt_fair_unlock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) #ifdef HAVE_DIAGNOSTIC /* * __wt_fair_islocked -- - * Test whether the lock is currently held + * Test whether the lock is currently held. */ static inline bool __wt_fair_islocked(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h index 4ba588111b8..d135fd9eb1f 100644 --- a/src/third_party/wiredtiger/src/include/os.h +++ b/src/third_party/wiredtiger/src/include/os.h @@ -65,9 +65,16 @@ typedef enum { } \ } while (0) -#define WT_TIMEDIFF(end, begin) \ - (1000000000 * (uint64_t)((end).tv_sec - (begin).tv_sec) + \ +#define WT_TIMEDIFF_NS(end, begin) \ + (WT_BILLION * (uint64_t)((end).tv_sec - (begin).tv_sec) + \ (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec) +#define WT_TIMEDIFF_US(end, begin) \ + (WT_TIMEDIFF_NS((end), (begin)) / WT_THOUSAND) +#define WT_TIMEDIFF_MS(end, begin) \ + (WT_TIMEDIFF_NS((end), (begin)) / WT_MILLION) +#define WT_TIMEDIFF_SEC(end, begin) \ + (WT_TIMEDIFF_NS((end), (begin)) / WT_BILLION) + #define WT_TIMECMP(t1, t2) \ ((t1).tv_sec < (t2).tv_sec ? -1 : \ (t1).tv_sec == (t2.tv_sec) ? \ diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 1ebe253e5db..dfe7ee5c6cd 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -272,7 +272,8 @@ struct __wt_connection_stats { int64_t cache_eviction_server_evicting; int64_t cache_eviction_server_not_evicting; int64_t cache_eviction_slow; - int64_t cache_eviction_split; + int64_t cache_eviction_split_internal; + int64_t cache_eviction_split_leaf; int64_t cache_eviction_walk; int64_t cache_eviction_worker_evicting; int64_t cache_inmem_split; @@ -297,6 +298,7 @@ struct __wt_connection_stats { int64_t cursor_restart; int64_t cursor_search; int64_t cursor_search_near; + int64_t cursor_truncate; int64_t cursor_update; int64_t dh_conn_handle_count; int64_t dh_session_handles; @@ -358,6 +360,8 @@ struct __wt_connection_stats { int64_t page_read_blocked; int64_t page_sleep; int64_t read_io; + int64_t rec_page_delete; + int64_t rec_page_delete_fast; int64_t rec_pages; int64_t rec_pages_eviction; int64_t rec_split_stashed_bytes; @@ -378,7 +382,10 @@ struct __wt_connection_stats { int64_t txn_fail_cache; int64_t txn_pinned_checkpoint_range; int64_t txn_pinned_range; + int64_t txn_pinned_snapshot_range; int64_t txn_rollback; + int64_t txn_snapshots_created; + int64_t txn_snapshots_dropped; int64_t txn_sync; int64_t write_io; }; @@ -432,7 +439,8 @@ struct __wt_dsrc_stats { int64_t cache_eviction_fail; int64_t cache_eviction_hazard; int64_t cache_eviction_internal; - int64_t cache_eviction_split; + int64_t cache_eviction_split_internal; + int64_t cache_eviction_split_leaf; int64_t cache_inmem_split; int64_t cache_inmem_splittable; int64_t cache_overflow_value; @@ -461,6 +469,7 @@ struct __wt_dsrc_stats { int64_t cursor_restart; int64_t cursor_search; int64_t cursor_search_near; + int64_t cursor_truncate; int64_t cursor_update; int64_t cursor_update_bytes; int64_t lsm_checkpoint_throttle; @@ -476,6 +485,7 @@ struct __wt_dsrc_stats { int64_t rec_overflow_key_leaf; int64_t rec_overflow_value; int64_t rec_page_delete; + int64_t rec_page_delete_fast; int64_t rec_page_match; int64_t rec_pages; int64_t rec_pages_eviction; @@ -486,4 +496,14 @@ struct __wt_dsrc_stats { int64_t txn_update_conflict; }; +/* + * Statistics entries for join cursors. + */ +#define WT_JOIN_STATS_BASE 3000 +struct __wt_join_stats { + int64_t accesses; + int64_t actual_count; + int64_t bloom_false_positive; +}; + /* Statistics section: END */ diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 044611d655e..08f73386090 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -574,11 +574,12 @@ struct __wt_cursor { #define WT_CURSTD_KEY_EXT 0x0020 /* Key points out of the tree. */ #define WT_CURSTD_KEY_INT 0x0040 /* Key points into the tree. */ #define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT) -#define WT_CURSTD_OPEN 0x0080 -#define WT_CURSTD_OVERWRITE 0x0100 -#define WT_CURSTD_RAW 0x0200 -#define WT_CURSTD_VALUE_EXT 0x0400 /* Value points out of the tree. */ -#define WT_CURSTD_VALUE_INT 0x0800 /* Value points into the tree. */ +#define WT_CURSTD_JOINED 0x0080 +#define WT_CURSTD_OPEN 0x0100 +#define WT_CURSTD_OVERWRITE 0x0200 +#define WT_CURSTD_RAW 0x0400 +#define WT_CURSTD_VALUE_EXT 0x0800 /* Value points out of the tree. */ +#define WT_CURSTD_VALUE_INT 0x1000 /* Value points into the tree. */ #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) uint32_t flags; #endif @@ -1236,6 +1237,61 @@ struct __wt_session { const char *name, const char *config); /*! + * Join a join cursor with a reference cursor. + * + * @snippet ex_schema.c Join cursors + * + * @param session the session handle + * @param join_cursor a cursor that was opened using a + * \c "join:" URI. It may not have been used for any operations + * other than other join calls. + * @param ref_cursor either an index cursor having the same base table + * as the join_cursor, or a table cursor open on the same base table. + * The ref_cursor must be positioned. + * + * The ref_cursor limits the results seen by iterating the + * join_cursor to table items referred to by the key in this + * index. The set of keys referred to is modified by the compare + * config option. + * + * Multiple join calls builds up a set of ref_cursors, and the + * results seen by iteration are the intersection of the cursor + * ranges participating in the join. + * + * After the join call completes, the ref_cursor cursor may not be + * used for any purpose other than get_key and get_value. Any other + * cursor method (e.g. next, prev,close) will fail. When the + * join_cursor is closed, the ref_cursor is made available for + * general use again. The application should close ref_cursor when + * finished with it, although not before the join_cursor is closed. + * + * @configstart{WT_SESSION.join, see dist/api_data.py} + * @config{bloom_bit_count, the number of bits used per item for the + * bloom filter., an integer between 2 and 1000; default \c 16.} + * @config{bloom_hash_count, the number of hash values per item for the + * bloom filter., an integer between 2 and 100; default \c 8.} + * @config{compare, modifies the set of items to be returned so that the + * index key satisfies the given comparison relative to the key set in + * this cursor., a string\, chosen from the following options: \c "eq"\, + * \c "ge"\, \c "gt"\, \c "le"\, \c "lt"; default \c "eq".} + * @config{count, set an approximate count of the elements that would be + * included in the join. This is used in sizing the bloom filter\, and + * also influences evaluation order for cursors in the join. When the + * count is equal for multiple bloom filters in a composition of joins\, + * the bloom filter may be shared., an integer; default \c .} + * @config{strategy, when set to bloom\, a bloom filter is created and + * populated for this index. This has an up front cost but may reduce + * the number of accesses to the main table when iterating the joined + * cursor. The bloom setting requires that count be set., a string\, + * chosen from the following options: \c "bloom"\, \c "default"; default + * empty.} + * @configend + * @errors + */ + int __F(join)(WT_SESSION *session, WT_CURSOR *join_cursor, + WT_CURSOR *ref_cursor, const char *config); + + /*! * Flush the log. * * @param session the session handle @@ -2328,10 +2384,13 @@ struct __wt_connection { * string\, chosen from the following options: \c "dsync"\, \c "fsync"\, \c * "none"; default \c fsync.} * @config{ ),,} + * @config{use_environment, use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME + * environment variables if the process is not running with special privileges. + * See @ref home for more information., a boolean flag; default \c true.} * @config{use_environment_priv, use the \c WIREDTIGER_CONFIG and \c - * WIREDTIGER_HOME environment variables regardless of whether or not the - * process is running with special privileges. See @ref home for more - * information., a boolean flag; default \c false.} + * WIREDTIGER_HOME environment variables even if the process is running with + * special privileges. See @ref home for more information., a boolean flag; + * default \c false.} * @config{verbose, enable messages for various events. Only available if * WiredTiger is configured with --enable-verbose. Options are given as a * list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with @@ -3710,224 +3769,239 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1043 /*! cache: eviction server unable to reach eviction goal */ #define WT_STAT_CONN_CACHE_EVICTION_SLOW 1044 -/*! cache: pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1045 +/*! cache: internal pages split during eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1045 +/*! cache: leaf pages split during eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1046 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1046 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1047 /*! cache: eviction worker thread evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1047 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1048 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1048 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1049 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1049 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1050 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1050 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1051 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1051 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1052 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1052 +#define WT_STAT_CONN_CACHE_OVERHEAD 1053 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1053 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1054 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1054 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1055 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1055 +#define WT_STAT_CONN_CACHE_READ 1056 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1056 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1057 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1057 +#define WT_STAT_CONN_CACHE_WRITE 1058 /*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1058 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1059 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1059 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1060 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1060 +#define WT_STAT_CONN_COND_WAIT 1061 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1061 +#define WT_STAT_CONN_CURSOR_CREATE 1062 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1062 +#define WT_STAT_CONN_CURSOR_INSERT 1063 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1063 +#define WT_STAT_CONN_CURSOR_NEXT 1064 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1064 +#define WT_STAT_CONN_CURSOR_PREV 1065 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1065 +#define WT_STAT_CONN_CURSOR_REMOVE 1066 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1066 +#define WT_STAT_CONN_CURSOR_RESET 1067 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1067 +#define WT_STAT_CONN_CURSOR_RESTART 1068 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1068 +#define WT_STAT_CONN_CURSOR_SEARCH 1069 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1069 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1070 +/*! cursor: truncate calls */ +#define WT_STAT_CONN_CURSOR_TRUNCATE 1071 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1070 +#define WT_STAT_CONN_CURSOR_UPDATE 1072 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1071 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1073 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1072 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1074 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1073 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1075 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1074 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1076 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1075 +#define WT_STAT_CONN_DH_SWEEP_REF 1077 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1076 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1078 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1077 +#define WT_STAT_CONN_DH_SWEEP_TOD 1079 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1078 +#define WT_STAT_CONN_DH_SWEEPS 1080 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1079 +#define WT_STAT_CONN_FILE_OPEN 1081 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1080 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1082 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1081 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1083 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1082 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1084 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1083 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1085 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1084 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1086 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1085 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1087 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1086 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1088 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1087 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1089 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1088 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1090 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1089 +#define WT_STAT_CONN_LOG_FLUSH 1091 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1090 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1092 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1091 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1093 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1092 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1094 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1093 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1095 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1094 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1096 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1095 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1097 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1096 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1098 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1097 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1099 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1098 +#define WT_STAT_CONN_LOG_SCANS 1100 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1099 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1101 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1100 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1102 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1101 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1103 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1102 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1104 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1103 +#define WT_STAT_CONN_LOG_SLOT_RACES 1105 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1104 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1106 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1105 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1107 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1106 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1108 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1107 +#define WT_STAT_CONN_LOG_SYNC 1109 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1108 +#define WT_STAT_CONN_LOG_SYNC_DIR 1110 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1109 +#define WT_STAT_CONN_LOG_WRITE_LSN 1111 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1110 +#define WT_STAT_CONN_LOG_WRITES 1112 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1111 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1113 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1112 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1114 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1113 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1115 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1114 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1116 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1115 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1117 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1116 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1118 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1117 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1119 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1118 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1120 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1119 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1121 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1120 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1122 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1121 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1123 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1122 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1124 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1123 +#define WT_STAT_CONN_MEMORY_FREE 1125 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1124 +#define WT_STAT_CONN_MEMORY_GROW 1126 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1125 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1127 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1126 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1128 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1127 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1129 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1128 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1130 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1129 +#define WT_STAT_CONN_PAGE_SLEEP 1131 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1130 +#define WT_STAT_CONN_READ_IO 1132 +/*! reconciliation: pages deleted */ +#define WT_STAT_CONN_REC_PAGE_DELETE 1133 +/*! reconciliation: fast-path pages deleted */ +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1134 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1131 +#define WT_STAT_CONN_REC_PAGES 1135 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1132 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1136 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1133 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1137 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1134 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1138 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1135 +#define WT_STAT_CONN_RWLOCK_READ 1139 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1136 +#define WT_STAT_CONN_RWLOCK_WRITE 1140 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1137 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1141 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1138 +#define WT_STAT_CONN_SESSION_OPEN 1142 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1139 +#define WT_STAT_CONN_TXN_BEGIN 1143 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1140 +#define WT_STAT_CONN_TXN_CHECKPOINT 1144 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1141 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1145 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1142 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1146 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1143 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1147 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1144 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1148 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1145 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1149 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1146 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1150 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1147 +#define WT_STAT_CONN_TXN_COMMIT 1151 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1148 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1152 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1149 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1153 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1150 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1154 +/*! transaction: transaction range of IDs currently pinned by named + * snapshots */ +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1155 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1151 +#define WT_STAT_CONN_TXN_ROLLBACK 1156 +/*! transaction: number of named snapshots created */ +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1157 +/*! transaction: number of named snapshots dropped */ +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1158 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1152 +#define WT_STAT_CONN_TXN_SYNC 1159 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1153 +#define WT_STAT_CONN_WRITE_IO 1160 /*! * @} @@ -4023,112 +4097,131 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042 /*! cache: internal pages evicted */ #define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043 -/*! cache: pages split during eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2044 +/*! cache: internal pages split during eviction */ +#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_INTERNAL 2044 +/*! cache: leaf pages split during eviction */ +#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_LEAF 2045 /*! cache: in-memory page splits */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2045 +#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2046 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2046 +#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2047 /*! cache: overflow values cached in memory */ -#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2047 +#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2048 /*! cache: pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ 2048 +#define WT_STAT_DSRC_CACHE_READ 2049 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2049 +#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2050 /*! cache: overflow pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2050 +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2051 /*! cache: pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 2051 +#define WT_STAT_DSRC_CACHE_WRITE 2052 /*! cache: page written requiring lookaside records */ -#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2052 +#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2053 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2053 +#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2054 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2054 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2055 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2055 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2056 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2056 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2057 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2057 +#define WT_STAT_DSRC_COMPRESS_READ 2058 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2058 +#define WT_STAT_DSRC_COMPRESS_WRITE 2059 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2059 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2060 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2060 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2061 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2061 +#define WT_STAT_DSRC_CURSOR_CREATE 2062 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2062 +#define WT_STAT_DSRC_CURSOR_INSERT 2063 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2063 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2064 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2064 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2065 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2065 +#define WT_STAT_DSRC_CURSOR_NEXT 2066 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2066 +#define WT_STAT_DSRC_CURSOR_PREV 2067 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2067 +#define WT_STAT_DSRC_CURSOR_REMOVE 2068 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2068 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2069 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2069 +#define WT_STAT_DSRC_CURSOR_RESET 2070 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2070 +#define WT_STAT_DSRC_CURSOR_RESTART 2071 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2071 +#define WT_STAT_DSRC_CURSOR_SEARCH 2072 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2072 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2073 +/*! cursor: truncate calls */ +#define WT_STAT_DSRC_CURSOR_TRUNCATE 2074 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2073 +#define WT_STAT_DSRC_CURSOR_UPDATE 2075 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2074 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2076 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2075 +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2077 /*! LSM: chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2076 +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2078 /*! LSM: highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2077 +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2079 /*! LSM: queries that could have benefited from a Bloom filter that did * not exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2078 +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2080 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2079 +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2081 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2080 +#define WT_STAT_DSRC_REC_DICTIONARY 2082 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2081 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2083 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2082 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2084 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2083 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2085 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2084 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2086 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2085 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2087 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2086 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2088 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2087 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2089 +/*! reconciliation: fast-path pages deleted */ +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2090 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2088 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2091 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2089 +#define WT_STAT_DSRC_REC_PAGES 2092 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2090 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2093 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2091 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2094 /*! reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2092 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2095 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2093 +#define WT_STAT_DSRC_SESSION_COMPACT 2096 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2094 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2097 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2095 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2098 + +/*! + * @} + * @name Statistics for join cursors + * @anchor statistics_join + * @{ + */ +/*! : accesses */ +#define WT_STAT_JOIN_ACCESSES 3000 +/*! : actual count of items */ +#define WT_STAT_JOIN_ACTUAL_COUNT 3001 +/*! : bloom filter false positives */ +#define WT_STAT_JOIN_BLOOM_FALSE_POSITIVE 3002 /*! @} */ /* * Statistics section: END diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 3f4e0ada7f1..0a1e143ce70 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -136,6 +136,14 @@ struct __wt_cursor_dump; typedef struct __wt_cursor_dump WT_CURSOR_DUMP; struct __wt_cursor_index; typedef struct __wt_cursor_index WT_CURSOR_INDEX; +struct __wt_cursor_join; + typedef struct __wt_cursor_join WT_CURSOR_JOIN; +struct __wt_cursor_join_endpoint; + typedef struct __wt_cursor_join_endpoint WT_CURSOR_JOIN_ENDPOINT; +struct __wt_cursor_join_entry; + typedef struct __wt_cursor_join_entry WT_CURSOR_JOIN_ENTRY; +struct __wt_cursor_join_iter; + typedef struct __wt_cursor_join_iter WT_CURSOR_JOIN_ITER; struct __wt_cursor_json; typedef struct __wt_cursor_json WT_CURSOR_JSON; struct __wt_cursor_log; @@ -178,6 +186,10 @@ struct __wt_insert; typedef struct __wt_insert WT_INSERT; struct __wt_insert_head; typedef struct __wt_insert_head WT_INSERT_HEAD; +struct __wt_join_stats; + typedef struct __wt_join_stats WT_JOIN_STATS; +struct __wt_join_stats_group; + typedef struct __wt_join_stats_group WT_JOIN_STATS_GROUP; struct __wt_keyed_encryptor; typedef struct __wt_keyed_encryptor WT_KEYED_ENCRYPTOR; struct __wt_log; diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 44dc7dc30a7..3106094e7e3 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -1313,7 +1313,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond)); - if (++yield_count < 1000) + if (++yield_count < WT_THOUSAND) __wt_yield(); else ret = __wt_cond_wait(session, log->log_write_cond, 200); diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index b3790412536..255551f99a4 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -380,7 +380,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, * There should almost always be a slot open. */ #ifdef HAVE_DIAGNOSTIC - unbuf_force = (++log->write_calls % 1000) == 0; + unbuf_force = (++log->write_calls % WT_THOUSAND) == 0; #endif for (;;) { WT_BARRIER(); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index f988bfc97fd..953698476ef 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -81,7 +81,7 @@ __wt_clsm_await_switch(WT_CURSOR_LSM *clsm) lsm_tree->nchunks == 0 || clsm->dsk_gen == lsm_tree->dsk_gen; ++waited) { - if (waited % 1000 == 0) + if (waited % WT_THOUSAND == 0) WT_RET(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); __wt_sleep(0, 10); @@ -1379,7 +1379,15 @@ __clsm_insert(WT_CURSOR *cursor) } WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf)); - ret = __clsm_put(session, clsm, &cursor->key, &value, false); + WT_ERR(__clsm_put(session, clsm, &cursor->key, &value, false)); + + /* + * WT_CURSOR.insert doesn't leave the cursor positioned, and the + * application may want to free the memory used to configure the + * insert; don't read that memory again (matching the underlying + * file object cursor insert semantics). + */ + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); err: __wt_scr_free(session, &buf); __clsm_leave(clsm); @@ -1522,6 +1530,10 @@ __wt_clsm_open(WT_SESSION_IMPL *session, if (!WT_PREFIX_MATCH(uri, "lsm:")) return (EINVAL); + if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + WT_RET_MSG(session, EINVAL, + "LSM trees not supported by in-memory configurations"); + WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval)); if (cval.len != 0) WT_RET_MSG(session, EINVAL, diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index 1c5124c32af..d8cf36f2cc1 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -388,8 +388,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) continue; WT_ERR(__wt_epoch(session, &now)); pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 0 : - WT_TIMEDIFF( - now, lsm_tree->work_push_ts) / WT_MILLION; + WT_TIMEDIFF_MS(now, lsm_tree->work_push_ts); fillms = 3 * lsm_tree->chunk_fill_ms; if (fillms == 0) fillms = 10000; diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c index dd1419fe67d..1a2608803e4 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_merge.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c @@ -94,7 +94,7 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_RET(__wt_epoch(session, &now)); msec_since_last_merge = - WT_TIMEDIFF(now, lsm_tree->merge_aggressive_ts) / WT_MILLION; + WT_TIMEDIFF_MS(now, lsm_tree->merge_aggressive_ts); /* * If there is no estimate for how long it's taking to fill chunks @@ -457,7 +457,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) cfg[2] = NULL; WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest)); -#define LSM_MERGE_CHECK_INTERVAL 1000 +#define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) { if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c index 4381ca0df00..c1eb7a2a389 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c @@ -77,12 +77,12 @@ __curstat_lsm_init( */ WT_ERR(__wt_buf_fmt( session, uribuf, "statistics:%s", chunk->uri)); - ret = __wt_curstat_open(session, uribuf->data, + ret = __wt_curstat_open(session, uribuf->data, NULL, F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg, &stat_cursor); if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) ret = __wt_curstat_open( - session, uribuf->data, cfg, &stat_cursor); + session, uribuf->data, NULL, cfg, &stat_cursor); WT_ERR(ret); /* @@ -107,7 +107,7 @@ __curstat_lsm_init( WT_ERR(__wt_buf_fmt( session, uribuf, "statistics:%s", chunk->bloom_uri)); WT_ERR(__wt_curstat_open( - session, uribuf->data, cfg, &stat_cursor)); + session, uribuf->data, NULL, cfg, &stat_cursor)); /* * The underlying statistics have now been initialized; fill in diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index 30af051bbcf..0c3642e70e8 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -111,7 +111,7 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * other schema level operations will return EBUSY, even though * we're dropping the schema lock here. */ - if (i % 1000 == 0) { + if (i % WT_THOUSAND == 0) { WT_WITHOUT_LOCKS(session, ret = __wt_lsm_manager_clear_tree(session, lsm_tree)); WT_RET(ret); @@ -336,6 +336,11 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, } WT_RET_NOTFOUND_OK(ret); + /* In-memory configurations don't make sense for LSM. */ + if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + WT_RET_MSG(session, EINVAL, + "LSM trees not supported by in-memory configurations"); + WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); if (WT_STRING_MATCH("r", cval.str, cval.len)) WT_RET_MSG(session, EINVAL, @@ -747,7 +752,7 @@ __wt_lsm_tree_throttle( WT_ASSERT(session, WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0); timediff = - WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts); + WT_TIMEDIFF_NS(last_chunk->create_ts, ondisk->create_ts); lsm_tree->ckpt_throttle = (in_memory - 2) * timediff / (20 * record_count); @@ -783,8 +788,8 @@ __wt_lsm_tree_throttle( } /* Put an upper bound of 1s on both throttle calculations. */ - lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle); - lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle); + lsm_tree->ckpt_throttle = WT_MIN(WT_MILLION, lsm_tree->ckpt_throttle); + lsm_tree->merge_throttle = WT_MIN(WT_MILLION, lsm_tree->merge_throttle); /* * Update our estimate of how long each in-memory chunk stays active. @@ -798,15 +803,16 @@ __wt_lsm_tree_throttle( WT_ASSERT(session, prev_chunk->generation == 0); WT_ASSERT(session, WT_TIMECMP( last_chunk->create_ts, prev_chunk->create_ts) >= 0); - timediff = - WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts); + timediff = WT_TIMEDIFF_NS( + last_chunk->create_ts, prev_chunk->create_ts); WT_ASSERT(session, WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0); - oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts); + oldtime = WT_TIMEDIFF_NS( + prev_chunk->create_ts, ondisk->create_ts); if (timediff < 10 * oldtime) lsm_tree->chunk_fill_ms = (3 * lsm_tree->chunk_fill_ms + - timediff / 1000000) / 4; + timediff / WT_MILLION) / 4; } } diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c index fac2c06957d..d5fc86b648b 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c @@ -76,9 +76,9 @@ __wt_cond_wait_signal( if (usecs > 0) { WT_ERR(__wt_epoch(session, &ts)); ts.tv_sec += (time_t) - (((uint64_t)ts.tv_nsec + 1000 * usecs) / WT_BILLION); + (((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) / WT_BILLION); ts.tv_nsec = (long) - (((uint64_t)ts.tv_nsec + 1000 * usecs) % WT_BILLION); + (((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) % WT_BILLION); ret = pthread_cond_timedwait(&cond->cond, &cond->mtx, &ts); } else ret = pthread_cond_wait(&cond->cond, &cond->mtx); diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c index d47ab197643..46f134feabb 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c @@ -201,7 +201,7 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Don't sleep long when waiting on a read lock, hopefully we're * waiting on another read thread to increment the reader count. */ - if (++pause_cnt < 1000) + if (++pause_cnt < WT_THOUSAND) WT_PAUSE(); else __wt_sleep(0, 10); @@ -300,7 +300,7 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * situation happens if there are more threads than cores in the * system and we're thrashing on shared resources. */ - if (++pause_cnt < 1000) + if (++pause_cnt < WT_THOUSAND) WT_PAUSE(); else __wt_sleep(0, 10); diff --git a/src/third_party/wiredtiger/src/os_posix/os_sleep.c b/src/third_party/wiredtiger/src/os_posix/os_sleep.c index f888e51bf7f..4e90edabc53 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_sleep.c +++ b/src/third_party/wiredtiger/src/os_posix/os_sleep.c @@ -17,8 +17,8 @@ __wt_sleep(uint64_t seconds, uint64_t micro_seconds) { struct timeval t; - t.tv_sec = (time_t)(seconds + micro_seconds / 1000000); - t.tv_usec = (suseconds_t)(micro_seconds % 1000000); + t.tv_sec = (time_t)(seconds + micro_seconds / WT_MILLION); + t.tv_usec = (suseconds_t)(micro_seconds % WT_MILLION); (void)select(0, NULL, NULL, NULL, &t); } diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c index c52772e77e1..c3052df62e7 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_time.c +++ b/src/third_party/wiredtiger/src/os_posix/os_time.c @@ -9,22 +9,6 @@ #include "wt_internal.h" /* - * __wt_seconds -- - * Return the seconds since the Epoch. - */ -int -__wt_seconds(WT_SESSION_IMPL *session, time_t *timep) -{ - struct timespec t; - - WT_RET(__wt_epoch(session, &t)); - - *timep = t.tv_sec; - - return (0); -} - -/* * __wt_epoch -- * Return the time since the Epoch. */ @@ -44,7 +28,7 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret); if (ret == 0) { tsp->tv_sec = v.tv_sec; - tsp->tv_nsec = v.tv_usec * 1000; + tsp->tv_nsec = v.tv_usec * WT_THOUSAND; return (0); } WT_RET_MSG(session, ret, "gettimeofday"); diff --git a/src/third_party/wiredtiger/src/os_win/os_sleep.c b/src/third_party/wiredtiger/src/os_win/os_sleep.c index 484cf218f26..33e04c1d8a9 100644 --- a/src/third_party/wiredtiger/src/os_win/os_sleep.c +++ b/src/third_party/wiredtiger/src/os_win/os_sleep.c @@ -19,7 +19,7 @@ __wt_sleep(uint64_t seconds, uint64_t micro_seconds) * If the caller wants a small pause, set to our * smallest granularity. */ - if (seconds == 0 && micro_seconds < 1000) - micro_seconds = 1000; - Sleep(seconds * 1000 + micro_seconds / 1000); + if (seconds == 0 && micro_seconds < WT_THOUSAND) + micro_seconds = WT_THOUSAND; + Sleep(seconds * WT_THOUSAND + micro_seconds / WT_THOUSAND); } diff --git a/src/third_party/wiredtiger/src/os_win/os_time.c b/src/third_party/wiredtiger/src/os_win/os_time.c index c51db118ce1..2292c317a64 100644 --- a/src/third_party/wiredtiger/src/os_win/os_time.c +++ b/src/third_party/wiredtiger/src/os_win/os_time.c @@ -9,22 +9,6 @@ #include "wt_internal.h" /* - * __wt_seconds -- - * Return the seconds since the Epoch. - */ -int -__wt_seconds(WT_SESSION_IMPL *session, time_t *timep) -{ - struct timespec t; - - WT_RET(__wt_epoch(session, &t)); - - *timep = t.tv_sec; - - return (0); -} - -/* * __wt_epoch -- * Return the time since the Epoch. */ diff --git a/src/third_party/wiredtiger/src/packing/pack_impl.c b/src/third_party/wiredtiger/src/packing/pack_impl.c index 3a4428eae15..447c887dc6f 100644 --- a/src/third_party/wiredtiger/src/packing/pack_impl.c +++ b/src/third_party/wiredtiger/src/packing/pack_impl.c @@ -105,3 +105,108 @@ __wt_struct_unpack(WT_SESSION_IMPL *session, return (ret); } + +/* + * __wt_struct_unpack_size -- + * Determine the packed size of a buffer matching the format. + */ +int +__wt_struct_unpack_size(WT_SESSION_IMPL *session, + const void *buffer, size_t size, const char *fmt, size_t *resultp) +{ + WT_DECL_PACK_VALUE(pv); + WT_DECL_RET; + WT_PACK pack; + const uint8_t *p, *end; + + p = buffer; + end = p + size; + + WT_RET(__pack_init(session, &pack, fmt)); + while ((ret = __pack_next(&pack, &pv)) == 0) + WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); + + /* Be paranoid - __pack_write should never overflow. */ + WT_ASSERT(session, p <= end); + + if (ret != WT_NOTFOUND) + return (ret); + + *resultp = WT_PTRDIFF(p, buffer); + return (0); +} + +/* + * __wt_struct_repack -- + * Return the subset of the packed buffer that represents part of + * the format. If the result is not contiguous in the existing + * buffer, a buffer is reallocated and filled. + */ +int +__wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, + const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf, + void **reallocp) +{ + WT_DECL_PACK_VALUE(pvin); + WT_DECL_PACK_VALUE(pvout); + WT_DECL_RET; + WT_PACK packin, packout; + const uint8_t *before, *end, *p; + uint8_t *newbuf, *pout; + size_t len; + const void *start; + + start = newbuf = NULL; + p = inbuf->data; + end = p + inbuf->size; + + /* + * Handle this non-contiguous case: 'U' -> 'u' at the end of the buf. + * The former case has the size embedded before the item, the latter + * does not. + */ + if ((len = strlen(outfmt)) > 1 && outfmt[len - 1] == 'u' && + strlen(infmt) > len && infmt[len - 1] == 'U') { + WT_ERR(__wt_realloc(session, NULL, inbuf->size, reallocp)); + pout = *reallocp; + } else + pout = NULL; + + WT_ERR(__pack_init(session, &packout, outfmt)); + WT_ERR(__pack_init(session, &packin, infmt)); + + /* Outfmt should complete before infmt */ + while ((ret = __pack_next(&packout, &pvout)) == 0) { + WT_ERR(__pack_next(&packin, &pvin)); + before = p; + WT_ERR(__unpack_read(session, &pvin, &p, (size_t)(end - p))); + if (pvout.type != pvin.type) { + if (pvout.type == 'u' && pvin.type == 'U') { + /* Skip the prefixed size, we don't need it */ + WT_ERR(__wt_struct_unpack_size(session, before, + (size_t)(end - before), "I", &len)); + before += len; + } else + WT_ERR(ENOTSUP); + } + if (pout != NULL) { + memcpy(pout, before, WT_PTRDIFF(p, before)); + pout += p - before; + } else if (start == NULL) + start = before; + } + WT_ERR_NOTFOUND_OK(ret); + + /* Be paranoid - __pack_write should never overflow. */ + WT_ASSERT(session, p <= end); + + if (pout != NULL) { + outbuf->data = *reallocp; + outbuf->size = WT_PTRDIFF(pout, *reallocp); + } else { + outbuf->data = start; + outbuf->size = WT_PTRDIFF(p, start); + } + +err: return (ret); +} diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 965f798e820..6d53230e9e0 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -960,7 +960,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy) * than 10,000 boundary structure elements, discard the boundary array * entirely and start over next time. */ - if (destroy || r->bnd_entries > 10 * 1000) { + if (destroy || r->bnd_entries > 10 * WT_THOUSAND) { for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) { __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->disk_image); @@ -2505,7 +2505,10 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, * the page: the offset is the byte offset to the possible split-point * (adjusted for an initial chunk that cannot be compressed), entries * is the cumulative page entries covered by the byte offset, recnos is - * the cumulative rows covered by the byte offset. + * the cumulative rows covered by the byte offset. Allocate to handle + * both column- and row-store regardless of this page type, structures + * are potentially reused for subsequent reconciliations of different + * page types. */ if (r->entries >= r->raw_max_slots) { __wt_free(session, r->raw_entries); @@ -2516,9 +2519,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, i = r->entries + 100; WT_RET(__wt_calloc_def(session, i, &r->raw_entries)); WT_RET(__wt_calloc_def(session, i, &r->raw_offsets)); - if (dsk->type == WT_PAGE_COL_INT || - dsk->type == WT_PAGE_COL_VAR) - WT_RET(__wt_calloc_def(session, i, &r->raw_recnos)); + WT_RET(__wt_calloc_def(session, i, &r->raw_recnos)); r->raw_max_slots = i; } @@ -5469,6 +5470,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) case 0: /* Page delete */ WT_RET(__wt_verbose( session, WT_VERB_RECONCILE, "page %p empty", page)); + WT_STAT_FAST_CONN_INCR(session, rec_page_delete); WT_STAT_FAST_DATA_INCR(session, rec_page_delete); /* If this is the root page, we need to create a sync point. */ diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c index d73d66cd399..82c2e2a15dc 100644 --- a/src/third_party/wiredtiger/src/schema/schema_stat.c +++ b/src/third_party/wiredtiger/src/schema/schema_stat.c @@ -24,7 +24,7 @@ __wt_curstat_colgroup_init(WT_SESSION_IMPL *session, WT_RET(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "statistics:%s", colgroup->source)); - ret = __wt_curstat_init(session, buf->data, cfg, cst); + ret = __wt_curstat_init(session, buf->data, NULL, cfg, cst); err: __wt_scr_free(session, &buf); return (ret); @@ -46,7 +46,7 @@ __wt_curstat_index_init(WT_SESSION_IMPL *session, WT_RET(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "statistics:%s", idx->source)); - ret = __wt_curstat_init(session, buf->data, cfg, cst); + ret = __wt_curstat_init(session, buf->data, NULL, cfg, cst); err: __wt_scr_free(session, &buf); return (ret); @@ -159,7 +159,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session, WT_ERR(__wt_buf_fmt( session, buf, "statistics:%s", table->cgroups[i]->name)); WT_ERR(__wt_curstat_open( - session, buf->data, cfg, &stat_cursor)); + session, buf->data, NULL, cfg, &stat_cursor)); new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); if (i == 0) *stats = *new; @@ -174,7 +174,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session, WT_ERR(__wt_buf_fmt( session, buf, "statistics:%s", table->indices[i]->name)); WT_ERR(__wt_curstat_open( - session, buf->data, cfg, &stat_cursor)); + session, buf->data, NULL, cfg, &stat_cursor)); new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); __wt_stat_dsrc_aggregate_single(new, stats); WT_ERR(stat_cursor->close(stat_cursor)); diff --git a/src/third_party/wiredtiger/src/schema/schema_truncate.c b/src/third_party/wiredtiger/src/schema/schema_truncate.c index 03a991a9aba..c39bba4753c 100644 --- a/src/third_party/wiredtiger/src/schema/schema_truncate.c +++ b/src/third_party/wiredtiger/src/schema/schema_truncate.c @@ -26,6 +26,7 @@ __truncate_file(WT_SESSION_IMPL *session, const char *uri) /* Open and lock the file. */ WT_RET(__wt_session_get_btree( session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE)); + WT_STAT_FAST_DATA_INCR(session, cursor_truncate); /* Get the allocation size. */ allocsize = S2BT(session)->allocsize; @@ -56,6 +57,7 @@ __truncate_table(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) u_int i; WT_RET(__wt_schema_get_table(session, uri, strlen(uri), false, &table)); + WT_STAT_FAST_DATA_INCR(session, cursor_truncate); /* Truncate the column groups. */ for (i = 0; i < WT_COLGROUPS(table); i++) @@ -90,6 +92,7 @@ __truncate_dsrc(WT_SESSION_IMPL *session, const char *uri) while ((ret = cursor->next(cursor)) == 0) WT_ERR(cursor->remove(cursor)); WT_ERR_NOTFOUND_OK(ret); + WT_STAT_FAST_DATA_INCR(session, cursor_truncate); err: WT_TRET(cursor->close(cursor)); return (ret); diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index ed0e016dcb2..db81623c613 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -240,12 +240,12 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* - * __wt_open_cursor -- - * Internal version of WT_SESSION::open_cursor. + * __session_open_cursor_int -- + * Internal version of WT_SESSION::open_cursor, with second cursor arg. */ -int -__wt_open_cursor(WT_SESSION_IMPL *session, - const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) +static int +__session_open_cursor_int(WT_SESSION_IMPL *session, const char *uri, + WT_CURSOR *owner, WT_CURSOR *other, const char *cfg[], WT_CURSOR **cursorp) { WT_COLGROUP *colgroup; WT_DATA_SOURCE *dsrc; @@ -267,7 +267,8 @@ __wt_open_cursor(WT_SESSION_IMPL *session, */ case 't': if (WT_PREFIX_MATCH(uri, "table:")) - WT_RET(__wt_curtable_open(session, uri, cfg, cursorp)); + WT_RET(__wt_curtable_open( + session, uri, owner, cfg, cursorp)); break; case 'c': if (WT_PREFIX_MATCH(uri, "colgroup:")) { @@ -288,6 +289,11 @@ __wt_open_cursor(WT_SESSION_IMPL *session, WT_RET(__wt_curindex_open( session, uri, owner, cfg, cursorp)); break; + case 'j': + if (WT_PREFIX_MATCH(uri, "join:")) + WT_RET(__wt_curjoin_open( + session, uri, owner, cfg, cursorp)); + break; case 'l': if (WT_PREFIX_MATCH(uri, "lsm:")) WT_RET(__wt_clsm_open( @@ -316,7 +322,8 @@ __wt_open_cursor(WT_SESSION_IMPL *session, break; case 's': if (WT_PREFIX_MATCH(uri, "statistics:")) - WT_RET(__wt_curstat_open(session, uri, cfg, cursorp)); + WT_RET(__wt_curstat_open(session, uri, other, cfg, + cursorp)); break; default: break; @@ -346,6 +353,18 @@ __wt_open_cursor(WT_SESSION_IMPL *session, } /* + * __wt_open_cursor -- + * Internal version of WT_SESSION::open_cursor. + */ +int +__wt_open_cursor(WT_SESSION_IMPL *session, + const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) +{ + return (__session_open_cursor_int(session, uri, owner, NULL, cfg, + cursorp)); +} + +/* * __session_open_cursor -- * WT_SESSION->open_cursor method. */ @@ -356,18 +375,22 @@ __session_open_cursor(WT_SESSION *wt_session, WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; + bool statjoin; cursor = *cursorp = NULL; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, open_cursor, config, cfg); - if ((to_dup == NULL && uri == NULL) || (to_dup != NULL && uri != NULL)) + statjoin = (to_dup != NULL && uri != NULL && + WT_STREQ(uri, "statistics:join")); + if ((to_dup == NULL && uri == NULL) || + (to_dup != NULL && uri != NULL && !statjoin)) WT_ERR_MSG(session, EINVAL, "should be passed either a URI or a cursor to duplicate, " "but not both"); - if (to_dup != NULL) { + if (to_dup != NULL && !statjoin) { uri = to_dup->uri; if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "index:") && @@ -379,8 +402,9 @@ __session_open_cursor(WT_SESSION *wt_session, WT_ERR(__wt_bad_object_type(session, uri)); } - WT_ERR(__wt_open_cursor(session, uri, NULL, cfg, &cursor)); - if (to_dup != NULL) + WT_ERR(__session_open_cursor_int(session, uri, NULL, + statjoin ? to_dup : NULL, cfg, &cursor)); + if (to_dup != NULL && !statjoin) WT_ERR(__wt_cursor_dup_position(to_dup, cursor)); *cursorp = cursor; @@ -614,6 +638,123 @@ err: /* Note: drop operations cannot be unrolled (yet?). */ } /* + * __session_join -- + * WT_SESSION->join method. + */ +static int +__session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor, + WT_CURSOR *ref_cursor, const char *config) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + WT_SESSION_IMPL *session; + WT_CURSOR_INDEX *cindex; + WT_CURSOR_JOIN *cjoin; + WT_CURSOR_TABLE *ctable; + WT_INDEX *idx; + WT_TABLE *table; + uint32_t flags, range; + uint64_t count; + uint64_t bloom_bit_count, bloom_hash_count; + + count = 0; + session = (WT_SESSION_IMPL *)wt_session; + SESSION_API_CALL(session, join, config, cfg); + table = NULL; + + if (!WT_PREFIX_MATCH(join_cursor->uri, "join:")) { + __wt_errx(session, "not a join cursor"); + WT_ERR(EINVAL); + } + + if (WT_PREFIX_MATCH(ref_cursor->uri, "index:")) { + cindex = (WT_CURSOR_INDEX *)ref_cursor; + idx = cindex->index; + table = cindex->table; + WT_CURSOR_CHECKKEY(ref_cursor); + } else if (WT_PREFIX_MATCH(ref_cursor->uri, "table:")) { + idx = NULL; + ctable = (WT_CURSOR_TABLE *)ref_cursor; + table = ctable->table; + WT_CURSOR_CHECKKEY(ctable->cg_cursors[0]); + } else { + __wt_errx(session, "not an index or table cursor"); + WT_ERR(EINVAL); + } + + cjoin = (WT_CURSOR_JOIN *)join_cursor; + if (cjoin->table != table) { + __wt_errx(session, "table for join cursor does not match " + "table for index"); + WT_ERR(EINVAL); + } + if (F_ISSET(ref_cursor, WT_CURSTD_JOINED)) { + __wt_errx(session, "index cursor already used in a join"); + WT_ERR(EINVAL); + } + + /* "ge" is the default */ + range = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ; + flags = 0; + WT_ERR(__wt_config_gets(session, cfg, "compare", &cval)); + if (cval.len != 0) { + if (WT_STRING_MATCH("gt", cval.str, cval.len)) + range = WT_CURJOIN_END_GT; + else if (WT_STRING_MATCH("lt", cval.str, cval.len)) + range = WT_CURJOIN_END_LT; + else if (WT_STRING_MATCH("le", cval.str, cval.len)) + range = WT_CURJOIN_END_LE; + else if (WT_STRING_MATCH("eq", cval.str, cval.len)) + range = WT_CURJOIN_END_EQ; + else if (!WT_STRING_MATCH("ge", cval.str, cval.len)) + WT_ERR(EINVAL); + } + WT_ERR(__wt_config_gets(session, cfg, "count", &cval)); + if (cval.len != 0) + count = (uint64_t)cval.val; + + WT_ERR(__wt_config_gets(session, cfg, "strategy", &cval)); + if (cval.len != 0) { + if (WT_STRING_MATCH("bloom", cval.str, cval.len)) + LF_SET(WT_CURJOIN_ENTRY_BLOOM); + else if (!WT_STRING_MATCH("default", cval.str, cval.len)) + WT_ERR(EINVAL); + } + WT_ERR(__wt_config_gets(session, cfg, "bloom_bit_count", &cval)); + bloom_bit_count = (uint64_t)cval.val; + WT_ERR(__wt_config_gets(session, cfg, "bloom_hash_count", &cval)); + bloom_hash_count = (uint64_t)cval.val; + if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM)) { + if (count == 0) { + __wt_errx(session, "count must be nonzero when " + "strategy=bloom"); + WT_ERR(EINVAL); + } + if (cjoin->entries_next == 0) { + __wt_errx(session, "the first joined cursor cannot " + "specify strategy=bloom"); + WT_ERR(EINVAL); + } + } + WT_ERR(__wt_curjoin_join(session, cjoin, idx, ref_cursor, flags, + range, count, bloom_bit_count, bloom_hash_count)); + /* + * There's an implied ownership ordering that isn't + * known when the cursors are created: the join cursor + * must be closed before any of the indices. Enforce + * that here by reordering. + */ + if (TAILQ_FIRST(&session->cursors) != join_cursor) { + TAILQ_REMOVE(&session->cursors, join_cursor, q); + TAILQ_INSERT_HEAD(&session->cursors, join_cursor, q); + } + /* Disable the reference cursor for regular operations */ + F_SET(ref_cursor, WT_CURSTD_JOINED); + +err: API_END_RET_NOTFOUND_MAP(session, ret); +} + +/* * __session_salvage -- * WT_SESSION->salvage method. */ @@ -657,6 +798,7 @@ __session_truncate(WT_SESSION *wt_session, session = (WT_SESSION_IMPL *)wt_session; SESSION_TXN_API_CALL(session, truncate, config, cfg); + WT_STAT_FAST_CONN_INCR(session, cursor_truncate); /* * If the URI is specified, we don't need a start/stop, if start/stop @@ -1009,7 +1151,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { WT_ERR(__wt_cond_signal(session, conn->log_file_cond)); WT_ERR(__wt_epoch(session, &now)); - waited_ms = WT_TIMEDIFF(now, start) / WT_MILLION; + waited_ms = WT_TIMEDIFF_MS(now, start); if (forever || waited_ms < timeout_ms) /* * Note, we will wait an increasing amount of time @@ -1144,6 +1286,7 @@ __open_session(WT_CONNECTION_IMPL *conn, __session_create, __wt_session_compact, __session_drop, + __session_join, __session_log_flush, __session_log_printf, __session_rename, diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c index bd503cd7826..456fcd3ce03 100644 --- a/src/third_party/wiredtiger/src/session/session_compact.c +++ b/src/third_party/wiredtiger/src/session/session_compact.c @@ -133,8 +133,7 @@ __session_compact_check_timeout( return (0); WT_RET(__wt_epoch(session, &end)); - if (session->compact->max_time < - WT_TIMEDIFF(end, begin) / WT_BILLION) + if (session->compact->max_time < WT_TIMEDIFF_SEC(end, begin)) WT_RET(ETIMEDOUT); return (0); } diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c index ec2f0921ef2..dd5094fb480 100644 --- a/src/third_party/wiredtiger/src/session/session_dhandle.c +++ b/src/third_party/wiredtiger/src/session/session_dhandle.c @@ -390,7 +390,7 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) * do it again. */ WT_RET(__wt_seconds(session, &now)); - if (now - session->last_sweep < conn->sweep_interval) + if (difftime(now, session->last_sweep) < conn->sweep_interval) return (0); session->last_sweep = now; @@ -404,7 +404,8 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) dhandle->session_inuse == 0 && (WT_DHANDLE_INACTIVE(dhandle) || (dhandle->timeofdeath != 0 && - now - dhandle->timeofdeath > conn->sweep_idle_time))) { + difftime(now, dhandle->timeofdeath) > + conn->sweep_idle_time))) { WT_STAT_FAST_CONN_INCR(session, dh_session_handles); WT_ASSERT(session, !WT_IS_METADATA(dhandle)); __session_discard_dhandle(session, dhandle_cache); diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c index c4bf4e8946a..de518cbf08b 100644 --- a/src/third_party/wiredtiger/src/support/err.c +++ b/src/third_party/wiredtiger/src/support/err.c @@ -199,7 +199,8 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, remain = WT_PTRDIFF(end, p); wlen = (size_t)snprintf(p, remain, "[%" PRIuMAX ":%" PRIuMAX "][%s]", - (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / 1000, tid); + (uintmax_t)ts.tv_sec, + (uintmax_t)ts.tv_nsec / WT_THOUSAND, tid); p = wlen >= remain ? end : p + wlen; prefix_cnt = 1; } diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 9e817fad512..4d7cd65fd18 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -47,7 +47,8 @@ static const char * const __stats_dsrc_desc[] = { "cache: data source pages selected for eviction unable to be evicted", "cache: hazard pointer blocked page eviction", "cache: internal pages evicted", - "cache: pages split during eviction", + "cache: internal pages split during eviction", + "cache: leaf pages split during eviction", "cache: in-memory page splits", "cache: in-memory page passed criteria to be split", "cache: overflow values cached in memory", @@ -76,6 +77,7 @@ static const char * const __stats_dsrc_desc[] = { "cursor: restarted searches", "cursor: search calls", "cursor: search near calls", + "cursor: truncate calls", "cursor: update calls", "cursor: cursor-update value bytes updated", "LSM: sleep for LSM checkpoint throttle", @@ -91,6 +93,7 @@ static const char * const __stats_dsrc_desc[] = { "reconciliation: leaf-page overflow keys", "reconciliation: overflow values written", "reconciliation: pages deleted", + "reconciliation: fast-path pages deleted", "reconciliation: page checksum matches", "reconciliation: page reconciliation calls", "reconciliation: page reconciliation calls for eviction", @@ -101,10 +104,12 @@ static const char * const __stats_dsrc_desc[] = { "transaction: update conflicts", }; -const char * -__wt_stat_dsrc_desc(int slot) +int +__wt_stat_dsrc_desc(WT_CURSOR_STAT *cst, int slot, const char **p) { - return (__stats_dsrc_desc[slot]); + WT_UNUSED(cst); + *p = __stats_dsrc_desc[slot]; + return (0); } void @@ -163,6 +168,8 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_inmem_splittable = 0; stats->cache_inmem_split = 0; stats->cache_eviction_internal = 0; + stats->cache_eviction_split_internal = 0; + stats->cache_eviction_split_leaf = 0; stats->cache_eviction_dirty = 0; stats->cache_read_overflow = 0; stats->cache_overflow_value = 0; @@ -170,7 +177,6 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_write_lookaside = 0; stats->cache_read = 0; stats->cache_read_lookaside = 0; - stats->cache_eviction_split = 0; stats->cache_write = 0; stats->cache_write_restore = 0; stats->cache_eviction_clean = 0; @@ -194,6 +200,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cursor_restart = 0; stats->cursor_search = 0; stats->cursor_search_near = 0; + stats->cursor_truncate = 0; stats->cursor_update = 0; stats->bloom_false_positive = 0; stats->bloom_hit = 0; @@ -208,6 +215,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->lsm_merge_throttle = 0; stats->bloom_size = 0; stats->rec_dictionary = 0; + stats->rec_page_delete_fast = 0; stats->rec_suffix_compression = 0; stats->rec_multiblock_internal = 0; stats->rec_overflow_key_internal = 0; @@ -280,6 +288,9 @@ __wt_stat_dsrc_aggregate_single( to->cache_inmem_splittable += from->cache_inmem_splittable; to->cache_inmem_split += from->cache_inmem_split; to->cache_eviction_internal += from->cache_eviction_internal; + to->cache_eviction_split_internal += + from->cache_eviction_split_internal; + to->cache_eviction_split_leaf += from->cache_eviction_split_leaf; to->cache_eviction_dirty += from->cache_eviction_dirty; to->cache_read_overflow += from->cache_read_overflow; to->cache_overflow_value += from->cache_overflow_value; @@ -287,7 +298,6 @@ __wt_stat_dsrc_aggregate_single( to->cache_write_lookaside += from->cache_write_lookaside; to->cache_read += from->cache_read; to->cache_read_lookaside += from->cache_read_lookaside; - to->cache_eviction_split += from->cache_eviction_split; to->cache_write += from->cache_write; to->cache_write_restore += from->cache_write_restore; to->cache_eviction_clean += from->cache_eviction_clean; @@ -311,6 +321,7 @@ __wt_stat_dsrc_aggregate_single( to->cursor_restart += from->cursor_restart; to->cursor_search += from->cursor_search; to->cursor_search_near += from->cursor_search_near; + to->cursor_truncate += from->cursor_truncate; to->cursor_update += from->cursor_update; to->bloom_false_positive += from->bloom_false_positive; to->bloom_hit += from->bloom_hit; @@ -326,6 +337,7 @@ __wt_stat_dsrc_aggregate_single( to->lsm_merge_throttle += from->lsm_merge_throttle; to->bloom_size += from->bloom_size; to->rec_dictionary += from->rec_dictionary; + to->rec_page_delete_fast += from->rec_page_delete_fast; to->rec_suffix_compression += from->rec_suffix_compression; to->rec_multiblock_internal += from->rec_multiblock_internal; to->rec_overflow_key_internal += from->rec_overflow_key_internal; @@ -407,6 +419,10 @@ __wt_stat_dsrc_aggregate( to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split); to->cache_eviction_internal += WT_STAT_READ(from, cache_eviction_internal); + to->cache_eviction_split_internal += + WT_STAT_READ(from, cache_eviction_split_internal); + to->cache_eviction_split_leaf += + WT_STAT_READ(from, cache_eviction_split_leaf); to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty); to->cache_read_overflow += WT_STAT_READ(from, cache_read_overflow); to->cache_overflow_value += WT_STAT_READ(from, cache_overflow_value); @@ -416,7 +432,6 @@ __wt_stat_dsrc_aggregate( WT_STAT_READ(from, cache_write_lookaside); to->cache_read += WT_STAT_READ(from, cache_read); to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); - to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split); to->cache_write += WT_STAT_READ(from, cache_write); to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); @@ -442,6 +457,7 @@ __wt_stat_dsrc_aggregate( to->cursor_restart += WT_STAT_READ(from, cursor_restart); to->cursor_search += WT_STAT_READ(from, cursor_search); to->cursor_search_near += WT_STAT_READ(from, cursor_search_near); + to->cursor_truncate += WT_STAT_READ(from, cursor_truncate); to->cursor_update += WT_STAT_READ(from, cursor_update); to->bloom_false_positive += WT_STAT_READ(from, bloom_false_positive); to->bloom_hit += WT_STAT_READ(from, bloom_hit); @@ -459,6 +475,7 @@ __wt_stat_dsrc_aggregate( to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); to->bloom_size += WT_STAT_READ(from, bloom_size); to->rec_dictionary += WT_STAT_READ(from, rec_dictionary); + to->rec_page_delete_fast += WT_STAT_READ(from, rec_page_delete_fast); to->rec_suffix_compression += WT_STAT_READ(from, rec_suffix_compression); to->rec_multiblock_internal += @@ -529,7 +546,8 @@ static const char * const __stats_connection_desc[] = { "cache: eviction server evicting pages", "cache: eviction server populating queue, but not evicting pages", "cache: eviction server unable to reach eviction goal", - "cache: pages split during eviction", + "cache: internal pages split during eviction", + "cache: leaf pages split during eviction", "cache: pages walked for eviction", "cache: eviction worker thread evicting pages", "cache: in-memory page splits", @@ -554,6 +572,7 @@ static const char * const __stats_connection_desc[] = { "cursor: cursor restarted searches", "cursor: cursor search calls", "cursor: cursor search near calls", + "cursor: truncate calls", "cursor: cursor update calls", "data-handle: connection data handles currently active", "data-handle: session dhandles swept", @@ -615,6 +634,8 @@ static const char * const __stats_connection_desc[] = { "thread-yield: page acquire read blocked", "thread-yield: page acquire time sleeping (usecs)", "connection: total read I/Os", + "reconciliation: pages deleted", + "reconciliation: fast-path pages deleted", "reconciliation: page reconciliation calls", "reconciliation: page reconciliation calls for eviction", "reconciliation: split bytes currently awaiting free", @@ -635,15 +656,20 @@ static const char * const __stats_connection_desc[] = { "transaction: transaction failures due to cache overflow", "transaction: transaction range of IDs currently pinned by a checkpoint", "transaction: transaction range of IDs currently pinned", + "transaction: transaction range of IDs currently pinned by named snapshots", "transaction: transactions rolled back", + "transaction: number of named snapshots created", + "transaction: number of named snapshots dropped", "transaction: transaction sync calls", "connection: total write I/Os", }; -const char * -__wt_stat_connection_desc(int slot) +int +__wt_stat_connection_desc(WT_CURSOR_STAT *cst, int slot, const char **p) { - return (__stats_connection_desc[slot]); + WT_UNUSED(cst); + *p = __stats_connection_desc[slot]; + return (0); } void @@ -701,6 +727,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_inmem_splittable = 0; stats->cache_inmem_split = 0; stats->cache_eviction_internal = 0; + stats->cache_eviction_split_internal = 0; + stats->cache_eviction_split_leaf = 0; stats->cache_lookaside_insert = 0; stats->cache_lookaside_remove = 0; /* not clearing cache_bytes_max */ @@ -715,7 +743,6 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_read = 0; stats->cache_read_lookaside = 0; stats->cache_eviction_fail = 0; - stats->cache_eviction_split = 0; stats->cache_eviction_walk = 0; stats->cache_write = 0; stats->cache_write_restore = 0; @@ -745,6 +772,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cursor_search = 0; stats->cursor_search_near = 0; stats->cursor_update = 0; + stats->cursor_truncate = 0; /* not clearing dh_conn_handle_count */ stats->dh_sweep_ref = 0; stats->dh_sweep_close = 0; @@ -795,8 +823,10 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->lsm_work_units_done = 0; stats->lsm_work_units_created = 0; stats->lsm_work_queue_max = 0; + stats->rec_page_delete_fast = 0; stats->rec_pages = 0; stats->rec_pages_eviction = 0; + stats->rec_page_delete = 0; /* not clearing rec_split_stashed_bytes */ /* not clearing rec_split_stashed_objects */ /* not clearing session_cursor_open */ @@ -806,6 +836,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->page_locked_blocked = 0; stats->page_read_blocked = 0; stats->page_sleep = 0; + stats->txn_snapshots_created = 0; + stats->txn_snapshots_dropped = 0; stats->txn_begin = 0; /* not clearing txn_checkpoint_running */ /* not clearing txn_checkpoint_generation */ @@ -817,6 +849,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->txn_fail_cache = 0; /* not clearing txn_pinned_range */ /* not clearing txn_pinned_checkpoint_range */ + /* not clearing txn_pinned_snapshot_range */ stats->txn_sync = 0; stats->txn_commit = 0; stats->txn_rollback = 0; @@ -880,6 +913,10 @@ __wt_stat_connection_aggregate( to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split); to->cache_eviction_internal += WT_STAT_READ(from, cache_eviction_internal); + to->cache_eviction_split_internal += + WT_STAT_READ(from, cache_eviction_split_internal); + to->cache_eviction_split_leaf += + WT_STAT_READ(from, cache_eviction_split_leaf); to->cache_lookaside_insert += WT_STAT_READ(from, cache_lookaside_insert); to->cache_lookaside_remove += @@ -900,7 +937,6 @@ __wt_stat_connection_aggregate( to->cache_read += WT_STAT_READ(from, cache_read); to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail); - to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split); to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk); to->cache_write += WT_STAT_READ(from, cache_write); to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); @@ -930,6 +966,7 @@ __wt_stat_connection_aggregate( to->cursor_search += WT_STAT_READ(from, cursor_search); to->cursor_search_near += WT_STAT_READ(from, cursor_search_near); to->cursor_update += WT_STAT_READ(from, cursor_update); + to->cursor_truncate += WT_STAT_READ(from, cursor_truncate); to->dh_conn_handle_count += WT_STAT_READ(from, dh_conn_handle_count); to->dh_sweep_ref += WT_STAT_READ(from, dh_sweep_ref); to->dh_sweep_close += WT_STAT_READ(from, dh_sweep_close); @@ -988,8 +1025,10 @@ __wt_stat_connection_aggregate( to->lsm_work_units_created += WT_STAT_READ(from, lsm_work_units_created); to->lsm_work_queue_max += WT_STAT_READ(from, lsm_work_queue_max); + to->rec_page_delete_fast += WT_STAT_READ(from, rec_page_delete_fast); to->rec_pages += WT_STAT_READ(from, rec_pages); to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction); + to->rec_page_delete += WT_STAT_READ(from, rec_page_delete); to->rec_split_stashed_bytes += WT_STAT_READ(from, rec_split_stashed_bytes); to->rec_split_stashed_objects += @@ -1002,6 +1041,10 @@ __wt_stat_connection_aggregate( to->page_locked_blocked += WT_STAT_READ(from, page_locked_blocked); to->page_read_blocked += WT_STAT_READ(from, page_read_blocked); to->page_sleep += WT_STAT_READ(from, page_sleep); + to->txn_snapshots_created += + WT_STAT_READ(from, txn_snapshots_created); + to->txn_snapshots_dropped += + WT_STAT_READ(from, txn_snapshots_dropped); to->txn_begin += WT_STAT_READ(from, txn_begin); to->txn_checkpoint_running += WT_STAT_READ(from, txn_checkpoint_running); @@ -1020,7 +1063,55 @@ __wt_stat_connection_aggregate( to->txn_pinned_range += WT_STAT_READ(from, txn_pinned_range); to->txn_pinned_checkpoint_range += WT_STAT_READ(from, txn_pinned_checkpoint_range); + to->txn_pinned_snapshot_range += + WT_STAT_READ(from, txn_pinned_snapshot_range); to->txn_sync += WT_STAT_READ(from, txn_sync); to->txn_commit += WT_STAT_READ(from, txn_commit); to->txn_rollback += WT_STAT_READ(from, txn_rollback); } + +static const char * const __stats_join_desc[] = { + ": accesses", + ": actual count of items", + ": bloom filter false positives", +}; + +int +__wt_stat_join_desc(WT_CURSOR_STAT *cst, int slot, const char **p) +{ + WT_UNUSED(cst); + *p = __stats_join_desc[slot]; + return (0); +} + +void +__wt_stat_join_init_single(WT_JOIN_STATS *stats) +{ + memset(stats, 0, sizeof(*stats)); +} + +void +__wt_stat_join_clear_single(WT_JOIN_STATS *stats) +{ + stats->accesses = 0; + stats->actual_count = 0; + stats->bloom_false_positive = 0; +} + +void +__wt_stat_join_clear_all(WT_JOIN_STATS **stats) +{ + u_int i; + + for (i = 0; i < WT_COUNTER_SLOTS; ++i) + __wt_stat_join_clear_single(stats[i]); +} + +void +__wt_stat_join_aggregate( + WT_JOIN_STATS **from, WT_JOIN_STATS *to) +{ + to->accesses += WT_STAT_READ(from, accesses); + to->actual_count += WT_STAT_READ(from, actual_count); + to->bloom_false_positive += WT_STAT_READ(from, bloom_false_positive); +} diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index a37fa3555b0..2079410a4d1 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -373,8 +373,11 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) */ F_SET(txn, WT_TXN_SYNC_SET); + /* + * If sync is turned off explicitly, clear the transaction's sync field. + */ if (cval.val == 0) - FLD_CLR(txn->txn_logsync, WT_LOG_FLUSH); + txn->txn_logsync = 0; WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval)); if (cval.len > 0) @@ -481,7 +484,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) * explicit setting. */ if (cval.len == 0) { - if (!FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH) && + if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) && !F_ISSET(txn, WT_TXN_SYNC_SET)) txn->txn_logsync = 0; } else { @@ -650,16 +653,21 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_TXN_GLOBAL *txn_global; WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS **stats; - uint64_t checkpoint_pinned; + uint64_t checkpoint_pinned, snapshot_pinned; conn = S2C(session); txn_global = &conn->txn_global; stats = conn->stats; checkpoint_pinned = txn_global->checkpoint_pinned; + snapshot_pinned = txn_global->nsnap_oldest_id; WT_STAT_SET(session, stats, txn_pinned_range, txn_global->current - txn_global->oldest_id); + WT_STAT_SET(session, stats, txn_pinned_snapshot_range, + snapshot_pinned == WT_TXN_NONE ? + 0 : txn_global->current - snapshot_pinned); + WT_STAT_SET(session, stats, txn_pinned_checkpoint_range, checkpoint_pinned == WT_TXN_NONE ? 0 : txn_global->current - checkpoint_pinned); diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 066abc9ed0f..bc1537ca878 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -297,7 +297,7 @@ __checkpoint_stats( /* * Get time diff in microseconds. */ - msec = WT_TIMEDIFF(*stop, *start) / WT_MILLION; + msec = WT_TIMEDIFF_MS(*stop, *start); if (msec > conn->ckpt_time_max) conn->ckpt_time_max = msec; @@ -327,7 +327,7 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, /* * Get time diff in microseconds. */ - msec = WT_TIMEDIFF(stop, *start) / WT_MILLION; + msec = WT_TIMEDIFF_MS(stop, *start); WT_RET(__wt_verbose(session, WT_VERB_CHECKPOINT, "time: %" PRIu64 " us, gen: %" PRIu64 ": Full database checkpoint %s", diff --git a/src/third_party/wiredtiger/src/txn/txn_nsnap.c b/src/third_party/wiredtiger/src/txn/txn_nsnap.c index a5ec9cb9b82..169929a46de 100644 --- a/src/third_party/wiredtiger/src/txn/txn_nsnap.c +++ b/src/third_party/wiredtiger/src/txn/txn_nsnap.c @@ -47,6 +47,7 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name) TAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE; TAILQ_REMOVE(&txn_global->nsnaph, found, q); __nsnap_destroy(session, found); + WT_STAT_FAST_CONN_INCR(session, txn_snapshots_dropped); return (ret); } @@ -111,6 +112,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive) WT_ASSERT(session, nsnap != NULL); TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q); __nsnap_destroy(session, nsnap); + WT_STAT_FAST_CONN_INCR(session, txn_snapshots_dropped); /* Last will be NULL in the all case so it will never match */ } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph)); @@ -176,6 +178,7 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) if (TAILQ_EMPTY(&txn_global->nsnaph)) txn_global->nsnap_oldest_id = nsnap_new->snap_min; TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); + WT_STAT_FAST_CONN_INCR(session, txn_snapshots_created); nsnap_new = NULL; err: if (started_txn) diff --git a/src/third_party/wiredtiger/tools/wtstats/stat_data.py b/src/third_party/wiredtiger/tools/wtstats/stat_data.py index f2f193c0860..7cee87e49ed 100644 --- a/src/third_party/wiredtiger/tools/wtstats/stat_data.py +++ b/src/third_party/wiredtiger/tools/wtstats/stat_data.py @@ -32,6 +32,7 @@ no_scale_per_second_list = [ 'transaction: transaction checkpoint total time (msecs)', 'transaction: transaction range of IDs currently pinned', 'transaction: transaction range of IDs currently pinned by a checkpoint', + 'transaction: transaction range of IDs currently pinned by named snapshots', 'block-manager: checkpoint size', 'block-manager: file allocation unit size', 'block-manager: file magic number', @@ -95,6 +96,7 @@ no_clear_list = [ 'transaction: transaction checkpoint total time (msecs)', 'transaction: transaction range of IDs currently pinned', 'transaction: transaction range of IDs currently pinned by a checkpoint', + 'transaction: transaction range of IDs currently pinned by named snapshots', 'btree: btree checkpoint generation', 'session: open cursor count', ] |