summary refs log tree commit diff
path: root/src/third_party/wiredtiger
diff options
context:
space:
mode:
author Ramon Fernandez <rfmnyc@gmail.com> 2015-11-19 09:37:38 -0500
committer Ramon Fernandez <rfmnyc@gmail.com> 2015-11-19 09:41:39 -0500
commit a0771ea5ec1b44537d3c409e3d712db24fd8e6bb (patch)
tree 62517780ad0982ec80b8a6d968a72cf0474df617 /src/third_party/wiredtiger
parent 042d8fa2d252142489c5fa3009927bad20d77efb (diff)
download mongo-a0771ea5ec1b44537d3c409e3d712db24fd8e6bb.tar.gz
Import wiredtiger-wiredtiger-mongodb-3.2.0-rc3-177-g9d375e3.tar.gz from wiredtiger branch mongodb-3.2
ref: d9ec1ff..9d375e3
16c0a1a WT-1315 Fix some leaks with join cursors.
59857f9 WT-2222 Add statistics for named snapshots.
4368d39 WT-1315 Cursor join implementation
a72ddb7 WT-2218 Add truncate stats
fb9cebe WT-2224 Track which deleted refs are discarded by a split.
e2f1130 WT-2220 Split WT_TIMEDIFF macro into unit specific macros.
be412b5 WT-2182 when internal pages grow large enough, split them into their parents
ce8c091 WT-2219 Enhancements to in-memory testing
347d922 WT-2220 time_t cleanup.
08c0fcd WT-2217 change WT_CURSOR.insert to clear "set" key/value on return
d1b5e7f WT-2135 Fix log_only setting for backup cursor. Fix initialization.
78bd4ac WT-2210 raw compression fails if row-store recovery precedes column-store recovery
c1b2634 WT-2182 fixes for splitting up the tree.
0a1ee34 WT-2199 Fix transaction sync inconsistency.
ee31bb2 WT-2182 Simplify the split deepen logic.
c360d53 WT-2212 Add a "use_environment" config to "wiredtiger_open"
3f132a4 WT-2182 detect internal page split races.
Diffstat (limited to 'src/third_party/wiredtiger')
-rw-r--r-- src/third_party/wiredtiger/README 6
-rw-r--r-- src/third_party/wiredtiger/RELEASE_INFO 4
-rw-r--r-- src/third_party/wiredtiger/bench/wtperf/wtperf.c 14
-rw-r--r-- src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 8
-rw-r--r-- src/third_party/wiredtiger/build_posix/aclocal/version.m4 2
-rw-r--r-- src/third_party/wiredtiger/build_win/filelist.win 1
-rw-r--r-- src/third_party/wiredtiger/dist/api_data.py 36
-rw-r--r-- src/third_party/wiredtiger/dist/filelist 1
-rw-r--r-- src/third_party/wiredtiger/dist/flags.py 1
-rw-r--r-- src/third_party/wiredtiger/dist/s_define.list 6
-rw-r--r-- src/third_party/wiredtiger/dist/s_funcs.list 2
-rw-r--r-- src/third_party/wiredtiger/dist/s_string.ok 13
-rw-r--r-- src/third_party/wiredtiger/dist/stat.py 44
-rw-r--r-- src/third_party/wiredtiger/dist/stat_data.py 33
-rw-r--r-- src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c 45
-rw-r--r-- src/third_party/wiredtiger/lang/java/java_doc.i 1
-rw-r--r-- src/third_party/wiredtiger/src/bloom/bloom.c 41
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_cursor.c 1
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_debug.c 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_delete.c 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_handle.c 17
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_read.c 6
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c 1808
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_sync.c 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/col_srch.c 33
-rw-r--r-- src/third_party/wiredtiger/src/btree/row_srch.c 89
-rw-r--r-- src/third_party/wiredtiger/src/config/config_def.c 33
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_api.c 12
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_cache_pool.c 2
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_ckpt.c 2
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_log.c 14
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_stat.c 19
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_sweep.c 33
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_backup.c 14
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_dump.c 2
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_file.c 16
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_index.c 54
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_join.c 1054
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_stat.c 146
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_table.c 98
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_lru.c 12
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_page.c 14
-rw-r--r-- src/third_party/wiredtiger/src/include/api.h 16
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h 17
-rw-r--r-- src/third_party/wiredtiger/src/include/btree.i 48
-rw-r--r-- src/third_party/wiredtiger/src/include/config.h 45
-rw-r--r-- src/third_party/wiredtiger/src/include/connection.h 14
-rw-r--r-- src/third_party/wiredtiger/src/include/cursor.h 74
-rw-r--r-- src/third_party/wiredtiger/src/include/cursor.i 64
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h 25
-rw-r--r-- src/third_party/wiredtiger/src/include/flags.h 1
-rw-r--r-- src/third_party/wiredtiger/src/include/misc.h 1
-rw-r--r-- src/third_party/wiredtiger/src/include/misc.i 16
-rw-r--r-- src/third_party/wiredtiger/src/include/mutex.i 6
-rw-r--r-- src/third_party/wiredtiger/src/include/os.h 11
-rw-r--r-- src/third_party/wiredtiger/src/include/stat.h 24
-rw-r--r-- src/third_party/wiredtiger/src/include/wiredtiger.in 435
-rw-r--r-- src/third_party/wiredtiger/src/include/wt_internal.h 12
-rw-r--r-- src/third_party/wiredtiger/src/log/log.c 2
-rw-r--r-- src/third_party/wiredtiger/src/log/log_slot.c 2
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_cursor.c 16
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_manager.c 3
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_merge.c 4
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_stat.c 6
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_tree.c 22
-rw-r--r-- src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c 4
-rw-r--r-- src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c 4
-rw-r--r-- src/third_party/wiredtiger/src/os_posix/os_sleep.c 4
-rw-r--r-- src/third_party/wiredtiger/src/os_posix/os_time.c 18
-rw-r--r-- src/third_party/wiredtiger/src/os_win/os_sleep.c 6
-rw-r--r-- src/third_party/wiredtiger/src/os_win/os_time.c 16
-rw-r--r-- src/third_party/wiredtiger/src/packing/pack_impl.c 105
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c 12
-rw-r--r-- src/third_party/wiredtiger/src/schema/schema_stat.c 8
-rw-r--r-- src/third_party/wiredtiger/src/schema/schema_truncate.c 3
-rw-r--r-- src/third_party/wiredtiger/src/session/session_api.c 167
-rw-r--r-- src/third_party/wiredtiger/src/session/session_compact.c 3
-rw-r--r-- src/third_party/wiredtiger/src/session/session_dhandle.c 5
-rw-r--r-- src/third_party/wiredtiger/src/support/err.c 3
-rw-r--r-- src/third_party/wiredtiger/src/support/stat.c 117
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c 14
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_ckpt.c 4
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_nsnap.c 3
-rw-r--r-- src/third_party/wiredtiger/tools/wtstats/stat_data.py 2
84 files changed, 3779 insertions, 1326 deletions
diff --git a/src/third_party/wiredtiger/README b/src/third_party/wiredtiger/README
index c30b210029d..5056431c95b 100644
--- a/src/third_party/wiredtiger/README
+++ b/src/third_party/wiredtiger/README
@@ -1,6 +1,6 @@
-WiredTiger 2.6.2: (June 4, 2015)
+WiredTiger 2.7.0: (November 19, 2015)
-This is version 2.6.2 of WiredTiger.
+This is version 2.7.0 of WiredTiger.
WiredTiger release packages and documentation can be found at:
@@ -8,7 +8,7 @@ WiredTiger release packages and documentation can be found at:
The documentation for this specific release can be found at:
- http://source.wiredtiger.com/2.6.2/index.html
+ http://source.wiredtiger.com/2.7.0/index.html
The WiredTiger source code can be found at:
diff --git a/src/third_party/wiredtiger/RELEASE_INFO b/src/third_party/wiredtiger/RELEASE_INFO
index a178c2e40fb..1204e262af2 100644
--- a/src/third_party/wiredtiger/RELEASE_INFO
+++ b/src/third_party/wiredtiger/RELEASE_INFO
@@ -1,6 +1,6 @@
WIREDTIGER_VERSION_MAJOR=2
-WIREDTIGER_VERSION_MINOR=6
-WIREDTIGER_VERSION_PATCH=2
+WIREDTIGER_VERSION_MINOR=7
+WIREDTIGER_VERSION_PATCH=0
WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH"
WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"`
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
index 44aff59963c..9ac96862fa1 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
@@ -653,7 +653,7 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) {
goto err;
}
++trk->latency_ops;
- usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+ usecs = WT_TIMEDIFF_US(stop, start);
track_operation(trk, usecs);
}
/* Increment operation count */
@@ -936,7 +936,7 @@ populate_thread(void *arg)
goto err;
}
++trk->latency_ops;
- usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+ usecs = WT_TIMEDIFF_US(stop, start);
track_operation(trk, usecs);
}
++thread->insert.ops; /* Same as trk->ops */
@@ -1068,7 +1068,7 @@ populate_async(void *arg)
goto err;
}
++trk->latency_ops;
- usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+ usecs = WT_TIMEDIFF_US(stop, start);
track_operation(trk, usecs);
}
if ((ret = session->close(session, NULL)) != 0) {
@@ -1386,7 +1386,7 @@ execute_populate(CONFIG *cfg)
}
lprintf(cfg, 0, 1, "Finished load of %" PRIu32 " items", cfg->icount);
- msecs = ns_to_ms(WT_TIMEDIFF(stop, start));
+ msecs = WT_TIMEDIFF_MS(stop, start);
/*
* This is needed as the divisions will fail if the insert takes no time
@@ -1444,7 +1444,7 @@ execute_populate(CONFIG *cfg)
}
lprintf(cfg, 0, 1,
"Compact completed in %" PRIu64 " seconds",
- (uint64_t)(ns_to_sec(WT_TIMEDIFF(stop, start))));
+ (uint64_t)(WT_TIMEDIFF_SEC(stop, start)));
assert(tables == 0);
}
return (0);
@@ -2423,7 +2423,7 @@ worker_throttle(int64_t throttle, int64_t *ops, struct timespec *interval)
* If we did enough operations in less than a second, sleep for
* the rest of the second.
*/
- usecs_to_complete = ns_to_us(WT_TIMEDIFF(now, *interval));
+ usecs_to_complete = WT_TIMEDIFF_US(now, *interval);
if (usecs_to_complete < USEC_PER_SEC)
(void)usleep((useconds_t)(USEC_PER_SEC - usecs_to_complete));
@@ -2457,7 +2457,7 @@ drop_all_tables(CONFIG *cfg)
}
}
(void)__wt_epoch(NULL, &stop);
- msecs = ns_to_ms(WT_TIMEDIFF(stop, start));
+ msecs = WT_TIMEDIFF_MS(stop, start);
lprintf(cfg, 0, 1,
"Executed %" PRIu32 " drop operations average time %" PRIu64 "ms",
cfg->table_count, msecs / cfg->table_count);
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4
index cec54f5e842..c8b89b7842b 100644
--- a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4
+++ b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4
@@ -1,14 +1,14 @@
dnl build by dist/s_version
VERSION_MAJOR=2
-VERSION_MINOR=6
-VERSION_PATCH=2
-VERSION_STRING='"WiredTiger 2.6.2: (June 4, 2015)"'
+VERSION_MINOR=7
+VERSION_PATCH=0
+VERSION_STRING='"WiredTiger 2.7.0: (November 19, 2015)"'
AC_SUBST(VERSION_MAJOR)
AC_SUBST(VERSION_MINOR)
AC_SUBST(VERSION_PATCH)
AC_SUBST(VERSION_STRING)
-VERSION_NOPATCH=2.6
+VERSION_NOPATCH=2.7
AC_SUBST(VERSION_NOPATCH)
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version.m4
index 4a4f3427df7..2ebe4516695 100644
--- a/src/third_party/wiredtiger/build_posix/aclocal/version.m4
+++ b/src/third_party/wiredtiger/build_posix/aclocal/version.m4
@@ -1,2 +1,2 @@
dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version
-2.6.2
+2.7.0
diff --git a/src/third_party/wiredtiger/build_win/filelist.win b/src/third_party/wiredtiger/build_win/filelist.win
index 9d0ee10d305..af6ddf98da9 100644
--- a/src/third_party/wiredtiger/build_win/filelist.win
+++ b/src/third_party/wiredtiger/build_win/filelist.win
@@ -72,6 +72,7 @@ src/cursor/cur_ds.c
src/cursor/cur_dump.c
src/cursor/cur_file.c
src/cursor/cur_index.c
+src/cursor/cur_join.c
src/cursor/cur_json.c
src/cursor/cur_log.c
src/cursor/cur_metadata.c
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 9afff74ca71..f58a48b4a0b 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -705,10 +705,15 @@ wiredtiger_open = wiredtiger_open_common + [
Config('in_memory', 'false', r'''
keep data in-memory only, minimize disk I/O''',
type='boolean', undoc=True),
+ Config('use_environment', 'true', r'''
+ use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment
+ variables if the process is not running with special privileges.
+ See @ref home for more information''',
+ type='boolean'),
Config('use_environment_priv', 'false', r'''
use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment
- variables regardless of whether or not the process is running
- with special privileges. See @ref home for more information''',
+ variables even if the process is running with special privileges.
+ See @ref home for more information''',
type='boolean'),
]
@@ -767,6 +772,33 @@ methods = {
type='boolean'),
]),
+'WT_SESSION.join' : Method([
+ Config('compare', '"eq"', r'''
+ modifies the set of items to be returned so that the index key
+ satisfies the given comparison relative to the key set in this
+ cursor''',
+ choices=['eq', 'ge', 'gt', 'le', 'lt']),
+ Config('count', '', r'''
+ set an approximate count of the elements that would be included in
+ the join. This is used in sizing the bloom filter, and also influences
+ evaluation order for cursors in the join. When the count is equal
+ for multiple bloom filters in a composition of joins, the bloom
+ filter may be shared''',
+ type='int'),
+ Config('bloom_bit_count', '16', r'''
+ the number of bits used per item for the bloom filter''',
+ min='2', max='1000'),
+ Config('bloom_hash_count', '8', r'''
+ the number of hash values per item for the bloom filter''',
+ min='2', max='100'),
+ Config('strategy', '', r'''
+ when set to bloom, a bloom filter is created and populated for
+ this index. This has an up front cost but may reduce the number
+ of accesses to the main table when iterating the joined cursor.
+ The bloom setting requires that count be set''',
+ choices=['bloom', 'default']),
+]),
+
'WT_SESSION.log_flush' : Method([
Config('sync', 'on', r'''
forcibly flush the log and wait for it to achieve the synchronization
diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist
index f33f0e9a962..52af87c2a68 100644
--- a/src/third_party/wiredtiger/dist/filelist
+++ b/src/third_party/wiredtiger/dist/filelist
@@ -72,6 +72,7 @@ src/cursor/cur_ds.c
src/cursor/cur_dump.c
src/cursor/cur_file.c
src/cursor/cur_index.c
+src/cursor/cur_join.c
src/cursor/cur_json.c
src/cursor/cur_log.c
src/cursor/cur_metadata.c
diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py
index 0e2bad0910c..1965dfb7dbe 100644
--- a/src/third_party/wiredtiger/dist/flags.py
+++ b/src/third_party/wiredtiger/dist/flags.py
@@ -26,6 +26,7 @@ flags = {
'LOG_DSYNC',
'LOG_FLUSH',
'LOG_FSYNC',
+ 'LOG_SYNC_ENABLED',
],
'page_read' : [
'READ_CACHE',
diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list
index d204a11835b..8b0d9a0bdcd 100644
--- a/src/third_party/wiredtiger/dist/s_define.list
+++ b/src/third_party/wiredtiger/dist/s_define.list
@@ -4,6 +4,7 @@ API_CALL
API_CALL_NOCONF
API_SESSION_INIT
FLD_MASK
+JOINABLE_CURSOR_CALL_CHECK
LF_MASK
LLONG_MAX
LLONG_MIN
@@ -36,6 +37,8 @@ WT_READ_BARRIER
WT_REF_SIZE
WT_SESSION_LOCKED_CHECKPOINT
WT_SESSION_LOCKED_TURTLE
+WT_STATS_FIELD_TO_SLOT
+WT_STATS_SLOT_ID
WT_STAT_DECR
WT_STAT_DECRV
WT_STAT_FAST_CONN_DECRV
@@ -45,9 +48,8 @@ WT_STAT_FAST_DECRV
WT_STAT_FAST_INCR
WT_STAT_FAST_INCRV
WT_STAT_FAST_SET
-WT_STATS_FIELD_TO_SLOT
-WT_STATS_SLOT_ID
WT_STAT_WRITE
+WT_TIMEDIFF_US
WT_TRET_ERROR_OK
WT_WITH_LOCK
__F
diff --git a/src/third_party/wiredtiger/dist/s_funcs.list b/src/third_party/wiredtiger/dist/s_funcs.list
index 3b5690a4bc2..ed6cf43bb2f 100644
--- a/src/third_party/wiredtiger/dist/s_funcs.list
+++ b/src/third_party/wiredtiger/dist/s_funcs.list
@@ -27,6 +27,8 @@ __wt_log_scan
__wt_nlpo2
__wt_nlpo2_round
__wt_print_huffman_code
+__wt_stat_join_aggregate
+__wt_stat_join_clear_all
__wt_try_readlock
wiredtiger_config_parser_open
wiredtiger_config_validate
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index c14f4c961e6..7de139f6a40 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -221,6 +221,7 @@ OUTBUFF
OVFL
ObWgfvgw
Obama
+Outfmt
PARAM
POSIX
PREDEFINE
@@ -351,6 +352,7 @@ allocfile
allocsize
amd
ao
+ap
api
arg
argc
@@ -421,6 +423,7 @@ checksums
chk
chongo
cip
+cjoin
ckpt
ckptfrag
ckptlist
@@ -464,6 +467,7 @@ curdump
curextract
curfile
curindex
+curjoin
curlog
curmetadata
cursoring
@@ -507,8 +511,10 @@ dev
dh
dhandle
dhandles
+difftime
dir
dirlist
+disjunction
dlclose
dlh
dll
@@ -540,6 +546,7 @@ enqueue
enqueued
env
eof
+eq
equalp
errhandler
errno
@@ -592,6 +599,7 @@ ftruncate
func
gcc
gdb
+ge
getenv
getline
getone
@@ -607,6 +615,7 @@ goesc
gostring
gostruct
goutf
+gt
hashval
havesize
hdr
@@ -632,6 +641,7 @@ indirects
indx
infeasible
inflateInit
+infmt
init
initn
initsize
@@ -650,6 +660,7 @@ io
ip
islocked
ispo
+iter
iteratively
jnr
jrx
@@ -668,6 +679,7 @@ latencies
lbrace
lbracket
ld
+le
len
lenp
level's
@@ -714,6 +726,7 @@ mem
memalign
membar
memcpy
+memget
memmove
memset
memsize
diff --git a/src/third_party/wiredtiger/dist/stat.py b/src/third_party/wiredtiger/dist/stat.py
index c9684665a53..d62fda3fcb9 100644
--- a/src/third_party/wiredtiger/dist/stat.py
+++ b/src/third_party/wiredtiger/dist/stat.py
@@ -5,7 +5,7 @@ import re, string, sys, textwrap
from dist import compare_srcfile
# Read the source files.
-from stat_data import groups, dsrc_stats, connection_stats
+from stat_data import groups, dsrc_stats, connection_stats, join_stats
def print_struct(title, name, base, stats):
'''Print the structures for the stat.h file.'''
@@ -35,9 +35,17 @@ for line in open('../src/include/stat.h', 'r'):
print_struct(
'connections', 'connection', 1000, connection_stats)
print_struct('data sources', 'dsrc', 2000, dsrc_stats)
+ print_struct('join cursors', 'join', 3000, join_stats)
f.close()
compare_srcfile(tmp_file, '../src/include/stat.h')
+def print_defines_one(capname, base, stats):
+ for v, l in enumerate(stats, base):
+ f.write('/*! %s */\n' % '\n * '.join(textwrap.wrap(l.desc, 70)))
+ f.write('#define\tWT_STAT_' + capname + '_' + l.name.upper() + "\t" *
+ max(1, 6 - int((len('WT_STAT_' + capname + '_' + l.name)) / 8)) +
+ str(v) + '\n')
+
def print_defines():
'''Print the #defines for the wiredtiger.in file.'''
f.write('''
@@ -51,11 +59,7 @@ def print_defines():
* @{
*/
''')
- for v, l in enumerate(connection_stats, 1000):
- f.write('/*! %s */\n' % '\n * '.join(textwrap.wrap(l.desc, 70)))
- f.write('#define\tWT_STAT_CONN_' + l.name.upper() + "\t" *
- max(1, 6 - int((len('WT_STAT_CONN_' + l.name)) / 8)) +
- str(v) + '\n')
+ print_defines_one('CONN', 1000, connection_stats)
f.write('''
/*!
* @}
@@ -64,11 +68,16 @@ def print_defines():
* @{
*/
''')
- for v, l in enumerate(dsrc_stats, 2000):
- f.write('/*! %s */\n' % '\n * '.join(textwrap.wrap(l.desc, 70)))
- f.write('#define\tWT_STAT_DSRC_' + l.name.upper() + "\t" *
- max(1, 6 - int((len('WT_STAT_DSRC_' + l.name)) / 8)) +
- str(v) + '\n')
+ print_defines_one('DSRC', 2000, dsrc_stats)
+ f.write('''
+/*!
+ * @}
+ * @name Statistics for join cursors
+ * @anchor statistics_join
+ * @{
+ */
+''')
+ print_defines_one('JOIN', 3000, join_stats)
f.write('/*! @} */\n')
# Update the #defines in the wiredtiger.in file.
@@ -98,10 +107,12 @@ def print_func(name, handle, list):
f.write('};\n')
f.write('''
-const char *
-__wt_stat_''' + name + '''_desc(int slot)
+int
+__wt_stat_''' + name + '''_desc(WT_CURSOR_STAT *cst, int slot, const char **p)
{
-\treturn (__stats_''' + name + '''_desc[slot]);
+\tWT_UNUSED(cst);
+\t*p = __stats_''' + name + '''_desc[slot];
+\treturn (0);
}
''')
@@ -113,7 +124,8 @@ __wt_stat_''' + name + '_init_single(WT_' + name.upper() + '''_STATS *stats)
}
''')
- f.write('''
+ if handle != None:
+ f.write('''
void
__wt_stat_''' + name + '_init(' + handle + ''' *handle)
{
@@ -205,6 +217,7 @@ f.write('#include "wt_internal.h"\n')
print_func('dsrc', 'WT_DATA_HANDLE', dsrc_stats)
print_func('connection', 'WT_CONNECTION_IMPL', connection_stats)
+print_func('join', None, join_stats)
f.close()
compare_srcfile(tmp_file, '../src/support/stat.c')
@@ -224,6 +237,7 @@ for l in sorted(dsrc_stats):
scale_info += ' \'' + l.desc + '\',\n'
if 'no_clear' in l.flags:
clear_info += ' \'' + l.desc + '\',\n'
+# No join statistics can be captured in wtstats
scale_info += ']\n'
clear_info += ']\n'
prefix_info = 'prefix_list = [\n'
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 76fdf185137..3a23071a3f2 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -67,6 +67,10 @@ class DhandleStat(Stat):
prefix = 'data-handle'
def __init__(self, name, desc, flags=''):
Stat.__init__(self, name, DhandleStat.prefix, desc, flags)
+class JoinStat(Stat):
+ prefix = '' # prefix is inserted dynamically
+ def __init__(self, name, desc, flags=''):
+ Stat.__init__(self, name, JoinStat.prefix, desc, flags)
class LogStat(Stat):
prefix = 'log'
def __init__(self, name, desc, flags=''):
@@ -199,7 +203,9 @@ connection_stats = [
'eviction server populating queue, but not evicting pages'),
CacheStat('cache_eviction_slow',
'eviction server unable to reach eviction goal'),
- CacheStat('cache_eviction_split', 'pages split during eviction'),
+ CacheStat('cache_eviction_split_internal',
+ 'internal pages split during eviction'),
+ CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'),
CacheStat('cache_eviction_walk', 'pages walked for eviction'),
CacheStat('cache_eviction_worker_evicting',
'eviction worker thread evicting pages'),
@@ -278,6 +284,8 @@ connection_stats = [
# Reconciliation statistics
##########################################
RecStat('rec_pages', 'page reconciliation calls'),
+ RecStat('rec_page_delete', 'pages deleted'),
+ RecStat('rec_page_delete_fast', 'fast-path pages deleted'),
RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'),
RecStat('rec_split_stashed_bytes',
'split bytes currently awaiting free', 'no_clear,no_scale'),
@@ -309,6 +317,11 @@ connection_stats = [
'no_clear,no_scale'),
TxnStat('txn_pinned_range',
'transaction range of IDs currently pinned', 'no_clear,no_scale'),
+ TxnStat('txn_pinned_snapshot_range',
+ 'transaction range of IDs currently pinned by named snapshots',
+ 'no_clear,no_scale'),
+ TxnStat('txn_snapshots_created', 'number of named snapshots created'),
+ TxnStat('txn_snapshots_dropped', 'number of named snapshots dropped'),
TxnStat('txn_rollback', 'transactions rolled back'),
TxnStat('txn_sync', 'transaction sync calls'),
@@ -349,6 +362,7 @@ connection_stats = [
CursorStat('cursor_restart', 'cursor restarted searches'),
CursorStat('cursor_search', 'cursor search calls'),
CursorStat('cursor_search_near', 'cursor search near calls'),
+ CursorStat('cursor_truncate', 'truncate calls'),
CursorStat('cursor_update', 'cursor update calls'),
##########################################
@@ -390,6 +404,7 @@ dsrc_stats = [
CursorStat('cursor_restart', 'restarted searches'),
CursorStat('cursor_search', 'search calls'),
CursorStat('cursor_search_near', 'search near calls'),
+ CursorStat('cursor_truncate', 'truncate calls'),
CursorStat('cursor_update', 'update calls'),
CursorStat('cursor_update_bytes', 'cursor-update value bytes updated'),
@@ -476,7 +491,9 @@ dsrc_stats = [
'data source pages selected for eviction unable to be evicted'),
CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'),
CacheStat('cache_eviction_internal', 'internal pages evicted'),
- CacheStat('cache_eviction_split', 'pages split during eviction'),
+ CacheStat('cache_eviction_split_internal',
+ 'internal pages split during eviction'),
+ CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'),
CacheStat('cache_inmem_split', 'in-memory page splits'),
CacheStat('cache_inmem_splittable',
'in-memory page passed criteria to be split'),
@@ -518,6 +535,7 @@ dsrc_stats = [
RecStat('rec_overflow_key_leaf', 'leaf-page overflow keys'),
RecStat('rec_overflow_value', 'overflow values written'),
RecStat('rec_page_delete', 'pages deleted'),
+ RecStat('rec_page_delete_fast', 'fast-path pages deleted'),
RecStat('rec_page_match', 'page checksum matches'),
RecStat('rec_pages', 'page reconciliation calls'),
RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'),
@@ -533,3 +551,14 @@ dsrc_stats = [
]
dsrc_stats = sorted(dsrc_stats, key=attrgetter('name'))
+
+##########################################
+# Cursor Join statistics
+##########################################
+join_stats = [
+ JoinStat('accesses', 'accesses'),
+ JoinStat('actual_count', 'actual count of items'),
+ JoinStat('bloom_false_positive', 'bloom filter false positives'),
+]
+
+join_stats = sorted(join_stats, key=attrgetter('name'))
diff --git a/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c b/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c
index 34b8d7c7c64..8d50cc7ec5d 100644
--- a/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c
+++ b/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c
@@ -49,7 +49,8 @@
typedef struct {
WT_EXTRACTOR extractor; /* Must come first */
WT_EXTENSION_API *wt_api; /* Extension API */
- int field_num; /* Field to extract */
+ int field; /* Field to extract */
+ int format_isnum; /* Field contents are numeric */
} CSV_EXTRACTOR;
/*
@@ -61,15 +62,15 @@ csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session,
const WT_ITEM *key, const WT_ITEM *value, WT_CURSOR *result_cursor)
{
char *copy, *p, *pend, *valstr;
- const CSV_EXTRACTOR *cvs_extractor;
- int i, ret;
+ const CSV_EXTRACTOR *csv_extractor;
+ int i, ret, val;
size_t len;
WT_EXTENSION_API *wtapi;
(void)key; /* Unused parameters */
- cvs_extractor = (const CSV_EXTRACTOR *)extractor;
- wtapi = cvs_extractor->wt_api;
+ csv_extractor = (const CSV_EXTRACTOR *)extractor;
+ wtapi = csv_extractor->wt_api;
/* Unpack the value. */
if ((ret = wtapi->struct_unpack(wtapi,
@@ -78,11 +79,11 @@ csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session,
p = valstr;
pend = strchr(p, ',');
- for (i = 0; i < cvs_extractor->field_num && pend != NULL; i++) {
+ for (i = 0; i < csv_extractor->field && pend != NULL; i++) {
p = pend + 1;
pend = strchr(p, ',');
}
- if (i == cvs_extractor->field_num) {
+ if (i == csv_extractor->field) {
if (pend == NULL)
pend = p + strlen(p);
/*
@@ -95,7 +96,12 @@ csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session,
return (errno);
strncpy(copy, p, len);
copy[len] = '\0';
- result_cursor->set_key(result_cursor, copy);
+ if (csv_extractor->format_isnum) {
+ if ((val = atoi(copy)) < 0)
+ return (EINVAL);
+ result_cursor->set_key(result_cursor, val);
+ } else
+ result_cursor->set_key(result_cursor, copy);
ret = result_cursor->insert(result_cursor);
free(copy);
if (ret != 0)
@@ -107,7 +113,7 @@ csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session,
/*
* csv_customize --
* The customize function creates a customized extractor,
- * needed to save the field number.
+ * needed to save the field number and format.
*/
static int
csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session,
@@ -115,20 +121,37 @@ csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session,
{
const CSV_EXTRACTOR *orig;
CSV_EXTRACTOR *csv_extractor;
+ WT_CONFIG_ITEM field, format;
+ WT_CONFIG_PARSER *parser;
+ WT_EXTENSION_API *wtapi;
+ int ret;
long field_num;
(void)session; /* Unused parameters */
(void)uri; /* Unused parameters */
orig = (const CSV_EXTRACTOR *)extractor;
- field_num = strtol(appcfg->str, NULL, 10);
+ wtapi = orig->wt_api;
+ if ((ret = wtapi->config_parser_open(wtapi, session, appcfg->str,
+ appcfg->len, &parser)) != 0)
+ return (ret);
+ if ((ret = parser->get(parser, "field", &field)) != 0 ||
+ (ret = parser->get(parser, "format", &format)) != 0) {
+ if (ret == WT_NOTFOUND)
+ return (EINVAL);
+ return (ret);
+ }
+ field_num = strtol(field.str, NULL, 10);
if (field_num < 0 || field_num > INT_MAX)
return (EINVAL);
+ if (format.len != 1 || (format.str[0] != 'S' && format.str[0] != 'i'))
+ return (EINVAL);
if ((csv_extractor = calloc(1, sizeof(CSV_EXTRACTOR))) == NULL)
return (errno);
*csv_extractor = *orig;
- csv_extractor->field_num = (int)field_num;
+ csv_extractor->field = field_num;
+ csv_extractor->format_isnum = (format.str[0] == 'i');
*customp = (WT_EXTRACTOR *)csv_extractor;
return (0);
}
diff --git a/src/third_party/wiredtiger/lang/java/java_doc.i b/src/third_party/wiredtiger/lang/java/java_doc.i
index 75c14dbfe8f..17317ab875b 100644
--- a/src/third_party/wiredtiger/lang/java/java_doc.i
+++ b/src/third_party/wiredtiger/lang/java/java_doc.i
@@ -33,6 +33,7 @@ COPYDOC(__wt_session, WT_SESSION, open_cursor)
COPYDOC(__wt_session, WT_SESSION, create)
COPYDOC(__wt_session, WT_SESSION, compact)
COPYDOC(__wt_session, WT_SESSION, drop)
+COPYDOC(__wt_session, WT_SESSION, join)
COPYDOC(__wt_session, WT_SESSION, log_flush)
COPYDOC(__wt_session, WT_SESSION, log_printf)
COPYDOC(__wt_session, WT_SESSION, rename)
diff --git a/src/third_party/wiredtiger/src/bloom/bloom.c b/src/third_party/wiredtiger/src/bloom/bloom.c
index 9225b9fe3b5..e3a21f25dc1 100644
--- a/src/third_party/wiredtiger/src/bloom/bloom.c
+++ b/src/third_party/wiredtiger/src/bloom/bloom.c
@@ -314,6 +314,47 @@ __wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key)
}
/*
+ * __wt_bloom_inmem_get --
+ * Tests whether the given key is in the Bloom filter.
+ * This can be used in place of __wt_bloom_get
+ * for Bloom filters that are memory only.
+ */
+int
+__wt_bloom_inmem_get(WT_BLOOM *bloom, WT_ITEM *key)
+{
+ uint64_t h1, h2;
+ uint32_t i;
+
+ h1 = __wt_hash_fnv64(key->data, key->size);
+ h2 = __wt_hash_city64(key->data, key->size);
+ for (i = 0; i < bloom->k; i++, h1 += h2) {
+ if (!__bit_test(bloom->bitstring, h1 % bloom->m))
+ return (WT_NOTFOUND);
+ }
+ return (0);
+}
+
+/*
+ * __wt_bloom_intersection --
+ * Modify the Bloom filter to contain the intersection of this
+ * filter with another.
+ */
+int
+__wt_bloom_intersection(WT_BLOOM *bloom, WT_BLOOM *other)
+{
+ uint64_t i, nbytes;
+
+ if (bloom->k != other->k || bloom->factor != other->factor ||
+ bloom->m != other->m || bloom->n != other->n)
+ return (EINVAL);
+
+ nbytes = __bitstr_size(bloom->m);
+ for (i = 0; i < nbytes; i++)
+ bloom->bitstring[i] &= other->bitstring[i];
+ return (0);
+}
+
+/*
* __wt_bloom_close --
* Close the Bloom filter, release any resources.
*/
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 3290fd6374c..69512f45933 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -1093,6 +1093,7 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
cbt = (start != NULL) ? start : stop;
session = (WT_SESSION_IMPL *)cbt->iface.session;
btree = cbt->btree;
+ WT_STAT_FAST_DATA_INCR(session, cursor_truncate);
/*
* We always delete in a forward direction because it's faster, assert
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 8edc40794e2..0f47c060daf 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -566,7 +566,7 @@ __debug_tree(
/* A NULL page starts at the top of the tree -- it's a convenience. */
if (page == NULL)
- page = S2BT(session)->root.page;
+ page = btree->root.page;
WT_WITH_BTREE(session, btree, ret = __debug_page(ds, page, flags));
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index 757b7b51cdd..98c6390e0f4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -138,6 +138,8 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
WT_ERR(__wt_txn_modify_ref(session, ref));
*skipp = true;
+ WT_STAT_FAST_CONN_INCR(session, rec_page_delete_fast);
+ WT_STAT_FAST_DATA_INCR(session, rec_page_delete_fast);
WT_PUBLISH(ref->state, WT_REF_DELETED);
return (0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index 3e611a107ab..dbdf94fc1b6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -643,11 +643,13 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
uint64_t cache_size;
uint32_t intl_split_size, leaf_split_size;
const char **cfg;
btree = S2BT(session);
+ conn = S2C(session);
cfg = btree->dhandle->cfg;
/*
@@ -688,8 +690,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
btree->maxmempage =
WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage);
- if (!F_ISSET(S2C(session), WT_CONN_CACHE_POOL)) {
- if ((cache_size = S2C(session)->cache_size) > 0)
+ if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) {
+ if ((cache_size = conn->cache_size) > 0)
btree->maxmempage =
WT_MIN(btree->maxmempage, cache_size / 4);
}
@@ -723,6 +725,17 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
/*
* Get the maximum internal/leaf page key/value sizes.
*
+ * In-memory configuration overrides any key/value sizes, there's no
+ * such thing as an overflow item in an in-memory configuration.
+ */
+ if (F_ISSET(conn, WT_CONN_IN_MEMORY)) {
+ btree->maxintlkey = WT_BTREE_MAX_OBJECT_SIZE;
+ btree->maxleafkey = WT_BTREE_MAX_OBJECT_SIZE;
+ btree->maxleafvalue = WT_BTREE_MAX_OBJECT_SIZE;
+ return (0);
+ }
+
+ /*
* In historic versions of WiredTiger, the maximum internal/leaf page
* key/value sizes were set by the internal_item_max and leaf_item_max
* configuration strings. Look for those strings if we don't find the
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index e60f7b3fb02..389ac761c5b 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -586,8 +586,8 @@ skip_evict:
* CPU to no purpose.
*/
if (stalled)
- wait_cnt += 1000;
- else if (++wait_cnt < 1000) {
+ wait_cnt += WT_THOUSAND;
+ else if (++wait_cnt < WT_THOUSAND) {
__wt_yield();
continue;
}
@@ -603,7 +603,7 @@ skip_evict:
if (cache_work)
continue;
}
- sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
+ sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000);
WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
__wt_sleep(0, sleep_cnt);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 9e45bf10a5c..caba12b78f1 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -169,54 +169,58 @@ __split_safe_free(WT_SESSION_IMPL *session,
return (__split_stash_add(session, split_gen, p, s));
}
+#ifdef HAVE_DIAGNOSTIC
/*
- * __split_should_deepen --
- * Return if we should deepen the tree.
+ * __split_verify_intl_key_order --
+ * Verify the key order on an internal page after a split, diagnostic only.
*/
-static bool
-__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
+static void
+__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
- WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
+ WT_ITEM *next, _next, *last, _last, *tmp;
+ WT_REF *ref;
+ uint64_t recno;
+ int cmp;
+ bool first;
btree = S2BT(session);
- page = ref->page;
-
- /*
- * Our caller is holding the parent page locked to single-thread splits,
- * which means we can safely look at the page's index without setting a
- * split generation.
- */
- pindex = WT_INTL_INDEX_GET_SAFE(page);
-
- /*
- * Sanity check for a reasonable number of keys on-page keys. Splitting
- * with too few keys leads to excessively deep trees.
- */
- if (pindex->entries < 100)
- return (false);
-
- /*
- * Deepen the tree if the page's memory footprint is larger than the
- * maximum size for a page in memory (presumably putting eviction
- * pressure on the cache).
- */
- if (page->memory_footprint > btree->maxmempage)
- return (true);
- /*
- * Check if the page has enough keys to make it worth splitting. If
- * the number of keys is allowed to grow too large, the cost of
- * splitting into parent pages can become large enough to result
- * in slow operations.
- */
- if (!__wt_ref_is_root(ref) &&
- pindex->entries > btree->split_deepen_min_child)
- return (true);
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ recno = 0; /* Less than any valid record number. */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->key.recno > recno);
+ recno = ref->key.recno;
+ } WT_INTL_FOREACH_END;
+ break;
+ case WT_PAGE_ROW_INT:
+ next = &_next;
+ WT_CLEAR(_next);
+ last = &_last;
+ WT_CLEAR(_last);
- return (false);
+ first = true;
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __wt_ref_key(page, ref, &next->data, &next->size);
+ if (last->size == 0) {
+ if (first)
+ first = false;
+ else {
+ WT_ASSERT(session, __wt_compare(
+ session, btree->collator, last,
+ next, &cmp) == 0);
+ WT_ASSERT(session, cmp < 0);
+ }
+ }
+ tmp = last;
+ last = next;
+ next = tmp;
+ } WT_INTL_FOREACH_END;
+ break;
+ }
}
+#endif
/*
* __split_ovfl_key_cleanup --
@@ -267,47 +271,58 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
}
/*
- * __split_ref_deepen_move --
- * Move a WT_REF from a parent to a child in service of a split to deepen
- * the tree, including updating the accounting information.
+ * __split_ref_move --
+ * Move a WT_REF from one page to another, including updating accounting
+ * information.
*/
static int
-__split_ref_deepen_move(WT_SESSION_IMPL *session,
- WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
+__split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
+ WT_REF **from_refp, size_t *decrp, WT_REF **to_refp, size_t *incrp)
{
WT_ADDR *addr;
WT_CELL_UNPACK unpack;
WT_DECL_RET;
WT_IKEY *ikey;
+ WT_REF *ref;
size_t size;
void *key;
+ ref = *from_refp;
+
/*
+ * The from-home argument is the page into which the "from" WT_REF may
+ * point, for example, if there's an on-page key the "from" WT_REF
+ * references, it will be on the page "from-home".
+ *
* Instantiate row-store keys, and column- and row-store addresses in
- * the WT_REF structures referenced by a page that's being split (and
- * deepening the tree). The WT_REF structures aren't moving, but the
- * index references are moving from the page we're splitting to a set
- * of child pages, and so we can no longer reference the block image
- * that remains with the page being split.
+ * the WT_REF structures referenced by a page that's being split. The
+ * WT_REF structures aren't moving, but the index references are moving
+ * from the page we're splitting to a set of new pages, and so we can
+ * no longer reference the block image that remains with the page being
+ * split.
*
* No locking is required to update the WT_REF structure because we're
- * the only thread splitting the parent page, and there's no way for
- * readers to race with our updates of single pointers. The changes
- * have to be written before the page goes away, of course, our caller
- * owns that problem.
- *
- * Row-store keys, first.
+ * the only thread splitting the page, and there's no way for readers
+ * to race with our updates of single pointers. The changes have to be
+ * written before the page goes away, of course, our caller owns that
+ * problem.
*/
- if (parent->type == WT_PAGE_ROW_INT) {
+ if (from_home->type == WT_PAGE_ROW_INT) {
+ /*
+ * Row-store keys: if it's not yet instantiated, instantiate it.
+ * If already instantiated, check for overflow cleanup (overflow
+ * keys are always instantiated).
+ */
if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
- __wt_ref_key(parent, ref, &key, &size);
+ __wt_ref_key(from_home, ref, &key, &size);
WT_RET(__wt_row_ikey(session, 0, key, size, ref));
ikey = ref->key.ikey;
} else {
- WT_RET(__split_ovfl_key_cleanup(session, parent, ref));
- *parent_decrp += sizeof(WT_IKEY) + ikey->size;
+ WT_RET(
+ __split_ovfl_key_cleanup(session, from_home, ref));
+ *decrp += sizeof(WT_IKEY) + ikey->size;
}
- *child_incrp += sizeof(WT_IKEY) + ikey->size;
+ *incrp += sizeof(WT_IKEY) + ikey->size;
}
/*
@@ -316,7 +331,7 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session,
* get the address from the on-page cell.
*/
addr = ref->addr;
- if (addr != NULL && !__wt_off_page(parent, addr)) {
+ if (addr != NULL && !__wt_off_page(from_home, addr)) {
__wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
WT_RET(__wt_calloc_one(session, &addr));
if ((ret = __wt_strndup(
@@ -330,364 +345,1048 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session,
ref->addr = addr;
}
- /* And finally, the WT_REF itself. */
- WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF));
+ /* And finally, copy the WT_REF pointer itself. */
+ *to_refp = ref;
+ WT_MEM_TRANSFER(*decrp, *incrp, sizeof(WT_REF));
return (0);
}
-#ifdef HAVE_DIAGNOSTIC
/*
- * __split_verify_intl_key_order --
- * Verify the key order on an internal page after a split, diagnostic only.
+ * __split_child_block_evict_and_split --
+ * Ensure the newly created child isn't evicted or split for now.
*/
static void
-__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
+__split_child_block_evict_and_split(WT_PAGE *child)
{
- WT_BTREE *btree;
- WT_ITEM *next, _next, *last, _last, *tmp;
- WT_REF *ref;
- uint64_t recno;
- int cmp;
- bool first;
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page which previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * newly created page's modify structure has a field with a transaction
+ * ID that's checked before any internal page is evicted. Unfortunately,
+ * we don't know the correct value until we update the original page's
+ * index (we need a transaction ID from after that update), but the act
+ * of updating the original page's index is what allows the eviction to
+ * happen.
+ *
+ * Once the split is live, newly created internal pages might themselves
+ * split. The split itself is not the problem: if a page splits before
+ * we fix up its WT_REF (in other words, a WT_REF we move is then moved
+ * again, before we reset the underlying page's parent reference), it's
+ * OK because the test we use to find a WT_REF and WT_PAGE that require
+ * fixing up is only that the WT_REF points to the wrong parent, not that
+ * it points to a specific wrong parent. The problem is our fix up of the
+ * WT_REFs in the created page could race with the subsequent fix of the
+ * same WT_REFs (in a different created page), we'd have to acquire some
+ * lock to prevent that race, and that's going to be difficult at best.
+ *
+ * For now, block eviction and splits in newly created pages until they
+ * have been fixed up.
+ */
+ F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+}
- btree = S2BT(session);
+/*
+ * __split_ref_move_final --
+ * Finalize the moved WT_REF structures after the split succeeds.
+ */
+static int
+__split_ref_move_final(
+ WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries)
+{
+ WT_DECL_RET;
+ WT_PAGE *child;
+ WT_REF *ref, *child_ref;
+ uint64_t txn_new_id;
+ uint32_t i;
- switch (page->type) {
- case WT_PAGE_COL_INT:
- recno = 0; /* Less than any valid record number. */
- WT_INTL_FOREACH_BEGIN(session, page, ref) {
- WT_ASSERT(session, ref->key.recno > recno);
- recno = ref->key.recno;
- } WT_INTL_FOREACH_END;
- break;
- case WT_PAGE_ROW_INT:
- next = &_next;
- WT_CLEAR(_next);
- last = &_last;
- WT_CLEAR(_last);
+ /*
+ * When creating new internal pages as part of a split, we set a field
+ * in those pages' modify structure to prevent them from being evicted
+ * until all threads are known to have exited the index of the page that
+ * previously "owned" the WT_REF. Set that field to a safe value.
+ */
+ txn_new_id = __wt_txn_new_id(session);
- first = true;
- WT_INTL_FOREACH_BEGIN(session, page, ref) {
- __wt_ref_key(page, ref, &next->data, &next->size);
- if (last->size == 0) {
- if (first)
- first = false;
- else {
- WT_ASSERT(session, __wt_compare(
- session, btree->collator, last,
- next, &cmp) == 0);
- WT_ASSERT(session, cmp < 0);
- }
+ /*
+ * The WT_REF structures moved to newly allocated child pages reference
+ * the wrong parent page and we have to fix that up. The problem is
+ * revealed when a thread of control searches for the child page's
+ * reference structure slot, and fails to find it because the parent
+ * page being searched no longer references the child. When that failure
+ * happens the thread waits for the reference's home page to be updated,
+ * which we do here: walk the children and fix them up.
+ */
+ for (i = 0; i < entries; ++i, ++refp) {
+ ref = *refp;
+
+ /*
+ * We don't hold hazard pointers on created pages, they cannot
+ * be evicted because the page-modify transaction value set as
+ * they were created prevents eviction. (See above, we reset
+ * that value as part of fixing up the page.) But, an eviction
+ * thread might be attempting to evict the page (the WT_REF may
+ * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF
+ * may be WT_REF_READING), or it may be in some other state.
+ * Acquire a hazard pointer for any in-memory pages so we know
+ * the state of the page. Ignore pages not in-memory (deleted,
+ * on-disk, being read), there's no in-memory structure to fix.
+ */
+ if ((ret = __wt_page_in(session,
+ ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND)
+ continue;
+ WT_ERR(ret);
+
+ child = ref->page;
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, child));
+#endif
+ /*
+ * We use a page flag to prevent the child from splitting from
+ * underneath us, but the split-generation error checks don't
+ * know about that flag; use the standard macros to ensure that
+ * reading the child's page index structure is safe.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+ WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ /*
+ * The page's home reference may not be wrong, as we
+ * opened up access from the top of the tree already,
+ * disk pages may have been read in since then, and
+ * those pages would have correct parent references.
+ */
+ if (child_ref->home != child) {
+ child_ref->home = child;
+ child_ref->pindex_hint = 0;
+
+ child->modify->mod_split_txn = txn_new_id;
}
- tmp = last;
- last = next;
- next = tmp;
} WT_INTL_FOREACH_END;
- break;
+ WT_LEAVE_PAGE_INDEX(session);
+
+ /* The child can now be evicted or split. */
+ F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
+
+ WT_ERR(__wt_hazard_clear(session, child));
}
+
+ /*
+ * Push out the changes: not required for correctness, but don't let
+ * threads spin on incorrect page references longer than necessary.
+ */
+ WT_FULL_BARRIER();
+ return (0);
+
+err: /* Something really bad just happened. */
+ WT_PANIC_RET(session, ret, "fatal error resolving a split");
}
-#endif
/*
- * __split_deepen --
- * Split an internal page in-memory, deepening the tree.
+ * __split_root --
+ * Split the root page in-memory, deepening the tree.
*/
static int
-__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
+__split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *child;
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
WT_REF **alloc_refp;
- WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref;
- size_t child_incr, parent_decr, parent_incr, size;
+ WT_REF **child_refp, *ref, **root_refp;
+ size_t child_incr, root_decr, root_incr, size;
uint64_t split_gen;
- uint32_t children, chunk, i, j, moved_entries, new_entries, remain;
- uint32_t skip_leading, slots;
+ uint32_t children, chunk, i, j, remain;
+ uint32_t slots;
bool complete;
void *p;
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
btree = S2BT(session);
alloc_index = NULL;
- parent_incr = parent_decr = 0;
+ root_decr = root_incr = 0;
complete = false;
+ /* The root page will be marked dirty, make sure that will succeed. */
+ WT_RET(__wt_page_modify_init(session, root));
+
/*
- * Our caller is holding the parent page locked to single-thread splits,
+ * Our caller is holding the root page locked to single-thread splits,
* which means we can safely look at the page's index without setting a
* split generation.
*/
- pindex = WT_INTL_INDEX_GET_SAFE(parent);
+ pindex = WT_INTL_INDEX_GET_SAFE(root);
/*
- * A prepending/appending workload will repeatedly deepen parts of the
- * tree that aren't changing, and appending workloads are not uncommon.
- * First, keep the first/last pages of the tree at their current level,
- * to catch simple workloads. Second, track the number of entries which
- * resulted from the last time we deepened this page, and if we refilled
- * this page without splitting into those slots, ignore them for this
- * split. It's not exact because an eviction might split into any part
- * of the page: if 80% of the splits are at the end of the page, assume
- * an append-style workload. Of course, the plan eventually fails: when
- * repeatedly deepening this page for an append-only workload, we will
- * progressively ignore more and more of the slots. When ignoring 90% of
- * the slots, deepen the entire page again.
- *
- * Figure out how many slots we're leaving at this level and how many
- * child pages we're creating.
+ * Decide how many child pages to create, then calculate the standard
+ * chunk and whatever remains. Sanity check the number of children:
+ * the decision to split matched to the deepen-per-child configuration
+ * might get it wrong.
*/
-#undef skip_trailing
-#define skip_trailing 1
- skip_leading = 1;
- new_entries = pindex->entries - parent->pg_intl_deepen_split_last;
- if (parent->pg_intl_deepen_split_append > (new_entries * 8) / 10)
- skip_leading = parent->pg_intl_deepen_split_last;
- if (skip_leading > (pindex->entries * 9) * 10)
- skip_leading = 1;
-
- /*
- * In a few (rare) cases we split pages with only a few entries, and in
- * those cases we keep it simple, 10 children, skip only first and last
- * entries. Otherwise, split into a lot of child pages.
- */
- moved_entries = pindex->entries - (skip_leading + skip_trailing);
- children = moved_entries / btree->split_deepen_per_child;
+ children = pindex->entries / btree->split_deepen_per_child;
if (children < 10) {
+ if (pindex->entries < 100)
+ return (EBUSY);
children = 10;
- skip_leading = 1;
- moved_entries =
- pindex->entries - (skip_leading + skip_trailing);
}
+ chunk = pindex->entries / children;
+ remain = pindex->entries - chunk * (children - 1);
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
- "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
- parent, pindex->entries, children));
+ "%p: %" PRIu32 " root page elements, splitting into %" PRIu32
+ " children",
+ root, pindex->entries, children));
/*
- * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize
- * the slots of the allocated WT_PAGE_INDEX to point to the pages we're
- * keeping at the current level, and the rest of the slots to point to
- * new WT_REF objects.
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted
+ * into the root page, replacing the root's page-index.
*/
- size = sizeof(WT_PAGE_INDEX) +
- (children + skip_leading + skip_trailing) * sizeof(WT_REF *);
+ size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
- parent_incr += size;
+ root_incr += size;
alloc_index->index = (WT_REF **)(alloc_index + 1);
- alloc_index->entries = children + skip_leading + skip_trailing;
- for (alloc_refp = alloc_index->index,
- i = 0; i < skip_leading; ++alloc_refp, ++i)
- alloc_index->index[i] = pindex->index[i];
- for (i = 0; i < children; ++alloc_refp, ++i)
+ alloc_index->entries = children;
+ alloc_refp = alloc_index->index;
+ for (i = 0; i < children; alloc_refp++, ++i)
WT_ERR(__wt_calloc_one(session, alloc_refp));
- parent_incr += children * sizeof(WT_REF);
- alloc_index->index[alloc_index->entries - 1] =
- pindex->index[pindex->entries - 1];
+ root_incr += children * sizeof(WT_REF);
/* Allocate child pages, and connect them into the new page index. */
- chunk = moved_entries / children;
- remain = moved_entries - chunk * (children - 1);
- for (parent_refp = pindex->index + skip_leading,
- alloc_refp = alloc_index->index + skip_leading,
- i = 0; i < children; ++i) {
+ for (root_refp = pindex->index,
+ alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
slots = i == children - 1 ? remain : chunk;
WT_ERR(__wt_page_alloc(
- session, parent->type, 0, slots, false, &child));
+ session, root->type, 0, slots, false, &child));
/*
- * Initialize the parent page's child reference; we need a copy
- * of the page's key.
+ * Initialize the page's child reference; we need a copy of the
+ * page's key.
*/
ref = *alloc_refp++;
- ref->home = parent;
+ ref->home = root;
ref->page = child;
ref->addr = NULL;
- if (parent->type == WT_PAGE_ROW_INT) {
- __wt_ref_key(parent, *parent_refp, &p, &size);
+ if (root->type == WT_PAGE_ROW_INT) {
+ __wt_ref_key(root, *root_refp, &p, &size);
WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
- parent_incr += sizeof(WT_IKEY) + size;
+ root_incr += sizeof(WT_IKEY) + size;
} else
- ref->key.recno = (*parent_refp)->key.recno;
+ ref->key.recno = (*root_refp)->key.recno;
ref->state = WT_REF_MEM;
/* Initialize the child page. */
- if (parent->type == WT_PAGE_COL_INT)
- child->pg_intl_recno = (*parent_refp)->key.recno;
+ if (root->type == WT_PAGE_COL_INT)
+ child->pg_intl_recno = (*root_refp)->key.recno;
child->pg_intl_parent_ref = ref;
/* Mark it dirty. */
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
- /*
- * Once the split goes live, the newly created internal pages
- * might be evicted and their WT_REF structures freed. If those
- * pages are evicted before threads exit the previous page index
- * array, a thread might see a freed WT_REF. Set the eviction
- * transaction requirement for the newly created internal pages.
- */
- child->modify->mod_split_txn = __wt_txn_new_id(session);
+ /* Ensure the page isn't evicted or split for now. */
+ __split_child_block_evict_and_split(child);
/*
* The newly allocated child's page index references the same
- * structures as the parent. (We cannot move WT_REF structures,
+ * structures as the root. (We cannot move WT_REF structures,
* threads may be underneath us right now changing the structure
* state.) However, if the WT_REF structures reference on-page
* information, we have to fix that, because the disk image for
* the page that has an page index entry for the WT_REF is about
* to change.
*/
- child_incr = 0;
child_pindex = WT_INTL_INDEX_GET_SAFE(child);
- for (child_refp = child_pindex->index, j = 0; j < slots; ++j) {
- WT_ERR(__split_ref_deepen_move(session,
- parent, *parent_refp, &parent_decr, &child_incr));
- *child_refp++ = *parent_refp++;
- }
+ child_incr = 0;
+ for (child_refp = child_pindex->index,
+ j = 0; j < slots; ++child_refp, ++root_refp, ++j)
+ WT_ERR(__split_ref_move(session, root,
+ root_refp, &root_decr, child_refp, &child_incr));
+
__wt_cache_page_inmem_incr(session, child, child_incr);
}
WT_ASSERT(session,
- alloc_refp - alloc_index->index ==
- (ptrdiff_t)(alloc_index->entries - skip_trailing));
- WT_ASSERT(session, parent_refp - pindex->index ==
- (ptrdiff_t)(pindex->entries - skip_trailing));
+ alloc_refp - alloc_index->index == (ptrdiff_t)alloc_index->entries);
+ WT_ASSERT(session,
+ root_refp - pindex->index == (ptrdiff_t)pindex->entries);
/*
- * Confirm the parent page's index hasn't moved, then update it, which
+ * Confirm the root page's index hasn't moved, then update it, which
* makes the split visible to threads descending the tree. From this
* point on, we're committed to the split.
*
* A note on error handling: until this point, there's no problem with
* unwinding on error. We allocated a new page index, a new set of
* WT_REFs and a new set of child pages -- if an error occurred, the
- * parent remained unchanged, although it may have an incorrect memory
- * footprint. From now on we've modified the parent page, attention
+ * root remained unchanged, although it may have an incorrect memory
+ * footprint. From now on we've modified the root page, attention
* needs to be paid. However, subsequent failures are relatively benign,
* the split is OK and complete. For that reason, we ignore errors past
* this point unless there's a panic.
*/
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
+ WT_INTL_INDEX_SET(root, alloc_index);
+ complete = true;
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, root));
+#endif
+ /* Fix up the moved WT_REF structures. */
+ WT_ERR(__split_ref_move_final(
+ session, alloc_index->index, alloc_index->entries));
+
+ /* We've installed the allocated page-index, ensure error handling. */
+ alloc_index = NULL;
+
+ /*
+ * We can't free the previous root's index, there may be threads using
+ * it. Add to the session's discard list, to be freed once we know no
+ * threads can still be using it.
+ *
+ * This change requires care with error handling: we have already
+ * updated the page with a new index. Even if stashing the old value
+ * fails, we don't roll back that change, because threads may already
+ * be using the new index.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
+ root_decr += size;
+
+ /* Adjust the root's memory footprint and mark it dirty. */
+ __wt_cache_page_inmem_incr(session, root, root_incr);
+ __wt_cache_page_inmem_decr(session, root, root_decr);
+ __wt_page_modify_set(session, root);
+
+err: /*
+ * If complete is true, we saw an error after opening up the tree to
+ * descent through the root page's new index. There is nothing we
+ * can do, there are threads potentially active in both versions of
+ * the tree.
+ *
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened, and our caller has to proceed
+ * with the split.
+ */
+ if (!complete)
+ __wt_free_ref_index(session, root, alloc_index, true);
+
+ if (ret != 0 && ret != WT_PANIC)
+ __wt_err(session, ret,
+ "ignoring not-fatal error during root page split to "
+ "deepen the tree");
+ return (ret == WT_PANIC || !complete ? ret : 0);
+}
+
+/*
+ * __split_parent --
+ * Resolve a multi-page split, inserting new information into the parent.
+ */
+static int
+__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
+ uint32_t new_entries, size_t parent_incr, bool exclusive, bool discard)
+{
+ WT_DECL_ITEM(scr);
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_PAGE *parent;
+ WT_PAGE_INDEX *alloc_index, *pindex;
+ WT_REF **alloc_refp, *next_ref;
+ size_t parent_decr, size;
+ uint64_t split_gen;
+ uint32_t i, j;
+ uint32_t deleted_entries, parent_entries, result_entries;
+ uint32_t *deleted_refs;
+ bool complete, empty_parent;
+
+ parent = ref->home;
+
+ alloc_index = pindex = NULL;
+ parent_decr = 0;
+ parent_entries = 0;
+ complete = empty_parent = false;
+
+ /* The parent page will be marked dirty, make sure that will succeed. */
+ WT_RET(__wt_page_modify_init(session, parent));
+
+ /*
+ * We've locked the parent, which means it cannot split (which is the
+ * only reason to worry about split generation values).
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(parent);
+ parent_entries = pindex->entries;
+
+ /*
+ * Remove any refs to deleted pages while we are splitting, we have
+ * the internal page locked down, and are copying the refs into a new
+ * array anyway. Switch them to the special split state, so that any
+ * reading thread will restart.
+ */
+ WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
+ for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
+ if ((discard && next_ref == ref) ||
+ (next_ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, next_ref, true) &&
+ __wt_atomic_casv32(
+ &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))) {
+ WT_ERR(__wt_buf_grow(session, scr,
+ (deleted_entries + 1) * sizeof(uint32_t)));
+ deleted_refs = scr->mem;
+ deleted_refs[deleted_entries++] = i;
+ }
+ }
+
+ /*
+ * The final entry count consists of the original count, plus any new
+ * pages, less any WT_REFs we're removing (deleted entries plus the
+ * entry we're replacing).
+ */
+ result_entries = (parent_entries + new_entries) - deleted_entries;
+ if (!discard)
+ --result_entries;
+
+ /*
+ * If there are no remaining entries on the parent, give up, we can't
+ * leave an empty internal page. Mark it to be evicted soon and clean
+ * up any references that have changed state.
+ */
+ if (result_entries == 0) {
+ empty_parent = true;
+ __wt_page_evict_soon(parent);
+ goto err;
+ }
+
+ /*
+ * Allocate and initialize a new page index array for the parent, then
+ * copy references from the original index array, plus references from
+ * the newly created split array, into place.
+ */
+ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ parent_incr += size;
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = result_entries;
+ for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref == ref)
+ for (j = 0; j < new_entries; ++j) {
+ ref_new[j]->home = parent;
+ *alloc_refp++ = ref_new[j];
+ }
+ else if (next_ref->state != WT_REF_SPLIT)
+ /* Skip refs we have marked for deletion. */
+ *alloc_refp++ = next_ref;
+ }
+
+ /* Check that we filled in all the entries. */
+ WT_ASSERT(session,
+ alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
+
+ /*
+ * Confirm the parent page's index hasn't moved then update it, which
+ * makes the split visible to threads descending the tree.
+ */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
WT_INTL_INDEX_SET(parent, alloc_index);
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- complete = true;
+ alloc_index = NULL;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, parent));
#endif
+
/*
- * Save the number of entries created by deepening the tree and reset
- * the count of splits into this page after that point.
+ * If discarding the page's original WT_REF field, reset it to split.
+ * Threads cursoring through the tree were blocked because that WT_REF
+ * state was set to locked. Changing the locked state to split unblocks
+ * those threads and causes them to re-calculate their position based
+ * on the just-updated parent page's index.
*/
- parent->pg_intl_deepen_split_append = 0;
- parent->pg_intl_deepen_split_last = alloc_index->entries;
+ if (discard)
+ WT_PUBLISH(ref->state, WT_REF_SPLIT);
/*
- * The moved reference structures now reference the wrong parent page,
- * and we have to fix that up. The problem is revealed when a thread
- * of control searches for a page's reference structure slot, and fails
- * to find it because the page it's searching no longer references it.
- * When that failure happens, the thread waits for the reference's home
- * page to be updated, which we do here: walk the children and fix them
- * up.
+ * Push out the changes: not required for correctness, but don't let
+ * threads spin on incorrect page references longer than necessary.
+ */
+ WT_FULL_BARRIER();
+
+ /*
+ * A note on error handling: failures before we swapped the new page
+ * index into the parent can be resolved by freeing allocated memory
+ * because the original page is unchanged, we can continue to use it
+ * and we have not yet modified the parent. Failures after we swap
+ * the new page index into the parent are also relatively benign, the
+ * split is OK and complete. For those reasons, we ignore errors past
+ * this point unless there's a panic.
+ */
+ complete = true;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32
+ " (%s%" PRIu32 ")",
+ ref->page, ref->page == NULL ?
+ "unknown page type" : __wt_page_type_string(ref->page->type),
+ ref->page == NULL ? "reverse " : "", parent,
+ parent_entries, result_entries,
+ ref->page == NULL ? "-" : "+",
+ ref->page == NULL ?
+ parent_entries - result_entries : result_entries - parent_entries));
+
+ /*
+ * The new page index is in place, free the WT_REF we were splitting and
+ * any deleted WT_REFs we found, modulo the usual safe free semantics.
*
- * We're not acquiring hazard pointers on these pages, they cannot be
- * evicted because of the eviction transaction value set above.
- */
- for (parent_refp = alloc_index->index,
- i = alloc_index->entries; i > 0; ++parent_refp, --i) {
- parent_ref = *parent_refp;
- WT_ASSERT(session, parent_ref->home == parent);
- if (parent_ref->state != WT_REF_MEM)
- continue;
+ * Acquire a new split generation.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
+ next_ref = pindex->index[deleted_refs[i]];
+ WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);
/*
- * We left the first/last children of the parent at the current
- * level to avoid bad split patterns, they might be leaf pages;
- * check the page type before we continue.
- */
- child = parent_ref->page;
- if (!WT_PAGE_IS_INTERNAL(child))
- continue;
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, child));
-#endif
- /*
- * We have the parent locked, but there's nothing to prevent
- * this child from splitting beneath us; ensure that reading
- * the child's page index structure is safe.
+ * We set the WT_REF to split, discard it, freeing any resources
+ * it holds.
+ *
+ * Row-store trees where the old version of the page is being
+ * discarded: the previous parent page's key for this child page
+ * may have been an on-page overflow key. In that case, if the
+ * key hasn't been deleted, delete it now, including its backing
+ * blocks. We are exchanging the WT_REF that referenced it for
+ * the split page WT_REFs and their keys, and there's no longer
+ * any reference to it. Done after completing the split (if we
+ * failed, we'd leak the underlying blocks, but the parent page
+ * would be unaffected).
*/
- WT_ENTER_PAGE_INDEX(session);
- WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ if (parent->type == WT_PAGE_ROW_INT) {
+ WT_TRET(__split_ovfl_key_cleanup(
+ session, parent, next_ref));
+ ikey = __wt_ref_key_instantiated(next_ref);
+ if (ikey != NULL) {
+ size = sizeof(WT_IKEY) + ikey->size;
+ WT_TRET(__split_safe_free(
+ session, split_gen, exclusive, ikey, size));
+ parent_decr += size;
+ }
/*
- * The page's parent reference may not be wrong, as we
- * opened up access from the top of the tree already,
- * pages may have been read in since then. Check and
- * only update pages that reference the original page,
- * they must be wrong.
+ * The page_del structure can be freed immediately: it
+ * is only read when the ref state is WT_REF_DELETED.
+ * The size of the structure wasn't added to the parent,
+ * don't decrement.
*/
- if (child_ref->home == parent) {
- child_ref->home = child;
- child_ref->pindex_hint = 0;
+ if (next_ref->page_del != NULL) {
+ __wt_free(session,
+ next_ref->page_del->update_list);
+ __wt_free(session, next_ref->page_del);
}
- } WT_INTL_FOREACH_END;
- WT_LEAVE_PAGE_INDEX(session);
+ }
+
+ WT_TRET(__split_safe_free(
+ session, split_gen, exclusive, next_ref, sizeof(WT_REF)));
+ parent_decr += sizeof(WT_REF);
}
+ /* We freed the reference that was split in the loop above. */
+ ref = NULL;
+
/*
- * Push out the changes: not required for correctness, but don't let
- * threads spin on incorrect page references longer than necessary.
+ * We can't free the previous page index, there may be threads using it.
+ * Add it to the session discard list, to be freed when it's safe.
*/
- WT_FULL_BARRIER();
- alloc_index = NULL;
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
+ parent_decr += size;
+ /* Adjust the parent's memory footprint and mark it dirty. */
+ __wt_cache_page_inmem_incr(session, parent, parent_incr);
+ __wt_cache_page_inmem_decr(session, parent, parent_decr);
+ __wt_page_modify_set(session, parent);
+
+err: __wt_scr_free(session, &scr);
/*
- * We can't free the previous parent's index, there may be threads using
- * it. Add to the session's discard list, to be freed once we know no
- * threads can still be using it.
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened, and our caller has to proceed
+ * with the split.
+ */
+ if (!complete) {
+ for (i = 0; i < parent_entries; ++i) {
+ next_ref = pindex->index[i];
+ if (next_ref->state == WT_REF_SPLIT)
+ next_ref->state = WT_REF_DELETED;
+ }
+
+ __wt_free_ref_index(session, NULL, alloc_index, false);
+
+ /*
+ * The split couldn't proceed because the parent would be empty,
+ * return EBUSY so our caller knows to unlock the WT_REF that's
+ * being deleted, but don't be noisy, there's nothing wrong.
+ */
+ if (empty_parent)
+ return (EBUSY);
+ }
+
+ if (ret != 0 && ret != WT_PANIC)
+ __wt_err(session, ret,
+ "ignoring not-fatal error during parent page split");
+ return (ret == WT_PANIC || !complete ? ret : 0);
+}
+
+/*
+ * __split_internal --
+ * Split an internal page into its parent.
+ *
+ * The first chunk of the page's WT_REFs stays on the page in a replacement
+ * page index; the remaining WT_REFs are moved onto newly allocated pages,
+ * whose WT_REFs are split into the parent alongside the original page's
+ * WT_REF. Returns EBUSY, before anything is allocated, if the page would
+ * yield fewer than 10 new pages and has fewer than 100 entries.
+ */
+static int
+__split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *child;
+ WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index;
+ WT_REF **alloc_refp;
+ WT_REF **child_refp, *page_ref, **page_refp, *ref;
+ size_t child_incr, page_decr, page_incr, parent_incr, size;
+ uint64_t split_gen;
+ uint32_t children, chunk, i, j, remain;
+ uint32_t slots;
+ bool complete;
+ void *p;
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
+
+ /* The page will be marked dirty, make sure that will succeed. */
+ WT_RET(__wt_page_modify_init(session, page));
+
+ btree = S2BT(session);
+ alloc_index = replace_index = NULL;
+ page_ref = page->pg_intl_parent_ref;
+ page_decr = page_incr = parent_incr = 0;
+ complete = false;
+
+ /*
+ * Our caller is holding the page locked to single-thread splits, which
+ * means we can safely look at the page's index without setting a split
+ * generation.
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
+
+ /*
+ * Decide how many child pages to create, then calculate the standard
+ * chunk and whatever remains. Sanity check the number of children:
+ * the decision to split matched to the deepen-per-child configuration
+ * might get it wrong.
+ *
+ * The original page keeps "chunk" entries, as does every new page
+ * except the last one, which gets "remain" entries (a standard chunk
+ * plus whatever is left over from the division).
+ */
+ children = pindex->entries / btree->split_deepen_per_child;
+ if (children < 10) {
+ if (pindex->entries < 100)
+ return (EBUSY);
+ children = 10;
+ }
+ chunk = pindex->entries / children;
+ remain = pindex->entries - chunk * (children - 1);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %" PRIu32 " internal page elements, splitting %" PRIu32
+ " children into parent %p",
+ page, pindex->entries, children, parent));
+
+ /*
+ * Ideally, we'd discard the original page, but that's hard since other
+ * threads of control are using it (for example, if eviction is walking
+ * the tree and looking at the page.) Instead, perform a right-split,
+ * moving all except the first chunk of the page's WT_REF objects to new
+ * pages.
*
- * This change requires care with error handling: we have already
- * updated the page with a new index. Even if stashing the old value
- * fails, we don't roll back that change, because threads may already
- * be using the new index.
+ * Create and initialize a replacement WT_PAGE_INDEX for the original
+ * page.
*/
- size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_TRET(__split_safe_free(session, split_gen, 0, pindex, size));
- parent_decr += size;
+ size = sizeof(WT_PAGE_INDEX) + chunk * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &replace_index));
+ page_incr += size;
+ replace_index->index = (WT_REF **)(replace_index + 1);
+ replace_index->entries = chunk;
+ for (page_refp = pindex->index, i = 0; i < chunk; ++i)
+ replace_index->index[i] = *page_refp++;
/*
- * Adjust the parent's memory footprint.
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted
+ * into the page's parent, replacing the page's page-index.
+ *
+ * The first slot of the new WT_PAGE_INDEX is the original page WT_REF.
+ * The remainder of the slots are allocated WT_REFs.
*/
- __wt_cache_page_inmem_incr(session, parent, parent_incr);
- __wt_cache_page_inmem_decr(session, parent, parent_decr);
+ size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ parent_incr += size;
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = children;
+ alloc_refp = alloc_index->index;
+ *alloc_refp++ = page_ref;
+ for (i = 1; i < children; ++alloc_refp, ++i)
+ WT_ERR(__wt_calloc_one(session, alloc_refp));
+ parent_incr += children * sizeof(WT_REF);
+
+ /* Allocate child pages, and connect them into the new page index. */
+ WT_ASSERT(session, page_refp == pindex->index + chunk);
+ for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
+ slots = i == children - 1 ? remain : chunk;
+ WT_ERR(__wt_page_alloc(
+ session, page->type, 0, slots, false, &child));
+
+ /*
+ * Initialize the page's child reference; we need a copy of the
+ * page's key.
+ */
+ ref = *alloc_refp++;
+ ref->home = parent;
+ ref->page = child;
+ ref->addr = NULL;
+ if (page->type == WT_PAGE_ROW_INT) {
+ __wt_ref_key(page, *page_refp, &p, &size);
+ WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
+ parent_incr += sizeof(WT_IKEY) + size;
+ } else
+ ref->key.recno = (*page_refp)->key.recno;
+ ref->state = WT_REF_MEM;
+
+ /* Initialize the child page. */
+ if (page->type == WT_PAGE_COL_INT)
+ child->pg_intl_recno = (*page_refp)->key.recno;
+ child->pg_intl_parent_ref = ref;
+
+ /* Mark it dirty. */
+ WT_ERR(__wt_page_modify_init(session, child));
+ __wt_page_modify_set(session, child);
+
+ /* Ensure the page isn't evicted or split for now. */
+ __split_child_block_evict_and_split(child);
+
+ /*
+ * The newly allocated child's page index references the same
+ * structures as the parent. (We cannot move WT_REF structures,
+ * threads may be underneath us right now changing the structure
+ * state.) However, if the WT_REF structures reference on-page
+ * information, we have to fix that, because the disk image for
+ * the page that has a page index entry for the WT_REF is about
+ * to be discarded.
+ */
+ child_pindex = WT_INTL_INDEX_GET_SAFE(child);
+ child_incr = 0;
+ for (child_refp = child_pindex->index,
+ j = 0; j < slots; ++child_refp, ++page_refp, ++j)
+ WT_ERR(__split_ref_move(session, page,
+ page_refp, &page_decr, child_refp, &child_incr));
+
+ __wt_cache_page_inmem_incr(session, child, child_incr);
+ }
+ WT_ASSERT(session, alloc_refp -
+ alloc_index->index == (ptrdiff_t)alloc_index->entries);
+ WT_ASSERT(session,
+ page_refp - pindex->index == (ptrdiff_t)pindex->entries);
+
+ /* Split into the parent. */
+ WT_ERR(__split_parent(session, page_ref, alloc_index->index,
+ alloc_index->entries, parent_incr, false, false));
+
+ /*
+ * A note on error handling: until this point, there's no problem with
+ * unwinding on error. We allocated a new page index, a new set of
+ * WT_REFs and a new set of child pages -- if an error occurred, the
+ * page remained unchanged, although it may have an incorrect memory
+ * footprint. From now on we've modified the parent page, attention
+ * needs to be paid. However, subsequent failures are relatively benign,
+ * the split is OK and complete. For that reason, we ignore errors past
+ * this point unless there's a panic.
+ */
+ complete = true;
+
+ /* Confirm the page's index hasn't moved, then update it. */
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
+ WT_INTL_INDEX_SET(page, replace_index);
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, parent));
+ WT_WITH_PAGE_INDEX(session,
+ __split_verify_intl_key_order(session, page));
+#endif
+
+ /* Fix up the moved WT_REF structures. */
+ WT_ERR(__split_ref_move_final(
+ session, alloc_index->index + 1, alloc_index->entries - 1));
+
+ /*
+ * We don't care about the page-index we allocated, all we needed was
+ * the array of WT_REF structures, which has now been split into the
+ * parent page.
+ */
+ __wt_free(session, alloc_index);
+
+ /*
+ * We can't free the previous page's index, there may be threads using
+ * it. Add to the session's discard list, to be freed once we know no
+ * threads can still be using it.
+ *
+ * This change requires care with error handling, we've already updated
+ * the parent page. Even if stashing the old value fails, we don't roll
+ * back that change, because threads may already be using the new parent
+ * page.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, split_gen, false, pindex, size));
+ page_decr += size;
+
+ /* Adjust the page's memory footprint, and mark it dirty. */
+ __wt_cache_page_inmem_incr(session, page, page_incr);
+ __wt_cache_page_inmem_decr(session, page, page_decr);
+ __wt_page_modify_set(session, page);
err: /*
* If complete is true, we saw an error after opening up the tree to
- * descent through the parent page's new index. There is nothing we
- * can do, there are threads potentially active in both versions of
- * the tree.
+ * descend through the page's new index. There is nothing we can do,
+ * there are threads potentially active in both versions of the tree.
*
* A note on error handling: if we completed the split, return success,
* nothing really bad can have happened, and our caller has to proceed
* with the split.
*/
- if (!complete)
- __wt_free_ref_index(session, parent, alloc_index, true);
+ if (!complete) {
+ __wt_free_ref_index(session, page, alloc_index, true);
+ __wt_free_ref_index(session, page, replace_index, false);
+ }
if (ret != 0 && ret != WT_PANIC)
__wt_err(session, ret,
- "ignoring not-fatal error during parent page split to "
- "deepen the tree");
+ "ignoring not-fatal error during internal page split");
return (ret == WT_PANIC || !complete ? ret : 0);
}
/*
+ * __split_internal_lock --
+ * Lock an internal page.
+ *
+ * On success, *parentp is set to ref->home with its page lock held, and
+ * *hazardp is true if the parent was also pinned through __wt_page_in
+ * (which is skipped when the parent is the root). Returns EBUSY, with
+ * nothing locked, if the tree is being checkpointed or the parent has
+ * WT_PAGE_SPLIT_BLOCK set. If pinning the parent fails, the page lock
+ * is released before the error is returned.
+ */
+static int
+__split_internal_lock(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ WT_REF *parent_ref;
+
+ *hazardp = false;
+ *parentp = NULL;
+
+ /*
+ * A checkpoint reconciling this parent page can deadlock with
+ * our split. We have an exclusive page lock on the child before
+ * we acquire the page's reconciliation lock, and reconciliation
+ * acquires the page's reconciliation lock before it encounters
+ * the child's exclusive lock (which causes reconciliation to
+ * loop until the exclusive lock is resolved). If we want to split
+ * the parent, give up to avoid that deadlock.
+ */
+ if (S2BT(session)->checkpointing != WT_CKPT_OFF)
+ return (EBUSY);
+
+ /*
+ * Get a page-level lock on the parent to single-thread splits into the
+ * page because we need to single-thread sizing/growing the page index.
+ * It's OK to queue up multiple splits as the child pages split, but the
+ * actual split into the parent has to be serialized. Note we allocate
+ * memory inside of the lock and may want to invest effort in making the
+ * locked period shorter.
+ *
+ * We use the reconciliation lock here because not only do we have to
+ * single-thread the split, we have to lock out reconciliation of the
+ * parent because reconciliation of the parent can't deal with finding
+ * a split child during internal page traversal. Basically, there's no
+ * reason to use a different lock if we have to block reconciliation
+ * anyway.
+ *
+ * The WT_REF's home is read before taking the lock and checked again
+ * once the lock is held: if it changed while we were waiting, release
+ * the lock and retry against the new home page.
+ */
+ for (;;) {
+ parent = ref->home;
+
+ /* Skip pages that aren't ready to split. */
+ if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK))
+ return (EBUSY);
+
+ WT_RET(__wt_fair_lock(session, &parent->page_lock));
+ if (parent == ref->home)
+ break;
+ WT_RET(__wt_fair_unlock(session, &parent->page_lock));
+ }
+
+ /*
+ * We have exclusive access to split the parent, and at this point, the
+ * child prevents the parent from being evicted. However, once we
+ * update the parent's index, it may no longer refer to the child, and
+ * could conceivably be evicted. Get a hazard pointer on the parent
+ * now, so that we can safely access it after updating the index.
+ *
+ * Take care getting the page doesn't trigger eviction work: we could
+ * block trying to split a different child of our parent and deadlock
+ * or we could be the eviction server relied upon by other threads to
+ * populate the eviction queue.
+ */
+ if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
+ WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
+ *hazardp = true;
+ }
+
+ *parentp = parent;
+ return (0);
+
+err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
+ return (ret);
+}
+
+/*
+ * __split_internal_unlock --
+ * Unlock the parent page.
+ *
+ * If hazard is set, the page's hazard pointer is cleared first. The
+ * page lock is released even if clearing the hazard pointer failed.
+ */
+static int
+__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
+{
+ WT_DECL_RET;
+
+ if (hazard)
+ ret = __wt_hazard_clear(session, parent);
+
+ WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
+ return (ret);
+}
+
+/*
+ * __split_internal_should_split --
+ * Return if we should split an internal page.
+ *
+ * The page examined is ref->page. Returns true only if its page index
+ * has at least 100 entries and either its memory footprint exceeds the
+ * tree's maxmempage or its entry count exceeds the tree's
+ * split_deepen_min_child.
+ */
+static bool
+__split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ /*
+ * Our caller is holding the page locked to single-thread splits (it's
+ * the page that was just split into, that is, the parent of that
+ * split), which means we can safely look at the page's index without
+ * setting a split generation.
+ */
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
+
+ /* Sanity check for a reasonable number of on-page keys. */
+ if (pindex->entries < 100)
+ return (false);
+
+ /*
+ * Deepen the tree if the page's memory footprint is larger than the
+ * maximum size for a page in memory (presumably putting eviction
+ * pressure on the cache).
+ */
+ if (page->memory_footprint > btree->maxmempage)
+ return (true);
+
+ /*
+ * Check if the page has enough keys to make it worth splitting. If
+ * the number of keys is allowed to grow too large, the cost of
+ * splitting into parent pages can become large enough to result
+ * in slow operations.
+ */
+ if (pindex->entries > btree->split_deepen_min_child)
+ return (true);
+
+ return (false);
+}
+
+/*
+ * __split_parent_climb --
+ * Check if we should split up the tree.
+ *
+ * The page is passed in with its page lock held (and, if page_hazard is
+ * set, a hazard pointer to clear); this function always releases it,
+ * either while climbing to the parent or on the way out. A busy page
+ * (EBUSY) ends the climb but isn't reported as an error.
+ */
+static int
+__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ WT_REF *ref;
+ bool parent_hazard;
+
+ /*
+ * Page splits trickle up the tree, that is, as leaf pages grow large
+ * enough and are evicted, they'll split into their parent. And, as
+ * that parent page grows large enough and is evicted, it splits into
+ * its parent and so on. When the page split wave reaches the root,
+ * the tree will permanently deepen as multiple root pages are written.
+ *
+ * However, this only helps if internal pages are evicted (and we resist
+ * evicting internal pages for obvious reasons), or if the tree were to
+ * be closed and re-opened from a disk image, which may be a rare event.
+ *
+ * To avoid internal pages becoming too large absent eviction, check
+ * parent pages each time pages are split into them. If the page is big
+ * enough, either split the page into its parent or, in the case of the
+ * root, deepen the tree.
+ *
+ * Split up the tree.
+ */
+ for (;;) {
+ parent = NULL;
+ parent_hazard = false;
+ ref = page->pg_intl_parent_ref;
+
+ /* If we don't need to split the page, we're done. */
+ if (!__split_internal_should_split(session, ref))
+ break;
+
+ /*
+ * If we've reached the root page, there are no subsequent pages
+ * to review, deepen the tree and quit.
+ */
+ if (__wt_ref_is_root(ref)) {
+ ret = __split_root(session, page);
+ break;
+ }
+
+ /*
+ * Lock the parent and split into it, then swap the parent/page
+ * locks, lock-coupling up the tree.
+ *
+ * The page is unlocked whether or not the split succeeded, and
+ * the split's return value is only acted on after the swap, so
+ * the exit path below unlocks the page we still hold (the
+ * former parent) and nothing else.
+ */
+ WT_ERR(__split_internal_lock(
+ session, ref, &parent, &parent_hazard));
+ ret = __split_internal(session, parent, page);
+ WT_TRET(__split_internal_unlock(session, page, page_hazard));
+
+ page = parent;
+ page_hazard = parent_hazard;
+ parent = NULL;
+ parent_hazard = false;
+ WT_ERR(ret);
+ }
+
+err: if (parent != NULL)
+ WT_TRET(
+ __split_internal_unlock(session, parent, parent_hazard));
+ WT_TRET(__split_internal_unlock(session, page, page_hazard));
+
+ /* A page may have been busy, in which case return without error. */
+ WT_RET_BUSY_OK(ret);
+ return (0);
+}
+
+/*
* __split_multi_inmem --
* Instantiate a page in a multi-block set.
*/
@@ -901,369 +1600,6 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
}
/*
- * __split_parent_lock --
- * Lock the parent page.
- */
-static int
-__split_parent_lock(
- WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp)
-{
- WT_DECL_RET;
- WT_PAGE *parent;
- WT_REF *parent_ref;
-
- *hazardp = false;
- *parentp = NULL;
-
- /*
- * A checkpoint reconciling this parent page can deadlock with
- * our split. We have an exclusive page lock on the child before
- * we acquire the page's reconciliation lock, and reconciliation
- * acquires the page's reconciliation lock before it encounters
- * the child's exclusive lock (which causes reconciliation to
- * loop until the exclusive lock is resolved). If we want to split
- * the parent, give up to avoid that deadlock.
- */
- if (S2BT(session)->checkpointing != WT_CKPT_OFF)
- return (EBUSY);
-
- /*
- * Get a page-level lock on the parent to single-thread splits into the
- * page because we need to single-thread sizing/growing the page index.
- * It's OK to queue up multiple splits as the child pages split, but the
- * actual split into the parent has to be serialized. Note we allocate
- * memory inside of the lock and may want to invest effort in making the
- * locked period shorter.
- *
- * We use the reconciliation lock here because not only do we have to
- * single-thread the split, we have to lock out reconciliation of the
- * parent because reconciliation of the parent can't deal with finding
- * a split child during internal page traversal. Basically, there's no
- * reason to use a different lock if we have to block reconciliation
- * anyway.
- */
- for (;;) {
- parent = ref->home;
- WT_RET(__wt_fair_lock(session, &parent->page_lock));
- if (parent == ref->home)
- break;
- /* Try again if the page deepened while we were waiting */
- WT_RET(__wt_fair_unlock(session, &parent->page_lock));
- }
-
- /*
- * We have exclusive access to split the parent, and at this point, the
- * child prevents the parent from being evicted. However, once we
- * update the parent's index, it will no longer refer to the child, and
- * could conceivably be evicted. Get a hazard pointer on the parent
- * now, so that we can safely access it after updating the index.
- *
- * Take care getting the page doesn't trigger eviction work: we could
- * block trying to split a different child of our parent and deadlock
- * or we could be the eviction server relied upon by other threads to
- * populate the eviction queue.
- */
- if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
- WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
- *hazardp = true;
- }
-
- *parentp = parent;
- return (0);
-
-err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
- return (ret);
-}
-
-/*
- * __split_parent_unlock --
- * Unlock the parent page.
- */
-static int
-__split_parent_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
-{
- WT_DECL_RET;
-
- if (hazard)
- ret = __wt_hazard_clear(session, parent);
-
- WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
- return (ret);
-}
-
-/*
- * __split_parent --
- * Resolve a multi-page split, inserting new information into the parent.
- */
-static int
-__split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
- WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive)
-{
- WT_DECL_RET;
- WT_IKEY *ikey;
- WT_PAGE *parent;
- WT_PAGE_INDEX *alloc_index, *pindex;
- WT_REF **alloc_refp, *next_ref, *parent_ref;
- size_t parent_decr, size;
- uint64_t split_gen;
- uint32_t i, j;
- uint32_t deleted_entries, parent_entries, result_entries;
- bool complete;
-
- parent = ref->home;
- parent_ref = parent->pg_intl_parent_ref;
-
- alloc_index = pindex = NULL;
- parent_decr = 0;
- parent_entries = 0;
- complete = false;
-
- /*
- * We've locked the parent, which means it cannot split (which is the
- * only reason to worry about split generation values).
- */
- pindex = WT_INTL_INDEX_GET_SAFE(parent);
- parent_entries = pindex->entries;
-
- /*
- * Remove any refs to deleted pages while we are splitting, we have
- * the internal page locked down, and are copying the refs into a new
- * array anyway. Switch them to the special split state, so that any
- * reading thread will restart. Include the ref we are splitting in
- * the count to be deleted.
- */
- for (deleted_entries = 1, i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
- if (next_ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, next_ref, true) &&
- __wt_atomic_casv32(
- &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))
- deleted_entries++;
- }
-
- /*
- * The final entry count consists of the original count, plus any new
- * pages, less any WT_REFs we're removing.
- */
- result_entries = (parent_entries + new_entries) - deleted_entries;
-
- /*
- * If the entire (sub)tree is empty, give up: we can't leave an empty
- * internal page. Mark it to be evicted soon and clean up any
- * references that have changed state.
- */
- if (result_entries == 0) {
- __wt_page_evict_soon(parent);
- goto err;
- }
-
- /*
- * Allocate and initialize a new page index array for the parent, then
- * copy references from the original index array, plus references from
- * the newly created split array, into place.
- */
- size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
- WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
- parent_incr += size;
- alloc_index->index = (WT_REF **)(alloc_index + 1);
- alloc_index->entries = result_entries;
- for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- if (next_ref == ref) {
- for (j = 0; j < new_entries; ++j) {
- ref_new[j]->home = parent;
- *alloc_refp++ = ref_new[j];
-
- /*
- * Clear the split reference as it moves to the
- * allocated page index, so it never appears on
- * both after an error.
- */
- ref_new[j] = NULL;
- }
-
- /*
- * We detect append-style workloads to avoid repeatedly
- * deepening parts of the tree where no work is being
- * done by tracking if we're splitting after the slots
- * created by the last split to deepen this parent.
- *
- * Note the calculation: i is a 0-based array offset and
- * split-last is a count of entries, also either or both
- * i and split-last might be unsigned 0, don't decrement
- * either one.
- */
- if (i > parent->pg_intl_deepen_split_last)
- parent->
- pg_intl_deepen_split_append += new_entries;
- } else if (next_ref->state != WT_REF_SPLIT)
- /* Skip refs we have marked for deletion. */
- *alloc_refp++ = next_ref;
- }
-
- /* Check that we filled in all the entries. */
- WT_ASSERT(session,
- alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
-
- /*
- * Confirm the parent page's index hasn't moved then update it, which
- * makes the split visible to threads descending the tree.
- */
- WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
- WT_INTL_INDEX_SET(parent, alloc_index);
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- alloc_index = NULL;
-
-#ifdef HAVE_DIAGNOSTIC
- WT_WITH_PAGE_INDEX(session,
- __split_verify_intl_key_order(session, parent));
-#endif
-
- /*
- * Reset the page's original WT_REF field to split. Threads cursoring
- * through the tree were blocked because that WT_REF state was set to
- * locked. This update changes the locked state to split, unblocking
- * those threads and causing them to re-calculate their position based
- * on the updated parent page's index.
- */
- WT_PUBLISH(ref->state, WT_REF_SPLIT);
-
- /*
- * A note on error handling: failures before we swapped the new page
- * index into the parent can be resolved by freeing allocated memory
- * because the original page is unchanged, we can continue to use it
- * and we have not yet modified the parent. Failures after we swap
- * the new page index into the parent are also relatively benign, the
- * split is OK and complete. For those reasons, we ignore errors past
- * this point unless there's a panic.
- */
- complete = true;
-
- WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
- "%s split into parent %" PRIu32 " -> %" PRIu32
- " (%" PRIu32 ")", ref->page == NULL ?
- "reverse" : __wt_page_type_string(ref->page->type),
- parent_entries, result_entries, result_entries - parent_entries));
-
- /*
- * The new page index is in place, free the WT_REF we were splitting
- * and any deleted WT_REFs we found, modulo the usual safe free
- * semantics.
- */
- for (i = 0; deleted_entries > 0 && i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- if (next_ref->state != WT_REF_SPLIT)
- continue;
- --deleted_entries;
-
- /*
- * We set the WT_REF to split, discard it, freeing any resources
- * it holds.
- *
- * Row-store trees where the old version of the page is being
- * discarded: the previous parent page's key for this child page
- * may have been an on-page overflow key. In that case, if the
- * key hasn't been deleted, delete it now, including its backing
- * blocks. We are exchanging the WT_REF that referenced it for
- * the split page WT_REFs and their keys, and there's no longer
- * any reference to it. Done after completing the split (if we
- * failed, we'd leak the underlying blocks, but the parent page
- * would be unaffected).
- */
- if (parent->type == WT_PAGE_ROW_INT) {
- WT_TRET(__split_ovfl_key_cleanup(
- session, parent, next_ref));
- ikey = __wt_ref_key_instantiated(next_ref);
- if (ikey != NULL) {
- size = sizeof(WT_IKEY) + ikey->size;
- WT_TRET(__split_safe_free(
- session, split_gen, 0, ikey, size));
- parent_decr += size;
- }
- /*
- * The page_del structure can be freed immediately: it
- * is only read when the ref state is WT_REF_DELETED.
- * The size of the structure wasn't added to the parent,
- * don't decrement.
- */
- if (next_ref->page_del != NULL) {
- __wt_free(session,
- next_ref->page_del->update_list);
- __wt_free(session, next_ref->page_del);
- }
- }
-
- WT_TRET(__split_safe_free(
- session, split_gen, 0, next_ref, sizeof(WT_REF)));
- parent_decr += sizeof(WT_REF);
- }
-
- /* We freed the reference that was split in the loop above. */
- ref = NULL;
-
- /*
- * We can't free the previous page index, there may be threads using it.
- * Add it to the session discard list, to be freed when it's safe.
- */
- size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
- parent_decr += size;
-
- /*
- * Adjust the parent's memory footprint.
- */
- __wt_cache_page_inmem_incr(session, parent, parent_incr);
- __wt_cache_page_inmem_decr(session, parent, parent_decr);
-
- /*
- * Simple page splits trickle up the tree, that is, as leaf pages grow
- * large enough and are evicted, they'll split into their parent. And,
- * as that parent grows large enough and is evicted, it will split into
- * its parent and so on. When the page split wave reaches the root,
- * the tree will permanently deepen as multiple root pages are written.
- * However, this only helps if first, the pages are evicted (and
- * we resist evicting internal pages for obvious reasons), and second,
- * if the tree is closed and re-opened from a disk image, which may be
- * a rare event.
- * To avoid the case of internal pages becoming too large when they
- * aren't being evicted, check internal pages each time a leaf page is
- * split into them. If it's big enough, deepen the tree at that point.
- * Do the check here because we've just grown the parent page and
- * are holding it locked.
- */
- if (ret == 0 && !exclusive &&
- __split_should_deepen(session, parent_ref))
- ret = __split_deepen(session, parent);
-
-err: /*
- * A note on error handling: if we completed the split, return success,
- * nothing really bad can have happened, and our caller has to proceed
- * with the split.
- */
- if (!complete) {
- for (i = 0; i < parent_entries; ++i) {
- next_ref = pindex->index[i];
- if (next_ref->state == WT_REF_SPLIT)
- next_ref->state = WT_REF_DELETED;
- }
-
- /* If we gave up on a reverse split, unlock the child. */
- if (ref_new == NULL) {
- WT_ASSERT(session, ref->state == WT_REF_LOCKED);
- ref->state = WT_REF_DELETED;
- }
-
- __wt_free_ref_index(session, NULL, alloc_index, false);
- }
-
- if (ret != 0 && ret != WT_PANIC)
- __wt_err(session, ret,
- "ignoring not-fatal error during parent page split");
- return (ret == WT_PANIC || !complete ? ret : 0);
-}
-
-/*
* __split_insert --
* Split a page's last insert list entries into a separate page.
*/
@@ -1279,6 +1615,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
size_t page_decr, parent_incr, right_incr;
int i;
+ WT_STAT_FAST_CONN_INCR(session, cache_inmem_split);
+ WT_STAT_FAST_DATA_INCR(session, cache_inmem_split);
+
page = ref->page;
right = NULL;
page_decr = parent_incr = right_incr = 0;
@@ -1491,7 +1830,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*/
page = NULL;
if ((ret = __split_parent(
- session, ref, split_ref, 2, parent_incr, false)) != 0) {
+ session, ref, split_ref, 2, parent_incr, false, true)) != 0) {
/*
* Move the insert list element back to the original page list.
* For simplicity, the previous skip list pointers originally
@@ -1513,9 +1852,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_ERR(ret);
}
- WT_STAT_FAST_CONN_INCR(session, cache_inmem_split);
- WT_STAT_FAST_DATA_INCR(session, cache_inmem_split);
-
return (0);
err: if (split_ref[0] != NULL) {
@@ -1543,83 +1879,21 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_PAGE *parent;
bool hazard;
- WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
- ret = __split_insert(session, ref);
- WT_TRET(__split_parent_unlock(session, parent, hazard));
- return (ret);
-}
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: split-insert", ref->page));
-/*
- * __wt_split_reverse --
- * We have a locked ref that is empty and we want to rewrite the index in
- * its parent.
- */
-int
-__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- WT_DECL_RET;
- WT_PAGE *parent;
- bool hazard;
-
- WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
- ret = __split_parent(session, ref, NULL, 0, 0, 0);
- WT_TRET(__split_parent_unlock(session, parent, hazard));
- return (ret);
-}
-
-/*
- * __wt_split_rewrite --
- * Rewrite an in-memory page with a new version.
- */
-int
-__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- WT_DECL_RET;
- WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
- WT_REF new;
-
- page = ref->page;
- mod = page->modify;
-
- /*
- * This isn't a split: a reconciliation failed because we couldn't write
- * something, and in the case of forced eviction, we need to stop this
- * page from being such a problem. We have exclusive access, rewrite the
- * page in memory. The code lives here because the split code knows how
- * to re-create a page in memory after it's been reconciled, and that's
- * exactly what we want to do.
- *
- * Build the new page.
- */
- memset(&new, 0, sizeof(new));
- WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0]));
-
- /*
- * The rewrite succeeded, we can no longer fail.
- *
- * Finalize the move, discarding moved update lists from the original
- * page.
- */
- __split_multi_inmem_final(page, &mod->mod_multi[0]);
+ WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ if ((ret = __split_insert(session, ref)) != 0) {
+ WT_TRET(__split_internal_unlock(session, parent, hazard));
+ return (ret);
+ }
/*
- * Discard the original page.
- *
- * Pages with unresolved changes are not marked clean during
- * reconciliation, do it now.
+ * Split up through the tree as necessary; we're holding the original
+ * parent page locked, note the functions we call are responsible for
+ * releasing that lock.
*/
- __wt_page_modify_clear(session, page);
- __wt_ref_out(session, ref);
-
- /* Swap the new page into place. */
- ref->page = new.page;
- WT_PUBLISH(ref->state, WT_REF_MEM);
-
- return (0);
-
-err: __split_multi_inmem_fail(session, &new);
- return (ret);
+ return (__split_parent_climb(session, parent, hazard));
}
/*
@@ -1636,6 +1910,9 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
size_t parent_incr;
uint32_t i, new_entries;
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_leaf);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_leaf);
+
page = ref->page;
mod = page->modify;
new_entries = mod->mod_multi_entries;
@@ -1656,10 +1933,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* exclusively.
*/
WT_ERR(__split_parent(
- session, ref, ref_new, new_entries, parent_incr, closing));
-
- WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
- WT_STAT_FAST_DATA_INCR(session, cache_eviction_split);
+ session, ref, ref_new, new_entries, parent_incr, closing, true));
/*
* The split succeeded, we can no longer fail.
@@ -1697,8 +1971,98 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
WT_PAGE *parent;
bool hazard;
- WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
- ret = __split_multi(session, ref, closing);
- WT_TRET(__split_parent_unlock(session, parent, hazard));
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: split-multi", ref->page));
+
+ WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
+ WT_TRET(__split_internal_unlock(session, parent, hazard));
+ return (ret);
+ }
+
+ /*
+ * Split up through the tree as necessary; we're holding the original
+ * parent page locked, note the functions we call are responsible for
+ * releasing that lock.
+ */
+ return (__split_parent_climb(session, parent, hazard));
+}
+
+/*
+ * __wt_split_reverse --
+ * We have a locked ref that is empty and we want to rewrite the index in
+ * its parent.
+ */
+int
+__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ bool hazard;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: reverse-split", ref->page));
+
+ WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
+ ret = __split_parent(session, ref, NULL, 0, 0, false, true);
+ WT_TRET(__split_internal_unlock(session, parent, hazard));
+ return (ret);
+}
+
+/*
+ * __wt_split_rewrite --
+ * Rewrite an in-memory page with a new version.
+ */
+int
+__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF new;
+
+ page = ref->page;
+ mod = page->modify;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_SPLIT, "%p: split-rewrite", ref->page));
+
+ /*
+ * This isn't a split: a reconciliation failed because we couldn't write
+ * something, and in the case of forced eviction, we need to stop this
+ * page from being such a problem. We have exclusive access, rewrite the
+ * page in memory. The code lives here because the split code knows how
+ * to re-create a page in memory after it's been reconciled, and that's
+ * exactly what we want to do.
+ *
+ * Build the new page.
+ */
+ memset(&new, 0, sizeof(new));
+ WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0]));
+
+ /*
+ * The rewrite succeeded, we can no longer fail.
+ *
+ * Finalize the move, discarding moved update lists from the original
+ * page.
+ */
+ __split_multi_inmem_final(page, &mod->mod_multi[0]);
+
+ /*
+ * Discard the original page.
+ *
+ * Pages with unresolved changes are not marked clean during
+ * reconciliation, do it now.
+ */
+ __wt_page_modify_clear(session, page);
+ __wt_ref_out(session, ref);
+
+ /* Swap the new page into place. */
+ ref->page = new.page;
+ WT_PUBLISH(ref->state, WT_REF_MEM);
+
+ return (0);
+
+err: __split_multi_inmem_fail(session, &new);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index 7395cce11e1..07bb2eb3a01 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -191,7 +191,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
syncop == WT_SYNC_WRITE_LEAVES ?
"WRITE_LEAVES" : "CHECKPOINT",
leaf_bytes, leaf_pages, internal_bytes, internal_pages,
- WT_TIMEDIFF(end, start) / WT_MILLION));
+ WT_TIMEDIFF_MS(end, start)));
}
err: /* On error, clear any left-over tree walk. */
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index d02f23ed164..e9fa570f97b 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -22,7 +22,7 @@ __wt_col_search(WT_SESSION_IMPL *session,
WT_INSERT *ins;
WT_INSERT_HEAD *ins_head;
WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
+ WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
uint32_t base, indx, limit;
int depth;
@@ -37,10 +37,12 @@ __wt_col_search(WT_SESSION_IMPL *session,
goto leaf_only;
}
+restart_root:
/* Search the internal pages of the tree. */
current = &btree->root;
- for (depth = 2;; ++depth) {
-restart: page = current->page;
+ for (depth = 2, pindex = NULL;; ++depth) {
+ parent_pindex = pindex;
+restart_page: page = current->page;
if (page->type != WT_PAGE_COL_INT)
break;
@@ -51,8 +53,19 @@ restart: page = current->page;
descent = pindex->index[base - 1];
/* Fast path appends. */
- if (recno >= descent->key.recno)
+ if (recno >= descent->key.recno) {
+ /*
+ * If on the last slot (the key is larger than any key
+ * on the page), check for an internal page split race.
+ */
+ if (parent_pindex != NULL &&
+ __wt_split_intl_race(
+ session, current->home, parent_pindex)) {
+ WT_RET(__wt_page_release(session, current, 0));
+ goto restart_root;
+ }
goto descend;
+ }
/* Binary search of internal pages. */
for (base = 0,
@@ -90,15 +103,13 @@ descend: /*
* page; otherwise return on error, the swap call ensures we're
* holding nothing on failure.
*/
- switch (ret = __wt_page_swap(session, current, descent, 0)) {
- case 0:
+ if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
current = descent;
- break;
- case WT_RESTART:
- goto restart;
- default:
- return (ret);
+ continue;
}
+ if (ret == WT_RESTART)
+ goto restart_page;
+ return (ret);
}
/* Track how deep the tree gets. */
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 7b21f1e40bb..d2d8a4640ca 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -144,7 +144,7 @@ __wt_row_search(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_ITEM *item;
WT_PAGE *page;
- WT_PAGE_INDEX *pindex;
+ WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
WT_ROW *rip;
size_t match, skiphigh, skiplow;
@@ -155,16 +155,16 @@ __wt_row_search(WT_SESSION_IMPL *session,
btree = S2BT(session);
collator = btree->collator;
item = cbt->tmp;
+ current = NULL;
__cursor_pos_clear(cbt);
/*
- * The row-store search routine uses a different comparison API.
- * The assumption is we're comparing more than a few keys with
- * matching prefixes, and it's a win to avoid the memory fetches
- * by skipping over those prefixes. That's done by tracking the
- * length of the prefix match for the lowest and highest keys we
- * compare as we descend the tree.
+ * In some cases we expect we're comparing more than a few keys with
+ * matching prefixes, so it's faster to avoid the memory fetches by
+ * skipping over those prefixes. That's done by tracking the length of
+ * the prefix match for the lowest and highest keys we compare as we
+ * descend the tree.
*/
skiphigh = skiplow = 0;
@@ -186,10 +186,11 @@ __wt_row_search(WT_SESSION_IMPL *session,
}
/* Search the internal pages of the tree. */
- cmp = -1;
+restart_root:
current = &btree->root;
- for (depth = 2;; ++depth) {
-restart: page = current->page;
+ for (depth = 2, pindex = NULL;; ++depth) {
+ parent_pindex = pindex;
+restart_page: page = current->page;
if (page->type != WT_PAGE_ROW_INT)
break;
@@ -211,7 +212,7 @@ restart: page = current->page;
WT_ERR(__wt_compare(
session, collator, srch_key, item, &cmp));
if (cmp >= 0)
- goto descend;
+ goto append;
/* A failed append check turns off append checks. */
append_check = false;
@@ -252,7 +253,26 @@ restart: page = current->page;
} else if (cmp == 0)
goto descend;
}
- else if (collator == NULL)
+ else if (collator == NULL) {
+ /*
+ * Reset the skipped prefix counts; we'd normally expect
+ * the parent's skipped prefix values to be larger than
+ * the child's values and so we'd only increase them as
+ * we walk down the tree (in other words, if we can skip
+ * N bytes on the parent, we can skip at least N bytes
+ * on the child). However, if a child internal page was
+ * split up into the parent, the child page's key space
+ * will have been truncated, and the values from the
+ * parent's search may be wrong for the child. We only
+ * need to reset the high count because the split-page
+ * algorithm truncates the end of the internal page's
+ * key space, the low count is still correct. We also
+ * don't need to clear either count when transitioning
+ * to a leaf page, a leaf page's key space can't change
+ * in flight.
+ */
+ skiphigh = 0;
+
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
descent = pindex->index[indx];
@@ -271,7 +291,7 @@ restart: page = current->page;
else
goto descend;
}
- else
+ } else
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
descent = pindex->index[indx];
@@ -288,9 +308,10 @@ restart: page = current->page;
}
/*
- * Set the slot to descend the tree: descent is already set if
- * there was an exact match on the page, otherwise, base is
- * the smallest index greater than key, possibly (last + 1).
+ * Set the slot to descend the tree: descent was already set if
+ * there was an exact match on the page, otherwise, base is the
+ * smallest index greater than key, possibly one past the last
+ * slot.
*/
descent = pindex->index[base - 1];
@@ -298,25 +319,41 @@ restart: page = current->page;
* If we end up somewhere other than the last slot, it's not a
* right-side descent.
*/
- if (pindex->entries != base - 1)
+ if (pindex->entries != base)
descend_right = false;
+ /*
+ * If on the last slot (the key is larger than any key on the
+ * page), check for an internal page split race.
+ */
+ if (pindex->entries == base) {
+append: if (parent_pindex != NULL &&
+ __wt_split_intl_race(
+ session, current->home, parent_pindex)) {
+ if ((ret = __wt_page_release(
+ session, current, 0)) != 0)
+ return (ret);
+
+ skiplow = skiphigh = 0;
+ goto restart_root;
+ }
+ }
+
descend: /*
* Swap the current page for the child page. If the page splits
* while we're retrieving it, restart the search in the current
* page; otherwise return on error, the swap call ensures we're
* holding nothing on failure.
*/
- switch (ret = __wt_page_swap(session, current, descent, 0)) {
- case 0:
+ if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
current = descent;
- break;
- case WT_RESTART:
+ continue;
+ }
+ if (ret == WT_RESTART) {
skiphigh = skiplow = 0;
- goto restart;
- default:
- return (ret);
+ goto restart_page;
}
+ return (ret);
}
/* Track how deep the tree gets. */
@@ -517,7 +554,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
__cursor_pos_clear(cbt);
-restart:
+restart_root:
/* Walk the internal pages of the tree. */
current = &btree->root;
for (;;) {
@@ -544,7 +581,7 @@ restart:
*/
if (ret == WT_RESTART &&
(ret = __wt_page_release(session, current, 0)) == 0)
- goto restart;
+ goto restart_root;
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index 311ddd56b7a..d79ce6853e6 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -295,6 +295,19 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_drop[] = {
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
+static const WT_CONFIG_CHECK confchk_WT_SESSION_join[] = {
+ { "bloom_bit_count", "int", NULL, "min=2,max=1000", NULL, 0 },
+ { "bloom_hash_count", "int", NULL, "min=2,max=100", NULL, 0 },
+ { "compare", "string",
+ NULL, "choices=[\"eq\",\"ge\",\"gt\",\"le\",\"lt\"]",
+ NULL, 0 },
+ { "count", "int", NULL, NULL, NULL, 0 },
+ { "strategy", "string",
+ NULL, "choices=[\"bloom\",\"default\"]",
+ NULL, 0 },
+ { NULL, NULL, NULL, NULL, NULL, 0 }
+};
+
static const WT_CONFIG_CHECK confchk_WT_SESSION_log_flush[] = {
{ "sync", "string",
NULL, "choices=[\"background\",\"off\",\"on\"]",
@@ -543,6 +556,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "transaction_sync", "category",
NULL, NULL,
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
+ { "use_environment", "boolean", NULL, NULL, NULL, 0 },
{ "use_environment_priv", "boolean", NULL, NULL, NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
@@ -622,6 +636,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "transaction_sync", "category",
NULL, NULL,
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
+ { "use_environment", "boolean", NULL, NULL, NULL, 0 },
{ "use_environment_priv", "boolean", NULL, NULL, NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
@@ -891,6 +906,11 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"force=0,remove_files=",
confchk_WT_SESSION_drop, 2
},
+ { "WT_SESSION.join",
+ "bloom_bit_count=16,bloom_hash_count=8,compare=\"eq\",count=,"
+ "strategy=",
+ confchk_WT_SESSION_join, 5
+ },
{ "WT_SESSION.log_flush",
"sync=on",
confchk_WT_SESSION_log_flush, 1
@@ -995,9 +1015,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
",name=,quota=0,reserve=0,size=500MB),statistics=none,"
"statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\","
"sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
- "transaction_sync=(enabled=0,method=fsync),use_environment_priv=0"
- ",verbose=,write_through=",
- confchk_wiredtiger_open, 36
+ "transaction_sync=(enabled=0,method=fsync),use_environment=,"
+ "use_environment_priv=0,verbose=,write_through=",
+ confchk_wiredtiger_open, 37
},
{ "wiredtiger_open_all",
"async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
@@ -1016,9 +1036,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {
",name=,quota=0,reserve=0,size=500MB),statistics=none,"
"statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\","
"sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
- "transaction_sync=(enabled=0,method=fsync),use_environment_priv=0"
- ",verbose=,version=(major=0,minor=0),write_through=",
- confchk_wiredtiger_open_all, 37
+ "transaction_sync=(enabled=0,method=fsync),use_environment=,"
+ "use_environment_priv=0,verbose=,version=(major=0,minor=0),"
+ "write_through=",
+ confchk_wiredtiger_open_all, 38
},
{ "wiredtiger_open_basecfg",
"async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index c65b74e4e4e..bd14e1bf4fd 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1286,6 +1286,11 @@ __conn_config_env(WT_SESSION_IMPL *session, const char *cfg[], WT_ITEM *cbuf)
const char *env_config;
size_t len;
+ /* Only use the environment variable if configured. */
+ WT_RET(__wt_config_gets(session, cfg, "use_environment", &cval));
+ if (cval.val == 0)
+ return (0);
+
ret = __wt_getenv(session, "WIREDTIGER_CONFIG", &env_config);
if (ret == WT_NOTFOUND)
return (0);
@@ -1333,15 +1338,16 @@ err: __wt_free(session, env_config);
static int
__conn_home(WT_SESSION_IMPL *session, const char *home, const char *cfg[])
{
- WT_DECL_RET;
WT_CONFIG_ITEM cval;
/* If the application specifies a home directory, use it. */
if (home != NULL)
goto copy;
- ret = __wt_getenv(session, "WIREDTIGER_HOME", &S2C(session)->home);
- if (ret == 0)
+ /* Only use the environment variable if configured. */
+ WT_RET(__wt_config_gets(session, cfg, "use_environment", &cval));
+ if (cval.val != 0 &&
+ __wt_getenv(session, "WIREDTIGER_HOME", &S2C(session)->home) == 0)
return (0);
/* If there's no WIREDTIGER_HOME environment variable, use ".". */
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
index aa14e9aadde..8d16f94c092 100644
--- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
+++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
@@ -734,7 +734,7 @@ __wt_cache_pool_server(void *arg)
F_ISSET(cache, WT_CACHE_POOL_RUN)) {
if (cp->currently_used <= cp->size)
WT_ERR(__wt_cond_wait(session,
- cp->cache_pool_cond, 1000000));
+ cp->cache_pool_cond, WT_MILLION));
/*
* Re-check pool run flag - since we want to avoid getting the
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
index 8f039e61654..b47e2550b23 100644
--- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -31,7 +31,7 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp)
* Checkpoints based on log size also require logging be enabled.
*/
WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval));
- conn->ckpt_usecs = (uint64_t)cval.val * 1000000;
+ conn->ckpt_usecs = (uint64_t)cval.val * WT_MILLION;
WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval));
conn->ckpt_logsize = (wt_off_t)cval.val;
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index 527b756ee1a..1d44d816467 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -23,17 +23,19 @@ __logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg)
WT_RET(
__wt_config_gets(session, cfg, "transaction_sync.enabled", &cval));
if (cval.val)
- FLD_SET(conn->txn_logsync, WT_LOG_FLUSH);
+ FLD_SET(conn->txn_logsync, WT_LOG_SYNC_ENABLED);
else
- FLD_CLR(conn->txn_logsync, WT_LOG_FLUSH);
+ FLD_CLR(conn->txn_logsync, WT_LOG_SYNC_ENABLED);
WT_RET(
__wt_config_gets(session, cfg, "transaction_sync.method", &cval));
- FLD_CLR(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FSYNC);
+ FLD_CLR(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FLUSH | WT_LOG_FSYNC);
if (WT_STRING_MATCH("dsync", cval.str, cval.len))
- FLD_SET(conn->txn_logsync, WT_LOG_DSYNC);
+ FLD_SET(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FLUSH);
else if (WT_STRING_MATCH("fsync", cval.str, cval.len))
FLD_SET(conn->txn_logsync, WT_LOG_FSYNC);
+ else if (WT_STRING_MATCH("none", cval.str, cval.len))
+ FLD_SET(conn->txn_logsync, WT_LOG_FLUSH);
return (0);
}
@@ -536,8 +538,8 @@ restart:
while (i < WT_SLOT_POOL) {
save_i = i;
slot = &log->slot_pool[i++];
- WT_ASSERT(session, slot->slot_state != 0 ||
- slot->slot_release_lsn.file >= log->write_lsn.file);
+ WT_ASSERT(session, slot->slot_state != 0 ||
+ slot->slot_release_lsn.file >= log->write_lsn.file);
if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
continue;
written[written_i].slot_index = save_i;
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
index ec3a630581a..31438e10606 100644
--- a/src/third_party/wiredtiger/src/conn/conn_stat.c
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -83,7 +83,7 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp)
WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval));
/* Only start the server if wait time is non-zero */
*runp = cval.val != 0;
- conn->stat_usecs = (uint64_t)cval.val * 1000000;
+ conn->stat_usecs = (uint64_t)cval.val * WT_MILLION;
WT_RET(__wt_config_gets(
session, cfg, "statistics_log.on_close", &cval));
@@ -154,7 +154,7 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats)
WT_DECL_RET;
int64_t *stats;
int i;
- const char *uri;
+ const char *desc, *uri;
const char *cfg[] = {
WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
@@ -175,16 +175,19 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats)
* If we don't find an underlying object, silently ignore it, the object
* may exist only intermittently.
*/
- switch (ret = __wt_curstat_open(session, uri, cfg, &cursor)) {
+ switch (ret = __wt_curstat_open(session, uri, NULL, cfg, &cursor)) {
case 0:
cst = (WT_CURSOR_STAT *)cursor;
- for (stats = cst->stats, i = 0; i < cst->stats_count; ++i)
+ for (stats = cst->stats, i = 0; i < cst->stats_count; ++i) {
+ if (conn_stats)
+ WT_ERR(__wt_stat_connection_desc(cst, i,
+ &desc));
+ else
+ WT_ERR(__wt_stat_dsrc_desc(cst, i, &desc));
WT_ERR(__wt_fprintf(conn->stat_fp,
"%s %" PRId64 " %s %s\n",
- conn->stat_stamp, stats[i],
- name, conn_stats ?
- __wt_stat_connection_desc(i) :
- __wt_stat_dsrc_desc(i)));
+ conn->stat_stamp, stats[i], name, desc));
+ }
WT_ERR(cursor->close(cursor));
break;
case EBUSY:
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index a8620ebaa99..b9b46f3211c 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -136,7 +136,8 @@ __sweep_expire(WT_SESSION_IMPL *session, time_t now)
!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
dhandle->session_inuse != 0 ||
dhandle->timeofdeath == 0 ||
- now <= dhandle->timeofdeath + conn->sweep_idle_time)
+ difftime(now, dhandle->timeofdeath) <=
+ conn->sweep_idle_time)
continue;
WT_WITH_DHANDLE(session, dhandle,
@@ -276,8 +277,8 @@ __sweep_server(void *arg)
while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
F_ISSET(conn, WT_CONN_SERVER_SWEEP)) {
/* Wait until the next event. */
- WT_ERR(__wt_cond_wait(session, conn->sweep_cond,
- (uint64_t)conn->sweep_interval * WT_MILLION));
+ WT_ERR(__wt_cond_wait(session,
+ conn->sweep_cond, conn->sweep_interval * WT_MILLION));
WT_ERR(__wt_seconds(session, &now));
WT_STAT_FAST_CONN_INCR(session, dh_sweeps);
@@ -329,27 +330,25 @@ __wt_sweep_config(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
- /* Pull out the sweep configurations. */
- WT_RET(__wt_config_gets(session,
- cfg, "file_manager.close_idle_time", &cval));
- conn->sweep_idle_time = (time_t)cval.val;
-
- /* Non-zero sweep idle time is incompatible with in-memory */
- if (conn->sweep_idle_time != 0) {
- WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
- if (cval.val != 0)
- WT_RET_MSG(session, EINVAL,
- "In memory configuration incompatible with "
- "non zero file_manager=(close_idle_time)");
+ /*
+ * A non-zero idle time is incompatible with in-memory, and the default
+ * is non-zero; set the in-memory configuration idle time to zero.
+ */
+ conn->sweep_idle_time = 0;
+ WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
+ if (cval.val == 0) {
+ WT_RET(__wt_config_gets(session,
+ cfg, "file_manager.close_idle_time", &cval));
+ conn->sweep_idle_time = (uint64_t)cval.val;
}
WT_RET(__wt_config_gets(session,
cfg, "file_manager.close_scan_interval", &cval));
- conn->sweep_interval = (time_t)cval.val;
+ conn->sweep_interval = (uint64_t)cval.val;
WT_RET(__wt_config_gets(session,
cfg, "file_manager.close_handle_minimum", &cval));
- conn->sweep_handles_min = (u_int)cval.val;
+ conn->sweep_handles_min = (uint64_t)cval.val;
return (0);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
index 6f7d492327b..62ac2203b97 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_backup.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -17,8 +17,7 @@ static int __backup_list_append(
static int __backup_start(
WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[]);
static int __backup_stop(WT_SESSION_IMPL *);
-static int __backup_uri(
- WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[], bool *, bool *);
+static int __backup_uri(WT_SESSION_IMPL *, const char *[], bool *, bool *);
/*
* __curbackup_next --
@@ -197,6 +196,7 @@ __backup_start(
cb->next = 0;
cb->list = NULL;
+ cb->list_next = 0;
/*
* Single thread hot backups: we're holding the schema lock, so we
@@ -235,7 +235,7 @@ __backup_start(
* a checkpoint that completes during the backup.
*/
target_list = false;
- WT_ERR(__backup_uri(session, cb, cfg, &target_list, &log_only));
+ WT_ERR(__backup_uri(session, cfg, &target_list, &log_only));
if (!target_list) {
WT_ERR(__backup_log_append(session, cb, true));
@@ -391,7 +391,7 @@ err: if (cursor != NULL)
*/
static int
__backup_uri(WT_SESSION_IMPL *session,
- WT_CURSOR_BACKUP *cb, const char *cfg[], bool *foundp, bool *log_only)
+ const char *cfg[], bool *foundp, bool *log_only)
{
WT_CONFIG targetconf;
WT_CONFIG_ITEM cval, k, v;
@@ -408,7 +408,7 @@ __backup_uri(WT_SESSION_IMPL *session,
*/
WT_RET(__wt_config_gets(session, cfg, "target", &cval));
WT_RET(__wt_config_subinit(session, &targetconf, &cval));
- for (cb->list_next = 0, target_list = false;
+ for (target_list = false;
(ret = __wt_config_next(&targetconf, &k, &v)) == 0;
target_list = true) {
/* If it is our first time through, allocate. */
@@ -432,9 +432,11 @@ __backup_uri(WT_SESSION_IMPL *session,
if (WT_PREFIX_MATCH(uri, "log:")) {
*log_only = !target_list;
WT_ERR(__wt_backup_list_uri_append(session, uri, NULL));
- } else
+ } else {
+ *log_only = false;
WT_ERR(__wt_schema_worker(session,
uri, NULL, __wt_backup_list_uri_append, cfg, 0));
+ }
}
WT_ERR_NOTFOUND_OK(ret);
diff --git a/src/third_party/wiredtiger/src/cursor/cur_dump.c b/src/third_party/wiredtiger/src/cursor/cur_dump.c
index 6c11c4b407e..e5799fbad05 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_dump.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_dump.c
@@ -329,7 +329,7 @@ __curdump_close(WT_CURSOR *cursor)
cdump = (WT_CURSOR_DUMP *)cursor;
child = cdump->child;
- CURSOR_API_CALL(cursor, session, get_key, NULL);
+ CURSOR_API_CALL(cursor, session, close, NULL);
if (child != NULL)
WT_TRET(child->close(child));
/* We shared the child's URI. */
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index 1db819b8b40..7c18b59fded 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -246,17 +246,17 @@ __curfile_insert(WT_CURSOR *cursor)
/*
* Insert is the one cursor operation that doesn't end with the cursor
- * pointing to an on-page item. The standard macro handles errors
- * correctly, but we need to leave the application cursor unchanged in
- * the case of success, except for column-store appends, where we are
- * returning a key.
+ * pointing to an on-page item (except for column-store appends, where
+ * we are returning a key). That is, the application's cursor continues
+ * to reference the application's memory after a successful cursor call,
+ * which isn't true anywhere else. We don't want to have to explain that
+ * scoping corner case, so we reset the application's cursor so it can
+ * free the referenced memory and continue on without risking subsequent
+ * core dumps.
*/
if (ret == 0) {
- if (!F_ISSET(cursor, WT_CURSTD_APPEND)) {
- F_SET(cursor, WT_CURSTD_KEY_EXT);
+ if (!F_ISSET(cursor, WT_CURSTD_APPEND))
F_CLR(cursor, WT_CURSTD_KEY_INT);
- }
- F_SET(cursor, WT_CURSTD_VALUE_EXT);
F_CLR(cursor, WT_CURSTD_VALUE_INT);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c
index fd2a6cd7480..a909eaece99 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_index.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_index.c
@@ -8,6 +8,20 @@
#include "wt_internal.h"
+ /*
+ * __wt_curindex_joined --
+ * Produce an error that this cursor is being used in a join call.
+ */
+int
+__wt_curindex_joined(WT_CURSOR *cursor)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+ __wt_errx(session, "index cursor is being used in a join");
+ return (ENOTSUP);
+}
+
/*
* __curindex_get_value --
* WT_CURSOR->get_value implementation for index cursors.
@@ -15,32 +29,16 @@
static int
__curindex_get_value(WT_CURSOR *cursor, ...)
{
- WT_CURSOR_INDEX *cindex;
WT_DECL_RET;
- WT_ITEM *item;
WT_SESSION_IMPL *session;
va_list ap;
- cindex = (WT_CURSOR_INDEX *)cursor;
- CURSOR_API_CALL(cursor, session, get_value, NULL);
- WT_CURSOR_NEEDVALUE(cursor);
-
va_start(ap, cursor);
- if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) {
- ret = __wt_schema_project_merge(session,
- cindex->cg_cursors, cindex->value_plan,
- cursor->value_format, &cursor->value);
- if (ret == 0) {
- item = va_arg(ap, WT_ITEM *);
- item->data = cursor->value.data;
- item->size = cursor->value.size;
- }
- } else
- ret = __wt_schema_project_out(session,
- cindex->cg_cursors, cindex->value_plan, ap);
- va_end(ap);
+ JOINABLE_CURSOR_API_CALL(cursor, session, get_value, NULL);
+ WT_ERR(__wt_curindex_get_valuev(cursor, ap));
-err: API_END_RET(session, ret);
+err: va_end(ap);
+ API_END_RET(session, ret);
}
/*
@@ -53,7 +51,7 @@ __curindex_set_value(WT_CURSOR *cursor, ...)
WT_DECL_RET;
WT_SESSION_IMPL *session;
- CURSOR_API_CALL(cursor, session, set_value, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, set_value, NULL);
ret = ENOTSUP;
err: cursor->saved_err = ret;
F_CLR(cursor, WT_CURSTD_VALUE_SET);
@@ -72,7 +70,7 @@ __curindex_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
WT_SESSION_IMPL *session;
cindex = (WT_CURSOR_INDEX *)a;
- CURSOR_API_CALL(a, session, compare, NULL);
+ JOINABLE_CURSOR_API_CALL(a, session, compare, NULL);
/* Check both cursors are "index:" type. */
if (!WT_PREFIX_MATCH(a->uri, "index:") ||
@@ -150,7 +148,7 @@ __curindex_next(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
cindex = (WT_CURSOR_INDEX *)cursor;
- CURSOR_API_CALL(cursor, session, next, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, next, NULL);
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
if ((ret = cindex->child->next(cindex->child)) == 0)
@@ -171,7 +169,7 @@ __curindex_prev(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
cindex = (WT_CURSOR_INDEX *)cursor;
- CURSOR_API_CALL(cursor, session, prev, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, prev, NULL);
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
if ((ret = cindex->child->prev(cindex->child)) == 0)
@@ -194,7 +192,7 @@ __curindex_reset(WT_CURSOR *cursor)
u_int i;
cindex = (WT_CURSOR_INDEX *)cursor;
- CURSOR_API_CALL(cursor, session, reset, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, reset, NULL);
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
WT_TRET(cindex->child->reset(cindex->child));
@@ -225,7 +223,7 @@ __curindex_search(WT_CURSOR *cursor)
cindex = (WT_CURSOR_INDEX *)cursor;
child = cindex->child;
- CURSOR_API_CALL(cursor, session, search, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, search, NULL);
/*
* We are searching using the application-specified key, which
@@ -284,7 +282,7 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact)
WT_SESSION_IMPL *session;
cindex = (WT_CURSOR_INDEX *)cursor;
- CURSOR_API_CALL(cursor, session, search_near, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, search_near, NULL);
__wt_cursor_set_raw_key(cindex->child, &cursor->key);
if ((ret = cindex->child->search_near(cindex->child, exact)) == 0)
ret = __curindex_move(cindex);
@@ -311,7 +309,7 @@ __curindex_close(WT_CURSOR *cursor)
cindex = (WT_CURSOR_INDEX *)cursor;
idx = cindex->index;
- CURSOR_API_CALL(cursor, session, close, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, close, NULL);
if ((cp = cindex->cg_cursors) != NULL)
for (i = 0, cp = cindex->cg_cursors;
diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c
new file mode 100644
index 00000000000..c5155c75a0c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_join.c
@@ -0,0 +1,1054 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curjoin_entry_iter_init --
+ *	Initialize an iteration for the index managed by a join entry.
+ *
+ *	A new cursor is opened on the URI of the entry's first endpoint
+ *	cursor (with the join cursor's projection appended, if there is one)
+ *	and positioned where that endpoint cursor is positioned.  On success
+ *	a newly allocated iterator owning the new cursor is returned in
+ *	*iterp.  On failure *iterp is left untouched and everything acquired
+ *	here, including the newly opened cursor, is released.
+ */
+static int
+__curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
+    WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ITER **iterp)
+{
+	WT_CURSOR *newcur;
+	WT_CURSOR *to_dup;
+	WT_DECL_RET;
+	const char *raw_cfg[] = { WT_CONFIG_BASE(
+	    session, WT_SESSION_open_cursor), "raw", NULL };
+	const char *def_cfg[] = { WT_CONFIG_BASE(
+	    session, WT_SESSION_open_cursor), NULL };
+	const char *uri, **config;
+	char *uribuf;
+	WT_CURSOR_JOIN_ITER *iter;
+	size_t size;
+
+	iter = NULL;
+	newcur = NULL;
+	uribuf = NULL;
+	to_dup = entry->ends[0].cursor;
+
+	/* The iteration cursor is raw exactly when the join cursor is. */
+	uri = to_dup->uri;
+	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
+		config = &raw_cfg[0];
+	else
+		config = &def_cfg[0];
+
+	if (cjoin->projection != NULL) {
+		size = strlen(uri) + strlen(cjoin->projection) + 1;
+		WT_ERR(__wt_calloc(session, size, 1, &uribuf));
+		snprintf(uribuf, size, "%s%s", uri, cjoin->projection);
+		uri = uribuf;
+	}
+	WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config,
+	    &newcur));
+	WT_ERR(__wt_cursor_dup_position(to_dup, newcur));
+	WT_ERR(__wt_calloc_one(session, &iter));
+	iter->cjoin = cjoin;
+	iter->session = session;
+	iter->entry = entry;
+	iter->cursor = newcur;
+	iter->advance = false;
+	*iterp = iter;
+
+	if (0) {
+		/*
+		 * Nobody else references the new cursor until the iterator
+		 * is returned, close it here or it is leaked.
+		 */
+err:		if (newcur != NULL)
+			WT_TRET(newcur->close(newcur));
+		__wt_free(session, iter);
+	}
+	__wt_free(session, uribuf);
+	return (ret);
+}
+
+/*
+ * __curjoin_pack_recno --
+ *	Pack a record number into the caller's buffer using the "r" format
+ *	and make the item reference the packed bytes.
+ */
+static int
+__curjoin_pack_recno(WT_SESSION_IMPL *session, uint64_t r, uint8_t *buf,
+    size_t bufsize, WT_ITEM *item)
+{
+	WT_SESSION *wt_session;
+	size_t packed_len;
+
+	wt_session = (WT_SESSION *)session;
+
+	/* Learn the packed size first so the item can be sized exactly. */
+	WT_RET(wiredtiger_struct_size(wt_session, &packed_len, "r", r));
+	WT_ASSERT(session, packed_len < bufsize);
+	WT_RET(wiredtiger_struct_pack(wt_session, buf, bufsize, "r", r));
+
+	item->data = buf;
+	item->size = packed_len;
+	return (0);
+}
+
+/*
+ * __curjoin_entry_iter_next --
+ *	Get the next item in an iteration.
+ *
+ *	The first call after initialization or reset reports the record the
+ *	cursor is already positioned on; later calls move the cursor forward
+ *	first.  The primary key is returned in primkey; for record-number
+ *	tables the record number is also returned in *rp (otherwise *rp is
+ *	set to zero).
+ */
+static int
+__curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_ITEM *primkey,
+    uint64_t *rp)
+{
+	WT_CURSOR *cursor, *pkcursor;
+	WT_CURSOR_JOIN *cjoin;
+	WT_CURSOR_JOIN_ENTRY *entry;
+	uint64_t recno;
+
+	cursor = iter->cursor;
+	entry = iter->entry;
+
+	if (!iter->advance)
+		iter->advance = true;
+	else
+		WT_RET(cursor->next(cursor));
+
+	cjoin = iter->cjoin;
+
+	/*
+	 * The primary key is read from the first column group cursor; it
+	 * becomes our key and is also needed to check membership.
+	 */
+	pkcursor = entry->index != NULL ?
+	    ((WT_CURSOR_INDEX *)cursor)->cg_cursors[0] :
+	    ((WT_CURSOR_TABLE *)cursor)->cg_cursors[0];
+
+	if (WT_CURSOR_RECNO(&cjoin->iface)) {
+		recno = *(uint64_t *)pkcursor->key.data;
+		WT_RET(__curjoin_pack_recno(iter->session, recno,
+		    cjoin->recno_buf, sizeof(cjoin->recno_buf), primkey));
+		*rp = recno;
+	} else {
+		WT_ITEM_SET(*primkey, pkcursor->key);
+		*rp = 0;
+	}
+
+	iter->curkey = primkey;
+	++entry->stats.actual_count;
+	++entry->stats.accesses;
+	return (0);
+}
+
+/*
+ * __curjoin_entry_iter_reset --
+ *	Put an iteration back at its starting point.  Nothing is done if the
+ *	iterator has not been advanced since it was last positioned.
+ */
+static int
+__curjoin_entry_iter_reset(WT_CURSOR_JOIN_ITER *iter)
+{
+	WT_CURSOR *cursor;
+
+	if (!iter->advance)
+		return (0);
+
+	/* Re-position on the first endpoint of the join's first entry. */
+	cursor = iter->cursor;
+	WT_RET(cursor->reset(cursor));
+	WT_RET(__wt_cursor_dup_position(
+	    iter->cjoin->entries[0].ends[0].cursor, cursor));
+
+	iter->advance = false;
+	iter->entry->stats.actual_count = 0;
+	return (0);
+}
+
+/*
+ * __curjoin_entry_iter_ready --
+ *	Return whether the iterator has returned at least one item since it
+ *	was created or last reset (that is, whether it is positioned).
+ */
+static bool
+__curjoin_entry_iter_ready(WT_CURSOR_JOIN_ITER *iter)
+{
+	bool positioned;
+
+	positioned = iter->advance;
+	return (positioned);
+}
+
+/*
+ * __curjoin_entry_iter_close --
+ *	Close the iterator's cursor, if it has one, and free the iterator
+ *	itself.  The result of closing the cursor is returned.
+ */
+static int
+__curjoin_entry_iter_close(WT_CURSOR_JOIN_ITER *iter)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	session = iter->session;
+	cursor = iter->cursor;
+
+	if (cursor != NULL)
+		ret = cursor->close(cursor);
+	__wt_free(session, iter);
+
+	return (ret);
+}
+
+/*
+ * __curjoin_get_key --
+ *	WT_CURSOR->get_key for join cursors.  Fails with EINVAL unless the
+ *	join has been initialized and advanced onto a record.
+ */
+static int
+__curjoin_get_key(WT_CURSOR *cursor, ...)
+{
+	WT_CURSOR_JOIN *cjoin;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	va_list ap;
+
+	cjoin = (WT_CURSOR_JOIN *)cursor;
+
+	va_start(ap, cursor);
+	CURSOR_API_CALL(cursor, session, get_key, NULL);
+
+	if (F_ISSET(cjoin, WT_CURJOIN_INITIALIZED) &&
+	    __curjoin_entry_iter_ready(cjoin->iter))
+		ret = __wt_cursor_get_keyv(cursor, cursor->flags, ap);
+	else {
+		__wt_errx(session, "join cursor must be advanced with next()");
+		ret = EINVAL;
+	}
+
+err:	va_end(ap);
+	API_END_RET(session, ret);
+}
+
+/*
+ * __curjoin_get_value --
+ *	WT_CURSOR->get_value for join cursors.  The value is read through the
+ *	iteration cursor, using the index or table flavor of get_value
+ *	depending on what the iterated entry refers to.  Fails with EINVAL
+ *	unless the join has been initialized and advanced onto a record.
+ */
+static int
+__curjoin_get_value(WT_CURSOR *cursor, ...)
+{
+	WT_CURSOR_JOIN *cjoin;
+	WT_CURSOR_JOIN_ITER *iter;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	va_list ap;
+
+	cjoin = (WT_CURSOR_JOIN *)cursor;
+
+	va_start(ap, cursor);
+	CURSOR_API_CALL(cursor, session, get_value, NULL);
+
+	if (!(F_ISSET(cjoin, WT_CURJOIN_INITIALIZED) &&
+	    __curjoin_entry_iter_ready(cjoin->iter))) {
+		__wt_errx(session, "join cursor must be advanced with next()");
+		WT_ERR(EINVAL);
+	}
+
+	iter = cjoin->iter;
+	ret = iter->entry->index == NULL ?
+	    __wt_curtable_get_valuev(iter->cursor, ap) :
+	    __wt_curindex_get_valuev(iter->cursor, ap);
+
+err:	va_end(ap);
+	API_END_RET(session, ret);
+}
+
+/*
+ * __curjoin_init_bloom --
+ *	Populate a Bloom filter for a join entry.
+ *
+ *	A raw cursor is opened on the entry's index (projecting the leading
+ *	key columns of the main table) or, for joins on the main table, on
+ *	the table itself with an empty projection.  The cursor is walked
+ *	forward and every record whose key satisfies all of the entry's
+ *	endpoints is inserted into the filter.
+ */
+static int
+__curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
+    WT_CURSOR_JOIN_ENTRY *entry, WT_BLOOM *bloom)
+{
+	WT_COLLATOR *collator;
+	WT_CURSOR *c;
+	WT_CURSOR_INDEX *cindex;
+	WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
+	WT_DECL_RET;
+	WT_DECL_ITEM(uribuf);
+	WT_ITEM curkey, curvalue, *k;
+	WT_TABLE *maintable;
+	const char *raw_cfg[] = { WT_CONFIG_BASE(
+	    session, WT_SESSION_open_cursor), "raw", NULL };
+	const char *mainkey_str, *p;
+	void *allocbuf;
+	size_t mainkey_len, size;
+	u_int i;
+	int cmp, skip;
+
+	c = NULL;
+	allocbuf = NULL;
+	skip = 0;
+
+	if (entry->index != NULL) {
+		/*
+		 * Open a cursor having a projection of the keys of the
+		 * index we're comparing against.  Open it raw, we're
+		 * going to compare it to the raw keys of the
+		 * reference cursors.
+		 */
+		maintable = ((WT_CURSOR_TABLE *)entry->main)->table;
+		mainkey_str = maintable->colconf.str + 1;
+		for (p = mainkey_str, i = 0;
+		    p != NULL && i < maintable->nkey_columns; i++)
+			p = strchr(p + 1, ',');
+		WT_ASSERT(session, p != 0);
+		mainkey_len = WT_PTRDIFF(p, mainkey_str);
+		size = strlen(entry->index->name) + mainkey_len + 3;
+		WT_ERR(__wt_scr_alloc(session, size, &uribuf));
+		WT_ERR(__wt_buf_fmt(session, uribuf, "%s(%.*s)",
+		    entry->index->name, (int)mainkey_len, mainkey_str));
+	} else {
+		/*
+		 * For joins on the main table, we just need the primary
+		 * key for comparison, we don't need any values.
+		 */
+		size = strlen(cjoin->table->name) + 3;
+		WT_ERR(__wt_scr_alloc(session, size, &uribuf));
+		WT_ERR(__wt_buf_fmt(session, uribuf, "%s()",
+		    cjoin->table->name));
+	}
+	WT_ERR(__wt_open_cursor(
+	    session, uribuf->data, &cjoin->iface, raw_cfg, &c));
+
+	/*
+	 * Initially position the cursor.  When the first endpoint is a lower
+	 * bound or an equality we can start at the reference cursor's
+	 * position.  When it is an upper bound ("lt"/"le") there is no lower
+	 * bound at all, so the walk must start at the first record:
+	 * starting at the upper bound would miss every smaller key, and not
+	 * positioning at all would read the key of an unpositioned cursor.
+	 */
+	endmax = &entry->ends[entry->ends_next];
+	if ((end = &entry->ends[0]) < endmax &&
+	    !F_ISSET(end, WT_CURJOIN_END_LT) &&
+	    F_ISSET(end, WT_CURJOIN_END_GE)) {
+		WT_ERR(__wt_cursor_dup_position(end->cursor, c));
+		if (end->flags == WT_CURJOIN_END_GE)
+			skip = 1;
+	} else {
+		if ((ret = c->next(c)) == WT_NOTFOUND)
+			goto done;
+		WT_ERR(ret);
+	}
+	collator = (entry->index == NULL) ? NULL : entry->index->collator;
+	while (ret == 0) {
+		WT_ERR(c->get_key(c, &curkey));
+		if (entry->index != NULL) {
+			cindex = (WT_CURSOR_INDEX *)c;
+			if (cindex->index->extractor == NULL) {
+				/*
+				 * Repack so it's comparable to the
+				 * reference endpoints.
+				 */
+				k = &cindex->child->key;
+				WT_ERR(__wt_struct_repack(session,
+				    cindex->child->key_format,
+				    entry->main->value_format, k, &curkey,
+				    &allocbuf));
+			} else
+				curkey = cindex->child->key;
+		}
+		for (end = &entry->ends[skip]; end < endmax; end++) {
+			WT_ERR(__wt_compare(session, collator, &curkey,
+			    &end->key, &cmp));
+			if (!F_ISSET(end, WT_CURJOIN_END_LT)) {
+				/* Before (or excluded by) a lower bound. */
+				if (cmp < 0 || (cmp == 0 &&
+				    !F_ISSET(end, WT_CURJOIN_END_EQ)))
+					goto advance;
+				if (cmp > 0) {
+					/*
+					 * Past a lower bound: it stays
+					 * satisfied for the rest of the
+					 * walk.  Past an equality: done.
+					 */
+					if (F_ISSET(end, WT_CURJOIN_END_GT))
+						skip = 1;
+					else
+						goto done;
+				}
+			} else {
+				/* Past (or excluded by) the upper bound. */
+				if (cmp > 0 || (cmp == 0 &&
+				    !F_ISSET(end, WT_CURJOIN_END_EQ)))
+					goto done;
+			}
+		}
+		/* What goes into the filter is always the primary key. */
+		if (entry->index != NULL)
+			WT_ERR(c->get_value(c, &curvalue));
+		else
+			WT_ERR(c->get_key(c, &curvalue));
+		WT_ERR(__wt_bloom_insert(bloom, &curvalue));
+		entry->stats.actual_count++;
+advance:
+		if ((ret = c->next(c)) == WT_NOTFOUND)
+			break;
+	}
+done:
+	WT_ERR_NOTFOUND_OK(ret);
+
+err:	if (c != NULL)
+		WT_TRET(c->close(c));
+	__wt_scr_free(session, &uribuf);
+	__wt_free(session, allocbuf);
+	return (ret);
+}
+
+/*
+ * __curjoin_endpoint_init_key --
+ *	Set the key in the reference endpoint from the position of the
+ *	endpoint's cursor.  Endpoints without a cursor are left alone.
+ */
+static int
+__curjoin_endpoint_init_key(WT_SESSION_IMPL *session,
+    WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ENDPOINT *endpoint)
+{
+	WT_CURSOR *cursor;
+	WT_CURSOR_INDEX *cindex;
+	WT_DECL_RET;
+	WT_ITEM *pkey;
+	uint64_t recno;
+	void *allocbuf;
+
+	if ((cursor = endpoint->cursor) == NULL)
+		return (0);
+
+	/* Main table: the key comes from the first column group cursor. */
+	if (entry->index == NULL) {
+		pkey = &((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]->key;
+		if (!WT_CURSOR_RECNO(cursor)) {
+			endpoint->key = *pkey;
+			return (0);
+		}
+		recno = *(uint64_t *)pkey->data;
+		return (__curjoin_pack_recno(session, recno,
+		    endpoint->recno_buf, sizeof(endpoint->recno_buf),
+		    &endpoint->key));
+	}
+
+	/* Index with a custom extractor: use the child's key as it is. */
+	cindex = (WT_CURSOR_INDEX *)cursor;
+	if (cindex->index->extractor != NULL) {
+		endpoint->key = cindex->child->key;
+		return (0);
+	}
+
+	/*
+	 * Ordinary index: repack the child's key using the main cursor's
+	 * value format.  If the repack allocated memory, the endpoint is
+	 * flagged as owning its key.
+	 */
+	allocbuf = NULL;
+	if ((ret = __wt_struct_repack(session,
+	    cindex->child->key_format, entry->main->value_format,
+	    &cindex->child->key, &endpoint->key, &allocbuf)) != 0) {
+		__wt_free(session, allocbuf);
+		return (ret);
+	}
+	if (allocbuf != NULL)
+		F_SET(endpoint, WT_CURJOIN_END_OWN_KEY);
+	return (0);
+}
+
+/*
+ * __curjoin_init_iter --
+ *	Initialize before any iteration.
+ *
+ *	Creates the iterator over the join's first entry, sets the reference
+ *	key of every endpoint and builds the Bloom filters of the entries
+ *	configured to use one, sharing a filter between entries that have the
+ *	same count.  On success the join cursor is flagged as initialized.
+ *	Returns EINVAL if nothing has been joined yet.
+ */
+static int
+__curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
+{
+	WT_BLOOM *bloom;
+	WT_DECL_RET;
+	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
+	WT_CURSOR_JOIN_ENDPOINT *end;
+	uint64_t k, m;
+
+	/* Only ever references a temporary filter that must be closed. */
+	bloom = NULL;
+
+	if (cjoin->entries_next == 0) {
+		__wt_errx(session, "join cursor has not yet been joined "
+		    "with any other cursors");
+		return (EINVAL);
+	}
+
+	je = &cjoin->entries[0];
+	WT_ERR(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter));
+
+	jeend = &cjoin->entries[cjoin->entries_next];
+	for (je = cjoin->entries; je < jeend; je++) {
+		__wt_stat_join_init_single(&je->stats);
+		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
+		    end++)
+			WT_ERR(__curjoin_endpoint_init_key(session, je, end));
+
+		/*
+		 * The first entry is iterated as the 'outermost' cursor.
+		 * For the common GE case, we don't have to test against
+		 * the left reference key, we know it will be true since
+		 * the btree is ordered.
+		 */
+		if (je == cjoin->entries && je->ends[0].flags ==
+		    (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ))
+			F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);
+
+		if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
+			if (je->bloom == NULL) {
+				/*
+				 * Look for compatible filters to be shared,
+				 * pick compatible numbers for bit counts
+				 * and number of hashes.
+				 */
+				m = je->bloom_bit_count;
+				k = je->bloom_hash_count;
+				for (je2 = je + 1; je2 < jeend; je2++)
+					if (F_ISSET(je2,
+					    WT_CURJOIN_ENTRY_BLOOM) &&
+					    je2->count == je->count) {
+						m = WT_MAX(
+						    je2->bloom_bit_count, m);
+						k = WT_MAX(
+						    je2->bloom_hash_count, k);
+					}
+				je->bloom_bit_count = m;
+				je->bloom_hash_count = k;
+				WT_ERR(__wt_bloom_create(session, NULL,
+				    NULL, je->count, m, k, &je->bloom));
+				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
+				WT_ERR(__curjoin_init_bloom(session, cjoin,
+				    je, je->bloom));
+				/*
+				 * Share the Bloom filter, making all
+				 * config info consistent.
+				 */
+				for (je2 = je + 1; je2 < jeend; je2++)
+					if (F_ISSET(je2,
+					    WT_CURJOIN_ENTRY_BLOOM) &&
+					    je2->count == je->count) {
+						WT_ASSERT(session,
+						    je2->bloom == NULL);
+						je2->bloom = je->bloom;
+						je2->bloom_bit_count = m;
+						je2->bloom_hash_count = k;
+					}
+			} else {
+				/*
+				 * Create a temporary filter that we'll
+				 * merge into the shared one.  The Bloom
+				 * parameters of the two filters must match.
+				 * The temporary filter is closed on the
+				 * error path if anything fails before we
+				 * close it here.
+				 */
+				WT_ERR(__wt_bloom_create(session, NULL,
+				    NULL, je->count, je->bloom_bit_count,
+				    je->bloom_hash_count, &bloom));
+				WT_ERR(__curjoin_init_bloom(session, cjoin,
+				    je, bloom));
+				WT_ERR(__wt_bloom_intersection(je->bloom,
+				    bloom));
+				ret = __wt_bloom_close(bloom);
+				bloom = NULL;
+				WT_ERR(ret);
+			}
+		}
+	}
+
+	F_SET(cjoin, WT_CURJOIN_INITIALIZED);
+
+err:	if (bloom != NULL)
+		WT_TRET(__wt_bloom_close(bloom));
+	return (ret);
+}
+
+/*
+ * __curjoin_entry_in_range --
+ *	Check if a key is in the range specified by the entry, returning
+ *	WT_NOTFOUND if not.
+ *
+ *	The key is compared, using the index's collator if the entry has an
+ *	index, against each of the entry's endpoints in turn; the first
+ *	endpoint is not tested when skip_left is set.  Zero is returned only
+ *	if every tested endpoint accepts the key.
+ */
+static int
+__curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
+    WT_ITEM *curkey, bool skip_left)
+{
+	WT_COLLATOR *collator;
+	WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
+	int cmp;
+
+	collator = (entry->index != NULL) ? entry->index->collator : NULL;
+	endmax = &entry->ends[entry->ends_next];
+	for (end = &entry->ends[skip_left ? 1 : 0]; end < endmax; end++) {
+		WT_RET(__wt_compare(session, collator, curkey, &end->key,
+		    &cmp));
+		if (!F_ISSET(end, WT_CURJOIN_END_LT)) {
+			/*
+			 * Lower bound or equality: smaller keys never
+			 * match, equal keys need the EQ flag and larger
+			 * keys need the GT flag.
+			 */
+			if (cmp < 0 ||
+			    (cmp == 0 &&
+			    !F_ISSET(end, WT_CURJOIN_END_EQ)) ||
+			    (cmp > 0 && !F_ISSET(end, WT_CURJOIN_END_GT)))
+				return (WT_NOTFOUND);
+		} else {
+			/*
+			 * Upper bound: larger keys never match and equal
+			 * keys need the EQ flag.  Smaller keys always
+			 * match, the LT flag is known to be set here.
+			 */
+			if (cmp > 0 ||
+			    (cmp == 0 &&
+			    !F_ISSET(end, WT_CURJOIN_END_EQ)))
+				return (WT_NOTFOUND);
+		}
+	}
+	return (0);
+}
+
+/*
+ * WT_CURJOIN_EXTRACTOR --
+ *	Cursor handed to a custom extractor during a join membership check.
+ *	It is cast to and from WT_CURSOR, so the cursor interface must remain
+ *	the first field.
+ */
+typedef struct {
+ WT_CURSOR iface;
+ /* The join entry whose range the extracted keys are tested against. */
+ WT_CURSOR_JOIN_ENTRY *entry;
+ /* Set non-zero once an extracted key is found to be in range. */
+ int ismember;
+} WT_CURJOIN_EXTRACTOR;
+
+/*
+ * __curjoin_extract_insert --
+ *	Handle a key produced by a custom extractor.
+ *
+ *	The key is tested against the range of the join entry attached to
+ *	the extractor cursor and the cursor is marked as a member when the
+ *	key is in range.  A key that is out of range is not an error; any
+ *	other failure is returned without claiming membership.
+ */
+static int
+__curjoin_extract_insert(WT_CURSOR *cursor)
+{
+	WT_CURJOIN_EXTRACTOR *cextract;
+	WT_DECL_RET;
+	WT_ITEM ikey;
+	WT_SESSION_IMPL *session;
+
+	cextract = (WT_CURJOIN_EXTRACTOR *)cursor;
+	/*
+	 * This insert method may be called multiple times during a single
+	 * extraction.  If we already have a definitive answer to the
+	 * membership question, exit early.
+	 */
+	if (cextract->ismember)
+		return (0);
+
+	session = (WT_SESSION_IMPL *)cursor->session;
+
+	WT_ITEM_SET(ikey, cursor->key);
+	/*
+	 * We appended a padding byte to the key to avoid rewriting the last
+	 * column.  Strip that away here.
+	 */
+	WT_ASSERT(session, ikey.size > 0);
+	--ikey.size;
+
+	ret = __curjoin_entry_in_range(session, cextract->entry, &ikey, false);
+	if (ret == WT_NOTFOUND)
+		ret = 0;
+	else if (ret == 0)
+		cextract->ismember = 1;
+
+	return (ret);
+}
+
+/*
+ * __curjoin_entry_member --
+ * Do a membership check for a particular index that was joined,
+ * if not a member, returns WT_NOTFOUND.
+ *
+ * The key tested is the iterator's current key. The entry's Bloom
+ * filter, if it has one, is consulted first; a key that passes it is
+ * then checked against the entry's range, going through the index's
+ * custom extractor when there is one.
+ */
+static int
+__curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
+ WT_CURSOR_JOIN_ENTRY *entry, bool skip_left)
+{
+ WT_CURJOIN_EXTRACTOR extract_cursor;
+ WT_CURSOR *c;
+ /* Template for the cursor passed to a custom extractor. */
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __wt_cursor_notsup, /* equals */
+ __wt_cursor_notsup, /* next */
+ __wt_cursor_notsup, /* prev */
+ __wt_cursor_notsup, /* reset */
+ __wt_cursor_notsup, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __curjoin_extract_insert, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __wt_cursor_notsup, /* reconfigure */
+ __wt_cursor_notsup); /* close */
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_ITEM *key, v;
+ bool bloom_found;
+
+ key = cjoin->iter->curkey;
+ entry->stats.accesses++;
+ bloom_found = false;
+
+ if (entry->bloom != NULL) {
+ /*
+ * If we don't own the Bloom filter, we must be sharing one
+ * in a previous entry. So the shared filter has already
+ * been checked and passed.
+ */
+ if (!F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
+ return (0);
+
+ /*
+ * If the item is not in the Bloom filter, we return
+ * immediately, otherwise, we still need to check the
+ * long way.
+ */
+ WT_ERR(__wt_bloom_inmem_get(entry->bloom, key));
+ bloom_found = true;
+ }
+ if (entry->index != NULL) {
+ /*
+ * Look the primary key up in the main cursor to fetch the value
+ * that is tested against the entry's range.
+ *
+ * NOTE(review): WT_ERR_MSG presumably jumps to the error label,
+ * in which case the reset below is skipped on that path; also,
+ * "v" is used after the cursor it was read from has been reset.
+ * Confirm both are intended.
+ */
+ c = entry->main;
+ c->set_key(c, key);
+ if ((ret = c->search(c)) == 0)
+ ret = c->get_value(c, &v);
+ else if (ret == WT_NOTFOUND)
+ WT_ERR_MSG(session, WT_ERROR,
+ "main table for join is missing entry.");
+ c->reset(c);
+ WT_ERR(ret);
+ } else
+ v = *key;
+
+ if ((idx = entry->index) != NULL && idx->extractor != NULL) {
+ /*
+ * Run the custom extractor with a private cursor whose insert
+ * method records whether any extracted key is in range.
+ */
+ extract_cursor.iface = iface;
+ extract_cursor.iface.session = &session->iface;
+ extract_cursor.iface.key_format = idx->exkey_format;
+ extract_cursor.ismember = 0;
+ extract_cursor.entry = entry;
+ WT_ERR(idx->extractor->extract(idx->extractor,
+ &session->iface, key, &v, &extract_cursor.iface));
+ if (!extract_cursor.ismember)
+ WT_ERR(WT_NOTFOUND);
+ } else
+ WT_ERR(__curjoin_entry_in_range(session, entry, &v, skip_left));
+
+ if (0) {
+ /* The filter said "maybe" but the full check said "no". */
+err: if (ret == WT_NOTFOUND && bloom_found)
+ entry->stats.bloom_false_positive++;
+ }
+ return (ret);
+}
+
+/*
+ * __curjoin_next --
+ * WT_CURSOR::next for join cursors.
+ *
+ * The join is initialized on the first call. Each candidate key
+ * produced by the iterator is tested against every joined entry;
+ * candidates rejected by any entry are skipped. Once a call fails
+ * through the error path, later calls fail with WT_ERROR.
+ */
+static int
+__curjoin_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_JOIN *cjoin;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ bool skip_left;
+ u_int i;
+
+ cjoin = (WT_CURSOR_JOIN *)cursor;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ if (F_ISSET(cjoin, WT_CURJOIN_ERROR)) {
+ __wt_errx(session, "join cursor encountered previous error");
+ WT_ERR(WT_ERROR);
+ }
+ /* Initialize on demand. */
+ if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED))
+ WT_ERR(__curjoin_init_iter(session, cjoin));
+
+ /* Come back here whenever a candidate key is rejected by an entry. */
+nextkey:
+ if ((ret = __curjoin_entry_iter_next(cjoin->iter, &cursor->key,
+ &cursor->recno)) == 0) {
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+
+ /*
+ * We may have already established membership for the
+ * 'left' case for the first entry, since we're
+ * using that in our iteration.
+ */
+ skip_left = F_ISSET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);
+ for (i = 0; i < cjoin->entries_next; i++) {
+ ret = __curjoin_entry_member(session, cjoin,
+ &cjoin->entries[i], skip_left);
+ if (ret == WT_NOTFOUND)
+ goto nextkey;
+ /* Only the first entry may skip its left endpoint. */
+ skip_left = false;
+ WT_ERR(ret);
+ }
+ }
+
+ /*
+ * A non-zero return from the iterator itself (such as running off the
+ * end) does not go through the error label and is returned as it is.
+ */
+ if (0) {
+err: F_SET(cjoin, WT_CURJOIN_ERROR);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curjoin_reset --
+ *	WT_CURSOR::reset for join cursors.  A join that has not been
+ *	initialized yet has nothing to reset.
+ */
+static int
+__curjoin_reset(WT_CURSOR *cursor)
+{
+	WT_CURSOR_JOIN *cjoin;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	CURSOR_API_CALL(cursor, session, reset, NULL);
+
+	cjoin = (WT_CURSOR_JOIN *)cursor;
+	if (F_ISSET(cjoin, WT_CURJOIN_INITIALIZED))
+		ret = __curjoin_entry_iter_reset(cjoin->iter);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curjoin_close --
+ *	WT_CURSOR::close for join cursors.
+ *
+ *	Releases the table, the projection strings, every entry's main
+ *	cursor, owned Bloom filter and endpoints, the iterator and finally
+ *	the cursor itself.  The reference cursors supplied by the application
+ *	are not closed, they only have their "joined" flag cleared.
+ */
+static int
+__curjoin_close(WT_CURSOR *cursor)
+{
+	WT_CURSOR_JOIN *cjoin;
+	WT_CURSOR_JOIN_ENDPOINT *end;
+	WT_CURSOR_JOIN_ENTRY *entry;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	u_int i, j;
+
+	cjoin = (WT_CURSOR_JOIN *)cursor;
+
+	CURSOR_API_CALL(cursor, session, close, NULL);
+
+	__wt_schema_release_table(session, cjoin->table);
+
+	/* These are owned by the table */
+	cursor->internal_uri = NULL;
+	cursor->key_format = NULL;
+
+	/* With a projection, the value format is a private copy. */
+	if (cjoin->projection != NULL) {
+		__wt_free(session, cjoin->projection);
+		__wt_free(session, cursor->value_format);
+	}
+
+	for (i = 0; i < cjoin->entries_next; i++) {
+		entry = &cjoin->entries[i];
+
+		if (entry->main != NULL)
+			WT_TRET(entry->main->close(entry->main));
+		if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
+			WT_TRET(__wt_bloom_close(entry->bloom));
+
+		for (j = 0; j < entry->ends_next; j++) {
+			end = &entry->ends[j];
+			F_CLR(end->cursor, WT_CURSTD_JOINED);
+			if (F_ISSET(end, WT_CURJOIN_END_OWN_KEY))
+				__wt_free(session, end->key.data);
+		}
+		__wt_free(session, entry->ends);
+	}
+
+	if (cjoin->iter != NULL)
+		WT_TRET(__curjoin_entry_iter_close(cjoin->iter));
+	__wt_free(session, cjoin->entries);
+	WT_TRET(__wt_cursor_close(cursor));
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curjoin_open --
+ *	Initialize a join cursor.
+ *
+ *	Join cursors are read-only.
+ *
+ *	The URI must have the form "join:table:<name>", optionally followed
+ *	by a parenthesized projection.  Join cursors cannot have an owner.
+ *	On failure everything acquired here is released and *cursorp is set
+ *	to NULL (when the failure happens after the cursor was allocated).
+ */
+int
+__wt_curjoin_open(WT_SESSION_IMPL *session,
+    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+	WT_CURSOR_STATIC_INIT(iface,
+	    __curjoin_get_key,		/* get-key */
+	    __curjoin_get_value,	/* get-value */
+	    __wt_cursor_notsup,		/* set-key */
+	    __wt_cursor_notsup,		/* set-value */
+	    __wt_cursor_notsup,		/* compare */
+	    __wt_cursor_notsup,		/* equals */
+	    __curjoin_next,		/* next */
+	    __wt_cursor_notsup,		/* prev */
+	    __curjoin_reset,		/* reset */
+	    __wt_cursor_notsup,		/* search */
+	    __wt_cursor_notsup,		/* search-near */
+	    __wt_cursor_notsup,		/* insert */
+	    __wt_cursor_notsup,		/* update */
+	    __wt_cursor_notsup,		/* remove */
+	    __wt_cursor_notsup,		/* reconfigure */
+	    __curjoin_close);		/* close */
+	WT_CURSOR *cursor;
+	WT_CURSOR_JOIN *cjoin;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	WT_TABLE *table;
+	size_t size;
+	const char *tablename, *columns;
+
+	WT_STATIC_ASSERT(offsetof(WT_CURSOR_JOIN, iface) == 0);
+
+	if (!WT_PREFIX_SKIP(uri, "join:"))
+		return (EINVAL);
+	tablename = uri;
+	if (!WT_PREFIX_SKIP(tablename, "table:"))
+		return (EINVAL);
+
+	/* The table name ends where the projection, if any, starts. */
+	columns = strchr(tablename, '(');
+	if (columns == NULL)
+		size = strlen(tablename);
+	else
+		size = WT_PTRDIFF(columns, tablename);
+	WT_RET(__wt_schema_get_table(session, tablename, size, 0, &table));
+
+	/*
+	 * Until the table is attached to the cursor nothing else will
+	 * release it, so release it here if the allocation fails.
+	 */
+	if ((ret = __wt_calloc_one(session, &cjoin)) != 0) {
+		__wt_schema_release_table(session, table);
+		return (ret);
+	}
+	cursor = &cjoin->iface;
+	*cursor = iface;
+	cursor->session = &session->iface;
+	cursor->internal_uri = table->name;
+	cursor->key_format = table->key_format;
+	cursor->value_format = table->value_format;
+	cjoin->table = table;
+
+	/* Handle projections. */
+	WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+	if (columns != NULL) {
+		WT_ERR(__wt_struct_reformat(session, table,
+		    columns, strlen(columns), NULL, 1, tmp));
+		WT_ERR(__wt_strndup(
+		    session, tmp->data, tmp->size, &cursor->value_format));
+		WT_ERR(__wt_strdup(session, columns, &cjoin->projection));
+	}
+
+	if (owner != NULL)
+		WT_ERR(EINVAL);
+
+	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));
+
+	if (0) {
+err:		WT_TRET(__curjoin_close(cursor));
+		*cursorp = NULL;
+	}
+
+	__wt_scr_free(session, &tmp);
+	return (ret);
+}
+
+/*
+ * __wt_curjoin_join --
+ * Add a new join to a join cursor.
+ *
+ * The reference cursor becomes a new endpoint, either of a new entry
+ * for the index or merged into the entry that already exists for the
+ * index. An entry for an index also gets a raw cursor on the main
+ * table, projecting the indexed columns, the first time it is needed.
+ * Returns EINVAL when the request conflicts with what was previously
+ * configured for the same index.
+ */
+int
+__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
+ WT_INDEX *idx, WT_CURSOR *ref_cursor, uint32_t flags, uint32_t range,
+ uint64_t count, uint64_t bloom_bit_count, uint64_t bloom_hash_count)
+{
+ WT_CURSOR_JOIN_ENTRY *entry;
+ WT_DECL_RET;
+ WT_CURSOR_JOIN_ENDPOINT *end, *newend;
+ bool hasins, needbloom, range_eq;
+ u_int i, ins, nonbloom;
+ const char *raw_cfg[] = { WT_CONFIG_BASE(
+ session, WT_SESSION_open_cursor), "raw", NULL };
+ char *main_uri;
+ size_t namesize, newsize;
+
+ entry = NULL;
+ hasins = needbloom = false;
+ ins = 0; /* -Wuninitialized */
+ main_uri = NULL;
+ nonbloom = 0; /* -Wuninitialized */
+ namesize = strlen(cjoin->table->name);
+
+ /*
+ * Look for an existing entry for this index; along the way remember
+ * the first entry past the initial one that does not use a Bloom
+ * filter, it is where a new Bloom entry would be inserted.
+ */
+ for (i = 0; i < cjoin->entries_next; i++) {
+ if (cjoin->entries[i].index == idx) {
+ entry = &cjoin->entries[i];
+ break;
+ }
+ if (!needbloom && i > 0 &&
+ !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) {
+ needbloom = true;
+ nonbloom = i;
+ }
+ }
+ if (entry == NULL) {
+ WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated,
+ cjoin->entries_next + 1, &cjoin->entries));
+ if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
+ /*
+ * Reorder the list so that after the first entry,
+ * the Bloom filtered entries come next, followed by
+ * the non-Bloom entries. Once the Bloom filters
+ * are built, determining membership via Bloom is
+ * faster than without Bloom, so we can answer
+ * membership questions more quickly, and with less
+ * I/O, with the Bloom entries first.
+ */
+ entry = &cjoin->entries[nonbloom];
+ memmove(entry + 1, entry,
+ (cjoin->entries_next - nonbloom) *
+ sizeof(WT_CURSOR_JOIN_ENTRY));
+ memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
+ }
+ else
+ entry = &cjoin->entries[cjoin->entries_next];
+ entry->index = idx;
+ entry->flags = flags;
+ entry->count = count;
+ entry->bloom_bit_count = bloom_bit_count;
+ entry->bloom_hash_count = bloom_hash_count;
+ ++cjoin->entries_next;
+ } else {
+ /* Merge the join into an existing entry for this index */
+ if (count != 0 && entry->count != 0 && entry->count != count) {
+ __wt_errx(session, "count=%" PRIu64 " does not match "
+ "previous count=%" PRIu64 " for this index",
+ count, entry->count);
+ WT_ERR(EINVAL);
+ }
+ if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) !=
+ F_ISSET(entry, WT_CURJOIN_ENTRY_BLOOM)) {
+ __wt_errx(session, "join has incompatible strategy "
+ "values for the same index");
+ WT_ERR(EINVAL);
+ }
+ /*
+ * Check against other comparisons (we call them endpoints)
+ * already set up for this index.
+ * We allow either:
+ * - one or more "eq" (with disjunction)
+ * - exactly one "eq" (with conjunction)
+ * - exactly one of "gt" or "ge" (conjunction or disjunction)
+ * - exactly one of "lt" or "le" (conjunction or disjunction)
+ * - one of "gt"/"ge" along with one of "lt"/"le"
+ * (currently restricted to conjunction).
+ *
+ * Some other combinations, although expressible either do
+ * not make sense (X == 3 AND X == 5) or are reducible (X <
+ * 7 AND X < 9). Other specific cases of (X < 7 OR X > 15)
+ * or (X == 4 OR X > 15) make sense but we don't handle yet.
+ */
+ for (i = 0; i < entry->ends_next; i++) {
+ end = &entry->ends[i];
+ range_eq = (range == WT_CURJOIN_END_EQ);
+ if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
+ ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
+ (F_ISSET(end, WT_CURJOIN_END_LT) &&
+ ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
+ (end->flags == WT_CURJOIN_END_EQ &&
+ (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
+ != 0)) {
+ __wt_errx(session,
+ "join has overlapping ranges");
+ WT_ERR(EINVAL);
+ }
+ if (range == WT_CURJOIN_END_EQ &&
+ end->flags == WT_CURJOIN_END_EQ &&
+ !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) {
+ __wt_errx(session,
+ "compare=eq can only be combined "
+ "using operation=or");
+ WT_ERR(EINVAL);
+ }
+
+ /*
+ * Sort "gt"/"ge" to the front, followed by any number
+ * of "eq", and finally "lt"/"le".
+ */
+ if (!hasins &&
+ ((range & WT_CURJOIN_END_GT) != 0 ||
+ (range == WT_CURJOIN_END_EQ &&
+ !F_ISSET(end, WT_CURJOIN_END_GT)))) {
+ ins = i;
+ hasins = true;
+ }
+ }
+ /* All checks completed, merge any new configuration now */
+ /*
+ * NOTE(review): a zero count passes the check above and then
+ * overwrites a previously configured non-zero count here;
+ * confirm that is intended.
+ */
+ entry->count = count;
+ entry->bloom_bit_count =
+ WT_MAX(entry->bloom_bit_count, bloom_bit_count);
+ entry->bloom_hash_count =
+ WT_MAX(entry->bloom_hash_count, bloom_hash_count);
+ }
+ /*
+ * Make room for the new endpoint: at the chosen insertion point if
+ * one was found above, otherwise at the end of the list.
+ */
+ WT_ERR(__wt_realloc_def(session, &entry->ends_allocated,
+ entry->ends_next + 1, &entry->ends));
+ if (!hasins)
+ ins = entry->ends_next;
+ newend = &entry->ends[ins];
+ memmove(newend + 1, newend,
+ (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
+ memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
+ entry->ends_next++;
+ newend->cursor = ref_cursor;
+ F_SET(newend, range);
+
+ /* Open the main file with a projection of the indexed columns. */
+ if (entry->main == NULL && entry->index != NULL) {
+ namesize = strlen(cjoin->table->name);
+ newsize = namesize + entry->index->colconf.len + 1;
+ WT_ERR(__wt_calloc(session, 1, newsize, &main_uri));
+ snprintf(main_uri, newsize, "%s%.*s",
+ cjoin->table->name, (int)entry->index->colconf.len,
+ entry->index->colconf.str);
+ WT_ERR(__wt_open_cursor(session, main_uri,
+ (WT_CURSOR *)cjoin, raw_cfg, &entry->main));
+ }
+
+ /* The URI buffer is only needed while opening the cursor. */
+err: if (main_uri != NULL)
+ __wt_free(session, main_uri);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
index 81d028c165a..65d2dc81406 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_stat.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -103,7 +103,7 @@ __curstat_get_value(WT_CURSOR *cursor, ...)
va_list ap;
size_t size;
uint64_t *v;
- const char **p;
+ const char *desc, **p;
cst = (WT_CURSOR_STAT *)cursor;
va_start(ap, cursor);
@@ -111,15 +111,13 @@ __curstat_get_value(WT_CURSOR *cursor, ...)
WT_CURSOR_NEEDVALUE(cursor);
+ WT_ERR(cst->stats_desc(cst, WT_STAT_KEY_OFFSET(cst), &desc));
if (F_ISSET(cursor, WT_CURSTD_RAW)) {
WT_ERR(__wt_struct_size(session, &size, cursor->value_format,
- cst->stats_desc(WT_STAT_KEY_OFFSET(cst)),
- cst->pv.data, cst->v));
+ desc, cst->pv.data, cst->v));
WT_ERR(__wt_buf_initsize(session, &cursor->value, size));
WT_ERR(__wt_struct_pack(session, cursor->value.mem, size,
- cursor->value_format,
- cst->stats_desc(WT_STAT_KEY_OFFSET(cst)),
- cst->pv.data, cst->v));
+ cursor->value_format, desc, cst->pv.data, cst->v));
item = va_arg(ap, WT_ITEM *);
item->data = cursor->value.data;
@@ -130,7 +128,7 @@ __curstat_get_value(WT_CURSOR *cursor, ...)
* pointer support isn't documented, but it's a cheap test.
*/
if ((p = va_arg(ap, const char **)) != NULL)
- *p = cst->stats_desc(WT_STAT_KEY_OFFSET(cst));
+ *p = desc;
if ((p = va_arg(ap, const char **)) != NULL)
*p = cst->pv.data;
if ((v = va_arg(ap, uint64_t *)) != NULL)
@@ -201,7 +199,9 @@ __curstat_next(WT_CURSOR *cursor)
/* Initialize on demand. */
if (cst->notinitialized) {
WT_ERR(__wt_curstat_init(
- session, cursor->internal_uri, cst->cfg, cst));
+ session, cursor->internal_uri, NULL, cst->cfg, cst));
+ if (cst->next_set != NULL)
+ WT_ERR((*cst->next_set)(session, cst, true, true));
cst->notinitialized = false;
}
@@ -211,15 +211,19 @@ __curstat_next(WT_CURSOR *cursor)
cst->key = WT_STAT_KEY_MIN(cst);
} else if (cst->key < WT_STAT_KEY_MAX(cst))
++cst->key;
- else {
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ else if (cst->next_set != NULL)
+ WT_ERR((*cst->next_set)(session, cst, true, false));
+ else
WT_ERR(WT_NOTFOUND);
- }
+
cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)];
WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
-err: API_END_RET(session, ret);
+ if (0) {
+err: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ }
+ API_END_RET(session, ret);
}
/*
@@ -239,7 +243,9 @@ __curstat_prev(WT_CURSOR *cursor)
/* Initialize on demand. */
if (cst->notinitialized) {
WT_ERR(__wt_curstat_init(
- session, cursor->internal_uri, cst->cfg, cst));
+ session, cursor->internal_uri, NULL, cst->cfg, cst));
+ if (cst->next_set != NULL)
+ WT_ERR((*cst->next_set)(session, cst, false, true));
cst->notinitialized = false;
}
@@ -249,16 +255,19 @@ __curstat_prev(WT_CURSOR *cursor)
cst->key = WT_STAT_KEY_MAX(cst);
} else if (cst->key > WT_STAT_KEY_MIN(cst))
--cst->key;
- else {
- F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ else if (cst->next_set != NULL)
+ WT_ERR((*cst->next_set)(session, cst, false, false));
+ else
WT_ERR(WT_NOTFOUND);
- }
cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)];
WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
-err: API_END_RET(session, ret);
+ if (0) {
+err: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ }
+ API_END_RET(session, ret);
}
/*
@@ -301,7 +310,7 @@ __curstat_search(WT_CURSOR *cursor)
/* Initialize on demand. */
if (cst->notinitialized) {
WT_ERR(__wt_curstat_init(
- session, cursor->internal_uri, cst->cfg, cst));
+ session, cursor->internal_uri, NULL, cst->cfg, cst));
cst->notinitialized = false;
}
@@ -332,6 +341,7 @@ __curstat_close(WT_CURSOR *cursor)
__curstat_free_config(session, cst);
__wt_buf_free(session, &cst->pv);
+ __wt_free(session, cst->desc_buf);
WT_ERR(__wt_cursor_close(cursor));
@@ -426,12 +436,102 @@ __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst)
}
/*
+ * __curstat_join_next_set --
+ * Point the statistics cursor at another join entry's statistics: the
+ * first/last entry when init is set, else the next/previous one.
+ */
+static int
+__curstat_join_next_set(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst,
+ bool forw, bool init)
+{
+ WT_CURSOR_JOIN *cjoin;
+ WT_JOIN_STATS_GROUP *join_group;
+ ssize_t pos;
+
+ WT_ASSERT(session, WT_STREQ(cst->iface.uri, "statistics:join"));
+ join_group = &cst->u.join_stats_group;
+ cjoin = join_group->join_cursor;
+ if (init)
+ pos = forw ? 0 : cjoin->entries_next - 1;
+ else
+ pos = join_group->join_cursor_entry + (forw ? 1 : -1);
+ if (pos < 0 || (size_t)pos >= cjoin->entries_next)
+ return (WT_NOTFOUND); /* no entry in that direction */
+
+ join_group->join_cursor_entry = pos;
+ if (cjoin->entries[pos].index == NULL) {
+ WT_ASSERT(session, WT_PREFIX_MATCH(cjoin->iface.uri, "join:"));
+ join_group->desc_prefix = cjoin->iface.uri + 5; /* skip "join:" */
+ } else
+ join_group->desc_prefix = cjoin->entries[pos].index->name;
+ join_group->join_stats = cjoin->entries[pos].stats; /* copied by value */
+ if (!init) /* new set: restart the key at the end we entered from */
+ cst->key = forw ? WT_STAT_KEY_MIN(cst) : WT_STAT_KEY_MAX(cst);
+ return (0);
+}
+
+/*
+ * __curstat_join_desc --
+ * Format "join: <prefix><description>" for a slot into cst->desc_buf.
+ */
+static int
+__curstat_join_desc(WT_CURSOR_STAT *cst, int slot, const char **resultp)
+{
+ size_t len;
+ const char *static_desc;
+ WT_JOIN_STATS_GROUP *sgrp;
+ WT_SESSION_IMPL *session;
+
+ sgrp = &cst->u.join_stats_group;
+ session = (WT_SESSION_IMPL *)sgrp->join_cursor->iface.session;
+ WT_RET(__wt_stat_join_desc(cst, slot, &static_desc));
+ len = strlen("join: ") + strlen(sgrp->desc_prefix) +
+ strlen(static_desc) + 1; /* + 1 for the terminating nul */
+ WT_RET(__wt_realloc(session, NULL, len, &cst->desc_buf));
+ snprintf(cst->desc_buf, len, "join: %s%s", sgrp->desc_prefix,
+ static_desc);
+ *resultp = cst->desc_buf; /* overwritten by the next call */
+ return (0);
+}
+
+/*
+ * __curstat_join_init --
+ * Bind a statistics cursor to a join cursor and install the join callbacks.
+ */
+static int
+__curstat_join_init(WT_SESSION_IMPL *session,
+ WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ WT_CURSOR_JOIN *cjoin;
+ WT_DECL_RET;
+
+ WT_UNUSED(cfg);
+
+ if (curjoin == NULL && cst->u.join_stats_group.join_cursor != NULL)
+ curjoin = &cst->u.join_stats_group.join_cursor->iface; /* reuse prior binding */
+ if (curjoin == NULL || !WT_PREFIX_MATCH(curjoin->uri, "join:"))
+ WT_ERR_MSG(session, EINVAL,
+ "join cursor must be used with statistics:join");
+ cjoin = (WT_CURSOR_JOIN *)curjoin;
+ memset(&cst->u.join_stats_group, 0, sizeof(WT_JOIN_STATS_GROUP));
+ cst->u.join_stats_group.join_cursor = cjoin;
+
+ cst->stats = (int64_t *)&cst->u.join_stats_group.join_stats;
+ cst->stats_base = WT_JOIN_STATS_BASE;
+ cst->stats_count = sizeof(WT_JOIN_STATS) / sizeof(int64_t); /* assumes int64_t-only fields - confirm */
+ cst->stats_desc = __curstat_join_desc;
+ cst->next_set = __curstat_join_next_set;
+
+err: return (ret);
+}
+
+/*
* __wt_curstat_init --
* Initialize a statistics cursor.
*/
int
__wt_curstat_init(WT_SESSION_IMPL *session,
- const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+ const char *uri, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst)
{
const char *dsrc_uri;
@@ -442,6 +542,10 @@ __wt_curstat_init(WT_SESSION_IMPL *session,
dsrc_uri = uri + strlen("statistics:");
+ if (WT_STREQ(dsrc_uri, "join"))
+ return (
+ __curstat_join_init(session, curjoin, cfg, cst));
+
if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:"))
return (
__wt_curstat_colgroup_init(session, dsrc_uri, cfg, cst));
@@ -467,7 +571,7 @@ __wt_curstat_init(WT_SESSION_IMPL *session,
*/
int
__wt_curstat_open(WT_SESSION_IMPL *session,
- const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+ const char *uri, WT_CURSOR *other, const char *cfg[], WT_CURSOR **cursorp)
{
WT_CONNECTION_IMPL *conn;
WT_CURSOR_STATIC_INIT(iface,
@@ -581,7 +685,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
* objects like tables, we need to a valid set of statistics when before
* the open returns.
*/
- WT_ERR(__wt_curstat_init(session, uri, cst->cfg, cst));
+ WT_ERR(__wt_curstat_init(session, uri, other, cst->cfg, cst));
cst->notinitialized = false;
/* The cursor isn't yet positioned. */
diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c
index 38359236b27..dca72a16ee5 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_table.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_table.c
@@ -186,34 +186,16 @@ __wt_curtable_get_key(WT_CURSOR *cursor, ...)
int
__wt_curtable_get_value(WT_CURSOR *cursor, ...)
{
- WT_CURSOR *primary;
- WT_CURSOR_TABLE *ctable;
WT_DECL_RET;
- WT_ITEM *item;
WT_SESSION_IMPL *session;
va_list ap;
- ctable = (WT_CURSOR_TABLE *)cursor;
- primary = *ctable->cg_cursors;
- CURSOR_API_CALL(cursor, session, get_value, NULL);
- WT_CURSOR_NEEDVALUE(primary);
-
va_start(ap, cursor);
- if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) {
- ret = __wt_schema_project_merge(session,
- ctable->cg_cursors, ctable->plan,
- cursor->value_format, &cursor->value);
- if (ret == 0) {
- item = va_arg(ap, WT_ITEM *);
- item->data = cursor->value.data;
- item->size = cursor->value.size;
- }
- } else
- ret = __wt_schema_project_out(session,
- ctable->cg_cursors, ctable->plan, ap);
- va_end(ap);
+ JOINABLE_CURSOR_API_CALL(cursor, session, get_value, NULL);
+ WT_ERR(__wt_curtable_get_valuev(cursor, ap));
-err: API_END_RET(session, ret);
+err: va_end(ap);
+ API_END_RET(session, ret);
}
/*
@@ -264,7 +246,7 @@ __wt_curtable_set_value(WT_CURSOR *cursor, ...)
u_int i;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_API_CALL(cursor, session, set_value, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, set_value, NULL);
va_start(ap, cursor);
if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON)) {
@@ -332,7 +314,7 @@ __curtable_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
WT_DECL_RET;
WT_SESSION_IMPL *session;
- CURSOR_API_CALL(a, session, compare, NULL);
+ JOINABLE_CURSOR_API_CALL(a, session, compare, NULL);
/*
* Confirm both cursors refer to the same source and have keys, then
@@ -362,7 +344,7 @@ __curtable_next(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_API_CALL(cursor, session, next, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, next, NULL);
APPLY_CG(ctable, next);
err: API_END_RET(session, ret);
@@ -383,7 +365,7 @@ __curtable_next_random(WT_CURSOR *cursor)
u_int i;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_API_CALL(cursor, session, next, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, next, NULL);
cp = ctable->cg_cursors;
/* Split out the first next, it retrieves the random record. */
@@ -414,7 +396,7 @@ __curtable_prev(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_API_CALL(cursor, session, prev, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, prev, NULL);
APPLY_CG(ctable, prev);
err: API_END_RET(session, ret);
@@ -432,7 +414,7 @@ __curtable_reset(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_API_CALL(cursor, session, reset, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, reset, NULL);
APPLY_CG(ctable, reset);
err: API_END_RET(session, ret);
@@ -450,7 +432,7 @@ __curtable_search(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_API_CALL(cursor, session, search, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, search, NULL);
APPLY_CG(ctable, search);
err: API_END_RET(session, ret);
@@ -470,7 +452,7 @@ __curtable_search_near(WT_CURSOR *cursor, int *exact)
u_int i;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_API_CALL(cursor, session, search_near, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, search_near, NULL);
cp = ctable->cg_cursors;
primary = *cp;
WT_ERR(primary->search_near(primary, exact));
@@ -501,7 +483,7 @@ __curtable_insert(WT_CURSOR *cursor)
u_int i;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+ JOINABLE_CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
WT_ERR(__curtable_open_indices(ctable));
/*
@@ -520,29 +502,38 @@ __curtable_insert(WT_CURSOR *cursor)
if (ctable->table->nindices > 0)
F_CLR(primary, WT_CURSTD_OVERWRITE);
ret = primary->insert(primary);
- F_SET(primary, flag_orig);
- if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
- /*
- * !!!
- * The insert failure clears these flags, but does not touch the
- * items. We could make a copy each time for overwrite cursors,
- * but for now we just reset the flags.
- */
- F_SET(primary, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
- ret = __curtable_update(cursor);
- goto err;
- }
- WT_ERR(ret);
+ /*
+ * !!!
+ * WT_CURSOR.insert clears the set internally/externally flags
+ * but doesn't touch the items. We could make a copy each time
+ * for overwrite cursors, but for now we just reset the flags.
+ */
+ F_SET(primary, flag_orig | WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
- for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
- (*cp)->recno = primary->recno;
- WT_ERR((*cp)->insert(*cp));
+ if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE))
+ WT_ERR(__curtable_update(cursor));
+ else {
+ WT_ERR(ret);
+
+ for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->recno = primary->recno;
+ WT_ERR((*cp)->insert(*cp));
+ }
+
+ WT_ERR(__apply_idx(ctable, offsetof(WT_CURSOR, insert), false));
}
- WT_ERR(__apply_idx(ctable, offsetof(WT_CURSOR, insert), false));
+ /*
+ * WT_CURSOR.insert doesn't leave the cursor positioned, and the
+ * application may want to free the memory used to configure the
+ * insert; don't read that memory again (matching the underlying
+ * file object cursor insert semantics).
+ */
+ F_CLR(primary, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
err: CURSOR_UPDATE_API_END(session, ret);
+
return (ret);
}
@@ -559,7 +550,7 @@ __curtable_update(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+ JOINABLE_CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
WT_ERR(__curtable_open_indices(ctable));
/*
@@ -610,7 +601,7 @@ __curtable_remove(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_REMOVE_API_CALL(cursor, session, NULL);
+ JOINABLE_CURSOR_REMOVE_API_CALL(cursor, session, NULL);
WT_ERR(__curtable_open_indices(ctable));
/* Find the old record so it can be removed from indices */
@@ -650,6 +641,7 @@ __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop)
/* Open any indices. */
WT_RET(__curtable_open_indices(ctable));
WT_RET(__wt_scr_alloc(session, 128, &key));
+ WT_STAT_FAST_DATA_INCR(session, cursor_truncate);
/*
* Step through the cursor range, removing the index entries.
@@ -721,7 +713,7 @@ __curtable_close(WT_CURSOR *cursor)
u_int i;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_API_CALL(cursor, session, close, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, close, NULL);
if (ctable->cg_cursors != NULL)
for (i = 0, cp = ctable->cg_cursors;
@@ -844,7 +836,7 @@ __curtable_open_indices(WT_CURSOR_TABLE *ctable)
*/
int
__wt_curtable_open(WT_SESSION_IMPL *session,
- const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
WT_CURSOR_STATIC_INIT(iface,
__wt_curtable_get_key, /* get-key */
@@ -935,7 +927,7 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
}
WT_ERR(__wt_cursor_init(
- cursor, cursor->internal_uri, NULL, cfg, cursorp));
+ cursor, cursor->internal_uri, owner, cfg, cursorp));
if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
WT_ERR(__wt_json_column_init(cursor, table->key_format,
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 18335d6fb5e..fa6c4f4313f 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -183,10 +183,10 @@ __evict_server(void *arg)
session, &conn->dhandle_lock)) == EBUSY &&
!F_ISSET(cache, WT_CACHE_CLEAR_WALKS);
spins++) {
- if (spins < 1000)
+ if (spins < WT_THOUSAND)
__wt_yield();
else
- __wt_sleep(0, 1000);
+ __wt_sleep(0, WT_THOUSAND);
}
/*
* If we gave up acquiring the lock, that indicates a
@@ -210,7 +210,7 @@ __evict_server(void *arg)
else {
/* After being stuck for 5 minutes, give up. */
WT_ERR(__wt_epoch(session, &now));
- if (WT_TIMEDIFF(now, stuck_ts) / WT_BILLION > 300) {
+ if (WT_TIMEDIFF_SEC(now, stuck_ts) > 300) {
__wt_errx(session,
"Cache stuck for too long, giving up");
(void)__wt_cache_dump(session, NULL);
@@ -601,7 +601,7 @@ __evict_pass(WT_SESSION_IMPL *session)
* that can free space in cache, such as LSM discarding
* handles.
*/
- __wt_sleep(0, 1000 * (uint64_t)loop);
+ __wt_sleep(0, WT_THOUSAND * (uint64_t)loop);
if (loop == 100) {
/*
* Mark the cache as stuck if we need space
@@ -992,10 +992,10 @@ retry: while (slot < max_entries && ret == 0) {
session, &conn->dhandle_lock)) == EBUSY &&
!F_ISSET(cache, WT_CACHE_CLEAR_WALKS);
spins++) {
- if (spins < 1000)
+ if (spins < WT_THOUSAND)
__wt_yield();
else
- __wt_sleep(0, 1000);
+ __wt_sleep(0, WT_THOUSAND);
}
if (ret != 0)
break;
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index e49098e90db..94c969fa5bb 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -179,9 +179,17 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* something is busy, be sure that the page still ends up
* marked deleted.
*/
- if (ndeleted > pindex->entries / 10 && pindex->entries > 1 &&
- (ret = __wt_split_reverse(session, ref)) != EBUSY)
- return (ret);
+ if (ndeleted > pindex->entries / 10 && pindex->entries > 1) {
+ if ((ret = __wt_split_reverse(session, ref)) == 0)
+ return (0);
+ WT_RET_BUSY_OK(ret);
+
+ /*
+ * The child must be locked after a failed reverse
+ * split.
+ */
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+ }
}
WT_PUBLISH(ref->state, WT_REF_DELETED);
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
index 74c58845c43..4821b450f9e 100644
--- a/src/third_party/wiredtiger/src/include/api.h
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -116,11 +116,23 @@
API_CALL_NOCONF(s, WT_CURSOR, n, cur, \
((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+#define JOINABLE_CURSOR_CALL_CHECK(cur) \
+ if (F_ISSET(cur, WT_CURSTD_JOINED)) \
+ WT_ERR(__wt_curindex_joined(cur))
+
+#define JOINABLE_CURSOR_API_CALL(cur, s, n, bt) \
+ CURSOR_API_CALL(cur, s, n, bt); \
+ JOINABLE_CURSOR_CALL_CHECK(cur)
+
#define CURSOR_REMOVE_API_CALL(cur, s, bt) \
(s) = (WT_SESSION_IMPL *)(cur)->session; \
TXN_API_CALL_NOCONF(s, WT_CURSOR, remove, cur, \
((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle);
+#define JOINABLE_CURSOR_REMOVE_API_CALL(cur, s, bt) \
+ CURSOR_REMOVE_API_CALL(cur, s, bt); \
+ JOINABLE_CURSOR_CALL_CHECK(cur)
+
#define CURSOR_UPDATE_API_CALL(cur, s, n, bt) \
(s) = (WT_SESSION_IMPL *)(cur)->session; \
TXN_API_CALL_NOCONF(s, WT_CURSOR, n, cur, \
@@ -128,6 +140,10 @@
if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && __wt_cache_full(s)) \
WT_ERR(WT_CACHE_FULL);
+#define JOINABLE_CURSOR_UPDATE_API_CALL(cur, s, n, bt) \
+ CURSOR_UPDATE_API_CALL(cur, s, n, bt); \
+ JOINABLE_CURSOR_CALL_CHECK(cur)
+
#define CURSOR_UPDATE_API_END(s, ret) \
TXN_API_END(s, ret)
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 02819237c13..ae29dc68003 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -437,24 +437,10 @@ struct __wt_page {
uint32_t deleted_entries;
WT_REF **index;
} * volatile __index; /* Collated children */
-
- /*
- * When splitting to deepen the tree, track the number
- * of entries in the newly created parent, and how many
- * subsequent splits follow the initial set of entries.
- * If future splits into the page are generally after
- * the initial set of items, perform future deepening
- * splits in this page to optimize for an append-style
- * workload.
- */
- uint32_t deepen_split_append;
- uint32_t deepen_split_last;
} intl;
#undef pg_intl_recno
#define pg_intl_recno u.intl.recno
#define pg_intl_parent_ref u.intl.parent_ref
-#define pg_intl_deepen_split_append u.intl.deepen_split_append
-#define pg_intl_deepen_split_last u.intl.deepen_split_last
/*
* Macros to copy/set the index because the name is obscured to ensure
@@ -581,7 +567,8 @@ struct __wt_page {
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
#define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */
#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
-#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */
+#define WT_PAGE_SPLIT_BLOCK 0x40 /* Split blocking eviction and splits */
+#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
uint8_t unused[2]; /* Unused padding */
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 23e212eb772..a92d52e784a 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -1101,16 +1101,17 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
return (false);
/*
- * If the tree was deepened, there's a requirement that newly created
- * internal pages not be evicted until all threads are known to have
- * exited the original page index array, because evicting an internal
- * page discards its WT_REF array, and a thread traversing the original
- * page index array might see a freed WT_REF. During the split we set
- * a transaction value, once that's globally visible, we know we can
- * evict the created page.
+ * If a split created new internal pages, those newly created internal
+ * pages cannot be evicted until all threads are known to have exited
+ * the original parent page's index, because evicting an internal page
+ * discards its WT_REF array, and a thread traversing the original
+ * parent page index might see a freed WT_REF. During the split we set
+ * a transaction value, we can evict the created page as soon as that
+ * transaction value is globally visible.
*/
if (check_splits && WT_PAGE_IS_INTERNAL(page) &&
- !__wt_txn_visible_all(session, mod->mod_split_txn))
+ (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK) ||
+ !__wt_txn_visible_all(session, mod->mod_split_txn)))
return (false);
/*
@@ -1374,3 +1375,34 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize)
return (child->memory_footprint > maxsize);
}
+
+/*
+ * __wt_split_intl_race --
+ * Return if we raced with an internal page split when descending the tree.
+ */
+static inline bool
+__wt_split_intl_race(
+ WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE_INDEX *saved_pindex)
+{
+ WT_PAGE_INDEX *pindex;
+
+ /*
+ * A place to hang this comment...
+ *
+ * There's a page-split race when we walk the tree: if we're splitting
+ * an internal page into its parent, we update the parent's page index
+ * and then update the page being split, and it's not an atomic update.
+ * A thread could read the parent page's original page index, and then
+ * read the page's replacement index. Because internal page splits work
+ * by replacing the original page with the initial part of the original
+ * page, the result of this race is we will have a key that's past the
+ * end of the current page, and the parent's page index will have moved.
+ *
+ * It's also possible a thread could read the parent page's replacement
+ * page index, and then read the page's original index. Because internal
+ * splits work by truncating the original page, the original page's old
+ * content is compatible, this isn't a problem and we ignore this race.
+ */
+ WT_INTL_INDEX_GET(session, parent, pindex); /* presumably loads the parent's current index */
+ return (pindex != saved_pindex); /* true: the index was replaced */
+}
diff --git a/src/third_party/wiredtiger/src/include/config.h b/src/third_party/wiredtiger/src/include/config.h
index 408639ab2a9..e836abaccba 100644
--- a/src/third_party/wiredtiger/src/include/config.h
+++ b/src/third_party/wiredtiger/src/include/config.h
@@ -68,28 +68,29 @@ struct __wt_config_parser_impl {
#define WT_CONFIG_ENTRY_WT_SESSION_compact 16
#define WT_CONFIG_ENTRY_WT_SESSION_create 17
#define WT_CONFIG_ENTRY_WT_SESSION_drop 18
-#define WT_CONFIG_ENTRY_WT_SESSION_log_flush 19
-#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 20
-#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 21
-#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 22
-#define WT_CONFIG_ENTRY_WT_SESSION_rename 23
-#define WT_CONFIG_ENTRY_WT_SESSION_reset 24
-#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 25
-#define WT_CONFIG_ENTRY_WT_SESSION_salvage 26
-#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 27
-#define WT_CONFIG_ENTRY_WT_SESSION_strerror 28
-#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 29
-#define WT_CONFIG_ENTRY_WT_SESSION_truncate 30
-#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 31
-#define WT_CONFIG_ENTRY_WT_SESSION_verify 32
-#define WT_CONFIG_ENTRY_colgroup_meta 33
-#define WT_CONFIG_ENTRY_file_meta 34
-#define WT_CONFIG_ENTRY_index_meta 35
-#define WT_CONFIG_ENTRY_table_meta 36
-#define WT_CONFIG_ENTRY_wiredtiger_open 37
-#define WT_CONFIG_ENTRY_wiredtiger_open_all 38
-#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 39
-#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 40
+#define WT_CONFIG_ENTRY_WT_SESSION_join 19
+#define WT_CONFIG_ENTRY_WT_SESSION_log_flush 20
+#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 21
+#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 22
+#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 23
+#define WT_CONFIG_ENTRY_WT_SESSION_rename 24
+#define WT_CONFIG_ENTRY_WT_SESSION_reset 25
+#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 26
+#define WT_CONFIG_ENTRY_WT_SESSION_salvage 27
+#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 28
+#define WT_CONFIG_ENTRY_WT_SESSION_strerror 29
+#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 30
+#define WT_CONFIG_ENTRY_WT_SESSION_truncate 31
+#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 32
+#define WT_CONFIG_ENTRY_WT_SESSION_verify 33
+#define WT_CONFIG_ENTRY_colgroup_meta 34
+#define WT_CONFIG_ENTRY_file_meta 35
+#define WT_CONFIG_ENTRY_index_meta 36
+#define WT_CONFIG_ENTRY_table_meta 37
+#define WT_CONFIG_ENTRY_wiredtiger_open 38
+#define WT_CONFIG_ENTRY_wiredtiger_open_all 39
+#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 40
+#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 41
/*
* configuration section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 35a83d7c50f..3e8d3705373 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -365,13 +365,13 @@ struct __wt_connection_impl {
WT_SESSION_IMPL *meta_ckpt_session;/* Metadata checkpoint session */
- WT_SESSION_IMPL *sweep_session; /* Handle sweep session */
- wt_thread_t sweep_tid; /* Handle sweep thread */
- int sweep_tid_set; /* Handle sweep thread set */
- WT_CONDVAR *sweep_cond; /* Handle sweep wait mutex */
- time_t sweep_idle_time;/* Handle sweep idle time */
- time_t sweep_interval;/* Handle sweep interval */
- u_int sweep_handles_min;/* Handle sweep minimum open */
+ WT_SESSION_IMPL *sweep_session; /* Handle sweep session */
+ wt_thread_t sweep_tid; /* Handle sweep thread */
+ int sweep_tid_set; /* Handle sweep thread set */
+ WT_CONDVAR *sweep_cond; /* Handle sweep wait mutex */
+ uint64_t sweep_idle_time; /* Handle sweep idle time */
+ uint64_t sweep_interval; /* Handle sweep interval */
+ uint64_t sweep_handles_min;/* Handle sweep minimum open */
/*
* Shared lookaside lock, session and cursor, used by threads accessing
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
index 1cbe76216b1..23d3f3745db 100644
--- a/src/third_party/wiredtiger/src/include/cursor.h
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -264,6 +264,66 @@ struct __wt_cursor_index {
uint8_t *cg_needvalue;
};
+struct __wt_cursor_join_iter {
+ WT_SESSION_IMPL *session;
+ WT_CURSOR_JOIN *cjoin; /* presumably the owning join cursor - confirm */
+ WT_CURSOR_JOIN_ENTRY *entry; /* presumably the entry iterated - confirm */
+ WT_CURSOR *cursor;
+ WT_ITEM *curkey;
+ bool advance;
+};
+
+struct __wt_cursor_join_endpoint {
+ WT_ITEM key; /* data is ours to free if OWN_KEY is set */
+ uint8_t recno_buf[10]; /* holds packed recno */
+ WT_CURSOR *cursor; /* position the range flags refer to */
+
+#define WT_CURJOIN_END_LT 0x01 /* include values < cursor */
+#define WT_CURJOIN_END_EQ 0x02 /* include values == cursor */
+#define WT_CURJOIN_END_GT 0x04 /* include values > cursor */
+#define WT_CURJOIN_END_GE (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ)
+#define WT_CURJOIN_END_LE (WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ)
+#define WT_CURJOIN_END_OWN_KEY 0x08 /* must free key's data */
+ uint8_t flags; /* range for this endpoint */
+};
+
+struct __wt_cursor_join_entry {
+ WT_INDEX *index; /* joined index, may be NULL */
+ WT_CURSOR *main; /* raw main table cursor */
+ WT_BLOOM *bloom; /* Bloom filter handle */
+ uint64_t bloom_bit_count; /* bits per item in bloom */
+ uint64_t bloom_hash_count; /* hash functions in bloom */
+ uint64_t count; /* approx number of matches */
+
+#define WT_CURJOIN_ENTRY_BLOOM 0x01 /* use a bloom filter */
+#define WT_CURJOIN_ENTRY_DISJUNCTION 0x02 /* endpoints are or-ed */
+#define WT_CURJOIN_ENTRY_OWN_BLOOM 0x04 /* this entry owns the bloom */
+ uint8_t flags;
+
+ WT_CURSOR_JOIN_ENDPOINT *ends; /* reference endpoints */
+ size_t ends_allocated; /* presumably allocation size of ends - confirm */
+ size_t ends_next; /* presumably count of ends in use - confirm */
+
+ WT_JOIN_STATS stats; /* Join statistics */
+};
+
+struct __wt_cursor_join {
+ WT_CURSOR iface;
+
+ WT_TABLE *table; /* main table of the join */
+ const char *projection;
+ WT_CURSOR_JOIN_ITER *iter;
+ WT_CURSOR_JOIN_ENTRY *entries; /* valid slots: 0 .. entries_next - 1 */
+ size_t entries_allocated; /* presumably allocation size of entries - confirm */
+ u_int entries_next; /* number of entries in use */
+ uint8_t recno_buf[10]; /* holds packed recno */
+
+#define WT_CURJOIN_ERROR 0x01 /* Error in initialization */
+#define WT_CURJOIN_INITIALIZED 0x02 /* Successful initialization */
+#define WT_CURJOIN_SKIP_FIRST_LEFT 0x04 /* First check not needed */
+ uint8_t flags;
+};
+
struct __wt_cursor_json {
char *key_buf; /* JSON formatted string */
char *value_buf; /* JSON formatted string */
@@ -298,6 +358,13 @@ struct __wt_cursor_metadata {
uint32_t flags;
};
+struct __wt_join_stats_group {
+ const char *desc_prefix; /* Prefix appears before description */
+ WT_CURSOR_JOIN *join_cursor; /* Join cursor being reported on */
+ ssize_t join_cursor_entry; /* Position in entries */
+ WT_JOIN_STATS join_stats; /* Copy of that entry's statistics */
+};
+
struct __wt_cursor_stat {
WT_CURSOR iface;
@@ -307,14 +374,19 @@ struct __wt_cursor_stat {
int64_t *stats; /* Statistics */
int stats_base; /* Base statistics value */
int stats_count; /* Count of statistics values */
- const char *(*stats_desc)(int); /* Statistics descriptions */
+ int (*stats_desc)(WT_CURSOR_STAT *, int, const char **);
+ /* Statistics descriptions */
+ int (*next_set)(WT_SESSION_IMPL *, WT_CURSOR_STAT *, bool,
+ bool); /* Advance to next set */
union { /* Copies of the statistics */
WT_DSRC_STATS dsrc_stats;
WT_CONNECTION_STATS conn_stats;
+ WT_JOIN_STATS_GROUP join_stats_group;
} u;
const char **cfg; /* Original cursor configuration */
+ char *desc_buf; /* Saved description string */
int key; /* Current stats key */
uint64_t v; /* Current stats value */
diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i
index c6ce04cab6f..9dd280534b4 100644
--- a/src/third_party/wiredtiger/src/include/cursor.i
+++ b/src/third_party/wiredtiger/src/include/cursor.i
@@ -139,6 +139,70 @@ __curfile_leave(WT_CURSOR_BTREE *cbt)
}
/*
+ * __wt_curindex_get_valuev --
+ * Internal implementation of WT_CURSOR->get_value for index cursors.
+ */
+static inline int
+__wt_curindex_get_valuev(WT_CURSOR *cursor, va_list ap)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ session = (WT_SESSION_IMPL *)cursor->session;
+ WT_CURSOR_NEEDVALUE(cursor); /* presumably jumps to err if no value */
+
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { /* raw: one merged WT_ITEM */
+ ret = __wt_schema_project_merge(session,
+ cindex->cg_cursors, cindex->value_plan,
+ cursor->value_format, &cursor->value);
+ if (ret == 0) {
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data; /* points at cursor->value */
+ item->size = cursor->value.size;
+ }
+ } else /* presumably unpacks columns into the caller's arguments */
+ ret = __wt_schema_project_out(session,
+ cindex->cg_cursors, cindex->value_plan, ap);
+err: return (ret); /* ap is not ended here; the caller must va_end */
+}
+
+/*
+ * __wt_curtable_get_valuev --
+ * Internal implementation of WT_CURSOR->get_value for table cursors.
+ */
+static inline int
+__wt_curtable_get_valuev(WT_CURSOR *cursor, va_list ap)
+{
+ WT_CURSOR *primary;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ session = (WT_SESSION_IMPL *)cursor->session;
+ primary = *ctable->cg_cursors; /* first column-group cursor */
+ WT_CURSOR_NEEDVALUE(primary); /* presumably jumps to err if no value */
+
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { /* raw: one merged WT_ITEM */
+ ret = __wt_schema_project_merge(session,
+ ctable->cg_cursors, ctable->plan,
+ cursor->value_format, &cursor->value);
+ if (ret == 0) {
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data; /* points at cursor->value */
+ item->size = cursor->value.size;
+ }
+ } else /* presumably unpacks columns into the caller's arguments */
+ ret = __wt_schema_project_out(session,
+ ctable->cg_cursors, ctable->plan, ap);
+err: return (ret); /* ap is not ended here; the caller must va_end */
+}
+
+/*
* __wt_cursor_dhandle_incr_use --
* Increment the in-use counter in cursor's data source.
*/
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 032b94b7040..743a3c3ac31 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -83,6 +83,8 @@ extern int __wt_bloom_finalize(WT_BLOOM *bloom);
extern int __wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash);
extern int __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash);
extern int __wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key);
+extern int __wt_bloom_inmem_get(WT_BLOOM *bloom, WT_ITEM *key);
+extern int __wt_bloom_intersection(WT_BLOOM *bloom, WT_BLOOM *other);
extern int __wt_bloom_close(WT_BLOOM *bloom);
extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config);
extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]);
@@ -155,9 +157,9 @@ extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session);
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp);
extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref);
-extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op);
extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
@@ -274,7 +276,10 @@ extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **c
extern int __wt_curfile_update_check(WT_CURSOR *cursor);
extern int __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], bool bulk, bool bitmap, WT_CURSOR **cursorp);
extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curindex_joined(WT_CURSOR *cursor);
extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curjoin_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx, WT_CURSOR *ref_cursor, uint32_t flags, uint32_t range, uint64_t count, uint64_t bloom_bit_count, uint64_t bloom_hash_count);
extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, bool iskey, va_list ap);
extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor);
extern size_t __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode);
@@ -287,8 +292,8 @@ extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t
extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst);
-extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
-extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *other, const char *cfg[], WT_CURSOR **cursorp);
extern int __wt_cursor_notsup(WT_CURSOR *cursor);
extern int __wt_cursor_noop(WT_CURSOR *cursor);
extern void __wt_cursor_set_notsup(WT_CURSOR *cursor);
@@ -316,7 +321,7 @@ extern int __wt_curtable_get_value(WT_CURSOR *cursor, ...);
extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...);
extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...);
extern int __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop);
-extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
extern int __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop);
extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_evict_server_wake(WT_SESSION_IMPL *session);
@@ -523,7 +528,6 @@ extern uint64_t __wt_strtouq(const char *nptr, char **endptr, int base);
extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg);
extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid);
extern void __wt_thread_id(char *buf, size_t buflen);
-extern int __wt_seconds(WT_SESSION_IMPL *session, time_t *timep);
extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp);
extern void __wt_yield(void);
extern int __wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *buffer, size_t size, const char *fmt, ...);
@@ -534,6 +538,8 @@ extern int __wt_struct_confchk(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *v);
extern int __wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...);
extern int __wt_struct_pack(WT_SESSION_IMPL *session, void *buffer, size_t size, const char *fmt, ...);
extern int __wt_struct_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_struct_unpack_size(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, size_t *resultp);
+extern int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf, void **reallocp);
extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell);
extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size);
@@ -675,19 +681,24 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
extern void __wt_scr_discard(WT_SESSION_IMPL *session);
extern void *__wt_ext_scr_alloc( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size);
extern void __wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p);
-extern const char *__wt_stat_dsrc_desc(int slot);
+extern int __wt_stat_dsrc_desc(WT_CURSOR_STAT *cst, int slot, const char **p);
extern void __wt_stat_dsrc_init_single(WT_DSRC_STATS *stats);
extern void __wt_stat_dsrc_init(WT_DATA_HANDLE *handle);
extern void __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats);
extern void __wt_stat_dsrc_clear_all(WT_DSRC_STATS **stats);
extern void __wt_stat_dsrc_aggregate_single( WT_DSRC_STATS *from, WT_DSRC_STATS *to);
extern void __wt_stat_dsrc_aggregate( WT_DSRC_STATS **from, WT_DSRC_STATS *to);
-extern const char *__wt_stat_connection_desc(int slot);
+extern int __wt_stat_connection_desc(WT_CURSOR_STAT *cst, int slot, const char **p);
extern void __wt_stat_connection_init_single(WT_CONNECTION_STATS *stats);
extern void __wt_stat_connection_init(WT_CONNECTION_IMPL *handle);
extern void __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats);
extern void __wt_stat_connection_clear_all(WT_CONNECTION_STATS **stats);
extern void __wt_stat_connection_aggregate( WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *to);
+extern int __wt_stat_join_desc(WT_CURSOR_STAT *cst, int slot, const char **p);
+extern void __wt_stat_join_init_single(WT_JOIN_STATS *stats);
+extern void __wt_stat_join_clear_single(WT_JOIN_STATS *stats);
+extern void __wt_stat_join_clear_all(WT_JOIN_STATS **stats);
+extern void __wt_stat_join_aggregate( WT_JOIN_STATS **from, WT_JOIN_STATS *to);
extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
index 95fe18b9ecb..064349125cc 100644
--- a/src/third_party/wiredtiger/src/include/flags.h
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -36,6 +36,7 @@
#define WT_LOG_DSYNC 0x00000002
#define WT_LOG_FLUSH 0x00000004
#define WT_LOG_FSYNC 0x00000008
+#define WT_LOG_SYNC_ENABLED 0x00000010
#define WT_READ_CACHE 0x00000001
#define WT_READ_COMPACT 0x00000002
#define WT_READ_NO_EMPTY 0x00000004
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index eca77214b47..e542baec642 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -13,6 +13,7 @@
#define WT_UNUSED(var) (void)(var)
/* Basic constants. */
+#define WT_THOUSAND (1000)
#define WT_MILLION (1000000)
#define WT_BILLION (1000000000)
diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i
index 80096d0cf72..75068706b70 100644
--- a/src/third_party/wiredtiger/src/include/misc.i
+++ b/src/third_party/wiredtiger/src/include/misc.i
@@ -30,6 +30,22 @@ __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp)
}
/*
+ * __wt_seconds --
+ * Return the seconds since the Epoch.
+ * Calls __wt_epoch to fill a struct timespec and stores its tv_sec
+ * field in *timep; the tv_nsec field is not used. Returns 0 after the
+ * store. NOTE(review): WT_RET presumably returns __wt_epoch's non-zero
+ * result to the caller without touching *timep -- confirm.
+ */
+static inline int
+__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
+{
+ struct timespec t;
+
+ WT_RET(__wt_epoch(session, &t));
+
+ *timep = t.tv_sec; /* Whole seconds only. */
+
+ return (0);
+}
+
+/*
* __wt_verbose --
* Verbose message.
*/
diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i
index 54a9cc6f9fd..7eb042dd79f 100644
--- a/src/third_party/wiredtiger/src/include/mutex.i
+++ b/src/third_party/wiredtiger/src/include/mutex.i
@@ -18,7 +18,7 @@
/* Default to spinning 1000 times before yielding. */
#ifndef WT_SPIN_COUNT
-#define WT_SPIN_COUNT 1000
+#define WT_SPIN_COUNT WT_THOUSAND
#endif
/*
@@ -300,7 +300,7 @@ __wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
* situation happens if there are more threads than cores in the
* system and we're thrashing on shared resources.
*/
- if (++pause_cnt < 1000)
+ if (++pause_cnt < WT_THOUSAND)
WT_PAUSE();
else
__wt_sleep(0, 10);
@@ -329,7 +329,7 @@ __wt_fair_unlock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
#ifdef HAVE_DIAGNOSTIC
/*
* __wt_fair_islocked --
- * Test whether the lock is currently held
+ * Test whether the lock is currently held.
*/
static inline bool
__wt_fair_islocked(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h
index 4ba588111b8..d135fd9eb1f 100644
--- a/src/third_party/wiredtiger/src/include/os.h
+++ b/src/third_party/wiredtiger/src/include/os.h
@@ -65,9 +65,16 @@ typedef enum {
} \
} while (0)
-#define WT_TIMEDIFF(end, begin) \
- (1000000000 * (uint64_t)((end).tv_sec - (begin).tv_sec) + \
+#define WT_TIMEDIFF_NS(end, begin) \
+ (WT_BILLION * (uint64_t)((end).tv_sec - (begin).tv_sec) + \
+ (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec) /* Nanoseconds from begin to end; the arithmetic is unsigned, so the result wraps if end precedes begin. */
+#define WT_TIMEDIFF_US(end, begin) \
+ (WT_TIMEDIFF_NS((end), (begin)) / WT_THOUSAND) /* Nanosecond difference divided by WT_THOUSAND: microseconds. */
+#define WT_TIMEDIFF_MS(end, begin) \
+ (WT_TIMEDIFF_NS((end), (begin)) / WT_MILLION) /* Nanosecond difference divided by WT_MILLION: milliseconds. */
+#define WT_TIMEDIFF_SEC(end, begin) \
+ (WT_TIMEDIFF_NS((end), (begin)) / WT_BILLION) /* Nanosecond difference divided by WT_BILLION: seconds. */
+
#define WT_TIMECMP(t1, t2) \
((t1).tv_sec < (t2).tv_sec ? -1 : \
(t1).tv_sec == (t2.tv_sec) ? \
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index 1ebe253e5db..dfe7ee5c6cd 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -272,7 +272,8 @@ struct __wt_connection_stats {
int64_t cache_eviction_server_evicting;
int64_t cache_eviction_server_not_evicting;
int64_t cache_eviction_slow;
- int64_t cache_eviction_split;
+ int64_t cache_eviction_split_internal;
+ int64_t cache_eviction_split_leaf;
int64_t cache_eviction_walk;
int64_t cache_eviction_worker_evicting;
int64_t cache_inmem_split;
@@ -297,6 +298,7 @@ struct __wt_connection_stats {
int64_t cursor_restart;
int64_t cursor_search;
int64_t cursor_search_near;
+ int64_t cursor_truncate;
int64_t cursor_update;
int64_t dh_conn_handle_count;
int64_t dh_session_handles;
@@ -358,6 +360,8 @@ struct __wt_connection_stats {
int64_t page_read_blocked;
int64_t page_sleep;
int64_t read_io;
+ int64_t rec_page_delete;
+ int64_t rec_page_delete_fast;
int64_t rec_pages;
int64_t rec_pages_eviction;
int64_t rec_split_stashed_bytes;
@@ -378,7 +382,10 @@ struct __wt_connection_stats {
int64_t txn_fail_cache;
int64_t txn_pinned_checkpoint_range;
int64_t txn_pinned_range;
+ int64_t txn_pinned_snapshot_range;
int64_t txn_rollback;
+ int64_t txn_snapshots_created;
+ int64_t txn_snapshots_dropped;
int64_t txn_sync;
int64_t write_io;
};
@@ -432,7 +439,8 @@ struct __wt_dsrc_stats {
int64_t cache_eviction_fail;
int64_t cache_eviction_hazard;
int64_t cache_eviction_internal;
- int64_t cache_eviction_split;
+ int64_t cache_eviction_split_internal;
+ int64_t cache_eviction_split_leaf;
int64_t cache_inmem_split;
int64_t cache_inmem_splittable;
int64_t cache_overflow_value;
@@ -461,6 +469,7 @@ struct __wt_dsrc_stats {
int64_t cursor_restart;
int64_t cursor_search;
int64_t cursor_search_near;
+ int64_t cursor_truncate;
int64_t cursor_update;
int64_t cursor_update_bytes;
int64_t lsm_checkpoint_throttle;
@@ -476,6 +485,7 @@ struct __wt_dsrc_stats {
int64_t rec_overflow_key_leaf;
int64_t rec_overflow_value;
int64_t rec_page_delete;
+ int64_t rec_page_delete_fast;
int64_t rec_page_match;
int64_t rec_pages;
int64_t rec_pages_eviction;
@@ -486,4 +496,14 @@ struct __wt_dsrc_stats {
int64_t txn_update_conflict;
};
+/*
+ * Statistics entries for join cursors.
+ *
+ * NOTE(review): WT_JOIN_STATS_BASE is presumably the first public
+ * statistic ID for join cursors, following the WT_STAT_CONN_* IDs in the
+ * 1000s and the WT_STAT_DSRC_* IDs in the 2000s -- confirm.
+ */
+#define WT_JOIN_STATS_BASE 3000
+struct __wt_join_stats {
+ int64_t accesses;
+ int64_t actual_count;
+ int64_t bloom_false_positive;
+};
+
/* Statistics section: END */
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 044611d655e..08f73386090 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -574,11 +574,12 @@ struct __wt_cursor {
#define WT_CURSTD_KEY_EXT 0x0020 /* Key points out of the tree. */
#define WT_CURSTD_KEY_INT 0x0040 /* Key points into the tree. */
#define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT)
-#define WT_CURSTD_OPEN 0x0080
-#define WT_CURSTD_OVERWRITE 0x0100
-#define WT_CURSTD_RAW 0x0200
-#define WT_CURSTD_VALUE_EXT 0x0400 /* Value points out of the tree. */
-#define WT_CURSTD_VALUE_INT 0x0800 /* Value points into the tree. */
+#define WT_CURSTD_JOINED 0x0080
+#define WT_CURSTD_OPEN 0x0100
+#define WT_CURSTD_OVERWRITE 0x0200
+#define WT_CURSTD_RAW 0x0400
+#define WT_CURSTD_VALUE_EXT 0x0800 /* Value points out of the tree. */
+#define WT_CURSTD_VALUE_INT 0x1000 /* Value points into the tree. */
#define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT)
uint32_t flags;
#endif
@@ -1236,6 +1237,61 @@ struct __wt_session {
const char *name, const char *config);
/*!
+ * Join a join cursor with a reference cursor.
+ *
+ * @snippet ex_schema.c Join cursors
+ *
+ * @param session the session handle
+ * @param join_cursor a cursor that was opened using a
+ * \c "join:" URI. It must not have been used for any operation
+ * other than other join calls.
+ * @param ref_cursor either an index cursor having the same base table
+ * as the join_cursor, or a table cursor open on the same base table.
+ * The ref_cursor must be positioned.
+ *
+ * The ref_cursor limits the results seen by iterating the
+ * join_cursor to table items referred to by the key in this
+ * index. The set of keys referred to is modified by the compare
+ * config option.
+ *
+ * Multiple join calls build up a set of ref_cursors, and the
+ * results seen by iteration are the intersection of the cursor
+ * ranges participating in the join.
+ *
+ * After the join call completes, the ref_cursor cursor may not be
+ * used for any purpose other than get_key and get_value. Any other
+ * cursor method (e.g. next, prev, close) will fail. When the
+ * join_cursor is closed, the ref_cursor is made available for
+ * general use again. The application should close ref_cursor when
+ * finished with it, although not before the join_cursor is closed.
+ *
+ * @configstart{WT_SESSION.join, see dist/api_data.py}
+ * @config{bloom_bit_count, the number of bits used per item for the
+ * bloom filter., an integer between 2 and 1000; default \c 16.}
+ * @config{bloom_hash_count, the number of hash values per item for the
+ * bloom filter., an integer between 2 and 100; default \c 8.}
+ * @config{compare, modifies the set of items to be returned so that the
+ * index key satisfies the given comparison relative to the key set in
+ * this cursor., a string\, chosen from the following options: \c "eq"\,
+ * \c "ge"\, \c "gt"\, \c "le"\, \c "lt"; default \c "eq".}
+ * @config{count, set an approximate count of the elements that would be
+ * included in the join. This is used in sizing the bloom filter\, and
+ * also influences evaluation order for cursors in the join. When the
+ * count is equal for multiple bloom filters in a composition of joins\,
+ * the bloom filter may be shared., an integer; default empty.}
+ * @config{strategy, when set to bloom\, a bloom filter is created and
+ * populated for this index. This has an up front cost but may reduce
+ * the number of accesses to the main table when iterating the joined
+ * cursor. The bloom setting requires that count be set., a string\,
+ * chosen from the following options: \c "bloom"\, \c "default"; default
+ * empty.}
+ * @configend
+ * @errors
+ */
+ int __F(join)(WT_SESSION *session, WT_CURSOR *join_cursor,
+ WT_CURSOR *ref_cursor, const char *config);
+
+ /*!
* Flush the log.
*
* @param session the session handle
@@ -2328,10 +2384,13 @@ struct __wt_connection {
* string\, chosen from the following options: \c "dsync"\, \c "fsync"\, \c
* "none"; default \c fsync.}
* @config{ ),,}
+ * @config{use_environment, use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME
+ * environment variables if the process is not running with special privileges.
+ * See @ref home for more information., a boolean flag; default \c true.}
* @config{use_environment_priv, use the \c WIREDTIGER_CONFIG and \c
- * WIREDTIGER_HOME environment variables regardless of whether or not the
- * process is running with special privileges. See @ref home for more
- * information., a boolean flag; default \c false.}
+ * WIREDTIGER_HOME environment variables even if the process is running with
+ * special privileges. See @ref home for more information., a boolean flag;
+ * default \c false.}
* @config{verbose, enable messages for various events. Only available if
* WiredTiger is configured with --enable-verbose. Options are given as a
* list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with
@@ -3710,224 +3769,239 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1043
/*! cache: eviction server unable to reach eviction goal */
#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1044
-/*! cache: pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1045
+/*! cache: internal pages split during eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1045
+/*! cache: leaf pages split during eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1046
/*! cache: pages walked for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK 1046
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1047
/*! cache: eviction worker thread evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1047
+#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1048
/*! cache: in-memory page splits */
-#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1048
+#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1049
/*! cache: in-memory page passed criteria to be split */
-#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1049
+#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1050
/*! cache: lookaside table insert calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1050
+#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1051
/*! cache: lookaside table remove calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1051
+#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1052
/*! cache: percentage overhead */
-#define WT_STAT_CONN_CACHE_OVERHEAD 1052
+#define WT_STAT_CONN_CACHE_OVERHEAD 1053
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1053
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1054
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1054
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1055
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1055
+#define WT_STAT_CONN_CACHE_READ 1056
/*! cache: pages read into cache requiring lookaside entries */
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1056
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1057
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1057
+#define WT_STAT_CONN_CACHE_WRITE 1058
/*! cache: page written requiring lookaside records */
-#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1058
+#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1059
/*! cache: pages written requiring in-memory restoration */
-#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1059
+#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1060
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1060
+#define WT_STAT_CONN_COND_WAIT 1061
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1061
+#define WT_STAT_CONN_CURSOR_CREATE 1062
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1062
+#define WT_STAT_CONN_CURSOR_INSERT 1063
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1063
+#define WT_STAT_CONN_CURSOR_NEXT 1064
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1064
+#define WT_STAT_CONN_CURSOR_PREV 1065
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1065
+#define WT_STAT_CONN_CURSOR_REMOVE 1066
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1066
+#define WT_STAT_CONN_CURSOR_RESET 1067
/*! cursor: cursor restarted searches */
-#define WT_STAT_CONN_CURSOR_RESTART 1067
+#define WT_STAT_CONN_CURSOR_RESTART 1068
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1068
+#define WT_STAT_CONN_CURSOR_SEARCH 1069
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1069
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1070
+/*! cursor: truncate calls */
+#define WT_STAT_CONN_CURSOR_TRUNCATE 1071
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1070
+#define WT_STAT_CONN_CURSOR_UPDATE 1072
/*! data-handle: connection data handles currently active */
-#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1071
+#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1073
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1072
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1074
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1073
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1075
/*! data-handle: connection sweep dhandles closed */
-#define WT_STAT_CONN_DH_SWEEP_CLOSE 1074
+#define WT_STAT_CONN_DH_SWEEP_CLOSE 1076
/*! data-handle: connection sweep candidate became referenced */
-#define WT_STAT_CONN_DH_SWEEP_REF 1075
+#define WT_STAT_CONN_DH_SWEEP_REF 1077
/*! data-handle: connection sweep dhandles removed from hash list */
-#define WT_STAT_CONN_DH_SWEEP_REMOVE 1076
+#define WT_STAT_CONN_DH_SWEEP_REMOVE 1078
/*! data-handle: connection sweep time-of-death sets */
-#define WT_STAT_CONN_DH_SWEEP_TOD 1077
+#define WT_STAT_CONN_DH_SWEEP_TOD 1079
/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_SWEEPS 1078
+#define WT_STAT_CONN_DH_SWEEPS 1080
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1079
+#define WT_STAT_CONN_FILE_OPEN 1081
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1080
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1082
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1081
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1083
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1082
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1084
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1083
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1085
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1084
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1086
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1085
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1087
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1086
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1088
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1087
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1089
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1088
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1090
/*! log: log flush operations */
-#define WT_STAT_CONN_LOG_FLUSH 1089
+#define WT_STAT_CONN_LOG_FLUSH 1091
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1090
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1092
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1091
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1093
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1092
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1094
/*! log: pre-allocated log files not ready and missed */
-#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1093
+#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1095
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1094
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1096
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1095
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1097
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1096
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1098
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1097
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1099
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1098
+#define WT_STAT_CONN_LOG_SCANS 1100
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1099
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1101
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1100
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1102
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1101
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1103
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1102
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1104
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1103
+#define WT_STAT_CONN_LOG_SLOT_RACES 1105
/*! log: busy returns attempting to switch slots */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1104
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1106
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1105
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1107
/*! log: consolidated slot unbuffered writes */
-#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1106
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1108
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1107
+#define WT_STAT_CONN_LOG_SYNC 1109
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1108
+#define WT_STAT_CONN_LOG_SYNC_DIR 1110
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1109
+#define WT_STAT_CONN_LOG_WRITE_LSN 1111
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1110
+#define WT_STAT_CONN_LOG_WRITES 1112
/*! log: log files manually zero-filled */
-#define WT_STAT_CONN_LOG_ZERO_FILLS 1111
+#define WT_STAT_CONN_LOG_ZERO_FILLS 1113
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1112
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1114
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1113
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1115
/*! LSM: rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1114
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1116
/*! LSM: application work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1115
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1117
/*! LSM: merge work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1116
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1118
/*! LSM: tree queue hit maximum */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1117
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1119
/*! LSM: switch work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1118
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1120
/*! LSM: tree maintenance operations scheduled */
-#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1119
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1121
/*! LSM: tree maintenance operations discarded */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1120
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1122
/*! LSM: tree maintenance operations executed */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1121
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1123
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1122
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1124
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1123
+#define WT_STAT_CONN_MEMORY_FREE 1125
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1124
+#define WT_STAT_CONN_MEMORY_GROW 1126
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1125
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1127
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1126
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1128
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1127
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1129
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1128
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1130
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1129
+#define WT_STAT_CONN_PAGE_SLEEP 1131
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1130
+#define WT_STAT_CONN_READ_IO 1132
+/*! reconciliation: pages deleted */
+#define WT_STAT_CONN_REC_PAGE_DELETE 1133
+/*! reconciliation: fast-path pages deleted */
+#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1134
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1131
+#define WT_STAT_CONN_REC_PAGES 1135
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1132
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1136
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1133
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1137
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1134
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1138
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1135
+#define WT_STAT_CONN_RWLOCK_READ 1139
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1136
+#define WT_STAT_CONN_RWLOCK_WRITE 1140
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1137
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1141
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1138
+#define WT_STAT_CONN_SESSION_OPEN 1142
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1139
+#define WT_STAT_CONN_TXN_BEGIN 1143
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1140
+#define WT_STAT_CONN_TXN_CHECKPOINT 1144
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1141
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1145
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1142
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1146
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1143
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1147
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1144
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1148
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1145
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1149
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1146
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1150
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1147
+#define WT_STAT_CONN_TXN_COMMIT 1151
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1148
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1152
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1149
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1153
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1150
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1154
+/*! transaction: transaction range of IDs currently pinned by named
+ * snapshots */
+#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1155
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1151
+#define WT_STAT_CONN_TXN_ROLLBACK 1156
+/*! transaction: number of named snapshots created */
+#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1157
+/*! transaction: number of named snapshots dropped */
+#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1158
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1152
+#define WT_STAT_CONN_TXN_SYNC 1159
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1153
+#define WT_STAT_CONN_WRITE_IO 1160
/*!
* @}
@@ -4023,112 +4097,131 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042
/*! cache: internal pages evicted */
#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043
-/*! cache: pages split during eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2044
+/*! cache: internal pages split during eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_INTERNAL 2044
+/*! cache: leaf pages split during eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_LEAF 2045
/*! cache: in-memory page splits */
-#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2045
+#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2046
/*! cache: in-memory page passed criteria to be split */
-#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2046
+#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2047
/*! cache: overflow values cached in memory */
-#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2047
+#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2048
/*! cache: pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ 2048
+#define WT_STAT_DSRC_CACHE_READ 2049
/*! cache: pages read into cache requiring lookaside entries */
-#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2049
+#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2050
/*! cache: overflow pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2050
+#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2051
/*! cache: pages written from cache */
-#define WT_STAT_DSRC_CACHE_WRITE 2051
+#define WT_STAT_DSRC_CACHE_WRITE 2052
/*! cache: page written requiring lookaside records */
-#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2052
+#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2053
/*! cache: pages written requiring in-memory restoration */
-#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2053
+#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2054
/*! compression: raw compression call failed, no additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2054
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2055
/*! compression: raw compression call failed, additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2055
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2056
/*! compression: raw compression call succeeded */
-#define WT_STAT_DSRC_COMPRESS_RAW_OK 2056
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 2057
/*! compression: compressed pages read */
-#define WT_STAT_DSRC_COMPRESS_READ 2057
+#define WT_STAT_DSRC_COMPRESS_READ 2058
/*! compression: compressed pages written */
-#define WT_STAT_DSRC_COMPRESS_WRITE 2058
+#define WT_STAT_DSRC_COMPRESS_WRITE 2059
/*! compression: page written failed to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2059
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2060
/*! compression: page written was too small to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2060
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2061
/*! cursor: create calls */
-#define WT_STAT_DSRC_CURSOR_CREATE 2061
+#define WT_STAT_DSRC_CURSOR_CREATE 2062
/*! cursor: insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT 2062
+#define WT_STAT_DSRC_CURSOR_INSERT 2063
/*! cursor: bulk-loaded cursor-insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2063
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2064
/*! cursor: cursor-insert key and value bytes inserted */
-#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2064
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2065
/*! cursor: next calls */
-#define WT_STAT_DSRC_CURSOR_NEXT 2065
+#define WT_STAT_DSRC_CURSOR_NEXT 2066
/*! cursor: prev calls */
-#define WT_STAT_DSRC_CURSOR_PREV 2066
+#define WT_STAT_DSRC_CURSOR_PREV 2067
/*! cursor: remove calls */
-#define WT_STAT_DSRC_CURSOR_REMOVE 2067
+#define WT_STAT_DSRC_CURSOR_REMOVE 2068
/*! cursor: cursor-remove key bytes removed */
-#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2068
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2069
/*! cursor: reset calls */
-#define WT_STAT_DSRC_CURSOR_RESET 2069
+#define WT_STAT_DSRC_CURSOR_RESET 2070
/*! cursor: restarted searches */
-#define WT_STAT_DSRC_CURSOR_RESTART 2070
+#define WT_STAT_DSRC_CURSOR_RESTART 2071
/*! cursor: search calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH 2071
+#define WT_STAT_DSRC_CURSOR_SEARCH 2072
/*! cursor: search near calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2072
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2073
+/*! cursor: truncate calls */
+#define WT_STAT_DSRC_CURSOR_TRUNCATE 2074
/*! cursor: update calls */
-#define WT_STAT_DSRC_CURSOR_UPDATE 2073
+#define WT_STAT_DSRC_CURSOR_UPDATE 2075
/*! cursor: cursor-update value bytes updated */
-#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2074
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2076
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2075
+#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2077
/*! LSM: chunks in the LSM tree */
-#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2076
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2078
/*! LSM: highest merge generation in the LSM tree */
-#define WT_STAT_DSRC_LSM_GENERATION_MAX 2077
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 2079
/*! LSM: queries that could have benefited from a Bloom filter that did
* not exist */
-#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2078
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2080
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2079
+#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2081
/*! reconciliation: dictionary matches */
-#define WT_STAT_DSRC_REC_DICTIONARY 2080
+#define WT_STAT_DSRC_REC_DICTIONARY 2082
/*! reconciliation: internal page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2081
+#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2083
/*! reconciliation: leaf page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2082
+#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2084
/*! reconciliation: maximum blocks required for a page */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2083
+#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2085
/*! reconciliation: internal-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2084
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2086
/*! reconciliation: leaf-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2085
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2087
/*! reconciliation: overflow values written */
-#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2086
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2088
/*! reconciliation: pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE 2087
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2089
+/*! reconciliation: fast-path pages deleted */
+#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2090
/*! reconciliation: page checksum matches */
-#define WT_STAT_DSRC_REC_PAGE_MATCH 2088
+#define WT_STAT_DSRC_REC_PAGE_MATCH 2091
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_DSRC_REC_PAGES 2089
+#define WT_STAT_DSRC_REC_PAGES 2092
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_DSRC_REC_PAGES_EVICTION 2090
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2093
/*! reconciliation: leaf page key bytes discarded using prefix compression */
-#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2091
+#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2094
/*! reconciliation: internal page key bytes discarded using suffix
* compression */
-#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2092
+#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2095
/*! session: object compaction */
-#define WT_STAT_DSRC_SESSION_COMPACT 2093
+#define WT_STAT_DSRC_SESSION_COMPACT 2096
/*! session: open cursor count */
-#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2094
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2097
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2095
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2098
+
+/*!
+ * @}
+ * @name Statistics for join cursors
+ * @anchor statistics_join
+ * @{
+ */
+/*! : accesses */
+#define WT_STAT_JOIN_ACCESSES 3000
+/*! : actual count of items */
+#define WT_STAT_JOIN_ACTUAL_COUNT 3001
+/*! : bloom filter false positives */
+#define WT_STAT_JOIN_BLOOM_FALSE_POSITIVE 3002
/*! @} */
/*
* Statistics section: END
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index 3f4e0ada7f1..0a1e143ce70 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -136,6 +136,14 @@ struct __wt_cursor_dump;
typedef struct __wt_cursor_dump WT_CURSOR_DUMP;
struct __wt_cursor_index;
typedef struct __wt_cursor_index WT_CURSOR_INDEX;
+struct __wt_cursor_join;
+ typedef struct __wt_cursor_join WT_CURSOR_JOIN;
+struct __wt_cursor_join_endpoint;
+ typedef struct __wt_cursor_join_endpoint WT_CURSOR_JOIN_ENDPOINT;
+struct __wt_cursor_join_entry;
+ typedef struct __wt_cursor_join_entry WT_CURSOR_JOIN_ENTRY;
+struct __wt_cursor_join_iter;
+ typedef struct __wt_cursor_join_iter WT_CURSOR_JOIN_ITER;
struct __wt_cursor_json;
typedef struct __wt_cursor_json WT_CURSOR_JSON;
struct __wt_cursor_log;
@@ -178,6 +186,10 @@ struct __wt_insert;
typedef struct __wt_insert WT_INSERT;
struct __wt_insert_head;
typedef struct __wt_insert_head WT_INSERT_HEAD;
+struct __wt_join_stats;
+ typedef struct __wt_join_stats WT_JOIN_STATS;
+struct __wt_join_stats_group;
+ typedef struct __wt_join_stats_group WT_JOIN_STATS_GROUP;
struct __wt_keyed_encryptor;
typedef struct __wt_keyed_encryptor WT_KEYED_ENCRYPTOR;
struct __wt_log;
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index 44dc7dc30a7..3106094e7e3 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -1313,7 +1313,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_unlock(session, &log->log_slot_lock);
WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond));
- if (++yield_count < 1000)
+ if (++yield_count < WT_THOUSAND)
__wt_yield();
else
ret = __wt_cond_wait(session, log->log_write_cond, 200);
diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c
index b3790412536..255551f99a4 100644
--- a/src/third_party/wiredtiger/src/log/log_slot.c
+++ b/src/third_party/wiredtiger/src/log/log_slot.c
@@ -380,7 +380,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
* There should almost always be a slot open.
*/
#ifdef HAVE_DIAGNOSTIC
- unbuf_force = (++log->write_calls % 1000) == 0;
+ unbuf_force = (++log->write_calls % WT_THOUSAND) == 0;
#endif
for (;;) {
WT_BARRIER();
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index f988bfc97fd..953698476ef 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -81,7 +81,7 @@ __wt_clsm_await_switch(WT_CURSOR_LSM *clsm)
lsm_tree->nchunks == 0 ||
clsm->dsk_gen == lsm_tree->dsk_gen;
++waited) {
- if (waited % 1000 == 0)
+ if (waited % WT_THOUSAND == 0)
WT_RET(__wt_lsm_manager_push_entry(
session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
__wt_sleep(0, 10);
@@ -1379,7 +1379,15 @@ __clsm_insert(WT_CURSOR *cursor)
}
WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf));
- ret = __clsm_put(session, clsm, &cursor->key, &value, false);
+ WT_ERR(__clsm_put(session, clsm, &cursor->key, &value, false));
+
+ /*
+ * WT_CURSOR.insert doesn't leave the cursor positioned, and the
+ * application may want to free the memory used to configure the
+ * insert; don't read that memory again (matching the underlying
+ * file object cursor insert semantics).
+ */
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
err: __wt_scr_free(session, &buf);
__clsm_leave(clsm);
@@ -1522,6 +1530,10 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
if (!WT_PREFIX_MATCH(uri, "lsm:"))
return (EINVAL);
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ WT_RET_MSG(session, EINVAL,
+ "LSM trees not supported by in-memory configurations");
+
WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
if (cval.len != 0)
WT_RET_MSG(session, EINVAL,
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
index 1c5124c32af..d8cf36f2cc1 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -388,8 +388,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
continue;
WT_ERR(__wt_epoch(session, &now));
pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 0 :
- WT_TIMEDIFF(
- now, lsm_tree->work_push_ts) / WT_MILLION;
+ WT_TIMEDIFF_MS(now, lsm_tree->work_push_ts);
fillms = 3 * lsm_tree->chunk_fill_ms;
if (fillms == 0)
fillms = 10000;
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
index dd1419fe67d..1a2608803e4 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_merge.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
@@ -94,7 +94,7 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_RET(__wt_epoch(session, &now));
msec_since_last_merge =
- WT_TIMEDIFF(now, lsm_tree->merge_aggressive_ts) / WT_MILLION;
+ WT_TIMEDIFF_MS(now, lsm_tree->merge_aggressive_ts);
/*
* If there is no estimate for how long it's taking to fill chunks
@@ -457,7 +457,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
cfg[2] = NULL;
WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
-#define LSM_MERGE_CHECK_INTERVAL 1000
+#define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND
for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
index 4381ca0df00..c1eb7a2a389 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
@@ -77,12 +77,12 @@ __curstat_lsm_init(
*/
WT_ERR(__wt_buf_fmt(
session, uribuf, "statistics:%s", chunk->uri));
- ret = __wt_curstat_open(session, uribuf->data,
+ ret = __wt_curstat_open(session, uribuf->data, NULL,
F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg,
&stat_cursor);
if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
ret = __wt_curstat_open(
- session, uribuf->data, cfg, &stat_cursor);
+ session, uribuf->data, NULL, cfg, &stat_cursor);
WT_ERR(ret);
/*
@@ -107,7 +107,7 @@ __curstat_lsm_init(
WT_ERR(__wt_buf_fmt(
session, uribuf, "statistics:%s", chunk->bloom_uri));
WT_ERR(__wt_curstat_open(
- session, uribuf->data, cfg, &stat_cursor));
+ session, uribuf->data, NULL, cfg, &stat_cursor));
/*
* The underlying statistics have now been initialized; fill in
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
index 30af051bbcf..0c3642e70e8 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -111,7 +111,7 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* other schema level operations will return EBUSY, even though
* we're dropping the schema lock here.
*/
- if (i % 1000 == 0) {
+ if (i % WT_THOUSAND == 0) {
WT_WITHOUT_LOCKS(session, ret =
__wt_lsm_manager_clear_tree(session, lsm_tree));
WT_RET(ret);
@@ -336,6 +336,11 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
}
WT_RET_NOTFOUND_OK(ret);
+ /* In-memory configurations don't make sense for LSM. */
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ WT_RET_MSG(session, EINVAL,
+ "LSM trees not supported by in-memory configurations");
+
WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
if (WT_STRING_MATCH("r", cval.str, cval.len))
WT_RET_MSG(session, EINVAL,
@@ -747,7 +752,7 @@ __wt_lsm_tree_throttle(
WT_ASSERT(session,
WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0);
timediff =
- WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
+ WT_TIMEDIFF_NS(last_chunk->create_ts, ondisk->create_ts);
lsm_tree->ckpt_throttle =
(in_memory - 2) * timediff / (20 * record_count);
@@ -783,8 +788,8 @@ __wt_lsm_tree_throttle(
}
/* Put an upper bound of 1s on both throttle calculations. */
- lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle);
- lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle);
+ lsm_tree->ckpt_throttle = WT_MIN(WT_MILLION, lsm_tree->ckpt_throttle);
+ lsm_tree->merge_throttle = WT_MIN(WT_MILLION, lsm_tree->merge_throttle);
/*
* Update our estimate of how long each in-memory chunk stays active.
@@ -798,15 +803,16 @@ __wt_lsm_tree_throttle(
WT_ASSERT(session, prev_chunk->generation == 0);
WT_ASSERT(session, WT_TIMECMP(
last_chunk->create_ts, prev_chunk->create_ts) >= 0);
- timediff =
- WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
+ timediff = WT_TIMEDIFF_NS(
+ last_chunk->create_ts, prev_chunk->create_ts);
WT_ASSERT(session,
WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0);
- oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
+ oldtime = WT_TIMEDIFF_NS(
+ prev_chunk->create_ts, ondisk->create_ts);
if (timediff < 10 * oldtime)
lsm_tree->chunk_fill_ms =
(3 * lsm_tree->chunk_fill_ms +
- timediff / 1000000) / 4;
+ timediff / WT_MILLION) / 4;
}
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
index fac2c06957d..d5fc86b648b 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
@@ -76,9 +76,9 @@ __wt_cond_wait_signal(
if (usecs > 0) {
WT_ERR(__wt_epoch(session, &ts));
ts.tv_sec += (time_t)
- (((uint64_t)ts.tv_nsec + 1000 * usecs) / WT_BILLION);
+ (((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) / WT_BILLION);
ts.tv_nsec = (long)
- (((uint64_t)ts.tv_nsec + 1000 * usecs) % WT_BILLION);
+ (((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) % WT_BILLION);
ret = pthread_cond_timedwait(&cond->cond, &cond->mtx, &ts);
} else
ret = pthread_cond_wait(&cond->cond, &cond->mtx);
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
index d47ab197643..46f134feabb 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
@@ -201,7 +201,7 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
* Don't sleep long when waiting on a read lock, hopefully we're
* waiting on another read thread to increment the reader count.
*/
- if (++pause_cnt < 1000)
+ if (++pause_cnt < WT_THOUSAND)
WT_PAUSE();
else
__wt_sleep(0, 10);
@@ -300,7 +300,7 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
* situation happens if there are more threads than cores in the
* system and we're thrashing on shared resources.
*/
- if (++pause_cnt < 1000)
+ if (++pause_cnt < WT_THOUSAND)
WT_PAUSE();
else
__wt_sleep(0, 10);
diff --git a/src/third_party/wiredtiger/src/os_posix/os_sleep.c b/src/third_party/wiredtiger/src/os_posix/os_sleep.c
index f888e51bf7f..4e90edabc53 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_sleep.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_sleep.c
@@ -17,8 +17,8 @@ __wt_sleep(uint64_t seconds, uint64_t micro_seconds)
{
struct timeval t;
- t.tv_sec = (time_t)(seconds + micro_seconds / 1000000);
- t.tv_usec = (suseconds_t)(micro_seconds % 1000000);
+ t.tv_sec = (time_t)(seconds + micro_seconds / WT_MILLION);
+ t.tv_usec = (suseconds_t)(micro_seconds % WT_MILLION);
(void)select(0, NULL, NULL, NULL, &t);
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c
index c52772e77e1..c3052df62e7 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_time.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_time.c
@@ -9,22 +9,6 @@
#include "wt_internal.h"
/*
- * __wt_seconds --
- * Return the seconds since the Epoch.
- */
-int
-__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
-{
- struct timespec t;
-
- WT_RET(__wt_epoch(session, &t));
-
- *timep = t.tv_sec;
-
- return (0);
-}
-
-/*
* __wt_epoch --
* Return the time since the Epoch.
*/
@@ -44,7 +28,7 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret);
if (ret == 0) {
tsp->tv_sec = v.tv_sec;
- tsp->tv_nsec = v.tv_usec * 1000;
+ tsp->tv_nsec = v.tv_usec * WT_THOUSAND;
return (0);
}
WT_RET_MSG(session, ret, "gettimeofday");
diff --git a/src/third_party/wiredtiger/src/os_win/os_sleep.c b/src/third_party/wiredtiger/src/os_win/os_sleep.c
index 484cf218f26..33e04c1d8a9 100644
--- a/src/third_party/wiredtiger/src/os_win/os_sleep.c
+++ b/src/third_party/wiredtiger/src/os_win/os_sleep.c
@@ -19,7 +19,7 @@ __wt_sleep(uint64_t seconds, uint64_t micro_seconds)
* If the caller wants a small pause, set to our
* smallest granularity.
*/
- if (seconds == 0 && micro_seconds < 1000)
- micro_seconds = 1000;
- Sleep(seconds * 1000 + micro_seconds / 1000);
+ if (seconds == 0 && micro_seconds < WT_THOUSAND)
+ micro_seconds = WT_THOUSAND;
+ Sleep(seconds * WT_THOUSAND + micro_seconds / WT_THOUSAND);
}
diff --git a/src/third_party/wiredtiger/src/os_win/os_time.c b/src/third_party/wiredtiger/src/os_win/os_time.c
index c51db118ce1..2292c317a64 100644
--- a/src/third_party/wiredtiger/src/os_win/os_time.c
+++ b/src/third_party/wiredtiger/src/os_win/os_time.c
@@ -9,22 +9,6 @@
#include "wt_internal.h"
/*
- * __wt_seconds --
- * Return the seconds since the Epoch.
- */
-int
-__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
-{
- struct timespec t;
-
- WT_RET(__wt_epoch(session, &t));
-
- *timep = t.tv_sec;
-
- return (0);
-}
-
-/*
* __wt_epoch --
* Return the time since the Epoch.
*/
diff --git a/src/third_party/wiredtiger/src/packing/pack_impl.c b/src/third_party/wiredtiger/src/packing/pack_impl.c
index 3a4428eae15..447c887dc6f 100644
--- a/src/third_party/wiredtiger/src/packing/pack_impl.c
+++ b/src/third_party/wiredtiger/src/packing/pack_impl.c
@@ -105,3 +105,108 @@ __wt_struct_unpack(WT_SESSION_IMPL *session,
return (ret);
}
+
+/*
+ * __wt_struct_unpack_size --
+ * Read the buffer field by field per the format; set *resultp to the packed size.
+ */
+int
+__wt_struct_unpack_size(WT_SESSION_IMPL *session,
+ const void *buffer, size_t size, const char *fmt, size_t *resultp)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ const uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0)
+ WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+
+ /* Be paranoid - __unpack_read should never move past the buffer end. */
+ WT_ASSERT(session, p <= end);
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ *resultp = WT_PTRDIFF(p, buffer);
+ return (0);
+}
+
+/*
+ * __wt_struct_repack --
+ * Set outbuf to the part of inbuf (packed as infmt) that outfmt
+ * describes. If that part is contiguous, outbuf points into inbuf;
+ * otherwise it is copied into memory allocated through reallocp.
+ */
+int
+__wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt,
+ const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf,
+ void **reallocp)
+{
+ WT_DECL_PACK_VALUE(pvin);
+ WT_DECL_PACK_VALUE(pvout);
+ WT_DECL_RET;
+ WT_PACK packin, packout;
+ const uint8_t *before, *end, *p;
+ uint8_t *newbuf, *pout;
+ size_t len;
+ const void *start;
+
+ start = newbuf = NULL;
+ p = inbuf->data;
+ end = p + inbuf->size;
+
+ /*
+ * Handle the non-contiguous case: outfmt ends in 'u' where infmt has a
+ * 'U' followed by more fields. A 'U' item has its size embedded before
+ * the item, a 'u' item does not, so the output has to be copied.
+ */
+ if ((len = strlen(outfmt)) > 1 && outfmt[len - 1] == 'u' &&
+ strlen(infmt) > len && infmt[len - 1] == 'U') {
+ WT_ERR(__wt_realloc(session, NULL, inbuf->size, reallocp));
+ pout = *reallocp;
+ } else
+ pout = NULL;
+
+ WT_ERR(__pack_init(session, &packout, outfmt));
+ WT_ERR(__pack_init(session, &packin, infmt));
+
+ /* Outfmt should complete before infmt; each outfmt field consumes one infmt field. */
+ while ((ret = __pack_next(&packout, &pvout)) == 0) {
+ WT_ERR(__pack_next(&packin, &pvin));
+ before = p;
+ WT_ERR(__unpack_read(session, &pvin, &p, (size_t)(end - p)));
+ if (pvout.type != pvin.type) {
+ if (pvout.type == 'u' && pvin.type == 'U') {
+ /* Skip the prefixed size, we don't need it. */
+ WT_ERR(__wt_struct_unpack_size(session, before,
+ (size_t)(end - before), "I", &len));
+ before += len;
+ } else
+ WT_ERR(ENOTSUP);
+ }
+ if (pout != NULL) {
+ memcpy(pout, before, WT_PTRDIFF(p, before));
+ pout += p - before;
+ } else if (start == NULL)
+ start = before;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* Be paranoid - __unpack_read should never move past the buffer end. */
+ WT_ASSERT(session, p <= end);
+
+ if (pout != NULL) {
+ outbuf->data = *reallocp;
+ outbuf->size = WT_PTRDIFF(pout, *reallocp);
+ } else {
+ outbuf->data = start;
+ outbuf->size = WT_PTRDIFF(p, start);
+ }
+
+err: return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 965f798e820..6d53230e9e0 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -960,7 +960,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy)
* than 10,000 boundary structure elements, discard the boundary array
* entirely and start over next time.
*/
- if (destroy || r->bnd_entries > 10 * 1000) {
+ if (destroy || r->bnd_entries > 10 * WT_THOUSAND) {
for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->disk_image);
@@ -2505,7 +2505,10 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
* the page: the offset is the byte offset to the possible split-point
* (adjusted for an initial chunk that cannot be compressed), entries
* is the cumulative page entries covered by the byte offset, recnos is
- * the cumulative rows covered by the byte offset.
+ * the cumulative rows covered by the byte offset. Allocate to handle
+ * both column- and row-store regardless of this page type, structures
+ * are potentially reused for subsequent reconciliations of different
+ * page types.
*/
if (r->entries >= r->raw_max_slots) {
__wt_free(session, r->raw_entries);
@@ -2516,9 +2519,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
i = r->entries + 100;
WT_RET(__wt_calloc_def(session, i, &r->raw_entries));
WT_RET(__wt_calloc_def(session, i, &r->raw_offsets));
- if (dsk->type == WT_PAGE_COL_INT ||
- dsk->type == WT_PAGE_COL_VAR)
- WT_RET(__wt_calloc_def(session, i, &r->raw_recnos));
+ WT_RET(__wt_calloc_def(session, i, &r->raw_recnos));
r->raw_max_slots = i;
}
@@ -5469,6 +5470,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
case 0: /* Page delete */
WT_RET(__wt_verbose(
session, WT_VERB_RECONCILE, "page %p empty", page));
+ WT_STAT_FAST_CONN_INCR(session, rec_page_delete);
WT_STAT_FAST_DATA_INCR(session, rec_page_delete);
/* If this is the root page, we need to create a sync point. */
diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c
index d73d66cd399..82c2e2a15dc 100644
--- a/src/third_party/wiredtiger/src/schema/schema_stat.c
+++ b/src/third_party/wiredtiger/src/schema/schema_stat.c
@@ -24,7 +24,7 @@ __wt_curstat_colgroup_init(WT_SESSION_IMPL *session,
WT_RET(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_buf_fmt(session, buf, "statistics:%s", colgroup->source));
- ret = __wt_curstat_init(session, buf->data, cfg, cst);
+ ret = __wt_curstat_init(session, buf->data, NULL, cfg, cst);
err: __wt_scr_free(session, &buf);
return (ret);
@@ -46,7 +46,7 @@ __wt_curstat_index_init(WT_SESSION_IMPL *session,
WT_RET(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_buf_fmt(session, buf, "statistics:%s", idx->source));
- ret = __wt_curstat_init(session, buf->data, cfg, cst);
+ ret = __wt_curstat_init(session, buf->data, NULL, cfg, cst);
err: __wt_scr_free(session, &buf);
return (ret);
@@ -159,7 +159,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session,
WT_ERR(__wt_buf_fmt(
session, buf, "statistics:%s", table->cgroups[i]->name));
WT_ERR(__wt_curstat_open(
- session, buf->data, cfg, &stat_cursor));
+ session, buf->data, NULL, cfg, &stat_cursor));
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
if (i == 0)
*stats = *new;
@@ -174,7 +174,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session,
WT_ERR(__wt_buf_fmt(
session, buf, "statistics:%s", table->indices[i]->name));
WT_ERR(__wt_curstat_open(
- session, buf->data, cfg, &stat_cursor));
+ session, buf->data, NULL, cfg, &stat_cursor));
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
__wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
diff --git a/src/third_party/wiredtiger/src/schema/schema_truncate.c b/src/third_party/wiredtiger/src/schema/schema_truncate.c
index 03a991a9aba..c39bba4753c 100644
--- a/src/third_party/wiredtiger/src/schema/schema_truncate.c
+++ b/src/third_party/wiredtiger/src/schema/schema_truncate.c
@@ -26,6 +26,7 @@ __truncate_file(WT_SESSION_IMPL *session, const char *uri)
/* Open and lock the file. */
WT_RET(__wt_session_get_btree(
session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
+ WT_STAT_FAST_DATA_INCR(session, cursor_truncate);
/* Get the allocation size. */
allocsize = S2BT(session)->allocsize;
@@ -56,6 +57,7 @@ __truncate_table(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
u_int i;
WT_RET(__wt_schema_get_table(session, uri, strlen(uri), false, &table));
+ WT_STAT_FAST_DATA_INCR(session, cursor_truncate);
/* Truncate the column groups. */
for (i = 0; i < WT_COLGROUPS(table); i++)
@@ -90,6 +92,7 @@ __truncate_dsrc(WT_SESSION_IMPL *session, const char *uri)
while ((ret = cursor->next(cursor)) == 0)
WT_ERR(cursor->remove(cursor));
WT_ERR_NOTFOUND_OK(ret);
+ WT_STAT_FAST_DATA_INCR(session, cursor_truncate);
err: WT_TRET(cursor->close(cursor));
return (ret);
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index ed0e016dcb2..db81623c613 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -240,12 +240,12 @@ err: API_END_RET_NOTFOUND_MAP(session, ret);
}
/*
- * __wt_open_cursor --
- * Internal version of WT_SESSION::open_cursor.
+ * __session_open_cursor_int --
+ * Internal version of WT_SESSION::open_cursor, with second cursor arg.
*/
-int
-__wt_open_cursor(WT_SESSION_IMPL *session,
- const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+static int
+__session_open_cursor_int(WT_SESSION_IMPL *session, const char *uri,
+ WT_CURSOR *owner, WT_CURSOR *other, const char *cfg[], WT_CURSOR **cursorp)
{
WT_COLGROUP *colgroup;
WT_DATA_SOURCE *dsrc;
@@ -267,7 +267,8 @@ __wt_open_cursor(WT_SESSION_IMPL *session,
*/
case 't':
if (WT_PREFIX_MATCH(uri, "table:"))
- WT_RET(__wt_curtable_open(session, uri, cfg, cursorp));
+ WT_RET(__wt_curtable_open(
+ session, uri, owner, cfg, cursorp));
break;
case 'c':
if (WT_PREFIX_MATCH(uri, "colgroup:")) {
@@ -288,6 +289,11 @@ __wt_open_cursor(WT_SESSION_IMPL *session,
WT_RET(__wt_curindex_open(
session, uri, owner, cfg, cursorp));
break;
+ case 'j':
+ if (WT_PREFIX_MATCH(uri, "join:"))
+ WT_RET(__wt_curjoin_open(
+ session, uri, owner, cfg, cursorp));
+ break;
case 'l':
if (WT_PREFIX_MATCH(uri, "lsm:"))
WT_RET(__wt_clsm_open(
@@ -316,7 +322,8 @@ __wt_open_cursor(WT_SESSION_IMPL *session,
break;
case 's':
if (WT_PREFIX_MATCH(uri, "statistics:"))
- WT_RET(__wt_curstat_open(session, uri, cfg, cursorp));
+ WT_RET(__wt_curstat_open(session, uri, other, cfg,
+ cursorp));
break;
default:
break;
@@ -346,6 +353,18 @@ __wt_open_cursor(WT_SESSION_IMPL *session,
}
/*
+ * __wt_open_cursor --
+ * Internal version of WT_SESSION::open_cursor; passes a NULL "other" cursor.
+ */
+int
+__wt_open_cursor(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ return (__session_open_cursor_int(session, uri, owner, NULL, cfg,
+ cursorp));
+}
+
+/*
* __session_open_cursor --
* WT_SESSION->open_cursor method.
*/
@@ -356,18 +375,22 @@ __session_open_cursor(WT_SESSION *wt_session,
WT_CURSOR *cursor;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ bool statjoin;
cursor = *cursorp = NULL;
session = (WT_SESSION_IMPL *)wt_session;
SESSION_API_CALL(session, open_cursor, config, cfg);
- if ((to_dup == NULL && uri == NULL) || (to_dup != NULL && uri != NULL))
+ statjoin = (to_dup != NULL && uri != NULL &&
+ WT_STREQ(uri, "statistics:join"));
+ if ((to_dup == NULL && uri == NULL) ||
+ (to_dup != NULL && uri != NULL && !statjoin))
WT_ERR_MSG(session, EINVAL,
"should be passed either a URI or a cursor to duplicate, "
"but not both");
- if (to_dup != NULL) {
+ if (to_dup != NULL && !statjoin) {
uri = to_dup->uri;
if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
!WT_PREFIX_MATCH(uri, "index:") &&
@@ -379,8 +402,9 @@ __session_open_cursor(WT_SESSION *wt_session,
WT_ERR(__wt_bad_object_type(session, uri));
}
- WT_ERR(__wt_open_cursor(session, uri, NULL, cfg, &cursor));
- if (to_dup != NULL)
+ WT_ERR(__session_open_cursor_int(session, uri, NULL,
+ statjoin ? to_dup : NULL, cfg, &cursor));
+ if (to_dup != NULL && !statjoin)
WT_ERR(__wt_cursor_dup_position(to_dup, cursor));
*cursorp = cursor;
@@ -614,6 +638,123 @@ err: /* Note: drop operations cannot be unrolled (yet?). */
}
/*
+ * __session_join --
+ *	WT_SESSION->join method.
+ *
+ *	Adds ref_cursor as an entry of join_cursor.  join_cursor must have a
+ *	"join:" URI and ref_cursor must have an "index:" or "table:" URI; both
+ *	must refer to the same table, and ref_cursor must not already be
+ *	flagged as joined.  The configuration supplies "compare" (gt, lt, le,
+ *	eq or ge, with ge the default), "count", "strategy" (bloom or
+ *	default), "bloom_bit_count" and "bloom_hash_count".
+ *
+ *	Every validation failure returns EINVAL with a message; otherwise the
+ *	result of the underlying calls is returned through the API epilogue.
+ */
+static int
+__session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor,
+    WT_CURSOR *ref_cursor, const char *config)
+{
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	WT_CURSOR_INDEX *cindex;
+	WT_CURSOR_JOIN *cjoin;
+	WT_CURSOR_TABLE *ctable;
+	WT_INDEX *idx;
+	WT_TABLE *table;
+	uint32_t flags, range;
+	uint64_t count;
+	uint64_t bloom_bit_count, bloom_hash_count;
+
+	count = 0;
+	session = (WT_SESSION_IMPL *)wt_session;
+	SESSION_API_CALL(session, join, config, cfg);
+	table = NULL;
+
+	if (!WT_PREFIX_MATCH(join_cursor->uri, "join:"))
+		WT_ERR_MSG(session, EINVAL, "not a join cursor");
+
+	/*
+	 * Find the table behind the reference cursor.  An index cursor also
+	 * supplies the index to join on; a table cursor joins with no index.
+	 * NOTE(review): WT_CURSOR_CHECKKEY presumably fails the call when the
+	 * cursor has no key set -- confirm against the macro definition.
+	 */
+	if (WT_PREFIX_MATCH(ref_cursor->uri, "index:")) {
+		cindex = (WT_CURSOR_INDEX *)ref_cursor;
+		idx = cindex->index;
+		table = cindex->table;
+		WT_CURSOR_CHECKKEY(ref_cursor);
+	} else if (WT_PREFIX_MATCH(ref_cursor->uri, "table:")) {
+		idx = NULL;
+		ctable = (WT_CURSOR_TABLE *)ref_cursor;
+		table = ctable->table;
+		WT_CURSOR_CHECKKEY(ctable->cg_cursors[0]);
+	} else
+		WT_ERR_MSG(session, EINVAL, "not an index or table cursor");
+
+	cjoin = (WT_CURSOR_JOIN *)join_cursor;
+	if (cjoin->table != table)
+		WT_ERR_MSG(session, EINVAL,
+		    "table for join cursor does not match table for index");
+	if (F_ISSET(ref_cursor, WT_CURSTD_JOINED))
+		WT_ERR_MSG(session, EINVAL,
+		    "index cursor already used in a join");
+
+	/* "ge" is the default */
+	range = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ;
+	flags = 0;
+	WT_ERR(__wt_config_gets(session, cfg, "compare", &cval));
+	if (cval.len != 0) {
+		if (WT_STRING_MATCH("gt", cval.str, cval.len))
+			range = WT_CURJOIN_END_GT;
+		else if (WT_STRING_MATCH("lt", cval.str, cval.len))
+			range = WT_CURJOIN_END_LT;
+		else if (WT_STRING_MATCH("le", cval.str, cval.len))
+			range = WT_CURJOIN_END_LE;
+		else if (WT_STRING_MATCH("eq", cval.str, cval.len))
+			range = WT_CURJOIN_END_EQ;
+		else if (!WT_STRING_MATCH("ge", cval.str, cval.len))
+			/* Report the rejected value instead of a bare EINVAL. */
+			WT_ERR_MSG(session, EINVAL,
+			    "compare=%.*s not supported",
+			    (int)cval.len, cval.str);
+	}
+	WT_ERR(__wt_config_gets(session, cfg, "count", &cval));
+	if (cval.len != 0)
+		count = (uint64_t)cval.val;
+
+	WT_ERR(__wt_config_gets(session, cfg, "strategy", &cval));
+	if (cval.len != 0) {
+		if (WT_STRING_MATCH("bloom", cval.str, cval.len))
+			LF_SET(WT_CURJOIN_ENTRY_BLOOM);
+		else if (!WT_STRING_MATCH("default", cval.str, cval.len))
+			/* Report the rejected value instead of a bare EINVAL. */
+			WT_ERR_MSG(session, EINVAL,
+			    "strategy=%.*s not supported",
+			    (int)cval.len, cval.str);
+	}
+	WT_ERR(__wt_config_gets(session, cfg, "bloom_bit_count", &cval));
+	bloom_bit_count = (uint64_t)cval.val;
+	WT_ERR(__wt_config_gets(session, cfg, "bloom_hash_count", &cval));
+	bloom_hash_count = (uint64_t)cval.val;
+
+	/*
+	 * A bloom entry needs a nonzero count and cannot be the first entry
+	 * added to the join cursor.
+	 */
+	if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM)) {
+		if (count == 0)
+			WT_ERR_MSG(session, EINVAL,
+			    "count must be nonzero when strategy=bloom");
+		if (cjoin->entries_next == 0)
+			WT_ERR_MSG(session, EINVAL,
+			    "the first joined cursor cannot specify "
+			    "strategy=bloom");
+	}
+	WT_ERR(__wt_curjoin_join(session, cjoin, idx, ref_cursor, flags,
+	    range, count, bloom_bit_count, bloom_hash_count));
+	/*
+	 * There's an implied ownership ordering that isn't
+	 * known when the cursors are created: the join cursor
+	 * must be closed before any of the indices.  Enforce
+	 * that here by moving the join cursor to the head of
+	 * the session's cursor list.
+	 */
+	if (TAILQ_FIRST(&session->cursors) != join_cursor) {
+		TAILQ_REMOVE(&session->cursors, join_cursor, q);
+		TAILQ_INSERT_HEAD(&session->cursors, join_cursor, q);
+	}
+	/* Disable the reference cursor for regular operations */
+	F_SET(ref_cursor, WT_CURSTD_JOINED);
+
+err:	API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
* __session_salvage --
* WT_SESSION->salvage method.
*/
@@ -657,6 +798,7 @@ __session_truncate(WT_SESSION *wt_session,
session = (WT_SESSION_IMPL *)wt_session;
SESSION_TXN_API_CALL(session, truncate, config, cfg);
+ WT_STAT_FAST_CONN_INCR(session, cursor_truncate);
/*
* If the URI is specified, we don't need a start/stop, if start/stop
@@ -1009,7 +1151,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) {
WT_ERR(__wt_cond_signal(session, conn->log_file_cond));
WT_ERR(__wt_epoch(session, &now));
- waited_ms = WT_TIMEDIFF(now, start) / WT_MILLION;
+ waited_ms = WT_TIMEDIFF_MS(now, start);
if (forever || waited_ms < timeout_ms)
/*
* Note, we will wait an increasing amount of time
@@ -1144,6 +1286,7 @@ __open_session(WT_CONNECTION_IMPL *conn,
__session_create,
__wt_session_compact,
__session_drop,
+ __session_join,
__session_log_flush,
__session_log_printf,
__session_rename,
diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c
index bd503cd7826..456fcd3ce03 100644
--- a/src/third_party/wiredtiger/src/session/session_compact.c
+++ b/src/third_party/wiredtiger/src/session/session_compact.c
@@ -133,8 +133,7 @@ __session_compact_check_timeout(
return (0);
WT_RET(__wt_epoch(session, &end));
- if (session->compact->max_time <
- WT_TIMEDIFF(end, begin) / WT_BILLION)
+ if (session->compact->max_time < WT_TIMEDIFF_SEC(end, begin))
WT_RET(ETIMEDOUT);
return (0);
}
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
index ec2f0921ef2..dd5094fb480 100644
--- a/src/third_party/wiredtiger/src/session/session_dhandle.c
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -390,7 +390,7 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
* do it again.
*/
WT_RET(__wt_seconds(session, &now));
- if (now - session->last_sweep < conn->sweep_interval)
+ if (difftime(now, session->last_sweep) < conn->sweep_interval)
return (0);
session->last_sweep = now;
@@ -404,7 +404,8 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
dhandle->session_inuse == 0 &&
(WT_DHANDLE_INACTIVE(dhandle) ||
(dhandle->timeofdeath != 0 &&
- now - dhandle->timeofdeath > conn->sweep_idle_time))) {
+ difftime(now, dhandle->timeofdeath) >
+ conn->sweep_idle_time))) {
WT_STAT_FAST_CONN_INCR(session, dh_session_handles);
WT_ASSERT(session, !WT_IS_METADATA(dhandle));
__session_discard_dhandle(session, dhandle_cache);
diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c
index c4bf4e8946a..de518cbf08b 100644
--- a/src/third_party/wiredtiger/src/support/err.c
+++ b/src/third_party/wiredtiger/src/support/err.c
@@ -199,7 +199,8 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error,
remain = WT_PTRDIFF(end, p);
wlen = (size_t)snprintf(p, remain,
"[%" PRIuMAX ":%" PRIuMAX "][%s]",
- (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / 1000, tid);
+ (uintmax_t)ts.tv_sec,
+ (uintmax_t)ts.tv_nsec / WT_THOUSAND, tid);
p = wlen >= remain ? end : p + wlen;
prefix_cnt = 1;
}
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 9e817fad512..4d7cd65fd18 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -47,7 +47,8 @@ static const char * const __stats_dsrc_desc[] = {
"cache: data source pages selected for eviction unable to be evicted",
"cache: hazard pointer blocked page eviction",
"cache: internal pages evicted",
- "cache: pages split during eviction",
+ "cache: internal pages split during eviction",
+ "cache: leaf pages split during eviction",
"cache: in-memory page splits",
"cache: in-memory page passed criteria to be split",
"cache: overflow values cached in memory",
@@ -76,6 +77,7 @@ static const char * const __stats_dsrc_desc[] = {
"cursor: restarted searches",
"cursor: search calls",
"cursor: search near calls",
+ "cursor: truncate calls",
"cursor: update calls",
"cursor: cursor-update value bytes updated",
"LSM: sleep for LSM checkpoint throttle",
@@ -91,6 +93,7 @@ static const char * const __stats_dsrc_desc[] = {
"reconciliation: leaf-page overflow keys",
"reconciliation: overflow values written",
"reconciliation: pages deleted",
+ "reconciliation: fast-path pages deleted",
"reconciliation: page checksum matches",
"reconciliation: page reconciliation calls",
"reconciliation: page reconciliation calls for eviction",
@@ -101,10 +104,12 @@ static const char * const __stats_dsrc_desc[] = {
"transaction: update conflicts",
};
-const char *
-__wt_stat_dsrc_desc(int slot)
+int
+__wt_stat_dsrc_desc(WT_CURSOR_STAT *cst, int slot, const char **p)
{
- return (__stats_dsrc_desc[slot]);
+ WT_UNUSED(cst);
+ *p = __stats_dsrc_desc[slot];
+ return (0);
}
void
@@ -163,6 +168,8 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->cache_inmem_splittable = 0;
stats->cache_inmem_split = 0;
stats->cache_eviction_internal = 0;
+ stats->cache_eviction_split_internal = 0;
+ stats->cache_eviction_split_leaf = 0;
stats->cache_eviction_dirty = 0;
stats->cache_read_overflow = 0;
stats->cache_overflow_value = 0;
@@ -170,7 +177,6 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->cache_write_lookaside = 0;
stats->cache_read = 0;
stats->cache_read_lookaside = 0;
- stats->cache_eviction_split = 0;
stats->cache_write = 0;
stats->cache_write_restore = 0;
stats->cache_eviction_clean = 0;
@@ -194,6 +200,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->cursor_restart = 0;
stats->cursor_search = 0;
stats->cursor_search_near = 0;
+ stats->cursor_truncate = 0;
stats->cursor_update = 0;
stats->bloom_false_positive = 0;
stats->bloom_hit = 0;
@@ -208,6 +215,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->lsm_merge_throttle = 0;
stats->bloom_size = 0;
stats->rec_dictionary = 0;
+ stats->rec_page_delete_fast = 0;
stats->rec_suffix_compression = 0;
stats->rec_multiblock_internal = 0;
stats->rec_overflow_key_internal = 0;
@@ -280,6 +288,9 @@ __wt_stat_dsrc_aggregate_single(
to->cache_inmem_splittable += from->cache_inmem_splittable;
to->cache_inmem_split += from->cache_inmem_split;
to->cache_eviction_internal += from->cache_eviction_internal;
+ to->cache_eviction_split_internal +=
+ from->cache_eviction_split_internal;
+ to->cache_eviction_split_leaf += from->cache_eviction_split_leaf;
to->cache_eviction_dirty += from->cache_eviction_dirty;
to->cache_read_overflow += from->cache_read_overflow;
to->cache_overflow_value += from->cache_overflow_value;
@@ -287,7 +298,6 @@ __wt_stat_dsrc_aggregate_single(
to->cache_write_lookaside += from->cache_write_lookaside;
to->cache_read += from->cache_read;
to->cache_read_lookaside += from->cache_read_lookaside;
- to->cache_eviction_split += from->cache_eviction_split;
to->cache_write += from->cache_write;
to->cache_write_restore += from->cache_write_restore;
to->cache_eviction_clean += from->cache_eviction_clean;
@@ -311,6 +321,7 @@ __wt_stat_dsrc_aggregate_single(
to->cursor_restart += from->cursor_restart;
to->cursor_search += from->cursor_search;
to->cursor_search_near += from->cursor_search_near;
+ to->cursor_truncate += from->cursor_truncate;
to->cursor_update += from->cursor_update;
to->bloom_false_positive += from->bloom_false_positive;
to->bloom_hit += from->bloom_hit;
@@ -326,6 +337,7 @@ __wt_stat_dsrc_aggregate_single(
to->lsm_merge_throttle += from->lsm_merge_throttle;
to->bloom_size += from->bloom_size;
to->rec_dictionary += from->rec_dictionary;
+ to->rec_page_delete_fast += from->rec_page_delete_fast;
to->rec_suffix_compression += from->rec_suffix_compression;
to->rec_multiblock_internal += from->rec_multiblock_internal;
to->rec_overflow_key_internal += from->rec_overflow_key_internal;
@@ -407,6 +419,10 @@ __wt_stat_dsrc_aggregate(
to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split);
to->cache_eviction_internal +=
WT_STAT_READ(from, cache_eviction_internal);
+ to->cache_eviction_split_internal +=
+ WT_STAT_READ(from, cache_eviction_split_internal);
+ to->cache_eviction_split_leaf +=
+ WT_STAT_READ(from, cache_eviction_split_leaf);
to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty);
to->cache_read_overflow += WT_STAT_READ(from, cache_read_overflow);
to->cache_overflow_value += WT_STAT_READ(from, cache_overflow_value);
@@ -416,7 +432,6 @@ __wt_stat_dsrc_aggregate(
WT_STAT_READ(from, cache_write_lookaside);
to->cache_read += WT_STAT_READ(from, cache_read);
to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
- to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split);
to->cache_write += WT_STAT_READ(from, cache_write);
to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean);
@@ -442,6 +457,7 @@ __wt_stat_dsrc_aggregate(
to->cursor_restart += WT_STAT_READ(from, cursor_restart);
to->cursor_search += WT_STAT_READ(from, cursor_search);
to->cursor_search_near += WT_STAT_READ(from, cursor_search_near);
+ to->cursor_truncate += WT_STAT_READ(from, cursor_truncate);
to->cursor_update += WT_STAT_READ(from, cursor_update);
to->bloom_false_positive += WT_STAT_READ(from, bloom_false_positive);
to->bloom_hit += WT_STAT_READ(from, bloom_hit);
@@ -459,6 +475,7 @@ __wt_stat_dsrc_aggregate(
to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle);
to->bloom_size += WT_STAT_READ(from, bloom_size);
to->rec_dictionary += WT_STAT_READ(from, rec_dictionary);
+ to->rec_page_delete_fast += WT_STAT_READ(from, rec_page_delete_fast);
to->rec_suffix_compression +=
WT_STAT_READ(from, rec_suffix_compression);
to->rec_multiblock_internal +=
@@ -529,7 +546,8 @@ static const char * const __stats_connection_desc[] = {
"cache: eviction server evicting pages",
"cache: eviction server populating queue, but not evicting pages",
"cache: eviction server unable to reach eviction goal",
- "cache: pages split during eviction",
+ "cache: internal pages split during eviction",
+ "cache: leaf pages split during eviction",
"cache: pages walked for eviction",
"cache: eviction worker thread evicting pages",
"cache: in-memory page splits",
@@ -554,6 +572,7 @@ static const char * const __stats_connection_desc[] = {
"cursor: cursor restarted searches",
"cursor: cursor search calls",
"cursor: cursor search near calls",
+ "cursor: truncate calls",
"cursor: cursor update calls",
"data-handle: connection data handles currently active",
"data-handle: session dhandles swept",
@@ -615,6 +634,8 @@ static const char * const __stats_connection_desc[] = {
"thread-yield: page acquire read blocked",
"thread-yield: page acquire time sleeping (usecs)",
"connection: total read I/Os",
+ "reconciliation: pages deleted",
+ "reconciliation: fast-path pages deleted",
"reconciliation: page reconciliation calls",
"reconciliation: page reconciliation calls for eviction",
"reconciliation: split bytes currently awaiting free",
@@ -635,15 +656,20 @@ static const char * const __stats_connection_desc[] = {
"transaction: transaction failures due to cache overflow",
"transaction: transaction range of IDs currently pinned by a checkpoint",
"transaction: transaction range of IDs currently pinned",
+ "transaction: transaction range of IDs currently pinned by named snapshots",
"transaction: transactions rolled back",
+ "transaction: number of named snapshots created",
+ "transaction: number of named snapshots dropped",
"transaction: transaction sync calls",
"connection: total write I/Os",
};
-const char *
-__wt_stat_connection_desc(int slot)
+int
+__wt_stat_connection_desc(WT_CURSOR_STAT *cst, int slot, const char **p)
{
- return (__stats_connection_desc[slot]);
+ WT_UNUSED(cst);
+ *p = __stats_connection_desc[slot];
+ return (0);
}
void
@@ -701,6 +727,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cache_inmem_splittable = 0;
stats->cache_inmem_split = 0;
stats->cache_eviction_internal = 0;
+ stats->cache_eviction_split_internal = 0;
+ stats->cache_eviction_split_leaf = 0;
stats->cache_lookaside_insert = 0;
stats->cache_lookaside_remove = 0;
/* not clearing cache_bytes_max */
@@ -715,7 +743,6 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cache_read = 0;
stats->cache_read_lookaside = 0;
stats->cache_eviction_fail = 0;
- stats->cache_eviction_split = 0;
stats->cache_eviction_walk = 0;
stats->cache_write = 0;
stats->cache_write_restore = 0;
@@ -745,6 +772,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cursor_search = 0;
stats->cursor_search_near = 0;
stats->cursor_update = 0;
+ stats->cursor_truncate = 0;
/* not clearing dh_conn_handle_count */
stats->dh_sweep_ref = 0;
stats->dh_sweep_close = 0;
@@ -795,8 +823,10 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->lsm_work_units_done = 0;
stats->lsm_work_units_created = 0;
stats->lsm_work_queue_max = 0;
+ stats->rec_page_delete_fast = 0;
stats->rec_pages = 0;
stats->rec_pages_eviction = 0;
+ stats->rec_page_delete = 0;
/* not clearing rec_split_stashed_bytes */
/* not clearing rec_split_stashed_objects */
/* not clearing session_cursor_open */
@@ -806,6 +836,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->page_locked_blocked = 0;
stats->page_read_blocked = 0;
stats->page_sleep = 0;
+ stats->txn_snapshots_created = 0;
+ stats->txn_snapshots_dropped = 0;
stats->txn_begin = 0;
/* not clearing txn_checkpoint_running */
/* not clearing txn_checkpoint_generation */
@@ -817,6 +849,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->txn_fail_cache = 0;
/* not clearing txn_pinned_range */
/* not clearing txn_pinned_checkpoint_range */
+ /* not clearing txn_pinned_snapshot_range */
stats->txn_sync = 0;
stats->txn_commit = 0;
stats->txn_rollback = 0;
@@ -880,6 +913,10 @@ __wt_stat_connection_aggregate(
to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split);
to->cache_eviction_internal +=
WT_STAT_READ(from, cache_eviction_internal);
+ to->cache_eviction_split_internal +=
+ WT_STAT_READ(from, cache_eviction_split_internal);
+ to->cache_eviction_split_leaf +=
+ WT_STAT_READ(from, cache_eviction_split_leaf);
to->cache_lookaside_insert +=
WT_STAT_READ(from, cache_lookaside_insert);
to->cache_lookaside_remove +=
@@ -900,7 +937,6 @@ __wt_stat_connection_aggregate(
to->cache_read += WT_STAT_READ(from, cache_read);
to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail);
- to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split);
to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk);
to->cache_write += WT_STAT_READ(from, cache_write);
to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
@@ -930,6 +966,7 @@ __wt_stat_connection_aggregate(
to->cursor_search += WT_STAT_READ(from, cursor_search);
to->cursor_search_near += WT_STAT_READ(from, cursor_search_near);
to->cursor_update += WT_STAT_READ(from, cursor_update);
+ to->cursor_truncate += WT_STAT_READ(from, cursor_truncate);
to->dh_conn_handle_count += WT_STAT_READ(from, dh_conn_handle_count);
to->dh_sweep_ref += WT_STAT_READ(from, dh_sweep_ref);
to->dh_sweep_close += WT_STAT_READ(from, dh_sweep_close);
@@ -988,8 +1025,10 @@ __wt_stat_connection_aggregate(
to->lsm_work_units_created +=
WT_STAT_READ(from, lsm_work_units_created);
to->lsm_work_queue_max += WT_STAT_READ(from, lsm_work_queue_max);
+ to->rec_page_delete_fast += WT_STAT_READ(from, rec_page_delete_fast);
to->rec_pages += WT_STAT_READ(from, rec_pages);
to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction);
+ to->rec_page_delete += WT_STAT_READ(from, rec_page_delete);
to->rec_split_stashed_bytes +=
WT_STAT_READ(from, rec_split_stashed_bytes);
to->rec_split_stashed_objects +=
@@ -1002,6 +1041,10 @@ __wt_stat_connection_aggregate(
to->page_locked_blocked += WT_STAT_READ(from, page_locked_blocked);
to->page_read_blocked += WT_STAT_READ(from, page_read_blocked);
to->page_sleep += WT_STAT_READ(from, page_sleep);
+ to->txn_snapshots_created +=
+ WT_STAT_READ(from, txn_snapshots_created);
+ to->txn_snapshots_dropped +=
+ WT_STAT_READ(from, txn_snapshots_dropped);
to->txn_begin += WT_STAT_READ(from, txn_begin);
to->txn_checkpoint_running +=
WT_STAT_READ(from, txn_checkpoint_running);
@@ -1020,7 +1063,55 @@ __wt_stat_connection_aggregate(
to->txn_pinned_range += WT_STAT_READ(from, txn_pinned_range);
to->txn_pinned_checkpoint_range +=
WT_STAT_READ(from, txn_pinned_checkpoint_range);
+ to->txn_pinned_snapshot_range +=
+ WT_STAT_READ(from, txn_pinned_snapshot_range);
to->txn_sync += WT_STAT_READ(from, txn_sync);
to->txn_commit += WT_STAT_READ(from, txn_commit);
to->txn_rollback += WT_STAT_READ(from, txn_rollback);
}
+
+/*
+ * Description strings for the join statistics, one per slot.  Each begins
+ * with ": "; presumably a caller prepends a name -- confirm at the call site.
+ */
+static const char * const __stats_join_desc[] = {
+	": accesses",
+	": actual count of items",
+	": bloom filter false positives",
+};
+
+/*
+ * __wt_stat_join_desc --
+ *	Store the description string for a join statistics slot in *p and
+ *	return 0.  The statistics cursor argument is unused, and slot is not
+ *	range-checked here.
+ */
+int
+__wt_stat_join_desc(WT_CURSOR_STAT *cst, int slot, const char **p)
+{
+	const char *desc;
+
+	WT_UNUSED(cst);
+
+	desc = __stats_join_desc[slot];
+	*p = desc;
+	return (0);
+}
+
+/*
+ * __wt_stat_join_init_single --
+ *	Zero-fill a single join statistics structure.
+ */
+void
+__wt_stat_join_init_single(WT_JOIN_STATS *stats)
+{
+	memset(stats, 0, sizeof(WT_JOIN_STATS));
+}
+
+/*
+ * __wt_stat_join_clear_single --
+ *	Reset the three counters of one join statistics structure to zero.
+ */
+void
+__wt_stat_join_clear_single(WT_JOIN_STATS *stats)
+{
+	stats->bloom_false_positive = 0;
+	stats->actual_count = 0;
+	stats->accesses = 0;
+}
+
+/*
+ * __wt_stat_join_clear_all --
+ *	Clear each of the WT_COUNTER_SLOTS join statistics structures in the
+ *	array.
+ */
+void
+__wt_stat_join_clear_all(WT_JOIN_STATS **stats)
+{
+	u_int slot;
+
+	slot = 0;
+	while (slot < WT_COUNTER_SLOTS) {
+		__wt_stat_join_clear_single(stats[slot]);
+		++slot;
+	}
+}
+
+/*
+ * __wt_stat_join_aggregate --
+ *	Add the value WT_STAT_READ yields for each join counter in "from" to
+ *	the matching field of "to".
+ */
+void
+__wt_stat_join_aggregate(
+    WT_JOIN_STATS **from, WT_JOIN_STATS *to)
+{
+	to->bloom_false_positive =
+	    to->bloom_false_positive + WT_STAT_READ(from, bloom_false_positive);
+	to->actual_count =
+	    to->actual_count + WT_STAT_READ(from, actual_count);
+	to->accesses =
+	    to->accesses + WT_STAT_READ(from, accesses);
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index a37fa3555b0..2079410a4d1 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -373,8 +373,11 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
*/
F_SET(txn, WT_TXN_SYNC_SET);
+ /*
+ * If sync is turned off explicitly, clear the transaction's sync field.
+ */
if (cval.val == 0)
- FLD_CLR(txn->txn_logsync, WT_LOG_FLUSH);
+ txn->txn_logsync = 0;
WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval));
if (cval.len > 0)
@@ -481,7 +484,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
* explicit setting.
*/
if (cval.len == 0) {
- if (!FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH) &&
+ if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
!F_ISSET(txn, WT_TXN_SYNC_SET))
txn->txn_logsync = 0;
} else {
@@ -650,16 +653,21 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
WT_TXN_GLOBAL *txn_global;
WT_CONNECTION_IMPL *conn;
WT_CONNECTION_STATS **stats;
- uint64_t checkpoint_pinned;
+ uint64_t checkpoint_pinned, snapshot_pinned;
conn = S2C(session);
txn_global = &conn->txn_global;
stats = conn->stats;
checkpoint_pinned = txn_global->checkpoint_pinned;
+ snapshot_pinned = txn_global->nsnap_oldest_id;
WT_STAT_SET(session, stats, txn_pinned_range,
txn_global->current - txn_global->oldest_id);
+ WT_STAT_SET(session, stats, txn_pinned_snapshot_range,
+ snapshot_pinned == WT_TXN_NONE ?
+ 0 : txn_global->current - snapshot_pinned);
+
WT_STAT_SET(session, stats, txn_pinned_checkpoint_range,
checkpoint_pinned == WT_TXN_NONE ?
0 : txn_global->current - checkpoint_pinned);
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 066abc9ed0f..bc1537ca878 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -297,7 +297,7 @@ __checkpoint_stats(
/*
* Get time diff in milliseconds.
*/
- msec = WT_TIMEDIFF(*stop, *start) / WT_MILLION;
+ msec = WT_TIMEDIFF_MS(*stop, *start);
if (msec > conn->ckpt_time_max)
conn->ckpt_time_max = msec;
@@ -327,7 +327,7 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session,
/*
* Get time diff in milliseconds.
*/
- msec = WT_TIMEDIFF(stop, *start) / WT_MILLION;
+ msec = WT_TIMEDIFF_MS(stop, *start);
WT_RET(__wt_verbose(session,
WT_VERB_CHECKPOINT, "time: %" PRIu64 " us, gen: %" PRIu64
": Full database checkpoint %s",
diff --git a/src/third_party/wiredtiger/src/txn/txn_nsnap.c b/src/third_party/wiredtiger/src/txn/txn_nsnap.c
index a5ec9cb9b82..169929a46de 100644
--- a/src/third_party/wiredtiger/src/txn/txn_nsnap.c
+++ b/src/third_party/wiredtiger/src/txn/txn_nsnap.c
@@ -47,6 +47,7 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name)
TAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE;
TAILQ_REMOVE(&txn_global->nsnaph, found, q);
__nsnap_destroy(session, found);
+ WT_STAT_FAST_CONN_INCR(session, txn_snapshots_dropped);
return (ret);
}
@@ -111,6 +112,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive)
WT_ASSERT(session, nsnap != NULL);
TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q);
__nsnap_destroy(session, nsnap);
+ WT_STAT_FAST_CONN_INCR(session, txn_snapshots_dropped);
/* Last will be NULL in the all case so it will never match */
} while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph));
@@ -176,6 +178,7 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[])
if (TAILQ_EMPTY(&txn_global->nsnaph))
txn_global->nsnap_oldest_id = nsnap_new->snap_min;
TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q);
+ WT_STAT_FAST_CONN_INCR(session, txn_snapshots_created);
nsnap_new = NULL;
err: if (started_txn)
diff --git a/src/third_party/wiredtiger/tools/wtstats/stat_data.py b/src/third_party/wiredtiger/tools/wtstats/stat_data.py
index f2f193c0860..7cee87e49ed 100644
--- a/src/third_party/wiredtiger/tools/wtstats/stat_data.py
+++ b/src/third_party/wiredtiger/tools/wtstats/stat_data.py
@@ -32,6 +32,7 @@ no_scale_per_second_list = [
'transaction: transaction checkpoint total time (msecs)',
'transaction: transaction range of IDs currently pinned',
'transaction: transaction range of IDs currently pinned by a checkpoint',
+ 'transaction: transaction range of IDs currently pinned by named snapshots',
'block-manager: checkpoint size',
'block-manager: file allocation unit size',
'block-manager: file magic number',
@@ -95,6 +96,7 @@ no_clear_list = [
'transaction: transaction checkpoint total time (msecs)',
'transaction: transaction range of IDs currently pinned',
'transaction: transaction range of IDs currently pinned by a checkpoint',
+ 'transaction: transaction range of IDs currently pinned by named snapshots',
'btree: btree checkpoint generation',
'session: open cursor count',
]