diff options
author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2016-10-24 15:26:37 +1100 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2016-10-24 15:28:10 +1100 |
commit | 0609d0ce2ef563d7a4cde77d46396fe5c92c6df1 (patch) | |
tree | 1fc47f23e3cfd91c91182468dce8fee02082b49b /src/third_party/wiredtiger | |
parent | ab1ee41ecf1c96ae8b17a2b1da1c7ee9b8c58676 (diff) | |
download | mongo-0609d0ce2ef563d7a4cde77d46396fe5c92c6df1.tar.gz |
Import wiredtiger: ef9a7983ea47cea78400a4472a3d4e46735385c5 from branch mongodb-3.4
ref: 6a31c2118c..ef9a7983ea
for: 3.4.0-rc2
WT-1592 Add ability to dump detailed cache information via statistics
WT-2403 Enhance random cursor implementation for LSM trees
WT-2880 Add support for Zstandard compression
WT-2904 Fix a bug where the reported checkpoint size could be many times data size
WT-2949 Add an option to wtperf to not close connection on shutdown
WT-2954 Inserting multi-megabyte values can cause large in-memory pages
WT-2955 Add statistics tracking the amount of time threads spend waiting for high level locks
WT-2956 utility tests -h option is always overridden by the default setup
WT-2959 Ensure WT_SESSION_IMPL is never used before it's initialized
WT-2963 Race setting max_entries during eviction
WT-2965 test_wt2323_join_visibility can hang on OSX
WT-2974 lint
WT-2976 Add a statistic tracking how long application threads spend doing I/O
WT-2977 Csuite LSM Random test can occasionally fail
WT-2985 Race during checkpoint can cause a core dump
WT-2987 Fix a bug where opening a cursor on an incomplete table drops core
WT-2988 Fix a bug where __wt_epoch potentially returns garbage values.
Diffstat (limited to 'src/third_party/wiredtiger')
114 files changed, 3155 insertions, 970 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-50r50u.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-50r50u.wtperf index 06745bf7cca..536127f0dd8 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-50r50u.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-50r50u.wtperf @@ -10,6 +10,9 @@ create=false compression="snappy" sess_config="isolation=snapshot" table_count=2 +# close_conn as false allows this test to close/finish faster, but if running +# as the set, the next test will need to run recovery. +close_conn=false key_sz=40 value_sz=120 max_latency=2000 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf index 77edbfb4941..d6218c44af0 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf @@ -8,6 +8,9 @@ conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=4)" create=false compression="snappy" +# close_conn as false allows this test to close/finish faster, but if running +# as the set, the next test will need to run recovery. 
+close_conn=false sess_config="isolation=snapshot table_count=2 key_sz=40 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/checkpoint-stress.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/checkpoint-stress.wtperf index 0c98a0c2db0..bbd3a3ba5ed 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/checkpoint-stress.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/checkpoint-stress.wtperf @@ -4,6 +4,7 @@ conn_config="cache_size=16GB,eviction=(threads_max=4),log=(enabled=false)" table_config="leaf_page_max=32k,internal_page_max=16k,allocation_size=4k,split_pct=90,type=file" # Enough data to fill the cache. 150 million 1k records results in two ~11GB # tables +close_conn=false icount=150000000 create=true compression="snappy" diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress-multi.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress-multi.wtperf index 9699b9ae3bb..a5a29f66fa0 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress-multi.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-stress-multi.wtperf @@ -1,6 +1,7 @@ conn_config="cache_size=1G,eviction=(threads_max=4),session_max=2000" table_config="type=file" table_count=100 +close_conn=false icount=100000000 report_interval=5 run_time=600 diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c index a7618b19707..8c7f0053388 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c @@ -2078,6 +2078,11 @@ config_compress(WTPERF *wtperf) wtperf->compress_ext = ZLIB_EXT; #endif wtperf->compress_table = ZLIB_BLK; + } else if (strcmp(s, "zstd") == 0) { +#ifndef HAVE_BUILTIN_EXTENSION_ZSTD + wtperf->compress_ext = ZSTD_EXT; +#endif + wtperf->compress_table = ZSTD_BLK; } else { fprintf(stderr, "invalid compression configuration: %s\n", s); @@ -2300,7 +2305,7 @@ err: if 
(ret == 0) ret = t_ret; } - if (wtperf->conn != NULL && + if (wtperf->conn != NULL && opts->close_conn && (t_ret = wtperf->conn->close(wtperf->conn, NULL)) != 0) { lprintf(wtperf, t_ret, 0, "Error closing connection to %s", wtperf->home); @@ -2329,7 +2334,6 @@ err: if (ret == 0) extern int __wt_optind, __wt_optreset; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; /* * usage -- diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.h b/src/third_party/wiredtiger/bench/wtperf/wtperf.h index afce017d919..81d74e134f6 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.h +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.h @@ -54,6 +54,9 @@ typedef struct __truncate_queue_entry TRUNCATE_QUEUE_ENTRY; #define ZLIB_BLK BLKCMP_PFX "zlib" #define ZLIB_EXT \ EXT_PFX EXTPATH "zlib/.libs/libwiredtiger_zlib.so" EXT_SFX +#define ZSTD_BLK BLKCMP_PFX "zstd" +#define ZSTD_EXT \ + EXT_PFX EXTPATH "zstd/.libs/libwiredtiger_zstd.so" EXT_SFX typedef struct { int64_t threads; /* Thread count */ diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i index 5a632f26faa..680eb53a90e 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i @@ -94,10 +94,13 @@ DEF_OPT_AS_UINT32(checkpoint_stress_rate, 0, DEF_OPT_AS_UINT32(checkpoint_threads, 0, "number of checkpoint threads") DEF_OPT_AS_CONFIG_STRING(conn_config, "create", "connection configuration string") +DEF_OPT_AS_BOOL(close_conn, 1, "properly close connection at end of test. " + "Setting to false does not sync data to disk and can result in lost " + "data after test exits.") DEF_OPT_AS_BOOL(compact, 0, "post-populate compact for LSM merging activity") DEF_OPT_AS_STRING(compression, "none", "compression extension. 
Allowed configuration values are: " - "'none', 'lz4', 'snappy', 'zlib'") + "'none', 'lz4', 'snappy', 'zlib', 'zstd'") DEF_OPT_AS_BOOL(create, 1, "do population phase; false to use existing database") DEF_OPT_AS_UINT32(database_count, 1, diff --git a/src/third_party/wiredtiger/build_posix/Make.base b/src/third_party/wiredtiger/build_posix/Make.base index 4efbe3f76c3..5b945aca5e0 100644 --- a/src/third_party/wiredtiger/build_posix/Make.base +++ b/src/third_party/wiredtiger/build_posix/Make.base @@ -77,6 +77,9 @@ endif if HAVE_BUILTIN_EXTENSION_ZLIB libwiredtiger_la_LIBADD += ext/compressors/zlib/libwiredtiger_zlib.la endif +if HAVE_BUILTIN_EXTENSION_ZSTD +libwiredtiger_la_LIBADD += ext/compressors/zstd/libwiredtiger_zstd.la +endif libwiredtiger_static_la_LIBADD=$(libwiredtiger_la_LIBADD) libwiredtiger_static_la_SOURCES=$(libwiredtiger_la_SOURCES) diff --git a/src/third_party/wiredtiger/build_posix/Make.subdirs b/src/third_party/wiredtiger/build_posix/Make.subdirs index 0b5175e4196..55941837249 100644 --- a/src/third_party/wiredtiger/build_posix/Make.subdirs +++ b/src/third_party/wiredtiger/build_posix/Make.subdirs @@ -11,6 +11,7 @@ ext/compressors/lz4 LZ4 ext/compressors/nop ext/compressors/snappy SNAPPY ext/compressors/zlib ZLIB +ext/compressors/zstd ZSTD ext/datasources/helium HAVE_HELIUM ext/encryptors/nop ext/encryptors/rotn diff --git a/src/third_party/wiredtiger/build_posix/aclocal/options.m4 b/src/third_party/wiredtiger/build_posix/aclocal/options.m4 index 1f6a1690279..7043430a6d6 100644 --- a/src/third_party/wiredtiger/build_posix/aclocal/options.m4 +++ b/src/third_party/wiredtiger/build_posix/aclocal/options.m4 @@ -19,10 +19,12 @@ AH_TEMPLATE(HAVE_BUILTIN_EXTENSION_SNAPPY, [Snappy support automatically loaded.]) AH_TEMPLATE(HAVE_BUILTIN_EXTENSION_ZLIB, [Zlib support automatically loaded.]) +AH_TEMPLATE(HAVE_BUILTIN_EXTENSION_ZSTD, + [ZSTD support automatically loaded.]) AC_MSG_CHECKING(if --with-builtins option specified) AC_ARG_WITH(builtins, 
[AS_HELP_STRING([--with-builtins], - [builtin extension names (lz4, snappy, zlib).])], + [builtin extension names (lz4, snappy, zlib, zstd).])], [with_builtins=$withval], [with_builtins=]) @@ -36,6 +38,8 @@ for builtin_i in $builtin_list; do wt_cv_with_builtin_extension_snappy=yes;; zlib) AC_DEFINE(HAVE_BUILTIN_EXTENSION_ZLIB) wt_cv_with_builtin_extension_zlib=yes;; + zstd) AC_DEFINE(HAVE_BUILTIN_EXTENSION_ZSTD) + wt_cv_with_builtin_extension_zstd=yes;; *) AC_MSG_ERROR([Unknown builtin extension "$builtin_i"]);; esac done @@ -45,6 +49,8 @@ AM_CONDITIONAL([HAVE_BUILTIN_EXTENSION_SNAPPY], [test "$wt_cv_with_builtin_extension_snappy" = "yes"]) AM_CONDITIONAL([HAVE_BUILTIN_EXTENSION_ZLIB], [test "$wt_cv_with_builtin_extension_zlib" = "yes"]) +AM_CONDITIONAL([HAVE_BUILTIN_EXTENSION_ZSTD], + [test "$wt_cv_with_builtin_extension_zstd" = "yes"]) AC_MSG_RESULT($with_builtins) AH_TEMPLATE( @@ -276,4 +282,30 @@ if test "$wt_cv_enable_zlib" = "yes"; then fi AM_CONDITIONAL([ZLIB], [test "$wt_cv_enable_zlib" = "yes"]) +AC_MSG_CHECKING(if --enable-zstd option specified) +AC_ARG_ENABLE(zstd, + [AS_HELP_STRING([--enable-zstd], + [Build the zstd compressor extension.])], r=$enableval, r=no) +case "$r" in +no) if test "$wt_cv_with_builtin_extension_zstd" = "yes"; then + wt_cv_enable_zstd=yes + else + wt_cv_enable_zstd=no + fi + ;; +*) if test "$wt_cv_with_builtin_extension_zstd" = "yes"; then + AC_MSG_ERROR( + [Only one of --enable-zstd --with-builtins=zstd allowed]) + fi + wt_cv_enable_zstd=yes;; +esac +AC_MSG_RESULT($wt_cv_enable_zstd) +if test "$wt_cv_enable_zstd" = "yes"; then + AC_CHECK_HEADER(zstd.h,, + [AC_MSG_ERROR([--enable-zstd requires zstd.h])]) + AC_CHECK_LIB(zstd, ZSTD_compress,, + [AC_MSG_ERROR([--enable-zstd requires Zstd library])]) +fi +AM_CONDITIONAL([ZSTD], [test "$wt_cv_enable_zstd" = "yes"]) + ]) diff --git a/src/third_party/wiredtiger/build_win/wiredtiger_config.h b/src/third_party/wiredtiger/build_win/wiredtiger_config.h index 83ddc6eb194..78d2784cb70 100644 
--- a/src/third_party/wiredtiger/build_win/wiredtiger_config.h +++ b/src/third_party/wiredtiger/build_win/wiredtiger_config.h @@ -19,6 +19,9 @@ /* Zlib support automatically loaded. */ /* #undef HAVE_BUILTIN_EXTENSION_ZLIB */ +/* ZSTD support automatically loaded. */ +/* #undef HAVE_BUILTIN_EXTENSION_ZSTD */ + /* Define to 1 if you have the `clock_gettime' function. */ /* #undef HAVE_CLOCK_GETTIME */ @@ -70,6 +73,9 @@ /* Define to 1 if you have the `z' library (-lz). */ /* #undef HAVE_LIBZ */ +/* Define to 1 if you have the `zstd' library (-lzstd). */ +/* #undef HAVE_LIBZSTD */ + /* Define to 1 if you have the <memory.h> header file. */ /* #undef HAVE_MEMORY_H */ diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index d02d7e4b985..7affc58a217 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -136,8 +136,8 @@ file_config = format_meta + [ configure a compressor for file blocks. Permitted values are \c "none" or custom compression engine name created with WT_CONNECTION::add_compressor. If WiredTiger has builtin support for - \c "snappy", \c "lz4" or \c "zlib" compression, these names are also - available. See @ref compression for more information'''), + \c "lz4", \c "snappy", \c "zlib" or \c "zstd" compression, these names + are also available. See @ref compression for more information'''), Config('cache_resident', 'false', r''' do not ever evict the object's pages from cache. Not compatible with LSM tables; see @ref tuning_cache_resident for more information''', @@ -502,7 +502,8 @@ connection_runtime_config = [ is used to gather statistics, as well as each time statistics are logged using the \c statistics_log configuration. 
See @ref statistics for more information''', - type='list', choices=['all', 'fast', 'none', 'clear']), + type='list', + choices=['all', 'cache_walk', 'fast', 'none', 'clear', 'tree_walk']), Config('verbose', '', r''' enable messages for various events. Only available if WiredTiger is configured with --enable-verbose. Options are given as a @@ -569,8 +570,9 @@ wiredtiger_open_log_configuration = [ configure a compressor for log records. Permitted values are \c "none" or custom compression engine name created with WT_CONNECTION::add_compressor. If WiredTiger has builtin support - for \c "snappy", \c "lz4" or \c "zlib" compression, these names - are also available. See @ref compression for more information'''), + for \c "lz4", \c "snappy", \c "zlib" or \c "zstd" compression, + these names are also available. See @ref compression for more + information'''), Config('file_max', '100MB', r''' the maximum size of log files''', min='100KB', max='2GB'), @@ -976,7 +978,8 @@ methods = { gathering them, where appropriate (for example, a cache size statistic is not cleared, while the count of cursor insert operations will be cleared). 
See @ref statistics for more information''', - type='list', choices=['all', 'fast', 'clear', 'size']), + type='list', + choices=['all', 'cache_walk', 'fast', 'clear', 'size', 'tree_walk']), Config('target', '', r''' if non-empty, backup the list of objects; valid only for a backup data source''', diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 32e4231c5f2..fe9a17b7799 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -90,6 +90,7 @@ src/cursor/cur_table.c src/evict/evict_file.c src/evict/evict_lru.c src/evict/evict_page.c +src/evict/evict_stat.c src/log/log.c src/log/log_auto.c src/log/log_slot.c diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index 93b6e0cbbf4..e200f95fba6 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -133,6 +133,16 @@ flags = { 'SESSION_QUIET_CORRUPT_FILE', 'SESSION_SERVER_ASYNC', ], + 'stat' : [ + 'STAT_CLEAR', + 'STAT_JSON', + 'STAT_ON_CLOSE', + 'STAT_TYPE_ALL', + 'STAT_TYPE_CACHE_WALK', + 'STAT_TYPE_FAST', + 'STAT_TYPE_SIZE', + 'STAT_TYPE_TREE_WALK', + ], } flag_cnt = {} # Dictionary [flag] : [reference count] diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index 6a1a32004ea..7b11d665de5 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -39,14 +39,18 @@ WT_READ_BARRIER WT_REF_SIZE WT_SESSION_LOCKED_CHECKPOINT WT_SESSION_LOCKED_TURTLE -WT_STATS_FIELD_TO_SLOT +WT_STATS_FIELD_TO_OFFSET WT_STATS_SLOT_ID WT_STAT_CONN_DECRV WT_STAT_DATA_DECRV WT_STAT_DECR WT_STAT_DECRV WT_STAT_DECRV_ATOMIC +WT_STAT_DECRV_ATOMIC_BASE +WT_STAT_DECRV_BASE WT_STAT_INCRV_ATOMIC +WT_STAT_INCRV_ATOMIC_BASE +WT_STAT_INCRV_BASE WT_STAT_WRITE WT_TIMEDIFF_US WT_TRET_ERROR_OK diff --git a/src/third_party/wiredtiger/dist/s_export 
b/src/third_party/wiredtiger/dist/s_export index dc69238b270..b8e42c970f9 100755 --- a/src/third_party/wiredtiger/dist/s_export +++ b/src/third_party/wiredtiger/dist/s_export @@ -26,7 +26,7 @@ check() sort | uniq -u | egrep -v \ - 'zlib_extension_init|lz4_extension_init|snappy_extension_init' > $t + 'lz4_extension_init|snappy_extension_init|zlib_extension_init|zstd_extension_init' > $t test -s $t && { echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" diff --git a/src/third_party/wiredtiger/dist/s_stat b/src/third_party/wiredtiger/dist/s_stat index 935c7e1fb43..5d5937e1833 100755 --- a/src/third_party/wiredtiger/dist/s_stat +++ b/src/third_party/wiredtiger/dist/s_stat @@ -20,6 +20,25 @@ search=`sed \ -e d ../src/include/stat.h | sort` +# There are some fields that are used, but we can't detect it. +cat << UNUSED_STAT_FIELDS +lock_checkpoint_count +lock_checkpoint_wait_application +lock_checkpoint_wait_internal +lock_handle_list_count +lock_handle_list_wait_application +lock_handle_list_wait_internal +lock_metadata_count +lock_metadata_wait_application +lock_metadata_wait_internal +lock_schema_count +lock_schema_wait_application +lock_schema_wait_internal +lock_table_count +lock_table_wait_application +lock_table_wait_internal +UNUSED_STAT_FIELDS + echo "$search" fgrep -who "$search" $l) | sort | uniq -u > $t diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 0e99c6b9cec..7cf96aec399 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -60,6 +60,7 @@ COVERITY CPUs CRC CSV +CStream CURSORs CURSTD CallsCustDate @@ -69,6 +70,7 @@ Checksum Checksums CityHash CloseHandle +Collet Comparator Config Coverity @@ -125,6 +127,7 @@ FORALL FOREACH FS FULLFSYNC +Facebook FindClose FindFirstFile Fixup @@ -166,6 +169,7 @@ INSN INTL ISA ITEMs +ITER InitializeCriticalSectionAndSpinCount Inline Intra @@ -397,6 +401,12 @@ WriteFile Wuninitialized Wunused XP +Yann 
+ZSTD +Zlib +Zlib's +Zstd +Zstd's abcdef abcdefghijklmnopqrstuvwxyz addl @@ -418,6 +428,7 @@ argc args argv asm +assertfmt async asyncopp asyncops @@ -513,6 +524,7 @@ collatorp comparator comparep compat +compressStream concat cond conf @@ -532,6 +544,7 @@ cp cpuid crc create's +createCStream crypto cryptobad csv @@ -624,6 +637,7 @@ emp encodings encryptor encryptors +endStream endian english enqueue @@ -751,6 +765,7 @@ infeasible inflateInit infmt init +initCStream initializers initn initsize @@ -786,6 +801,7 @@ isupper isxdigit iter iteratively +iters jnr jrx json @@ -851,6 +867,7 @@ majorp malloc marshall marshalled +maxCLevel maxcpu maxdbs mbll @@ -991,12 +1008,14 @@ qdown qrrSS qsort quartile +queueable qup rN rS rb rbrace rbracket +rcursor rdonly rduppo readlock @@ -1203,6 +1222,7 @@ waitpid walk's warmup wb +wcursor wiredTiger wiredtiger workFactor @@ -1227,4 +1247,7 @@ zalloc zf zfree zlib +zlib's +zstd +zstd's zu diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void index f7bfbcc7e8e..e5e9f97c0b7 100644 --- a/src/third_party/wiredtiger/dist/s_void +++ b/src/third_party/wiredtiger/dist/s_void @@ -96,10 +96,13 @@ func_ok() -e '/int wiredtiger_extension_init$/d' \ -e '/int wiredtiger_extension_terminate$/d' \ -e '/int wiredtiger_pack_close$/d' \ - -e '/int wt_snappy_pre_size$/d' \ - -e '/int wt_snappy_terminate$/d' \ + -e '/int snappy_pre_size$/d' \ + -e '/int snappy_terminate$/d' \ -e '/int zlib_error$/d' \ - -e '/int zlib_terminate$/d' + -e '/int zlib_terminate$/d' \ + -e '/int zstd_error$/d' \ + -e '/int zstd_pre_size$/d' \ + -e '/int zstd_terminate$/d' } # Complain about functions which return an "int" but which don't return except diff --git a/src/third_party/wiredtiger/dist/stat.py b/src/third_party/wiredtiger/dist/stat.py index c3c85bbe9b4..e42585c1b8c 100644 --- a/src/third_party/wiredtiger/dist/stat.py +++ b/src/third_party/wiredtiger/dist/stat.py @@ -42,8 +42,11 @@ compare_srcfile(tmp_file, 
'../src/include/stat.h') def print_defines_one(capname, base, stats): for v, l in enumerate(stats, base): desc = l.desc - if 'all_only' in l.flags: - desc += ', only reported if statistics=all is set' + if 'cache_walk' in l.flags: + desc += \ + ', only reported if cache_walk or all statistics are enabled' + if 'tree_walk' in l.flags: + desc += ', only reported if tree_walk or all statistics are enabled' if len(textwrap.wrap(desc, 70)) > 1: f.write('/*!\n') f.write(' * %s\n' % '\n * '.join(textwrap.wrap(desc, 70))) diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 8d1011b1bb3..bcf5201bd90 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -9,7 +9,8 @@ # # Data-source statistics are normally aggregated across the set of underlying # objects. Additional optional configuration flags are available: -# all_only Only gets reported when statistics=all set +# cache_walk Only reported when statistics=cache_walk is set +# tree_walk Only reported when statistics=tree_walk is set # max_aggregate Take the maximum value when aggregating statistics # no_clear Value not cleared when statistics cleared # no_scale Don't scale value per second in the logging tool script @@ -46,6 +47,11 @@ class CacheStat(Stat): prefix = 'cache' def __init__(self, name, desc, flags=''): Stat.__init__(self, name, CacheStat.prefix, desc, flags) +class CacheWalkStat(Stat): + prefix = 'cache_walk' + def __init__(self, name, desc, flags=''): + flags += ',cache_walk' + Stat.__init__(self, name, CacheWalkStat.prefix, desc, flags) class CompressStat(Stat): prefix = 'compression' def __init__(self, name, desc, flags=''): @@ -66,6 +72,10 @@ class JoinStat(Stat): prefix = '' # prefix is inserted dynamically def __init__(self, name, desc, flags=''): Stat.__init__(self, name, JoinStat.prefix, desc, flags) +class LockStat(Stat): + prefix = 'lock' + def __init__(self, name, desc, flags=''): + 
Stat.__init__(self, name, LockStat.prefix, desc, flags) class LogStat(Stat): prefix = 'log' def __init__(self, name, desc, flags=''): @@ -105,11 +115,16 @@ groups['cursor'] = [CursorStat.prefix, SessionStat.prefix] groups['evict'] = [ BlockStat.prefix, CacheStat.prefix, + CacheWalkStat.prefix, ConnStat.prefix, ThreadStat.prefix ] groups['lsm'] = [LSMStat.prefix, TxnStat.prefix] -groups['memory'] = [CacheStat.prefix, ConnStat.prefix, RecStat.prefix] +groups['memory'] = [ + CacheStat.prefix, + CacheWalkStat.prefix, + ConnStat.prefix, + RecStat.prefix] groups['system'] = [ ConnStat.prefix, DhandleStat.prefix, @@ -226,13 +241,32 @@ connection_stats = [ CacheStat('cache_pages_inuse', 'pages currently held in the cache', 'no_clear,no_scale'), CacheStat('cache_pages_requested', 'pages requested from the cache'), CacheStat('cache_read', 'pages read into cache'), + CacheStat('cache_read_app_count', 'application threads page read from disk to cache count'), + CacheStat('cache_read_app_time', 'application threads page read from disk to cache time (usecs)'), CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'), CacheStat('cache_read_overflow', 'overflow pages read into cache'), CacheStat('cache_write', 'pages written from cache'), + CacheStat('cache_write_app_count', 'application threads page write from cache to disk count'), + CacheStat('cache_write_app_time', 'application threads page write from cache to disk time (usecs)'), CacheStat('cache_write_lookaside', 'page written requiring lookaside records'), CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'), ########################################## + # Cursor operations + ########################################## + CursorStat('cursor_create', 'cursor create calls'), + CursorStat('cursor_insert', 'cursor insert calls'), + CursorStat('cursor_next', 'cursor next calls'), + CursorStat('cursor_prev', 'cursor prev calls'), + CursorStat('cursor_remove', 'cursor remove 
calls'), + CursorStat('cursor_reset', 'cursor reset calls'), + CursorStat('cursor_restart', 'cursor restarted searches'), + CursorStat('cursor_search', 'cursor search calls'), + CursorStat('cursor_search_near', 'cursor search near calls'), + CursorStat('cursor_truncate', 'truncate calls'), + CursorStat('cursor_update', 'cursor update calls'), + + ########################################## # Dhandle statistics ########################################## DhandleStat('dh_conn_handle_count', 'connection data handles currently active', 'no_clear,no_scale'), @@ -245,6 +279,25 @@ connection_stats = [ DhandleStat('dh_sweeps', 'connection sweeps'), ########################################## + # Locking statistics + ########################################## + LockStat('lock_checkpoint_count', 'checkpoint lock acquisitions'), + LockStat('lock_checkpoint_wait_application', 'checkpoint lock application thread wait time (usecs)'), + LockStat('lock_checkpoint_wait_internal', 'checkpoint lock internal thread wait time (usecs)'), + LockStat('lock_handle_list_count', 'handle-list lock acquisitions'), + LockStat('lock_handle_list_wait_application', 'handle-list lock application thread wait time (usecs)'), + LockStat('lock_handle_list_wait_internal', 'handle-list lock internal thread wait time (usecs)'), + LockStat('lock_metadata_count', 'metadata lock acquisitions'), + LockStat('lock_metadata_wait_application', 'metadata lock application thread wait time (usecs)'), + LockStat('lock_metadata_wait_internal', 'metadata lock internal thread wait time (usecs)'), + LockStat('lock_schema_count', 'schema lock acquisitions'), + LockStat('lock_schema_wait_application', 'schema lock application thread wait time (usecs)'), + LockStat('lock_schema_wait_internal', 'schema lock internal thread wait time (usecs)'), + LockStat('lock_table_count', 'table lock acquisitions'), + LockStat('lock_table_wait_application', 'table lock application thread time waiting for the table lock (usecs)'), + 
LockStat('lock_table_wait_internal', 'table lock internal thread time waiting for the table lock (usecs)'), + + ########################################## # Logging statistics ########################################## LogStat('log_buffer_size', 'total log buffer size', 'no_clear,no_scale,size'), @@ -286,42 +339,6 @@ connection_stats = [ LogStat('log_zero_fills', 'log files manually zero-filled'), ########################################## - # Reconciliation statistics - ########################################## - RecStat('rec_page_delete', 'pages deleted'), - RecStat('rec_page_delete_fast', 'fast-path pages deleted'), - RecStat('rec_pages', 'page reconciliation calls'), - RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), - RecStat('rec_split_stashed_bytes', 'split bytes currently awaiting free', 'no_clear,no_scale,size'), - RecStat('rec_split_stashed_objects', 'split objects currently awaiting free', 'no_clear,no_scale'), - - ########################################## - # Transaction statistics - ########################################## - TxnStat('txn_begin', 'transaction begins'), - TxnStat('txn_checkpoint', 'transaction checkpoints'), - TxnStat('txn_checkpoint_fsync_post', 'transaction fsync calls for checkpoint after allocating the transaction ID'), - TxnStat('txn_checkpoint_fsync_post_duration', 'transaction fsync duration for checkpoint after allocating the transaction ID (usecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_generation', 'transaction checkpoint generation', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_running', 'transaction checkpoint currently running', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_scrub_target', 'transaction checkpoint scrub dirty target', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_scrub_time', 'transaction checkpoint scrub time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_skipped', 'transaction checkpoints skipped because database was clean'), - 
TxnStat('txn_checkpoint_time_max', 'transaction checkpoint max time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_min', 'transaction checkpoint min time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_recent', 'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_checkpoint_time_total', 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'), - TxnStat('txn_commit', 'transactions committed'), - TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'), - TxnStat('txn_pinned_checkpoint_range', 'transaction range of IDs currently pinned by a checkpoint', 'no_clear,no_scale'), - TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), - TxnStat('txn_pinned_snapshot_range', 'transaction range of IDs currently pinned by named snapshots', 'no_clear,no_scale'), - TxnStat('txn_rollback', 'transactions rolled back'), - TxnStat('txn_snapshots_created', 'number of named snapshots created'), - TxnStat('txn_snapshots_dropped', 'number of named snapshots dropped'), - TxnStat('txn_sync', 'transaction sync calls'), - - ########################################## # LSM statistics ########################################## LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'), @@ -336,6 +353,16 @@ connection_stats = [ LSMStat('lsm_work_units_done', 'tree maintenance operations executed'), ########################################## + # Reconciliation statistics + ########################################## + RecStat('rec_page_delete', 'pages deleted'), + RecStat('rec_page_delete_fast', 'fast-path pages deleted'), + RecStat('rec_pages', 'page reconciliation calls'), + RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), + RecStat('rec_split_stashed_bytes', 'split bytes currently awaiting free', 'no_clear,no_scale,size'), + RecStat('rec_split_stashed_objects', 'split objects currently awaiting free', 'no_clear,no_scale'), + 
+ ########################################## # Session operations ########################################## SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'), @@ -358,21 +385,6 @@ connection_stats = [ SessionStat('session_table_verify_success', 'table verify successful calls', 'no_clear,no_scale'), ########################################## - # Total cursor operations - ########################################## - CursorStat('cursor_create', 'cursor create calls'), - CursorStat('cursor_insert', 'cursor insert calls'), - CursorStat('cursor_next', 'cursor next calls'), - CursorStat('cursor_prev', 'cursor prev calls'), - CursorStat('cursor_remove', 'cursor remove calls'), - CursorStat('cursor_reset', 'cursor reset calls'), - CursorStat('cursor_restart', 'cursor restarted searches'), - CursorStat('cursor_search', 'cursor search calls'), - CursorStat('cursor_search_near', 'cursor search near calls'), - CursorStat('cursor_truncate', 'truncate calls'), - CursorStat('cursor_update', 'cursor update calls'), - - ########################################## # Thread Count statistics ########################################## ThreadStat('thread_fsync_active', 'active filesystem fsync calls','no_clear,no_scale'), @@ -380,6 +392,32 @@ connection_stats = [ ThreadStat('thread_write_active', 'active filesystem write calls','no_clear,no_scale'), ########################################## + # Transaction statistics + ########################################## + TxnStat('txn_begin', 'transaction begins'), + TxnStat('txn_checkpoint', 'transaction checkpoints'), + TxnStat('txn_checkpoint_fsync_post', 'transaction fsync calls for checkpoint after allocating the transaction ID'), + TxnStat('txn_checkpoint_fsync_post_duration', 'transaction fsync duration for checkpoint after allocating the transaction ID (usecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_generation', 'transaction checkpoint generation', 'no_clear,no_scale'), + 
TxnStat('txn_checkpoint_running', 'transaction checkpoint currently running', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_scrub_target', 'transaction checkpoint scrub dirty target', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_scrub_time', 'transaction checkpoint scrub time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_skipped', 'transaction checkpoints skipped because database was clean'), + TxnStat('txn_checkpoint_time_max', 'transaction checkpoint max time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_time_min', 'transaction checkpoint min time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_time_recent', 'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_time_total', 'transaction checkpoint total time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_commit', 'transactions committed'), + TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'), + TxnStat('txn_pinned_checkpoint_range', 'transaction range of IDs currently pinned by a checkpoint', 'no_clear,no_scale'), + TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), + TxnStat('txn_pinned_snapshot_range', 'transaction range of IDs currently pinned by named snapshots', 'no_clear,no_scale'), + TxnStat('txn_rollback', 'transactions rolled back'), + TxnStat('txn_snapshots_created', 'number of named snapshots created'), + TxnStat('txn_snapshots_dropped', 'number of named snapshots dropped'), + TxnStat('txn_sync', 'transaction sync calls'), + + ########################################## # Yield statistics ########################################## YieldStat('application_cache_time', 'application thread time waiting for cache (usecs)'), @@ -398,41 +436,30 @@ connection_stats = sorted(connection_stats, key=attrgetter('desc')) ########################################## dsrc_stats = [ ########################################## - # Session operations - 
########################################## - SessionStat('session_compact', 'object compaction'), - SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'), - - ########################################## - # Cursor operations + # Block manager statistics ########################################## - CursorStat('cursor_create', 'create calls'), - CursorStat('cursor_insert', 'insert calls'), - CursorStat('cursor_insert_bulk', 'bulk-loaded cursor-insert calls'), - CursorStat('cursor_insert_bytes', 'cursor-insert key and value bytes inserted', 'size'), - CursorStat('cursor_next', 'next calls'), - CursorStat('cursor_prev', 'prev calls'), - CursorStat('cursor_remove', 'remove calls'), - CursorStat('cursor_remove_bytes', 'cursor-remove key bytes removed', 'size'), - CursorStat('cursor_reset', 'reset calls'), - CursorStat('cursor_restart', 'restarted searches'), - CursorStat('cursor_search', 'search calls'), - CursorStat('cursor_search_near', 'search near calls'), - CursorStat('cursor_truncate', 'truncate calls'), - CursorStat('cursor_update', 'update calls'), - CursorStat('cursor_update_bytes', 'cursor-update value bytes updated', 'size'), + BlockStat('allocation_size', 'file allocation unit size', 'max_aggregate,no_scale,size'), + BlockStat('block_alloc', 'blocks allocated'), + BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale,size'), + BlockStat('block_extension', 'allocations requiring file extension'), + BlockStat('block_free', 'blocks freed'), + BlockStat('block_magic', 'file magic number', 'max_aggregate,no_scale'), + BlockStat('block_major', 'file major version number', 'max_aggregate,no_scale'), + BlockStat('block_minor', 'minor version number', 'max_aggregate,no_scale'), + BlockStat('block_reuse_bytes', 'file bytes available for reuse', 'no_scale,size'), + BlockStat('block_size', 'file size in bytes', 'no_scale,size'), ########################################## # Btree statistics ########################################## 
BtreeStat('btree_checkpoint_generation', 'btree checkpoint generation', 'no_clear,no_scale'), - BtreeStat('btree_column_deleted', 'column-store variable-size deleted values', 'no_scale,all_only'), - BtreeStat('btree_column_fix', 'column-store fixed-size leaf pages', 'no_scale,all_only'), - BtreeStat('btree_column_internal', 'column-store internal pages', 'no_scale,all_only'), - BtreeStat('btree_column_rle', 'column-store variable-size RLE encoded values', 'no_scale,all_only'), - BtreeStat('btree_column_variable', 'column-store variable-size leaf pages', 'no_scale,all_only'), + BtreeStat('btree_column_deleted', 'column-store variable-size deleted values', 'no_scale,tree_walk'), + BtreeStat('btree_column_fix', 'column-store fixed-size leaf pages', 'no_scale,tree_walk'), + BtreeStat('btree_column_internal', 'column-store internal pages', 'no_scale,tree_walk'), + BtreeStat('btree_column_rle', 'column-store variable-size RLE encoded values', 'no_scale,tree_walk'), + BtreeStat('btree_column_variable', 'column-store variable-size leaf pages', 'no_scale,tree_walk'), BtreeStat('btree_compact_rewrite', 'pages rewritten by compaction'), - BtreeStat('btree_entries', 'number of key/value pairs', 'no_scale,all_only'), + BtreeStat('btree_entries', 'number of key/value pairs', 'no_scale,tree_walk'), BtreeStat('btree_fixed_len', 'fixed-record size', 'max_aggregate,no_scale,size'), BtreeStat('btree_maximum_depth', 'maximum tree depth', 'max_aggregate,no_scale'), BtreeStat('btree_maxintlkey', 'maximum internal page key size', 'max_aggregate,no_scale,size'), @@ -440,39 +467,9 @@ dsrc_stats = [ BtreeStat('btree_maxleafkey', 'maximum leaf page key size', 'max_aggregate,no_scale,size'), BtreeStat('btree_maxleafpage', 'maximum leaf page size', 'max_aggregate,no_scale,size'), BtreeStat('btree_maxleafvalue', 'maximum leaf page value size', 'max_aggregate,no_scale,size'), - BtreeStat('btree_overflow', 'overflow pages', 'no_scale,all_only'), - BtreeStat('btree_row_internal', 'row-store 
internal pages', 'no_scale,all_only'), - BtreeStat('btree_row_leaf', 'row-store leaf pages', 'no_scale,all_only'), - - ########################################## - # LSM statistics - ########################################## - LSMStat('bloom_count', 'bloom filters in the LSM tree', 'no_scale'), - LSMStat('bloom_false_positive', 'bloom filter false positives'), - LSMStat('bloom_hit', 'bloom filter hits'), - LSMStat('bloom_miss', 'bloom filter misses'), - LSMStat('bloom_page_evict', 'bloom filter pages evicted from cache'), - LSMStat('bloom_page_read', 'bloom filter pages read into cache'), - LSMStat('bloom_size', 'total size of bloom filters', 'no_scale,size'), - LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'), - LSMStat('lsm_chunk_count', 'chunks in the LSM tree', 'no_scale'), - LSMStat('lsm_generation_max', 'highest merge generation in the LSM tree', 'max_aggregate,no_scale'), - LSMStat('lsm_lookup_no_bloom', 'queries that could have benefited from a Bloom filter that did not exist'), - LSMStat('lsm_merge_throttle', 'sleep for LSM merge throttle'), - - ########################################## - # Block manager statistics - ########################################## - BlockStat('allocation_size', 'file allocation unit size', 'max_aggregate,no_scale,size'), - BlockStat('block_alloc', 'blocks allocated'), - BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale,size'), - BlockStat('block_extension', 'allocations requiring file extension'), - BlockStat('block_free', 'blocks freed'), - BlockStat('block_magic', 'file magic number', 'max_aggregate,no_scale'), - BlockStat('block_major', 'file major version number', 'max_aggregate,no_scale'), - BlockStat('block_minor', 'minor version number', 'max_aggregate,no_scale'), - BlockStat('block_reuse_bytes', 'file bytes available for reuse', 'no_scale,size'), - BlockStat('block_size', 'file size in bytes', 'no_scale,size'), + BtreeStat('btree_overflow', 'overflow pages', 'no_scale,tree_walk'), 
+ BtreeStat('btree_row_internal', 'row-store internal pages', 'no_scale,tree_walk'), + BtreeStat('btree_row_leaf', 'row-store leaf pages', 'no_scale,tree_walk'), ########################################## # Cache and eviction statistics @@ -501,6 +498,28 @@ dsrc_stats = [ CacheStat('cache_write_restore', 'pages written requiring in-memory restoration'), ########################################## + # Cache content statistics + ########################################## + CacheWalkStat('cache_state_avg_written_size', 'Average on-disk page image size seen', 'no_clear,no_scale'), + CacheWalkStat('cache_state_gen_avg_gap', 'Average difference between current eviction generation when the page was last considered', 'no_clear,no_scale'), + CacheWalkStat('cache_state_gen_current', 'Current eviction generation', 'no_clear,no_scale'), + CacheWalkStat('cache_state_gen_max_gap', 'Maximum difference between current eviction generation when the page was last considered', 'no_clear,no_scale'), + CacheWalkStat('cache_state_max_pagesize', 'Maximum page size seen', 'no_clear,no_scale'), + CacheWalkStat('cache_state_memory', 'Pages created in memory and never written', 'no_clear,no_scale'), + CacheWalkStat('cache_state_min_written_size', 'Minimum on-disk page image size seen', 'no_clear,no_scale'), + CacheWalkStat('cache_state_not_queueable', 'Pages that could not be queued for eviction', 'no_clear,no_scale'), + CacheWalkStat('cache_state_pages', 'Total number of pages currently in cache', 'no_clear,no_scale'), + CacheWalkStat('cache_state_pages_clean', 'Clean pages currently in cache', 'no_clear,no_scale'), + CacheWalkStat('cache_state_pages_dirty', 'Dirty pages currently in cache', 'no_clear,no_scale'), + CacheWalkStat('cache_state_pages_internal', 'Internal pages currently in cache', 'no_clear,no_scale'), + CacheWalkStat('cache_state_pages_leaf', 'Leaf pages currently in cache', 'no_clear,no_scale'), + CacheWalkStat('cache_state_queued', 'Pages currently queued for eviction', 
'no_clear,no_scale'), + CacheWalkStat('cache_state_refs_skipped', 'Refs skipped during cache traversal', 'no_clear,no_scale'), + CacheWalkStat('cache_state_root_entries', 'Entries in the root page', 'no_clear,no_scale'), + CacheWalkStat('cache_state_root_size', 'Size of the root page', 'no_clear,no_scale'), + CacheWalkStat('cache_state_smaller_alloc_size', 'On-disk page image sizes smaller than a single allocation unit', 'no_clear,no_scale'), + + ########################################## # Compression statistics ########################################## CompressStat('compress_raw_fail', 'raw compression call failed, no additional data available'), @@ -512,6 +531,41 @@ dsrc_stats = [ CompressStat('compress_write_too_small', 'page written was too small to compress'), ########################################## + # Cursor operations + ########################################## + CursorStat('cursor_create', 'create calls'), + CursorStat('cursor_insert', 'insert calls'), + CursorStat('cursor_insert_bulk', 'bulk-loaded cursor-insert calls'), + CursorStat('cursor_insert_bytes', 'cursor-insert key and value bytes inserted', 'size'), + CursorStat('cursor_next', 'next calls'), + CursorStat('cursor_prev', 'prev calls'), + CursorStat('cursor_remove', 'remove calls'), + CursorStat('cursor_remove_bytes', 'cursor-remove key bytes removed', 'size'), + CursorStat('cursor_reset', 'reset calls'), + CursorStat('cursor_restart', 'restarted searches'), + CursorStat('cursor_search', 'search calls'), + CursorStat('cursor_search_near', 'search near calls'), + CursorStat('cursor_truncate', 'truncate calls'), + CursorStat('cursor_update', 'update calls'), + CursorStat('cursor_update_bytes', 'cursor-update value bytes updated', 'size'), + + ########################################## + # LSM statistics + ########################################## + LSMStat('bloom_count', 'bloom filters in the LSM tree', 'no_scale'), + LSMStat('bloom_false_positive', 'bloom filter false positives'), + 
LSMStat('bloom_hit', 'bloom filter hits'), + LSMStat('bloom_miss', 'bloom filter misses'), + LSMStat('bloom_page_evict', 'bloom filter pages evicted from cache'), + LSMStat('bloom_page_read', 'bloom filter pages read into cache'), + LSMStat('bloom_size', 'total size of bloom filters', 'no_scale,size'), + LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'), + LSMStat('lsm_chunk_count', 'chunks in the LSM tree', 'no_scale'), + LSMStat('lsm_generation_max', 'highest merge generation in the LSM tree', 'max_aggregate,no_scale'), + LSMStat('lsm_lookup_no_bloom', 'queries that could have benefited from a Bloom filter that did not exist'), + LSMStat('lsm_merge_throttle', 'sleep for LSM merge throttle'), + + ########################################## # Reconciliation statistics ########################################## RecStat('rec_dictionary', 'dictionary matches'), @@ -530,6 +584,12 @@ dsrc_stats = [ RecStat('rec_suffix_compression', 'internal page key bytes discarded using suffix compression', 'size'), ########################################## + # Session operations + ########################################## + SessionStat('session_compact', 'object compaction'), + SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'), + + ########################################## # Transaction statistics ########################################## TxnStat('txn_update_conflict', 'update conflicts'), diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c index a2042c22bbb..ea646604a76 100644 --- a/src/third_party/wiredtiger/examples/c/ex_all.c +++ b/src/third_party/wiredtiger/examples/c/ex_all.c @@ -611,6 +611,13 @@ session_ops(WT_SESSION *session) "block_compressor=zlib,key_format=S,value_format=S"); /*! [Create a zlib compressed table] */ ret = session->drop(session, "table:mytable", NULL); + + /*! 
[Create a zstd compressed table] */ + ret = session->create(session, + "table:mytable", + "block_compressor=zstd,key_format=S,value_format=S"); + /*! [Create a zstd compressed table] */ + ret = session->drop(session, "table:mytable", NULL); #endif /*! [Configure checksums to uncompressed] */ @@ -1108,6 +1115,32 @@ main(void) if (ret == 0) (void)conn->close(conn, NULL); + /*! [Configure zlib extension with compression level] */ + ret = wiredtiger_open(home, NULL, + "create," + "extensions=[/usr/local/lib/" + "libwiredtiger_zlib.so=[config=[compression_level=3]]]", &conn); + /*! [Configure zlib extension with compression level] */ + if (ret == 0) + (void)conn->close(conn, NULL); + + /*! [Configure zstd extension] */ + ret = wiredtiger_open(home, NULL, + "create," + "extensions=[/usr/local/lib/libwiredtiger_zstd.so]", &conn); + /*! [Configure zstd extension] */ + if (ret == 0) + (void)conn->close(conn, NULL); + + /*! [Configure zstd extension with compression level] */ + ret = wiredtiger_open(home, NULL, + "create," + "extensions=[/usr/local/lib/" + "libwiredtiger_zstd.so=[config=[compression_level=9]]]", &conn); + /*! [Configure zstd extension with compression level] */ + if (ret == 0) + (void)conn->close(conn, NULL); + /* * This example code gets run, and direct I/O might not be available, * causing the open to fail. The documentation requires code snippets, diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_all.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_all.java index 83a37e9a6a5..cf8491aa4f8 100644 --- a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_all.java +++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_all.java @@ -549,6 +549,12 @@ session_ops(Session session) "block_compressor=zlib,key_format=S,value_format=S"); /*! [Create a zlib compressed table] */ ret = session.drop("table:mytable", null); + + /*! 
[Create a zstd compressed table] */ + ret = session.create("table:mytable", + "block_compressor=zstd,key_format=S,value_format=S"); + /*! [Create a zstd compressed table] */ + ret = session.drop("table:mytable", null); } // if (false) /*! [Configure checksums to uncompressed] */ @@ -942,6 +948,29 @@ allExample() /*! [Configure zlib extension] */ conn.close(null); + /*! [Configure zlib extension with compression level] */ + conn = wiredtiger.open(home, + "create," + + "extensions=[/usr/local/lib/" + + "libwiredtiger_zlib.so=[config=[compression_level=3]]]"); + /*! [Configure zlib extension with compression level] */ + conn.close(null); + + /*! [Configure zstd extension] */ + conn = wiredtiger.open(home, + "create," + + "extensions=[/usr/local/lib/libwiredtiger_zstd.so]"); + /*! [Configure zstd extension] */ + conn.close(null); + + /*! [Configure zstd extension with compression level] */ + conn = wiredtiger.open(home, + "create," + + "extensions=[/usr/local/lib/" + + "libwiredtiger_zstd.so=[config=[compression_level=9]]]"); + /*! [Configure zstd extension with compression level] */ + conn.close(null); + /* * This example code gets run, and direct I/O might not be available, * causing the open to fail. The documentation requires code snippets, diff --git a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c index 35159d0fa76..885701e564b 100644 --- a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c +++ b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c @@ -31,10 +31,20 @@ #include <stdlib.h> #include <string.h> +/* + * We need to include the configuration file to detect whether this extension + * is being built into the WiredTiger library; application-loaded compression + * functions won't need it. + */ #include <wiredtiger_config.h> + #include <wiredtiger.h> #include <wiredtiger_ext.h> +#ifdef _MSC_VER +#define inline __inline +#endif + /* Local compressor structure. 
*/ typedef struct { WT_COMPRESSOR compressor; /* Must come first */ @@ -171,8 +181,6 @@ lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, int decoded; uint8_t *dst_tmp; - (void)src_len; /* Unused parameters */ - wt_api = ((LZ4_COMPRESSOR *)compressor)->wt_api; /* @@ -183,6 +191,13 @@ lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, #ifdef WORDS_BIGENDIAN lz4_prefix_swap(&prefix); #endif + if (prefix.compressed_len + sizeof(LZ4_PREFIX) > src_len) { + (void)wt_api->err_printf(wt_api, + session, + "WT_COMPRESSOR.decompress: stored size exceeds source " + "size"); + return (WT_ERROR); + } /* * Decompress, starting after the prefix bytes. Use safe decompression: @@ -267,18 +282,24 @@ lz4_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, size_t *result_lenp, uint32_t *result_slotsp) { LZ4_PREFIX prefix; - int lz4_len; uint32_t slot; - int sourceSize, targetDestSize; + int lz4_len, sourceSize, targetDestSize; (void)compressor; /* Unused parameters */ (void)session; (void)split_pct; (void)final; - sourceSize = (int)offsets[slots]; /* Type conversion */ - targetDestSize = - (int)((dst_len < page_max ? dst_len : page_max) - extra); + /* + * Set the source and target sizes. The target size is complicated: we + * don't want to exceed the smaller of the maximum page size or the + * destination buffer length, and in both cases we have to take into + * account the space for our overhead and the extra bytes required by + * our caller. + */ + sourceSize = (int)offsets[slots]; + targetDestSize = (int)(page_max < dst_len ? page_max : dst_len); + targetDestSize -= (int)(sizeof(LZ4_PREFIX) + extra); /* Compress, starting after the prefix bytes. */ lz4_len = LZ4_compress_destSize((const char *)src, @@ -352,7 +373,7 @@ lz4_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) * Add a LZ4 compressor. 
*/ static int -lz_add_compressor(WT_CONNECTION *connection, int raw, const char *name) +lz_add_compressor(WT_CONNECTION *connection, bool raw, const char *name) { LZ4_COMPRESSOR *lz4_compressor; @@ -391,9 +412,9 @@ lz4_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) (void)config; /* Unused parameters */ - if ((ret = lz_add_compressor(connection, 1, "lz4")) != 0) + if ((ret = lz_add_compressor(connection, true, "lz4")) != 0) return (ret); - if ((ret = lz_add_compressor(connection, 0, "lz4-noraw")) != 0) + if ((ret = lz_add_compressor(connection, false, "lz4-noraw")) != 0) return (ret); return (0); } diff --git a/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c b/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c index 981e334a2de..32f1ddcb9a0 100644 --- a/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c +++ b/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c @@ -31,10 +31,20 @@ #include <stdlib.h> #include <string.h> +/* + * We need to include the configuration file to detect whether this extension + * is being built into the WiredTiger library; application-loaded compression + * functions won't need it. + */ #include <wiredtiger_config.h> + #include <wiredtiger.h> #include <wiredtiger_ext.h> +#ifdef _MSC_VER +#define inline __inline +#endif + /* Local compressor structure. */ typedef struct { WT_COMPRESSOR compressor; /* Must come first */ @@ -42,6 +52,12 @@ typedef struct { WT_EXTENSION_API *wt_api; /* Extension API */ } SNAPPY_COMPRESSOR; +/* + * Snappy decompression requires an exact compressed byte count. WiredTiger + * doesn't track that value, store it in the destination buffer. + */ +#define SNAPPY_PREFIX sizeof(uint64_t) + #ifdef WORDS_BIGENDIAN /* * snappy_bswap64 -- @@ -64,11 +80,11 @@ snappy_bswap64(uint64_t v) #endif /* - * wt_snappy_error -- + * snappy_error -- * Output an error message, and return a standard error code. 
*/ static int -wt_snappy_error(WT_COMPRESSOR *compressor, +snappy_error(WT_COMPRESSOR *compressor, WT_SESSION *session, const char *call, snappy_status snret) { WT_EXTENSION_API *wt_api; @@ -94,68 +110,69 @@ wt_snappy_error(WT_COMPRESSOR *compressor, } /* - * wt_snappy_compress -- + * snappy_compression -- * WiredTiger snappy compression. */ static int -wt_snappy_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, +snappy_compression(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size_t src_len, uint8_t *dst, size_t dst_len, size_t *result_lenp, int *compression_failed) { snappy_status snret; size_t snaplen; + uint64_t snaplen_u64; char *snapbuf; /* - * dst_len was computed in wt_snappy_pre_size, so we know it's big - * enough. Skip past the space we'll use to store the final count - * of compressed bytes. + * dst_len was computed in snappy_pre_size, so we know it's big enough. + * Skip past the space we'll use to store the final count of compressed + * bytes. */ - snaplen = dst_len - sizeof(size_t); - snapbuf = (char *)dst + sizeof(size_t); + snaplen = dst_len - SNAPPY_PREFIX; + snapbuf = (char *)dst + SNAPPY_PREFIX; /* snaplen is an input and an output arg. */ snret = snappy_compress((char *)src, src_len, snapbuf, &snaplen); - if (snret == SNAPPY_OK) { - if (snaplen + sizeof(size_t) < src_len) { - *result_lenp = snaplen + sizeof(size_t); - *compression_failed = 0; - - /* - * On decompression, snappy requires an exact compressed - * byte count (the current value of snaplen). WiredTiger - * does not preserve that value, so save snaplen at the - * beginning of the destination buffer. - * - * Store the value in little-endian format. - */ + if (snret == SNAPPY_OK && snaplen + SNAPPY_PREFIX < src_len) { + *result_lenp = snaplen + SNAPPY_PREFIX; + *compression_failed = 0; + + /* + * On decompression, snappy requires an exact compressed byte + * count (the current value of snaplen). 
WiredTiger does not + * preserve that value, so save snaplen at the beginning of + * the destination buffer. + * + * Store the value in little-endian format. + */ + snaplen_u64 = snaplen; #ifdef WORDS_BIGENDIAN - snaplen = snappy_bswap64(snaplen); + snaplen_u64 = snappy_bswap64(snaplen_u64); #endif - *(size_t *)dst = snaplen; - } else - /* The compressor failed to produce a smaller result. */ - *compression_failed = 1; + *(uint64_t *)dst = snaplen_u64; return (0); } - return (wt_snappy_error(compressor, session, "snappy_compress", snret)); + + *compression_failed = 1; + return (snret == SNAPPY_OK ? + 0 : snappy_error(compressor, session, "snappy_compress", snret)); } /* - * wt_snappy_decompress -- + * snappy_decompression -- * WiredTiger snappy decompression. */ static int -wt_snappy_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, +snappy_decompression(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size_t src_len, uint8_t *dst, size_t dst_len, size_t *result_lenp) { WT_EXTENSION_API *wt_api; snappy_status snret; - size_t snaplen; + uint64_t snaplen; wt_api = ((SNAPPY_COMPRESSOR *)compressor)->wt_api; @@ -163,36 +180,36 @@ wt_snappy_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, * Retrieve the saved length, handling little- to big-endian conversion * as necessary. */ - snaplen = *(size_t *)src; + snaplen = *(uint64_t *)src; #ifdef WORDS_BIGENDIAN snaplen = snappy_bswap64(snaplen); #endif - if (snaplen + sizeof(size_t) > src_len) { + if (snaplen + SNAPPY_PREFIX > src_len) { (void)wt_api->err_printf(wt_api, session, - "wt_snappy_decompress: stored size exceeds buffer size"); + "WT_COMPRESSOR.decompress: stored size exceeds source " + "size"); return (WT_ERROR); } /* dst_len is an input and an output arg. 
*/ snret = snappy_uncompress( - (char *)src + sizeof(size_t), snaplen, (char *)dst, &dst_len); + (char *)src + SNAPPY_PREFIX, + (size_t)snaplen, (char *)dst, &dst_len); if (snret == SNAPPY_OK) { *result_lenp = dst_len; return (0); } - - return ( - wt_snappy_error(compressor, session, "snappy_decompress", snret)); + return (snappy_error(compressor, session, "snappy_decompress", snret)); } /* - * wt_snappy_pre_size -- + * snappy_pre_size -- * WiredTiger snappy destination buffer sizing. */ static int -wt_snappy_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, +snappy_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size_t src_len, size_t *result_lenp) { @@ -203,19 +220,19 @@ wt_snappy_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, /* * Snappy requires the dest buffer be somewhat larger than the source. * Fortunately, this is fast to compute, and will give us a dest buffer - * in wt_snappy_compress that we can compress to directly. We add space + * in snappy_compress that we can compress to directly. We add space * in the dest buffer to store the accurate compressed size. */ - *result_lenp = snappy_max_compressed_length(src_len) + sizeof(size_t); + *result_lenp = snappy_max_compressed_length(src_len) + SNAPPY_PREFIX; return (0); } /* - * wt_snappy_terminate -- + * snappy_terminate -- * WiredTiger snappy compression termination. */ static int -wt_snappy_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) +snappy_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) { (void)session; /* Unused parameters */ @@ -227,9 +244,9 @@ int snappy_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); /* * snappy_extension_init -- - * WiredTiger snappy compression extension - called directly when - * Snappy support is built in, or via wiredtiger_extension_init when - * snappy support is included via extension loading. 
+ * WiredTiger snappy compression extension - called directly when snappy + * support is built in, or via wiredtiger_extension_init when snappy support + * is included via extension loading. */ int snappy_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) @@ -241,11 +258,11 @@ snappy_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) if ((snappy_compressor = calloc(1, sizeof(SNAPPY_COMPRESSOR))) == NULL) return (errno); - snappy_compressor->compressor.compress = wt_snappy_compress; + snappy_compressor->compressor.compress = snappy_compression; snappy_compressor->compressor.compress_raw = NULL; - snappy_compressor->compressor.decompress = wt_snappy_decompress; - snappy_compressor->compressor.pre_size = wt_snappy_pre_size; - snappy_compressor->compressor.terminate = wt_snappy_terminate; + snappy_compressor->compressor.decompress = snappy_decompression; + snappy_compressor->compressor.pre_size = snappy_pre_size; + snappy_compressor->compressor.terminate = snappy_terminate; snappy_compressor->wt_api = connection->get_extension_api(connection); diff --git a/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c b/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c index 484df0a6785..ef20503df0a 100644 --- a/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c +++ b/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c @@ -32,16 +32,18 @@ #include <stdlib.h> #include <string.h> -#include <wiredtiger.h> -#include <wiredtiger_ext.h> - /* * We need to include the configuration file to detect whether this extension - * is being built into the WiredTiger library. + * is being built into the WiredTiger library; application-loaded compression + * functions won't need it. */ -#include "wiredtiger_config.h" +#include <wiredtiger_config.h> + +#include <wiredtiger.h> +#include <wiredtiger_ext.h> + #ifdef _MSC_VER -#define inline __inline +#define inline __inline #endif /* Local compressor structure. 
*/ @@ -234,121 +236,163 @@ zlib_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, { ZLIB_COMPRESSOR *zlib_compressor; ZLIB_OPAQUE opaque; - z_stream *best_zs, last_zs, zs; - uint32_t curr_slot, last_slot; - int ret; + z_stream *best_zs, *last_zs, _last_zs, *zs, _zs; + uint32_t curr_slot, last_slot, zlib_reserved; + bool increase_reserve; + int ret, tret; - curr_slot = last_slot = 0; - (void)split_pct; - (void)dst_len; + (void)split_pct; /* Unused parameters */ (void)final; zlib_compressor = (ZLIB_COMPRESSOR *)compressor; - memset(&zs, 0, sizeof(zs)); - zs.zalloc = zalloc; - zs.zfree = zfree; - opaque.compressor = compressor; - opaque.session = session; - zs.opaque = &opaque; - - if ((ret = deflateInit(&zs, zlib_compressor->zlib_level)) != Z_OK) - return (zlib_error(compressor, session, "deflateInit", ret)); - - zs.next_in = src; - zs.next_out = dst; /* * Experimentally derived, reserve this many bytes for zlib to finish * up a buffer. If this isn't sufficient, we don't fail but we will be * inefficient. */ #define WT_ZLIB_RESERVED 24 - zs.avail_out = (uint32_t)(page_max - (extra + WT_ZLIB_RESERVED)); +#define WT_ZLIB_RESERVED_MAX 48 + zlib_reserved = WT_ZLIB_RESERVED; + + if (0) { +retry: /* If we reached our maximum reserve, quit. */ + if (zlib_reserved == WT_ZLIB_RESERVED_MAX) + return (0); + zlib_reserved = WT_ZLIB_RESERVED_MAX; + } + + best_zs = last_zs = NULL; + last_slot = 0; + increase_reserve = false; + ret = 0; - /* Save the stream state in case the chosen data doesn't fit. 
*/ - if ((ret = deflateCopy(&last_zs, &zs)) != Z_OK) - return (zlib_error(compressor, session, "deflateCopy", ret)); + zs = &_zs; + memset(zs, 0, sizeof(*zs)); + zs->zalloc = zalloc; + zs->zfree = zfree; + opaque.compressor = compressor; + opaque.session = session; + zs->opaque = &opaque; + + if ((ret = deflateInit(zs, zlib_compressor->zlib_level)) != Z_OK) + return (zlib_error(compressor, session, "deflateInit", ret)); + + zs->next_in = src; + zs->next_out = dst; + + /* + * Set the target size. The target size is complicated: we don't want + * to exceed the smaller of the maximum page size or the destination + * buffer length, and in both cases we have to take into account the + * space required by zlib to finish up the buffer and the extra bytes + * required by our caller. + */ + zs->avail_out = (uint32_t)(page_max < dst_len ? page_max : dst_len); + zs->avail_out -= (uint32_t)(zlib_reserved + extra); /* * Strategy: take the available output size and compress that much * input. Continue until there is no input small enough or the * compression fails to fit. */ - for (best_zs = NULL;;) { + for (;;) { /* Find the next slot we will try to compress up to. */ - if ((curr_slot = zlib_find_slot( - zs.total_in + zs.avail_out, offsets, slots)) > last_slot) { - zs.avail_in = offsets[curr_slot] - offsets[last_slot]; - while (zs.avail_in > 0 && zs.avail_out > 0) - if ((ret = deflate(&zs, Z_SYNC_FLUSH)) != Z_OK) - return (zlib_error(compressor, - session, "deflate", ret)); + curr_slot = zlib_find_slot( + zs->total_in + zs->avail_out, offsets, slots); + if (curr_slot > last_slot) { + zs->avail_in = offsets[curr_slot] - offsets[last_slot]; + while (zs->avail_in > 0 && zs->avail_out > 0) + if ((ret = deflate(zs, Z_SYNC_FLUSH)) != Z_OK) { + ret = zlib_error(compressor, + session, "deflate", ret); + goto err; + } } /* * We didn't do a deflate, or it didn't work: use the last saved - * position. + * position (if any). 
*/ - if (curr_slot <= last_slot || zs.avail_in > 0) { - if ((ret = deflateEnd(&zs)) != Z_OK && - ret != Z_DATA_ERROR) - return (zlib_error( - compressor, session, "deflateEnd", ret)); - - best_zs = &last_zs; + if (curr_slot <= last_slot || zs->avail_in > 0) { + best_zs = last_zs; break; } - /* The last deflation succeeded, discard the saved one. */ - if ((ret = deflateEnd(&last_zs)) != Z_OK && ret != Z_DATA_ERROR) - return (zlib_error( - compressor, session, "deflateEnd", ret)); - /* * If there's more compression to do, save a snapshot and keep * going, otherwise, use the current compression. */ last_slot = curr_slot; - if (zs.avail_out > 0) { - if ((ret = deflateCopy(&last_zs, &zs)) != Z_OK) - return (zlib_error( - compressor, session, "deflateCopy", ret)); + if (zs->avail_out > 0) { + /* Discard any previously saved snapshot. */ + if (last_zs != NULL) { + ret = deflateEnd(last_zs); + last_zs = NULL; + if (ret != Z_OK && ret != Z_DATA_ERROR) { + ret = zlib_error(compressor, + session, "deflateEnd", ret); + goto err; + } + } + last_zs = &_last_zs; + if ((ret = deflateCopy(last_zs, zs)) != Z_OK) { + last_zs = NULL; + ret = zlib_error( + compressor, session, "deflateCopy", ret); + goto err; + } continue; } - best_zs = &zs; + best_zs = zs; break; } - best_zs->avail_out += WT_ZLIB_RESERVED; - ret = deflate(best_zs, Z_FINISH); + if (last_slot > 0 && best_zs != NULL) { + /* Add the reserved bytes and try to finish the compression. */ + best_zs->avail_out += zlib_reserved; + ret = deflate(best_zs, Z_FINISH); - /* - * If the end marker didn't fit, report that we got no work done, - * WiredTiger will compress the (possibly large) page image using - * ordinary compression instead. 
- */ - if (ret == Z_OK || ret == Z_BUF_ERROR) - last_slot = 0; - else if (ret != Z_STREAM_END) - return ( - zlib_error(compressor, session, "deflate end block", ret)); + /* + * If the end marker didn't fit with the default value, try + * again with a maximum value; if that doesn't work, report we + * got no work done, WiredTiger will compress the (possibly + * large) page image using ordinary compression instead. + */ + if (ret == Z_OK || ret == Z_BUF_ERROR) { + last_slot = 0; + increase_reserve = true; + } else if (ret != Z_STREAM_END) { + ret = zlib_error( + compressor, session, "deflate end block", ret); + goto err; + } + ret = 0; + } - if ((ret = deflateEnd(best_zs)) != Z_OK && ret != Z_DATA_ERROR) - return (zlib_error(compressor, session, "deflateEnd", ret)); +err: if (zs != NULL && + (tret = deflateEnd(zs)) != Z_OK && tret != Z_DATA_ERROR) + ret = zlib_error(compressor, session, "deflateEnd", tret); + if (last_zs != NULL && + (tret = deflateEnd(last_zs)) != Z_OK && tret != Z_DATA_ERROR) + ret = zlib_error(compressor, session, "deflateEnd", tret); - if (last_slot > 0) { + if (ret == 0 && last_slot > 0) { *result_slotsp = last_slot; *result_lenp = (size_t)best_zs->total_out; } else { - /* We didn't manage to compress anything: don't retry. */ + /* We didn't manage to compress anything. */ *result_slotsp = 0; *result_lenp = 1; + + if (increase_reserve) + goto retry; } #if 0 /* Decompress the result and confirm it matches the original source. */ - if (last_slot > 0) { + if (ret == 0 && last_slot > 0) { void *decomp; size_t result_len; @@ -363,19 +407,20 @@ zlib_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, "deflate compare with original source", Z_DATA_ERROR); zfree(&opaque, decomp); - if (ret != 0) - return (ret); } #endif #if 0 - fprintf(stderr, - "zlib_compress_raw (%s): page_max %" PRIuMAX ", slots %" PRIu32 - ", take %" PRIu32 ": %" PRIu32 " -> %" PRIuMAX "\n", - final ? 
"final" : "not final", (uintmax_t)page_max, - slots, last_slot, offsets[last_slot], (uintmax_t)*result_lenp); + if (ret == 0 && last_slot > 0) + fprintf(stderr, + "zlib_compress_raw (%s): page_max %" PRIuMAX ", slots %" + PRIu32 ", take %" PRIu32 ": %" PRIu32 " -> %" PRIuMAX "\n", + final ? "final" : "not final", (uintmax_t)page_max, + slots, last_slot, offsets[last_slot], + (uintmax_t)*result_lenp); #endif - return (0); + + return (ret); } /* @@ -396,7 +441,8 @@ zlib_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) * Add a zlib compressor. */ static int -zlib_add_compressor(WT_CONNECTION *connection, int raw, const char *name) +zlib_add_compressor( + WT_CONNECTION *connection, bool raw, const char *name, int zlib_level) { ZLIB_COMPRESSOR *zlib_compressor; @@ -415,17 +461,80 @@ zlib_add_compressor(WT_CONNECTION *connection, int raw, const char *name) zlib_compressor->compressor.terminate = zlib_terminate; zlib_compressor->wt_api = connection->get_extension_api(connection); - - /* - * Between 0-10: level: see zlib manual. - */ - zlib_compressor->zlib_level = Z_DEFAULT_COMPRESSION; + zlib_compressor->zlib_level = zlib_level; /* Load the compressor. */ return (connection->add_compressor( connection, name, (WT_COMPRESSOR *)zlib_compressor, NULL)); } +/* + * zlib_init_config -- + * Handle zlib configuration. + */ +static int +zlib_init_config( + WT_CONNECTION *connection, WT_CONFIG_ARG *config, int *zlib_levelp) +{ + WT_CONFIG_ITEM k, v; + WT_CONFIG_PARSER *config_parser; + WT_EXTENSION_API *wtext; + int ret, zlib_level; + + /* If configured as a built-in, there's no configuration argument. */ + if (config == NULL) + return (0); + + /* + * Zlib compression engine allows applications to specify a compression + * level; review the configuration. 
+ */ + wtext = connection->get_extension_api(connection); + if ((ret = wtext->config_get(wtext, NULL, config, "config", &v)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_get: zlib configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + if ((ret = wtext->config_parser_open( + wtext, NULL, v.str, v.len, &config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_parser_open: zlib configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + while ((ret = config_parser->next(config_parser, &k, &v)) == 0) + if (strlen("compression_level") == k.len && + strncmp("compression_level", k.str, k.len) == 0) { + /* + * Between 0-9: level: see zlib manual. + */ + zlib_level = (int)v.val; + if (zlib_level < 0 || zlib_level > 9) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: zlib configure: " + "unsupported compression level %d", + zlib_level); + return (EINVAL); + } + *zlib_levelp = zlib_level; + continue; + } + if (ret != WT_NOTFOUND) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: zlib configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + if ((ret = config_parser->close(config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.close: zlib configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + return (0); +} + int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); /* @@ -437,13 +546,17 @@ int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); int zlib_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) { - int ret; + int ret, zlib_level; - (void)config; /* Unused parameters */ + zlib_level = Z_DEFAULT_COMPRESSION; /* Default */ + if ((ret = zlib_init_config(connection, config, &zlib_level)) != 0) + return (ret); - if ((ret = zlib_add_compressor(connection, 1, "zlib")) != 0) + if ((ret = zlib_add_compressor( + connection, true, "zlib", zlib_level)) != 0) return 
(ret); - if ((ret = zlib_add_compressor(connection, 0, "zlib-noraw")) != 0) + if ((ret = zlib_add_compressor( + connection, false, "zlib-noraw", zlib_level)) != 0) return (ret); return (0); } diff --git a/src/third_party/wiredtiger/ext/compressors/zstd/Makefile.am b/src/third_party/wiredtiger/ext/compressors/zstd/Makefile.am new file mode 100644 index 00000000000..9f0997011e9 --- /dev/null +++ b/src/third_party/wiredtiger/ext/compressors/zstd/Makefile.am @@ -0,0 +1,11 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include + +if HAVE_BUILTIN_EXTENSION_ZSTD +noinst_LTLIBRARIES = libwiredtiger_zstd.la +else +lib_LTLIBRARIES = libwiredtiger_zstd.la +libwiredtiger_zstd_la_LDFLAGS = -avoid-version -module +endif + +libwiredtiger_zstd_la_SOURCES = zstd_compress.c +libwiredtiger_zstd_la_LIBADD = -lzstd diff --git a/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c b/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c new file mode 100644 index 00000000000..3d0447248b6 --- /dev/null +++ b/src/third_party/wiredtiger/ext/compressors/zstd/zstd_compress.c @@ -0,0 +1,358 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <zstd.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +/* + * We need to include the configuration file to detect whether this extension + * is being built into the WiredTiger library; application-loaded compression + * functions won't need it. + */ +#include <wiredtiger_config.h> + +#include <wiredtiger.h> +#include <wiredtiger_ext.h> + +#ifdef _MSC_VER +#define inline __inline +#endif + +/* Local compressor structure. */ +typedef struct { + WT_COMPRESSOR compressor; /* Must come first */ + + WT_EXTENSION_API *wt_api; /* Extension API */ + + int compression_level; /* compression level */ +} ZSTD_COMPRESSOR; + +/* + * Zstd decompression requires an exact compressed byte count. WiredTiger + * doesn't track that value, store it in the destination buffer. + */ +#define ZSTD_PREFIX sizeof(uint64_t) + +#ifdef WORDS_BIGENDIAN +/* + * zstd_bswap64 -- + * 64-bit unsigned little-endian to/from big-endian value. + */ +static inline uint64_t +zstd_bswap64(uint64_t v) +{ + return ( + ((v << 56) & 0xff00000000000000UL) | + ((v << 40) & 0x00ff000000000000UL) | + ((v << 24) & 0x0000ff0000000000UL) | + ((v << 8) & 0x000000ff00000000UL) | + ((v >> 8) & 0x00000000ff000000UL) | + ((v >> 24) & 0x0000000000ff0000UL) | + ((v >> 40) & 0x000000000000ff00UL) | + ((v >> 56) & 0x00000000000000ffUL) + ); +} +#endif + +/* + * zstd_error -- + * Output an error message, and return a standard error code. 
+ */ +static int +zstd_error(WT_COMPRESSOR *compressor, + WT_SESSION *session, const char *call, size_t error) +{ + WT_EXTENSION_API *wt_api; + + wt_api = ((ZSTD_COMPRESSOR *)compressor)->wt_api; + + (void)wt_api->err_printf(wt_api, session, + "zstd error: %s: %s", call, ZSTD_getErrorName(error)); + return (WT_ERROR); +} + +/* + * zstd_compress -- + * WiredTiger Zstd compression. + */ +static int +zstd_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, + uint8_t *src, size_t src_len, + uint8_t *dst, size_t dst_len, + size_t *result_lenp, int *compression_failed) +{ + ZSTD_COMPRESSOR *zcompressor; + size_t zstd_ret; + uint64_t zstd_len; + + zcompressor = (ZSTD_COMPRESSOR *)compressor; + + /* Compress, starting past the prefix bytes. */ + zstd_ret = ZSTD_compress( + dst + ZSTD_PREFIX, dst_len - ZSTD_PREFIX, + src, src_len, zcompressor->compression_level); + + /* + * If compression succeeded and the compressed length is smaller than + * the original size, return success. + */ + if (!ZSTD_isError(zstd_ret) && zstd_ret + ZSTD_PREFIX < src_len) { + *result_lenp = zstd_ret + ZSTD_PREFIX; + *compression_failed = 0; + + /* + * On decompression, Zstd requires an exact compressed byte + * count (the current value of zstd_ret). WiredTiger does not + * preserve that value, so save zstd_ret at the beginning of + * the destination buffer. + * + * Store the value in little-endian format. + */ + zstd_len = zstd_ret; +#ifdef WORDS_BIGENDIAN + zstd_len = zstd_bswap64(zstd_len); +#endif + *(uint64_t *)dst = zstd_len; + return (0); + } + + *compression_failed = 1; + return (ZSTD_isError(zstd_ret) ? + zstd_error(compressor, session, "ZSTD_compress", zstd_ret) : 0); +} + +/* + * zstd_decompress -- + * WiredTiger Zstd decompression. 
+ */ +static int +zstd_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, + uint8_t *src, size_t src_len, + uint8_t *dst, size_t dst_len, + size_t *result_lenp) +{ + WT_EXTENSION_API *wt_api; + size_t zstd_ret; + uint64_t zstd_len; + + wt_api = ((ZSTD_COMPRESSOR *)compressor)->wt_api; + + /* + * Retrieve the saved length, handling little- to big-endian conversion + * as necessary. + */ + zstd_len = *(uint64_t *)src; +#ifdef WORDS_BIGENDIAN + zstd_len = zstd_bswap64(zstd_len); +#endif + if (zstd_len + ZSTD_PREFIX > src_len) { + (void)wt_api->err_printf(wt_api, + session, + "WT_COMPRESSOR.decompress: stored size exceeds source " + "size"); + return (WT_ERROR); + } + + zstd_ret = + ZSTD_decompress(dst, dst_len, src + ZSTD_PREFIX, (size_t)zstd_len); + + if (!ZSTD_isError(zstd_ret)) { + *result_lenp = zstd_ret; + return (0); + } + return (zstd_error(compressor, session, "ZSTD_decompress", zstd_ret)); +} + +/* + * zstd_pre_size -- + * WiredTiger Zstd destination buffer sizing for compression. + */ +static int +zstd_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, + uint8_t *src, size_t src_len, size_t *result_lenp) +{ + (void)compressor; /* Unused parameters */ + (void)session; + (void)src; + + /* + * Zstd compression runs faster if the destination buffer is sized at + * the upper-bound of the buffer size needed by the compression. Use + * the library calculation of that overhead (plus our overhead). + */ + *result_lenp = ZSTD_compressBound(src_len) + ZSTD_PREFIX; + return (0); +} + +/* + * zstd_terminate -- + * WiredTiger Zstd compression termination. + */ +static int +zstd_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) +{ + (void)session; /* Unused parameters */ + + free(compressor); + return (0); +} + +/* + * zstd_init_config -- + * Handle zstd configuration. 
+ */ +static int +zstd_init_config( + WT_CONNECTION *connection, WT_CONFIG_ARG *config, int *compression_levelp) +{ + WT_CONFIG_ITEM k, v; + WT_CONFIG_PARSER *config_parser; + WT_EXTENSION_API *wtext; + int ret; + + /* If configured as a built-in, there's no configuration argument. */ + if (config == NULL) + return (0); + + /* + * Zstd compression engine allows applications to specify a compression + * level; review the configuration. + */ + wtext = connection->get_extension_api(connection); + if ((ret = wtext->config_get(wtext, NULL, config, "config", &v)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_get: zstd configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + if ((ret = wtext->config_parser_open( + wtext, NULL, v.str, v.len, &config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_parser_open: zstd configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + while ((ret = config_parser->next(config_parser, &k, &v)) == 0) + if (strlen("compression_level") == k.len && + strncmp("compression_level", k.str, k.len) == 0) { + *compression_levelp = (int)v.val; + continue; + } + if (ret != WT_NOTFOUND) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: zstd configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + if ((ret = config_parser->close(config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.close: zstd configure: %s", + wtext->strerror(wtext, NULL, ret)); + return (ret); + } + return (0); +} + +int zstd_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); + +/* + * zstd_extension_init -- + * WiredTiger Zstd compression extension - called directly when Zstd + * support is built in, or via wiredtiger_extension_init when Zstd support + * is included via extension loading. 
+ */ +int +zstd_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) +{ + ZSTD_COMPRESSOR *zstd_compressor; + int compression_level, ret; + + /* + * Zstd's sweet-spot is better compression than zlib at significantly + * faster compression/decompression speeds. LZ4 and snappy are faster + * than zstd, but have worse compression ratios. Applications wanting + * faster compression/decompression with worse compression will select + * LZ4 or snappy, so we configure zstd for better compression. + * + * From the zstd github site, default measurements of the compression + * engines we support, listing compression ratios with compression and + * decompression speeds: + * + * Name Ratio C.speed D.speed + * MB/s MB/s + * zstd 2.877 330 940 + * zlib 2.730 95 360 + * LZ4 2.101 620 3100 + * snappy 2.091 480 1600 + * + * Set the zstd compression level to 3: according to the zstd web site, + * that reduces zstd's compression speed to around 200 MB/s, increasing + * the compression ratio to 3.100 (close to zlib's best compression + * ratio). In other words, position zstd as a zlib replacement, having + * similar compression at much higher compression/decompression speeds. 
+ */ + compression_level = 3; + if ((ret = + zstd_init_config(connection, config, &compression_level)) != 0) + return (ret); + + if ((zstd_compressor = calloc(1, sizeof(ZSTD_COMPRESSOR))) == NULL) + return (errno); + + zstd_compressor->compressor.compress = zstd_compress; + zstd_compressor->compressor.compress_raw = NULL; + zstd_compressor->compressor.decompress = zstd_decompress; + zstd_compressor->compressor.pre_size = zstd_pre_size; + zstd_compressor->compressor.terminate = zstd_terminate; + + zstd_compressor->wt_api = connection->get_extension_api(connection); + + zstd_compressor->compression_level = compression_level; + + /* Load the compressor */ + return (connection->add_compressor( + connection, "zstd", (WT_COMPRESSOR *)zstd_compressor, NULL)); +} + +/* + * We have to remove this symbol when building as a builtin extension otherwise + * it will conflict with other builtin libraries. + */ +#ifndef HAVE_BUILTIN_EXTENSION_ZSTD +/* + * wiredtiger_extension_init -- + * WiredTiger Zstd compression extension. + */ +int +wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) +{ + return (zstd_extension_init(connection, config)); +} +#endif diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c index b7ac953cdb1..48522768dc9 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt.c @@ -615,8 +615,6 @@ live_update: WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) { /* - * Set the checkpoint size for the live system. - * * !!! * Our caller wants the final checkpoint size. Setting * the size here violates layering, but the alternative @@ -624,7 +622,31 @@ live_update: * cookie into its components, and that's a fair amount * of work. */ - ckpt->ckpt_size = ci->ckpt_size = ckpt_size; + ckpt->ckpt_size = ckpt_size; + + /* + * Set the rolling checkpoint size for the live system. 
+ * The current size includes the current checkpoint's + * root page size (root pages are on the checkpoint's + * block allocation list as root pages are allocated + * with the usual block allocation functions). That's + * correct, but we don't want to include it in the size + * for the next checkpoint. + */ + ckpt_size -= ci->root_size; + + /* + * Additionally, we had a bug for awhile where the live + * checkpoint size grew without bound. We can't sanity + * check the value, that would require walking the tree + * as part of the checkpoint. Bound any bug at the size + * of the file. + * It isn't practical to assert that the value is within + * bounds since databases created with older versions + * of WiredTiger (2.8.0) would likely see an error. + */ + ci->ckpt_size = + WT_MIN(ckpt_size, (uint64_t)block->size); WT_ERR(__ckpt_update(session, block, ckpt, ci, true)); } diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c index 42c3a849a88..a8645f79dbe 100644 --- a/src/third_party/wiredtiger/src/btree/bt_io.c +++ b/src/third_party/wiredtiger/src/btree/bt_io.c @@ -171,6 +171,7 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, bool checkpoint, bool checkpoint_io, bool compressed) { + struct timespec start, stop; WT_BM *bm; WT_BTREE *btree; WT_DECL_ITEM(ctmp); @@ -356,6 +357,8 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, data_checksum = !compressed; break; } + if (!F_ISSET(session, WT_SESSION_INTERNAL)) + __wt_epoch(session, &start); /* Call the block manager to write the block. */ WT_ERR(checkpoint ? 
@@ -363,6 +366,14 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, bm->write( bm, session, ip, addr, addr_sizep, data_checksum, checkpoint_io)); + /* Update some statistics now that the write is done */ + if (!F_ISSET(session, WT_SESSION_INTERNAL)) { + __wt_epoch(session, &stop); + WT_STAT_CONN_INCR(session, cache_write_app_count); + WT_STAT_CONN_INCRV(session, cache_write_app_time, + WT_TIMEDIFF_US(stop, start)); + } + WT_STAT_CONN_INCR(session, cache_write); WT_STAT_DATA_INCR(session, cache_write); S2C(session)->cache->bytes_written += dsk->mem_size; diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index c54eaa69c43..90188498535 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -327,22 +327,28 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) if (__wt_hazard_count(session, page) > 1) return (false); + /* If we can do an in-memory split, do it. */ + if (__wt_leaf_page_can_split(session, page)) + return (true); + if (page->memory_footprint < btree->maxmempage) + return (false); + + /* Bump the oldest ID, we're about to do some visibility checks. */ + WT_IGNORE_RET(__wt_txn_update_oldest(session, 0)); + /* - * If we have already tried and the transaction state has not moved on, - * eviction is highly likely to fail. + * Allow some leeway if the transaction ID isn't moving forward since + * it is unlikely eviction will be able to evict the page. Don't keep + * skipping the page indefinitely or large records can lead to + * extremely large memory footprints. */ - if (page->modify->last_eviction_id == __wt_txn_oldest_id(session)) + if (page->modify->update_restored && + page->modify->last_eviction_id == __wt_txn_oldest_id(session)) return (false); - if (page->memory_footprint < btree->maxmempage) - return (__wt_leaf_page_can_split(session, page)); - /* Trigger eviction on the next page release. 
*/ __wt_page_evict_soon(session, ref); - /* Bump the oldest ID, we're about to do some visibility checks. */ - WT_IGNORE_RET(__wt_txn_update_oldest(session, 0)); - /* If eviction cannot succeed, don't try. */ return (__wt_page_can_evict(session, ref, NULL)); } @@ -354,6 +360,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) static int __page_read(WT_SESSION_IMPL *session, WT_REF *ref) { + struct timespec start, stop; const WT_PAGE_HEADER *dsk; WT_BTREE *btree; WT_DECL_RET; @@ -401,7 +408,15 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) * There's an address, read or map the backing disk page and build an * in-memory version of the page. */ + if (!F_ISSET(session, WT_SESSION_INTERNAL)) + __wt_epoch(session, &start); WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); + if (!F_ISSET(session, WT_SESSION_INTERNAL)) { + __wt_epoch(session, &stop); + WT_STAT_CONN_INCR(session, cache_read_app_count); + WT_STAT_CONN_INCRV(session, cache_read_app_time, + WT_TIMEDIFF_US(stop, start)); + } WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize, WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index ea667460966..017c820ea29 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1582,6 +1582,13 @@ __split_multi_inmem( */ page->modify->first_dirty_txn = WT_TXN_FIRST; + /* + * If the new page is modified, save the oldest ID from reconciliation + * to avoid repeatedly attempting eviction on the same page. + */ + page->modify->last_eviction_id = orig->modify->last_eviction_id; + page->modify->update_restored = 1; + err: /* Free any resources that may have been cached in the cursor. 
*/ WT_TRET(__wt_btcur_close(&cbt, true)); @@ -2245,14 +2252,6 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) WT_ERR(__split_multi_inmem(session, page, multi, new)); /* - * If the new page is modified, save the oldest ID from reconciliation - * to avoid repeatedly attempting eviction on the same page. - */ - if (new->page->modify != NULL) - new->page->modify->last_eviction_id = - page->modify->last_eviction_id; - - /* * The rewrite succeeded, we can no longer fail. * * Finalize the move, discarding moved update lists from the original diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index d3ddf33446e..06428b87f6e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -8,6 +8,7 @@ #include "wt_internal.h" +static int __stat_tree_walk(WT_SESSION_IMPL *); static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **); static void __stat_page_col_var(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **); static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **); @@ -23,9 +24,7 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) { WT_BM *bm; WT_BTREE *btree; - WT_DECL_RET; WT_DSRC_STATS **stats; - WT_REF *next_walk; btree = S2BT(session); bm = btree->bm; @@ -44,9 +43,29 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, cache_bytes_inuse, __wt_btree_bytes_inuse(session)); - /* Everything else is really, really expensive. */ - if (!F_ISSET(cst, WT_CONN_STAT_ALL)) - return (0); + if (F_ISSET(cst, WT_STAT_TYPE_CACHE_WALK)) + __wt_curstat_cache_walk(session); + + if (F_ISSET(cst, WT_STAT_TYPE_TREE_WALK)) + WT_RET(__stat_tree_walk(session)); + + return (0); +} + +/* + * __stat_tree_walk -- + * Gather btree statistics that require traversing the tree. 
+ */ +static int +__stat_tree_walk(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_DSRC_STATS **stats; + WT_REF *next_walk; + + btree = S2BT(session); + stats = btree->dhandle->stats; /* * Clear the statistics we're about to count. diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 7b583bd9c1e..6d4ad9d0d0f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -9,6 +9,59 @@ #include "wt_internal.h" /* + * __sync_checkpoint_can_skip -- + * There are limited conditions under which we can skip writing a dirty + * page during checkpoint. + */ +static inline bool +__sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_PAGE_MODIFY *mod; + WT_MULTI *multi; + WT_TXN *txn; + u_int i; + + mod = page->modify; + txn = &session->txn; + + /* + * We can skip some dirty pages during a checkpoint. The requirements: + * + * 1. they must be leaf pages, + * 2. there is a snapshot transaction active (which is the case in + * ordinary application checkpoints but not all internal cases), + * 3. the first dirty update on the page is sufficiently recent the + * checkpoint transaction would skip them, + * 4. there's already an address for every disk block involved. + */ + if (WT_PAGE_IS_INTERNAL(page)) + return (false); + if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) + return (false); + if (!WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) + return (false); + + /* + * The problematic case is when a page was evicted but when there were + * unresolved updates and not every block associated with the page has + * a disk address. We can't skip such pages because we need a checkpoint + * write with valid addresses. + * + * The page's modification information can change underfoot if the page + * is being reconciled, so we'd normally serialize with reconciliation + * before reviewing page-modification information. 
However, checkpoint + * is the only valid writer of dirty leaf pages at this point, we skip + * the lock. + */ + if (mod->rec_result == WT_PM_REC_MULTIBLOCK) + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) + if (multi->addr.addr == NULL) + return (false); + return (true); +} + +/* * __sync_file -- * Flush pages for a specific file. */ @@ -20,7 +73,6 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; @@ -161,29 +213,15 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * reference and checking modified. */ page = walk->page; - mod = page->modify; /* - * Write dirty pages, unless we can be sure they only - * became dirty after the checkpoint started. - * - * We can skip dirty pages if: - * (1) they are leaf pages; - * (2) there is a snapshot transaction active (which - * is the case in ordinary application checkpoints - * but not all internal cases); and - * (3) the first dirty update on the page is - * sufficiently recent that the checkpoint - * transaction would skip them. - * - * Mark the tree dirty: the checkpoint marked it clean - * and we can't skip future checkpoints until this page - * is written. + * Write dirty pages, if we can't skip them. If we skip + * a page, mark the tree dirty. The checkpoint marked it + * clean and we can't skip future checkpoints until this + * page is written. 
*/ - if (!WT_PAGE_IS_INTERNAL(page) && - F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && - WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { - __wt_page_modify_set(session, page); + if (__sync_checkpoint_can_skip(session, page)) { + __wt_tree_modify_set(session); continue; } diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index 4c338bc6ad9..41f50957809 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -49,7 +49,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) * don't have to worry about users seeing inconsistent data source * information. */ - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR)) { + if (FLD_ISSET(conn->stat_flags, WT_STAT_CLEAR)) { WT_STAT_SET(session, dstats, cursor_insert, 0); WT_STAT_SET(session, dstats, cursor_remove, 0); } diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 7bce4bc9cef..018cc7a8ac4 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -138,7 +138,8 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"none\"," + "\"clear\",\"tree_walk\"]", NULL, 0 }, { "statistics_log", "category", NULL, NULL, @@ -334,7 +335,8 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_open_cursor[] = { { "readonly", "boolean", NULL, NULL, NULL, 0 }, { "skip_sort_check", "boolean", NULL, NULL, NULL, 0 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"clear\",\"size\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"clear\"," + "\"size\",\"tree_walk\"]", NULL, 0 }, { "target", "list", NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } @@ -709,7 
+711,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"none\"," + "\"clear\",\"tree_walk\"]", NULL, 0 }, { "statistics_log", "category", NULL, NULL, @@ -793,7 +796,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"none\"," + "\"clear\",\"tree_walk\"]", NULL, 0 }, { "statistics_log", "category", NULL, NULL, @@ -874,7 +878,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"none\"," + "\"clear\",\"tree_walk\"]", NULL, 0 }, { "statistics_log", "category", NULL, NULL, @@ -953,7 +958,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5 }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]", + NULL, "choices=[\"all\",\"cache_walk\",\"fast\",\"none\"," + "\"clear\",\"tree_walk\"]", NULL, 0 }, { "statistics_log", "category", NULL, NULL, diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 0951fd4e58c..04c29e957a3 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -789,14 +789,17 @@ __conn_get_extension_api(WT_CONNECTION *wt_conn) return (&conn->extension_api); } +#ifdef HAVE_BUILTIN_EXTENSION_LZ4 + extern int lz4_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); +#endif #ifdef 
HAVE_BUILTIN_EXTENSION_SNAPPY extern int snappy_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); #endif #ifdef HAVE_BUILTIN_EXTENSION_ZLIB extern int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); #endif -#ifdef HAVE_BUILTIN_EXTENSION_LZ4 - extern int lz4_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); +#ifdef HAVE_BUILTIN_EXTENSION_ZSTD + extern int zstd_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); #endif /* @@ -808,14 +811,17 @@ __conn_load_default_extensions(WT_CONNECTION_IMPL *conn) { WT_UNUSED(conn); +#ifdef HAVE_BUILTIN_EXTENSION_LZ4 + WT_RET(lz4_extension_init(&conn->iface, NULL)); +#endif #ifdef HAVE_BUILTIN_EXTENSION_SNAPPY WT_RET(snappy_extension_init(&conn->iface, NULL)); #endif #ifdef HAVE_BUILTIN_EXTENSION_ZLIB WT_RET(zlib_extension_init(&conn->iface, NULL)); #endif -#ifdef HAVE_BUILTIN_EXTENSION_LZ4 - WT_RET(lz4_extension_init(&conn->iface, NULL)); +#ifdef HAVE_BUILTIN_EXTENSION_ZSTD + WT_RET(zstd_extension_init(&conn->iface, NULL)); #endif return (0); } @@ -1668,32 +1674,60 @@ __conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[]) if ((ret = __wt_config_subgets( session, &cval, "fast", &sval)) == 0 && sval.val != 0) { - LF_SET(WT_CONN_STAT_FAST); + LF_SET(WT_STAT_TYPE_FAST); ++set; } WT_RET_NOTFOUND_OK(ret); if ((ret = __wt_config_subgets( session, &cval, "all", &sval)) == 0 && sval.val != 0) { - LF_SET(WT_CONN_STAT_ALL | WT_CONN_STAT_FAST); + LF_SET( + WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | + WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); ++set; } WT_RET_NOTFOUND_OK(ret); + if (set > 1) + WT_RET_MSG(session, EINVAL, + "Only one of all, fast, none configuration values should " + "be specified"); + + /* + * Now that we've parsed general statistics categories, process + * sub-categories. + */ + if ((ret = __wt_config_subgets( + session, &cval, "cache_walk", &sval)) == 0 && sval.val != 0) + /* + * Configuring cache walk statistics implies fast statistics. 
+ * Keep that knowledge internal for now - it may change in the + * future. + */ + LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_CACHE_WALK); + WT_RET_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets( + session, &cval, "tree_walk", &sval)) == 0 && sval.val != 0) + /* + * Configuring tree walk statistics implies fast statistics. + * Keep that knowledge internal for now - it may change in the + * future. + */ + LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); + WT_RET_NOTFOUND_OK(ret); + if ((ret = __wt_config_subgets( session, &cval, "clear", &sval)) == 0 && sval.val != 0) { - if (!LF_ISSET(WT_CONN_STAT_FAST | WT_CONN_STAT_ALL)) + if (!LF_ISSET(WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | + WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK)) WT_RET_MSG(session, EINVAL, - "the value \"clear\" can be specified only if " - "either \"all\" or \"fast\" is specified"); - LF_SET(WT_CONN_STAT_CLEAR); + "the value \"clear\" can only be specified if " + "statistics are enabled"); + LF_SET(WT_STAT_CLEAR); } WT_RET_NOTFOUND_OK(ret); - if (set > 1) - WT_RET_MSG(session, EINVAL, - "only one statistics configuration value may be specified"); - /* Configuring statistics clears any existing values. */ conn->stat_flags = flags; @@ -1943,6 +1977,42 @@ __conn_chk_file_system(WT_SESSION_IMPL *session, bool readonly) } /* + * wiredtiger_dummy_session_init -- + * Initialize the connection's dummy session. + */ +static void +wiredtiger_dummy_session_init( + WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler) +{ + WT_SESSION_IMPL *session; + + session = &conn->dummy_session; + + /* + * We use a fake session until we can allocate and initialize the real + * ones. Initialize the necessary fields (unfortunately, the fields we + * initialize have been selected by core dumps, we need to do better). + */ + session->iface.connection = &conn->iface; + session->name = "wiredtiger_open"; + + /* Standard I/O and error handling first. 
*/ + __wt_os_stdio(session); + __wt_event_handler_set(session, event_handler); + + /* Statistics */ + session->stat_bucket = 0; + + /* + * Set the default session's strerror method. If one of the extensions + * being loaded reports an error via the WT_EXTENSION_API strerror + * method, but doesn't supply that method a WT_SESSION handle, we'll + * use the WT_CONNECTION_IMPL's default session and its strerror method. + */ + session->iface.strerror = __wt_session_strerror; +} + +/* * wiredtiger_open -- * Main library entry point: open a new connection to a WiredTiger * database. @@ -2013,21 +2083,11 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q); __wt_spin_unlock(NULL, &__wt_process.spinlock); - session = conn->default_session = &conn->dummy_session; - session->iface.connection = &conn->iface; - session->name = "wiredtiger_open"; - - /* Do standard I/O and error handling first. */ - __wt_os_stdio(session); - __wt_event_handler_set(session, event_handler); - /* - * Set the default session's strerror method. If one of the extensions - * being loaded reports an error via the WT_EXTENSION_API strerror - * method, but doesn't supply that method a WT_SESSION handle, we'll - * use the WT_CONNECTION_IMPL's default session and its strerror method. + * Initialize the fake session used until we can create real sessions. */ - conn->default_session->iface.strerror = __wt_session_strerror; + wiredtiger_dummy_session_init(conn, event_handler); + session = conn->default_session = &conn->dummy_session; /* Basic initialization of the connection structure. 
*/ WT_ERR(__wt_connection_init(conn)); diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c index 1b8b3183d3c..fe5f94ea03d 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache.c @@ -183,26 +183,26 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) * get any work done. */ if (cache->eviction_target >= cache->eviction_trigger) - WT_ERR_MSG(session, EINVAL, + WT_RET_MSG(session, EINVAL, "eviction target must be lower than the eviction trigger"); - WT_ERR(__wt_cond_auto_alloc(session, "cache eviction server", + WT_RET(__wt_cond_auto_alloc(session, "cache eviction server", false, 10000, WT_MILLION, &cache->evict_cond)); - WT_ERR(__wt_spin_init(session, &cache->evict_pass_lock, "evict pass")); - WT_ERR(__wt_spin_init(session, + WT_RET(__wt_spin_init(session, &cache->evict_pass_lock, "evict pass")); + WT_RET(__wt_spin_init(session, &cache->evict_queue_lock, "cache eviction queue")); - WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk")); + WT_RET(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk")); if ((ret = __wt_open_internal_session(conn, "evict pass", false, WT_SESSION_NO_DATA_HANDLES, &cache->walk_session)) != 0) - WT_ERR_MSG(NULL, ret, + WT_RET_MSG(NULL, ret, "Failed to create session for eviction walks"); /* Allocate the LRU eviction queue. 
*/ cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR; for (i = 0; i < WT_EVICT_QUEUE_MAX; ++i) { - WT_ERR(__wt_calloc_def(session, + WT_RET(__wt_calloc_def(session, cache->evict_slots, &cache->evict_queues[i].evict_queue)); - WT_ERR(__wt_spin_init(session, + WT_RET(__wt_spin_init(session, &cache->evict_queues[i].evict_lock, "cache eviction")); } @@ -218,9 +218,6 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) */ __wt_cache_stats_update(session); return (0); - -err: WT_RET(__wt_cache_destroy(session)); - return (ret); } /* diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c index 5ff8b7f798b..5104624523b 100644 --- a/src/third_party/wiredtiger/src/conn/conn_handle.c +++ b/src/third_party/wiredtiger/src/conn/conn_handle.c @@ -50,21 +50,23 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) /* Statistics. */ __wt_stat_connection_init(conn); - /* Locks. */ + /* Spinlocks. */ WT_RET(__wt_spin_init(session, &conn->api_lock, "api")); - WT_RET(__wt_spin_init(session, &conn->checkpoint_lock, "checkpoint")); - WT_RET(__wt_spin_init(session, &conn->dhandle_lock, "data handle")); + WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint); + WT_SPIN_INIT_TRACKED(session, &conn->dhandle_lock, handle_list); WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor")); WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); - WT_RET(__wt_rwlock_alloc(session, - &conn->hot_backup_lock, "hot backup")); WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); - WT_RET(__wt_spin_init(session, &conn->metadata_lock, "metadata")); + WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); - WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); - WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation")); + WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema); + 
WT_SPIN_INIT_TRACKED(session, &conn->table_lock, table); WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file")); + /* Read-write locks */ + WT_RET(__wt_rwlock_alloc( + session, &conn->hot_backup_lock, "hot backup")); + WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->page_lock); for (i = 0; i < WT_PAGE_LOCKS; ++i) diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c index d5a31c671c0..0715a035807 100644 --- a/src/third_party/wiredtiger/src/conn/conn_stat.c +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -130,12 +130,12 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) WT_RET(__wt_config_gets(session, cfg, "statistics_log.json", &cval)); if (cval.val != 0) - FLD_SET(conn->stat_flags, WT_CONN_STAT_JSON); + FLD_SET(conn->stat_flags, WT_STAT_JSON); WT_RET(__wt_config_gets( session, cfg, "statistics_log.on_close", &cval)); if (cval.val != 0) - FLD_SET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE); + FLD_SET(conn->stat_flags, WT_STAT_ON_CLOSE); /* * We don't allow the log path to be reconfigured for security reasons. 
@@ -206,7 +206,7 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) #define WT_TIMESTAMP_JSON_DEFAULT "%Y-%m-%dT%H:%M:%S.000Z" WT_ERR(__wt_config_gets( session, cfg, "statistics_log.timestamp", &cval)); - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_JSON) && + if (FLD_ISSET(conn->stat_flags, WT_STAT_JSON) && WT_STRING_MATCH(WT_TIMESTAMP_DEFAULT, cval.str, cval.len)) WT_ERR(__wt_strdup( session, WT_TIMESTAMP_JSON_DEFAULT, &conn->stat_format)); @@ -264,7 +264,7 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats) goto err; } - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_JSON)) { + if (FLD_ISSET(conn->stat_flags, WT_STAT_JSON)) { WT_ERR(__wt_fprintf(session, conn->stat_fs, "{\"version\":\"%s\",\"localTime\":\"%s\"", WIREDTIGER_VERSION_STRING, conn->stat_stamp)); @@ -482,7 +482,7 @@ __wt_statlog_log_one(WT_SESSION_IMPL *session) conn = S2C(session); - if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE)) + if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE)) return (0); if (F_ISSET(conn, WT_CONN_SERVER_RUN) && diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index e304cf7b775..9fc466f4c76 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -117,12 +117,12 @@ err: API_END_RET(session, ret); } /* - * __curfile_next_random -- + * __wt_curfile_next_random -- * WT_CURSOR->next method for the btree cursor type when configured with - * next_random. + * next_random. This is exported because it is called directly within LSM. 
*/ -static int -__curfile_next_random(WT_CURSOR *cursor) +int +__wt_curfile_next_random(WT_CURSOR *cursor) { WT_CURSOR_BTREE *cbt; WT_DECL_RET; @@ -473,7 +473,7 @@ __curfile_create(WT_SESSION_IMPL *session, "column-store objects"); __wt_cursor_set_notsup(cursor); - cursor->next = __curfile_next_random; + cursor->next = __wt_curfile_next_random; cursor->reset = __curfile_reset; WT_ERR(__wt_config_gets_def( diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c index 700cc366ff0..b36416debe1 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_stat.c +++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c @@ -354,7 +354,7 @@ __curstat_conn_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) */ __wt_conn_stat_init(session); __wt_stat_connection_aggregate(conn->stats, &cst->u.conn_stats); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) __wt_stat_connection_clear_all(conn->stats); cst->stats = (int64_t *)&cst->u.conn_stats; @@ -380,7 +380,7 @@ __curstat_file_init(WT_SESSION_IMPL *session, * If we are only getting the size of the file, we don't need to open * the tree. 
*/ - if (F_ISSET(cst, WT_CONN_STAT_SIZE)) { + if (F_ISSET(cst, WT_STAT_TYPE_SIZE)) { filename = uri; if (!WT_PREFIX_SKIP(filename, "file:")) return (EINVAL); @@ -401,7 +401,7 @@ __curstat_file_init(WT_SESSION_IMPL *session, if ((ret = __wt_btree_stat_init(session, cst)) == 0) { __wt_stat_dsrc_init_single(&cst->u.dsrc_stats); __wt_stat_dsrc_aggregate(dhandle->stats, &cst->u.dsrc_stats); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) __wt_stat_dsrc_clear_all(dhandle->stats); __wt_curstat_dsrc_final(cst); } @@ -604,50 +604,79 @@ __wt_curstat_open(WT_SESSION_IMPL *session, if ((ret = __wt_config_gets(session, cfg, "statistics", &cval)) == 0) { if ((ret = __wt_config_subgets( session, &cval, "all", &sval)) == 0 && sval.val != 0) { - if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL)) + if (!FLD_ISSET(conn->stat_flags, WT_STAT_TYPE_ALL)) goto config_err; - F_SET(cst, WT_CONN_STAT_ALL | WT_CONN_STAT_FAST); + F_SET(cst, WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | + WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); } WT_ERR_NOTFOUND_OK(ret); if ((ret = __wt_config_subgets( session, &cval, "fast", &sval)) == 0 && sval.val != 0) { - if (F_ISSET(cst, WT_CONN_STAT_ALL)) + if (F_ISSET(cst, WT_STAT_TYPE_ALL)) WT_ERR_MSG(session, EINVAL, - "only one statistics configuration value " - "may be specified"); - F_SET(cst, WT_CONN_STAT_FAST); + "Only one of all, fast, none " + "configuration values should be specified"); + F_SET(cst, WT_STAT_TYPE_FAST); } WT_ERR_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets(session, + &cval, "cache_walk", &sval)) == 0 && sval.val != 0) { + /* + * Configuring cache walk statistics implies fast + * statistics. Keep that knowledge internal for now - + * it may change in the future. 
+ */ + F_SET(cst, WT_STAT_TYPE_CACHE_WALK | WT_STAT_TYPE_FAST); + } + WT_ERR_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets(session, + &cval, "tree_walk", &sval)) == 0 && sval.val != 0) { + /* + * Configuring tree walk statistics implies fast + * statistics. Keep that knowledge internal for now - + * it may change in the future. + */ + F_SET(cst, WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); + } + WT_ERR_NOTFOUND_OK(ret); + if ((ret = __wt_config_subgets( session, &cval, "size", &sval)) == 0 && sval.val != 0) { - if (F_ISSET(cst, WT_CONN_STAT_FAST | WT_CONN_STAT_ALL)) + if (F_ISSET(cst, WT_STAT_TYPE_FAST | WT_STAT_TYPE_ALL)) WT_ERR_MSG(session, EINVAL, - "only one statistics configuration value " - "may be specified"); - F_SET(cst, WT_CONN_STAT_SIZE); + "Only one of all, fast, none " + "configuration values should be specified"); + F_SET(cst, WT_STAT_TYPE_SIZE); } WT_ERR_NOTFOUND_OK(ret); if ((ret = __wt_config_subgets( session, &cval, "clear", &sval)) == 0 && sval.val != 0) { - if (F_ISSET(cst, WT_CONN_STAT_SIZE)) + if (F_ISSET(cst, WT_STAT_TYPE_SIZE)) WT_ERR_MSG(session, EINVAL, "clear is incompatible with size " "statistics"); - F_SET(cst, WT_CONN_STAT_CLEAR); + F_SET(cst, WT_STAT_CLEAR); } WT_ERR_NOTFOUND_OK(ret); /* If no configuration, use the connection's configuration. */ if (cst->flags == 0) { - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL)) - F_SET(cst, WT_CONN_STAT_ALL); - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_FAST)) - F_SET(cst, WT_CONN_STAT_FAST); + if (FLD_ISSET(conn->stat_flags, WT_STAT_TYPE_ALL)) + F_SET(cst, WT_STAT_TYPE_ALL); + if (FLD_ISSET( + conn->stat_flags, WT_STAT_TYPE_CACHE_WALK)) + F_SET(cst, WT_STAT_TYPE_CACHE_WALK); + if (FLD_ISSET(conn->stat_flags, WT_STAT_TYPE_FAST)) + F_SET(cst, WT_STAT_TYPE_FAST); + if (FLD_ISSET(conn->stat_flags, WT_STAT_TYPE_TREE_WALK)) + F_SET(cst, WT_STAT_TYPE_TREE_WALK); } /* If the connection configures clear, so do we. 
*/ - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR)) - F_SET(cst, WT_CONN_STAT_CLEAR); + if (FLD_ISSET(conn->stat_flags, WT_STAT_CLEAR)) + F_SET(cst, WT_STAT_CLEAR); } /* @@ -670,9 +699,9 @@ __wt_curstat_open(WT_SESSION_IMPL *session, /* * Do the initial statistics snapshot: there won't be cursor operations - * to trigger initialization when aggregating statistics for upper-level - * objects like tables, we need to a valid set of statistics when before - * the open returns. + * to trigger initialization with aggregating statistics for upper-level + * objects like tables so we need a valid set of statistics before the + * open returns. */ WT_ERR(__wt_curstat_init(session, uri, other, cst->cfg, cst)); cst->notinitialized = false; diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index 1b93b27f564..6543d54e90f 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -757,13 +757,36 @@ err: API_END_RET(session, ret); } /* + * __curtable_complete -- + * Return failure if the table is not yet fully created. + */ +static int +__curtable_complete(WT_SESSION_IMPL *session, WT_TABLE *table) +{ + WT_DECL_RET; + bool complete; + + if (table->cg_complete) + return (0); + + /* If the table is incomplete, wait on the table lock and recheck. */ + complete = false; + WT_WITH_TABLE_LOCK(session, ret, complete = table->cg_complete); + WT_RET(ret); + if (!complete) + WT_RET_MSG(session, EINVAL, + "'%s' not available until all column groups are created", + table->name); + return (0); +} + +/* * __curtable_open_colgroups -- * Open cursors on column groups for a table cursor. 
*/ static int __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[]) { - WT_DECL_RET; WT_SESSION_IMPL *session; WT_TABLE *table; WT_CURSOR **cp; @@ -775,21 +798,11 @@ __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[]) cfg_arg[0], cfg_arg[1], "dump=\"\",readonly=0", NULL, NULL }; u_int i; - bool complete; session = (WT_SESSION_IMPL *)ctable->iface.session; table = ctable->table; - /* If the table is incomplete, wait on the table lock and recheck. */ - complete = table->cg_complete; - if (!complete) { - WT_WITH_TABLE_LOCK(session, ret, complete = table->cg_complete); - WT_RET(ret); - } - if (!complete) - WT_RET_MSG(session, EINVAL, - "Can't use '%s' until all column groups are created", - table->name); + WT_RET(__curtable_complete(session, table)); /* completeness check */ WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &ctable->cg_cursors)); @@ -887,6 +900,8 @@ __wt_curtable_open(WT_SESSION_IMPL *session, size = WT_PTRDIFF(columns, tablename); WT_RET(__wt_schema_get_table(session, tablename, size, false, &table)); + WT_RET(__curtable_complete(session, table)); /* completeness check */ + if (table->is_simple) { /* Just return a cursor on the underlying data source. */ ret = __wt_open_cursor(session, diff --git a/src/third_party/wiredtiger/src/docs/build-posix.dox b/src/third_party/wiredtiger/src/docs/build-posix.dox index 4889bf931c9..3e7f8f37acd 100644 --- a/src/third_party/wiredtiger/src/docs/build-posix.dox +++ b/src/third_party/wiredtiger/src/docs/build-posix.dox @@ -150,10 +150,14 @@ Configure WiredTiger to support the \c verbose configuration string to Configure WiredTiger for <a href="http://www.zlib.net/">zlib</a> compression; see @ref compression for more information. +@par \c --enable-zstd +Configure WiredTiger for <a href="https://github.com/facebook/zstd">Zstd</a> +compression; see @ref compression for more information. 
+ @par <code>--with-builtins</code> Configure WiredTiger to include support for extensions in the main library. This avoids requiring additional libraries for supported extensions. Currently -supported options are \c lz4, \c snappy and \c zlib. +supported options are \c lz4, \c snappy, \c zlib and \c zstd. @par <code>--with-python-prefix</code> Configure WiredTiger to install Python libraries to a non-standard Python diff --git a/src/third_party/wiredtiger/src/docs/compression.dox b/src/third_party/wiredtiger/src/docs/compression.dox index 0be96835760..74bed5c6f68 100644 --- a/src/third_party/wiredtiger/src/docs/compression.dox +++ b/src/third_party/wiredtiger/src/docs/compression.dox @@ -1,7 +1,7 @@ /*! @m_page{{c,java},compression,Compressors} This section explains how to configure WiredTiger's builtin support for -the lz4, snappy and zlib compression engines. +the lz4, snappy, zlib and zstd compression engines. @section compression_lz4 Using LZ4 compression @@ -85,11 +85,53 @@ an extension. For example, with the WiredTiger library installed in @snippet ex_all.c Configure zlib extension +The default compression level for the zlib compression is +\c Z_DEFAULT_COMPRESSION (see the zlib documentation for further +information); compression can be configured to other levels using the +additional configuration argument \c compression_level. + +@snippet ex_all.c Configure zlib extension with compression level + Finally, when creating the WiredTiger object, set \c block_compressor to \c zlib: @snippet ex_all.c Create a zlib compressed table +@section compression_zstd Using Zstd compression + +To use the builtin support for Facebook's +<a href="https://github.com/facebook/zstd">Zstd</a> +compression, first check that Zstd is installed in include and library +directories searched by the compiler. Once Zstd is installed, you can +enable Zstd using the \c --enable-zstd option to configure. 
+ +If Zstd is installed in a location not normally searched by the +compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS +to indicate these locations. For example, with the Zstd includes and +libraries installed in \c /usr/local/include and \c /usr/local/lib, you +would run configure with the following additional arguments: + +@code +--enable-zstd CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib" +@endcode + +When opening the WiredTiger database, load the Zstd shared library as +an extension. For example, with the WiredTiger library installed in +\c /usr/local/lib, you would use the following extension: + +@snippet ex_all.c Configure zstd extension + +The default compression level for the zstd compression is 3; compression +can be configured to other levels using the additional configuration +argument \c compression_level. + +@snippet ex_all.c Configure zstd extension with compression level + +Finally, when creating the WiredTiger object, set \c block_compressor +to \c zstd: + +@snippet ex_all.c Create a zstd compressed table + @section compression_upgrading Upgrading compression engines WiredTiger does not store information with file blocks to identify the diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok index a2ef7658ec6..4b1337f84b8 100644 --- a/src/third_party/wiredtiger/src/docs/spell.ok +++ b/src/third_party/wiredtiger/src/docs/spell.ok @@ -95,6 +95,7 @@ WiredTigerStat WiredTigerTestCase Yann Za +Zstd aR abstime ack'ed @@ -507,3 +508,4 @@ xa yieldcpu zlib zseries +zstd diff --git a/src/third_party/wiredtiger/src/docs/wtperf.dox b/src/third_party/wiredtiger/src/docs/wtperf.dox index df66ad43355..83aadf8a776 100644 --- a/src/third_party/wiredtiger/src/docs/wtperf.dox +++ b/src/third_party/wiredtiger/src/docs/wtperf.dox @@ -155,10 +155,12 @@ checkpoint every rate operations during the populate phase in the populate thre number of checkpoint threads @par conn_config (string, 
default="create") connection configuration string +@par close_conn (boolean, default=true) +properly close connection at end of test. Setting to false does not sync data to disk and can result in lost data after test exits. @par compact (boolean, default=false) post-populate compact for LSM merging activity @par compression (string, default="none") -compression extension. Allowed configuration values are: 'none', 'lz4', 'snappy', 'zlib' +compression extension. Allowed configuration values are: 'none', 'lz4', 'snappy', 'zlib', 'zstd' @par create (boolean, default=true) do population phase; false to use existing database @par database_count (unsigned int, default=1) diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 45ec9bce3b5..6c99f3a13dc 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -548,6 +548,7 @@ __evict_pass(WT_SESSION_IMPL *session) * does need to do some work. */ __wt_cache_read_gen_incr(session); + ++cache->evict_pass_gen; /* * Update the oldest ID: we use it to decide whether pages are @@ -1055,7 +1056,7 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - u_int max_entries, retries, slot, start_slot, spins; + u_int max_entries, retries, slot, spins, start_slot, total_candidates; bool dhandle_locked, incr; conn = S2C(session); @@ -1076,12 +1077,9 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) * Another pathological case: if there are only a tiny number of * candidate pages in cache, don't put all of them on one queue. */ - if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) - max_entries = WT_MIN(max_entries, - 1 + (uint32_t)(__wt_cache_pages_inuse(cache) / 2)); - else - max_entries = WT_MIN(max_entries, - 1 + (uint32_t)(cache->pages_dirty_leaf / 2)); + total_candidates = (u_int)(F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ? 
+ __wt_cache_pages_inuse(cache) : cache->pages_dirty_leaf); + max_entries = WT_MIN(max_entries, 1 + total_candidates / 2); retry: while (slot < max_entries) { /* @@ -1286,8 +1284,8 @@ __evict_push_candidate(WT_SESSION_IMPL *session, * Get a few page eviction candidates from a single underlying file. */ static int -__evict_walk_file(WT_SESSION_IMPL *session, - WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp) +__evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, + u_int max_entries, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; @@ -1414,6 +1412,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, page = ref->page; modified = __wt_page_is_modified(page); + page->evict_pass_gen = cache->evict_pass_gen; /* * Use the EVICT_LRU flag to avoid putting pages onto the list @@ -1560,7 +1559,7 @@ __evict_get_ref( server_only = is_server && !WT_EVICT_HAS_WORKERS(session); urgent_ok = (!is_app && !is_server) || !WT_EVICT_HAS_WORKERS(session) || - __wt_cache_aggressive(session); + (is_app && __wt_cache_aggressive(session)); urgent_queue = cache->evict_urgent_queue; *btreep = NULL; *refp = NULL; diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 092f80cc000..3d1557e027e 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -31,23 +31,14 @@ __evict_exclusive_clear(WT_SESSION_IMPL *session, WT_REF *ref) static inline int __evict_exclusive(WT_SESSION_IMPL *session, WT_REF *ref) { - int loops; - WT_ASSERT(session, ref->state == WT_REF_LOCKED); /* * Check for a hazard pointer indicating another thread is using the * page, meaning the page cannot be evicted. 
*/ - for (loops = 0; loops < 10; loops++) { - if (__wt_page_hazard_check(session, ref->page) == NULL) - return (0); - if (ref->page->read_gen != WT_READGEN_OLDEST && - ref->page->memory_footprint < - S2BT(session)->split_deepen_min_child) - break; - __wt_sleep(0, WT_THOUSAND); - } + if (__wt_page_hazard_check(session, ref->page) == NULL) + return (0); WT_STAT_DATA_INCR(session, cache_eviction_hazard); WT_STAT_CONN_INCR(session, cache_eviction_hazard); diff --git a/src/third_party/wiredtiger/src/evict/evict_stat.c b/src/third_party/wiredtiger/src/evict/evict_stat.c new file mode 100644 index 00000000000..2dd3b1e83a0 --- /dev/null +++ b/src/third_party/wiredtiger/src/evict/evict_stat.c @@ -0,0 +1,138 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __evict_stat_walk -- + * Walk all the pages in cache for a dhandle gathering stats information + */ +static void +__evict_stat_walk(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_PAGE *page; + WT_REF *next_walk; + uint64_t dsk_size, gen_gap, size; + uint64_t written_size_cnt, written_size_sum; + uint64_t gen_gap_cnt, gen_gap_max, gen_gap_sum; + uint64_t max_pagesize, min_written_size; + uint64_t num_memory, num_queued, num_not_queueable, num_smaller_allocsz; + uint64_t pages_clean, pages_dirty, pages_internal, pages_leaf; + uint64_t seen_count, walk_count; + + btree = S2BT(session); + next_walk = NULL; + written_size_cnt = written_size_sum = 0; + gen_gap_cnt = gen_gap_max = gen_gap_sum = 0; + max_pagesize = 0; + num_memory = num_queued = num_not_queueable = num_smaller_allocsz = 0; + pages_clean = pages_dirty = pages_internal = pages_leaf = 0; + seen_count = walk_count = 0; + min_written_size = UINT64_MAX; + + while (__wt_tree_walk_count(session, &next_walk, &walk_count, + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && + next_walk 
!= NULL) { + ++seen_count; + page = next_walk->page; + size = page->memory_footprint; + + if (__wt_page_is_modified(page)) + ++pages_dirty; + else + ++pages_clean; + + if (!__wt_ref_is_root(next_walk) && + !__wt_page_can_evict(session, next_walk, NULL)) + ++num_not_queueable; + + if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) + ++num_queued; + + if (size > max_pagesize) + max_pagesize = size; + + dsk_size = page->dsk != NULL ? page->dsk->mem_size : 0; + if (dsk_size != 0) { + if (dsk_size < btree->allocsize) + ++num_smaller_allocsz; + if (dsk_size < min_written_size) + min_written_size = dsk_size; + ++written_size_cnt; + written_size_sum += dsk_size; + } else + ++num_memory; + + if (WT_PAGE_IS_INTERNAL(page)) + ++pages_internal; + else + ++pages_leaf; + + /* Skip root pages since they are never considered */ + if (__wt_ref_is_root(next_walk)) + continue; + + gen_gap = + S2C(session)->cache->evict_pass_gen - page->evict_pass_gen; + if (gen_gap > gen_gap_max) + gen_gap_max = gen_gap; + gen_gap_sum += gen_gap; + ++gen_gap_cnt; + } + + WT_STAT_DATA_SET(session, cache_state_avg_written_size, + written_size_cnt == 0 ? 0 : written_size_sum / written_size_cnt); + WT_STAT_DATA_SET(session, cache_state_gen_avg_gap, + gen_gap_cnt == 0 ? 
0 : gen_gap_sum / gen_gap_cnt); + + WT_STAT_DATA_SET(session, cache_state_gen_max_gap, gen_gap_max); + WT_STAT_DATA_SET(session, cache_state_max_pagesize, max_pagesize); + WT_STAT_DATA_SET(session, + cache_state_min_written_size, min_written_size); + WT_STAT_DATA_SET(session, cache_state_memory, num_memory); + WT_STAT_DATA_SET(session, cache_state_queued, num_queued); + WT_STAT_DATA_SET(session, cache_state_not_queueable, num_not_queueable); + WT_STAT_DATA_SET(session, + cache_state_smaller_alloc_size, num_smaller_allocsz); + WT_STAT_DATA_SET(session, cache_state_pages, walk_count); + WT_STAT_DATA_SET(session, cache_state_pages_clean, pages_clean); + WT_STAT_DATA_SET(session, cache_state_pages_dirty, pages_dirty); + WT_STAT_DATA_SET(session, cache_state_pages_internal, pages_internal); + WT_STAT_DATA_SET(session, cache_state_pages_leaf, pages_leaf); + WT_STAT_DATA_SET(session, + cache_state_refs_skipped, walk_count - seen_count); +} + +/* + * __wt_curstat_cache_walk -- + * Initialize the statistics for a cache cache_walk pass. + */ +void +__wt_curstat_cache_walk(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; + WT_PAGE_INDEX *root_idx; + + btree = S2BT(session); + conn = S2C(session); + + /* Set statistics that don't require walking the cache. 
*/ + WT_STAT_DATA_SET(session, + cache_state_gen_current, conn->cache->evict_pass_gen); + + /* Root page statistics */ + root_idx = WT_INTL_INDEX_GET_SAFE(btree->root.page); + WT_STAT_DATA_SET(session, + cache_state_root_entries, root_idx->entries); + WT_STAT_DATA_SET(session, + cache_state_root_size, btree->root.page->memory_footprint); + + WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session)); +} diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index b4ca937e7ed..84c91097a99 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -430,6 +430,8 @@ struct __wt_page_modify { #define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */ #define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */ uint8_t rec_result; /* Reconciliation state */ + + uint8_t update_restored; /* Page created by restoring updates */ }; /* @@ -619,6 +621,8 @@ struct __wt_page { #define WT_READGEN_START_VALUE 100 #define WT_READGEN_STEP 100 uint64_t read_gen; + /* The evict pass generation for the page */ + uint64_t evict_pass_gen; size_t memory_footprint; /* Memory attached to the page */ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 74ebf74f1e9..daf2eb158c1 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -485,6 +485,38 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) } /* + * __wt_tree_modify_set -- + * Mark the tree dirty. + */ +static inline void +__wt_tree_modify_set(WT_SESSION_IMPL *session) +{ + /* + * Test before setting the dirty flag, it's a hot cache line. + * + * The tree's modified flag is cleared by the checkpoint thread: set it + * and insert a barrier before dirtying the page. 
(I don't think it's + * a problem if the tree is marked dirty with all the pages clean, it + * might result in an extra checkpoint that doesn't do any work but it + * shouldn't cause problems; regardless, let's play it safe.) + */ + if (!S2BT(session)->modified) { + /* Assert we never dirty a checkpoint handle. */ + WT_ASSERT(session, session->dhandle->checkpoint == NULL); + + S2BT(session)->modified = true; + WT_FULL_BARRIER(); + } + + /* + * The btree may already be marked dirty while the connection is still + * clean; mark the connection dirty outside the test of the btree state. + */ + if (!S2C(session)->modified) + S2C(session)->modified = true; +} + +/* * __wt_page_modify_clear -- * Clean a modified page. */ @@ -513,30 +545,9 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) /* * Mark the tree dirty (even if the page is already marked dirty), newly * created pages to support "empty" files are dirty, but the file isn't - * marked dirty until there's a real change needing to be written. Test - * before setting the dirty flag, it's a hot cache line. - * - * The tree's modified flag is cleared by the checkpoint thread: set it - * and insert a barrier before dirtying the page. (I don't think it's - * a problem if the tree is marked dirty with all the pages clean, it - * might result in an extra checkpoint that doesn't do any work but it - * shouldn't cause problems; regardless, let's play it safe.) - */ - if (!S2BT(session)->modified) { - /* Assert we never dirty a checkpoint handle. */ - WT_ASSERT(session, session->dhandle->checkpoint == NULL); - - S2BT(session)->modified = true; - WT_FULL_BARRIER(); - } - - /* - * There is a possibility of btree being dirty whereas connection being - * clean when entering this function. So make sure to update connection - * to dirty outside a condition on btree modified flag. + * marked dirty until there's a real change needing to be written. 
*/ - if (!S2C(session)->modified) - S2C(session)->modified = true; + __wt_tree_modify_set(session); __wt_page_only_modify_set(session, page); } @@ -1167,15 +1178,7 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * There is no point doing an in-memory split unless there is a lot of * data in the last skiplist on the page. Split if there are enough * items and the skiplist does not fit within a single disk page. - * - * Rather than scanning the whole list, walk a higher level, which - * gives a sample of the items -- at level 0 we have all the items, at - * level 1 we have 1/4 and at level 2 we have 1/16th. If we see more - * than 30 items and more data than would fit in a disk page, split. */ -#define WT_MIN_SPLIT_DEPTH 2 -#define WT_MIN_SPLIT_COUNT 30 -#define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */ ins_head = page->type == WT_PAGE_ROW_LEAF ? (page->pg_row_entries == 0 ? @@ -1184,8 +1187,40 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) WT_COL_APPEND(page); if (ins_head == NULL) return (false); + + /* + * In the extreme case, where the page is much larger than the maximum + * size, split as soon as there are 5 items on the page. + */ +#define WT_MAX_SPLIT_COUNT 5 + if (page->memory_footprint > btree->maxleafpage * 2) { + for (count = 0, ins = ins_head->head[0]; + ins != NULL; + ins = ins->next[0]) { + if (++count < WT_MAX_SPLIT_COUNT) + continue; + + WT_STAT_CONN_INCR(session, cache_inmem_splittable); + WT_STAT_DATA_INCR(session, cache_inmem_splittable); + return (true); + } + + return (false); + } + + /* + * Rather than scanning the whole list, walk a higher level, which + * gives a sample of the items -- at level 0 we have all the items, at + * level 1 we have 1/4 and at level 2 we have 1/16th. If we see more + * than 30 items and more data than would fit in a disk page, split. 
+ */ +#define WT_MIN_SPLIT_DEPTH 2 +#define WT_MIN_SPLIT_COUNT 30 +#define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */ + for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH]; - ins != NULL; ins = ins->next[WT_MIN_SPLIT_DEPTH]) { + ins != NULL; + ins = ins->next[WT_MIN_SPLIT_DEPTH]) { count += WT_MIN_SPLIT_MULTIPLIER; size += WT_MIN_SPLIT_MULTIPLIER * (WT_INSERT_KEY_SIZE(ins) + WT_UPDATE_MEMSIZE(ins->upd)); diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index b24b625aec4..9a2b83b5b57 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -91,6 +91,7 @@ struct __wt_cache { uint64_t read_gen; /* Current page read generation */ uint64_t read_gen_oldest; /* Oldest read generation the eviction * server saw in its last queue load */ + uint64_t evict_pass_gen; /* Number of eviction passes */ /* * Eviction thread information. diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index ce81dcf5976..d7c3bf69686 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -285,13 +285,7 @@ struct __wt_connection_impl { uint64_t ckpt_time_recent; /* Checkpoint time recent/total */ uint64_t ckpt_time_total; -#define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */ -#define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */ -#define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */ -#define WT_CONN_STAT_JSON 0x08 /* output JSON format */ -#define WT_CONN_STAT_ON_CLOSE 0x10 /* output statistics on close */ -#define WT_CONN_STAT_SIZE 0x20 /* "size" statistics configured */ - uint32_t stat_flags; + uint32_t stat_flags; /* Options declared in flags.py */ /* Connection statistics */ WT_CONNECTION_STATS *stats[WT_COUNTER_SLOTS]; diff --git 
a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index f1fa4d193ac..e322a53a65d 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -467,7 +467,7 @@ struct __wt_cursor_stat { uint64_t v; /* Current stats value */ WT_ITEM pv; /* Current stats value (string) */ - /* Uses the same values as WT_CONNECTION::stat_flags field */ + /* Options declared in flags.py, shared by WT_CONNECTION::stat_flags */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index e3cffa4ca3c..79e6405e148 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -282,6 +282,7 @@ extern int __wt_curbulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bo extern int __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_curfile_next_random(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curfile_update_check(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -352,6 +353,7 @@ extern void 
__wt_evict_priority_clear(WT_SESSION_IMPL *session); extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session); extern void __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); extern int __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn); @@ -723,7 +725,7 @@ extern int __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_F extern void __wt_txn_release(WT_SESSION_IMPL *session); extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_txn_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_txn_stats_update(WT_SESSION_IMPL *session); extern void __wt_txn_destroy(WT_SESSION_IMPL *session); extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 5d718da473d..b0d167525b2 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -70,6 +70,14 @@ #define WT_SESSION_NO_SCHEMA_LOCK 0x00020000 #define WT_SESSION_QUIET_CORRUPT_FILE 
0x00040000 #define WT_SESSION_SERVER_ASYNC 0x00080000 +#define WT_STAT_CLEAR 0x00000001 +#define WT_STAT_JSON 0x00000002 +#define WT_STAT_ON_CLOSE 0x00000004 +#define WT_STAT_TYPE_ALL 0x00000008 +#define WT_STAT_TYPE_CACHE_WALK 0x00000010 +#define WT_STAT_TYPE_FAST 0x00000020 +#define WT_STAT_TYPE_SIZE 0x00000040 +#define WT_STAT_TYPE_TREE_WALK 0x00000080 #define WT_TXN_LOG_CKPT_CLEANUP 0x00000001 #define WT_TXN_LOG_CKPT_PREPARE 0x00000002 #define WT_TXN_LOG_CKPT_START 0x00000004 diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h index 2550ca444c1..b433e4c3c44 100644 --- a/src/third_party/wiredtiger/src/include/lsm.h +++ b/src/third_party/wiredtiger/src/include/lsm.h @@ -31,6 +31,17 @@ struct __wt_lsm_worker_args { }; /* + * WT_LSM_CURSOR_CHUNK -- + * Iterator struct containing all the LSM cursor access points for a chunk. + */ +struct __wt_lsm_cursor_chunk { + WT_BLOOM *bloom; /* Bloom filter handle for each chunk.*/ + WT_CURSOR *cursor; /* Cursor handle for each chunk. */ + uint64_t count; /* Number of items in chunk */ + uint64_t switch_txn; /* Switch txn for each chunk */ +}; + +/* * WT_CURSOR_LSM -- * An LSM cursor. */ @@ -43,17 +54,12 @@ struct __wt_cursor_lsm { u_int nchunks; /* Number of chunks in the cursor */ u_int nupdates; /* Updates needed (including snapshot isolation checks). */ - WT_BLOOM **blooms; /* Bloom filter handles. */ - size_t bloom_alloc; - - WT_CURSOR **cursors; /* Cursor handles. */ - size_t cursor_alloc; - - WT_CURSOR *current; /* The current cursor for iteration */ + WT_CURSOR *current; /* The current cursor for iteration */ WT_LSM_CHUNK *primary_chunk; /* The current primary chunk */ - uint64_t *switch_txn; /* Switch txn for each chunk */ - size_t txnid_alloc; + WT_LSM_CURSOR_CHUNK **chunks; /* Array of LSM cursor units */ + size_t chunks_alloc; /* Current size iterators array */ + size_t chunks_count; /* Current number of iterators */ u_int update_count; /* Updates performed. 
*/ diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h index f0f8173bad4..b736d6ee9fb 100644 --- a/src/third_party/wiredtiger/src/include/mutex.h +++ b/src/third_party/wiredtiger/src/include/mutex.h @@ -74,6 +74,16 @@ struct __wt_rwlock { struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock { volatile int lock; + + /* + * We track acquisitions and time spent waiting for some locks. For + * performance reasons and to make it possible to write generic code + * that tracks statistics for different locks, we store the offset + * of the statistics fields to be updated during lock acquisition. + */ + int16_t stat_count_off; /* acquisitions offset */ + int16_t stat_app_usecs_off; /* waiting application threads offset */ + int16_t stat_int_usecs_off; /* waiting server threads offset */ }; #elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\ @@ -83,7 +93,17 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock { struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock { wt_mutex_t lock; - const char *name; /* Statistics: mutex name */ + const char *name; /* Mutex name */ + + /* + * We track acquisitions and time spent waiting for some locks. For + * performance reasons and to make it possible to write generic code + * that tracks statistics for different locks, we store the offset + * of the statistics fields to be updated during lock acquisition. 
+ */ + int16_t stat_count_off; /* acquisitions offset */ + int16_t stat_app_usecs_off; /* waiting application threads offset */ + int16_t stat_int_usecs_off; /* waiting server threads offset */ int8_t initialized; /* Lock initialized, for cleanup */ }; diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i index cb1847d9991..a6309e0976b 100644 --- a/src/third_party/wiredtiger/src/include/mutex.i +++ b/src/third_party/wiredtiger/src/include/mutex.i @@ -32,6 +32,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) WT_UNUSED(name); t->lock = 0; + t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; return (0); } @@ -111,6 +112,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) #endif t->name = name; + t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; t->initialized = 1; WT_UNUSED(session); @@ -255,3 +257,46 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) #error Unknown spinlock type #endif + +/* + * WT_SPIN_INIT_TRACKED -- + * Spinlock initialization, with tracking. + * + * Implemented as a macro so we can pass in a statistics field and convert + * it into a statistics structure array offset. + */ +#define WT_SPIN_INIT_TRACKED(session, t, name) do { \ + WT_RET(__wt_spin_init(session, t, #name)); \ + (t)->stat_count_off = (int16_t)WT_STATS_FIELD_TO_OFFSET( \ + S2C(session)->stats, lock_##name##_count); \ + (t)->stat_app_usecs_off = (int16_t)WT_STATS_FIELD_TO_OFFSET( \ + S2C(session)->stats, lock_##name##_wait_application); \ + (t)->stat_int_usecs_off = (int16_t)WT_STATS_FIELD_TO_OFFSET( \ + S2C(session)->stats, lock_##name##_wait_internal); \ +} while (0) + +/* + * __wt_spin_lock_track -- + * Spinlock acquisition, with tracking. 
+ */ +static inline void +__wt_spin_lock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + struct timespec enter, leave; + int64_t **stats; + + if (t->stat_count_off != -1 && WT_STAT_ENABLED(session)) { + __wt_epoch(session, &enter); + __wt_spin_lock(session, t); + __wt_epoch(session, &leave); + stats = (int64_t **)S2C(session)->stats; + stats[session->stat_bucket][t->stat_count_off]++; + if (F_ISSET(session, WT_SESSION_INTERNAL)) + stats[session->stat_bucket][t->stat_int_usecs_off] += + (int64_t)WT_TIMEDIFF_US(leave, enter); + else + stats[session->stat_bucket][t->stat_app_usecs_off] += + (int64_t)WT_TIMEDIFF_US(leave, enter); + } else + __wt_spin_lock(session, t); +} diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h index f93c596e2ca..6a5ce67a867 100644 --- a/src/third_party/wiredtiger/src/include/schema.h +++ b/src/third_party/wiredtiger/src/include/schema.h @@ -86,11 +86,11 @@ struct __wt_table { if (F_ISSET(session, (flag))) { \ op; \ } else { \ - __wt_spin_lock(session, (lock)); \ + __wt_spin_lock_track(session, lock); \ F_SET(session, (flag)); \ op; \ F_CLR(session, (flag)); \ - __wt_spin_unlock(session, (lock)); \ + __wt_spin_unlock(session, lock); \ } \ } while (0) @@ -102,11 +102,11 @@ struct __wt_table { ret = 0; \ if (!F_ISSET(session, (flag)) && \ F_ISSET(session, WT_SESSION_LOCK_NO_WAIT)) { \ - if ((ret = __wt_spin_trylock(session, (lock))) == 0) { \ + if ((ret = __wt_spin_trylock(session, lock)) == 0) { \ F_SET(session, (flag)); \ op; \ F_CLR(session, (flag)); \ - __wt_spin_unlock(session, (lock)); \ + __wt_spin_unlock(session, lock); \ } \ } else \ WT_WITH_LOCK_WAIT(session, lock, flag, op); \ diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index aa51dae58c4..3f9f495c134 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -147,6 +147,9 @@ struct 
WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { void *reconcile; /* Reconciliation support */ int (*reconcile_cleanup)(WT_SESSION_IMPL *); + /* Sessions have an associated statistics bucket based on its ID. */ + u_int stat_bucket; /* Statistics bucket offset */ + uint32_t flags; /* diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 68879206851..d0b0b60585a 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -79,9 +79,9 @@ * those structures regardless of the specific statistic structure we're working * with, by translating statistics structure field names to structure offsets. * - * Translate a statistic's value name to an offset. + * Translate a statistic's value name to an offset in the array. */ -#define WT_STATS_FIELD_TO_SLOT(stats, fld) \ +#define WT_STATS_FIELD_TO_OFFSET(stats, fld) \ (int)(&(stats)[0]->fld - (int64_t *)(stats)[0]) /* @@ -140,38 +140,54 @@ __wt_stats_clear(void *stats_arg, int slot) #define WT_STAT_ENABLED(session) (S2C(session)->stat_flags != 0) #define WT_STAT_READ(stats, fld) \ - __wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld)) + __wt_stats_aggregate(stats, WT_STATS_FIELD_TO_OFFSET(stats, fld)) #define WT_STAT_WRITE(session, stats, fld, v) do { \ if (WT_STAT_ENABLED(session)) \ (stats)->fld = (int64_t)(v); \ } while (0) -#define WT_STAT_DECRV(session, stats, fld, value) do { \ +#define WT_STAT_DECRV_BASE(session, stat, fld, value) do { \ if (WT_STAT_ENABLED(session)) \ - (stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value); \ + (stat)->fld -= (int64_t)(value); \ } while (0) -#define WT_STAT_DECRV_ATOMIC(session, stats, fld, value) do { \ +#define WT_STAT_DECRV_ATOMIC_BASE(session, stat, fld, value) do { \ + if (WT_STAT_ENABLED(session)) \ + __wt_atomic_subi64(&(stat)->fld, (int64_t)(value)); \ +} while (0) +#define WT_STAT_INCRV_BASE(session, stat, fld, value) do { \ + if 
(WT_STAT_ENABLED(session)) \ + (stat)->fld += (int64_t)(value); \ +} while (0) +#define WT_STAT_INCRV_ATOMIC_BASE(session, stat, fld, value) do { \ if (WT_STAT_ENABLED(session)) \ - __wt_atomic_subi64(&(stats)[WT_STATS_SLOT_ID(session)]->fld, \ - (int64_t)(value)); \ + __wt_atomic_addi64(&(stat)->fld, (int64_t)(value)); \ +} while (0) + +#define WT_STAT_DECRV(session, stats, fld, value) do { \ + WT_STAT_DECRV_BASE( \ + session, (stats)[(session)->stat_bucket], fld, value); \ +} while (0) +#define WT_STAT_DECRV_ATOMIC(session, stats, fld, value) do { \ + WT_STAT_DECRV_ATOMIC_BASE( \ + session, (stats)[(session)->stat_bucket], fld, value); \ } while (0) #define WT_STAT_DECR(session, stats, fld) \ WT_STAT_DECRV(session, stats, fld, 1) + #define WT_STAT_INCRV(session, stats, fld, value) do { \ - if (WT_STAT_ENABLED(session)) \ - (stats)[WT_STATS_SLOT_ID(session)]->fld += (int64_t)(value); \ + WT_STAT_INCRV_BASE( \ + session, (stats)[(session)->stat_bucket], fld, value); \ } while (0) #define WT_STAT_INCRV_ATOMIC(session, stats, fld, value) do { \ - if (WT_STAT_ENABLED(session)) \ - __wt_atomic_addi64(&(stats)[WT_STATS_SLOT_ID(session)]->fld, \ - (int64_t)(value)); \ + WT_STAT_INCRV_ATOMIC_BASE( \ + session, (stats)[(session)->stat_bucket], fld, value); \ } while (0) #define WT_STAT_INCR(session, stats, fld) \ WT_STAT_INCRV(session, stats, fld, 1) #define WT_STAT_SET(session, stats, fld, value) do { \ if (WT_STAT_ENABLED(session)) { \ __wt_stats_clear(stats, \ - WT_STATS_FIELD_TO_SLOT(stats, fld)); \ + WT_STATS_FIELD_TO_OFFSET(stats, fld)); \ (stats)[0]->fld = (int64_t)(value); \ } \ } while (0) @@ -179,18 +195,24 @@ __wt_stats_clear(void *stats_arg, int slot) /* * Update connection handle statistics if statistics gathering is enabled. 
*/ -#define WT_STAT_CONN_DECR(session, fld) \ - WT_STAT_DECR(session, S2C(session)->stats, fld) -#define WT_STAT_CONN_DECR_ATOMIC(session, fld) \ - WT_STAT_DECRV_ATOMIC(session, S2C(session)->stats, fld, 1) #define WT_STAT_CONN_DECRV(session, fld, value) \ - WT_STAT_DECRV(session, S2C(session)->stats, fld, value) -#define WT_STAT_CONN_INCR(session, fld) \ - WT_STAT_INCR(session, S2C(session)->stats, fld) -#define WT_STAT_CONN_INCR_ATOMIC(session, fld) \ - WT_STAT_INCRV_ATOMIC(session, S2C(session)->stats, fld, 1) + WT_STAT_DECRV_BASE(session, \ + S2C(session)->stats[(session)->stat_bucket], fld, value) +#define WT_STAT_CONN_DECR_ATOMIC(session, fld) \ + WT_STAT_DECRV_ATOMIC_BASE(session, \ + S2C(session)->stats[(session)->stat_bucket], fld, 1) +#define WT_STAT_CONN_DECR(session, fld) \ + WT_STAT_CONN_DECRV(session, fld, 1) + #define WT_STAT_CONN_INCRV(session, fld, value) \ - WT_STAT_INCRV(session, S2C(session)->stats, fld, value) + WT_STAT_INCRV_BASE(session, \ + S2C(session)->stats[(session)->stat_bucket], fld, value) +#define WT_STAT_CONN_INCR_ATOMIC(session, fld) \ + WT_STAT_INCRV_ATOMIC_BASE(session, \ + S2C(session)->stats[(session)->stat_bucket], fld, 1) +#define WT_STAT_CONN_INCR(session, fld) \ + WT_STAT_CONN_INCRV(session, fld, 1) + #define WT_STAT_CONN_SET(session, fld, value) \ WT_STAT_SET(session, S2C(session)->stats, fld, value) @@ -263,6 +285,10 @@ struct __wt_connection_stats { int64_t block_byte_write_checkpoint; int64_t block_map_read; int64_t block_byte_map_read; + int64_t cache_read_app_count; + int64_t cache_read_app_time; + int64_t cache_write_app_count; + int64_t cache_write_app_time; int64_t cache_bytes_image; int64_t cache_bytes_inuse; int64_t cache_bytes_other; @@ -356,6 +382,21 @@ struct __wt_connection_stats { int64_t dh_sweeps; int64_t dh_session_handles; int64_t dh_session_sweeps; + int64_t lock_checkpoint_count; + int64_t lock_checkpoint_wait_application; + int64_t lock_checkpoint_wait_internal; + int64_t lock_handle_list_count; + 
int64_t lock_handle_list_wait_application; + int64_t lock_handle_list_wait_internal; + int64_t lock_metadata_count; + int64_t lock_metadata_wait_application; + int64_t lock_metadata_wait_internal; + int64_t lock_schema_count; + int64_t lock_schema_wait_application; + int64_t lock_schema_wait_internal; + int64_t lock_table_count; + int64_t lock_table_wait_application; + int64_t lock_table_wait_internal; int64_t log_slot_switch_busy; int64_t log_slot_closes; int64_t log_slot_races; @@ -518,6 +559,24 @@ struct __wt_dsrc_stats { int64_t cache_write; int64_t cache_write_restore; int64_t cache_eviction_clean; + int64_t cache_state_gen_avg_gap; + int64_t cache_state_avg_written_size; + int64_t cache_state_pages_clean; + int64_t cache_state_gen_current; + int64_t cache_state_pages_dirty; + int64_t cache_state_root_entries; + int64_t cache_state_pages_internal; + int64_t cache_state_pages_leaf; + int64_t cache_state_gen_max_gap; + int64_t cache_state_max_pagesize; + int64_t cache_state_min_written_size; + int64_t cache_state_smaller_alloc_size; + int64_t cache_state_memory; + int64_t cache_state_queued; + int64_t cache_state_not_queueable; + int64_t cache_state_refs_skipped; + int64_t cache_state_root_size; + int64_t cache_state_pages; int64_t compress_read; int64_t compress_write; int64_t compress_write_fail; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 2b71a580532..b6185b4ead6 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -962,8 +962,9 @@ struct __wt_session { * where appropriate (for example\, a cache size statistic is not * cleared\, while the count of cursor insert operations will be * cleared). 
See @ref statistics for more information., a list\, with - * values chosen from the following options: \c "all"\, \c "fast"\, \c - * "clear"\, \c "size"; default empty.} + * values chosen from the following options: \c "all"\, \c + * "cache_walk"\, \c "fast"\, \c "clear"\, \c "size"\, \c "tree_walk"; + * default empty.} * @config{target, if non-empty\, backup the list of objects; valid only * for a backup data source., a list of strings; default empty.} * @configend @@ -1004,9 +1005,9 @@ struct __wt_session { * @config{block_compressor, configure a compressor for file blocks. * Permitted values are \c "none" or custom compression engine name * created with WT_CONNECTION::add_compressor. If WiredTiger has - * builtin support for \c "snappy"\, \c "lz4" or \c "zlib" compression\, - * these names are also available. See @ref compression for more - * information., a string; default \c none.} + * builtin support for \c "lz4"\, \c "snappy"\, \c "zlib" or \c "zstd" + * compression\, these names are also available. See @ref compression + * for more information., a string; default \c none.} * @config{cache_resident, do not ever evict the object's pages from * cache. Not compatible with LSM tables; see @ref * tuning_cache_resident for more information., a boolean flag; default @@ -1903,8 +1904,9 @@ struct __wt_connection { * reset each time a statistics cursor is used to gather statistics\, as * well as each time statistics are logged using the \c statistics_log * configuration. See @ref statistics for more information., a list\, - * with values chosen from the following options: \c "all"\, \c "fast"\, - * \c "none"\, \c "clear"; default \c none.} + * with values chosen from the following options: \c "all"\, \c + * "cache_walk"\, \c "fast"\, \c "none"\, \c "clear"\, \c "tree_walk"; + * default \c none.} * @config{statistics_log = (, log any statistics the database is * configured to maintain\, to a file. See @ref statistics for more * information. 
Enabling the statistics log server uses a session from @@ -2336,11 +2338,11 @@ struct __wt_connection { * @config{ compressor, configure a compressor for log * records. Permitted values are \c "none" or custom compression engine name * created with WT_CONNECTION::add_compressor. If WiredTiger has builtin - * support for \c "snappy"\, \c "lz4" or \c "zlib" compression\, these names are - * also available. See @ref compression for more information., a string; - * default \c none.} - * @config{ enabled, enable logging - * subsystem., a boolean flag; default \c false.} + * support for \c "lz4"\, \c "snappy"\, \c "zlib" or \c "zstd" compression\, + * these names are also available. See @ref compression for more information., + * a string; default \c none.} + * @config{ enabled, enable + * logging subsystem., a boolean flag; default \c false.} * @config{ file_max, the maximum size of log files., an * integer between 100KB and 2GB; default \c 100MB.} * @config{ path, the name of a directory into which log @@ -2406,8 +2408,9 @@ struct __wt_connection { * statistics are reset each time a statistics cursor is used to gather * statistics\, as well as each time statistics are logged using the \c * statistics_log configuration. See @ref statistics for more information., a - * list\, with values chosen from the following options: \c "all"\, \c "fast"\, - * \c "none"\, \c "clear"; default \c none.} + * list\, with values chosen from the following options: \c "all"\, \c + * "cache_walk"\, \c "fast"\, \c "none"\, \c "clear"\, \c "tree_walk"; default + * \c none.} * @config{statistics_log = (, log any statistics the database is configured to * maintain\, to a file. See @ref statistics for more information. Enabling * the statistics log server uses a session from the configured session_max., a @@ -4277,393 +4280,437 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_BLOCK_MAP_READ 1029 /*! 
block-manager: mapped bytes read */ #define WT_STAT_CONN_BLOCK_BYTE_MAP_READ 1030 +/*! cache: application threads page read from disk to cache count */ +#define WT_STAT_CONN_CACHE_READ_APP_COUNT 1031 +/*! cache: application threads page read from disk to cache time (usecs) */ +#define WT_STAT_CONN_CACHE_READ_APP_TIME 1032 +/*! cache: application threads page write from cache to disk count */ +#define WT_STAT_CONN_CACHE_WRITE_APP_COUNT 1033 +/*! cache: application threads page write from cache to disk time (usecs) */ +#define WT_STAT_CONN_CACHE_WRITE_APP_TIME 1034 /*! cache: bytes belonging to page images in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_IMAGE 1031 +#define WT_STAT_CONN_CACHE_BYTES_IMAGE 1035 /*! cache: bytes currently in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INUSE 1032 +#define WT_STAT_CONN_CACHE_BYTES_INUSE 1036 /*! cache: bytes not belonging to page images in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_OTHER 1033 +#define WT_STAT_CONN_CACHE_BYTES_OTHER 1037 /*! cache: bytes read into cache */ -#define WT_STAT_CONN_CACHE_BYTES_READ 1034 +#define WT_STAT_CONN_CACHE_BYTES_READ 1038 /*! cache: bytes written from cache */ -#define WT_STAT_CONN_CACHE_BYTES_WRITE 1035 +#define WT_STAT_CONN_CACHE_BYTES_WRITE 1039 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1036 +#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1040 /*! cache: eviction calls to get a page */ -#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1037 +#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1041 /*! cache: eviction calls to get a page found queue empty */ -#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1038 +#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1042 /*! cache: eviction calls to get a page found queue empty after locking */ -#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1039 +#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1043 /*! 
cache: eviction currently operating in aggressive mode */ -#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1040 +#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1044 /*! cache: eviction empty score */ -#define WT_STAT_CONN_CACHE_EVICTION_EMPTY_SCORE 1041 +#define WT_STAT_CONN_CACHE_EVICTION_EMPTY_SCORE 1045 /*! cache: eviction server candidate queue empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1042 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1046 /*! cache: eviction server candidate queue not empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1043 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1047 /*! cache: eviction server evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1044 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1048 /*! * cache: eviction server slept, because we did not make progress with * eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1045 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1049 /*! cache: eviction server unable to reach eviction goal */ -#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1046 +#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1050 /*! cache: eviction state */ -#define WT_STAT_CONN_CACHE_EVICTION_STATE 1047 +#define WT_STAT_CONN_CACHE_EVICTION_STATE 1051 /*! cache: eviction walks abandoned */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1048 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1052 /*! cache: eviction worker thread evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1049 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1053 /*! cache: failed eviction of pages that exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1050 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1054 /*! cache: files with active eviction walks */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1051 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1055 /*! 
cache: files with new eviction walks started */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1052 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1056 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1053 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1057 /*! cache: hazard pointer check calls */ -#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1054 +#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1058 /*! cache: hazard pointer check entries walked */ -#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1055 +#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1059 /*! cache: hazard pointer maximum array length */ -#define WT_STAT_CONN_CACHE_HAZARD_MAX 1056 +#define WT_STAT_CONN_CACHE_HAZARD_MAX 1060 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1057 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1061 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1058 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1062 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1059 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1063 /*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1060 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1064 /*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1061 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1065 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1062 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1066 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1063 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1067 /*! cache: maximum bytes configured */ -#define WT_STAT_CONN_CACHE_BYTES_MAX 1064 +#define WT_STAT_CONN_CACHE_BYTES_MAX 1068 /*! 
cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1065 +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1069 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1066 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1070 /*! cache: modified pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1067 +#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1071 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1068 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1072 /*! cache: overflow values cached in memory */ -#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1069 +#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1073 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1070 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1074 /*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1071 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1075 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1072 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1076 /*! cache: pages evicted because they exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1073 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1077 /*! cache: pages evicted because they had chains of deleted items */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1074 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1078 /*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1075 +#define WT_STAT_CONN_CACHE_EVICTION_APP 1079 /*! cache: pages queued for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1076 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1080 /*! 
cache: pages queued for urgent eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1077 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1081 /*! cache: pages queued for urgent eviction during walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1078 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1082 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1079 +#define WT_STAT_CONN_CACHE_READ 1083 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1080 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1084 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1081 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1085 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1082 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1086 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1083 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1087 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1084 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1088 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1085 +#define WT_STAT_CONN_CACHE_WRITE 1089 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1086 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1090 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1087 +#define WT_STAT_CONN_CACHE_OVERHEAD 1091 /*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1088 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1092 /*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1089 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1093 /*! 
cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1090 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1094 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1091 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1095 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1092 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1096 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1093 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1097 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1094 +#define WT_STAT_CONN_COND_AUTO_WAIT 1098 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1095 +#define WT_STAT_CONN_FILE_OPEN 1099 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1096 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1100 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1097 +#define WT_STAT_CONN_MEMORY_FREE 1101 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1098 +#define WT_STAT_CONN_MEMORY_GROW 1102 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1099 +#define WT_STAT_CONN_COND_WAIT 1103 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1100 +#define WT_STAT_CONN_RWLOCK_READ 1104 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1101 +#define WT_STAT_CONN_RWLOCK_WRITE 1105 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1102 +#define WT_STAT_CONN_FSYNC_IO 1106 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1103 +#define WT_STAT_CONN_READ_IO 1107 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1104 +#define WT_STAT_CONN_WRITE_IO 1108 /*! 
cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1105 +#define WT_STAT_CONN_CURSOR_CREATE 1109 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1106 +#define WT_STAT_CONN_CURSOR_INSERT 1110 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1107 +#define WT_STAT_CONN_CURSOR_NEXT 1111 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1108 +#define WT_STAT_CONN_CURSOR_PREV 1112 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1109 +#define WT_STAT_CONN_CURSOR_REMOVE 1113 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1110 +#define WT_STAT_CONN_CURSOR_RESET 1114 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1111 +#define WT_STAT_CONN_CURSOR_RESTART 1115 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1112 +#define WT_STAT_CONN_CURSOR_SEARCH 1116 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1113 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1117 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1114 +#define WT_STAT_CONN_CURSOR_UPDATE 1118 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1115 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1119 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1116 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1120 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1117 +#define WT_STAT_CONN_DH_SWEEP_REF 1121 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1118 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1122 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1119 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1123 /*! 
data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1120 +#define WT_STAT_CONN_DH_SWEEP_TOD 1124 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1121 +#define WT_STAT_CONN_DH_SWEEPS 1125 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1122 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1126 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1123 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1127 +/*! lock: checkpoint lock acquisitions */ +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1128 +/*! lock: checkpoint lock application thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1129 +/*! lock: checkpoint lock internal thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1130 +/*! lock: handle-list lock acquisitions */ +#define WT_STAT_CONN_LOCK_HANDLE_LIST_COUNT 1131 +/*! lock: handle-list lock application thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_APPLICATION 1132 +/*! lock: handle-list lock internal thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_INTERNAL 1133 +/*! lock: metadata lock acquisitions */ +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1134 +/*! lock: metadata lock application thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1135 +/*! lock: metadata lock internal thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1136 +/*! lock: schema lock acquisitions */ +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1137 +/*! lock: schema lock application thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1138 +/*! lock: schema lock internal thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1139 +/*! lock: table lock acquisitions */ +#define WT_STAT_CONN_LOCK_TABLE_COUNT 1140 +/*! 
+ * lock: table lock application thread time waiting for the table lock + * (usecs) + */ +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1141 +/*! + * lock: table lock internal thread time waiting for the table lock + * (usecs) + */ +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1142 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1124 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1143 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1125 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1144 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1126 +#define WT_STAT_CONN_LOG_SLOT_RACES 1145 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1127 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1146 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1128 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1147 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1129 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1148 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1130 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1149 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1131 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1150 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1132 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1151 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1133 +#define WT_STAT_CONN_LOG_FLUSH 1152 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1134 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1153 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1135 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1154 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1136 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1155 /*! 
log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1137 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1156 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1138 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1157 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1139 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1158 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1140 +#define WT_STAT_CONN_LOG_SCANS 1159 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1141 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1160 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1142 +#define WT_STAT_CONN_LOG_WRITE_LSN 1161 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1143 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1162 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1144 +#define WT_STAT_CONN_LOG_SYNC 1163 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1145 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1164 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1146 +#define WT_STAT_CONN_LOG_SYNC_DIR 1165 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1147 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1166 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1148 +#define WT_STAT_CONN_LOG_WRITES 1167 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1149 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1168 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1150 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1169 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1151 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1170 /*! 
log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1152 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1171 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1153 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1172 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1154 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1173 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1155 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1174 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1156 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1175 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1157 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1176 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1158 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1177 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1159 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1178 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1160 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1179 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1161 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1180 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1162 +#define WT_STAT_CONN_REC_PAGES 1181 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1163 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1182 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1164 +#define WT_STAT_CONN_REC_PAGE_DELETE 1183 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1165 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1184 /*! 
reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1166 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1185 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1167 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1186 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1168 +#define WT_STAT_CONN_SESSION_OPEN 1187 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1169 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1188 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1170 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1189 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1171 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1190 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1172 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1191 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1173 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1192 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1174 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1193 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1175 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1194 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1176 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1195 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1177 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1196 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1178 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1197 /*! 
session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1179 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1198 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1180 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1199 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1181 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1200 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1182 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1201 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1183 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1202 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1184 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1203 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1185 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1204 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1186 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1205 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1187 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1206 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1188 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1207 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1189 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1208 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1190 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1209 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1191 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1210 /*! 
thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1192 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1211 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1193 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1212 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1194 +#define WT_STAT_CONN_PAGE_SLEEP 1213 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1195 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1214 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1196 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1215 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1197 +#define WT_STAT_CONN_TXN_BEGIN 1216 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1198 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1217 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1199 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1218 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1200 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1219 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1201 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1220 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1202 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1221 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1203 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1222 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1204 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1223 /*! 
transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1205 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1224 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1206 +#define WT_STAT_CONN_TXN_CHECKPOINT 1225 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1207 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1226 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1208 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1227 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1209 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1228 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1210 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1229 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1211 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1230 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1212 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1231 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1213 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1232 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1214 +#define WT_STAT_CONN_TXN_SYNC 1233 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1215 +#define WT_STAT_CONN_TXN_COMMIT 1234 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1216 +#define WT_STAT_CONN_TXN_ROLLBACK 1235 /*! 
* @} @@ -4721,28 +4768,28 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); /*! btree: btree checkpoint generation */ #define WT_STAT_DSRC_BTREE_CHECKPOINT_GENERATION 2022 /*! - * btree: column-store fixed-size leaf pages, only reported if - * statistics=all is set + * btree: column-store fixed-size leaf pages, only reported if tree_walk + * or all statistics are enabled */ #define WT_STAT_DSRC_BTREE_COLUMN_FIX 2023 /*! - * btree: column-store internal pages, only reported if statistics=all is - * set + * btree: column-store internal pages, only reported if tree_walk or all + * statistics are enabled */ #define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2024 /*! * btree: column-store variable-size RLE encoded values, only reported if - * statistics=all is set + * tree_walk or all statistics are enabled */ #define WT_STAT_DSRC_BTREE_COLUMN_RLE 2025 /*! * btree: column-store variable-size deleted values, only reported if - * statistics=all is set + * tree_walk or all statistics are enabled */ #define WT_STAT_DSRC_BTREE_COLUMN_DELETED 2026 /*! * btree: column-store variable-size leaf pages, only reported if - * statistics=all is set + * tree_walk or all statistics are enabled */ #define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2027 /*! btree: fixed-record size */ @@ -4760,20 +4807,26 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); /*! btree: maximum tree depth */ #define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2034 /*! - * btree: number of key/value pairs, only reported if statistics=all is - * set + * btree: number of key/value pairs, only reported if tree_walk or all + * statistics are enabled */ #define WT_STAT_DSRC_BTREE_ENTRIES 2035 -/*! btree: overflow pages, only reported if statistics=all is set */ +/*! + * btree: overflow pages, only reported if tree_walk or all statistics + * are enabled + */ #define WT_STAT_DSRC_BTREE_OVERFLOW 2036 /*! btree: pages rewritten by compaction */ #define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2037 /*! 
- * btree: row-store internal pages, only reported if statistics=all is - * set + * btree: row-store internal pages, only reported if tree_walk or all + * statistics are enabled */ #define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2038 -/*! btree: row-store leaf pages, only reported if statistics=all is set */ +/*! + * btree: row-store leaf pages, only reported if tree_walk or all + * statistics are enabled + */ #define WT_STAT_DSRC_BTREE_ROW_LEAF 2039 /*! cache: bytes currently in the cache */ #define WT_STAT_DSRC_CACHE_BYTES_INUSE 2040 @@ -4819,87 +4872,179 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2060 /*! cache: unmodified pages evicted */ #define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2061 +/*! + * cache_walk: Average difference between current eviction generation + * when the page was last considered, only reported if cache_walk or all + * statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_GEN_AVG_GAP 2062 +/*! + * cache_walk: Average on-disk page image size seen, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_AVG_WRITTEN_SIZE 2063 +/*! + * cache_walk: Clean pages currently in cache, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_PAGES_CLEAN 2064 +/*! + * cache_walk: Current eviction generation, only reported if cache_walk + * or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_GEN_CURRENT 2065 +/*! + * cache_walk: Dirty pages currently in cache, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_PAGES_DIRTY 2066 +/*! + * cache_walk: Entries in the root page, only reported if cache_walk or + * all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_ROOT_ENTRIES 2067 +/*! 
+ * cache_walk: Internal pages currently in cache, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_PAGES_INTERNAL 2068 +/*! + * cache_walk: Leaf pages currently in cache, only reported if cache_walk + * or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_PAGES_LEAF 2069 +/*! + * cache_walk: Maximum difference between current eviction generation + * when the page was last considered, only reported if cache_walk or all + * statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_GEN_MAX_GAP 2070 +/*! + * cache_walk: Maximum page size seen, only reported if cache_walk or all + * statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_MAX_PAGESIZE 2071 +/*! + * cache_walk: Minimum on-disk page image size seen, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_MIN_WRITTEN_SIZE 2072 +/*! + * cache_walk: On-disk page image sizes smaller than a single allocation + * unit, only reported if cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_SMALLER_ALLOC_SIZE 2073 +/*! + * cache_walk: Pages created in memory and never written, only reported + * if cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_MEMORY 2074 +/*! + * cache_walk: Pages currently queued for eviction, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_QUEUED 2075 +/*! + * cache_walk: Pages that could not be queued for eviction, only reported + * if cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_NOT_QUEUEABLE 2076 +/*! + * cache_walk: Refs skipped during cache traversal, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_REFS_SKIPPED 2077 +/*! 
+ * cache_walk: Size of the root page, only reported if cache_walk or all + * statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_ROOT_SIZE 2078 +/*! + * cache_walk: Total number of pages currently in cache, only reported if + * cache_walk or all statistics are enabled + */ +#define WT_STAT_DSRC_CACHE_STATE_PAGES 2079 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2062 +#define WT_STAT_DSRC_COMPRESS_READ 2080 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2063 +#define WT_STAT_DSRC_COMPRESS_WRITE 2081 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2064 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2082 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2065 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2083 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2066 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2084 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2067 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2085 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2068 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2086 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2069 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2087 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2070 +#define WT_STAT_DSRC_CURSOR_CREATE 2088 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2071 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2089 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2072 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2090 /*! 
cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2073 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2091 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2074 +#define WT_STAT_DSRC_CURSOR_INSERT 2092 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2075 +#define WT_STAT_DSRC_CURSOR_NEXT 2093 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2076 +#define WT_STAT_DSRC_CURSOR_PREV 2094 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2077 +#define WT_STAT_DSRC_CURSOR_REMOVE 2095 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2078 +#define WT_STAT_DSRC_CURSOR_RESET 2096 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2079 +#define WT_STAT_DSRC_CURSOR_RESTART 2097 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2080 +#define WT_STAT_DSRC_CURSOR_SEARCH 2098 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2081 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2099 /*! cursor: truncate calls */ -#define WT_STAT_DSRC_CURSOR_TRUNCATE 2082 +#define WT_STAT_DSRC_CURSOR_TRUNCATE 2100 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2083 +#define WT_STAT_DSRC_CURSOR_UPDATE 2101 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2084 +#define WT_STAT_DSRC_REC_DICTIONARY 2102 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2085 +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2103 /*! * reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2086 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2104 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2087 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2105 /*! 
reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2088 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2106 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2089 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2107 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2090 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2108 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2091 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2109 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2092 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2110 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2093 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2111 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2094 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2112 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2095 +#define WT_STAT_DSRC_REC_PAGES 2113 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2096 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2114 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2097 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2115 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2098 +#define WT_STAT_DSRC_SESSION_COMPACT 2116 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2099 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2117 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2100 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2118 /*! 
* @} diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 4e6699ab9d1..d354757c592 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -216,6 +216,8 @@ struct __wt_logslot; typedef struct __wt_logslot WT_LOGSLOT; struct __wt_lsm_chunk; typedef struct __wt_lsm_chunk WT_LSM_CHUNK; +struct __wt_lsm_cursor_chunk; + typedef struct __wt_lsm_cursor_chunk WT_LSM_CURSOR_CHUNK; struct __wt_lsm_data_source; typedef struct __wt_lsm_data_source WT_LSM_DATA_SOURCE; struct __wt_lsm_manager; diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index b9a6dd18b7a..067c527a21a 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -10,7 +10,7 @@ #define WT_FORALL_CURSORS(clsm, c, i) \ for ((i) = (clsm)->nchunks; (i) > 0;) \ - if (((c) = (clsm)->cursors[--i]) != NULL) + if (((c) = (clsm)->chunks[--i]->cursor) != NULL) #define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \ __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &cmp) @@ -18,6 +18,7 @@ static int __clsm_lookup(WT_CURSOR_LSM *, WT_ITEM *); static int __clsm_open_cursors(WT_CURSOR_LSM *, bool, u_int, uint32_t); static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *); +static int __clsm_search_near(WT_CURSOR *cursor, int *exactp); /* * __wt_clsm_request_switch -- @@ -109,7 +110,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) primary = NULL; have_primary = false; } else { - primary = clsm->cursors[clsm->nchunks - 1]; + primary = clsm->chunks[clsm->nchunks - 1]->cursor; primary_chunk = clsm->primary_chunk; WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID)); have_primary = (primary != NULL && primary_chunk != NULL && @@ -165,7 +166,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; 
WT_TXN *txn; - uint64_t pinned_id, *switchp; + uint64_t i, pinned_id , switch_txn; lsm_tree = clsm->lsm_tree; session = (WT_SESSION_IMPL *)clsm->iface.session; @@ -238,15 +239,16 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)); pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; - for (switchp = - &clsm->switch_txn[clsm->nchunks - 2]; + for (i = clsm->nchunks - 2; clsm->nupdates < clsm->nchunks; - clsm->nupdates++, switchp--) { - if (WT_TXNID_LT(*switchp, pinned_id)) + clsm->nupdates++, i--) { + switch_txn = + clsm->chunks[i]->switch_txn; + if (WT_TXNID_LT(switch_txn, pinned_id)) break; WT_ASSERT(session, !__wt_txn_visible_all( - session, *switchp)); + session, switch_txn)); } } } @@ -377,7 +379,7 @@ __clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end) WT_CURSOR *c; u_int i; - if (clsm->cursors == NULL || clsm->nchunks == 0) + if (clsm->chunks == NULL || clsm->nchunks == 0) return (0); /* @@ -386,12 +388,12 @@ __clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end) * careful with unsigned integer wrapping. */ for (i = start; i < end; i++) { - if ((c = (clsm)->cursors[i]) != NULL) { - clsm->cursors[i] = NULL; + if ((c = (clsm)->chunks[i]->cursor) != NULL) { + clsm->chunks[i]->cursor = NULL; WT_RET(c->close(c)); } - if ((bloom = clsm->blooms[i]) != NULL) { - clsm->blooms[i] = NULL; + if ((bloom = clsm->chunks[i]->bloom) != NULL) { + clsm->chunks[i]->bloom = NULL; WT_RET(__wt_bloom_close(bloom)); } } @@ -400,6 +402,45 @@ __clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end) } /* + * __clsm_resize_chunks -- + * Allocates an array of unit objects for each chunk. + */ +static int +__clsm_resize_chunks( + WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, u_int nchunks) +{ + WT_DECL_RET; + WT_LSM_CURSOR_CHUNK *chunk; + + /* Don't allocate more iterators if we don't need them. 
*/ + if (clsm->chunks_count >= nchunks) { + return (ret); + } + + WT_RET(__wt_realloc_def(session, &clsm->chunks_alloc, nchunks, + &clsm->chunks)); + for (; clsm->chunks_count < nchunks; clsm->chunks_count++) { + WT_RET(__wt_calloc_one(session, &chunk)); + clsm->chunks[clsm->chunks_count] = chunk; + } + return (ret); +} + +/* + * __clsm_free_chunks -- + * Allocates an array of unit objects for each chunk. + */ +static void +__clsm_free_chunks(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm) +{ + size_t i; + for (i = 0; i < clsm->chunks_count; i++) { + __wt_free(session, clsm->chunks[i]); + } + __wt_free(session, clsm->chunks); +} + +/* * __clsm_open_cursors -- * Open cursors for the current set of files. */ @@ -408,7 +449,7 @@ __clsm_open_cursors( WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id) { WT_BTREE *btree; - WT_CURSOR *c, **cp, *primary; + WT_CURSOR *c, *cursor, *primary; WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; @@ -421,6 +462,7 @@ __clsm_open_cursors( bool locked; c = &clsm->iface; + cursor = NULL; session = (WT_SESSION_IMPL *)c->session; txn = &session->txn; chunk = NULL; @@ -464,7 +506,7 @@ __clsm_open_cursors( retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { nchunks = clsm->nchunks; ngood = 0; - + WT_ERR(__clsm_resize_chunks(session, clsm, nchunks)); /* * We may have raced with another merge completing. Check that * we're starting at the right offset in the chunk array. @@ -485,16 +527,13 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks); } else { nchunks = lsm_tree->nchunks; + WT_ERR(__clsm_resize_chunks(session, clsm, nchunks)); /* * If we are only opening the cursor for updates, only open the * primary chunk, plus any other chunks that might be required * to detect snapshot isolation conflicts. 
*/ - if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) - WT_ERR(__wt_realloc_def(session, - &clsm->txnid_alloc, nchunks, - &clsm->switch_txn)); if (F_ISSET(clsm, WT_CLSM_OPEN_READ)) ngood = nupdates = 0; else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { @@ -503,11 +542,11 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { * chunk are globally visible. Copy the maximum * transaction IDs into the cursor as we go. */ - for (ngood = nchunks - 1, nupdates = 1; - ngood > 0; + for (ngood = nchunks - 1, nupdates = 1; ngood > 0; ngood--, nupdates++) { chunk = lsm_tree->chunk[ngood - 1]; - clsm->switch_txn[ngood - 1] = chunk->switch_txn; + clsm->chunks[ngood - 1]->switch_txn = + chunk->switch_txn; if (__wt_txn_visible_all( session, chunk->switch_txn)) break; @@ -518,21 +557,20 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { } /* Check how many cursors are already open. */ - for (cp = clsm->cursors + ngood; - ngood < clsm->nchunks && ngood < nchunks; - cp++, ngood++) { + for (; ngood < clsm->nchunks && ngood < nchunks; ngood++) { chunk = lsm_tree->chunk[ngood]; + cursor = clsm->chunks[ngood]->cursor; /* If the cursor isn't open yet, we're done. */ - if (*cp == NULL) + if (cursor == NULL) break; /* Easy case: the URIs don't match. */ - if (strcmp((*cp)->uri, chunk->uri) != 0) + if (strcmp(cursor->uri, chunk->uri) != 0) break; /* Make sure the checkpoint config matches. */ - checkpoint = ((WT_CURSOR_BTREE *)*cp)-> + checkpoint = ((WT_CURSOR_BTREE *)cursor)-> btree->dhandle->checkpoint; if (checkpoint == NULL && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && @@ -540,7 +578,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { break; /* Make sure the Bloom config matches. */ - if (clsm->blooms[ngood] == NULL && + if (clsm->chunks[ngood]->bloom == NULL && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) break; } @@ -558,7 +596,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { * full, we may block while closing a cursor. Save the * generation number and retry if it has changed under us. 
*/ - if (clsm->cursors != NULL && ngood < clsm->nchunks) { + if (clsm->chunks != NULL && ngood < clsm->nchunks) { close_range_start = ngood; close_range_end = clsm->nchunks; } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0 ) { @@ -590,28 +628,23 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { clsm->current = NULL; } - WT_ERR(__wt_realloc_def(session, - &clsm->bloom_alloc, nchunks, &clsm->blooms)); - WT_ERR(__wt_realloc_def(session, - &clsm->cursor_alloc, nchunks, &clsm->cursors)); - clsm->nchunks = nchunks; /* Open the cursors for chunks that have changed. */ - for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) { + for (i = ngood; i != nchunks; i++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Copy the maximum transaction ID. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) - clsm->switch_txn[i] = chunk->switch_txn; + clsm->chunks[i]->switch_txn = chunk->switch_txn; /* * Read from the checkpoint if the file has been written. * Once all cursors switch, the in-memory tree can be evicted. */ - WT_ASSERT(session, *cp == NULL); + WT_ASSERT(session, clsm->chunks[i]->cursor == NULL); ret = __wt_open_cursor(session, chunk->uri, c, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? - ckpt_cfg : NULL, cp); + ckpt_cfg : NULL, &clsm->chunks[i]->cursor); /* * XXX kludge: we may have an empty chunk where no checkpoint @@ -619,8 +652,8 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { * chunk instead. */ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { - ret = __wt_open_cursor( - session, chunk->uri, c, NULL, cp); + ret = __wt_open_cursor(session, + chunk->uri, c, NULL, &clsm->chunks[i]->cursor); if (ret == 0) chunk->empty = 1; } @@ -633,25 +666,31 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { * write conflicts with concurrent updates. 
*/ if (i != nchunks - 1) - (*cp)->insert = __wt_curfile_update_check; + clsm->chunks[i]->cursor->insert = + __wt_curfile_update_check; if (!F_ISSET(clsm, WT_CLSM_MERGE) && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, - c, &clsm->blooms[i])); + c, &clsm->chunks[i]->bloom)); /* Child cursors always use overwrite and raw mode. */ - F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); + F_SET(clsm->chunks[i]->cursor, + WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); } + /* Setup the count values for each chunk in the chunks*/ + for (i = 0; i != clsm->nchunks; i++) + clsm->chunks[i]->count = lsm_tree->chunk[i]->count; + /* The last chunk is our new primary. */ if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && chunk->switch_txn == WT_TXN_NONE) { clsm->primary_chunk = chunk; - primary = clsm->cursors[clsm->nchunks - 1]; + primary = clsm->chunks[clsm->nchunks - 1]->cursor; /* * Disable eviction for the in-memory chunk. Also clear the * bulk load flag here, otherwise eviction will be enabled by @@ -671,17 +710,19 @@ err: #ifdef HAVE_DIAGNOSTIC /* Check that all cursors are open as expected. */ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) { - for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) { + for (i = 0; i != clsm->nchunks; i++) { + cursor = clsm->chunks[i]->cursor; chunk = lsm_tree->chunk[i + start_chunk]; - /* Make sure the cursor is open. */ - WT_ASSERT(session, *cp != NULL); + /* Make sure the first cursor is open. */ + WT_ASSERT(session, cursor != NULL); /* Easy case: the URIs should match. */ - WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0); + WT_ASSERT( + session, strcmp(cursor->uri, chunk->uri) == 0); /* Make sure the checkpoint config matches. 
*/ - checkpoint = ((WT_CURSOR_BTREE *)*cp)-> + checkpoint = ((WT_CURSOR_BTREE *)cursor)-> btree->dhandle->checkpoint; WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && @@ -692,7 +733,8 @@ err: WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) ? - clsm->blooms[i] != NULL : clsm->blooms[i] == NULL); + clsm->chunks[i]->bloom != NULL : + clsm->chunks[i]->bloom == NULL); } } #endif @@ -901,6 +943,96 @@ err: __clsm_leave(clsm); } /* + * __clsm_random_chunk -- + * Pick a chunk at random, weighted by the size of all chunks. Weighting + * proportional to documents avoids biasing towards small chunks. Then return + * the cursor on the chunk we have picked. + */ +static int +__clsm_random_chunk(WT_SESSION_IMPL *session, + WT_CURSOR_LSM *clsm, WT_CURSOR **cursor) +{ + uint64_t checked_docs, i, rand_doc, total_docs; + + /* + * If the tree is empty we cannot do a random lookup, so return a + * WT_NOTFOUND. + */ + if (clsm->nchunks == 0) + return (WT_NOTFOUND); + for (total_docs = i = 0; i < clsm->nchunks; i++) { + total_docs += clsm->chunks[i]->count; + } + if (total_docs == 0) + return (WT_NOTFOUND); + + rand_doc = __wt_random(&session->rnd) % total_docs; + + for (checked_docs = i = 0; i < clsm->nchunks; i++) { + checked_docs += clsm->chunks[i]->count; + if (rand_doc <= checked_docs) { + *cursor = clsm->chunks[i]->cursor; + break; + } + } + return (0); +} + +/* + * __clsm_next_random -- + * WT_CURSOR->next method for the LSM cursor type when configured with + * next_random. 
+ */ +static int +__clsm_next_random(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION_IMPL *session; + int exact; + + c = NULL; + clsm = (WT_CURSOR_LSM *)cursor; + + CURSOR_API_CALL(cursor, session, next, NULL); + WT_CURSOR_NOVALUE(cursor); + WT_ERR(__clsm_enter(clsm, false, false)); + + for (;;) { + WT_ERR(__clsm_random_chunk(session, clsm, &c)); + /* + * This call to next_random on the chunk can potentially end in + * WT_NOTFOUND if the chunk we picked is empty. We want to retry + * in that case. + */ + ret = __wt_curfile_next_random(c); + if (ret == WT_NOTFOUND) + continue; + + WT_ERR(ret); + F_SET(cursor, WT_CURSTD_KEY_INT); + WT_ERR(c->get_key(c, &cursor->key)); + /* + * Search near the current key to resolve any tombstones + * and position to a valid document. If we see a + * WT_NOTFOUND here that is valid, as the tree has no + * documents visible to us. + */ + WT_ERR(__clsm_search_near(cursor, &exact)); + break; + } + + /* We have found a valid doc. Set that we are now positioned */ + if (0) { +err: F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + } + __clsm_leave(clsm); + API_END(session, ret); + return (ret); +} + +/* * __clsm_prev -- * WT_CURSOR->prev method for the LSM cursor type. */ @@ -1071,7 +1203,7 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value) WT_FORALL_CURSORS(clsm, c, i) { /* If there is a Bloom filter, see if we can skip the read. */ bloom = NULL; - if ((bloom = clsm->blooms[i]) != NULL) { + if ((bloom = clsm->chunks[i]->bloom) != NULL) { if (!have_hash) { __wt_bloom_hash(bloom, &cursor->key, &bhash); have_hash = true; @@ -1258,7 +1390,12 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) */ F_CLR(cursor, WT_CURSTD_KEY_SET); F_SET(cursor, WT_CURSTD_KEY_INT); - if ((ret = cursor->next(cursor)) == 0) { + /* + * We call __clsm_next here as we want to advance + * forward. If we are a random LSM cursor calling next + * on the cursor will not advance as we intend. 
+ */ + if ((ret = __clsm_next(cursor)) == 0) { cmp = 1; deleted = false; } @@ -1267,7 +1404,11 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) } if (deleted) { clsm->current = NULL; - WT_ERR(cursor->prev(cursor)); + /* + * We call prev directly here as cursor->prev may be "invalid" + * if this is a random cursor. + */ + WT_ERR(__clsm_prev(cursor)); cmp = -1; } *exactp = cmp; @@ -1311,7 +1452,7 @@ __clsm_put(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, * Clear the existing cursor position. Don't clear the primary cursor: * we're about to use it anyway. */ - primary = clsm->cursors[clsm->nchunks - 1]; + primary = clsm->chunks[clsm->nchunks - 1]->cursor; WT_RET(__clsm_reset_cursors(clsm, primary)); /* If necessary, set the position for future scans. */ @@ -1321,12 +1462,12 @@ __clsm_put(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, for (i = 0, slot = clsm->nchunks - 1; i < clsm->nupdates; i++, slot--) { /* Check if we need to keep updating old chunks. */ if (i > 0 && - __wt_txn_visible(session, clsm->switch_txn[slot])) { + __wt_txn_visible(session, clsm->chunks[slot]->switch_txn)) { clsm->nupdates = i; break; } - c = clsm->cursors[slot]; + c = clsm->chunks[slot]->cursor; c->set_key(c, key); c->set_value(c, value); WT_RET((position && i == 0) ? c->update(c) : c->insert(c)); @@ -1484,9 +1625,7 @@ __wt_clsm_close(WT_CURSOR *cursor) clsm = (WT_CURSOR_LSM *)cursor; CURSOR_API_CALL(cursor, session, close, NULL); WT_TRET(__clsm_close_cursors(clsm, 0, clsm->nchunks)); - __wt_free(session, clsm->blooms); - __wt_free(session, clsm->cursors); - __wt_free(session, clsm->switch_txn); + __clsm_free_chunks(session, clsm); /* In case we were somehow left positioned, clear that. 
*/ __clsm_leave(clsm); @@ -1587,6 +1726,13 @@ __wt_clsm_open(WT_SESSION_IMPL *session, */ clsm->dsk_gen = 0; + /* If the next_random option is set, configure a random cursor */ + WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval)); + if (cval.val != 0) { + __wt_cursor_set_notsup(cursor); + cursor->next = __clsm_next_random; + } + WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp)); if (bulk) diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c index 607ca0c9705..319426de3f0 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c @@ -28,9 +28,8 @@ __clsm_close_bulk(WT_CURSOR *cursor) session = (WT_SESSION_IMPL *)clsm->iface.session; /* Close the bulk cursor to ensure the chunk is written to disk. */ - bulk_cursor = clsm->cursors[0]; + bulk_cursor = clsm->chunks[0]->cursor; WT_RET(bulk_cursor->close(bulk_cursor)); - clsm->cursors[0] = NULL; clsm->nchunks = 0; /* Set ondisk, and flush the metadata */ @@ -75,7 +74,7 @@ __clsm_insert_bulk(WT_CURSOR *cursor) WT_ASSERT(session, lsm_tree->nchunks == 1 && clsm->nchunks == 1); ++chunk->count; chunk->size += cursor->key.size + cursor->value.size; - bulk_cursor = *clsm->cursors; + bulk_cursor = clsm->chunks[0]->cursor; bulk_cursor->set_key(bulk_cursor, &cursor->key); bulk_cursor->set_value(bulk_cursor, &cursor->value); WT_RET(bulk_cursor->insert(bulk_cursor)); @@ -124,11 +123,10 @@ __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]) * for a bloom filter - it makes cleanup simpler. Cleaned up by * cursor close on error. 
*/ - WT_RET(__wt_calloc_one(session, &clsm->blooms)); - clsm->bloom_alloc = 1; - WT_RET(__wt_calloc_one(session, &clsm->cursors)); - clsm->cursor_alloc = 1; - clsm->nchunks = 1; + WT_RET( + __wt_realloc_def(session, &clsm->chunks_alloc, 1, &clsm->chunks)); + WT_RET(__wt_calloc_one(session, &clsm->chunks[0])); + clsm->chunks_count = clsm->nchunks = 1; /* * Open a bulk cursor on the first chunk in the tree - take a read @@ -139,7 +137,7 @@ __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]) */ WT_RET(__wt_open_cursor(session, lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor)); - clsm->cursors[0] = bulk_cursor; + clsm->chunks[0]->cursor = bulk_cursor; /* LSM cursors are always raw */ F_SET(bulk_cursor, WT_CURSTD_RAW); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c index f4f5a0acce8..3fe3ca1ba81 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c @@ -42,11 +42,11 @@ __curstat_lsm_init( if (cst->flags != 0) { (void)snprintf(config, sizeof(config), "statistics=(%s%s%s%s)", - F_ISSET(cst, WT_CONN_STAT_ALL) ? "all," : "", - F_ISSET(cst, WT_CONN_STAT_CLEAR) ? "clear," : "", - !F_ISSET(cst, WT_CONN_STAT_ALL) && - F_ISSET(cst, WT_CONN_STAT_FAST) ? "fast," : "", - F_ISSET(cst, WT_CONN_STAT_SIZE) ? "size," : ""); + F_ISSET(cst, WT_STAT_TYPE_ALL) ? "all," : "", + F_ISSET(cst, WT_STAT_CLEAR) ? "clear," : "", + !F_ISSET(cst, WT_STAT_TYPE_ALL) && + F_ISSET(cst, WT_STAT_TYPE_FAST) ? "fast," : "", + F_ISSET(cst, WT_STAT_TYPE_SIZE) ? "size," : ""); cfg[1] = disk_cfg[1] = config; } @@ -132,26 +132,26 @@ __curstat_lsm_init( /* Include, and optionally clear, LSM-level specific information. 
*/ WT_STAT_WRITE(session, stats, bloom_miss, lsm_tree->bloom_miss); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->bloom_miss = 0; WT_STAT_WRITE(session, stats, bloom_hit, lsm_tree->bloom_hit); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->bloom_hit = 0; WT_STAT_WRITE(session, stats, bloom_false_positive, lsm_tree->bloom_false_positive); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->bloom_false_positive = 0; WT_STAT_WRITE(session, stats, lsm_lookup_no_bloom, lsm_tree->lsm_lookup_no_bloom); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->lsm_lookup_no_bloom = 0; WT_STAT_WRITE(session, stats, lsm_checkpoint_throttle, lsm_tree->lsm_checkpoint_throttle); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->lsm_checkpoint_throttle = 0; WT_STAT_WRITE(session, stats, lsm_merge_throttle, lsm_tree->lsm_merge_throttle); - if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + if (F_ISSET(cst, WT_STAT_CLEAR)) lsm_tree->lsm_merge_throttle = 0; __wt_curstat_dsrc_final(cst); diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c index c7ae881af97..719e214696b 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_time.c +++ b/src/third_party/wiredtiger/src/os_posix/os_time.c @@ -17,6 +17,15 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) { WT_DECL_RET; + /* + * This function doesn't return an error, but panics on failure (which + * should never happen, it's done this way to simplify error handling + * in the caller). However, some compilers complain about using garbage + * values. Initializing the values avoids the complaint. 
+ */ + tsp->tv_sec = 0; + tsp->tv_nsec = 0; + #if defined(HAVE_CLOCK_GETTIME) WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret); if (ret == 0) diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c index 1cd39d97364..345f9164e9b 100644 --- a/src/third_party/wiredtiger/src/schema/schema_stat.c +++ b/src/third_party/wiredtiger/src/schema/schema_stat.c @@ -137,7 +137,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session, * If only gathering table size statistics, try a fast path that * avoids the schema and table list locks. */ - if (F_ISSET(cst, WT_CONN_STAT_SIZE)) { + if (F_ISSET(cst, WT_STAT_TYPE_SIZE)) { WT_RET(__curstat_size_only(session, uri, &was_fast, cst)); if (was_fast) return (0); diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index d3432c19ef3..f594450db74 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -1757,11 +1757,13 @@ __open_session(WT_CONNECTION_IMPL *conn, if (i >= conn->session_cnt) /* Defend against off-by-one errors. */ conn->session_cnt = i + 1; - session_ret->id = i; session_ret->iface = F_ISSET(conn, WT_CONN_READONLY) ? stds_readonly : stds; session_ret->iface.connection = &conn->iface; + session_ret->name = NULL; + session_ret->id = i; + WT_ERR(__wt_cond_alloc(session, "session", false, &session_ret->cond)); if (WT_SESSION_FIRST_USE(session_ret)) @@ -1777,10 +1779,10 @@ __open_session(WT_CONNECTION_IMPL *conn, * Allocate the table hash array as well. 
*/ if (session_ret->dhhash == NULL) - WT_ERR(__wt_calloc(session_ret, WT_HASH_ARRAY_SIZE, + WT_ERR(__wt_calloc(session, WT_HASH_ARRAY_SIZE, sizeof(struct __dhandles_hash), &session_ret->dhhash)); if (session_ret->tablehash == NULL) - WT_ERR(__wt_calloc(session_ret, WT_HASH_ARRAY_SIZE, + WT_ERR(__wt_calloc(session, WT_HASH_ARRAY_SIZE, sizeof(struct __tables_hash), &session_ret->tablehash)); for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) { TAILQ_INIT(&session_ret->dhhash[i]); @@ -1789,7 +1791,7 @@ __open_session(WT_CONNECTION_IMPL *conn, /* Initialize transaction support: default to read-committed. */ session_ret->isolation = WT_ISO_READ_COMMITTED; - WT_ERR(__wt_txn_init(session_ret)); + WT_ERR(__wt_txn_init(session, session_ret)); /* * The session's hazard pointer memory isn't discarded during normal @@ -1808,6 +1810,9 @@ __open_session(WT_CONNECTION_IMPL *conn, */ session_ret->hazard_size = 0; + /* Cache the offset of this session's statistics bucket. */ + session_ret->stat_bucket = WT_STATS_SLOT_ID(session); + /* * Configuration: currently, the configuration for open_session is the * same as session.reconfigure, so use that function. @@ -1816,8 +1821,6 @@ __open_session(WT_CONNECTION_IMPL *conn, WT_ERR( __session_reconfigure((WT_SESSION *)session_ret, config)); - session_ret->name = NULL; - /* * Publish: make the entry visible to server threads. 
There must be a * barrier for two reasons, to ensure structure fields are set before diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 9d440f9ebf3..6e8e218a0db 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -65,6 +65,24 @@ static const char * const __stats_dsrc_desc[] = { "cache: pages written from cache", "cache: pages written requiring in-memory restoration", "cache: unmodified pages evicted", + "cache_walk: Average difference between current eviction generation when the page was last considered", + "cache_walk: Average on-disk page image size seen", + "cache_walk: Clean pages currently in cache", + "cache_walk: Current eviction generation", + "cache_walk: Dirty pages currently in cache", + "cache_walk: Entries in the root page", + "cache_walk: Internal pages currently in cache", + "cache_walk: Leaf pages currently in cache", + "cache_walk: Maximum difference between current eviction generation when the page was last considered", + "cache_walk: Maximum page size seen", + "cache_walk: Minimum on-disk page image size seen", + "cache_walk: On-disk page image sizes smaller than a single allocation unit", + "cache_walk: Pages created in memory and never written", + "cache_walk: Pages currently queued for eviction", + "cache_walk: Pages that could not be queued for eviction", + "cache_walk: Refs skipped during cache traversal", + "cache_walk: Size of the root page", + "cache_walk: Total number of pages currently in cache", "compression: compressed pages read", "compression: compressed pages written", "compression: page written failed to compress", @@ -196,6 +214,24 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_write = 0; stats->cache_write_restore = 0; stats->cache_eviction_clean = 0; + /* not clearing cache_state_gen_avg_gap */ + /* not clearing cache_state_avg_written_size */ + /* not clearing cache_state_pages_clean */ 
+ /* not clearing cache_state_gen_current */ + /* not clearing cache_state_pages_dirty */ + /* not clearing cache_state_root_entries */ + /* not clearing cache_state_pages_internal */ + /* not clearing cache_state_pages_leaf */ + /* not clearing cache_state_gen_max_gap */ + /* not clearing cache_state_max_pagesize */ + /* not clearing cache_state_min_written_size */ + /* not clearing cache_state_smaller_alloc_size */ + /* not clearing cache_state_memory */ + /* not clearing cache_state_queued */ + /* not clearing cache_state_not_queueable */ + /* not clearing cache_state_refs_skipped */ + /* not clearing cache_state_root_size */ + /* not clearing cache_state_pages */ stats->compress_read = 0; stats->compress_write = 0; stats->compress_write_fail = 0; @@ -325,6 +361,27 @@ __wt_stat_dsrc_aggregate_single( to->cache_write += from->cache_write; to->cache_write_restore += from->cache_write_restore; to->cache_eviction_clean += from->cache_eviction_clean; + to->cache_state_gen_avg_gap += from->cache_state_gen_avg_gap; + to->cache_state_avg_written_size += + from->cache_state_avg_written_size; + to->cache_state_pages_clean += from->cache_state_pages_clean; + to->cache_state_gen_current += from->cache_state_gen_current; + to->cache_state_pages_dirty += from->cache_state_pages_dirty; + to->cache_state_root_entries += from->cache_state_root_entries; + to->cache_state_pages_internal += from->cache_state_pages_internal; + to->cache_state_pages_leaf += from->cache_state_pages_leaf; + to->cache_state_gen_max_gap += from->cache_state_gen_max_gap; + to->cache_state_max_pagesize += from->cache_state_max_pagesize; + to->cache_state_min_written_size += + from->cache_state_min_written_size; + to->cache_state_smaller_alloc_size += + from->cache_state_smaller_alloc_size; + to->cache_state_memory += from->cache_state_memory; + to->cache_state_queued += from->cache_state_queued; + to->cache_state_not_queueable += from->cache_state_not_queueable; + to->cache_state_refs_skipped += 
from->cache_state_refs_skipped; + to->cache_state_root_size += from->cache_state_root_size; + to->cache_state_pages += from->cache_state_pages; to->compress_read += from->compress_read; to->compress_write += from->compress_write; to->compress_write_fail += from->compress_write_fail; @@ -467,6 +524,39 @@ __wt_stat_dsrc_aggregate( to->cache_write += WT_STAT_READ(from, cache_write); to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); + to->cache_state_gen_avg_gap += + WT_STAT_READ(from, cache_state_gen_avg_gap); + to->cache_state_avg_written_size += + WT_STAT_READ(from, cache_state_avg_written_size); + to->cache_state_pages_clean += + WT_STAT_READ(from, cache_state_pages_clean); + to->cache_state_gen_current += + WT_STAT_READ(from, cache_state_gen_current); + to->cache_state_pages_dirty += + WT_STAT_READ(from, cache_state_pages_dirty); + to->cache_state_root_entries += + WT_STAT_READ(from, cache_state_root_entries); + to->cache_state_pages_internal += + WT_STAT_READ(from, cache_state_pages_internal); + to->cache_state_pages_leaf += + WT_STAT_READ(from, cache_state_pages_leaf); + to->cache_state_gen_max_gap += + WT_STAT_READ(from, cache_state_gen_max_gap); + to->cache_state_max_pagesize += + WT_STAT_READ(from, cache_state_max_pagesize); + to->cache_state_min_written_size += + WT_STAT_READ(from, cache_state_min_written_size); + to->cache_state_smaller_alloc_size += + WT_STAT_READ(from, cache_state_smaller_alloc_size); + to->cache_state_memory += WT_STAT_READ(from, cache_state_memory); + to->cache_state_queued += WT_STAT_READ(from, cache_state_queued); + to->cache_state_not_queueable += + WT_STAT_READ(from, cache_state_not_queueable); + to->cache_state_refs_skipped += + WT_STAT_READ(from, cache_state_refs_skipped); + to->cache_state_root_size += + WT_STAT_READ(from, cache_state_root_size); + to->cache_state_pages += WT_STAT_READ(from, cache_state_pages); to->compress_read += 
WT_STAT_READ(from, compress_read); to->compress_write += WT_STAT_READ(from, compress_write); to->compress_write_fail += WT_STAT_READ(from, compress_write_fail); @@ -549,6 +639,10 @@ static const char * const __stats_connection_desc[] = { "block-manager: bytes written for checkpoint", "block-manager: mapped blocks read", "block-manager: mapped bytes read", + "cache: application threads page read from disk to cache count", + "cache: application threads page read from disk to cache time (usecs)", + "cache: application threads page write from cache to disk count", + "cache: application threads page write from cache to disk time (usecs)", "cache: bytes belonging to page images in the cache", "cache: bytes currently in the cache", "cache: bytes not belonging to page images in the cache", @@ -642,6 +736,21 @@ static const char * const __stats_connection_desc[] = { "data-handle: connection sweeps", "data-handle: session dhandles swept", "data-handle: session sweep attempts", + "lock: checkpoint lock acquisitions", + "lock: checkpoint lock application thread wait time (usecs)", + "lock: checkpoint lock internal thread wait time (usecs)", + "lock: handle-list lock acquisitions", + "lock: handle-list lock application thread wait time (usecs)", + "lock: handle-list lock internal thread wait time (usecs)", + "lock: metadata lock acquisitions", + "lock: metadata lock application thread wait time (usecs)", + "lock: metadata lock internal thread wait time (usecs)", + "lock: schema lock acquisitions", + "lock: schema lock application thread wait time (usecs)", + "lock: schema lock internal thread wait time (usecs)", + "lock: table lock acquisitions", + "lock: table lock application thread time waiting for the table lock (usecs)", + "lock: table lock internal thread time waiting for the table lock (usecs)", "log: busy returns attempting to switch slots", "log: consolidated slot closures", "log: consolidated slot join races", @@ -796,6 +905,10 @@ 
__wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->block_byte_write_checkpoint = 0; stats->block_map_read = 0; stats->block_byte_map_read = 0; + stats->cache_read_app_count = 0; + stats->cache_read_app_time = 0; + stats->cache_write_app_count = 0; + stats->cache_write_app_time = 0; /* not clearing cache_bytes_image */ /* not clearing cache_bytes_inuse */ /* not clearing cache_bytes_other */ @@ -889,6 +1002,21 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->dh_sweeps = 0; stats->dh_session_handles = 0; stats->dh_session_sweeps = 0; + stats->lock_checkpoint_count = 0; + stats->lock_checkpoint_wait_application = 0; + stats->lock_checkpoint_wait_internal = 0; + stats->lock_handle_list_count = 0; + stats->lock_handle_list_wait_application = 0; + stats->lock_handle_list_wait_internal = 0; + stats->lock_metadata_count = 0; + stats->lock_metadata_wait_application = 0; + stats->lock_metadata_wait_internal = 0; + stats->lock_schema_count = 0; + stats->lock_schema_wait_application = 0; + stats->lock_schema_wait_internal = 0; + stats->lock_table_count = 0; + stats->lock_table_wait_application = 0; + stats->lock_table_wait_internal = 0; stats->log_slot_switch_busy = 0; stats->log_slot_closes = 0; stats->log_slot_races = 0; @@ -1036,6 +1164,11 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, block_byte_write_checkpoint); to->block_map_read += WT_STAT_READ(from, block_map_read); to->block_byte_map_read += WT_STAT_READ(from, block_byte_map_read); + to->cache_read_app_count += WT_STAT_READ(from, cache_read_app_count); + to->cache_read_app_time += WT_STAT_READ(from, cache_read_app_time); + to->cache_write_app_count += + WT_STAT_READ(from, cache_write_app_count); + to->cache_write_app_time += WT_STAT_READ(from, cache_write_app_time); to->cache_bytes_image += WT_STAT_READ(from, cache_bytes_image); to->cache_bytes_inuse += WT_STAT_READ(from, cache_bytes_inuse); to->cache_bytes_other += WT_STAT_READ(from, cache_bytes_other); @@ -1162,6 
+1295,33 @@ __wt_stat_connection_aggregate( to->dh_sweeps += WT_STAT_READ(from, dh_sweeps); to->dh_session_handles += WT_STAT_READ(from, dh_session_handles); to->dh_session_sweeps += WT_STAT_READ(from, dh_session_sweeps); + to->lock_checkpoint_count += + WT_STAT_READ(from, lock_checkpoint_count); + to->lock_checkpoint_wait_application += + WT_STAT_READ(from, lock_checkpoint_wait_application); + to->lock_checkpoint_wait_internal += + WT_STAT_READ(from, lock_checkpoint_wait_internal); + to->lock_handle_list_count += + WT_STAT_READ(from, lock_handle_list_count); + to->lock_handle_list_wait_application += + WT_STAT_READ(from, lock_handle_list_wait_application); + to->lock_handle_list_wait_internal += + WT_STAT_READ(from, lock_handle_list_wait_internal); + to->lock_metadata_count += WT_STAT_READ(from, lock_metadata_count); + to->lock_metadata_wait_application += + WT_STAT_READ(from, lock_metadata_wait_application); + to->lock_metadata_wait_internal += + WT_STAT_READ(from, lock_metadata_wait_internal); + to->lock_schema_count += WT_STAT_READ(from, lock_schema_count); + to->lock_schema_wait_application += + WT_STAT_READ(from, lock_schema_wait_application); + to->lock_schema_wait_internal += + WT_STAT_READ(from, lock_schema_wait_internal); + to->lock_table_count += WT_STAT_READ(from, lock_table_count); + to->lock_table_wait_application += + WT_STAT_READ(from, lock_table_wait_application); + to->lock_table_wait_internal += + WT_STAT_READ(from, lock_table_wait_internal); to->log_slot_switch_busy += WT_STAT_READ(from, log_slot_switch_busy); to->log_slot_closes += WT_STAT_READ(from, log_slot_closes); to->log_slot_races += WT_STAT_READ(from, log_slot_races); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 3b24bcd505d..d60ea73c660 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -659,20 +659,20 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) * 
Initialize a session's transaction data. */ int -__wt_txn_init(WT_SESSION_IMPL *session) +__wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret) { WT_TXN *txn; - txn = &session->txn; + txn = &session_ret->txn; txn->id = WT_TXN_NONE; WT_RET(__wt_calloc_def(session, - S2C(session)->session_size, &txn->snapshot)); + S2C(session_ret)->session_size, &txn->snapshot)); #ifdef HAVE_DIAGNOSTIC - if (S2C(session)->txn_global.states != NULL) { + if (S2C(session_ret)->txn_global.states != NULL) { WT_TXN_STATE *txn_state; - txn_state = WT_SESSION_TXN_STATE(session); + txn_state = WT_SESSION_TXN_STATE(session_ret); WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE); } #endif @@ -683,7 +683,7 @@ __wt_txn_init(WT_SESSION_IMPL *session) */ txn->mod = NULL; - txn->isolation = session->isolation; + txn->isolation = session_ret->isolation; return (0); } diff --git a/src/third_party/wiredtiger/test/bloom/test_bloom.c b/src/third_party/wiredtiger/test/bloom/test_bloom.c index 7a298f000aa..67249ff887e 100644 --- a/src/third_party/wiredtiger/test/bloom/test_bloom.c +++ b/src/third_party/wiredtiger/test/bloom/test_bloom.c @@ -56,8 +56,6 @@ void usage(void) extern char *__wt_optarg; extern int __wt_optind; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c index 6b2f0d4466c..4998019ad8e 100644 --- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c +++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c @@ -42,8 +42,6 @@ static int wt_shutdown(void); extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am index 15db2fbcf46..8f1714237b9 100644 --- a/src/third_party/wiredtiger/test/csuite/Makefile.am +++ 
b/src/third_party/wiredtiger/test/csuite/Makefile.am @@ -7,6 +7,9 @@ AM_LDFLAGS = -static test_wt1965_col_efficiency_SOURCES = wt1965_col_efficiency/main.c noinst_PROGRAMS = test_wt1965_col_efficiency +test_wt2403_lsm_workload_SOURCES = wt2403_lsm_workload/main.c +noinst_PROGRAMS += test_wt2403_lsm_workload + test_wt2246_col_append_SOURCES = wt2246_col_append/main.c noinst_PROGRAMS += test_wt2246_col_append diff --git a/src/third_party/wiredtiger/test/csuite/wt1965_col_efficiency/main.c b/src/third_party/wiredtiger/test/csuite/wt1965_col_efficiency/main.c index 0dc367c0611..a7235d81b31 100644 --- a/src/third_party/wiredtiger/test/csuite/wt1965_col_efficiency/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt1965_col_efficiency/main.c @@ -35,8 +35,6 @@ * it is demonstrating an inefficiency rather than a correctness bug. */ -void (*custom_die)(void) = NULL; - /* If changing field count also need to change set_value and get_value calls */ #define NR_FIELDS 8 #define NR_OBJECTS 100 diff --git a/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c b/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c index 1da8732abb4..4b352b26051 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2246_col_append/main.c @@ -42,8 +42,6 @@ #define MILLION 1000000 -void (*custom_die)(void) = NULL; - /* Needs to be global for signal handling. */ static TEST_OPTS *opts, _opts; diff --git a/src/third_party/wiredtiger/test/csuite/wt2323_join_visibility/main.c b/src/third_party/wiredtiger/test/csuite/wt2323_join_visibility/main.c index 5987b77fd7d..239a3f300d0 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2323_join_visibility/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2323_join_visibility/main.c @@ -52,8 +52,6 @@ * of inserts set low as a default. 
*/ -void (*custom_die)(void) = NULL; - #define N_RECORDS 10000 #define N_INSERT 500000 #define N_INSERT_THREAD 2 diff --git a/src/third_party/wiredtiger/test/csuite/wt2403_lsm_workload/main.c b/src/third_party/wiredtiger/test/csuite/wt2403_lsm_workload/main.c new file mode 100644 index 00000000000..0c287484b9e --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/wt2403_lsm_workload/main.c @@ -0,0 +1,241 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "test_util.h" + +static const char name[] = "lsm:test"; +#define NUM_DOCS 100000 +#define NUM_QUERIES (NUM_DOCS/100) + +static void +rand_str(uint64_t i, char *str) +{ + uint64_t x, y; + + y = strlen(str); + for (x = y; x > y - 8; x--) { + str[x - 1] = (char)(i % 10) + 48; + i = i / 10; + } +} + +static void +check_str(uint64_t i, char *str, bool mod) +{ + char str2[] = "0000000000000000"; + + rand_str(i, str2); + if (mod) + str2[0] = 'A'; + testutil_checkfmt(strcmp(str, str2), + "strcmp failed, got %s, expected %s", str, str2); +} + +static void +query_docs(WT_CURSOR *cursor, bool mod) +{ + WT_ITEM key, value; + int i; + + for (i = 0; i < NUM_QUERIES; i++) { + testutil_check(cursor->next(cursor)); + testutil_check(cursor->get_key(cursor, &key)); + testutil_check(cursor->get_value(cursor, &value)); + check_str((uint64_t)key.data, (char *)value.data, mod); + } + printf("%d documents read\n", NUM_QUERIES); +} + +static void * +compact_thread(void *args) +{ + WT_SESSION *session; + + session = (WT_SESSION *)args; + testutil_check(session->compact(session, name, NULL)); + return (NULL); +} + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_CURSOR *rcursor, *wcursor; + WT_ITEM key, value; + WT_SESSION *session, *session2; + pthread_t thread; + uint64_t i; + + char str[] = "0000000000000000"; + + /* + * Create a clean test directory for this run of the test program if the + * environment variable isn't already set (as is done by make check). 
+ */ + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + testutil_check(wiredtiger_open(opts->home, + NULL, "create,cache_size=200M", &opts->conn)); + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session2)); + + testutil_check(session->create(session, name, + "key_format=Q,value_format=S")); + + /* Populate the table with some data. */ + testutil_check(session->open_cursor( + session, name, NULL, "overwrite", &wcursor)); + for (i = 0; i < NUM_DOCS; i++) { + wcursor->set_key(wcursor, i); + rand_str(i, str); + wcursor->set_value(wcursor, str); + testutil_check(wcursor->insert(wcursor)); + } + testutil_check(wcursor->close(wcursor)); + printf("%d documents inserted\n", NUM_DOCS); + + /* Perform some random reads */ + testutil_check(session->open_cursor( + session, name, NULL, "next_random=true", &rcursor)); + query_docs(rcursor, false); + testutil_check(rcursor->close(rcursor)); + + /* Setup Transaction to pin the current values */ + testutil_check( + session2->begin_transaction(session2, "isolation=snapshot")); + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + + /* Perform updates in a txn to confirm that we see only the original. 
*/ + testutil_check(session->open_cursor( + session, name, NULL, "overwrite", &wcursor)); + for (i = 0; i < NUM_DOCS; i++) { + rand_str(i, str); + str[0] = 'A'; + wcursor->set_key(wcursor, i); + wcursor->set_value(wcursor, str); + testutil_check(wcursor->update(wcursor)); + } + testutil_check(wcursor->close(wcursor)); + printf("%d documents set to update\n", NUM_DOCS); + + /* Random reads, which should see the original values */ + query_docs(rcursor, false); + testutil_check(rcursor->close(rcursor)); + + /* Finish the txn */ + testutil_check(session2->rollback_transaction(session2, NULL)); + + /* Random reads, which should see the updated values */ + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + query_docs(rcursor, true); + testutil_check(rcursor->close(rcursor)); + + /* Setup a pre-delete txn */ + testutil_check( + session2->begin_transaction(session2, "isolation=snapshot")); + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + + /* Delete all but one document */ + testutil_check(session->open_cursor( + session, name, NULL, "overwrite", &wcursor)); + for (i = 0; i < NUM_DOCS - 1; i++) { + wcursor->set_key(wcursor, i); + testutil_check(wcursor->remove(wcursor)); + } + testutil_check(wcursor->close(wcursor)); + printf("%d documents deleted\n", NUM_DOCS - 1); + + /* Random reads, which should not see the deletes */ + query_docs(rcursor, true); + testutil_check(rcursor->close(rcursor)); + + /* Rollback the txn so we can see the deletes */ + testutil_check(session2->rollback_transaction(session2, NULL)); + + /* Find the one remaining document 3 times */ + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + for (i = 0; i < 3; i++) { + testutil_check(rcursor->next(rcursor)); + testutil_check(rcursor->get_key(rcursor, &key)); + testutil_check(rcursor->get_value(rcursor, &value)); + /* There should only be one value available to us */ + 
testutil_assertfmt((uint64_t)key.data == NUM_DOCS - 1, + "expected %d and got %" PRIu64, + NUM_DOCS - 1, (uint64_t)key.data); + check_str((uint64_t)key.data, (char *)value.data, true); + } + printf("Found the deleted doc 3 times\n"); + testutil_check(rcursor->close(rcursor)); + + /* Repopulate the table for compact. */ + testutil_check(session->open_cursor( + session, name, NULL, "overwrite", &wcursor)); + for (i = 0; i < NUM_DOCS - 1; i++) { + wcursor->set_key(wcursor, i); + rand_str(i, str); + str[0] = 'A'; + wcursor->set_value(wcursor, str); + testutil_check(wcursor->insert(wcursor)); + } + testutil_check(wcursor->close(wcursor)); + + /* Run random cursor queries while compact is running */ + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + testutil_check(pthread_create(&thread, NULL, compact_thread, session)); + query_docs(rcursor, true); + testutil_check(rcursor->close(rcursor)); + testutil_check(pthread_join(thread, NULL)); + + /* Delete everything. Check for infinite loops */ + testutil_check(session->open_cursor( + session, name, NULL, "overwrite", &wcursor)); + for (i = 0; i < NUM_DOCS; i++) { + wcursor->set_key(wcursor, i); + testutil_check(wcursor->remove(wcursor)); + } + testutil_check(wcursor->close(wcursor)); + + testutil_check(session2->open_cursor( + session2, name, NULL, "next_random=true", &rcursor)); + for (i = 0; i < 3; i++) + testutil_assert(rcursor->next(rcursor) == WT_NOTFOUND); + printf("Successfully got WT_NOTFOUND\n"); + + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} diff --git a/src/third_party/wiredtiger/test/csuite/wt2447_join_main_table/main.c b/src/third_party/wiredtiger/test/csuite/wt2447_join_main_table/main.c index bbae61e7ed5..1368e7c8c09 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2447_join_main_table/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2447_join_main_table/main.c @@ -49,8 +49,6 @@ * table. 
*/ -void (*custom_die)(void) = NULL; - #define N_RECORDS 10000 static void diff --git a/src/third_party/wiredtiger/test/csuite/wt2535_insert_race/main.c b/src/third_party/wiredtiger/test/csuite/wt2535_insert_race/main.c index ee567df8749..ae18760a829 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2535_insert_race/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2535_insert_race/main.c @@ -36,8 +36,6 @@ * Failure mode: Check that the data is correct at the end of the run. */ -void (*custom_die)(void) = NULL; - void *thread_insert_race(void *); int diff --git a/src/third_party/wiredtiger/test/csuite/wt2592_join_schema/main.c b/src/third_party/wiredtiger/test/csuite/wt2592_join_schema/main.c index 4ffc9194646..0ec1c765d99 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2592_join_schema/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2592_join_schema/main.c @@ -36,12 +36,6 @@ * Failure mode: The failure seen in WT-2592 was that no items were returned * by a join. */ -#include <inttypes.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include <wiredtiger.h> /* The C struct for the data we are storing in a WiredTiger table. */ typedef struct { @@ -66,8 +60,6 @@ static POP_RECORD pop_data[] = { { "", 0, 0 } }; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/csuite/wt2695_checksum/main.c b/src/third_party/wiredtiger/test/csuite/wt2695_checksum/main.c index 50f118bf828..db4fed5dc53 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2695_checksum/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2695_checksum/main.c @@ -32,8 +32,6 @@ * Test case description: Smoke-test the CRC. 
*/ -void (*custom_die)(void) = NULL; - static inline void check(uint32_t hw, uint32_t sw, size_t len, const char *msg) { diff --git a/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c b/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c index 64a47ea49a6..0942cfc73b2 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2719_reconfig/main.c @@ -34,9 +34,7 @@ * Test case description: Fuzz testing for WiredTiger reconfiguration. */ -void (*custom_die)(void) = NULL; - -static const char *list[] = { +static const char * const list[] = { ",async=(enabled=0)", ",async=(enabled=1)", ",async=(ops_max=2048)", diff --git a/src/third_party/wiredtiger/test/csuite/wt2834_join_bloom_fix/main.c b/src/third_party/wiredtiger/test/csuite/wt2834_join_bloom_fix/main.c index 1d6abcfb179..7c80496f1b6 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2834_join_bloom_fix/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2834_join_bloom_fix/main.c @@ -39,8 +39,6 @@ * * Failure mode: We get results back from our join. 
*/ -void (*custom_die)(void) = NULL; - #define N_RECORDS 100000 #define N_INSERT 1000000 @@ -103,8 +101,8 @@ main(int argc, char *argv[]) &maincur)); maincur->set_key(maincur, N_RECORDS); maincur->set_value(maincur, 54321, 0, "", 0, N_RECORDS); - maincur->insert(maincur); - maincur->close(maincur); + testutil_check(maincur->insert(maincur)); + testutil_check(maincur->close(maincur)); testutil_check(session->close(session, NULL)); populate(opts); @@ -153,6 +151,7 @@ main(int argc, char *argv[]) key, key2, post, balance, flag); count++; } + testutil_assert(ret == WT_NOTFOUND); testutil_assert(count == 0); testutil_cleanup(opts); @@ -197,6 +196,6 @@ populate(TEST_OPTS *opts) testutil_check(maincur->insert(maincur)); testutil_check(session->commit_transaction(session, NULL)); } - maincur->close(maincur); - session->close(session, NULL); + testutil_check(maincur->close(maincur)); + testutil_check(session->close(session, NULL)); } diff --git a/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c b/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c index 27591fdf214..6cec9634cd1 100644 --- a/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c @@ -42,8 +42,6 @@ * continues until the test ends (~30 seconds). 
*/ -void (*custom_die)(void) = NULL; - static void *thread_insert(void *); static void *thread_get(void *); diff --git a/src/third_party/wiredtiger/test/cursor_order/cursor_order.c b/src/third_party/wiredtiger/test/cursor_order/cursor_order.c index aa351e6fea8..85b8c68e545 100644 --- a/src/third_party/wiredtiger/test/cursor_order/cursor_order.c +++ b/src/third_party/wiredtiger/test/cursor_order/cursor_order.c @@ -44,8 +44,6 @@ static void wt_shutdown(SHARED_CONFIG *); extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/fops/t.c b/src/third_party/wiredtiger/test/fops/t.c index bf0588d5a53..7b4a7cf8fca 100644 --- a/src/third_party/wiredtiger/test/fops/t.c +++ b/src/third_party/wiredtiger/test/fops/t.c @@ -51,8 +51,6 @@ static void wt_shutdown(void); extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index 542adf33da2..839ff5058de 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -187,8 +187,17 @@ config_setup(void) /* Give in-memory configuration a final review. */ config_in_memory_check(); - /* Make the default maximum-run length 20 minutes. */ - if (!config_is_perm("timer")) + /* + * Run-length configured by a number of operations and a timer. If the + * operation count and the timer are both set by a configuration, there + * isn't anything to do. If only the operation count was configured, + * set a default maximum-run of 20 minutes. If only the timer is set, + * clear the operations count (which was set randomly). 
+ */ + if (config_is_perm("timer")) { + if (!config_is_perm("ops")) + config_single("ops=0", 0); + } else config_single("timer=20", 0); /* @@ -270,28 +279,33 @@ config_compression(const char *conf_name) */ switch (mmrand(NULL, 1, 20)) { #ifdef HAVE_BUILTIN_EXTENSION_LZ4 - case 1: case 2: case 3: case 4: /* 20% lz4 */ + case 1: case 2: /* 10% lz4 */ cstr = "lz4"; break; - case 5: /* 5% lz4-no-raw */ + case 3: /* 5% lz4-no-raw */ cstr = "lz4-noraw"; break; #endif #ifdef HAVE_BUILTIN_EXTENSION_SNAPPY - case 6: case 7: case 8: case 9: /* 30% snappy */ - case 10: case 11: + case 4: case 5: case 6: case 7: /* 30% snappy */ + case 8: case 9: cstr = "snappy"; break; #endif #ifdef HAVE_BUILTIN_EXTENSION_ZLIB - case 12: case 13: case 14: case 15: /* 20% zlib */ + case 10: case 11: case 12: case 13: /* 20% zlib */ cstr = "zlib"; break; - case 16: /* 5% zlib-no-raw */ + case 14: /* 5% zlib-no-raw */ cstr = "zlib-noraw"; break; #endif - case 17: case 18: case 19: case 20: /* 20% no compression */ +#ifdef HAVE_BUILTIN_EXTENSION_ZSTD + case 15: case 16: case 17: /* 15% zstd */ + cstr = "zstd"; + break; +#endif + case 18: case 19: case 20: /* 15% no compression */ default: break; } @@ -748,6 +762,8 @@ config_map_compression(const char *s, u_int *vp) *vp = COMPRESS_ZLIB; else if (strcmp(s, "zlib-noraw") == 0) *vp = COMPRESS_ZLIB_NO_RAW; + else if (strcmp(s, "zstd") == 0) + *vp = COMPRESS_ZSTD; else testutil_die(EINVAL, "illegal compression configuration: %s", s); diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index 725bc7c5d97..9bfba3cd0df 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -58,7 +58,7 @@ typedef struct { } CONFIG; #define COMPRESSION_LIST \ - "(none | lz4 | lz4-noraw | snappy | zlib | zlib-noraw)" + "(none | lz4 | lz4-noraw | snappy | zlib | zlib-noraw | zstd)" static CONFIG c[] = { { "abort", diff --git 
a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index 363dcf9eea8..820bc020c9b 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -48,6 +48,8 @@ EXTPATH "compressors/snappy/.libs/libwiredtiger_snappy.so" #define ZLIB_PATH \ EXTPATH "compressors/zlib/.libs/libwiredtiger_zlib.so" +#define ZSTD_PATH \ + EXTPATH "compressors/zstd/.libs/libwiredtiger_zstd.so" #define REVERSE_PATH \ EXTPATH "collators/reverse/.libs/libwiredtiger_reverse_collator.so" @@ -219,6 +221,7 @@ typedef struct { #define COMPRESS_SNAPPY 5 #define COMPRESS_ZLIB 6 #define COMPRESS_ZLIB_NO_RAW 7 +#define COMPRESS_ZSTD 8 u_int c_compression_flag; /* Compression flag value */ u_int c_logging_compression_flag; /* Log compression flag value */ diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index 41bdea11e73..7701595776c 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -38,8 +38,6 @@ static void usage(void) extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = format_die; /* Local death handler. */ - int main(int argc, char *argv[]) { @@ -47,6 +45,8 @@ main(int argc, char *argv[]) int ch, onerun, reps; const char *config, *home; + custom_die = format_die; /* Local death handler. */ + config = NULL; #ifdef _WIN32 diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c index 1600786855a..23fdbce156c 100644 --- a/src/third_party/wiredtiger/test/format/wts.c +++ b/src/third_party/wiredtiger/test/format/wts.c @@ -50,6 +50,8 @@ compressor(uint32_t compress_flag) return ("zlib"); case COMPRESS_ZLIB_NO_RAW: return ("zlib-noraw"); + case COMPRESS_ZSTD: + return ("zstd"); default: break; } @@ -210,13 +212,14 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) /* Extensions. 
*/ p += snprintf(p, REMAIN(p, end), ",extensions=[" - "\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"],", + "\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"],", g.c_reverse ? REVERSE_PATH : "", access(LZ4_PATH, R_OK) == 0 ? LZ4_PATH : "", access(LZO_PATH, R_OK) == 0 ? LZO_PATH : "", access(ROTN_PATH, R_OK) == 0 ? ROTN_PATH : "", access(SNAPPY_PATH, R_OK) == 0 ? SNAPPY_PATH : "", access(ZLIB_PATH, R_OK) == 0 ? ZLIB_PATH : "", + access(ZSTD_PATH, R_OK) == 0 ? ZSTD_PATH : "", DATASOURCE("kvsbdb") ? KVS_BDB_PATH : ""); /* diff --git a/src/third_party/wiredtiger/test/huge/huge.c b/src/third_party/wiredtiger/test/huge/huge.c index 3aa61a9048e..17e2db353d5 100644 --- a/src/third_party/wiredtiger/test/huge/huge.c +++ b/src/third_party/wiredtiger/test/huge/huge.c @@ -159,8 +159,6 @@ run(CONFIG *cp, int bigkey, size_t bytes) extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/manydbs/manydbs.c b/src/third_party/wiredtiger/test/manydbs/manydbs.c index c5c9a9a7ccd..7e986d47af3 100644 --- a/src/third_party/wiredtiger/test/manydbs/manydbs.c +++ b/src/third_party/wiredtiger/test/manydbs/manydbs.c @@ -68,8 +68,6 @@ usage(void) extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - static WT_CONNECTION **connections = NULL; static WT_CURSOR **cursors = NULL; static WT_RAND_STATE rnd; diff --git a/src/third_party/wiredtiger/test/packing/intpack-test.c b/src/third_party/wiredtiger/test/packing/intpack-test.c index 76851b38e35..c84823b741b 100644 --- a/src/third_party/wiredtiger/test/packing/intpack-test.c +++ b/src/third_party/wiredtiger/test/packing/intpack-test.c @@ -28,8 +28,6 @@ #include "test_util.h" -void (*custom_die)(void) = NULL; - int main(void) { diff --git a/src/third_party/wiredtiger/test/packing/intpack-test2.c b/src/third_party/wiredtiger/test/packing/intpack-test2.c index a7d31329069..4e612808a35 100644 
--- a/src/third_party/wiredtiger/test/packing/intpack-test2.c +++ b/src/third_party/wiredtiger/test/packing/intpack-test2.c @@ -28,8 +28,6 @@ #include "test_util.h" -void (*custom_die)(void) = NULL; - int main(void) { diff --git a/src/third_party/wiredtiger/test/packing/intpack-test3.c b/src/third_party/wiredtiger/test/packing/intpack-test3.c index aac0178578f..763b0255ecf 100644 --- a/src/third_party/wiredtiger/test/packing/intpack-test3.c +++ b/src/third_party/wiredtiger/test/packing/intpack-test3.c @@ -28,8 +28,6 @@ #include "test_util.h" -void (*custom_die)(void) = NULL; - void test_value(int64_t); void test_spread(int64_t, int64_t, int64_t); diff --git a/src/third_party/wiredtiger/test/packing/packing-test.c b/src/third_party/wiredtiger/test/packing/packing-test.c index f251c17eb67..919b0622806 100644 --- a/src/third_party/wiredtiger/test/packing/packing-test.c +++ b/src/third_party/wiredtiger/test/packing/packing-test.c @@ -28,8 +28,6 @@ #include "test_util.h" -void (*custom_die)(void) = NULL; - static void check(const char *fmt, ...) 
{ diff --git a/src/third_party/wiredtiger/test/readonly/readonly.c b/src/third_party/wiredtiger/test/readonly/readonly.c index 7a131912c31..a4b79f5859f 100644 --- a/src/third_party/wiredtiger/test/readonly/readonly.c +++ b/src/third_party/wiredtiger/test/readonly/readonly.c @@ -158,8 +158,6 @@ open_dbs(int op, const char *dir, extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/recovery/random-abort.c b/src/third_party/wiredtiger/test/recovery/random-abort.c index 22a163bedff..03e67e2f723 100644 --- a/src/third_party/wiredtiger/test/recovery/random-abort.c +++ b/src/third_party/wiredtiger/test/recovery/random-abort.c @@ -179,8 +179,6 @@ fill_db(uint32_t nth) extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/recovery/truncated-log.c b/src/third_party/wiredtiger/test/recovery/truncated-log.c index c0effa85e95..c265263d44c 100644 --- a/src/third_party/wiredtiger/test/recovery/truncated-log.c +++ b/src/third_party/wiredtiger/test/recovery/truncated-log.c @@ -258,8 +258,6 @@ fill_db(void) extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/salvage/salvage.c b/src/third_party/wiredtiger/test/salvage/salvage.c index bad0167ca8e..b8553bbd72d 100644 --- a/src/third_party/wiredtiger/test/salvage/salvage.c +++ b/src/third_party/wiredtiger/test/salvage/salvage.c @@ -64,8 +64,6 @@ static int verbose; /* -v flag */ extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/suite/test_bug017.py b/src/third_party/wiredtiger/test/suite/test_bug017.py new file mode 100644 index 00000000000..03e7b2ba714 --- /dev/null +++ 
b/src/third_party/wiredtiger/test/suite/test_bug017.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +import wiredtiger, wttest + +# test_bug017.py +# WT-2987: opening a cursor on an incomplete table drops core +class test_bug017(wttest.WiredTigerTestCase): + + def test_bug017_run(self): + self.session.create("table:bug17", + 'key_format=r,value_format=5sHQ,' + + 'columns=(id,country,year,population),colgroups=(main,population)') + + msg = '/column groups/' + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.open_cursor("table:bug17(country)", None), + msg) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_cursor_random02.py b/src/third_party/wiredtiger/test/suite/test_cursor_random02.py index 93aa97f2282..195480d703b 100644 --- a/src/third_party/wiredtiger/test/suite/test_cursor_random02.py +++ b/src/third_party/wiredtiger/test/suite/test_cursor_random02.py @@ -34,7 +34,10 @@ from wtscenario import make_scenarios # test_cursor_random02.py # Cursor next_random operations class test_cursor_random02(wttest.WiredTigerTestCase): - type = 'table:random' + types = [ + ('lsm', dict(type='lsm:random')), + ('table', dict(type='table:random')) + ] config = [ ('not-sample', dict(config='next_random=true')) ] @@ -46,26 +49,35 @@ class test_cursor_random02(wttest.WiredTigerTestCase): ('10000', dict(records=10000)), ('50000', dict(records=50000)), ] - scenarios = make_scenarios(config, records) + scenarios = make_scenarios(config, records, types) # Check that next_random works in the presence of a larger set of values, # where the values are in an insert list. def test_cursor_random_reasonable_distribution(self): uri = self.type num_entries = self.records + config = 'key_format=S' + if uri == 'table:random': + config = 'leaf_page_max=100MB,' + config # Set the leaf-page-max value, otherwise the page might split. 
- simple_populate(self, uri, - 'leaf_page_max=100MB,key_format=S', num_entries) + simple_populate(self, uri, config, num_entries) # Setup an array to track which keys are seen visitedKeys = [0] * (num_entries + 1) + # Setup a counter to see when we find a sequential key + sequentialKeys = 0 cursor = self.session.open_cursor(uri, None, 'next_random=true') + lastKey = None for i in range(0, num_entries): self.assertEqual(cursor.next(), 0) current = cursor.get_key() current = int(current) visitedKeys[current] = visitedKeys[current] + 1 + if lastKey != None: + if current == (lastKey + 1): + sequentialKeys += 1 + lastKey = current differentKeys = sum(x > 0 for x in visitedKeys) @@ -76,7 +88,10 @@ class test_cursor_random02(wttest.WiredTigerTestCase): str(num_entries) + ', ' + \ str((int)((differentKeys * 100) / num_entries)) + '%') ''' - + # Can't test for non-sequential data when there is 1 item in the table + if num_entries > 1: + self.assertGreater(num_entries - 1, sequentialKeys, + 'cursor is returning sequential data') self.assertGreater(differentKeys, num_entries / 4, 'next_random random distribution not adequate') diff --git a/src/third_party/wiredtiger/test/suite/test_encrypt01.py b/src/third_party/wiredtiger/test/suite/test_encrypt01.py index d314cbeadfd..746c9d13e96 100644 --- a/src/third_party/wiredtiger/test/suite/test_encrypt01.py +++ b/src/third_party/wiredtiger/test/suite/test_encrypt01.py @@ -57,6 +57,7 @@ class test_encrypt01(wttest.WiredTigerTestCase): ('lz4', dict(log_compress='lz4', block_compress='lz4')), ('snappy', dict(log_compress='snappy', block_compress='snappy')), ('zlib', dict(log_compress='zlib', block_compress='zlib')), + ('zstd', dict(log_compress='zstd', block_compress='zstd')), ('none-snappy', dict(log_compress=None, block_compress='snappy')), ('snappy-lz4', dict(log_compress='snappy', block_compress='lz4')), ] diff --git a/src/third_party/wiredtiger/test/suite/test_stat02.py b/src/third_party/wiredtiger/test/suite/test_stat02.py index 
3d2a83d1c3c..047d2c74499 100644 --- a/src/third_party/wiredtiger/test/suite/test_stat02.py +++ b/src/third_party/wiredtiger/test/suite/test_stat02.py @@ -165,7 +165,7 @@ class test_stat_cursor_conn_error(wttest.WiredTigerTestCase): args = ['none', 'all', 'fast'] for i in list(itertools.permutations(args, 2)): config = 'create,statistics=(' + i[0] + ',' + i[1] + ')' - msg = '/only one statistics configuration value/' + msg = '/Only one of/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.wiredtiger_open('.', config), msg) @@ -188,10 +188,76 @@ class test_stat_cursor_dsrc_error(wttest.WiredTigerTestCase): args = ['all', 'fast'] for i in list(itertools.permutations(args, 2)): config = 'statistics=(' + i[0] + ',' + i[1] + ')' - msg = '/only one statistics configuration value/' + msg = '/Only one of/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.open_cursor( 'statistics:' + self.uri, None, config), msg) +# Test data-source cache walk statistics +class test_stat_cursor_dsrc_cache_walk(wttest.WiredTigerTestCase): + uri = 'file:test_stat_cursor_dsrc_cache_walk' + + conn_config = 'statistics=(none)' + + def test_stat_cursor_dsrc_cache_walk(self): + simple_populate(self, self.uri, 'key_format=S', 100) + # Ensure that it's an error to get cache_walk stats if none is set + msg = '/doesn\'t match the database statistics/' + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.open_cursor( + 'statistics:' + self.uri, None, None), msg) + + # Test configurations that are valid but should not collect + # cache walk information. 
Do these first since the cache walk + # statistics are mostly marked as not cleared - so once they are + # populated the values will always be returned + self.conn.reconfigure('statistics=(cache_walk,fast,clear)') + c = self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(fast)') + self.assertEqual(c[stat.dsrc.cache_state_root_size][2], 0) + c.close() + + self.conn.reconfigure('statistics=(all,clear)') + c = self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(fast)') + self.assertEqual(c[stat.dsrc.cache_state_root_size][2], 0) + c.close() + + self.conn.reconfigure('statistics=(cache_walk,fast,clear)') + c = self.session.open_cursor('statistics:' + self.uri, None, None) + self.assertGreater(c[stat.dsrc.cache_state_root_size][2], 0) + # Verify that cache_walk didn't imply tree_walk + self.assertEqual(c[stat.dsrc.btree_entries][2], 0) + c.close() + + self.conn.reconfigure('statistics=(cache_walk,tree_walk,fast,clear)') + c = self.session.open_cursor('statistics:' + self.uri, None, None) + self.assertGreater(c[stat.dsrc.cache_state_root_size][2], 0) + # Verify that cache_walk didn't exclude tree_walk + self.assertGreater(c[stat.dsrc.btree_entries][2], 0) + c.close() + + self.conn.reconfigure('statistics=(all,clear)') + c = self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(all)') + self.assertGreater(c[stat.dsrc.cache_state_root_size][2], 0) + self.assertGreater(c[stat.dsrc.btree_entries][2], 0) + c.close() + + # Verify that cache and tree walk can operate independantly + self.conn.reconfigure('statistics=(all,clear)') + c = self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(cache_walk,fast)') + self.assertGreater(c[stat.dsrc.cache_state_root_size][2], 0) + self.assertEqual(c[stat.dsrc.btree_entries][2], 0) + c.close() + + self.conn.reconfigure('statistics=(all,clear)') + c = self.session.open_cursor( + 'statistics:' + self.uri, None, 'statistics=(tree_walk,fast)') + # Don't check the 
cache walk stats for empty - they won't be cleared + self.assertGreater(c[stat.dsrc.btree_entries][2], 0) + c.close() + if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/thread/t.c b/src/third_party/wiredtiger/test/thread/t.c index 5b53532e8a6..baadbf2adb9 100644 --- a/src/third_party/wiredtiger/test/thread/t.c +++ b/src/third_party/wiredtiger/test/thread/t.c @@ -52,8 +52,6 @@ static void wt_shutdown(void); extern int __wt_optind; extern char *__wt_optarg; -void (*custom_die)(void) = NULL; - int main(int argc, char *argv[]) { diff --git a/src/third_party/wiredtiger/test/utility/misc.c b/src/third_party/wiredtiger/test/utility/misc.c index 096bc752726..1491c9a6938 100644 --- a/src/third_party/wiredtiger/test/utility/misc.c +++ b/src/third_party/wiredtiger/test/utility/misc.c @@ -27,6 +27,8 @@ */ #include "test_util.h" +void (*custom_die)(void) = NULL; + /* * die -- * Report an error and quit. @@ -142,8 +144,6 @@ testutil_cleanup(TEST_OPTS *opts) if (!opts->preserve) testutil_clean_work_dir(opts->home); - free(opts->conn_config); - free(opts->table_config); free(opts->uri); free(opts->home); } diff --git a/src/third_party/wiredtiger/test/utility/parse_opts.c b/src/third_party/wiredtiger/test/utility/parse_opts.c index 08aeafa9617..74a1c021d5d 100644 --- a/src/third_party/wiredtiger/test/utility/parse_opts.c +++ b/src/third_party/wiredtiger/test/utility/parse_opts.c @@ -27,10 +27,6 @@ */ #include "test_util.h" -extern int __wt_opterr; /* if error message should be printed */ -extern int __wt_optind; /* index into parent argv vector */ -extern int __wt_optopt; /* character checked for validity */ -extern int __wt_optreset; /* reset getopt */ extern char *__wt_optarg; /* argument associated with option */ /* @@ -59,7 +55,7 @@ testutil_parse_opts(int argc, char * const *argv, TEST_OPTS *opts) opts->n_append_threads = (uint64_t)atoll(__wt_optarg); break; case 'h': /* Home directory */ - opts->home = __wt_optarg; + opts->home = 
dstrdup(__wt_optarg); break; case 'n': /* Number of records */ opts->nrecords = (uint64_t)atoll(__wt_optarg); @@ -116,12 +112,14 @@ testutil_parse_opts(int argc, char * const *argv, TEST_OPTS *opts) } /* - * Setup the home directory. It needs to be unique for every test - * or the auto make parallel tester gets upset. + * Setup the home directory if not explicitly specified. It needs to be + * unique for every test or the auto make parallel tester gets upset. */ - len = strlen("WT_TEST.") + strlen(opts->progname) + 10; - opts->home = dmalloc(len); - snprintf(opts->home, len, "WT_TEST.%s", opts->progname); + if (opts->home == NULL) { + len = strlen("WT_TEST.") + strlen(opts->progname) + 10; + opts->home = dmalloc(len); + snprintf(opts->home, len, "WT_TEST.%s", opts->progname); + } /* Setup the default URI string */ len = strlen("table:") + strlen(opts->progname) + 10; diff --git a/src/third_party/wiredtiger/test/utility/test_util.h b/src/third_party/wiredtiger/test/utility/test_util.h index 1047d1ca8a0..f6a9cd68e02 100644 --- a/src/third_party/wiredtiger/test/utility/test_util.h +++ b/src/third_party/wiredtiger/test/utility/test_util.h @@ -68,10 +68,8 @@ typedef struct { * resources. */ WT_CONNECTION *conn; - char *conn_config; WT_SESSION *session; bool running; - char *table_config; char *uri; volatile uint64_t next_threadid; uint64_t max_inserted_id; @@ -87,6 +85,16 @@ typedef struct { } while (0) /* + * testutil_assertfmt -- + * Complain and quit if something isn't true. + */ +#define testutil_assertfmt(a, fmt, ...) do { \ + if (!(a)) \ + testutil_die(0, "%s/%d: %s: " fmt, \ + __func__, __LINE__, #a, __VA_ARGS__); \ +} while (0) + +/* * testutil_check -- * Complain and quit if a function call fails. 
*/ diff --git a/src/third_party/wiredtiger/tools/wtstats/stat_data.py b/src/third_party/wiredtiger/tools/wtstats/stat_data.py index 8f47b86a23e..635e710c469 100644 --- a/src/third_party/wiredtiger/tools/wtstats/stat_data.py +++ b/src/third_party/wiredtiger/tools/wtstats/stat_data.py @@ -91,6 +91,24 @@ no_scale_per_second_list = [ 'btree: row-store leaf pages', 'cache: bytes currently in the cache', 'cache: overflow values cached in memory', + 'cache_walk: Average difference between current eviction generation when the page was last considered', + 'cache_walk: Average on-disk page image size seen', + 'cache_walk: Clean pages currently in cache', + 'cache_walk: Current eviction generation', + 'cache_walk: Dirty pages currently in cache', + 'cache_walk: Entries in the root page', + 'cache_walk: Internal pages currently in cache', + 'cache_walk: Leaf pages currently in cache', + 'cache_walk: Maximum difference between current eviction generation when the page was last considered', + 'cache_walk: Maximum page size seen', + 'cache_walk: Minimum on-disk page image size seen', + 'cache_walk: On-disk page image sizes smaller than a single allocation unit', + 'cache_walk: Pages created in memory and never written', + 'cache_walk: Pages currently queued for eviction', + 'cache_walk: Pages that could not be queued for eviction', + 'cache_walk: Refs skipped during cache traversal', + 'cache_walk: Size of the root page', + 'cache_walk: Total number of pages currently in cache', 'LSM: bloom filters in the LSM tree', 'LSM: chunks in the LSM tree', 'LSM: highest merge generation in the LSM tree', @@ -162,6 +180,24 @@ no_clear_list = [ 'transaction: transaction range of IDs currently pinned by named snapshots', 'btree: btree checkpoint generation', 'cache: bytes currently in the cache', + 'cache_walk: Average difference between current eviction generation when the page was last considered', + 'cache_walk: Average on-disk page image size seen', + 'cache_walk: Clean pages currently in 
cache', + 'cache_walk: Current eviction generation', + 'cache_walk: Dirty pages currently in cache', + 'cache_walk: Entries in the root page', + 'cache_walk: Internal pages currently in cache', + 'cache_walk: Leaf pages currently in cache', + 'cache_walk: Maximum difference between current eviction generation when the page was last considered', + 'cache_walk: Maximum page size seen', + 'cache_walk: Minimum on-disk page image size seen', + 'cache_walk: On-disk page image sizes smaller than a single allocation unit', + 'cache_walk: Pages created in memory and never written', + 'cache_walk: Pages currently queued for eviction', + 'cache_walk: Pages that could not be queued for eviction', + 'cache_walk: Refs skipped during cache traversal', + 'cache_walk: Size of the root page', + 'cache_walk: Total number of pages currently in cache', 'session: open cursor count', ] prefix_list = [ @@ -169,6 +205,7 @@ prefix_list = [ 'reconciliation', 'LSM', 'log', + 'lock', 'cache', 'transaction', 'cursor', @@ -176,9 +213,10 @@ prefix_list = [ 'session', 'block-manager', 'thread-yield', + 'cache_walk', 'async', 'btree', 'thread-state', 'compression', ] -groups = {'cursor': ['cursor', 'session'], 'lsm': ['LSM', 'transaction'], 'system': ['connection', 'data-handle', 'session', 'thread-state'], 'evict': ['block-manager', 'cache', 'connection', 'thread-state'], 'memory': ['cache', 'connection', 'reconciliation']}
\ No newline at end of file +groups = {'cursor': ['cursor', 'session'], 'lsm': ['LSM', 'transaction'], 'system': ['connection', 'data-handle', 'session', 'thread-state'], 'evict': ['block-manager', 'cache', 'cache_walk', 'connection', 'thread-state'], 'memory': ['cache', 'cache_walk', 'connection', 'reconciliation']}
\ No newline at end of file diff --git a/src/third_party/wiredtiger/tools/wtstats/wtstats.py b/src/third_party/wiredtiger/tools/wtstats/wtstats.py index 3549031c30f..bf5557d12f4 100755 --- a/src/third_party/wiredtiger/tools/wtstats/wtstats.py +++ b/src/third_party/wiredtiger/tools/wtstats/wtstats.py @@ -115,6 +115,9 @@ def parse_wtstats_file(file, result): # Parse file for line in open(file, 'rU'): month, day, time, v, title = line.strip('\n').split(" ", 4) + # The colon in the URI confuses parsing, strip it out. + if "cache_walk" in title: + title = title.replace("file:", "", 1) result[title].append((month + " " + day + " " + time, v)) |