From 23914068c331a42d1f98de0d58caecf0e391549a Mon Sep 17 00:00:00 2001 From: Luke Chen Date: Wed, 17 Jan 2018 14:04:41 +1100 Subject: Import wiredtiger: 357efdd4ce279efc71ff618c59fe1b903ef80bb2 from branch mongodb-3.8 ref: 9e50448231..357efdd4ce for: 3.7.2 WT-3565 Test and understand mixed timestamp/no-timestamp usage to same data WT-3597 Add a diagnostic check for updates to the same key out of timestamp order WT-3632 Increase how granularly cache usage settings can be configured WT-3695 format failed to report a stuck cache WT-3740 race in page dirty-byte decrement. WT-3767 Avoid lookaside instantiation for faster reads WT-3775 Improve commit timestamp is older than oldest timestamp error message WT-3792 LSM version 1 metadata incompatibility WT-3796 Report a better error message if transaction commit fails WT-3799 Test/format with timestamps enabled pin cache full WT-3809 Fix a bug in lookaside related to birthmarks WT-3811 Python scripts for visualizing operation tracking files WT-3818 __rec_txn_read() code order cleanup WT-3825 Fix calculation of CPU ticks per unit time WT-3826 random-abort test failure WT-3827 test_compact02 failure WT-3828 Link error on OS/X for __wt_process data reference WT-3831 uninitialized buffer value in statlog server path comparison WT-3832 Fixup shell script warning messages WT-3833 test/format cache_minimum value error WT-3841 Fix error message pattern in timestamp09 WT-3842 full-build Friday & lint WT-3844 Checkpoints can hang on limbo pages WT-3845 Compiler warning in examples using GCC 5.4.0 --- src/third_party/wiredtiger/NEWS | 29 + src/third_party/wiredtiger/README | 6 +- src/third_party/wiredtiger/RELEASE_INFO | 2 +- .../wiredtiger/build_posix/aclocal/version-set.m4 | 4 +- .../wiredtiger/build_posix/aclocal/version.m4 | 2 +- src/third_party/wiredtiger/dist/api_data.py | 41 +- src/third_party/wiredtiger/dist/s_define.list | 2 +- src/third_party/wiredtiger/dist/s_string.ok | 1 + src/third_party/wiredtiger/dist/s_void | 1 + src/third_party/wiredtiger/dist/stat_data.py | 2 + src/third_party/wiredtiger/examples/c/ex_smoke.c | 6 +- src/third_party/wiredtiger/import.data | 2 +- src/third_party/wiredtiger/src/btree/bt_cursor.c | 10 + src/third_party/wiredtiger/src/btree/bt_delete.c | 1 + src/third_party/wiredtiger/src/btree/bt_discard.c | 49 +- src/third_party/wiredtiger/src/btree/bt_handle.c | 2 +- src/third_party/wiredtiger/src/btree/bt_io.c | 6 +- src/third_party/wiredtiger/src/btree/bt_random.c | 2 + src/third_party/wiredtiger/src/btree/bt_read.c | 374 ++++--- src/third_party/wiredtiger/src/btree/bt_ret.c | 44 +- src/third_party/wiredtiger/src/btree/bt_split.c | 30 +- src/third_party/wiredtiger/src/btree/bt_sync.c | 6 +- src/third_party/wiredtiger/src/btree/bt_walk.c | 7 +- src/third_party/wiredtiger/src/btree/col_modify.c | 3 + src/third_party/wiredtiger/src/btree/row_modify.c | 24 +- src/third_party/wiredtiger/src/cache/cache_las.c | 7 +- src/third_party/wiredtiger/src/config/config_def.c | 60 +- src/third_party/wiredtiger/src/conn/conn_cache.c | 63 +- .../wiredtiger/src/conn/conn_cache_pool.c | 11 +- src/third_party/wiredtiger/src/conn/conn_log.c | 6 +- src/third_party/wiredtiger/src/conn/conn_stat.c | 20 +- src/third_party/wiredtiger/src/cursor/cur_file.c | 30 +- src/third_party/wiredtiger/src/docs/top/main.dox | 8 +- src/third_party/wiredtiger/src/docs/upgrading.dox | 10 + src/third_party/wiredtiger/src/evict/evict_lru.c | 86 +- src/third_party/wiredtiger/src/evict/evict_page.c | 20 +- src/third_party/wiredtiger/src/include/api.h | 16 +- 
src/third_party/wiredtiger/src/include/btmem.h | 41 +- src/third_party/wiredtiger/src/include/btree.i | 15 +- src/third_party/wiredtiger/src/include/cache.h | 14 +- src/third_party/wiredtiger/src/include/cache.i | 24 +- src/third_party/wiredtiger/src/include/extern.h | 8 +- src/third_party/wiredtiger/src/include/misc.i | 24 +- src/third_party/wiredtiger/src/include/mutex.i | 8 +- src/third_party/wiredtiger/src/include/optrack.h | 2 +- src/third_party/wiredtiger/src/include/os.h | 16 +- .../wiredtiger/src/include/os_fhandle.i | 12 +- src/third_party/wiredtiger/src/include/serial.i | 23 +- src/third_party/wiredtiger/src/include/stat.h | 2 + src/third_party/wiredtiger/src/include/txn.h | 3 + src/third_party/wiredtiger/src/include/txn.i | 3 +- .../wiredtiger/src/include/wiredtiger.in | 564 ++++++----- src/third_party/wiredtiger/src/log/log.c | 24 +- src/third_party/wiredtiger/src/log/log_slot.c | 18 +- src/third_party/wiredtiger/src/lsm/lsm_meta.c | 22 +- src/third_party/wiredtiger/src/os_posix/os_dir.c | 14 +- .../wiredtiger/src/reconcile/rec_write.c | 97 +- .../wiredtiger/src/session/session_api.c | 10 +- src/third_party/wiredtiger/src/support/global.c | 4 +- src/third_party/wiredtiger/src/support/hazard.c | 10 +- src/third_party/wiredtiger/src/support/mtx_rw.c | 16 +- src/third_party/wiredtiger/src/support/stat.c | 8 + src/third_party/wiredtiger/src/support/time.c | 26 +- src/third_party/wiredtiger/src/txn/txn.c | 110 +- src/third_party/wiredtiger/src/txn/txn_ckpt.c | 22 +- .../wiredtiger/src/txn/txn_rollback_to_stable.c | 30 +- src/third_party/wiredtiger/src/txn/txn_timestamp.c | 65 +- .../wiredtiger/test/csuite/random_abort/main.c | 10 +- .../wiredtiger/test/csuite/timestamp_abort/main.c | 3 +- src/third_party/wiredtiger/test/format/config.h | 2 +- src/third_party/wiredtiger/test/format/ops.c | 16 +- src/third_party/wiredtiger/test/format/util.c | 11 +- .../wiredtiger/test/packing/intpack-test.c | 6 + .../wiredtiger/test/packing/intpack-test2.c | 6 + .../wiredtiger/test/packing/intpack-test3.c | 6 + .../wiredtiger/test/packing/packing-test.c | 6 + .../wiredtiger/test/suite/test_compact02.py | 8 +- .../wiredtiger/test/suite/test_config04.py | 63 +- .../wiredtiger/test/suite/test_reconfig01.py | 7 + .../wiredtiger/test/suite/test_shared_cache01.py | 33 + .../wiredtiger/test/suite/test_shared_cache02.py | 30 + .../wiredtiger/test/suite/test_timestamp04.py | 56 +- .../wiredtiger/test/suite/test_timestamp09.py | 4 +- .../wiredtiger/test/suite/test_timestamp10.py | 162 +++ .../wiredtiger/test/suite/test_timestamp11.py | 150 +++ .../wiredtiger/tools/optrack/arrow-left.png | Bin 0 -> 103602 bytes .../wiredtiger/tools/optrack/arrow-right.png | Bin 0 -> 108216 bytes .../tools/optrack/find-latency-spikes.py | 1063 ++++++++++++++++++++ .../wiredtiger/tools/optrack/wt_optrack_decode.py | 319 ++++++ .../wiredtiger/tools/wt_optrack_decode.py | 319 ------ 90 files changed, 3206 insertions(+), 1302 deletions(-) create mode 100644 src/third_party/wiredtiger/test/suite/test_timestamp10.py create mode 100644 src/third_party/wiredtiger/test/suite/test_timestamp11.py create mode 100644 src/third_party/wiredtiger/tools/optrack/arrow-left.png create mode 100644 src/third_party/wiredtiger/tools/optrack/arrow-right.png create mode 100755 src/third_party/wiredtiger/tools/optrack/find-latency-spikes.py create mode 100755 src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py delete mode 100755 src/third_party/wiredtiger/tools/wt_optrack_decode.py (limited to 'src/third_party') diff --git 
a/src/third_party/wiredtiger/NEWS b/src/third_party/wiredtiger/NEWS index ffcefd5f8c1..7bf3b0e7edb 100644 --- a/src/third_party/wiredtiger/NEWS +++ b/src/third_party/wiredtiger/NEWS @@ -1,6 +1,35 @@ Ticket reference tags refer to tickets in the MongoDB JIRA tracking system: https://jira.mongodb.org +WiredTiger release 3.0.0, 2018-01-08 +------------------------------------ + +See the upgrading documentation for details of API and behavior changes. + +Significant changes: +* WT-3039 Change the log file format to record a previous LSN record +* WT-3181 Add support for application defined transaction IDs via a mechanism called timestamps. +* WT-3310 Add support to WT_SESSION::alter to change table log setting +* WT-3389 Restructure page split code to hold a split generation for the entire operation. +* WT-3406 Fix a bug in reconciliation so that it ignores concurrent updates. +* WT-3418 Fix a block manager race in tree close/open +* WT-3435 Improvements to the cache overflow mechanism aka lookaside +* WT-3437 Improvements to auto tuning of number of eviction workers +* WT-3440 Add a log record when starting a checkpoint. +* WT-3461 Avoid hangs when system clocks move backwards by using CLOCK_MONOTONIC for pthread_cond_timedwait if possible. +* WT-3490 Fix a bug in WT_CURSOR.modify unaligned size_t access. +* WT-3495 Fix a bug so we don't ftruncate if log cursors are open +* WT-3497 Improve logging message when hitting the configured session limits +* WT-3537 Split pages in memory when nothing can be written +* WT-3556 Remove wtstats support +* WT-3681 Change recovery so that it doesn't truncate the last log file +* WT-3683 Allow eviction of clean pages with history when cache is stuck +* WT-3710 Get a page-level lock to ensure page splits are single threaded +* WT-3752 Allow trimming of obsolete modify updates. + +See JIRA changelog for a full listing: +https://jira.mongodb.org/projects/WT/versions/18401 + WiredTiger release 2.9.3, 2017-06-27 ------------------------------------ diff --git a/src/third_party/wiredtiger/README b/src/third_party/wiredtiger/README index 4def09abba6..234038d21d0 100644 --- a/src/third_party/wiredtiger/README +++ b/src/third_party/wiredtiger/README @@ -1,6 +1,6 @@ -WiredTiger 3.0.0: (June 27, 2017) +WiredTiger 3.0.1: (January 8, 2018) -This is version 3.0.0 of WiredTiger. +This is version 3.0.1 of WiredTiger. 
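
Several tickets in this import (WT-3565, WT-3597, WT-3775, WT-3799) exercise the timestamp support introduced in release 3.0.0 (WT-3181 above). For orientation only, a minimal sketch of the public timestamp API follows; it assumes a build with timestamp support, an already-open connection and session, and a hypothetical table "table:example" created with string key and value formats — none of which are part of this patch.

#include <wiredtiger.h>

/*
 * Sketch only: write one value at commit timestamp 5, then advance the
 * connection's oldest timestamp. Timestamps are passed as hexadecimal
 * strings. Assumes "table:example" exists with string key/value formats
 * and that the library was built with timestamp support.
 */
static int
timestamped_insert(WT_CONNECTION *conn, WT_SESSION *session)
{
    WT_CURSOR *cursor;
    int ret;

    if ((ret = session->open_cursor(
        session, "table:example", NULL, NULL, &cursor)) != 0)
        return (ret);

    if ((ret = session->begin_transaction(session, NULL)) != 0)
        return (ret);
    cursor->set_key(cursor, "key1");
    cursor->set_value(cursor, "value1");
    if ((ret = cursor->insert(cursor)) != 0)
        return (ret);
    if ((ret = session->timestamp_transaction(
        session, "commit_timestamp=5")) != 0)
        return (ret);
    if ((ret = session->commit_transaction(session, NULL)) != 0)
        return (ret);
    if ((ret = cursor->close(cursor)) != 0)
        return (ret);

    /* Allow the system to discard history older than timestamp 5. */
    return (conn->set_timestamp(conn, "oldest_timestamp=5"));
}
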
WiredTiger release packages and documentation can be found at: @@ -8,7 +8,7 @@ WiredTiger release packages and documentation can be found at: The documentation for this specific release can be found at: - http://source.wiredtiger.com/3.0.0/index.html + http://source.wiredtiger.com/3.0.1/index.html The WiredTiger source code can be found at: diff --git a/src/third_party/wiredtiger/RELEASE_INFO b/src/third_party/wiredtiger/RELEASE_INFO index ccdff34f2d5..38189c7be01 100644 --- a/src/third_party/wiredtiger/RELEASE_INFO +++ b/src/third_party/wiredtiger/RELEASE_INFO @@ -1,6 +1,6 @@ WIREDTIGER_VERSION_MAJOR=3 WIREDTIGER_VERSION_MINOR=0 -WIREDTIGER_VERSION_PATCH=0 +WIREDTIGER_VERSION_PATCH=1 WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH" WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"` diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 index 5e54ad1cb69..fa3fed8638b 100644 --- a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 +++ b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 @@ -2,8 +2,8 @@ dnl build by dist/s_version VERSION_MAJOR=3 VERSION_MINOR=0 -VERSION_PATCH=0 -VERSION_STRING='"WiredTiger 3.0.0: (June 27, 2017)"' +VERSION_PATCH=1 +VERSION_STRING='"WiredTiger 3.0.1: (January 8, 2018)"' AC_SUBST(VERSION_MAJOR) AC_SUBST(VERSION_MINOR) diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version.m4 index 3c5980dbaad..9a6918366e9 100644 --- a/src/third_party/wiredtiger/build_posix/aclocal/version.m4 +++ b/src/third_party/wiredtiger/build_posix/aclocal/version.m4 @@ -1,2 +1,2 @@ dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version -3.0.0 +3.0.1 diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index c0afe3ae041..2a369bdafbc 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -454,29 +454,36 @@ connection_runtime_config = [ ]), Config('eviction_checkpoint_target', '5', r''' perform eviction at the beginning of checkpoints to bring the dirty - content in cache to this level, expressed as a percentage of the total - cache size. Ignored if set to zero or \c in_memory is \c true''', - min=0, max=99), + content in cache to this level. It is a percentage of the cache size if + the value is within the range of 0 to 100 or an absolute size when + greater than 100. The value is not allowed to exceed the \c cache_size. + Ignored if set to zero or \c in_memory is \c true''', + min=0, max='10TB'), Config('eviction_dirty_target', '5', r''' perform eviction in worker threads when the cache contains at least - this much dirty content, expressed as a percentage of the total cache - size.''', - min=1, max=99), + this much dirty content. It is a percentage of the cache size if the + value is within the range of 1 to 100 or an absolute size when greater + than 100. The value is not allowed to exceed the \c cache_size.''', + min=1, max='10TB'), Config('eviction_dirty_trigger', '20', r''' trigger application threads to perform eviction when the cache contains - at least this much dirty content, expressed as a percentage of the - total cache size. This setting only alters behavior if it is lower than - eviction_trigger''', - min=1, max=99), + at least this much dirty content. 
It is a percentage of the cache size + if the value is within the range of 1 to 100 or an absolute size when + greater than 100. The value is not allowed to exceed the \c cache_size. + This setting only alters behavior if it is lower than eviction_trigger + ''', min=1, max='10TB'), Config('eviction_target', '80', r''' perform eviction in worker threads when the cache contains at least - this much content, expressed as a percentage of the total cache size. - Must be less than \c eviction_trigger''', - min=10, max=99), + this much content. It is a percentage of the cache size if the value is + within the range of 10 to 100 or an absolute size when greater than 100. + The value is not allowed to exceed the \c cache_size.''', + min=10, max='10TB'), Config('eviction_trigger', '95', r''' trigger application threads to perform eviction when the cache contains - at least this much content, expressed as a percentage of the - total cache size''', min=10, max=99), + at least this much content. It is a percentage of the cache size if the + value is within the range of 10 to 100 or an absolute size when greater + than 100. The value is not allowed to exceed the \c cache_size.''', + min=10, max='10TB'), Config('file_manager', '', r''' control how file handles are managed''', type='category', subconfig=[ @@ -525,7 +532,9 @@ connection_runtime_config = [ Config('shared_cache', '', r''' shared cache configuration options. A database should configure either a cache_size or a shared_cache not both. Enabling a - shared cache uses a session from the configured session_max''', + shared cache uses a session from the configured session_max. A + shared cache can not have absolute values configured for cache + eviction settings''', type='category', subconfig=[ Config('chunk', '10MB', r''' the granularity that a shared cache is redistributed''', diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index 16f06da383c..cfae3106fcf 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -20,6 +20,7 @@ WT_BLOCK_HEADER_SIZE WT_CACHE_LINE_ALIGNMENT WT_CACHE_LINE_PAD_BEGIN WT_CACHE_LINE_PAD_END +WT_CLOCKDIFF_NS WT_CONN_CHECK_PANIC WT_DEADLOCK WT_DEBUG_BYTE @@ -67,7 +68,6 @@ WT_TRACK_OP WT_TRACK_OP_END WT_TRACK_OP_INIT WT_TRET_ERROR_OK -WT_TSCDIFF_NS WT_UPDATE_SIZE WT_WITH_LOCK_NOWAIT WT_WITH_LOCK_WAIT diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 1f025013fe7..5d2eb7427b6 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -1206,6 +1206,7 @@ tempdir testutil th tid +timedwait timestamp timestamps tmp diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void index 4a0e73e1c0d..9c5f6711da0 100755 --- a/src/third_party/wiredtiger/dist/s_void +++ b/src/third_party/wiredtiger/dist/s_void @@ -70,6 +70,7 @@ func_ok() -e '/int __wt_stat_connection_desc$/d' \ -e '/int __wt_stat_dsrc_desc$/d' \ -e '/int __wt_stat_join_desc$/d' \ + -e '/int __wt_txn_rollback_required$/d' \ -e '/int __wt_win_directory_list_free$/d' \ -e '/int bdb_compare_reverse$/d' \ -e '/int copyout_val$/d' \ diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 9c828f66cef..6cd3f219b4a 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -272,6 +272,8 @@ connection_stats = [ 
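
The api_data.py and configuration hunks above let the eviction_* settings be expressed either as a percentage (values up to 100) or as an absolute size (values above 100, capped at cache_size). A minimal sketch of opening a connection with absolute dirty-content targets; the WT_HOME directory name is hypothetical and the setup call mirrors the ex_smoke.c change elsewhere in this patch:

#include <stdio.h>
#include <stdlib.h>
#include <wiredtiger.h>

int
main(void)
{
    WT_CONNECTION *conn;
    int ret;

    /* Create an empty home directory, as ex_smoke.c does elsewhere. */
    if (system("rm -rf WT_HOME && mkdir WT_HOME") != 0)
        return (EXIT_FAILURE);

    /*
     * cache_size is 2GB; the dirty target and trigger are given as
     * absolute sizes (values above 100), which must not exceed the
     * cache size. Equivalent percentage settings would be roughly
     * 10 and 25.
     */
    if ((ret = wiredtiger_open("WT_HOME", NULL,
        "create,cache_size=2GB,"
        "eviction_dirty_target=200MB,eviction_dirty_trigger=500MB",
        &conn)) != 0) {
        fprintf(stderr, "wiredtiger_open: %s\n", wiredtiger_strerror(ret));
        return (EXIT_FAILURE);
    }
    return (conn->close(conn, NULL) == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}
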
CacheStat('cache_read_app_count', 'application threads page read from disk to cache count'), CacheStat('cache_read_app_time', 'application threads page read from disk to cache time (usecs)'), CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'), + CacheStat('cache_read_lookaside_delay', 'pages read into cache with skipped lookaside entries needed later'), + CacheStat('cache_read_lookaside_skipped', 'pages read into cache skipping older lookaside entries'), CacheStat('cache_read_overflow', 'overflow pages read into cache'), CacheStat('cache_write', 'pages written from cache'), CacheStat('cache_write_app_count', 'application threads page write from cache to disk count'), diff --git a/src/third_party/wiredtiger/examples/c/ex_smoke.c b/src/third_party/wiredtiger/examples/c/ex_smoke.c index 2091073c2f4..2647a706a54 100644 --- a/src/third_party/wiredtiger/examples/c/ex_smoke.c +++ b/src/third_party/wiredtiger/examples/c/ex_smoke.c @@ -45,7 +45,11 @@ main(int argc, char *argv[]) * This code deliberately doesn't use the standard test_util macros, * we don't want to link against that code to smoke-test a build. */ - (void)system("rm -rf WT_HOME && mkdir WT_HOME"); + if ((ret = system("rm -rf WT_HOME && mkdir WT_HOME")) != 0) { + fprintf(stderr, + "Failed to clean up prior to running example.\n"); + return (EXIT_FAILURE); + } /* Open a connection to the database, creating it if necessary. */ if ((ret = wiredtiger_open("WT_HOME", NULL, "create", &conn)) != 0) { diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 1800088ef2a..dc3e684a19f 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "9e5044823185feffa71e56a6593cfb92e0741a41", + "commit": "357efdd4ce279efc71ff618c59fe1b903ef80bb2", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.8" diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index a329e09a0c2..6e90447f18d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -58,8 +58,10 @@ static inline bool __cursor_page_pinned(WT_CURSOR_BTREE *cbt) { WT_CURSOR *cursor; + WT_SESSION_IMPL *session; cursor = &cbt->iface; + session = (WT_SESSION_IMPL *)cursor->session; /* * Check the page active flag, asserting the page reference with any @@ -86,6 +88,14 @@ __cursor_page_pinned(WT_CURSOR_BTREE *cbt) if (cbt->ref->page->read_gen == WT_READGEN_OLDEST) return (false); + /* + * If we are doing an update, we need a page with history. Release the + * page so we get it again with history if required. 
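
The cache_read_lookaside_delay and cache_read_lookaside_skipped counters added just above are ordinary connection statistics, so they can be read back through a statistics cursor. A minimal sketch, assuming a hypothetical WT_HOME directory that already exists and a connection opened with statistics enabled:

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wiredtiger.h>

int
main(void)
{
    WT_CONNECTION *conn;
    WT_SESSION *session;
    WT_CURSOR *cursor;
    const char *desc, *pvalue;
    uint64_t value;

    if (wiredtiger_open("WT_HOME", NULL,
        "create,statistics=(all)", &conn) != 0)
        return (EXIT_FAILURE);
    if (conn->open_session(conn, NULL, NULL, &session) != 0)
        return (EXIT_FAILURE);
    if (session->open_cursor(
        session, "statistics:", NULL, NULL, &cursor) != 0)
        return (EXIT_FAILURE);

    /* Print every connection statistic mentioning the lookaside file. */
    while (cursor->next(cursor) == 0) {
        if (cursor->get_value(cursor, &desc, &pvalue, &value) != 0)
            break;
        if (strstr(desc, "lookaside") != NULL)
            printf("%s: %" PRIu64 "\n", desc, value);
    }
    return (conn->close(conn, NULL) == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}
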
+ */ + if (F_ISSET(&session->txn, WT_TXN_UPDATE) && + cbt->ref->state != WT_REF_MEM) + return (false); + return (true); } diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index ed55491ab38..a728341e033 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -158,6 +158,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) for (sleep_count = yield_count = 0;;) { switch (ref->state) { case WT_REF_DISK: + case WT_REF_LIMBO: case WT_REF_LOOKASIDE: case WT_REF_READING: WT_ASSERT(session, 0); /* Impossible, assert */ diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index 2ba1c9734b9..66974c70e04 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -16,14 +16,13 @@ static void __free_skip_array( WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t, bool); static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *, bool); static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t, bool); -static void __page_out_int(WT_SESSION_IMPL *, WT_PAGE **, bool); /* - * __wt_ref_out_int -- + * __wt_ref_out -- * Discard an in-memory page, freeing all memory associated with it. */ void -__wt_ref_out_int(WT_SESSION_IMPL *session, WT_REF *ref, bool rewrite) +__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) { /* * A version of the page-out function that allows us to make additional @@ -57,25 +56,15 @@ __wt_ref_out_int(WT_SESSION_IMPL *session, WT_REF *ref, bool rewrite) } #endif - __page_out_int(session, &ref->page, rewrite); + __wt_page_out(session, &ref->page); } /* - * __wt_ref_out -- + * __wt_page_out -- * Discard an in-memory page, freeing all memory associated with it. */ void -__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) -{ - __wt_ref_out_int(session, ref, false); -} - -/* - * __page_out_int -- - * Discard an in-memory page, freeing all memory associated with it. - */ -static void -__page_out_int(WT_SESSION_IMPL *session, WT_PAGE **pagep, bool rewrite) +__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) { WT_PAGE *page; WT_PAGE_HEADER *dsk; @@ -113,7 +102,7 @@ __page_out_int(WT_SESSION_IMPL *session, WT_PAGE **pagep, bool rewrite) } /* Update the cache's information. */ - __wt_cache_page_evict(session, page, rewrite); + __wt_cache_page_evict(session, page); dsk = (WT_PAGE_HEADER *)page->dsk; if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) @@ -157,16 +146,6 @@ __page_out_int(WT_SESSION_IMPL *session, WT_PAGE **pagep, bool rewrite) __wt_overwrite_and_free(session, page); } -/* - * __wt_page_out -- - * Discard an in-memory page, freeing all memory associated with it. - */ -void -__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) -{ - __page_out_int(session, pagep, false); -} - /* * __free_page_modify -- * Discard the page's associated modification structures. @@ -310,20 +289,12 @@ __wt_free_ref( break; } - /* - * Free any address allocation; if there's no linked WT_REF page, it - * must be allocated. - */ + /* Free any address allocation. */ __wt_ref_addr_free(session, ref); - /* - * Free any lookaside or page-deleted information. We only expect a - * lookaside structure for lookaside references, but can see - * page-deleted information in other cases (such as WT_REF_MEM). 
- */ - if (ref->state == WT_REF_LOOKASIDE) - __wt_free(session, ref->page_las); - else if (ref->page_del != NULL) { + /* Free any lookaside or page-deleted information. */ + __wt_free(session, ref->page_las); + if (ref->page_del != NULL) { __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); } diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 396c5f66539..840e4fa5d2e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -807,7 +807,7 @@ __btree_page_sizes(WT_SESSION_IMPL *session) btree->maxmempage = (uint64_t)cval.val; if (!F_ISSET(conn, WT_CONN_CACHE_POOL) && (cache_size = conn->cache_size) > 0) - btree->maxmempage = WT_MIN(btree->maxmempage, + btree->maxmempage = (uint64_t)WT_MIN(btree->maxmempage, (conn->cache->eviction_dirty_trigger * cache_size) / 1000); /* Enforce a lower bound of a single disk leaf page */ diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c index 3e636ad922d..007513fd581 100644 --- a/src/third_party/wiredtiger/src/btree/bt_io.c +++ b/src/third_party/wiredtiger/src/btree/bt_io.c @@ -368,7 +368,7 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, } timer = !F_ISSET(session, WT_SESSION_INTERNAL); if (timer) - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); /* Call the block manager to write the block. */ WT_ERR(checkpoint ? @@ -378,10 +378,10 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, /* Update some statistics now that the write is done */ if (timer) { - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); WT_STAT_CONN_INCR(session, cache_write_app_count); WT_STAT_CONN_INCRV(session, cache_write_app_time, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); } WT_STAT_CONN_INCR(session, cache_write); diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index fd2a2ac7190..03b5039b00b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -232,6 +232,7 @@ restart: /* descent = pindex->index[__wt_random(&session->rnd) % entries]; if (descent->state == WT_REF_DISK || + descent->state == WT_REF_LIMBO || descent->state == WT_REF_LOOKASIDE || descent->state == WT_REF_MEM) break; @@ -240,6 +241,7 @@ restart: /* for (i = 0; i < entries; ++i) { descent = pindex->index[i]; if (descent->state == WT_REF_DISK || + descent->state == WT_REF_LIMBO || descent->state == WT_REF_LOOKASIDE || descent->state == WT_REF_MEM) break; diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index a98983746e4..afaf6c82aa5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -8,9 +8,6 @@ #include "wt_internal.h" -static void __btree_verbose_lookaside_read( - WT_SESSION_IMPL *, uint32_t, uint64_t); - /* * __col_instantiate -- * Update a column-store page entry based on a lookaside table update list. @@ -69,6 +66,142 @@ __row_instantiate(WT_SESSION_IMPL *session, return (0); } +/* + * __las_page_skip_locked -- + * Check if we can skip reading a locked page with lookaside entries. 
+ */ +static inline bool +__las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_TXN *txn; + + txn = &session->txn; + + /* + * Skip lookaside pages if reading without a timestamp and all the + * updates in lookaside are in the past. + * + * Lookaside eviction preferentially chooses the newest updates when + * creating page images with no stable timestamp. If a stable timestamp + * has been set, we have to visit the page because eviction chooses old + * version of records in that case. + * + * One case where we may need to visit the page is if lookaside eviction + * is active in tree 2 when a checkpoint has started and is working its + * way through tree 1. In that case, lookaside may have created a page + * image with updates in the future of the checkpoint. + * + * We also need to instantiate a lookaside page if this is an update + * operation in progress. + */ + if (ref->page_las->invalid) + return (false); + + if (F_ISSET(txn, WT_TXN_UPDATE)) + return (false); + + if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) + return (false); + + if (WT_TXNID_LE(txn->snap_min, ref->page_las->las_max_txn)) + return (false); + + if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) && ref->page_las->las_skew_newest) + return (true); + +#ifdef HAVE_TIMESTAMPS + /* + * Skip lookaside pages if reading as of a timestamp, we evicted new + * versions of data and all the updates are in the past. + */ + if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && + ref->page_las->las_skew_newest && + __wt_timestamp_cmp( + &ref->page_las->onpage_timestamp, &session->txn.read_timestamp) < 0) + return (true); + + /* + * Skip lookaside pages if reading as of a timestamp, we evicted old + * versions of data and all the updates are in the future. + */ + if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && + !ref->page_las->las_skew_newest && + __wt_timestamp_cmp( + &ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0) + return (true); +#endif + + return (false); +} + +/* + * __las_page_skip -- + * Check if we can skip reading a page with lookaside entries. + */ +static inline bool +__las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) +{ + uint32_t previous_state; + bool skip; + + if ((previous_state = ref->state) != WT_REF_LIMBO && + previous_state != WT_REF_LOOKASIDE) + return (false); + + if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) + return (false); + + skip = __las_page_skip_locked(session, ref); + + /* Restore the state and push the change. */ + ref->state = previous_state; + WT_FULL_BARRIER(); + + return (skip); +} + +/* + * __las_page_instantiate_verbose -- + * Create a verbose message to display at most once per checkpoint when + * performing a lookaside table read. + */ +static void +__las_page_instantiate_verbose(WT_SESSION_IMPL *session, uint64_t las_pageid) +{ + WT_CACHE *cache; + uint64_t ckpt_gen_current, ckpt_gen_last; + + if (!WT_VERBOSE_ISSET(session, + WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY)) + return; + + cache = S2C(session)->cache; + ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT); + ckpt_gen_last = cache->las_verb_gen_read; + + /* + * This message is throttled to one per checkpoint. To do this we + * track the generation of the last checkpoint for which the message + * was printed and check against the current checkpoint generation. + */ + if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE) || + ckpt_gen_current > ckpt_gen_last) { + /* + * Attempt to atomically replace the last checkpoint generation + * for which this message was printed. 
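
The throttling in __las_page_instantiate_verbose here relies on remembering the checkpoint generation for which the message was last printed and letting only the winner of an atomic compare-and-swap print for the current generation. The same pattern, sketched outside WiredTiger with C11 atomics and illustrative names:

#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t last_printed_gen;

/*
 * Print a message at most once per generation: threads race on a
 * compare-and-swap of the recorded generation and only the winner prints.
 */
static void
print_once_per_gen(uint64_t current_gen, const char *msg)
{
    uint64_t last;

    last = atomic_load(&last_printed_gen);
    if (current_gen > last &&
        atomic_compare_exchange_strong(
        &last_printed_gen, &last, current_gen))
        printf("generation %" PRIu64 ": %s\n", current_gen, msg);
}

int
main(void)
{
    print_once_per_gen(1, "first message");   /* prints */
    print_once_per_gen(1, "second message");  /* suppressed */
    print_once_per_gen(2, "third message");   /* prints */
    return (0);
}
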
If the atomic swap fails + * we have raced and the winning thread will print the message. + */ + if (__wt_atomic_casv64(&cache->las_verb_gen_read, + ckpt_gen_last, ckpt_gen_current)) { + __wt_verbose(session, + WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY, + "Read from lookaside file triggered for " + "file ID %" PRIu32 ", page ID %" PRIu64, + S2BT(session)->id, las_pageid); + } + } +} + /* * __las_page_instantiate -- * Instantiate lookaside update records in a recently read page. @@ -97,6 +230,10 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id) session_flags = 0; /* [-Werror=maybe-uninitialized] */ WT_CLEAR(las_key); + __las_page_instantiate_verbose(session, ref->page_las->las_pageid); + WT_STAT_CONN_INCR(session, cache_read_lookaside); + WT_STAT_DATA_INCR(session, cache_read_lookaside); + __wt_btcur_init(session, &cbt); __wt_btcur_open(&cbt); @@ -320,7 +457,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) WT_PAGE *page; size_t addr_size; uint64_t time_start, time_stop; - uint32_t page_flags, new_state, previous_state; + uint32_t page_flags, final_state, new_state, previous_state; const uint8_t *addr; bool timer; @@ -349,6 +486,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) new_state = WT_REF_READING; break; case WT_REF_DELETED: + case WT_REF_LIMBO: case WT_REF_LOOKASIDE: new_state = WT_REF_LOCKED; break; @@ -358,6 +496,20 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) if (!__wt_atomic_casv32(&ref->state, previous_state, new_state)) return (0); + final_state = WT_REF_MEM; + + /* + * If we already have the page image, just instantiate the history. + * + * We need exclusive access because other threads could be reading the + * page without history and we can't change the state underneath them. + */ + if (previous_state == WT_REF_LIMBO) { + if (__wt_hazard_check(session, ref) != NULL) + goto err; + goto skip_read; + } + /* * Get the address: if there is no address, the page was deleted or had * only lookaside entries, and a subsequent search or insert is forcing @@ -380,66 +532,89 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) */ timer = !F_ISSET(session, WT_SESSION_INTERNAL); if (timer) - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); if (timer) { - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); WT_STAT_CONN_INCR(session, cache_read_app_count); WT_STAT_CONN_INCRV(session, cache_read_app_time, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); } /* * Build the in-memory version of the page. Clear our local reference to * the allocated copy of the disk image on return, the in-memory object * steals it. + * + * If a page is read with eviction disabled, we don't count evicting it + * as progress. Since disabling eviction allows pages to be read even + * when the cache is full, we want to avoid workloads repeatedly reading + * a page with eviction disabled (e.g., a metadata page), then evicting + * that page and deciding that is a sign that eviction is unstuck. */ page_flags = WT_DATA_IN_ITEM(&tmp) ? 
WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED; if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) - FLD_SET(page_flags, WT_PAGE_READ_NO_EVICT); + FLD_SET(page_flags, WT_PAGE_EVICT_NO_PROGRESS); WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &page)); tmp.mem = NULL; -skip_read: + /* + * The WT_REF lookaside state should match the page-header state of + * any page we read. + */ + WT_ASSERT(session, + (previous_state != WT_REF_LIMBO && + previous_state != WT_REF_LOOKASIDE) || + ref->page->dsk == NULL || + F_ISSET(ref->page->dsk, WT_PAGE_LAS_UPDATE)); + /* * If reading for a checkpoint, there's no additional work to do, the * page on disk is correct as written. */ - if (session->dhandle->checkpoint != NULL) + if (session->dhandle->checkpoint != NULL) { + WT_ASSERT(session, previous_state == WT_REF_DISK); goto done; + } - /* If the page was deleted, instantiate that information. */ - if (previous_state == WT_REF_DELETED) +skip_read: + switch (previous_state) { + case WT_REF_DELETED: + /* If the page was deleted, instantiate that information. */ WT_ERR(__wt_delete_page_instantiate(session, ref)); + break; + case WT_REF_LOOKASIDE: + if (__las_page_skip_locked(session, ref)) { + WT_STAT_CONN_INCR( + session, cache_read_lookaside_skipped); + ref->page_las->eviction_to_lookaside = true; + final_state = WT_REF_LIMBO; + break; + } + /* FALLTHROUGH */ + case WT_REF_LIMBO: + /* Instantiate updates from the database's lookaside table. */ + if (previous_state == WT_REF_LIMBO) + WT_STAT_CONN_INCR(session, cache_read_lookaside_delay); - /* - * Instantiate updates from the database's lookaside table. The page - * flag was set when the page was written, potentially a long time ago. - * We only care if the lookaside table is currently active, check that - * before doing any work. - */ - if (previous_state == WT_REF_LOOKASIDE) { - WT_ASSERT(session, (ref->page->dsk == NULL || - F_ISSET(ref->page->dsk, WT_PAGE_LAS_UPDATE))); - - __btree_verbose_lookaside_read( - session, btree->id, ref->page_las->las_pageid); - WT_STAT_CONN_INCR(session, cache_read_lookaside); - WT_STAT_DATA_INCR(session, cache_read_lookaside); WT_ERR(__las_page_instantiate(session, ref, btree->id)); /* * The page is instantiated so we no longer need the lookaside - * entries. Note that we are discarding updates so the page - * must be marked available even if these operations fail. + * entries. Note we are discarding updates so the page must be + * marked available even if these operations fail. + * + * Don't free WT_REF.page_las, there may be concurrent readers. */ WT_TRET(__wt_las_remove_block( session, NULL, btree->id, ref->page_las->las_pageid)); - __wt_free(session, ref->page_las); + + ref->page_las->eviction_to_lookaside = false; + break; } -done: WT_PUBLISH(ref->state, WT_REF_MEM); +done: WT_PUBLISH(ref->state, final_state); return (ret); err: /* @@ -447,7 +622,7 @@ err: /* * it discarded the page, but not the disk image. Discard the page * and separately discard the disk image in all cases. */ - if (ref->page != NULL) + if (ref->page != NULL && previous_state != WT_REF_LIMBO) __wt_ref_out(session, ref); WT_PUBLISH(ref->state, previous_state); @@ -456,74 +631,6 @@ err: /* return (ret); } -/* - * __las_page_skip -- - * Check if we can skip reading a page with lookaside entries. 
- */ -static inline bool -__las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_TXN *txn; - bool skip; - - txn = &session->txn; - skip = false; - - if (!__wt_atomic_casv32(&ref->state, WT_REF_LOOKASIDE, WT_REF_LOCKED)) - return (false); - - /* - * Skip lookaside pages if reading without a timestamp and all the - * updates in lookaside are in the past. - * - * If we skip a lookaside page, the tree cannot be left clean: - * lookaside entries must be resolved before the tree can be discarded. - * - * Lookaside eviction preferentially chooses the newest updates when - * creating page image with no stable timestamp. If a stable timestamp - * has been set, we have to visit the page because eviction chooses old - * version of records in that case. - * - * One case where we may need to visit the page is if lookaside - * eviction is active in tree 2 when a checkpoint has started and is - * working its way through tree 1. In that case, lookaside may have - * created a page image with updates in the future of the checkpoint. - */ - if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) - goto done; - - if (WT_TXNID_LE(txn->snap_min, ref->page_las->las_max_txn)) - goto done; - - if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) && - ref->page_las->las_skew_newest) { - skip = true; - goto done; - } - -#ifdef HAVE_TIMESTAMPS - /* - * Skip lookaside pages if reading as of a timestamp and all the - * updates are in the future. - */ - WT_ASSERT(session, - !F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) || - __wt_timestamp_cmp(&ref->page_las->onpage_timestamp, - &session->txn.read_timestamp) <= 0); - - if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && - !ref->page_las->las_skew_newest && - __wt_timestamp_cmp( - &ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0) { - skip = true; - goto done; - } -#endif - -done: WT_PUBLISH(ref->state, WT_REF_LOOKASIDE); - return (skip); -} - /* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, @@ -540,6 +647,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_DECL_RET; WT_PAGE *page; uint64_t sleep_cnt, wait_cnt; + uint32_t current_state; int force_attempts; bool busy, cache_work, did_read, stalled, wont_need; @@ -559,7 +667,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags for (did_read = wont_need = stalled = false, force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { - switch (ref->state) { + switch (current_state = ref->state) { case WT_REF_DELETED: if (LF_ISSET(WT_READ_NO_EMPTY) && __wt_delete_page_skip(session, ref, false)) @@ -569,6 +677,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags if (LF_ISSET(WT_READ_CACHE)) { if (!LF_ISSET(WT_READ_LOOKASIDE)) return (WT_NOTFOUND); + /* + * If we skip a lookaside page, the tree + * cannot be left clean: lookaside entries + * must be resolved before the tree can be + * discarded. + */ if (__las_page_skip(session, ref)) { __wt_tree_modify_set(session); return (WT_NOTFOUND); @@ -628,6 +742,7 @@ read: /* break; case WT_REF_SPLIT: return (WT_RESTART); + case WT_REF_LIMBO: case WT_REF_MEM: /* * The page is in memory. @@ -653,6 +768,22 @@ read: /* WT_STAT_CONN_INCR(session, page_busy_blocked); break; } + /* + * If we are a limbo page check whether we need to + * instantiate the history. By having a hazard pointer + * we can use the locked version. 
+ */ + if (current_state == WT_REF_LIMBO && + ((!LF_ISSET(WT_READ_CACHE) || + LF_ISSET(WT_READ_LOOKASIDE)) && + !__las_page_skip_locked(session, ref))) { + WT_RET(__wt_hazard_clear(session, ref)); + goto read; + } + if (current_state == WT_REF_LIMBO && + LF_ISSET(WT_READ_CACHE) && + LF_ISSET(WT_READ_LOOKASIDE)) + __wt_tree_modify_set(session); /* * Check if the page requires forced eviction. @@ -767,46 +898,3 @@ skip_evict: /* WT_STAT_CONN_INCRV(session, page_sleep, sleep_cnt); } } - -/* - * __btree_verbose_lookaside_read -- - * Create a verbose message to display at most once per checkpoint when - * performing a lookaside table read. - */ -static void -__btree_verbose_lookaside_read( - WT_SESSION_IMPL *session, uint32_t las_id, uint64_t las_pageid) -{ - WT_CACHE *cache; - uint64_t ckpt_gen_current, ckpt_gen_last; - - if (!WT_VERBOSE_ISSET(session, - WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY)) - return; - - cache = S2C(session)->cache; - ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT); - ckpt_gen_last = cache->las_verb_gen_read; - - /* - * This message is throttled to one per checkpoint. To do this we - * track the generation of the last checkpoint for which the message - * was printed and check against the current checkpoint generation. - */ - if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE) || - ckpt_gen_current > ckpt_gen_last) { - /* - * Attempt to atomically replace the last checkpoint generation - * for which this message was printed. If the atomic swap fails - * we have raced and the winning thread will print the message. - */ - if (__wt_atomic_casv64(&cache->las_verb_gen_read, - ckpt_gen_last, ckpt_gen_current)) { - __wt_verbose(session, - WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY, - "Read from lookaside file triggered for " - "file ID %" PRIu32 ", page ID %" PRIu64, - las_id, las_pageid); - } - } -} diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index d1fc684e208..7d0da631e2b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -213,27 +213,43 @@ __wt_value_return_upd(WT_SESSION_IMPL *session, } /* - * If we hit the end of the chain, roll forward from the update item we - * found, otherwise, from the original page's value. + * If there's no visible update and we skipped a birthmark, the base + * item is an empty item (in other words, birthmarks we can't read act + * as tombstones). + * If there's no visible update and we didn't skip a birthmark, the base + * item is the on-page item, which must be globally visible. + * If there's a visible update and it's a tombstone, the base item is an + * empty item. + * If there's a visible update and it's not a tombstone, the base item + * is the on-page item. */ - if (upd == NULL && !skipped_birthmark) { - /* - * Callers of this function set the cursor slot to an impossible - * value to check we're not trying to return on-page values when - * the update list should have been sufficient (which happens, - * for example, if an update list was truncated, deleting some - * standard update required by a previous modify update). Assert - * the case. 
- */ - WT_ASSERT(session, cbt->slot != UINT32_MAX); + if (upd == NULL) { + if (skipped_birthmark) + WT_ERR(__wt_buf_set(session, &cursor->value, "", 0)); + else { + /* + * Callers of this function set the cursor slot to an + * impossible value to check we don't try and return + * on-page values when the update list should have been + * sufficient (which happens, for example, if an update + * list was truncated, deleting some standard update + * required by a previous modify update). Assert the + * case. + */ + WT_ASSERT(session, cbt->slot != UINT32_MAX); - WT_ERR(__value_return(session, cbt)); - } else if (upd->type == WT_UPDATE_TOMBSTONE || skipped_birthmark) + WT_ERR(__value_return(session, cbt)); + } + } else if (upd->type == WT_UPDATE_TOMBSTONE) WT_ERR(__wt_buf_set(session, &cursor->value, "", 0)); else WT_ERR(__wt_buf_set(session, &cursor->value, upd->data, upd->size)); + /* + * Once we have a base item, roll forward through any visible modify + * updates. + */ while (i > 0) WT_ERR(__wt_modify_apply(session, cursor, listp[--i]->data)); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 2719481aa86..36bbe48b407 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -757,16 +757,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * on the just-updated parent page's index. */ if (discard) { - /* - * Page-delete information is only read when the WT_REF state is - * WT_REF_DELETED. The page-delete memory wasn't added to the - * parent's footprint, ignore it here. - */ - if (ref->page_del != NULL) { - __wt_free(session, ref->page_del->update_list); - __wt_free(session, ref->page_del); - } - /* * Set the discarded WT_REF state to split, ensuring we don't * race with any discard of the WT_REF deleted fields. @@ -843,12 +833,18 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, } /* - * If this page was fast-truncated, any attached structure - * should have been freed before now. + * The page-delete and lookaside memory weren't added to the + * parent's footprint, ignore it here. */ - WT_ASSERT(session, next_ref->page_del == NULL); + if (next_ref->page_del != NULL) { + __wt_free(session, next_ref->page_del->update_list); + __wt_free(session, next_ref->page_del); + } + __wt_free(session, next_ref->page_las); + /* Free the backing block and address. */ WT_TRET(__wt_ref_block_free(session, next_ref)); + WT_TRET(__split_safe_free( session, split_gen, exclusive, next_ref, sizeof(WT_REF))); parent_decr += sizeof(WT_REF); @@ -1574,7 +1570,7 @@ __split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref) /* * __wt_multi_to_ref -- - * Move a multi-block list into an array of WT_REF structures. + * Move a multi-block entry into a WT_REF structure. */ int __wt_multi_to_ref(WT_SESSION_IMPL *session, @@ -2261,9 +2257,13 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) * * Pages with unresolved changes are not marked clean during * reconciliation, do it now. + * + * Don't count this as eviction making progress, we did a one-for-one + * rewrite of a page in memory, typical in the case of cache pressure. */ __wt_page_modify_clear(session, page); - __wt_ref_out_int(session, ref, true); + F_SET_ATOMIC(page, WT_PAGE_EVICT_NO_PROGRESS); + __wt_ref_out(session, ref); /* Swap the new page into place. 
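
The rewritten __wt_value_return_upd above first settles on a base item (an empty item for tombstones and unreadable birthmarks, otherwise the on-page value or a full update), then rolls visible modify updates forward over it, oldest first. The idea can be sketched with a toy patch list; the structure and helper below are illustrative only and, unlike __wt_modify_apply, the simplification assumes patches never change the value's length.

#include <stdio.h>
#include <string.h>

struct patch {
    size_t offset;      /* where in the value the patch applies */
    const char *data;   /* replacement bytes */
    size_t size;        /* number of bytes replaced */
};

/* Apply patches to a base value, oldest first. */
static void
apply_patches(char *value, size_t value_size,
    const struct patch *patches, size_t npatches)
{
    size_t i;

    for (i = 0; i < npatches; ++i)
        if (patches[i].offset + patches[i].size <= value_size)
            memcpy(value + patches[i].offset,
                patches[i].data, patches[i].size);
}

int
main(void)
{
    char value[] = "base-value";
    const struct patch patches[] = {
        { 0, "Base", 4 },   /* older update */
        { 5, "VALUE", 5 },  /* newer update */
    };

    apply_patches(value, sizeof(value) - 1, patches, 2);
    printf("%s\n", value);  /* prints "Base-VALUE" */
    return (0);
}
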
*/ ref->page = new->page; diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 423b569f0b7..8600c7d6555 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -138,7 +138,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT); if (timer) - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); switch (syncop) { case WT_SYNC_WRITE_LEAVES: @@ -330,7 +330,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) } if (timer) { - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); __wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote: %" PRIu64 " leaf pages (%" PRIu64 "B), %" PRIu64 @@ -338,7 +338,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_pages, leaf_bytes, internal_pages, internal_bytes, - WT_TSCDIFF_MS(time_stop, time_start)); + WT_CLOCKDIFF_MS(time_stop, time_start)); } err: /* On error, clear any left-over tree walk. */ diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index 3d5e9a3540f..eef790d7459 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -470,7 +470,8 @@ restart: /* * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && - ref->state != WT_REF_MEM) + ref->state != WT_REF_MEM && + ref->state != WT_REF_LIMBO) break; /* Skip lookaside pages if not requested. */ @@ -663,8 +664,8 @@ __wt_tree_walk_count(WT_SESSION_IMPL *session, int __wt_tree_walk_custom_skip( WT_SESSION_IMPL *session, WT_REF **refp, - int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), - void *func_cookie, uint32_t flags) + int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), + void *func_cookie, uint32_t flags) { return (__tree_walk_internal( session, refp, NULL, skip_func, func_cookie, flags)); diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index 79fc06b7312..7270c49a9f5 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -67,6 +67,9 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, } } + /* We're going to modify the page, we should have loaded history. */ + WT_ASSERT(session, cbt->ref->state != WT_REF_LIMBO); + /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); mod = page->modify; diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index e3b9e492d78..8b1e4d78f54 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -61,6 +61,9 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, upd = upd_arg; logged = false; + /* We're going to modify the page, we should have loaded history. */ + WT_ASSERT(session, cbt->ref->state != WT_REF_LIMBO); + /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); mod = page->modify; @@ -357,24 +360,3 @@ __wt_update_obsolete_check( return (NULL); } - -/* - * __wt_update_obsolete_free -- - * Free an obsolete update list. 
- */ -void -__wt_update_obsolete_free( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd) -{ - WT_UPDATE *next; - size_t size; - - /* Free a WT_UPDATE list. */ - for (size = 0; upd != NULL; upd = next) { - next = upd->next; - size += WT_UPDATE_MEMSIZE(upd); - __wt_free(session, upd); - } - if (size != 0) - __wt_cache_page_inmem_decr(session, page, size); -} diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index dc7ea1b7438..5fa46cb7fb2 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -341,7 +341,8 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) char hex_timestamp[9]; /* Enough for disabled string */ #endif uint64_t ckpt_gen_current, ckpt_gen_last; - uint32_t btree_id, pct_dirty, pct_full; + uint32_t btree_id; + double pct_dirty, pct_full; btree_id = S2BT(session)->id; @@ -379,8 +380,8 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) "file ID %" PRIu32 ", page ID %" PRIu64 ". " "Max txn ID %" PRIu64 ", min timestamp %s, skewed %s. " "Entries now in lookaside file: %" PRId64 ", " - "cache dirty: %" PRIu32 "%% , " - "cache use: %" PRIu32 "%%", + "cache dirty: %2.3f%% , " + "cache use: %2.3f%%", btree_id, multi->page_las.las_pageid, multi->page_las.las_max_txn, hex_timestamp, diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index da11fa8c98a..a70b8f86648 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -149,16 +149,18 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { NULL, NULL, confchk_wiredtiger_open_eviction_subconfigs, 2 }, { "eviction_checkpoint_target", "int", - NULL, "min=0,max=99", + NULL, "min=0,max=10TB", NULL, 0 }, { "eviction_dirty_target", "int", - NULL, "min=1,max=99", + NULL, "min=1,max=10TB", NULL, 0 }, { "eviction_dirty_trigger", "int", - NULL, "min=1,max=99", + NULL, "min=1,max=10TB", + NULL, 0 }, + { "eviction_target", "int", NULL, "min=10,max=10TB", NULL, 0 }, + { "eviction_trigger", "int", + NULL, "min=10,max=10TB", NULL, 0 }, - { "eviction_target", "int", NULL, "min=10,max=99", NULL, 0 }, - { "eviction_trigger", "int", NULL, "min=10,max=99", NULL, 0 }, { "file_manager", "category", NULL, NULL, confchk_wiredtiger_open_file_manager_subconfigs, 3 }, @@ -802,16 +804,18 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { NULL, NULL, confchk_wiredtiger_open_eviction_subconfigs, 2 }, { "eviction_checkpoint_target", "int", - NULL, "min=0,max=99", + NULL, "min=0,max=10TB", NULL, 0 }, { "eviction_dirty_target", "int", - NULL, "min=1,max=99", + NULL, "min=1,max=10TB", NULL, 0 }, { "eviction_dirty_trigger", "int", - NULL, "min=1,max=99", + NULL, "min=1,max=10TB", + NULL, 0 }, + { "eviction_target", "int", NULL, "min=10,max=10TB", NULL, 0 }, + { "eviction_trigger", "int", + NULL, "min=10,max=10TB", NULL, 0 }, - { "eviction_target", "int", NULL, "min=10,max=99", NULL, 0 }, - { "eviction_trigger", "int", NULL, "min=10,max=99", NULL, 0 }, { "exclusive", "boolean", NULL, NULL, NULL, 0 }, { "extensions", "list", NULL, NULL, NULL, 0 }, { "file_extend", "list", @@ -900,16 +904,18 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { NULL, NULL, confchk_wiredtiger_open_eviction_subconfigs, 2 }, { "eviction_checkpoint_target", "int", - NULL, "min=0,max=99", + NULL, "min=0,max=10TB", NULL, 0 }, { "eviction_dirty_target", 
"int", - NULL, "min=1,max=99", + NULL, "min=1,max=10TB", NULL, 0 }, { "eviction_dirty_trigger", "int", - NULL, "min=1,max=99", + NULL, "min=1,max=10TB", + NULL, 0 }, + { "eviction_target", "int", NULL, "min=10,max=10TB", NULL, 0 }, + { "eviction_trigger", "int", + NULL, "min=10,max=10TB", NULL, 0 }, - { "eviction_target", "int", NULL, "min=10,max=99", NULL, 0 }, - { "eviction_trigger", "int", NULL, "min=10,max=99", NULL, 0 }, { "exclusive", "boolean", NULL, NULL, NULL, 0 }, { "extensions", "list", NULL, NULL, NULL, 0 }, { "file_extend", "list", @@ -997,16 +1003,18 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { NULL, NULL, confchk_wiredtiger_open_eviction_subconfigs, 2 }, { "eviction_checkpoint_target", "int", - NULL, "min=0,max=99", + NULL, "min=0,max=10TB", NULL, 0 }, { "eviction_dirty_target", "int", - NULL, "min=1,max=99", + NULL, "min=1,max=10TB", NULL, 0 }, { "eviction_dirty_trigger", "int", - NULL, "min=1,max=99", + NULL, "min=1,max=10TB", + NULL, 0 }, + { "eviction_target", "int", NULL, "min=10,max=10TB", NULL, 0 }, + { "eviction_trigger", "int", + NULL, "min=10,max=10TB", NULL, 0 }, - { "eviction_target", "int", NULL, "min=10,max=99", NULL, 0 }, - { "eviction_trigger", "int", NULL, "min=10,max=99", NULL, 0 }, { "extensions", "list", NULL, NULL, NULL, 0 }, { "file_extend", "list", NULL, "choices=[\"data\",\"log\"]", @@ -1090,16 +1098,18 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { NULL, NULL, confchk_wiredtiger_open_eviction_subconfigs, 2 }, { "eviction_checkpoint_target", "int", - NULL, "min=0,max=99", + NULL, "min=0,max=10TB", NULL, 0 }, { "eviction_dirty_target", "int", - NULL, "min=1,max=99", + NULL, "min=1,max=10TB", NULL, 0 }, { "eviction_dirty_trigger", "int", - NULL, "min=1,max=99", + NULL, "min=1,max=10TB", + NULL, 0 }, + { "eviction_target", "int", NULL, "min=10,max=10TB", NULL, 0 }, + { "eviction_trigger", "int", + NULL, "min=10,max=10TB", NULL, 0 }, - { "eviction_target", "int", NULL, "min=10,max=99", NULL, 0 }, - { "eviction_trigger", "int", NULL, "min=10,max=99", NULL, 0 }, { "extensions", "list", NULL, NULL, NULL, 0 }, { "file_extend", "list", NULL, "choices=[\"data\",\"log\"]", diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c index 4f9f160ae3f..871190380f7 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache.c @@ -8,6 +8,47 @@ #include "wt_internal.h" +/* + * __cache_config_abs_to_pct -- + * Cache configuration values can be either a percentage or an absolute + * size, this function converts an absolute size to a percentage. + */ +static inline int +__cache_config_abs_to_pct(WT_SESSION_IMPL *session, + double *param, const char *param_name, bool shared) +{ + WT_CONNECTION_IMPL *conn; + double input; + + conn = S2C(session); + + WT_ASSERT(session, param != NULL); + input = *param; + + /* + * Anything above 100 is an absolute value; convert it to percentage. + */ + if (input > 100.0) { + /* + * In a shared cache configuration the cache size changes + * regularly. Therefore, we require a percentage setting and do + * not allow an absolute size setting. + */ + if (shared) + WT_RET_MSG(session, EINVAL, + "Shared cache configuration requires a percentage " + "value for %s", param_name); + /* An absolute value can't exceed the cache size. 
*/ + if (input > conn->cache_size) + WT_RET_MSG(session, EINVAL, + "%s should not exceed cache size", param_name); + + *param = (input * 100.0) / (conn->cache_size); + } + + return (0); +} + /* * __cache_config_local -- * Configure the underlying cache. @@ -37,17 +78,26 @@ __cache_config_local(WT_SESSION_IMPL *session, bool shared, const char *cfg[]) cache->overhead_pct = (u_int)cval.val; WT_RET(__wt_config_gets(session, cfg, "eviction_target", &cval)); - cache->eviction_target = (u_int)cval.val; + cache->eviction_target = (double)cval.val; + WT_RET(__cache_config_abs_to_pct( + session, &(cache->eviction_target), "eviction target", shared)); WT_RET(__wt_config_gets(session, cfg, "eviction_trigger", &cval)); - cache->eviction_trigger = (u_int)cval.val; + cache->eviction_trigger = (double)cval.val; + WT_RET(__cache_config_abs_to_pct( + session, &(cache->eviction_trigger), "eviction trigger", shared)); WT_RET(__wt_config_gets( session, cfg, "eviction_checkpoint_target", &cval)); - cache->eviction_checkpoint_target = (u_int)cval.val; + cache->eviction_checkpoint_target = (double)cval.val; + WT_RET(__cache_config_abs_to_pct(session, + &(cache->eviction_checkpoint_target), + "eviction checkpoint target", shared)); WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_target", &cval)); - cache->eviction_dirty_target = (u_int)cval.val; + cache->eviction_dirty_target = (double)cval.val; + WT_RET(__cache_config_abs_to_pct(session, + &(cache->eviction_dirty_target), "eviction dirty target", shared)); /* * Don't allow the dirty target to be larger than the overall @@ -66,7 +116,10 @@ __cache_config_local(WT_SESSION_IMPL *session, bool shared, const char *cfg[]) cache->eviction_dirty_target; WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_trigger", &cval)); - cache->eviction_dirty_trigger = (u_int)cval.val; + cache->eviction_dirty_trigger = (double)cval.val; + WT_RET(__cache_config_abs_to_pct(session, + &(cache->eviction_dirty_trigger), "eviction dirty trigger", + shared)); /* * Don't allow the dirty trigger to be larger than the overall diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c index eefb50902f4..720df3c465d 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -572,8 +572,8 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, WT_CACHE *cache; WT_CACHE_POOL *cp; WT_CONNECTION_IMPL *entry; + double pct_full; uint64_t adjustment, highest_percentile, pressure, reserved, smallest; - u_int pct_full; bool busy, decrease_ok, grow, pool_full; *adjustedp = false; @@ -581,7 +581,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, cp = __wt_process.cache_pool; grow = false; pool_full = cp->currently_used >= cp->size; - pct_full = 0; + pct_full = 0.0; /* Highest as a percentage, avoid 0 */ highest_percentile = (highest / 100) + 1; @@ -613,7 +613,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, entry->default_session, false, true, &pct_full); __wt_verbose(session, WT_VERB_SHARED_CACHE, - "\t%5" PRIu64 ", %3" PRIu64 ", %2" PRIu32 ", %d, %2u", + "\t%5" PRIu64 ", %3" PRIu64 ", %2" PRIu32 ", %d, %2.3f", entry->cache_size >> 20, pressure, cache->cp_skip_count, busy, pct_full); @@ -676,8 +676,9 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, * potentially a negative feedback loop in the * balance algorithm. 
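
__cache_config_abs_to_pct above converts any absolute eviction setting into a percentage of cache_size at configuration time, and rejects absolute values on a shared cache or values larger than the cache. On the user-facing side this might look like the following sketch, which assumes a regular (non-shared) 10GB cache; the helper name is hypothetical.

#include <wiredtiger.h>

/*
 * Sketch: switch a connection that uses a regular 10GB cache to absolute
 * eviction settings at runtime. eviction_dirty_target=1GB is converted
 * internally to 10% of the cache; an absolute value larger than the cache,
 * or any absolute value on a shared cache, is rejected with EINVAL.
 */
static int
use_absolute_eviction_targets(WT_CONNECTION *conn)
{
    return (conn->reconfigure(conn,
        "eviction_dirty_target=1GB,eviction_dirty_trigger=2GB"));
}
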
*/ - smallest = (100 * __wt_cache_bytes_inuse(cache)) / - cache->eviction_trigger; + smallest = + (uint64_t)((100 * __wt_cache_bytes_inuse(cache)) / + cache->eviction_trigger); if (entry->cache_size > smallest) adjustment = WT_MIN(cp->chunk, (entry->cache_size - smallest) / 2); diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 7dea0a3fe4b..9097e10ef5a 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -949,11 +949,11 @@ __log_server(void *arg) } /* Wait until the next event. */ - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); __wt_cond_auto_wait_signal( session, conn->log_cond, did_work, NULL, &signalled); - time_stop = __wt_rdtsc(session); - timediff = WT_TSCDIFF_MS(time_stop, time_start); + time_stop = __wt_clock(session); + timediff = WT_CLOCKDIFF_MS(time_stop, time_start); } if (0) { diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c index e27de84254e..14a1570c138 100644 --- a/src/third_party/wiredtiger/src/conn/conn_stat.c +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -503,7 +503,6 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) struct timespec ts; struct tm *tm, _tm; WT_CONNECTION_IMPL *conn; - WT_FSTREAM *log_stream; conn = S2C(session); @@ -516,17 +515,16 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) WT_RET_MSG(session, ENOMEM, "strftime path conversion"); /* If the path has changed, cycle the log file. */ - if ((log_stream = conn->stat_fs) == NULL || + if (conn->stat_fs == NULL || path == NULL || strcmp(tmp->mem, path->mem) != 0) { WT_RET(__wt_fclose(session, &conn->stat_fs)); - if (path != NULL) - WT_RET( - __wt_buf_set(session, path, tmp->data, tmp->size)); WT_RET(__wt_fopen(session, tmp->mem, WT_FS_OPEN_CREATE | WT_FS_OPEN_FIXED, WT_STREAM_APPEND, - &log_stream)); + &conn->stat_fs)); + + if (path != NULL) + WT_RET(__wt_buf_setstr(session, path, tmp->mem)); } - conn->stat_fs = log_stream; /* Create the entry prefix for this time of day. */ if (strftime(tmp->mem, tmp->memsize, conn->stat_format, tm) == 0) @@ -583,6 +581,7 @@ __statlog_on_close(WT_SESSION_IMPL *session) "Attempt to log statistics while a server is running"); WT_RET(__wt_scr_alloc(session, strlen(conn->stat_path) + 128, &tmp)); + WT_ERR(__wt_buf_setstr(session, tmp, "")); WT_ERR(__statlog_log_one(session, NULL, tmp)); err: __wt_scr_free(session, &tmp); @@ -614,9 +613,6 @@ __statlog_server(void *arg) session = arg; conn = S2C(session); - WT_CLEAR(path); - WT_CLEAR(tmp); - /* * We need a temporary place to build a path and an entry prefix. * The length of the path plus 128 should be more than enough. @@ -624,8 +620,12 @@ __statlog_server(void *arg) * We also need a place to store the current path, because that's * how we know when to close/re-open the file. */ + WT_CLEAR(path); WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128)); + WT_ERR(__wt_buf_setstr(session, &path, "")); + WT_CLEAR(tmp); WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128)); + WT_ERR(__wt_buf_setstr(session, &tmp, "")); for (;;) { /* Wait until the next event. 
*/ diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index 5f68ad3883e..9d6f031807f 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -196,11 +196,11 @@ __curfile_search(WT_CURSOR *cursor) CURSOR_API_CALL(cursor, session, search, cbt->btree); WT_ERR(__cursor_checkkey(cursor)); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__wt_btcur_search(cbt)); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); __wt_stat_usecs_hist_incr_opread(session, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); /* Search maintains a position, key and value. */ WT_ASSERT(session, @@ -227,11 +227,11 @@ __curfile_search_near(WT_CURSOR *cursor, int *exact) CURSOR_API_CALL(cursor, session, search_near, cbt->btree); WT_ERR(__cursor_checkkey(cursor)); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__wt_btcur_search_near(cbt, exact)); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); __wt_stat_usecs_hist_incr_opread(session, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); /* Search-near maintains a position, key and value. */ WT_ASSERT(session, @@ -261,11 +261,11 @@ __curfile_insert(WT_CURSOR *cursor) WT_ERR(__cursor_checkkey(cursor)); WT_ERR(__cursor_checkvalue(cursor)); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__wt_btcur_insert(cbt)); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); __wt_stat_usecs_hist_incr_opwrite(session, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); /* * Insert maintains no position, key or value (except for column-store @@ -362,11 +362,11 @@ __curfile_update(WT_CURSOR *cursor) WT_ERR(__cursor_checkkey(cursor)); WT_ERR(__cursor_checkvalue(cursor)); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__wt_btcur_update(cbt)); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); __wt_stat_usecs_hist_incr_opwrite(session, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); /* Update maintains a position, key and value. */ WT_ASSERT(session, @@ -394,11 +394,11 @@ __curfile_remove(WT_CURSOR *cursor) CURSOR_REMOVE_API_CALL(cursor, session, cbt->btree); WT_ERR(__cursor_checkkey(cursor)); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__wt_btcur_remove(cbt)); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); __wt_stat_usecs_hist_incr_opwrite(session, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); /* * Remove with a search-key is fire-and-forget, no position and no key. 
diff --git a/src/third_party/wiredtiger/src/docs/top/main.dox b/src/third_party/wiredtiger/src/docs/top/main.dox index 1bfb623c0a0..e4de22ff042 100644 --- a/src/third_party/wiredtiger/src/docs/top/main.dox +++ b/src/third_party/wiredtiger/src/docs/top/main.dox @@ -6,12 +6,12 @@ WiredTiger is an high performance, scalable, production quality, NoSQL, @section releases Releases -@row{WiredTiger 2.9.3 (current), +@row{WiredTiger 3.0.0 (current), + [Release package], + [Documentation]} +@row{WiredTiger 2.9.3 (previous), [Release package], [Documentation]} -@row{WiredTiger 2.9.2 (previous), - [Release package], - [Documentation]} @row{Development branch, [Source code], [Documentation]} diff --git a/src/third_party/wiredtiger/src/docs/upgrading.dox b/src/third_party/wiredtiger/src/docs/upgrading.dox index 09cafbf480d..2e4990e8a33 100644 --- a/src/third_party/wiredtiger/src/docs/upgrading.dox +++ b/src/third_party/wiredtiger/src/docs/upgrading.dox @@ -28,6 +28,16 @@ The performance visualization tool \c wtstats has been removed and is no longer supported. +
::wiredtiger_open cache configuration changes
+
+The cache configuration options \c eviction_checkpoint_target, \c +eviction_dirty_target, \c eviction_dirty_trigger, \c eviction_target and \c +eviction_trigger have changed. These options can now also accept an absolute +size: a value in the range 0 to 100 is interpreted as a percentage of the cache +size, and a value greater than 100 is interpreted as an absolute size. This API +change is compatible with existing usage.
+
@section version_292 Upgrading to Version 2.9.2
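For illustration of the semantics described above, assuming a hypothetical database path and placeholder sizes (none of this code is part of the patch): with a 10GB cache, a setting of 20 still means 20% of the cache, while a setting such as 2GB is treated as an absolute size and converted internally to a percentage of cache_size (here, 20%).

#include <wiredtiger.h>

static int
open_with_absolute_eviction_target(const char *home, WT_CONNECTION **connp)
{
	/*
	 * Hypothetical configuration: eviction_dirty_target is given as an
	 * absolute size (2GB of a 10GB cache, converted to 20%), while the
	 * trigger settings remain percentages.
	 */
	return (wiredtiger_open(home, NULL,
	    "create,cache_size=10GB,"
	    "eviction_dirty_target=2GB,eviction_dirty_trigger=30,"
	    "eviction_trigger=95", connp));
}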
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 9e46f24ca7f..39c84764070 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -557,11 +557,17 @@ __evict_update_work(WT_SESSION_IMPL *session) WT_BTREE *las_tree; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; + double dirty_target, dirty_trigger, target, trigger; uint64_t bytes_inuse, bytes_max, dirty_inuse; conn = S2C(session); cache = conn->cache; + dirty_target = cache->eviction_dirty_target; + dirty_trigger = cache->eviction_dirty_trigger; + target = cache->eviction_target; + trigger = cache->eviction_trigger; + /* Clear previous state. */ cache->flags = 0; @@ -589,13 +595,13 @@ __evict_update_work(WT_SESSION_IMPL *session) bytes_inuse = __wt_cache_bytes_inuse(cache); if (__wt_eviction_clean_needed(session, NULL)) F_SET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); - else if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) + else if (bytes_inuse > (target * bytes_max) / 100) F_SET(cache, WT_CACHE_EVICT_CLEAN); dirty_inuse = __wt_cache_dirty_leaf_inuse(cache); if (__wt_eviction_dirty_needed(session, NULL)) F_SET(cache, WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD); - else if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) + else if (dirty_inuse > (uint64_t)(dirty_target * bytes_max) / 100) F_SET(cache, WT_CACHE_EVICT_DIRTY); /* @@ -610,10 +616,9 @@ __evict_update_work(WT_SESSION_IMPL *session) * Scrub dirty pages and keep them in cache if we are less than half * way to the clean or dirty trigger. */ - if (bytes_inuse < ((cache->eviction_target + cache->eviction_trigger) * - bytes_max) / 200 && dirty_inuse < (uint64_t) - ((cache->eviction_dirty_target + cache->eviction_dirty_trigger) * - bytes_max) / 200) + if (bytes_inuse < (uint64_t)((target + trigger) * bytes_max) / 200 && + dirty_inuse < + (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200) F_SET(cache, WT_CACHE_EVICT_SCRUB); /* @@ -626,9 +631,8 @@ __evict_update_work(WT_SESSION_IMPL *session) if (!F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) && (__wt_cache_stuck(session) || (__wt_cache_lookaside_score(cache) > 80 && - dirty_inuse > (uint64_t) - ((cache->eviction_dirty_target + cache->eviction_dirty_trigger) * - bytes_max) / 200))) + dirty_inuse > + (uint64_t)((dirty_target + dirty_trigger) * bytes_max) / 200))) F_SET(cache, WT_CACHE_EVICT_LOOKASIDE); /* @@ -671,7 +675,7 @@ __evict_pass(WT_SESSION_IMPL *session) /* Evict pages from the cache. */ for (loop = 0; cache->pass_intr == 0; loop++) { - time_now = __wt_rdtsc(session); + time_now = __wt_clock(session); if (loop == 0) time_prev = time_now; @@ -741,7 +745,7 @@ __evict_pass(WT_SESSION_IMPL *session) * transactions and writing updates to the lookaside table. */ if (eviction_progress == cache->eviction_progress) { - if (WT_TSCDIFF_MS(time_now, time_prev) >= 20 && + if (WT_CLOCKDIFF_MS(time_now, time_prev) >= 20 && F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD | WT_CACHE_EVICT_DIRTY_HARD)) { if (cache->evict_aggressive_score < 100) @@ -2049,12 +2053,13 @@ fast: /* If the page can't be evicted, give up. */ * point keeping a page pinned, since it may be the only candidate in * an idle tree. * - * If we land on a page requiring forced eviction, move on to the next - * page: we want this page evicted as quickly as possible. 
+ * If we land on a page requiring forced eviction, or that isn't an + * ordinary in-memory page (e.g., WT_REF_LIMBO), move until we find an + * ordinary page: we should not prevent exclusive access to the page + * until the next walk. */ if (ref != NULL) { if (__wt_ref_is_root(ref) || evict == start || give_up || - WT_READGEN_EVICT_SOON(ref->page->read_gen) || ref->page->memory_footprint >= btree->splitmempage) { if (restarts == 0) WT_STAT_CONN_INCR( @@ -2062,9 +2067,11 @@ fast: /* If the page can't be evicted, give up. */ WT_RET(__wt_page_release( cache->walk_session, ref, walk_flags)); ref = NULL; - } else if (WT_READGEN_EVICT_SOON(ref->page->read_gen)) - WT_RET_NOTFOUND_OK(__wt_tree_walk_count( - session, &ref, &refs_walked, walk_flags)); + } else + while (ref != NULL && (ref->state != WT_REF_MEM || + WT_READGEN_EVICT_SOON(ref->page->read_gen))) + WT_RET_NOTFOUND_OK(__wt_tree_walk_count( + session, &ref, &refs_walked, walk_flags)); btree->evict_ref = ref; } @@ -2088,7 +2095,7 @@ __evict_get_ref( WT_CACHE *cache; WT_EVICT_ENTRY *evict; WT_EVICT_QUEUE *queue, *other_queue, *urgent_queue; - uint32_t candidates; + uint32_t candidates, previous_state; bool is_app, server_only, urgent_ok; *btreep = NULL; @@ -2213,8 +2220,10 @@ __evict_get_ref( * multiple attempts to evict it. For pages that are already * being evicted, this operation will fail and we will move on. */ - if (!__wt_atomic_casv32( - &evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) { + if (((previous_state = evict->ref->state) != WT_REF_MEM && + previous_state != WT_REF_LIMBO) || + !__wt_atomic_casv32( + &evict->ref->state, previous_state, WT_REF_LOCKED)) { __evict_list_clear(session, evict); continue; } @@ -2289,7 +2298,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) cache->app_evicts++; if (WT_STAT_ENABLED(session)) { app_timer = true; - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); } } @@ -2309,10 +2318,10 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) (void)__wt_atomic_subv32(&btree->evict_busy, 1); if (app_timer) { - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); WT_STAT_CONN_INCRV(session, application_evict_time, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); } WT_TRACK_OP_END(session); return (ret); @@ -2325,7 +2334,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) */ int __wt_cache_eviction_worker( - WT_SESSION_IMPL *session, bool busy, bool readonly, u_int pct_full) + WT_SESSION_IMPL *session, bool busy, bool readonly, double pct_full) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; @@ -2348,7 +2357,7 @@ __wt_cache_eviction_worker( * It is not safe to proceed if the eviction server threads aren't * setup yet. */ - if (!conn->evict_server_running || (busy && pct_full < 100)) + if (!conn->evict_server_running || (busy && pct_full < 100.0)) goto done; /* Wake the eviction server if we need to do work. 
*/ @@ -2358,7 +2367,7 @@ __wt_cache_eviction_worker( timer = WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL); if (timer) - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); for (initial_progress = cache->eviction_progress;; ret = 0) { /* @@ -2370,7 +2379,8 @@ __wt_cache_eviction_worker( if (__wt_cache_stuck(session) && __wt_txn_am_oldest(session)) { --cache->evict_aggressive_score; WT_STAT_CONN_INCR(session, txn_fail_cache); - WT_ERR(WT_ROLLBACK); + WT_ERR(__wt_txn_rollback_required(session, + "oldest transaction rolled back for eviction")); } /* @@ -2389,7 +2399,7 @@ __wt_cache_eviction_worker( /* See if eviction is still needed. */ if (!__wt_eviction_needed(session, busy, readonly, &pct_full) || - ((pct_full < 100 || cache->eviction_scrub_limit > 0.0) && + ((pct_full < 100.0 || cache->eviction_scrub_limit > 0.0) && (cache->eviction_progress > initial_progress + max_progress))) break; @@ -2425,10 +2435,10 @@ __wt_cache_eviction_worker( } err: if (timer) { - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); WT_STAT_CONN_INCRV(session, application_cache_time, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); } done: WT_TRACK_OP_END(session); @@ -2573,7 +2583,9 @@ __verbose_dump_cache_single(WT_SESSION_IMPL *session, dhandle->checkpoint != NULL ? dhandle->checkpoint : "", btree->evict_disabled != 0 ? "eviction disabled" : "", btree->evict_disabled_open ? " at open" : "")); - if (intl_pages != 0) + if (intl_pages == 0) + WT_RET(__wt_msg(session, "internal: 0 pages")); + else WT_RET(__wt_msg(session, "internal: " "%" PRIu64 " pages, " @@ -2590,7 +2602,9 @@ __verbose_dump_cache_single(WT_SESSION_IMPL *session, intl_dirty_bytes / WT_MEGABYTE, intl_bytes_max / WT_MEGABYTE, intl_dirty_bytes_max / WT_MEGABYTE)); - if (leaf_pages != 0) + if (leaf_pages == 0) + WT_RET(__wt_msg(session, "leaf: 0 pages")); + else WT_RET(__wt_msg(session, "leaf: " "%" PRIu64 " pages, " @@ -2624,13 +2638,13 @@ __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; + double pct; uint64_t total_bytes, total_dirty_bytes; - u_int pct; bool needed; conn = S2C(session); total_bytes = total_dirty_bytes = 0; - pct = 0; /* [-Werror=uninitialized] */ + pct = 0.0; /* [-Werror=uninitialized] */ WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); WT_RET(__wt_msg(session, "cache dump")); @@ -2639,10 +2653,10 @@ __wt_verbose_dump_cache(WT_SESSION_IMPL *session) "cache full: %s", __wt_cache_full(session) ? "yes" : "no")); needed = __wt_eviction_clean_needed(session, &pct); WT_RET(__wt_msg(session, - "cache clean check: %s (%u%%)", needed ? "yes" : "no", pct)); + "cache clean check: %s (%2.3f%%)", needed ? "yes" : "no", pct)); needed = __wt_eviction_dirty_needed(session, &pct); WT_RET(__wt_msg(session, - "cache dirty check: %s (%u%%)", needed ? "yes" : "no", pct)); + "cache dirty check: %s (%2.3f%%)", needed ? 
"yes" : "no", pct)); for (dhandle = NULL;;) { WT_WITH_HANDLE_LIST_READ_LOCK(session, diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 70c5d6d02da..0ff314f3484 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -60,7 +60,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) btree = S2BT(session); page = ref->page; - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); /* * Take some care with order of operations: if we release the hazard @@ -83,12 +83,12 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) * we have one of two pairs of stats to increment. */ ret = __wt_evict(session, ref, false); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); if (ret == 0) { if (too_big) { WT_STAT_CONN_INCR(session, cache_eviction_force); WT_STAT_CONN_INCRV(session, cache_eviction_force_time, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); } else { /* * If the page isn't too big, we are evicting it because @@ -98,12 +98,12 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_STAT_CONN_INCR(session, cache_eviction_force_delete); WT_STAT_CONN_INCRV(session, cache_eviction_force_delete_time, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); } } else { WT_STAT_CONN_INCR(session, cache_eviction_force_fail); WT_STAT_CONN_INCRV(session, cache_eviction_force_fail_time, - WT_TSCDIFF_US(time_stop, time_start)); + WT_CLOCKDIFF_US(time_stop, time_start)); } (void)__wt_atomic_subv32(&btree->evict_busy, 1); @@ -268,9 +268,16 @@ __evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * Discard the page and update the reference structure; if the page has * an address, it's a disk page; if it has no address, it's a deleted * page re-instantiated (for example, by searching) and never written. + * + * If evicting a WT_REF_LIMBO reference, we get to here and transition + * back to WT_REF_LOOKASIDE. */ __wt_ref_out(session, ref); - if (ref->addr == NULL) { + if (!closing && ref->page_las != NULL && + ref->page_las->eviction_to_lookaside) { + ref->page_las->eviction_to_lookaside = false; + WT_PUBLISH(ref->state, WT_REF_LOOKASIDE); + } else if (ref->addr == NULL) { WT_WITH_PAGE_INDEX(session, ret = __evict_delete_ref(session, ref, closing)); WT_RET_BUSY_OK(ret); @@ -361,6 +368,7 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * Eviction wants to keep this page if we have a disk image, * re-instantiate the page in memory, else discard the page. */ + __wt_free(session, ref->page_las); if (mod->mod_disk_image == NULL) { if (mod->mod_page_las.las_pageid != 0) { WT_RET( diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index a4416f139a0..847d6c5ee01 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -81,25 +81,33 @@ /* An API call wrapped in a transaction if necessary. 
*/ #define TXN_API_CALL(s, h, n, bt, config, cfg) do { \ - bool __autotxn = false; \ + bool __autotxn = false, __update = false; \ API_CALL(s, h, n, bt, config, cfg); \ __wt_txn_timestamp_flags(s); \ __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING);\ if (__autotxn) \ - F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT) + F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT); \ + __update = !F_ISSET(&(s)->txn, WT_TXN_UPDATE); \ + if (__update) \ + F_SET(&(s)->txn, WT_TXN_UPDATE); \ /* An API call wrapped in a transaction if necessary. */ #define TXN_API_CALL_NOCONF(s, h, n, dh) do { \ - bool __autotxn = false; \ + bool __autotxn = false, __update = false; \ API_CALL_NOCONF(s, h, n, dh); \ __wt_txn_timestamp_flags(s); \ __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING);\ if (__autotxn) \ - F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT) + F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT); \ + __update = !F_ISSET(&(s)->txn, WT_TXN_UPDATE); \ + if (__update) \ + F_SET(&(s)->txn, WT_TXN_UPDATE); \ /* End a transactional API call, optional retry on deadlock. */ #define TXN_API_END_RETRY(s, ret, retry) \ API_END(s, ret); \ + if (__update) \ + F_CLR(&(s)->txn, WT_TXN_UPDATE); \ if (__autotxn) { \ if (F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT)) \ F_CLR(&(s)->txn, WT_TXN_AUTOCOMMIT); \ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 39aac8730c4..7fbf27a1fff 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -232,12 +232,14 @@ struct __wt_ovfl_reuse { * Related information for on-disk pages with lookaside entries. */ struct __wt_page_lookaside { - uint64_t las_pageid; /* Page ID in lookaside */ - uint64_t las_max_txn; /* Maximum transaction ID in - lookaside */ - WT_DECL_TIMESTAMP(min_timestamp) /* Min timestamp in lookaside */ - WT_DECL_TIMESTAMP(onpage_timestamp) /* Max timestamp on page */ - bool las_skew_newest; /* On-page skewed to newest */ + uint64_t las_pageid; /* Page ID in lookaside */ + uint64_t las_max_txn; /* Max transaction ID in lookaside */ + WT_DECL_TIMESTAMP(min_timestamp)/* Min timestamp in lookaside */ + /* Max timestamp on page */ + WT_DECL_TIMESTAMP(onpage_timestamp) + bool eviction_to_lookaside; /* Revert to lookaside on eviction */ + bool las_skew_newest; /* On-page skewed to newest */ + bool invalid; /* History is required correct reads */ }; /* @@ -643,8 +645,8 @@ struct __wt_page { #define WT_PAGE_DISK_ALLOC 0x02u /* Disk image in allocated memory */ #define WT_PAGE_DISK_MAPPED 0x04u /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08u /* Page is on the LRU queue */ -#define WT_PAGE_OVERFLOW_KEYS 0x10u /* Page has overflow keys */ -#define WT_PAGE_READ_NO_EVICT 0x20u /* Page read with eviction disabled */ +#define WT_PAGE_EVICT_NO_PROGRESS 0x10u /* Eviction doesn't count as progress */ +#define WT_PAGE_OVERFLOW_KEYS 0x20u /* Page has overflow keys */ #define WT_PAGE_SPLIT_INSERT 0x40u /* A leaf page was split for append */ #define WT_PAGE_UPDATE_IGNORE 0x80u /* Ignore updates on page discard */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ @@ -721,6 +723,10 @@ struct __wt_page { * row-store leaf pages without reading them if they don't reference * overflow items. * + * WT_REF_LIMBO: + * The page image has been loaded into memory but there is additional + * history in the lookaside table that has not been applied. + * * WT_REF_LOCKED: * Locked for exclusive access. 
In eviction, this page or a parent has * been selected for eviction; once hazard pointers are checked, the page @@ -794,11 +800,12 @@ struct __wt_ref { #define WT_REF_DISK 0 /* Page is on disk */ #define WT_REF_DELETED 1 /* Page is on disk, but deleted */ -#define WT_REF_LOCKED 2 /* Page locked for exclusive access */ -#define WT_REF_LOOKASIDE 3 /* Page is on disk with lookaside */ -#define WT_REF_MEM 4 /* Page is in cache and valid */ -#define WT_REF_READING 5 /* Page being read */ -#define WT_REF_SPLIT 6 /* Parent page split (WT_REF dead) */ +#define WT_REF_LIMBO 2 /* Page is in cache without history */ +#define WT_REF_LOCKED 3 /* Page locked for exclusive access */ +#define WT_REF_LOOKASIDE 4 /* Page is on disk with lookaside */ +#define WT_REF_MEM 5 /* Page is in cache and valid */ +#define WT_REF_READING 6 /* Page being read */ +#define WT_REF_SPLIT 7 /* Parent page split (WT_REF dead) */ volatile uint32_t state; /* Page state */ /* @@ -820,16 +827,14 @@ struct __wt_ref { #undef ref_ikey #define ref_ikey key.ikey - union { - WT_PAGE_DELETED *page_del; /* Deleted page information */ - WT_PAGE_LOOKASIDE *page_las; /* Lookaside information */ - }; + WT_PAGE_DELETED *page_del; /* Deleted page information */ + WT_PAGE_LOOKASIDE *page_las; /* Lookaside information */ }; /* * WT_REF_SIZE is the expected structure size -- we verify the build to ensure * the compiler hasn't inserted padding which would break the world. */ -#define WT_REF_SIZE 48 +#define WT_REF_SIZE 56 /* * WT_ROW -- diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index fe9c81f42b2..3a6413162f3 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -257,7 +257,7 @@ __wt_cache_page_byte_dirty_decr( * Take care to read the dirty-byte count only once in case * we're racing with updates. */ - orig = page->modify->bytes_dirty; + WT_ORDERED_READ(orig, page->modify->bytes_dirty); decr = WT_MIN(size, orig); if (__wt_atomic_cassize( &page->modify->bytes_dirty, orig, orig - decr)) @@ -400,7 +400,7 @@ __wt_cache_page_image_incr(WT_SESSION_IMPL *session, uint32_t size) * Evict pages from the cache. */ static inline void -__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page, bool rewrite) +__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_CACHE *cache; @@ -448,17 +448,8 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page, bool rewrite) /* * Track if eviction makes progress. This is used in various places to * determine whether eviction is stuck. - * - * We don't count rewrites as progress. - * - * Further, if a page was read with eviction disabled, we don't count - * evicting a it as progress. Since disabling eviction allows pages to - * be read even when the cache is full, we want to avoid workloads - * repeatedly reading a page with eviction disabled (e.g., from the - * metadata), then evicting that page and deciding that is a sign that - * eviction is unstuck. 
*/ - if (!rewrite && !F_ISSET_ATOMIC(page, WT_PAGE_READ_NO_EVICT)) + if (!F_ISSET_ATOMIC(page, WT_PAGE_EVICT_NO_PROGRESS)) (void)__wt_atomic_addv64(&cache->eviction_progress, 1); } diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index eea0b977515..7a49f388826 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -105,12 +105,16 @@ struct __wt_cache { WT_CONDVAR *evict_cond; /* Eviction server condition */ WT_SPINLOCK evict_walk_lock; /* Eviction walk location */ - u_int eviction_dirty_target; /* Percent to allow dirty */ - u_int eviction_dirty_trigger; /* Percent to trigger dirty eviction */ - u_int eviction_trigger; /* Percent to trigger eviction */ - u_int eviction_target; /* Percent to end eviction */ + /* + * Eviction threshold percentages use double type to allow for + * specifying percentages less than one. + */ + double eviction_dirty_target; /* Percent to allow dirty */ + double eviction_dirty_trigger; /* Percent to trigger dirty eviction */ + double eviction_trigger; /* Percent to trigger eviction */ + double eviction_target; /* Percent to end eviction */ - u_int eviction_checkpoint_target;/* Percent to reduce dirty + double eviction_checkpoint_target;/* Percent to reduce dirty to during checkpoint scrubs */ double eviction_scrub_limit; /* Percent of cache to trigger dirty eviction during checkpoint diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index 57376bb2fde..fc127942d02 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -255,7 +255,7 @@ __wt_session_can_wait(WT_SESSION_IMPL *session) * volume of data in cache. */ static inline bool -__wt_eviction_clean_needed(WT_SESSION_IMPL *session, u_int *pct_fullp) +__wt_eviction_clean_needed(WT_SESSION_IMPL *session, double *pct_fullp) { WT_CACHE *cache; uint64_t bytes_inuse, bytes_max; @@ -270,7 +270,7 @@ __wt_eviction_clean_needed(WT_SESSION_IMPL *session, u_int *pct_fullp) bytes_inuse = __wt_cache_bytes_inuse(cache); if (pct_fullp != NULL) - *pct_fullp = (u_int)((100 * bytes_inuse) / bytes_max); + *pct_fullp = ((100.0 * bytes_inuse) / bytes_max); return (bytes_inuse > (cache->eviction_trigger * bytes_max) / 100); } @@ -281,7 +281,7 @@ __wt_eviction_clean_needed(WT_SESSION_IMPL *session, u_int *pct_fullp) * volume of dirty data in cache. 
*/ static inline bool -__wt_eviction_dirty_needed(WT_SESSION_IMPL *session, u_int *pct_fullp) +__wt_eviction_dirty_needed(WT_SESSION_IMPL *session, double *pct_fullp) { WT_CACHE *cache; double dirty_trigger; @@ -297,10 +297,10 @@ __wt_eviction_dirty_needed(WT_SESSION_IMPL *session, u_int *pct_fullp) dirty_inuse = __wt_cache_dirty_leaf_inuse(cache); if (pct_fullp != NULL) - *pct_fullp = (u_int)((100 * dirty_inuse) / bytes_max); + *pct_fullp = ((100.0 * dirty_inuse) / bytes_max); if ((dirty_trigger = cache->eviction_scrub_limit) < 1.0) - dirty_trigger = (double)cache->eviction_dirty_trigger; + dirty_trigger = cache->eviction_dirty_trigger; return (dirty_inuse > (uint64_t)(dirty_trigger * bytes_max) / 100); } @@ -312,10 +312,10 @@ __wt_eviction_dirty_needed(WT_SESSION_IMPL *session, u_int *pct_fullp) */ static inline bool __wt_eviction_needed( - WT_SESSION_IMPL *session, bool busy, bool readonly, u_int *pct_fullp) + WT_SESSION_IMPL *session, bool busy, bool readonly, double *pct_fullp) { WT_CACHE *cache; - u_int pct_dirty, pct_full; + double pct_dirty, pct_full; bool clean_needed, dirty_needed; cache = S2C(session)->cache; @@ -330,7 +330,7 @@ __wt_eviction_needed( clean_needed = __wt_eviction_clean_needed(session, &pct_full); if (readonly) { dirty_needed = false; - pct_dirty = 0; + pct_dirty = 0.0; } else dirty_needed = __wt_eviction_dirty_needed(session, &pct_dirty); @@ -339,9 +339,9 @@ __wt_eviction_needed( * we involve the application thread. */ if (pct_fullp != NULL) - *pct_fullp = (u_int)WT_MAX(0, 100 - WT_MIN( - (int)cache->eviction_trigger - (int)pct_full, - (int)cache->eviction_dirty_trigger - (int)pct_dirty)); + *pct_fullp = WT_MAX(0.0, 100.0 - WT_MIN( + cache->eviction_trigger - pct_full, + cache->eviction_dirty_trigger - pct_dirty)); /* * Only check the dirty trigger when the session is not busy. 
@@ -381,7 +381,7 @@ __wt_cache_eviction_check( WT_BTREE *btree; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; - u_int pct_full; + double pct_full; if (didworkp != NULL) *didworkp = false; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index c80e686ead5..3674d9218da 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -129,7 +129,6 @@ extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref); extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all); extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern void __wt_ref_out_int(WT_SESSION_IMPL *session, WT_REF *ref, bool rewrite); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pages); @@ -198,7 +197,6 @@ extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, const WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, WT_UPDATE **updp, size_t *sizep, u_int modify_type) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); -extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_las_nonempty(WT_SESSION_IMPL *session); @@ -369,7 +367,7 @@ extern int __wt_evict_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUT extern int __wt_evict_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); -extern int __wt_cache_eviction_worker( WT_SESSION_IMPL *session, bool busy, bool readonly, u_int pct_full) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_cache_eviction_worker( WT_SESSION_IMPL *session, bool busy, bool readonly, double pct_full) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v); extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session); @@ -787,8 +785,7 @@ extern void __wt_thread_group_start_one( WT_SESSION_IMPL *session, WT_THREAD_GRO extern void __wt_thread_group_stop_one(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group); extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_seconds(WT_SESSION_IMPL *session, time_t *timep); 
-extern uint64_t __wt_tsc_to_nsec(uint64_t end, uint64_t begin); -extern uint64_t __wt_tsc_get_expensive_timestamp(WT_SESSION_IMPL *session); +extern uint64_t __wt_clock_to_nsec(uint64_t end, uint64_t begin); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session); extern int __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -797,6 +794,7 @@ extern int __wt_txn_reconfigure(WT_SESSION_IMPL *session, const char *config) WT extern void __wt_txn_release(WT_SESSION_IMPL *session); extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_rollback_required(WT_SESSION_IMPL *session, const char *reason) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_txn_stats_update(WT_SESSION_IMPL *session); extern void __wt_txn_destroy(WT_SESSION_IMPL *session); diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index 2cca416d20f..05c0733d4ce 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -34,9 +34,7 @@ __wt_hex(int c) * Get a timestamp from CPU registers. */ static inline uint64_t -__wt_rdtsc(WT_SESSION_IMPL *session) { - if (__wt_process.use_epochtime) - return (__wt_tsc_get_expensive_timestamp(session)); +__wt_rdtsc(void) { #if defined (__i386) { uint64_t x; @@ -52,10 +50,28 @@ __wt_rdtsc(WT_SESSION_IMPL *session) { return ((d << 32) | a); } #else - return (__wt_tsc_get_expensive_timestamp(session)); + return (0); #endif } +/* + * __wt_clock -- + * Obtain a timestamp via either a CPU register or via a system call on + * platforms where obtaining it directly from the hardware register is + * not supported. + */ +static inline uint64_t +__wt_clock(WT_SESSION_IMPL *session) +{ + struct timespec tsp; + + if (__wt_process.use_epochtime) { + __wt_epoch(session, &tsp); + return ((uint64_t)(tsp.tv_sec * WT_BILLION + tsp.tv_nsec)); + } + return (__wt_rdtsc()); +} + /* * __wt_strdup -- * ANSI strdup function. 
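The call sites converted throughout this patch follow a common pattern around __wt_clock: read the clock before and after an operation, then convert the difference with a WT_CLOCKDIFF_* macro. A minimal sketch of that pattern, modeled on the cursor-search timing earlier in this patch (the function name is illustrative, not part of the change):

static int
example_timed_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	WT_DECL_RET;
	uint64_t time_start, time_stop;

	/* Raw CPU ticks, or nanoseconds when use_epochtime is set. */
	time_start = __wt_clock(session);
	ret = __wt_btcur_search(cbt);
	time_stop = __wt_clock(session);

	/* WT_CLOCKDIFF_US converts the clock difference to microseconds. */
	if (ret == 0)
		__wt_stat_usecs_hist_incr_opread(session,
		    WT_CLOCKDIFF_US(time_stop, time_start));
	return (ret);
}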
diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i index fa07e6b7d4f..8a2699f031d 100644 --- a/src/third_party/wiredtiger/src/include/mutex.i +++ b/src/third_party/wiredtiger/src/include/mutex.i @@ -297,17 +297,17 @@ __wt_spin_lock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) int64_t **stats; if (t->stat_count_off != -1 && WT_STAT_ENABLED(session)) { - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); __wt_spin_lock(session, t); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); stats = (int64_t **)S2C(session)->stats; stats[session->stat_bucket][t->stat_count_off]++; if (F_ISSET(session, WT_SESSION_INTERNAL)) stats[session->stat_bucket][t->stat_int_usecs_off] += - (int64_t)WT_TSCDIFF_US(time_stop, time_start); + (int64_t)WT_CLOCKDIFF_US(time_stop, time_start); else stats[session->stat_bucket][t->stat_app_usecs_off] += - (int64_t)WT_TSCDIFF_US(time_stop, time_start); + (int64_t)WT_CLOCKDIFF_US(time_stop, time_start); } else __wt_spin_lock(session, t); } diff --git a/src/third_party/wiredtiger/src/include/optrack.h b/src/third_party/wiredtiger/src/include/optrack.h index 9c9720bb3cc..bec724042cf 100644 --- a/src/third_party/wiredtiger/src/include/optrack.h +++ b/src/third_party/wiredtiger/src/include/optrack.h @@ -51,7 +51,7 @@ struct __wt_optrack_record { WT_OPTRACK_RECORD *__tr; \ __tr = &((s)->optrack_buf[ \ (s)->optrackbuf_ptr % WT_OPTRACK_MAXRECS]); \ - __tr->op_timestamp = __wt_rdtsc(s); \ + __tr->op_timestamp = __wt_clock(s); \ __tr->op_id = __func_id; \ __tr->op_type = optype; \ \ diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h index a3f1420d72b..c31619f2f96 100644 --- a/src/third_party/wiredtiger/src/include/os.h +++ b/src/third_party/wiredtiger/src/include/os.h @@ -65,14 +65,14 @@ #define WT_TIMEDIFF_SEC(end, begin) \ (WT_TIMEDIFF_NS((end), (begin)) / WT_BILLION) -#define WT_TSCDIFF_NS(end, begin) \ - (__wt_tsc_to_nsec(end, begin)) -#define WT_TSCDIFF_US(end, begin) \ - (WT_TSCDIFF_NS(end, begin) / WT_THOUSAND) -#define WT_TSCDIFF_MS(end, begin) \ - (WT_TSCDIFF_NS(end, begin) / WT_MILLION) -#define WT_TSCDIFF_SEC(end, begin) \ - (WT_TSCDIFF_NS(end, begin) / WT_BILLION) +#define WT_CLOCKDIFF_NS(end, begin) \ + (__wt_clock_to_nsec(end, begin)) +#define WT_CLOCKDIFF_US(end, begin) \ + (WT_CLOCKDIFF_NS(end, begin) / WT_THOUSAND) +#define WT_CLOCKDIFF_MS(end, begin) \ + (WT_CLOCKDIFF_NS(end, begin) / WT_MILLION) +#define WT_CLOCKDIFF_SEC(end, begin) \ + (WT_CLOCKDIFF_NS(end, begin) / WT_BILLION) #define WT_TIMECMP(t1, t2) \ ((t1).tv_sec < (t2).tv_sec ? 
-1 : \ diff --git a/src/third_party/wiredtiger/src/include/os_fhandle.i b/src/third_party/wiredtiger/src/include/os_fhandle.i index 5685b0f46dc..7c09a83132c 100644 --- a/src/third_party/wiredtiger/src/include/os_fhandle.i +++ b/src/third_party/wiredtiger/src/include/os_fhandle.i @@ -109,14 +109,14 @@ __wt_read( WT_STAT_CONN_INCR_ATOMIC(session, thread_read_active); WT_STAT_CONN_INCR(session, read_io); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); ret = fh->handle->fh_read( fh->handle, (WT_SESSION *)session, offset, len, buf); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); __wt_stat_msecs_hist_incr_fsread(session, - WT_TSCDIFF_MS(time_stop, time_start)); + WT_CLOCKDIFF_MS(time_stop, time_start)); WT_STAT_CONN_DECR_ATOMIC(session, thread_read_active); return (ret); } @@ -188,14 +188,14 @@ __wt_write(WT_SESSION_IMPL *session, WT_STAT_CONN_INCR(session, write_io); WT_STAT_CONN_INCR_ATOMIC(session, thread_write_active); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); ret = fh->handle->fh_write( fh->handle, (WT_SESSION *)session, offset, len, buf); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); __wt_stat_msecs_hist_incr_fswrite(session, - WT_TSCDIFF_MS(time_stop, time_start)); + WT_CLOCKDIFF_MS(time_stop, time_start)); WT_STAT_CONN_DECR_ATOMIC(session, thread_write_active); return (ret); } diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index 5590c68618b..02f15cdb8af 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -157,9 +157,10 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, size_t new_ins_size, uint64_t *recnop, u_int skipdepth, bool exclusive) { WT_DECL_RET; - WT_INSERT *new_ins = *new_insp; + WT_INSERT *new_ins; /* Clear references to memory we now own and must free on error. */ + new_ins = *new_insp; *new_insp = NULL; /* Check for page write generation wrap. */ @@ -206,11 +207,12 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, size_t new_ins_size, u_int skipdepth, bool exclusive) { WT_DECL_RET; - WT_INSERT *new_ins = *new_insp; + WT_INSERT *new_ins; u_int i; bool simple; /* Clear references to memory we now own and must free on error. */ + new_ins = *new_insp; *new_insp = NULL; /* Check for page write generation wrap. */ @@ -262,11 +264,13 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size, bool exclusive) { WT_DECL_RET; - WT_UPDATE *obsolete, *upd = *updp; + WT_UPDATE *obsolete, *upd; wt_timestamp_t *obsolete_timestamp; + size_t size; uint64_t txn; /* Clear references to memory we now own and must free on error. */ + upd = *updp; *updp = NULL; /* Check for page write generation wrap. */ @@ -329,9 +333,20 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, return (0); obsolete = __wt_update_obsolete_check(session, page, upd->next); + + /* + * Decrement the dirty byte count while holding the page lock, else we + * can race with checkpoints cleaning a page. 
+ */ + for (size = 0, upd = obsolete; upd != NULL; upd = upd->next) + size += WT_UPDATE_MEMSIZE(upd); + if (size != 0) + __wt_cache_page_inmem_decr(session, page, size); + WT_PAGE_UNLOCK(session, page); + if (obsolete != NULL) - __wt_update_obsolete_free(session, page, obsolete); + __wt_free_update_list(session, obsolete); return (0); } diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 0a902f36b39..8b8c3a55a6c 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -420,6 +420,8 @@ struct __wt_connection_stats { int64_t cache_eviction_pages_queued_oldest; int64_t cache_read; int64_t cache_read_lookaside; + int64_t cache_read_lookaside_skipped; + int64_t cache_read_lookaside_delay; int64_t cache_pages_requested; int64_t cache_eviction_pages_seen; int64_t cache_eviction_fail; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 327c2cd8caa..a0f51be8a28 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -259,6 +259,8 @@ struct __wt_txn { WT_ITEM *ckpt_snapshot; bool full_ckpt; + const char *rollback_reason; /* If rollback, the reason */ + /* AUTOMATIC FLAG VALUE GENERATION START */ #define WT_TXN_AUTOCOMMIT 0x0001u #define WT_TXN_ERROR 0x0002u @@ -274,6 +276,7 @@ struct __wt_txn { #define WT_TXN_SYNC_SET 0x0800u #define WT_TXN_TS_COMMIT_ALWAYS 0x1000u #define WT_TXN_TS_COMMIT_NEVER 0x2000u +#define WT_TXN_UPDATE 0x4000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 360a6cf1edb..121a18c9c3c 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -733,7 +733,8 @@ __wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) session, txn_update_conflict); WT_STAT_DATA_INCR( session, txn_update_conflict); - return (WT_ROLLBACK); + return (__wt_txn_rollback_required(session, + "conflict between concurrent operations")); } upd = upd->next; } diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index edee21b5ca7..53067bf44ab 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -2116,27 +2116,36 @@ struct __wt_connection { * current eviction load., an integer between 1 and 20; default \c 1.} * @config{ ),,} * @config{eviction_checkpoint_target, perform eviction at the beginning - * of checkpoints to bring the dirty content in cache to this level\, - * expressed as a percentage of the total cache size. Ignored if set to - * zero or \c in_memory is \c true., an integer between 0 and 99; - * default \c 5.} + * of checkpoints to bring the dirty content in cache to this level. It + * is a percentage of the cache size if the value is within the range of + * 0 to 100 or an absolute size when greater than 100. The value is not + * allowed to exceed the \c cache_size. 
Ignored if set to zero or \c + * in_memory is \c true., an integer between 0 and 10TB; default \c 5.} * @config{eviction_dirty_target, perform eviction in worker threads - * when the cache contains at least this much dirty content\, expressed - * as a percentage of the total cache size., an integer between 1 and - * 99; default \c 5.} + * when the cache contains at least this much dirty content. It is a + * percentage of the cache size if the value is within the range of 1 to + * 100 or an absolute size when greater than 100. The value is not + * allowed to exceed the \c cache_size., an integer between 1 and 10TB; + * default \c 5.} * @config{eviction_dirty_trigger, trigger application threads to * perform eviction when the cache contains at least this much dirty - * content\, expressed as a percentage of the total cache size. This - * setting only alters behavior if it is lower than eviction_trigger., - * an integer between 1 and 99; default \c 20.} + * content. It is a percentage of the cache size if the value is within + * the range of 1 to 100 or an absolute size when greater than 100. The + * value is not allowed to exceed the \c cache_size. This setting only + * alters behavior if it is lower than eviction_trigger., an integer + * between 1 and 10TB; default \c 20.} * @config{eviction_target, perform eviction in worker threads when the - * cache contains at least this much content\, expressed as a percentage - * of the total cache size. Must be less than \c eviction_trigger., an - * integer between 10 and 99; default \c 80.} + * cache contains at least this much content. It is a percentage of the + * cache size if the value is within the range of 10 to 100 or an + * absolute size when greater than 100. The value is not allowed to + * exceed the \c cache_size., an integer between 10 and 10TB; default \c + * 80.} * @config{eviction_trigger, trigger application threads to perform - * eviction when the cache contains at least this much content\, - * expressed as a percentage of the total cache size., an integer - * between 10 and 99; default \c 95.} + * eviction when the cache contains at least this much content. It is a + * percentage of the cache size if the value is within the range of 10 + * to 100 or an absolute size when greater than 100. The value is not + * allowed to exceed the \c cache_size., an integer between 10 and 10TB; + * default \c 95.} * @config{file_manager = (, control how file handles are managed., a * set of related configuration options defined below.} * @config{    close_handle_minimum, number of @@ -2186,13 +2195,15 @@ struct __wt_connection { * @config{shared_cache = (, shared cache configuration options. A * database should configure either a cache_size or a shared_cache not * both. Enabling a shared cache uses a session from the configured - * session_max., a set of related configuration options defined below.} - * @config{    chunk, the granularity that a shared - * cache is redistributed., an integer between 1MB and 10TB; default \c - * 10MB.} - * @config{    name, the name of a cache that - * is shared between databases or \c "none" when no shared cache is - * configured., a string; default \c none.} + * session_max. 
A shared cache can not have absolute values configured + * for cache eviction settings., a set of related configuration options + * defined below.} + * @config{    chunk, the + * granularity that a shared cache is redistributed., an integer between + * 1MB and 10TB; default \c 10MB.} + * @config{    name, + * the name of a cache that is shared between databases or \c "none" + * when no shared cache is configured., a string; default \c none.} * @config{    quota, maximum size of cache this * database can be allocated from the shared cache. Defaults to the * entire shared cache size., an integer; default \c 0.} @@ -2715,25 +2726,32 @@ struct __wt_connection { * @config{ * ),,} * @config{eviction_checkpoint_target, perform eviction at the beginning of - * checkpoints to bring the dirty content in cache to this level\, expressed as - * a percentage of the total cache size. Ignored if set to zero or \c in_memory - * is \c true., an integer between 0 and 99; default \c 5.} + * checkpoints to bring the dirty content in cache to this level. It is a + * percentage of the cache size if the value is within the range of 0 to 100 or + * an absolute size when greater than 100. The value is not allowed to exceed + * the \c cache_size. Ignored if set to zero or \c in_memory is \c true., an + * integer between 0 and 10TB; default \c 5.} * @config{eviction_dirty_target, perform eviction in worker threads when the - * cache contains at least this much dirty content\, expressed as a percentage - * of the total cache size., an integer between 1 and 99; default \c 5.} + * cache contains at least this much dirty content. It is a percentage of the + * cache size if the value is within the range of 1 to 100 or an absolute size + * when greater than 100. The value is not allowed to exceed the \c cache_size., + * an integer between 1 and 10TB; default \c 5.} * @config{eviction_dirty_trigger, trigger application threads to perform - * eviction when the cache contains at least this much dirty content\, expressed - * as a percentage of the total cache size. This setting only alters behavior - * if it is lower than eviction_trigger., an integer between 1 and 99; default - * \c 20.} + * eviction when the cache contains at least this much dirty content. It is a + * percentage of the cache size if the value is within the range of 1 to 100 or + * an absolute size when greater than 100. The value is not allowed to exceed + * the \c cache_size. This setting only alters behavior if it is lower than + * eviction_trigger., an integer between 1 and 10TB; default \c 20.} * @config{eviction_target, perform eviction in worker threads when the cache - * contains at least this much content\, expressed as a percentage of the total - * cache size. Must be less than \c eviction_trigger., an integer between 10 - * and 99; default \c 80.} + * contains at least this much content. It is a percentage of the cache size if + * the value is within the range of 10 to 100 or an absolute size when greater + * than 100. The value is not allowed to exceed the \c cache_size., an integer + * between 10 and 10TB; default \c 80.} * @config{eviction_trigger, trigger application threads to perform eviction - * when the cache contains at least this much content\, expressed as a - * percentage of the total cache size., an integer between 10 and 99; default \c - * 95.} + * when the cache contains at least this much content. It is a percentage of + * the cache size if the value is within the range of 10 to 100 or an absolute + * size when greater than 100. 
The value is not allowed to exceed the \c + * cache_size., an integer between 10 and 10TB; default \c 95.} * @config{exclusive, fail if the database already exists\, generally used with * the \c create option., a boolean flag; default \c false.} * @config{extensions, list of shared library extensions to load (using dlopen). @@ -2822,8 +2840,9 @@ struct __wt_connection { * threads)., an integer greater than or equal to 1; default \c 100.} * @config{shared_cache = (, shared cache configuration options. A database * should configure either a cache_size or a shared_cache not both. Enabling a - * shared cache uses a session from the configured session_max., a set of - * related configuration options defined below.} + * shared cache uses a session from the configured session_max. A shared cache + * can not have absolute values configured for cache eviction settings., a set + * of related configuration options defined below.} * @config{    chunk, the granularity that a shared cache is * redistributed., an integer between 1MB and 10TB; default \c 10MB.} * @config{    name, the name of a cache that is shared @@ -5009,521 +5028,528 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_READ 1105 /*! cache: pages read into cache requiring lookaside entries */ #define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1106 +/*! cache: pages read into cache skipping older lookaside entries */ +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1107 +/*! + * cache: pages read into cache with skipped lookaside entries needed + * later + */ +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1108 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1107 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1109 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1108 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1110 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1109 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1111 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1110 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1112 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1111 +#define WT_STAT_CONN_CACHE_WRITE 1113 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1112 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1114 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1113 +#define WT_STAT_CONN_CACHE_OVERHEAD 1115 /*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1114 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1116 /*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1115 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1117 /*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1116 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1118 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1117 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1119 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1118 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1120 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1119 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1121 /*! 
connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1120 +#define WT_STAT_CONN_COND_AUTO_WAIT 1122 /*! connection: detected system time went backwards */ -#define WT_STAT_CONN_TIME_TRAVEL 1121 +#define WT_STAT_CONN_TIME_TRAVEL 1123 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1122 +#define WT_STAT_CONN_FILE_OPEN 1124 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1123 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1125 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1124 +#define WT_STAT_CONN_MEMORY_FREE 1126 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1125 +#define WT_STAT_CONN_MEMORY_GROW 1127 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1126 +#define WT_STAT_CONN_COND_WAIT 1128 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1127 +#define WT_STAT_CONN_RWLOCK_READ 1129 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1128 +#define WT_STAT_CONN_RWLOCK_WRITE 1130 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1129 +#define WT_STAT_CONN_FSYNC_IO 1131 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1130 +#define WT_STAT_CONN_READ_IO 1132 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1131 +#define WT_STAT_CONN_WRITE_IO 1133 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1132 +#define WT_STAT_CONN_CURSOR_CREATE 1134 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1133 +#define WT_STAT_CONN_CURSOR_INSERT 1135 /*! cursor: cursor modify calls */ -#define WT_STAT_CONN_CURSOR_MODIFY 1134 +#define WT_STAT_CONN_CURSOR_MODIFY 1136 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1135 +#define WT_STAT_CONN_CURSOR_NEXT 1137 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1136 +#define WT_STAT_CONN_CURSOR_PREV 1138 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1137 +#define WT_STAT_CONN_CURSOR_REMOVE 1139 /*! cursor: cursor reserve calls */ -#define WT_STAT_CONN_CURSOR_RESERVE 1138 +#define WT_STAT_CONN_CURSOR_RESERVE 1140 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1139 +#define WT_STAT_CONN_CURSOR_RESET 1141 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1140 +#define WT_STAT_CONN_CURSOR_RESTART 1142 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1141 +#define WT_STAT_CONN_CURSOR_SEARCH 1143 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1142 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1144 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1143 +#define WT_STAT_CONN_CURSOR_UPDATE 1145 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1144 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1146 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1145 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1147 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1146 +#define WT_STAT_CONN_DH_SWEEP_REF 1148 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1147 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1149 /*! 
data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1148 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1150 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1149 +#define WT_STAT_CONN_DH_SWEEP_TOD 1151 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1150 +#define WT_STAT_CONN_DH_SWEEPS 1152 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1151 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1153 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1152 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1154 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1153 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1155 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1154 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1156 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1155 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1157 /*! * lock: commit timestamp queue lock application thread time waiting for * the dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1156 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1158 /*! * lock: commit timestamp queue lock internal thread time waiting for the * dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1157 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1159 /*! lock: commit timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1158 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1160 /*! lock: commit timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1159 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1161 /*! * lock: dhandle lock application thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1160 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1162 /*! * lock: dhandle lock internal thread time waiting for the dhandle lock * (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1161 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1163 /*! lock: dhandle read lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1162 +#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1164 /*! lock: dhandle write lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1163 +#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1165 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1164 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1166 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1165 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1167 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1166 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1168 /*! * lock: read timestamp queue lock application thread time waiting for * the dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1167 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1169 /*! 
* lock: read timestamp queue lock internal thread time waiting for the * dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1168 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1170 /*! lock: read timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1169 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1171 /*! lock: read timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1170 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1172 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1171 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1173 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1172 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1174 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1173 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1175 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1174 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1176 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1175 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1177 /*! lock: table read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1176 +#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1178 /*! lock: table write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1177 +#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1179 /*! * lock: txn global lock application thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1178 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1180 /*! * lock: txn global lock internal thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1179 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1181 /*! lock: txn global read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1180 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1182 /*! lock: txn global write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1181 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1183 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1182 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1184 /*! log: force checkpoint calls slept */ -#define WT_STAT_CONN_LOG_FORCE_CKPT_SLEEP 1183 +#define WT_STAT_CONN_LOG_FORCE_CKPT_SLEEP 1185 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1184 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1186 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1185 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1187 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1186 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1188 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1187 +#define WT_STAT_CONN_LOG_FLUSH 1189 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1188 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1190 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1189 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1191 /*! 
log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1190 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1192 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1191 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1193 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1192 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1194 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1193 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1195 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1194 +#define WT_STAT_CONN_LOG_SCANS 1196 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1195 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1197 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1196 +#define WT_STAT_CONN_LOG_WRITE_LSN 1198 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1197 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1199 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1198 +#define WT_STAT_CONN_LOG_SYNC 1200 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1199 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1201 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1200 +#define WT_STAT_CONN_LOG_SYNC_DIR 1202 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1201 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1203 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1202 +#define WT_STAT_CONN_LOG_WRITES 1204 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1203 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1205 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1204 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1206 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1205 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1207 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1206 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1208 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1207 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1209 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1208 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1210 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1209 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1211 /*! log: slot close lost race */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1210 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1212 /*! log: slot close unbuffered waits */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1211 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1213 /*! log: slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1212 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1214 /*! log: slot join atomic update races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1213 +#define WT_STAT_CONN_LOG_SLOT_RACES 1215 /*! log: slot join calls atomic updates raced */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1214 +#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1216 /*! log: slot join calls did not yield */ -#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1215 +#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1217 /*! 
log: slot join calls found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1216 +#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1218 /*! log: slot join calls slept */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1217 +#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1219 /*! log: slot join calls yielded */ -#define WT_STAT_CONN_LOG_SLOT_YIELD 1218 +#define WT_STAT_CONN_LOG_SLOT_YIELD 1220 /*! log: slot join found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1219 +#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1221 /*! log: slot joins yield time (usecs) */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1220 +#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1222 /*! log: slot transitions unable to find free slot */ -#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1221 +#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1223 /*! log: slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1222 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1224 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1223 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1225 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1224 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1226 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1225 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1227 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1226 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1228 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1227 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1229 /*! perf: file system read latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1228 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1230 /*! perf: file system read latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1229 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1231 /*! perf: file system read latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1230 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1232 /*! perf: file system read latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1231 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1233 /*! perf: file system read latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1232 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1234 /*! perf: file system read latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1233 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1235 /*! perf: file system write latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1234 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1236 /*! perf: file system write latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1235 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1237 /*! perf: file system write latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1236 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1238 /*! perf: file system write latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1237 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1239 /*! 
perf: file system write latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1238 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1240 /*! perf: file system write latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1239 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1241 /*! perf: operation read latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1240 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1242 /*! perf: operation read latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1241 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1243 /*! perf: operation read latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1242 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1244 /*! perf: operation read latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1243 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1245 /*! perf: operation read latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1244 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1246 /*! perf: operation write latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1245 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1247 /*! perf: operation write latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1246 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1248 /*! perf: operation write latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1247 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1249 /*! perf: operation write latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1248 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1250 /*! perf: operation write latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1249 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1251 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1250 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1252 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1251 +#define WT_STAT_CONN_REC_PAGES 1253 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1252 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1254 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1253 +#define WT_STAT_CONN_REC_PAGE_DELETE 1255 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1254 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1256 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1255 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1257 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1256 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1258 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1257 +#define WT_STAT_CONN_SESSION_OPEN 1259 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1258 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1260 /*! 
session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1259 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1261 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1260 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1262 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1261 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1263 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1262 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1264 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1263 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1265 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1264 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1266 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1265 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1267 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1266 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1268 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1267 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1269 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1268 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1270 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1269 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1271 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1270 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1272 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1271 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1273 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1272 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1274 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1273 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1275 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1274 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1276 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1275 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1277 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1276 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1278 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1277 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1279 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1278 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1280 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1279 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1281 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1280 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1282 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1281 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1283 /*! 
* thread-yield: connection close blocked waiting for transaction state * stabilization */ -#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1282 +#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1284 /*! thread-yield: connection close yielded for lsm manager shutdown */ -#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1283 +#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1285 /*! thread-yield: data handle lock yielded */ -#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1284 +#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1286 /*! * thread-yield: get reference for page index and slot time sleeping * (usecs) */ -#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1285 +#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1287 /*! thread-yield: log server sync yielded for log write */ -#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1286 +#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1288 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1287 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1289 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1288 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1290 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1289 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1291 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1290 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1292 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1291 +#define WT_STAT_CONN_PAGE_SLEEP 1293 /*! * thread-yield: page delete rollback time sleeping for state change * (usecs) */ -#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1292 +#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1294 /*! thread-yield: page reconciliation yielded due to child modification */ -#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1293 +#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1295 /*! * thread-yield: tree descend one level yielded for split page index * update */ -#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1294 +#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1296 /*! transaction: commit timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1295 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1297 /*! transaction: commit timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1296 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1298 /*! transaction: commit timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1297 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1299 /*! transaction: commit timestamp queue length */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1298 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1300 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1299 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1301 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1300 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1302 /*! transaction: query timestamp calls */ -#define WT_STAT_CONN_TXN_QUERY_TS 1301 +#define WT_STAT_CONN_TXN_QUERY_TS 1303 /*! transaction: read timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1302 +#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1304 /*! transaction: read timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1303 +#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1305 /*! 
transaction: read timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1304 +#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1306 /*! transaction: read timestamp queue length */ -#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1305 +#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1307 /*! transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1306 +#define WT_STAT_CONN_TXN_SET_TS 1308 /*! transaction: set timestamp commit calls */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1307 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1309 /*! transaction: set timestamp commit updates */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1308 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1310 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1309 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1311 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1310 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1312 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1311 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1313 /*! transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1312 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1314 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1313 +#define WT_STAT_CONN_TXN_BEGIN 1315 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1314 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1316 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1315 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1317 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1316 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1318 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1317 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1319 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1318 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1320 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1319 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1321 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1320 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1322 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1321 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1323 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1322 +#define WT_STAT_CONN_TXN_CHECKPOINT 1324 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1323 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1325 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1324 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1326 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1325 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1327 /*! 
* transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1326 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1328 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1327 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1329 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1328 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1330 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1329 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1331 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1330 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1332 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1331 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1333 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1332 +#define WT_STAT_CONN_TXN_SYNC 1334 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1333 +#define WT_STAT_CONN_TXN_COMMIT 1335 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1334 +#define WT_STAT_CONN_TXN_ROLLBACK 1336 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1335 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1337 /*! * @} diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 217a3deab60..167297c5c80 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -311,10 +311,10 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) "log_force_sync: sync directory %s to LSN %" PRIu32 "/%" PRIu32, log->log_dir_fh->name, min_lsn->l.file, min_lsn->l.offset); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); - time_stop = __wt_rdtsc(session); - fsync_duration_usecs = WT_TSCDIFF_US(time_stop, time_start); + time_stop = __wt_clock(session); + fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); log->sync_dir_lsn = *min_lsn; WT_STAT_CONN_INCR(session, log_sync_dir); WT_STAT_CONN_INCRV(session, @@ -334,10 +334,10 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) __wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32, log_fh->name, min_lsn->l.file, min_lsn->l.offset); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__wt_fsync(session, log_fh, true)); - time_stop = __wt_rdtsc(session); - fsync_duration_usecs = WT_TSCDIFF_US(time_stop, time_start); + time_stop = __wt_clock(session); + fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); log->sync_lsn = *min_lsn; WT_STAT_CONN_INCR(session, log_sync); WT_STAT_CONN_INCRV(session, @@ -1844,11 +1844,11 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) "/%" PRIu32, log->log_dir_fh->name, sync_lsn.l.file, sync_lsn.l.offset); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); fsync_duration_usecs = - WT_TSCDIFF_US(time_stop, time_start); + 
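The WT_STAT_CONN_* identifiers renumbered above (including the newly added lookaside-skip counters) are the keys applications pass to a connection statistics cursor. As a rough, self-contained illustration of reading one of them — modeled on the documented statistics-cursor pattern, with a placeholder home directory and error handling trimmed — a sketch might look like:

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;
	WT_CURSOR *cursor;
	WT_SESSION *session;
	const char *desc, *pvalue;
	int64_t value;

	/* "WT_HOME" is a placeholder; the directory must already exist. */
	if (wiredtiger_open("WT_HOME", NULL, "create,statistics=(all)", &conn) != 0)
		return (1);
	conn->open_session(conn, NULL, NULL, &session);

	/* Statistics cursors are positioned by WT_STAT_CONN_* key. */
	session->open_cursor(session, "statistics:", NULL, NULL, &cursor);
	cursor->set_key(cursor, WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED);
	if (cursor->search(cursor) == 0) {
		cursor->get_value(cursor, &desc, &pvalue, &value);
		printf("%s: %" PRId64 "\n", desc, value);
	}
	cursor->close(cursor);
	return (conn->close(conn, NULL));
}

Because the numeric values shift whenever new statistics are added (as the wholesale renumbering above shows), callers should always use the symbolic names rather than the raw 11xx constants.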
WT_CLOCKDIFF_US(time_stop, time_start); log->sync_dir_lsn = sync_lsn; WT_STAT_CONN_INCR(session, log_sync_dir); WT_STAT_CONN_INCRV(session, @@ -1866,11 +1866,11 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) log->log_fh->name, sync_lsn.l.file, sync_lsn.l.offset); WT_STAT_CONN_INCR(session, log_sync); - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__wt_fsync(session, log->log_fh, true)); - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); fsync_duration_usecs = - WT_TSCDIFF_US(time_stop, time_start); + WT_CLOCKDIFF_US(time_stop, time_start); WT_STAT_CONN_INCRV(session, log_sync_duration, fsync_duration_usecs); log->sync_lsn = sync_lsn; diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index 4a556913cdc..fc8181e2460 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -161,7 +161,7 @@ retry: */ #ifdef HAVE_DIAGNOSTIC count = 0; - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); #endif if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) { while (slot->slot_unbuffered == 0) { @@ -170,8 +170,8 @@ retry: #ifdef HAVE_DIAGNOSTIC ++count; if (count > WT_MILLION) { - time_stop = __wt_rdtsc(session); - if (WT_TSCDIFF_SEC( + time_stop = __wt_clock(session); + if (WT_CLOCKDIFF_SEC( time_stop, time_start) > 10) { __wt_errx(session, "SLOT_CLOSE: Slot %" PRIu32 " Timeout unbuffered, state 0x%" @@ -231,7 +231,7 @@ __log_slot_new(WT_SESSION_IMPL *session) #ifdef HAVE_DIAGNOSTIC count = 0; - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); #endif /* * Keep trying until we can find a free slot. @@ -271,8 +271,8 @@ __log_slot_new(WT_SESSION_IMPL *session) #ifdef HAVE_DIAGNOSTIC ++count; if (count > WT_MILLION) { - time_stop = __wt_rdtsc(session); - if (WT_TSCDIFF_SEC(time_stop, time_start) > 10) { + time_stop = __wt_clock(session); + if (WT_CLOCKDIFF_SEC(time_stop, time_start) > 10) { __wt_errx(session, "SLOT_NEW: Timeout free slot"); __log_slot_dump(session); @@ -577,7 +577,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, ++wait_cnt; } if (!yielded) - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); yielded = true; /* * The slot is no longer open or we lost the race to @@ -598,8 +598,8 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, WT_STAT_CONN_INCR(session, log_slot_immediate); else { WT_STAT_CONN_INCR(session, log_slot_yield); - time_stop = __wt_rdtsc(session); - usecs = WT_TSCDIFF_US(time_stop, time_start); + time_stop = __wt_clock(session); + usecs = WT_CLOCKDIFF_US(time_stop, time_start); WT_STAT_CONN_INCRV(session, log_slot_yield_duration, usecs); if (closed) WT_STAT_CONN_INCR(session, log_slot_yield_close); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_meta.c b/src/third_party/wiredtiger/src/lsm/lsm_meta.c index 1337335ff5b..88daca989a6 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_meta.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_meta.c @@ -282,17 +282,13 @@ __lsm_meta_read_v1( WT_ERR(__wt_config_getones(session, lsmconf, "chunks", &cv)); __wt_config_subinit(session, &lparser, &cv); for (nchunks = 0; (ret = __wt_config_next(&lparser, &lk, &lv)) == 0;) { - if (WT_STRING_MATCH("generation", lk.str, lk.len)) { + if (WT_STRING_MATCH("id", lk.str, lk.len)) { WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc, nchunks + 1, &lsm_tree->chunk)); WT_ERR(__wt_calloc_one(session, &chunk)); 
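The log.c and log_slot.c hunks above replace the __wt_rdtsc()/WT_TSCDIFF_* timing pairs with __wt_clock()/WT_CLOCKDIFF_*, presumably so the duration accounting no longer assumes a usable TSC register. The measurement shape itself is unchanged: read the clock, do the work, read it again, convert the delta to microseconds. A standalone sketch of that pattern using a portable monotonic clock (clock_now_ns() and timed_fsync_usecs() are stand-ins, not WiredTiger functions):

#include <stdint.h>
#include <time.h>
#include <unistd.h>

/* Read a monotonic clock in nanoseconds: stand-in for __wt_clock(). */
uint64_t
clock_now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
}

/* Time an fsync, mirroring the time_start/time_stop pattern in the patch. */
uint64_t
timed_fsync_usecs(int fd)
{
	uint64_t time_start, time_stop;

	time_start = clock_now_ns();
	(void)fsync(fd);		/* the operation being measured */
	time_stop = clock_now_ns();

	/* Stand-in for WT_CLOCKDIFF_US(time_stop, time_start). */
	return ((time_stop - time_start) / 1000);
}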
lsm_tree->chunk[nchunks++] = chunk; - chunk->generation = (uint32_t)lv.val; - } else if (WT_STRING_MATCH("id", lk.str, lk.len)) { chunk->id = (uint32_t)lv.val; - WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, - chunk->id, chunk->generation, &chunk->uri)); F_SET(chunk, WT_LSM_CHUNK_ONDISK | WT_LSM_CHUNK_STABLE); } else if (WT_STRING_MATCH("bloom", lk.str, lk.len)) { WT_ERR(__wt_lsm_tree_bloom_name( @@ -302,6 +298,14 @@ __lsm_meta_read_v1( chunk->size = (uint64_t)lv.val; } else if (WT_STRING_MATCH("count", lk.str, lk.len)) { chunk->count = (uint64_t)lv.val; + } else if (WT_STRING_MATCH("generation", lk.str, lk.len)) { + chunk->generation = (uint32_t)lv.val; + /* + * Id appears first, but we need both id and generation + * to create the name. + */ + WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, + chunk->id, chunk->generation, &chunk->uri)); } } WT_ERR_NOTFOUND_OK(ret); @@ -481,14 +485,10 @@ __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, chunk = lsm_tree->chunk[i]; if (i > 0) WT_ERR(__wt_buf_catfmt(session, buf, ",")); - /* - * Note that we need the generation before the ID for custom - * data sources, or the wrong URI will be generated. - */ WT_ERR(__wt_buf_catfmt( - session, buf, "generation=%" PRIu32, chunk->generation)); + session, buf, "id=%" PRIu32, chunk->id)); WT_ERR(__wt_buf_catfmt( - session, buf, ",id=%" PRIu32, chunk->id)); + session, buf, ",generation=%" PRIu32, chunk->generation)); if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_buf_catfmt(session, buf, ",bloom")); if (chunk->size != 0) diff --git a/src/third_party/wiredtiger/src/os_posix/os_dir.c b/src/third_party/wiredtiger/src/os_posix/os_dir.c index a07577e6d38..2c2cb084a91 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_dir.c +++ b/src/third_party/wiredtiger/src/os_posix/os_dir.c @@ -71,14 +71,12 @@ __directory_list_worker(WT_FILE_SYSTEM *file_system, *dirlistp = entries; *countp = count; -err: if (dirp != NULL) { - WT_SYSCALL(closedir(dirp), tret); - if (tret != 0) { - __wt_err(session, tret, - "%s: directory-list: closedir", directory); - if (ret == 0) - ret = tret; - } +err: WT_SYSCALL(closedir(dirp), tret); + if (tret != 0) { + __wt_err(session, tret, + "%s: directory-list: closedir", directory); + if (ret == 0) + ret = tret; } if (ret == 0) diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index ae8f640f733..3ad6bdf41ea 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -799,9 +799,13 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) /* * Create a new root page, initialize the array of child references, * mark it dirty, then write it. + * + * Don't count the eviction of this page as progress, checkpoint can + * repeatedly create and discard these pages. */ WT_RET(__wt_page_alloc(session, page->type, mod->mod_multi_entries, false, &next)); + F_SET_ATOMIC(next, WT_PAGE_EVICT_NO_PROGRESS); WT_INTL_INDEX_GET(session, next, pindex); for (i = 0; i < mod->mod_multi_entries; ++i) { @@ -1411,17 +1415,32 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, *updp = upd; } + /* Keep track of the selected update. */ + upd = *updp; + /* Reconciliation should never see an aborted or reserved update. 
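The lsm_meta.c hunks just above reorder the metadata so "id" is written before "generation", and the reader now builds the chunk name only when the later "generation" key arrives, since both values are needed. Reduced to a standalone sketch (the struct, function, and name format are invented for illustration; WiredTiger's real chunk-URI scheme is not shown here):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct chunk {
	uint32_t id;
	uint32_t generation;
	char *uri;
};

/*
 * Apply one "key=value" pair in the order pairs appear in the metadata.
 * The name depends on both id and generation, so it is constructed when
 * the second of the two keys ("generation") is seen.
 */
int
chunk_set(struct chunk *c, const char *key, uint32_t value)
{
	char *p;

	if (strcmp(key, "id") == 0)
		c->id = value;
	else if (strcmp(key, "generation") == 0) {
		c->generation = value;
		if ((p = malloc(64)) == NULL)
			return (-1);
		/* Invented format, for illustration only. */
		(void)snprintf(p, 64,
		    "chunk-%06" PRIu32 "-%" PRIu32, c->id, c->generation);
		free(c->uri);
		c->uri = p;
	}
	return (0);
}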
*/ - WT_ASSERT(session, *updp == NULL || - ((*updp)->txnid != WT_TXN_ABORTED && - (*updp)->type != WT_UPDATE_RESERVE)); + WT_ASSERT(session, upd == NULL || + (upd->txnid != WT_TXN_ABORTED && upd->type != WT_UPDATE_RESERVE)); /* If all of the updates were aborted, quit. */ if (first_txn_upd == NULL) { - WT_ASSERT(session, *updp == NULL); + WT_ASSERT(session, upd == NULL); return (0); } + /* If no updates were skipped, record that we're making progress. */ + if (upd == first_txn_upd) + r->update_used = true; + + /* + * The checkpoint transaction is special. Make sure we never write + * metadata updates from a checkpoint in a concurrent session. + */ + WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || + upd == NULL || upd->txnid == WT_TXN_NONE || + upd->txnid != S2C(session)->txn_global.checkpoint_state.id || + WT_SESSION_IS_CHECKPOINT(session)); + /* * Track the most recent transaction in the page. We store this in the * tree at the end of reconciliation in the service of checkpoints, it @@ -1432,25 +1451,26 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, r->max_txn = max_txn; #ifdef HAVE_TIMESTAMPS + /* Update the maximum timestamp. */ if (first_ts_upd != NULL && __wt_timestamp_cmp(&r->max_timestamp, &first_ts_upd->timestamp) < 0) __wt_timestamp_set(&r->max_timestamp, &first_ts_upd->timestamp); -#endif - /* - * The checkpoint transaction is special. Make sure we never write - * metadata updates from a checkpoint in a concurrent session. - */ - WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || - *updp == NULL || (*updp)->txnid == WT_TXN_NONE || - (*updp)->txnid != S2C(session)->txn_global.checkpoint_state.id || - WT_SESSION_IS_CHECKPOINT(session)); + /* Update the maximum on-page timestamp. */ + if (upd != NULL && + __wt_timestamp_cmp(&upd->timestamp, &r->max_onpage_timestamp) > 0) + __wt_timestamp_set(&r->max_onpage_timestamp, &upd->timestamp); +#endif /* - * If there are no skipped updates, record that we're making progress. + * If the update we chose was a birthmark, or we are doing + * update-restore and we skipped a birthmark, the original on-page + * value must be retained. */ - if (*updp == first_txn_upd) - r->update_used = true; + if (upd != NULL && + (upd->type == WT_UPDATE_BIRTHMARK || + (F_ISSET(r, WT_REC_UPDATE_RESTORE) && skipped_birthmark))) + *updp = NULL; /* * Check if all updates on the page are visible. If not, it must stay @@ -1465,40 +1485,20 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, #else timestampp = NULL; #endif - all_visible = *updp == first_txn_upd && !uncommitted && + all_visible = upd == first_txn_upd && !uncommitted && (F_ISSET(r, WT_REC_VISIBLE_ALL) ? __wt_txn_visible_all(session, max_txn, timestampp) : __wt_txn_visible(session, max_txn, timestampp)); - /* - * If the update we chose was a birthmark, or doing update-restore and - * we skipped a birthmark, the original on-page value must be retained. - * - * Update the maximum on-page timestamp before discarding the chosen - * update. 
- */ - if ((upd = *updp) != NULL) { -#ifdef HAVE_TIMESTAMPS - if (__wt_timestamp_cmp( - &upd->timestamp, &r->max_onpage_timestamp) > 0) - __wt_timestamp_set( - &r->max_onpage_timestamp, &upd->timestamp); -#endif - if ((*updp)->type == WT_UPDATE_BIRTHMARK) - *updp = NULL; - if (F_ISSET(r, WT_REC_UPDATE_RESTORE) && skipped_birthmark) - *updp = NULL; - } - if (all_visible) goto check_original_value; + r->leave_dirty = true; + if (F_ISSET(r, WT_REC_VISIBILITY_ERR)) WT_PANIC_RET(session, EINVAL, "reconciliation error, update not visible"); - r->leave_dirty = true; - /* * If not trying to evict the page, we know what we'll write and we're * done. @@ -1796,6 +1796,7 @@ __rec_child_modify(WT_SESSION_IMPL *session, */ break; + case WT_REF_LIMBO: case WT_REF_LOOKASIDE: /* * On disk, with lookaside updates. @@ -3429,16 +3430,18 @@ __rec_split_write_supd(WT_SESSION_IMPL *session, r->supd_next = j; } -done: /* Track the oldest timestamp seen so far. */ - multi->page_las.las_skew_newest = r->las_skew_newest; - multi->page_las.las_max_txn = r->max_txn; - WT_ASSERT(session, r->max_txn != WT_TXN_NONE); +done: if (F_ISSET(r, WT_REC_LOOKASIDE)) { + /* Track the oldest lookaside timestamp seen so far. */ + multi->page_las.las_skew_newest = r->las_skew_newest; + multi->page_las.las_max_txn = r->max_txn; + WT_ASSERT(session, r->max_txn != WT_TXN_NONE); #ifdef HAVE_TIMESTAMPS - __wt_timestamp_set( - &multi->page_las.min_timestamp, &r->min_saved_timestamp); - __wt_timestamp_set( - &multi->page_las.onpage_timestamp, &r->max_onpage_timestamp); + __wt_timestamp_set(&multi->page_las.min_timestamp, + &r->min_saved_timestamp); + __wt_timestamp_set(&multi->page_las.onpage_timestamp, + &r->max_onpage_timestamp); #endif + } err: __wt_scr_free(session, &key); return (ret); diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index bbe5d2a0218..fd091cb5b13 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -1467,7 +1467,9 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config) txn = &session->txn; if (F_ISSET(txn, WT_TXN_ERROR) && txn->mod_count != 0) WT_ERR_MSG(session, EINVAL, - "failed transaction requires rollback"); + "failed transaction requires rollback%s%s", + txn->rollback_reason == NULL ? "" : ": ", + txn->rollback_reason == NULL ? "" : txn->rollback_reason); if (ret == 0) ret = __wt_txn_commit(session, cfg); @@ -1628,14 +1630,14 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) * Keep checking the LSNs until we find it is stable or we reach * our timeout, or there's some other reason to quit. 
*/ - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { if (!__transaction_sync_run_chk(session)) WT_ERR(ETIMEDOUT); __wt_cond_signal(session, conn->log_file_cond); - time_stop = __wt_rdtsc(session); - waited_ms = WT_TSCDIFF_MS(time_stop, time_start); + time_stop = __wt_clock(session); + waited_ms = WT_CLOCKDIFF_MS(time_stop, time_start); if (waited_ms < timeout_ms) { remaining_usec = (timeout_ms - waited_ms) * WT_THOUSAND; __wt_cond_wait(session, log->log_sync_cond, diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c index 19786bb0974..d1271e0d427 100644 --- a/src/third_party/wiredtiger/src/support/global.c +++ b/src/third_party/wiredtiger/src/support/global.c @@ -72,10 +72,10 @@ __global_calibrate_ticks(void) for (tries = 0; tries < 3; ++tries) { /* This needs to be CPU intensive and large enough. */ __wt_epoch(NULL, &start); - tsc_start = __wt_rdtsc(NULL); + tsc_start = __wt_rdtsc(); for (i = 0; i < 100 * WT_MILLION; i++) ; - tsc_stop = __wt_rdtsc(NULL); + tsc_stop = __wt_rdtsc(); __wt_epoch(NULL, &stop); diff_nsec = WT_TIMEDIFF_NS(stop, start); diff_tsc = tsc_stop - tsc_start; diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c index ac0b7f7de96..42148b068fb 100644 --- a/src/third_party/wiredtiger/src/support/hazard.c +++ b/src/third_party/wiredtiger/src/support/hazard.c @@ -84,7 +84,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp * eviction and splits, we re-check it after a barrier to make sure * we have a valid reference. */ - if (ref->state != WT_REF_MEM) { + if (ref->state != WT_REF_LIMBO && ref->state != WT_REF_MEM) { *busyp = true; return (0); } @@ -132,8 +132,8 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp * Do the dance: * * The memory location which makes a page "real" is the WT_REF's state - * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the - * page eviction server. + * of WT_REF_LIMBO or WT_REF_MEM, which can be set to WT_REF_LOCKED + * at any time by the page eviction server. * * Add the WT_REF reference to the session's hazard list and flush the * write, then see if the page's state is still valid. If so, we can @@ -152,9 +152,9 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp /* * Check if the page state is still valid, where valid means a - * state of WT_REF_MEM. + * state of WT_REF_LIMBO or WT_REF_MEM. */ - if (ref->state == WT_REF_MEM) { + if (ref->state == WT_REF_LIMBO || ref->state == WT_REF_MEM) { ++session->nhazard; /* diff --git a/src/third_party/wiredtiger/src/support/mtx_rw.c b/src/third_party/wiredtiger/src/support/mtx_rw.c index 52c8004ecda..572592b9fbc 100644 --- a/src/third_party/wiredtiger/src/support/mtx_rw.c +++ b/src/third_party/wiredtiger/src/support/mtx_rw.c @@ -237,7 +237,7 @@ stall: __wt_cond_wait(session, } if (set_stats) - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); /* Wait for our group to start. 
*/ for (pause_cnt = 0; ticket != l->u.s.current; pause_cnt++) { if (pause_cnt < 1000) @@ -252,13 +252,13 @@ stall: __wt_cond_wait(session, } } if (set_stats) { - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); if (F_ISSET(session, WT_SESSION_INTERNAL)) stats[session->stat_bucket][l->stat_int_usecs_off] += - (int64_t)WT_TSCDIFF_US(time_stop, time_start); + (int64_t)WT_CLOCKDIFF_US(time_stop, time_start); else stats[session->stat_bucket][l->stat_app_usecs_off] += - (int64_t)WT_TSCDIFF_US(time_stop, time_start); + (int64_t)WT_CLOCKDIFF_US(time_stop, time_start); } /* @@ -407,7 +407,7 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) * we have the lock. */ if (set_stats) - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); for (pause_cnt = 0, old.u.v = l->u.v; ticket != old.u.s.current || old.u.s.readers_active != 0; pause_cnt++, old.u.v = l->u.v) { @@ -423,13 +423,13 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) } } if (set_stats) { - time_stop = __wt_rdtsc(session); + time_stop = __wt_clock(session); if (F_ISSET(session, WT_SESSION_INTERNAL)) stats[session->stat_bucket][l->stat_int_usecs_off] += - (int64_t)WT_TSCDIFF_US(time_stop, time_start); + (int64_t)WT_CLOCKDIFF_US(time_stop, time_start); else stats[session->stat_bucket][l->stat_app_usecs_off] += - (int64_t)WT_TSCDIFF_US(time_stop, time_start); + (int64_t)WT_CLOCKDIFF_US(time_stop, time_start); } /* diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 148e6bfd4d7..926176d6024 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -832,6 +832,8 @@ static const char * const __stats_connection_desc[] = { "cache: pages queued for urgent eviction during walk", "cache: pages read into cache", "cache: pages read into cache requiring lookaside entries", + "cache: pages read into cache skipping older lookaside entries", + "cache: pages read into cache with skipped lookaside entries needed later", "cache: pages requested from the cache", "cache: pages seen by eviction walk", "cache: pages selected for eviction unable to be evicted", @@ -1210,6 +1212,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_eviction_pages_queued_oldest = 0; stats->cache_read = 0; stats->cache_read_lookaside = 0; + stats->cache_read_lookaside_skipped = 0; + stats->cache_read_lookaside_delay = 0; stats->cache_pages_requested = 0; stats->cache_eviction_pages_seen = 0; stats->cache_eviction_fail = 0; @@ -1624,6 +1628,10 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, cache_eviction_pages_queued_oldest); to->cache_read += WT_STAT_READ(from, cache_read); to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); + to->cache_read_lookaside_skipped += + WT_STAT_READ(from, cache_read_lookaside_skipped); + to->cache_read_lookaside_delay += + WT_STAT_READ(from, cache_read_lookaside_delay); to->cache_pages_requested += WT_STAT_READ(from, cache_pages_requested); to->cache_eviction_pages_seen += diff --git a/src/third_party/wiredtiger/src/support/time.c b/src/third_party/wiredtiger/src/support/time.c index 34198508988..842b50fad09 100644 --- a/src/third_party/wiredtiger/src/support/time.c +++ b/src/third_party/wiredtiger/src/support/time.c @@ -73,13 +73,13 @@ __wt_seconds(WT_SESSION_IMPL *session, time_t *timep) } /* - * __wt_tsc_to_nsec -- - * Convert from rdtsc ticks to nanoseconds. + * __wt_clock_to_nsec -- + * Convert from clock ticks to nanoseconds. 
*/ uint64_t -__wt_tsc_to_nsec(uint64_t end, uint64_t begin) +__wt_clock_to_nsec(uint64_t end, uint64_t begin) { - double tsc_diff; + double clock_diff; /* * If the ticks were reset, consider it an invalid check and just @@ -88,20 +88,6 @@ __wt_tsc_to_nsec(uint64_t end, uint64_t begin) */ if (end < begin) return (0); - tsc_diff = (double)(end - begin); - return ((uint64_t)(tsc_diff / __wt_process.tsc_nsec_ratio)); -} - -/* - * __wt_tsc_get_expensive_timestamp -- - * Obtain a timestamp via a system call on platforms where obtaining it - * directly from the hardware register is not supported. - */ -uint64_t -__wt_tsc_get_expensive_timestamp(WT_SESSION_IMPL *session) -{ - struct timespec tsp; - - __wt_epoch(session, &tsp); - return ((uint64_t)(tsp.tv_sec * WT_BILLION + tsp.tv_nsec)); + clock_diff = (double)(end - begin); + return ((uint64_t)(clock_diff / __wt_process.tsc_nsec_ratio)); } diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 140731bcb54..6d1321b1a13 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -442,7 +442,7 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) #ifdef HAVE_TIMESTAMPS wt_timestamp_t ts; WT_TXN_GLOBAL *txn_global; - char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1]; + char hex_timestamp[2][2 * WT_TIMESTAMP_SIZE + 1]; bool round_to_oldest; txn_global = &S2C(session)->txn_global; @@ -460,11 +460,13 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) * avoid a race between checking and setting transaction * timestamp. */ + WT_RET(__wt_timestamp_to_hex_string(session, + hex_timestamp[0], &ts)); __wt_readlock(session, &txn_global->rwlock); if (__wt_timestamp_cmp(&ts, &txn_global->oldest_timestamp) < 0) { WT_RET(__wt_timestamp_to_hex_string(session, - timestamp_buf, &ts)); + hex_timestamp[1], &txn_global->oldest_timestamp)); /* * If given read timestamp is earlier than oldest * timestamp then round the read timestamp to @@ -476,8 +478,8 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) else { __wt_readunlock(session, &txn_global->rwlock); WT_RET_MSG(session, EINVAL, "read timestamp " - "%s older than oldest timestamp", - timestamp_buf); + "%s older than oldest timestamp %s", + hex_timestamp[0], hex_timestamp[1]); } } else { __wt_timestamp_set(&txn->read_timestamp, &ts); @@ -497,8 +499,8 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) * critical section. */ __wt_verbose(session, WT_VERB_TIMESTAMP, "Read " - "timestamp %s : Rounded to oldest timestamp", - timestamp_buf); + "timestamp %s : Rounded to oldest timestamp %s", + hex_timestamp[0], hex_timestamp[1]); } #else WT_RET_MSG(session, EINVAL, "read_timestamp requires a " @@ -592,10 +594,79 @@ __wt_txn_release(WT_SESSION_IMPL *session) __wt_txn_release_snapshot(session); txn->isolation = session->isolation; + txn->rollback_reason = NULL; + /* Ensure the transaction flags are cleared on exit */ txn->flags = 0; } +#ifdef HAVE_TIMESTAMPS +/* + * __txn_commit_timestamp_validate -- + * Validate that timestamp provided to commit is legal. + */ +static inline int +__txn_commit_timestamp_validate(WT_SESSION_IMPL *session) +{ + WT_TXN *txn; + WT_TXN_OP *op; + WT_UPDATE *upd; + u_int i; + char timestamp_buf[2][2 * WT_TIMESTAMP_SIZE + 1]; + + txn = &session->txn; + + /* + * Debugging checks on timestamps, if user requested them. 
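__global_calibrate_ticks() and the renamed __wt_clock_to_nsec() above convert a tick delta to nanoseconds by dividing by a calibrated ticks-per-nanosecond ratio, and return 0 if the clock appears to have gone backwards. A rough standalone sketch of the same arithmetic — x86-only, using the compiler's __rdtsc() intrinsic, with a deliberately crude busy-loop calibration and invented function names:

#include <stdint.h>
#include <time.h>
#include <x86intrin.h>		/* __rdtsc(): GCC/Clang on x86 only */

double ticks_per_nsec = 1.0;	/* fallback: treat ticks as nanoseconds */

/* Estimate ticks-per-nanosecond by timing the same busy loop two ways. */
void
calibrate_ticks(void)
{
	struct timespec start, stop;
	uint64_t tsc_start, tsc_stop;
	volatile uint64_t i;
	double diff_nsec;

	clock_gettime(CLOCK_MONOTONIC, &start);
	tsc_start = __rdtsc();
	for (i = 0; i < 100000000ULL; i++)
		;
	tsc_stop = __rdtsc();
	clock_gettime(CLOCK_MONOTONIC, &stop);

	diff_nsec = (double)(stop.tv_sec - start.tv_sec) * 1e9 +
	    (double)(stop.tv_nsec - start.tv_nsec);
	if (diff_nsec > 0)
		ticks_per_nsec = (double)(tsc_stop - tsc_start) / diff_nsec;
}

/* Convert a tick delta to nanoseconds; a reset clock yields 0, as in the patch. */
uint64_t
clock_to_nsec(uint64_t end, uint64_t begin)
{
	if (end < begin)
		return (0);
	return ((uint64_t)((double)(end - begin) / ticks_per_nsec));
}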
+ */ + if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) && + !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && + txn->mod_count != 0) + WT_RET_MSG(session, EINVAL, "commit_timestamp required and " + "none set on this transaction"); + if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) && + F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && + txn->mod_count != 0) + WT_RET_MSG(session, EINVAL, "no commit_timestamp required and " + "timestamp set on this transaction"); + + if (WT_VERBOSE_ISSET(session, WT_VERB_TIMESTAMP)) { + /* + * Error on any valid update structures for the same key that + * are at a later timestamp. + */ + for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { + if (op->type != WT_TXN_OP_BASIC_TS) + continue; + /* + * Skip over any aborted update structures. + */ + upd = op->u.upd->next; + while (upd != NULL && upd->txnid == WT_TXN_ABORTED) + upd = upd->next; + /* + * Check the timestamp on this update with the + * first valid update in the chain. They're in + * most recent order. + */ + if (upd != NULL && + __wt_timestamp_cmp(&op->u.upd->timestamp, + &upd->timestamp) < 0) { + WT_RET(__wt_timestamp_to_hex_string(session, + timestamp_buf[0], &op->u.upd->timestamp)); + WT_RET(__wt_timestamp_to_hex_string(session, + timestamp_buf[1], &upd->timestamp)); + __wt_verbose(session, WT_VERB_TIMESTAMP, + "Timestamp %s on new update is older than " + "timestamp %s on existing update.", + timestamp_buf[0], timestamp_buf[1]); + } + } + } + return (0); +} +#endif + /* * __wt_txn_commit -- * Commit the current transaction. @@ -645,20 +716,9 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) } #ifdef HAVE_TIMESTAMPS - /* - * Debugging checks on timestamps, if user requested them. - */ - if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) && - !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && - txn->mod_count != 0) - WT_ERR_MSG(session, EINVAL, "commit_timestamp required and " - "none set on this transaction"); - if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) && - F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && - txn->mod_count != 0) - WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and " - "timestamp set on this transaction"); + WT_ERR(__txn_commit_timestamp_validate(session)); #endif + /* * The default sync setting is inherited from the connection, but can * be overridden by an explicit "sync" setting for this transaction. @@ -939,6 +999,18 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) return (ret); } +/* + * __wt_txn_rollback_required -- + * Prepare to log a reason if the user attempts to use the transaction to + * do anything other than rollback. + */ +int +__wt_txn_rollback_required(WT_SESSION_IMPL *session, const char *reason) +{ + session->txn.rollback_reason = reason; + return (WT_ROLLBACK); +} + /* * __wt_txn_init -- * Initialize a session's transaction data. diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 2e902a8db94..616816f0e8d 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -382,11 +382,11 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) cache = conn->cache; /* Give up if scrubbing is disabled. 
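__wt_txn_rollback_required() above stashes a reason string on the transaction and returns WT_ROLLBACK, and the session_api.c hunk earlier appends that reason to the "failed transaction requires rollback" message. The control flow, reduced to a standalone sketch (ex_txn and the error codes are invented stand-ins, not the WiredTiger structures):

#include <stddef.h>
#include <stdio.h>

#define EX_ROLLBACK	(-31800)	/* stand-in for WT_ROLLBACK */
#define EX_EINVAL	22		/* stand-in for EINVAL */

struct ex_txn {
	int failed;			/* transaction hit an error */
	const char *rollback_reason;	/* why only rollback is allowed */
};

/* Record why the transaction can only be rolled back. */
int
txn_rollback_required(struct ex_txn *txn, const char *reason)
{
	txn->failed = 1;
	txn->rollback_reason = reason;
	return (EX_ROLLBACK);
}

/* Commit surfaces the recorded reason instead of a bare error. */
int
txn_commit(struct ex_txn *txn)
{
	if (txn->failed) {
		fprintf(stderr, "failed transaction requires rollback%s%s\n",
		    txn->rollback_reason == NULL ? "" : ": ",
		    txn->rollback_reason == NULL ? "" : txn->rollback_reason);
		return (EX_EINVAL);
	}
	return (0);
}

The reason is cleared when the transaction is released, matching the rollback_reason = NULL reset added to __wt_txn_release() above.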
*/ - if (cache->eviction_checkpoint_target == 0 || + if (cache->eviction_checkpoint_target < DBL_EPSILON || cache->eviction_checkpoint_target >= cache->eviction_dirty_trigger) return; - time_last = time_start = __wt_rdtsc(session); + time_last = time_start = __wt_clock(session); bytes_written_last = 0; bytes_written_start = cache->bytes_written; cache_size = conn->cache_size; @@ -436,7 +436,7 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) for (;;) { current_dirty = (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size; - if (current_dirty <= (double)cache->eviction_checkpoint_target) + if (current_dirty <= cache->eviction_checkpoint_target) break; /* @@ -447,8 +447,8 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) break; __wt_sleep(0, stepdown_us / 10); - time_stop = __wt_rdtsc(session); - current_us = WT_TSCDIFF_US(time_stop, time_last); + time_stop = __wt_clock(session); + current_us = WT_CLOCKDIFF_US(time_stop, time_last); bytes_written_total = cache->bytes_written - bytes_written_start; @@ -502,11 +502,11 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) WT_MAX(cache->eviction_dirty_target, current_dirty - delta); WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, cache->eviction_scrub_limit); - time_last = __wt_rdtsc(session); + time_last = __wt_clock(session); } - time_stop = __wt_rdtsc(session); - total_ms = WT_TSCDIFF_MS(time_stop, time_start); + time_stop = __wt_clock(session); + total_ms = WT_CLOCKDIFF_MS(time_stop, time_start); WT_STAT_CONN_SET(session, txn_checkpoint_scrub_time, total_ms); } @@ -880,10 +880,10 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Checkpoints have to hit disk (it would be reasonable to configure for * lazy checkpoints, but we don't support them yet). */ - time_start = __wt_rdtsc(session); + time_start = __wt_clock(session); WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); - time_stop = __wt_rdtsc(session); - fsync_duration_usecs = WT_TSCDIFF_US(time_stop, time_start); + time_stop = __wt_clock(session); + fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); WT_STAT_CONN_INCR(session, txn_checkpoint_fsync_post); WT_STAT_CONN_SET(session, txn_checkpoint_fsync_post_duration, fsync_duration_usecs); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 88b3bdb6693..0af70c4090d 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -250,22 +250,6 @@ __txn_abort_newer_updates( return (0); } -/* - * __txn_rollback_to_stable_custom_skip -- - * Return if custom rollback requires we read this page. - */ -static int -__txn_rollback_to_stable_custom_skip( - WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp) -{ - WT_UNUSED(context); - WT_UNUSED(session); - - /* Review all pages that are in memory. */ - *skipp = !(ref->state == WT_REF_MEM || ref->state == WT_REF_DELETED); - return (0); -} - /* * __txn_rollback_to_stable_btree_walk -- * Called for each open handle - choose to either skip or wipe the commits @@ -275,22 +259,24 @@ __txn_rollback_to_stable_btree_walk( WT_SESSION_IMPL *session, wt_timestamp_t *rollback_timestamp) { WT_DECL_RET; - WT_PAGE *page; WT_REF *ref; /* Walk the tree, marking commits aborted where appropriate. 
*/ ref = NULL; - while ((ret = __wt_tree_walk_custom_skip(session, &ref, - __txn_rollback_to_stable_custom_skip, - NULL, WT_READ_NO_EVICT)) == 0 && ref != NULL) { - page = ref->page; + while ((ret = __wt_tree_walk(session, &ref, + WT_READ_CACHE | WT_READ_LOOKASIDE | WT_READ_NO_EVICT)) == 0 && + ref != NULL) { + if (ref->page_las != NULL && + __wt_timestamp_cmp(rollback_timestamp, + &ref->page_las->onpage_timestamp) < 0) + ref->page_las->invalid = true; /* Review deleted page saved to the ref */ if (ref->page_del != NULL && __wt_timestamp_cmp( rollback_timestamp, &ref->page_del->timestamp) < 0) __wt_delete_page_rollback(session, ref); - if (!__wt_page_is_modified(page)) + if (!__wt_page_is_modified(ref->page)) continue; WT_RET(__txn_abort_newer_updates( diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index be771677a95..41ac970f14e 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -384,6 +384,7 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN_GLOBAL *txn_global; wt_timestamp_t commit_ts, oldest_ts, stable_ts; wt_timestamp_t last_oldest_ts, last_stable_ts; + char hex_timestamp[2][2 * WT_TIMESTAMP_SIZE + 1]; bool force; txn_global = &S2C(session)->txn_global; @@ -432,17 +433,25 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) if (has_commit && (has_oldest || txn_global->has_oldest_timestamp) && __wt_timestamp_cmp(&oldest_ts, &commit_ts) > 0) { __wt_readunlock(session, &txn_global->rwlock); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &oldest_ts)); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[1], &commit_ts)); WT_RET_MSG(session, EINVAL, - "set_timestamp: oldest timestamp must not be later than " - "commit timestamp"); + "set_timestamp: oldest timestamp %s must not be later than " + "commit timestamp %s", hex_timestamp[0], hex_timestamp[1]); } if (has_commit && (has_stable || txn_global->has_stable_timestamp) && __wt_timestamp_cmp(&stable_ts, &commit_ts) > 0) { __wt_readunlock(session, &txn_global->rwlock); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &stable_ts)); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[1], &commit_ts)); WT_RET_MSG(session, EINVAL, - "set_timestamp: stable timestamp must not be later than " - "commit timestamp"); + "set_timestamp: stable timestamp %s must not be later than " + "commit timestamp %s", hex_timestamp[0], hex_timestamp[1]); } /* @@ -454,9 +463,13 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) (has_stable || txn_global->has_stable_timestamp) && __wt_timestamp_cmp(&oldest_ts, &stable_ts) > 0) { __wt_readunlock(session, &txn_global->rwlock); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &oldest_ts)); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[1], &stable_ts)); WT_RET_MSG(session, EINVAL, - "set_timestamp: oldest timestamp must not be later than " - "stable timestamp"); + "set_timestamp: oldest timestamp %s must not be later than " + "stable timestamp %s", hex_timestamp[0], hex_timestamp[1]); } __wt_readunlock(session, &txn_global->rwlock); @@ -538,29 +551,41 @@ __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, { WT_TXN *txn = &session->txn; WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + wt_timestamp_t oldest_ts, stable_ts; char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 1]; - bool 
older_than_oldest_ts, older_than_stable_ts; + bool has_oldest_ts, has_stable_ts; + /* + * Added this redundant initialization to circumvent build failure. + */ + __wt_timestamp_set_zero(&oldest_ts); + __wt_timestamp_set_zero(&stable_ts); /* * Compare against the oldest and the stable timestamp. Return an error * if the given timestamp is older than oldest and/or stable timestamp. */ WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, - older_than_oldest_ts = - (cmp_oldest && txn_global->has_oldest_timestamp && - __wt_timestamp_cmp(ts, &txn_global->oldest_timestamp) < 0); - older_than_stable_ts = (cmp_stable && - txn_global->has_stable_timestamp && - __wt_timestamp_cmp(ts, &txn_global->stable_timestamp) < 0)); - - if (older_than_oldest_ts) + if ((has_oldest_ts = txn_global->has_oldest_timestamp)) + __wt_timestamp_set(&oldest_ts, &txn_global->oldest_timestamp); + if ((has_stable_ts = txn_global->has_stable_timestamp)) + __wt_timestamp_set(&stable_ts, &txn_global->stable_timestamp)); + + if (cmp_oldest && has_oldest_ts && + __wt_timestamp_cmp(ts, &oldest_ts) < 0) { + WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp, + &oldest_ts)); WT_RET_MSG(session, EINVAL, - "%s timestamp %.*s older than oldest timestamp", - name, (int)cval->len, cval->str); - if (older_than_stable_ts) + "%s timestamp %.*s older than oldest timestamp %s", + name, (int)cval->len, cval->str, hex_timestamp); + } + if (cmp_stable && has_stable_ts && + __wt_timestamp_cmp(ts, &stable_ts) < 0) { + WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp, + &stable_ts)); WT_RET_MSG(session, EINVAL, - "%s timestamp %.*s older than stable timestamp", - name, (int)cval->len, cval->str); + "%s timestamp %.*s older than stable timestamp %s", + name, (int)cval->len, cval->str, hex_timestamp); + } /* * Compare against the commit timestamp of the current transaction. diff --git a/src/third_party/wiredtiger/test/csuite/random_abort/main.c b/src/third_party/wiredtiger/test/csuite/random_abort/main.c index e98c0474582..e99ed5ecd4d 100644 --- a/src/third_party/wiredtiger/test/csuite/random_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/random_abort/main.c @@ -391,9 +391,13 @@ main(int argc, char *argv[]) */ for (last_key = UINT64_MAX;; ++count, last_key = key) { ret = fscanf(fp, "%" SCNu64 "\n", &key); - if (ret != EOF && ret != 1) - testutil_die(errno, "fscanf"); - if (ret == EOF) + /* + * Consider anything other than clear success in + * getting the key to be EOF. We've seen file system + * issues where the file ends with zeroes on a 4K + * boundary and does not return EOF but a ret of zero. 
+ */ + if (ret != 1) break; /* * If we're unlucky, the last line may be a partially diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c index acd5743ebe4..2cf9a69110c 100644 --- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c @@ -655,7 +655,8 @@ main(int argc, char *argv[]) "rm -rf ../%s.SAVE && mkdir ../%s.SAVE && " "cp -p WiredTigerLog.* ../%s.SAVE", home, home, home)); - (void)system(buf); + if ((status = system(buf)) < 0) + testutil_die(status, "system: %s", buf); printf("Open database, run recovery and verify content\n"); /* diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index ba5774e8a6a..565df91d46b 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -103,7 +103,7 @@ static CONFIG c[] = { { "cache_minimum", "minimum size of the cache in MB", - C_IGNORE, 1, 0, 100 * 1024, &g.c_cache_minimum, NULL }, + C_IGNORE, 0, 0, 100 * 1024, &g.c_cache_minimum, NULL }, { "checkpoints", "type of checkpoints (on | off | wiredtiger)", diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index a5493321d3c..671582dcb16 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -492,6 +492,12 @@ commit_transaction(TINFO *tinfo, WT_SESSION *session) char config_buf[64]; if (g.c_txn_timestamps) { + /* + * Update the thread's active timestamp with the current value + * to prevent the oldest timestamp moving past our allocated + * timestamp before the commit completes. + */ + tinfo->timestamp = g.timestamp; ts = __wt_atomic_addv64(&g.timestamp, 1); testutil_check(__wt_snprintf( config_buf, sizeof(config_buf), @@ -500,12 +506,12 @@ commit_transaction(TINFO *tinfo, WT_SESSION *session) session->commit_transaction(session, config_buf)); /* - * Update the thread's last-committed timestamp. Don't let the - * compiler re-order this statement, if we were to race with - * the timestamp thread, it might see our thread update before - * the transaction commit. + * Clear the thread's active timestamp: it no longer needs to + * be pinned. Don't let the compiler re-order this statement, + * if we were to race with the timestamp thread, it might see + * our thread update before the transaction commit. */ - WT_PUBLISH(tinfo->timestamp, ts); + WT_PUBLISH(tinfo->timestamp, 0); } else testutil_check(session->commit_transaction(session, NULL)); ++tinfo->commit; diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c index c21e58f84e4..b8343fee1d6 100644 --- a/src/third_party/wiredtiger/test/format/util.c +++ b/src/third_party/wiredtiger/test/format/util.c @@ -611,7 +611,7 @@ timestamp(void *arg) * Find the lowest committed timestamp. The timestamp thread * starts before the operational threads, wait for them. */ - oldest_timestamp = UINT64_MAX; + oldest_timestamp = g.timestamp; for (i = 0; i < g.c_threads; ++i) { tinfo = tinfo_list[i]; this_ts = tinfo->timestamp; @@ -619,14 +619,10 @@ timestamp(void *arg) this_ts < oldest_timestamp) oldest_timestamp = this_ts; } - if (oldest_timestamp == UINT64_MAX) { - __wt_sleep(1, 0); - continue; - } /* - * Don't get more than 100 transactions or more than 15 seconds - * out of date. 
+ * If less than 100 transactions out of date, wait up to 15 + * seconds before updating. */ WT_READ_BARRIER(); testutil_assert(oldest_timestamp <= g.timestamp); @@ -642,6 +638,7 @@ timestamp(void *arg) config_buf, sizeof(config_buf), "oldest_timestamp=%" PRIx64, oldest_timestamp)); testutil_check(conn->set_timestamp(conn, config_buf)); + __wt_seconds((WT_SESSION_IMPL *)session, &last); usecs = mmrand(NULL, 5, 40); __wt_sleep(0, usecs); diff --git a/src/third_party/wiredtiger/test/packing/intpack-test.c b/src/third_party/wiredtiger/test/packing/intpack-test.c index 4f6b7143108..7bc3f1f519b 100644 --- a/src/third_party/wiredtiger/test/packing/intpack-test.c +++ b/src/third_party/wiredtiger/test/packing/intpack-test.c @@ -40,6 +40,12 @@ main(void) memset(buf, 0xff, sizeof(buf)); /* -Werror=maybe-uninitialized */ + /* + * Required on some systems to pull in parts of the library + * for which we have data references. + */ + testutil_check(__wt_library_init()); + for (ncalls = 0, i = 0; i < 10000000; i++) { for (s = 0; s < 50; s += 5) { ++ncalls; diff --git a/src/third_party/wiredtiger/test/packing/intpack-test2.c b/src/third_party/wiredtiger/test/packing/intpack-test2.c index 1be6e78751c..b1f4b8756e7 100644 --- a/src/third_party/wiredtiger/test/packing/intpack-test2.c +++ b/src/third_party/wiredtiger/test/packing/intpack-test2.c @@ -38,6 +38,12 @@ main(void) memset(buf, 0xff, sizeof(buf)); /* -Werror=maybe-uninitialized */ + /* + * Required on some systems to pull in parts of the library + * for which we have data references. + */ + testutil_check(__wt_library_init()); + for (i = 1; i < 1LL << 60; i <<= 1) { end = buf; testutil_check( diff --git a/src/third_party/wiredtiger/test/packing/intpack-test3.c b/src/third_party/wiredtiger/test/packing/intpack-test3.c index d327c21a738..8076ca5cd52 100644 --- a/src/third_party/wiredtiger/test/packing/intpack-test3.c +++ b/src/third_party/wiredtiger/test/packing/intpack-test3.c @@ -45,6 +45,12 @@ test_value(int64_t val) sinput = val; soutput = 0; /* -Werror=maybe-uninitialized */ + /* + * Required on some systems to pull in parts of the library + * for which we have data references. + */ + testutil_check(__wt_library_init()); + p = buf; testutil_check(__wt_vpack_int(&p, sizeof(buf), sinput)); used_len = (size_t)(p - buf); diff --git a/src/third_party/wiredtiger/test/packing/packing-test.c b/src/third_party/wiredtiger/test/packing/packing-test.c index 7451aefc494..89946c4a64d 100644 --- a/src/third_party/wiredtiger/test/packing/packing-test.c +++ b/src/third_party/wiredtiger/test/packing/packing-test.c @@ -58,6 +58,12 @@ check(const char *fmt, ...) int main(void) { + /* + * Required on some systems to pull in parts of the library + * for which we have data references. + */ + testutil_check(__wt_library_init()); + check("iii", 0, 101, -99); check("3i", 0, 101, -99); check("iS", 42, "forty two"); diff --git a/src/third_party/wiredtiger/test/suite/test_compact02.py b/src/third_party/wiredtiger/test/suite/test_compact02.py index bb53ea06288..ffa05fb92db 100644 --- a/src/third_party/wiredtiger/test/suite/test_compact02.py +++ b/src/third_party/wiredtiger/test/suite/test_compact02.py @@ -146,12 +146,14 @@ class test_compact02(wttest.WiredTigerTestCase): self.session.checkpoint() # 5. Call compact. - # Compact can collide with eviction, if that happens we retry. - for i in range(1, 5): + # Compact can collide with eviction, if that happens we retry. Wait for + # up to a minute, the check for EBUSY should mean we're not retrying on + # real errors. 
+ for i in range(1, 15): if not self.raisesBusy( lambda: self.session.compact(self.uri, None)): break - time.sleep(2) + time.sleep(4) # 6. Get stats on compacted table. sz = self.getSize() diff --git a/src/third_party/wiredtiger/test/suite/test_config04.py b/src/third_party/wiredtiger/test/suite/test_config04.py index 11a36c2a5d2..c3d7e3b8f49 100644 --- a/src/third_party/wiredtiger/test/suite/test_config04.py +++ b/src/third_party/wiredtiger/test/suite/test_config04.py @@ -140,7 +140,6 @@ class test_config04(wttest.WiredTigerTestCase): def test_eviction(self): self.common_test('eviction_target=84,eviction_trigger=94') - # Note def test_eviction_bad(self): self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: @@ -154,6 +153,68 @@ class test_config04(wttest.WiredTigerTestCase): 'eviction_trigger=86'), "/eviction target must be lower than the eviction trigger/") + def test_eviction_absolute(self): + self.common_test('eviction_target=50MB,eviction_trigger=60MB,' + 'eviction_dirty_target=20MB,eviction_dirty_trigger=15MB,' + 'eviction_checkpoint_target=13MB') + + def test_eviction_abs_and_pct(self): + self.common_test('eviction_target=50,eviction_trigger=60MB,' + 'eviction_dirty_target=20,eviction_dirty_trigger=15MB') + + def test_eviction_abs_less_than_one_pct(self): + self.wiredtiger_open('.','create,cache_size=8GB,eviction_target=70MB,' + 'eviction_trigger=75MB') + + # Test that eviction_target must be lower than eviction_trigger + def test_eviction_absolute_bad(self): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: + self.wiredtiger_open('.','create,eviction_target=70MB,' + 'eviction_trigger=60MB'), + '/eviction target must be lower than the eviction trigger/') + + def test_eviction_abs_and_pct_bad(self): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: + self.wiredtiger_open('.','create,eviction_target=50,' + 'eviction_trigger=40MB'), + '/eviction target must be lower than the eviction trigger/') + + def test_eviction_abs_and_pct_bad2(self): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: + self.wiredtiger_open('.','create,eviction_target=50MB,' + 'eviction_trigger=40'), + '/eviction target must be lower than the eviction trigger/') + + def test_eviction_tgt_abs_too_large(self): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: + self.wiredtiger_open('.','create,cache_size=500MB,' + 'eviction_target=1G'), + '/eviction target should not exceed cache size/') + + def test_eviction_trigger_abs_too_large(self): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: + self.wiredtiger_open('.','create,cache_size=500MB,' + 'eviction_trigger=1G'), + '/eviction trigger should not exceed cache size/') + + def test_eviction_dirty_tgt_abs_too_large(self): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: + self.wiredtiger_open('.','create,cache_size=500MB,' + 'eviction_dirty_target=1G'), + '/eviction dirty target should not exceed cache size/') + + def test_eviction_dirty_trigggr_abs_too_large(self): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: + self.wiredtiger_open('.','create,cache_size=500MB,' + 'eviction_dirty_trigger=1G'), + '/eviction dirty trigger should not exceed cache size/') + + def test_eviction_checkpoint_tgt_abs_too_large(self): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: + self.wiredtiger_open('.','create,cache_size=500MB,' + 'eviction_checkpoint_target=1G'), + '/eviction checkpoint target should not exceed cache size/') + def 
test_invalid_config(self): msg = '/Unbalanced brackets/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, diff --git a/src/third_party/wiredtiger/test/suite/test_reconfig01.py b/src/third_party/wiredtiger/test/suite/test_reconfig01.py index 9c34f96c13e..fc78ea709a4 100644 --- a/src/third_party/wiredtiger/test/suite/test_reconfig01.py +++ b/src/third_party/wiredtiger/test/suite/test_reconfig01.py @@ -75,6 +75,13 @@ class test_reconfig01(wttest.WiredTigerTestCase): self.conn.reconfigure("eviction=(threads_min=2)") # Set min and max the same. self.conn.reconfigure("eviction=(threads_min=6,threads_max=6)") + # Set target and trigger with an absolute value. + self.conn.reconfigure("eviction_target=50M,eviction_trigger=100M") + # Set dirty target and trigger with an absolute value + self.conn.reconfigure("eviction_dirty_target=20M," + "eviction_dirty_trigger=40M") + # Set eviction checkpoint target with an absolute value + self.conn.reconfigure("eviction_checkpoint_target=50M") def test_reconfig_lsm_manager(self): # We create and populate a tiny LSM so that we can start off with diff --git a/src/third_party/wiredtiger/test/suite/test_shared_cache01.py b/src/third_party/wiredtiger/test/suite/test_shared_cache01.py index b6ed2289639..9ebdd5093fc 100644 --- a/src/third_party/wiredtiger/test/suite/test_shared_cache01.py +++ b/src/third_party/wiredtiger/test/suite/test_shared_cache01.py @@ -158,6 +158,39 @@ class test_shared_cache01(wttest.WiredTigerTestCase): self.add_records(sess, 0, nops) self.closeConnections() + # Opening a connection with absolute values for eviction config should fail + def test_shared_cache_absolute_evict_config(self): + nops = 1000 + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.openConnections(['WT_TEST1', 'WT_TEST2'], + pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),' + 'eviction_target=10M,'), '/Shared cache configuration requires a ' + 'percentage value for eviction target/') + + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.openConnections(['WT_TEST1', 'WT_TEST2'], + pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),' + 'eviction_trigger=10M,'), '/Shared cache configuration requires a ' + 'percentage value for eviction trigger/') + + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.openConnections(['WT_TEST1', 'WT_TEST2'], + pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),' + 'eviction_dirty_target=10M,'), '/Shared cache configuration ' + 'requires a percentage value for eviction dirty target/') + + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.openConnections(['WT_TEST1', 'WT_TEST2'], + pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),' + 'eviction_dirty_trigger=10M,'), '/Shared cache configuration ' + 'requires a percentage value for eviction dirty trigger/') + + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.openConnections(['WT_TEST1', 'WT_TEST2'], + pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),' + 'eviction_checkpoint_target=10M,'), '/Shared cache configuration ' + 'requires a percentage value for eviction checkpoint target/') + # Test verbose output @unittest.skip("Verbose output handling") def test_shared_cache_verbose(self): diff --git a/src/third_party/wiredtiger/test/suite/test_shared_cache02.py b/src/third_party/wiredtiger/test/suite/test_shared_cache02.py index 05f080b3323..3d5b29f1969 100644 --- a/src/third_party/wiredtiger/test/suite/test_shared_cache02.py +++ 
b/src/third_party/wiredtiger/test/suite/test_shared_cache02.py @@ -162,5 +162,35 @@ class test_shared_cache02(wttest.WiredTigerTestCase): self.closeConnections() + # Test reconfigure with absolute value for eviction config fails + def test_shared_cache_reconfig04(self): + nops = 1000 + self.openConnections(['WT_TEST1', 'WT_TEST2'], + pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),') + + for sess in self.sessions: + sess.create(self.uri, "key_format=S,value_format=S") + self.add_records(sess, 0, nops) + + connection = self.conns[0] + # Reconfiguring with absolute value of eviction trigger should fail. + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: connection.reconfigure("shared_cache=(name=pool," + "size=20M,reserve=10M),eviction_trigger=10M"),'/Shared cache ' + 'configuration requires a percentage value for eviction trigger/') + + connection = self.conns[1] + # Reconfiguring with absolute value for eviction target should fail. + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: connection.reconfigure("shared_cache=(name=pool," + "size=20M,reserve=10M),eviction_target=10M"),'/Shared cache ' + 'configuration requires a percentage value for eviction target/') + + # Reconfigure with percentage value for eviction target passes + self.conns[0].reconfigure("shared_cache=(name=pool,reserve=20M)," + "eviction_target=50") + + self.closeConnections() + if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp04.py b/src/third_party/wiredtiger/test/suite/test_timestamp04.py index 17aea80a1ee..48ec7fac9a6 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp04.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp04.py @@ -27,7 +27,7 @@ # OTHER DEALINGS IN THE SOFTWARE. # # test_timestamp04.py -# Timestamps: Test that rollback_to_stable obeys expected visibility rules +# Timestamps: Test that rollback_to_stable obeys expected visibility rules. # from suite_subprocess import suite_subprocess @@ -49,7 +49,7 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): ('V2', dict(conn_config=',log=(enabled)', using_log=True)), ] - # Minimum cache_size requirement of lsm is 31MB + # Minimum cache_size requirement of lsm is 31MB. types = [ ('col_fix', dict(empty=1, cacheSize='cache_size=20MB', extra_config=',key_format=r,value_format=8t')), ('col_var', dict(empty=0, cacheSize='cache_size=20MB', extra_config=',key_format=r')), @@ -68,14 +68,12 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): cur = session.open_cursor(tablename, None) if missing == False: actual = dict((k, v) for k, v in cur if v != 0) - if prn == True: - print "CHECK : Expected" - print expected - print "CHECK : Actual" - print actual + if actual != expected: + print "missing: ", sorted(set(expected) - set(actual)) + print "extras: ", sorted(set(actual) - set(expected)) self.assertTrue(actual == expected) - # Search for the expected items as well as iterating + # Search for the expected items as well as iterating. for k, v in expected.iteritems(): if missing == False: self.assertEqual(cur[k], v, "for key " + str(k)) @@ -114,7 +112,7 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): self.ConnectionOpen(self.cacheSize) # Configure small page sizes to ensure eviction comes through and we - # have a somewhat complex tree + # have a somewhat complex tree. 
config_default = 'key_format=i,value_format=i,memory_page_max=32k,leaf_page_max=8k,internal_page_max=8k' config_nolog = ',log=(enabled=false)' # @@ -133,7 +131,7 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): self.session.create(self.table_nots_nolog, config_default + config_nolog + self.extra_config) cur_nots_nolog = self.session.open_cursor(self.table_nots_nolog) - # Insert keys each with timestamp=key, in some order + # Insert keys each with timestamp=key, in some order. key_range = 10000 keys = range(1, key_range + 1) @@ -168,41 +166,41 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): self.conn.rollback_to_stable() # Check that we see the inserted value (i.e. 1) for all the keys in - # non-timestamp tables + # non-timestamp tables. self.check(self.session, 'read_timestamp=' + latest_ts, self.table_nots_log, dict((k, 1) for k in keys[:])) self.check(self.session, 'read_timestamp=' + latest_ts, self.table_nots_nolog, dict((k, 1) for k in keys[:])) # For non-logged tables the behavior is consistent across connections - # with or without log enabled + # with or without log enabled. # Check that we see the inserted value (i.e. 1) for the keys in a - # timestamp table till the stable_timestamp only. + # timestamped table until the stable_timestamp only. self.check(self.session, 'read_timestamp=' + latest_ts, self.table_ts_nolog, dict((k, 1) for k in keys[:(key_range / 2)])) self.check(self.session, 'read_timestamp=' + latest_ts, self.table_ts_nolog, dict((k, 1) for k in keys[(key_range / 2 + 1):]), missing=True) - # For logged tables behavior changes for rollback_to_stable based on + # For logged tables, the behavior of rollback_to_stable changes based on # whether connection level logging is enabled or not. if self.using_log == True: - # When log is enabled, none of the keys will be rolled back. - # Check that we see all the keys + # When the log is enabled, none of the keys will be rolled back. + # Check that we see all the keys. self.check(self.session, 'read_timestamp=' + latest_ts, self.table_ts_log, dict((k, 1) for k in keys[:])) else: - # When log is disabled, keys will be rolled back till stable_timestamp - # Check that we see the insertions are rolled back in timestamp tables - # till the stable_timestamp + # When the log is disabled, the keys will be rolled back until stable_timestamp. + # Check that we see the insertions are rolled back in timestamped tables + # until the stable_timestamp. self.check(self.session, 'read_timestamp=' + latest_ts, self.table_ts_log, dict((k, 1) for k in keys[:(key_range / 2)])) self.check(self.session, 'read_timestamp=' + latest_ts, self.table_ts_log, dict((k, 1) for k in keys[(key_range / 2 + 1):]), missing=True) - # Bump the oldest timestamp, we're not going back... + # Bump the oldest timestamp, we're not going back. self.conn.set_timestamp('oldest_timestamp=' + stable_ts) - # Update the values again in preparation for rolling back more + # Update the values again in preparation for rolling back more. for k in keys: cur_nots_log[k] = 2 cur_nots_nolog[k] = 2 @@ -212,7 +210,7 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): self.session.commit_transaction('commit_timestamp=' + timestamp_str(k + key_range)) # Scenario: 3 - # Check that we see all values updated (i.e 2) in all tables + # Check that we see all values updated (i.e 2) in all tables. 
latest_ts = timestamp_str(2 * key_range) self.check(self.session, 'read_timestamp=' + latest_ts, self.table_nots_log, dict((k, 2) for k in keys[:])) @@ -225,20 +223,20 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): # Scenario: 4 # Advance the stable_timestamp by a quarter range and rollback. - # three-quarter timestamps will be rolled back. + # Three-fourths of the later timestamps will be rolled back. stable_ts = timestamp_str(key_range + key_range / 4) self.conn.set_timestamp('stable_timestamp=' + stable_ts) self.conn.rollback_to_stable() # Check that we see the updated value (i.e. 2) for all the keys in - # non-timestamp tables + # non-timestamped tables. self.check(self.session, 'read_timestamp=' + latest_ts, self.table_nots_log, dict((k, 2) for k in keys[:])) self.check(self.session, 'read_timestamp=' + latest_ts, self.table_nots_nolog, dict((k, 2) for k in keys[:])) # For non-logged tables the behavior is consistent across connections - # with or without log enabled - # Check that we see only half key ranges in timestamp tables. we see + # with or without log enabled. + # Check that we see only half key ranges in timestamp tables. We see # the updated value (i.e. 2) for the first quarter keys and old values # (i.e. 1) for the second quarter keys. self.check(self.session, 'read_timestamp=' + latest_ts, @@ -251,12 +249,12 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): # whether connection level logging is enabled or not. if self.using_log == True: # When log is enabled, none of the keys will be rolled back. - # Check that we see all the keys + # Check that we see all the keys. self.check(self.session, 'read_timestamp=' + latest_ts, self.table_ts_log, dict((k, 2) for k in keys[:])) else: - # When log is disabled, keys will be rolled back till stable_timestamp - # Check that we see only half key ranges in timestamp tables. we see + # When log is disabled, keys will be rolled back until the stable_timestamp. + # Check that we see only half the key ranges in timestamped tables. We see # the updated value (i.e. 2) for the first quarter keys and old values # (i.e. 1) for the second quarter keys. self.check(self.session, 'read_timestamp=' + latest_ts, diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp09.py b/src/third_party/wiredtiger/test/suite/test_timestamp09.py index 5000eb4e854..9b7d88bf64e 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp09.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp09.py @@ -109,7 +109,7 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess): self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(3) + ',stable_timestamp=' + timestamp_str(1)), - '/oldest timestamp must not be later than stable timestamp/') + '/oldest timestamp 0*3 must not be later than stable timestamp 0*1/') # Oldest timestamp is 3 at the moment, trying to set it to an earlier # timestamp is a no-op. @@ -128,7 +128,7 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess): self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(6)), - '/oldest timestamp must not be later than stable timestamp/') + '/oldest timestamp 0*6 must not be later than stable timestamp 0*5/') # Commit timestamp >= Stable timestamp. # Check both timestamp_transaction and commit_transaction API. 
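The tightened message patterns above match the more detailed set_timestamp errors added in txn_timestamp.c, which now name both offending values in hex. A minimal standalone sketch of the call these assertions exercise (illustrative only: it assumes a timestamp-enabled build and a pre-created scratch directory WT_SKETCH, and re-declares the tests' timestamp_str helper):

    import wiredtiger

    def timestamp_str(t):
        return '%x' % t

    conn = wiredtiger.wiredtiger_open('WT_SKETCH', 'create')
    conn.set_timestamp('stable_timestamp=' + timestamp_str(5))
    try:
        # Oldest must not move past stable; the error reports both values.
        conn.set_timestamp('oldest_timestamp=' + timestamp_str(6))
    except wiredtiger.WiredTigerError as e:
        print(e)
    conn.close()

The oldest/commit and stable/commit checks in the same function report the offending pair of timestamps the same way.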
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp10.py b/src/third_party/wiredtiger/test/suite/test_timestamp10.py new file mode 100644 index 00000000000..de928b34220 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_timestamp10.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_timestamp10.py +# Timestamps: timestamp ordering +# + +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +def timestamp_str(t): + return '%x' % t + +class test_timestamp10(wttest.WiredTigerTestCase, suite_subprocess): + conn_config = 'verbose=[timestamp]' + def test_timestamp_range(self): + if not wiredtiger.timestamp_build() or not wiredtiger.diagnostic_build(): + self.skipTest('requires a timestamp and diagnostic build') + + base = 'timestamp10' + uri = 'file:' + base + # Create a data item at a timestamp + self.session.create(uri, 'key_format=S,value_format=S') + + # Insert a data item at timestamp 2 + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(2)) + c['key'] = 'value2' + self.session.commit_transaction() + c.close() + + # Modify the data item at timestamp 1 + # + # The docs say: + # The commits to a particular data item must be performed in timestamp + # order. Again, this is only checked in diagnostic builds and if + # applications violate this rule, data consistency can be violated. + # + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(1)) + c['key'] = 'value1' + msg='on new update is older than' + with self.expectedStdoutPattern(msg): + self.session.commit_transaction() + c.close() + + # Make sure we can successfully add a different key at timestamp 1. + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(1)) + c['key1'] = 'value1' + self.session.commit_transaction() + c.close() + + # + # Insert key2 at timestamp 10 and key3 at 15. + # Then modify both keys in one transaction at timestamp 14. 
+ # Modifying the one from 15 should report a warning message, but + # the update will be applied. + # + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(10)) + c['key2'] = 'value10' + self.session.commit_transaction() + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(15)) + c['key3'] = 'value15' + self.session.commit_transaction() + + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(14)) + c['key2'] = 'value14' + c['key3'] = 'value14' + with self.expectedStdoutPattern(msg): + self.session.commit_transaction() + c.close() + + c = self.session.open_cursor(uri) + self.assertEquals(c['key2'], 'value14') + self.assertEquals(c['key3'], 'value14') + c.close() + + # + # Separately, we should be able to update key2 at timestamp 16. + # + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(16)) + c['key2'] = 'value16' + self.session.commit_transaction() + + # Updating key3 inserted at timestamp 13 will report a warning. + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(13)) + c['key3'] = 'value13' + with self.expectedStdoutPattern(msg): + self.session.commit_transaction() + c.close() + + # Test that updating again with an invalid timestamp reports a warning. + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(12)) + c['key3'] = 'value12' + with self.expectedStdoutPattern(msg): + self.session.commit_transaction() + c.close() + + c = self.session.open_cursor(uri) + self.assertEquals(c['key3'], 'value12') + c.close() + + # Now try a later timestamp. + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(17)) + c['key3'] = 'value17' + self.session.commit_transaction() + c.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp11.py b/src/third_party/wiredtiger/test/suite/test_timestamp11.py new file mode 100644 index 00000000000..f98b7c47b2b --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_timestamp11.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_timestamp11.py +# Timestamps: mixed timestamp usage +# + +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +def timestamp_str(t): + return '%x' % t + +class test_timestamp11(wttest.WiredTigerTestCase, suite_subprocess): + def test_timestamp_range(self): + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + base = 'timestamp11' + uri = 'file:' + base + self.session.create(uri, 'key_format=S,value_format=S') + + # Test that mixed timestamp usage where some transactions use timestamps + # and others don't behave in the expected way. + + # Insert two data items at timestamp 2 + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(2)) + c['key'] = 'value2' + c['key2'] = 'value2' + self.session.commit_transaction() + c.close() + + # + # Modify one key without a timestamp and modify the other with a + # later timestamp. + # + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(5)) + c['key'] = 'value5' + self.session.commit_transaction() + c.close() + + c = self.session.open_cursor(uri) + self.session.begin_transaction() + c['key2'] = 'valueNOTS' + self.session.commit_transaction() + c.close() + + # + # Set the stable timestamp and then roll back to it. The first key + # should roll back to the original value and the second key should + # remain at the non-timestamped value. Also the non-timestamped value + # stays regardless of rollbacks or reading at a timestamp. + # + stable_ts = timestamp_str(2) + self.conn.set_timestamp('stable_timestamp=' + stable_ts) + self.conn.rollback_to_stable() + + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.assertEquals(c['key'], 'value2') + self.assertEquals(c['key2'], 'valueNOTS') + self.session.commit_transaction() + c.close() + + c = self.session.open_cursor(uri) + self.session.begin_transaction('read_timestamp=' + stable_ts) + self.assertEquals(c['key'], 'value2') + self.assertEquals(c['key2'], 'valueNOTS') + self.session.commit_transaction() + c.close() + + # + # Repeat but swapping the keys using or not using timestamps. + # + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(5)) + c['key2'] = 'value5' + self.session.commit_transaction() + c.close() + + c = self.session.open_cursor(uri) + self.session.begin_transaction() + c['key'] = 'valueNOTS' + self.session.commit_transaction() + c.close() + + # Read with each timestamp and without any timestamp. + # + # Without a timestamp. We should see the latest value for each. + c = self.session.open_cursor(uri) + self.session.begin_transaction() + self.assertEquals(c['key'], 'valueNOTS') + self.assertEquals(c['key2'], 'value5') + self.session.commit_transaction() + c.close() + + # With timestamp 2. Both non-timestamped values override the original + # value at timestamp 2. 
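The reads that follow verify a single rule: an update committed without a commit timestamp is visible at every read timestamp and is not undone by rollback_to_stable. Condensed into a standalone sketch (illustrative only: assumes a timestamp-enabled build and a scratch directory WT_SKETCH; the table and key names are made up):

    import wiredtiger

    conn = wiredtiger.wiredtiger_open('WT_SKETCH', 'create')
    session = conn.open_session()
    session.create('file:sketch', 'key_format=S,value_format=S')

    c = session.open_cursor('file:sketch')
    session.begin_transaction()
    c['key'] = 'value2'
    session.commit_transaction('commit_timestamp=2')    # timestamped update
    session.begin_transaction()
    c['key'] = 'valueNOTS'
    session.commit_transaction()                        # no commit timestamp
    c.close()

    conn.set_timestamp('stable_timestamp=2')
    conn.rollback_to_stable()               # leaves the non-timestamped update

    c = session.open_cursor('file:sketch')
    session.begin_transaction('read_timestamp=2')
    print(c['key'])                         # still 'valueNOTS'
    session.commit_transaction()
    c.close()
    conn.close()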
+ c = self.session.open_cursor(uri) + self.session.begin_transaction('read_timestamp=' + stable_ts) + self.assertEquals(c['key'], 'valueNOTS') + self.assertEquals(c['key2'], 'valueNOTS') + self.session.commit_transaction() + c.close() + + # With timestamp 5. We rolled back the first one and never re-inserted + # one at that timestamp and inserted without a timestamp. For the second + # we inserted at timestamp 5 after the non-timestamped insert. + c = self.session.open_cursor(uri) + self.session.begin_transaction('read_timestamp=' + timestamp_str(5)) + self.assertEquals(c['key'], 'valueNOTS') + self.assertEquals(c['key2'], 'value5') + self.session.commit_transaction() + c.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/tools/optrack/arrow-left.png b/src/third_party/wiredtiger/tools/optrack/arrow-left.png new file mode 100644 index 00000000000..315983e3118 Binary files /dev/null and b/src/third_party/wiredtiger/tools/optrack/arrow-left.png differ diff --git a/src/third_party/wiredtiger/tools/optrack/arrow-right.png b/src/third_party/wiredtiger/tools/optrack/arrow-right.png new file mode 100644 index 00000000000..e874d0f55fc Binary files /dev/null and b/src/third_party/wiredtiger/tools/optrack/arrow-right.png differ diff --git a/src/third_party/wiredtiger/tools/optrack/find-latency-spikes.py b/src/third_party/wiredtiger/tools/optrack/find-latency-spikes.py new file mode 100755 index 00000000000..5bb557ce21b --- /dev/null +++ b/src/third_party/wiredtiger/tools/optrack/find-latency-spikes.py @@ -0,0 +1,1063 @@ +#!/usr/bin/env python + +import argparse +from bokeh.layouts import column +from bokeh.models import ColumnDataSource, CustomJS, HoverTool, FixedTicker +from bokeh.models import Legend, LegendItem +from bokeh.models import NumeralTickFormatter, OpenURL, Range1d, TapTool +from bokeh.models.annotations import Label +from bokeh.plotting import figure, output_file, reset_output, save, show +from bokeh.resources import CDN +import matplotlib +import numpy as np +import os +import pandas as pd +import sys +import traceback + +# Names of the image files we use +arrowLeftImg = "arrow-left.png"; +arrowRightImg = "arrow-right.png"; + +# A directory where we store cross-file plots for each bucket of the outlier +# histogram. +# +bucketDir = "BUCKET-FILES"; + +# A static list of available CSS colors +colorList = []; + +# Codes for various colors for printing of informational and error messages. +# +class color: + PURPLE = '\033[95m' + CYAN = '\033[96m' + DARKCYAN = '\033[36m' + BLUE = '\033[94m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + END = '\033[0m' + +# A function name mapped to its corresponding color. +# +funcToColor = {}; +lastColorUsed = 0; + +# The smallest and the largest timestamps seen across all files. +# +firstTimeStamp = sys.maxsize; +lastTimeStamp = 0; + +# A dictionary that holds function-specific threshold values telling +# us when the function is to be considered an outlier. These values +# would be read from a config file, if supplied by the user. +# +outlierThresholdDict = {}; +outlierPrettyNames = {}; + +# A dictionary that holds a reference to the raw dataframe for each file. +# +perFileDataFrame = {}; + +# A dictionary that holds the intervals data per function. +# +perFuncDF = {}; + +# Data frames and largest stack depth for each file. 
+perFileDataFrame = {}; +perFileLargestStackDepth = {}; + +plotWidth = 1200; +pixelsForTitle = 30; +pixelsPerHeightUnit = 30; +pixelsPerWidthUnit = 5; + +# The coefficient by which we multiply the standard deviation when +# setting the outlier threshold, in case it is not specified by the user. +# +STDEV_MULT = 2; + + +def initColorList(): + + global colorList; + + colorList = matplotlib.colors.cnames.keys(); + + for color in colorList: + # Some browsers break if you try to give them 'sage' + if (color == "sage"): + colorList.remove(color); + +# +# Each unique function name gets a unique color. +# If we run out of colors, we repeat them from the +# beginning of the list. +# +def getColorForFunction(function): + + global colorList; + global lastColorUsed; + global funcToColor; + + if not funcToColor.has_key(function): + funcToColor[function] = colorList[lastColorUsed % len(colorList)]; + lastColorUsed += 1; + + return funcToColor[function]; + + +# +# An intervalEnd is a tuple of three items. +# item #0 is the timestamp, +# item #1 is the event type, +# item #2 is the function name. +# +def getIntervalData(intervalBeginningsStack, intervalEnd, logfile): + + errorOccurred = False; + matchFound = False; + + if (intervalEnd[1] != 1): + logfile.write( + "getIntervaldata: only rows with event type 1 can be used.\n"); + logfile.write(str(intervalEnd) + "\n"); + return None; + + if (len(intervalBeginningsStack) < 1): + logfile.write("Nothing on the intervalBeginningsStack. " + + "I cannot find the beginning for this interval.\n"); + logfile.write(str(intervalEnd) + "\n"); + return None; + + while (not matchFound): + intervalBegin = intervalBeginningsStack.pop(); + if (intervalBegin is None): + logfile.write("Could not find the matching operation begin record" + + " for the following operation end record: \n"); + logfile.write(str(intervalEnd) + "\n"); + return None; + if (intervalBegin[2] != intervalEnd[2]): + logfile.write("Operation end record does not match the available " + + "operation begin record. 
" + + "Your log file may be incomplete.\n" + + "Skipping the begin record.\n"); + logfile.write("Begin: " + str(intervalBegin) + "\n"); + logfile.write("End: " + str(intervalEnd) + "\n"); + errorOccurred = True; + else: + matchFound = True; + + # This value determines how deep we are in the callstack + # stackDepth = len(intervalBeginningsStack); + + return intervalBegin[0], intervalEnd[0], intervalEnd[2], errorOccurred; + +def plotOutlierHistogram(dataframe, maxOutliers, func, durationThreshold, + averageDuration, maxDuration): + + global pixelsForTitle; + global pixelsPerHeightUnit; + global plotWidth; + + cds = ColumnDataSource(dataframe); + + figureTitle = "Occurrences of " + func + " that took longer than " \ + + durationThreshold + "."; + + hover = HoverTool(tooltips = [ + ("interval start", "@lowerbound{0,0}"), + ("interval end", "@upperbound{0,0}")]); + + TOOLS = [hover, "tap, reset"]; + + p = figure(title = figureTitle, plot_width = plotWidth, + plot_height = min(500, (max(5, (maxOutliers + 1)) \ + * pixelsPerHeightUnit + \ + pixelsForTitle)), + x_axis_label = "Execution timeline (CPU cycles)", + y_axis_label = "Number of outliers", tools = TOOLS); + + y_ticker_max = p.plot_height / pixelsPerHeightUnit; + y_ticker_step = max(1, (maxOutliers + 1)/y_ticker_max); + y_upper_bound = (maxOutliers / y_ticker_step + 1) * y_ticker_step; + + p.yaxis.ticker = FixedTicker(ticks = + range(0, y_upper_bound, y_ticker_step)); + p.ygrid.ticker = FixedTicker(ticks = + range(0, y_upper_bound, y_ticker_step)); + p.xaxis.formatter = NumeralTickFormatter(format="0,"); + + p.y_range = Range1d(0, y_upper_bound); + + p.quad(left = 'lowerbound', right = 'upperbound', bottom = 'bottom', + top = 'height', color = funcToColor[func], source = cds, + nonselection_fill_color=funcToColor[func], + nonselection_fill_alpha = 1.0, + line_color = "lightgrey", + selection_fill_color = funcToColor[func], + selection_line_color="grey" + ); + + # Add an annotation to the chart + # + y_max = dataframe['height'].max(); + text = "Average duration: " + '{0:,.0f}'.format(averageDuration) + \ + ". Maximum duration: " + '{0:,.0f}'.format(maxDuration) + "."; + mytext = Label(x=0, y=y_upper_bound-y_ticker_step, text=text, + text_color = "grey", text_font = "helvetica", + text_font_size = "10pt", + text_font_style = "italic"); + p.add_layout(mytext); + + url = "@bucketfiles"; + taptool = p.select(type=TapTool); + taptool.callback = OpenURL(url=url); + + return p; + +# From all timestamps subtract the smallest observed timestamp, so that +# our execution timeline begins at zero. +# Cleanup the data to remove incomplete records and fix their effects. +# +def normalizeIntervalData(): + + global firstTimeStamp; + global perFileDataFrame; + + print(color.BLUE + color.BOLD + "Normalizing data..." + color.END); + + for file, df in perFileDataFrame.iteritems(): + df['origstart'] = df['start']; + df['start'] = df['start'] - firstTimeStamp; + df['end'] = df['end'] - firstTimeStamp; + +def reportDataError(logfile, logfilename): + + if (logfile is not sys.stdout): + print(color.BOLD + color.RED + "Your data may have errors. " + + "Check the file " + logfilename + " for details." + color.END); + return True; + +# +# Go over all operation records in the dataframe and assign stack depths. 
+# +def assignStackDepths(dataframe): + + stack = []; + + df = dataframe.sort_values(by=['start']); + df = df.reset_index(drop = True); + + for i in range(len(df.index)): + + myStartTime = df.at[i, 'start']; + + # Pop all items off stack whose end time is earlier than my + # start time. They are not part of my stack, so I don't want to + # count them. + # + while (len(stack) > 0 and stack[-1] < myStartTime): + stack.pop(); + + df.at[i, 'stackdepth'] = len(stack); + stack.append(df.at[i, 'end']); + + return df; + +def createCallstackSeries(data, logfilename): + + global firstTimeStamp; + global lastTimeStamp; + + colors = []; + beginIntervals = []; + dataFrame = None; + endIntervals = []; + errorReported = False; + functionNames = []; + intervalBeginningsStack = []; + largestStackDepth = 0; + logfile = None; + thisIsFirstRow = True; + + # Let's open the log file. + try: + logfile = open(logfilename, "w"); + except: + logfile = sys.stdout; + + for row in data.itertuples(): + # row[0] is the timestamp, row[1] is the event type, + # row[2] is the function name. + # + if (row[1] == 0): + intervalBeginningsStack.append(row); + elif (row[1] == 1): + try: + intervalBegin, intervalEnd, function, error\ + = getIntervalData(intervalBeginningsStack, row, logfile); + if (error and (not errorReported)): + errorReported = reportDataError(logfile, logfilename); + except: + if (not errorReported): + errorReported = reportDataError(logfile, logfilename); + continue; + + if (intervalBegin < firstTimeStamp): + firstTimeStamp = intervalBegin; + if (intervalEnd > lastTimeStamp): + lastTimeStamp = intervalEnd; + + colors.append(getColorForFunction(function)); + beginIntervals.append(intervalBegin); + endIntervals.append(intervalEnd); + functionNames.append(function); + #stackDepths.append(stackDepth); + #stackDepthsNext.append(stackDepth + 1); + + #print("Begin: " + str(intervalBegin)), + #print(" Func: " + function), + #print(" Stack depth: " + str(stackDepth)); + + else: + print("Invalid event in this line:"); + print(str(row[0]) + " " + str(row[1]) + " " + str(row[2])); + continue; + + if (len(intervalBeginningsStack) > 0): + logfile.write(str(len(intervalBeginningsStack)) + " operations had a " + + "begin record, but no matching end records. " + + "Please check that your operation tracking macros " + + "are properly inserted.\n"); + if (not errorReported): + errorReported = reportDataError(logfile, logfilename); + intervalBeginningsStack = []; + + dict = {}; + dict['color'] = colors; + dict['start'] = beginIntervals; + dict['end'] = endIntervals; + dict['function'] = functionNames; + dict['stackdepth'] = [0] * len(beginIntervals); + + dataframe = pd.DataFrame(data=dict); + dataframe = assignStackDepths(dataframe); + + dataframe['durations'] = dataframe['end'] - dataframe['start']; + dataframe['stackdepthNext'] = dataframe['stackdepth'] + 1; + + return dataframe; + +def addLegend(p, legendItems, numLegends): + + legend = Legend(items=legendItems, orientation = "horizontal"); + p.add_layout(legend, place='above'); + legendItems[:] = []; # Empty the list. + + return (numLegends + 1); + +# For each function we only show the legend once. In this dictionary we +# keep track of colors already used. 
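As a standalone restatement of the stack-depth rule that assignStackDepths() above applies to the operation dataframe (the intervals here are made up): an operation's depth is the number of operations still open at the moment it starts.

    intervals = [              # (start, end, function), sorted by start time
        (100, 900, "checkpoint"),
        (150, 400, "evict_page"),
        (200, 300, "page_write"),
        (500, 600, "log_flush"),
    ]
    stack = []
    for start, end, func in intervals:
        while stack and stack[-1] < start:   # drop operations that already ended
            stack.pop()
        print("%s stackdepth=%d" % (func, len(stack)))
        stack.append(end)
    # Prints depths 0, 1, 2, 1: log_flush drops back to depth 1 because
    # page_write and evict_page ended before it started.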
+# +colorAlreadyUsedInLegend = {}; + +def generateBucketChartForFile(figureName, dataframe, y_max, x_min, x_max): + + global colorAlreadyUsedInLegend; + global funcToColor; + + MAX_ITEMS_PER_LEGEND = 5; + numLegends = 0; + legendItems = []; + pixelsPerStackLevel = 30; + pixelsPerLegend = 60; + pixelsForTitle = 30; + + cds = ColumnDataSource(dataframe); + + hover = HoverTool(tooltips=[ + ("function", "@function"), + ("duration", "@durations{0,0}"), + ("log file begin timestamp", "@origstart{0,0}") + ]); + + TOOLS = [hover]; + + p = figure(title=figureName, plot_width=1200, + x_range = (x_min, x_max), + y_range = (0, y_max+1), + x_axis_label = "Time (CPU cycles)", + y_axis_label = "Stack depth", + tools = TOOLS + ); + + # No minor ticks or labels on the y-axis + p.yaxis.major_tick_line_color = None; + p.yaxis.minor_tick_line_color = None; + p.yaxis.major_label_text_font_size = '0pt'; + p.yaxis.ticker = FixedTicker(ticks = range(0, y_max+1)); + p.ygrid.ticker = FixedTicker(ticks = range(0, y_max+1)); + + p.xaxis.formatter = NumeralTickFormatter(format="0,") + + p.quad(left = 'start', right = 'end', bottom = 'stackdepth', + top = 'stackdepthNext', color = 'color', line_color = "lightgrey", + line_width = 0.5, source=cds); + + for func, fColor in funcToColor.iteritems(): + + # If this function is not present in this dataframe, + # we don't care about it. + # + boolVec = (dataframe['function'] == func); + fDF = dataframe[boolVec]; + if (fDF.size == 0): + continue; + + # If we already added a color to any legend, we don't + # add it again to avoid redundancy in the charts and + # in order not to waste space. + # + if (colorAlreadyUsedInLegend.has_key(fColor)): + continue; + else: + colorAlreadyUsedInLegend[fColor] = True; + + r = p.quad(left=0, right=1, bottom=0, top=1, color=fColor); + + lItem = LegendItem(label = func, + renderers = [r]); + legendItems.append(lItem); + + # Cap the number of items in a legend, so it can + # fit horizontally. + if (len(legendItems) == MAX_ITEMS_PER_LEGEND): + numLegends = addLegend(p, legendItems, numLegends); + + # Add whatever legend items did not get added + if (len(legendItems) > 0): + numLegends = addLegend(p, legendItems, numLegends); + + # Plot height is the function of the maximum call stack and the number of + # legends + p.plot_height = (numLegends * pixelsPerLegend) \ + + max((y_max+1) * pixelsPerStackLevel, 100) \ + + pixelsForTitle; + + return p; + +def generateEmptyDataset(): + + dict = {}; + dict['color'] = [0]; + dict['durations'] = [0]; + dict['start'] = [0]; + dict['end'] = [0]; + dict['function'] = [""]; + dict['stackdepth'] = [0]; + dict['stackdepthNext'] = [0]; + + return pd.DataFrame(data=dict); + +# When we have no data for a trace interva we generate an empty file +# for that interval. +# +def createNoDataFile(filename): + + try: + f = open(filename, "w"); + except: + print(color.RED + color.BOLD), + exc_type, exc_value, exc_traceback = sys.exc_info() + traceback.print_exception(exc_type, exc_value, exc_traceback); + print("Could not open file " + filename + " for writing."); + print(color.END); + return; + + f.write("\n"); + f.write("

"); + f.write("No data was generated for this trace interval.

\n"); + f.write("\n"); + f.close() +# +# Here we generate plots that span all the input files. Each plot shows +# the timelines for all files, stacked vertically. The timeline shows +# the function callstacks over time from this file. +# +# Since a single timeline is too large to fit on a single screen, we generate +# a separate HTML file with plots for bucket "i". A bucket is a vertical slice +# across the timelines for all files. We call it a bucket, because it +# corresponds to a bucket in the outlier histogram. +# +def generateCrossFilePlotsForBucket(i, lowerBound, upperBound): + + global bucketDir; + global colorAlreadyUsedInLegend; + + figuresForAllFiles = []; + fileName = bucketDir + "/bucket-" + str(i) + ".html"; + + reset_output(); + + # The following dictionary keeps track of legends. We need + # a legend for each new HTML file. So we reset the dictionary + # before generating a new file. + # + colorAlreadyUsedInLegend = {}; + + intervalTitle = "Interval " + "{:,}".format(lowerBound) + \ + " to " + "{:,}".format(upperBound) + \ + " CPU cycles"; + + # Select from the dataframe for this file the records whose 'start' + # and 'end' timestamps fall within the lower and upper bound. + # + for fname in sorted(perFileDataFrame.keys()): + + fileDF = perFileDataFrame[fname]; + + # Select operations whose start timestamp falls within + # the current interval, delimited by lowerBound and upperBound. + # + startInBucket = fileDF.loc[(fileDF['start'] >= lowerBound) + & (fileDF['start'] < upperBound)]; + + # Select operations whose end timestamp falls within + # the current interval, delimited by lowerBound and upperBound. + # + endInBucket = fileDF.loc[(fileDF['end'] > lowerBound) + & (fileDF['end'] <= upperBound)]; + + # Select operations that begin before this interval and end after + # this interval, but continue throughout this interval. The interval + # is delimited by lowerBound and upperBound. + # + spanBucket = fileDF.loc[(fileDF['start'] < lowerBound) + & (fileDF['end'] > upperBound)]; + + frames = [startInBucket, endInBucket, spanBucket]; + bucketDF = pd.concat(frames).drop_duplicates().reset_index(drop=True); + + if (bucketDF.size == 0): + continue; + + # If the end of the function is outside the interval, let's pretend + # that it is within the interval, otherwise we won't see any data about + # it when we hover. This won't have the effect of showing wrong + # data to the user. + # + mask = bucketDF.end >= upperBound; + bucketDF.loc[mask, 'end'] = upperBound-1; + + # Same adjustment as above if the start of the operation falls outside + # the interval's lower bound. + # + mask = bucketDF.start < lowerBound; + bucketDF.loc[mask, 'start'] = lowerBound; + + largestStackDepth = bucketDF['stackdepthNext'].max(); + figureTitle = fname + ": " + intervalTitle; + + figure = generateBucketChartForFile(figureTitle, bucketDF, + largestStackDepth, + lowerBound, upperBound); + + figuresForAllFiles.append(figure); + + if (len(figuresForAllFiles) > 0): + savedFileName = save(column(figuresForAllFiles), + filename = fileName, title=intervalTitle, + resources=CDN); + else: + createNoDataFile(fileName); + + return fileName; + +# Generate plots of time series slices across all files for each bucket +# in the outlier histogram. Save each cross-file slice to an HTML file. 
+# +def generateTSSlicesForBuckets(): + + global firstTimeStamp; + global lastTimeStamp; + global plotWidth; + global pixelsPerWidthUnit; + + bucketFilenames = []; + + numBuckets = plotWidth / pixelsPerWidthUnit; + timeUnitsPerBucket = (lastTimeStamp - firstTimeStamp) / numBuckets; + + for i in range(numBuckets): + lowerBound = i * timeUnitsPerBucket; + upperBound = (i+1) * timeUnitsPerBucket; + + fileName = generateCrossFilePlotsForBucket(i, lowerBound, + upperBound); + + percentComplete = float(i) / float(numBuckets) * 100; + print(color.BLUE + color.BOLD + " Generating timeline charts... "), + sys.stdout.write("%d%% complete \r" % (percentComplete) ); + sys.stdout.flush(); + bucketFilenames.append(fileName); + + print(color.END); + + return bucketFilenames; + +# Here we are making a line that will be inserted into an HTML file for +# a given bucket (execution slice). This line will have links to the +# previous slice and to the next slice, so we can navigate between slices +# by clicking those links. +# +def makeLineWithLinks(previous, next): + + global arrowLeftImg; + global arrowRightImg; + + previousLink = ""; + nextLink = ""; + + # Strip the directory component out of the file name. + # + if previous is not None: + words = previous.split("/"); + previousStripped = words[len(words)-1]; + previousLink = "" + \ + "

 "; + + + if next is not None: + words = next.split("/"); + nextStripped = words[len(words)-1]; + nextLink = "" + \ + "

 "; + + line = previousLink + " " + nextLink + "\n"; + return line; + + +# Into the current file insert links to the previous one and to the next one. +# The rewritten file is saved under a new file name. +# +def linkFiles(current, previous, next): + + curFile = None; + newFile = None; + newFileName = current + ".new"; + + try: + curFile = open(current, "r"); + except: + print(color.RED + color.BOLD), + exc_type, exc_value, exc_traceback = sys.exc_info() + traceback.print_exception(exc_type, exc_value, exc_traceback); + print("Could not open file " + current + " for reading."); + print(color.END); + return None; + + try: + newFile = open(newFileName, "w"); + except: + print(color.RED + color.BOLD), + exc_type, exc_value, exc_traceback = sys.exc_info() + traceback.print_exception(exc_type, exc_value, exc_traceback); + print("Could not open file " + newFileName + " for writing."); + print(color.END); + return None; + + curFileLines = curFile.readlines(); + + for i in range(len(curFileLines)): + line = curFileLines[i]; + + insertedLine = makeLineWithLinks(previous, next); + + if "" in line: + curFileLines.insert(i+1, insertedLine); + elif "" in line: + curFileLines.insert(i, insertedLine); + + for line in curFileLines: + newFile.write(line); + + curFile.close(); + newFile.close(); + + os.rename(newFileName, current); + +# We have a list of bucket files. Each one is an HTML file showing a slice of +# the execution. To be able to easily navigate between consecutive execution +# slices we insert links into each slice-file that take us to the previous +# slice and to the next slice. +# +def interlinkFiles(fnameList): + + for i in range(len(fnameList)): + current = fnameList[i]; + + if i > 0: + previous = fnameList[i-1]; + else: + previous = None; + + if (i < len(fnameList)-1): + next = fnameList[i+1]; + else: + next = None; + + linkFiles(current, previous, next); + +def processFile(fname): + + global perFileDataFrame; + global perFuncDF; + + rawData = pd.read_csv(fname, + header=None, delimiter=" ", + index_col=2, + names=["Event", "Function", "Timestamp"], + dtype={"Event": np.int32, "Timestamp": np.int64}, + thousands=","); + + print(color.BOLD + color.BLUE + + "Processing file " + str(fname) + color.END); + iDF = createCallstackSeries(rawData, "." + fname + ".log"); + + perFileDataFrame[fname] = iDF; + + for func in funcToColor.keys(): + + funcDF = iDF.loc[lambda iDF: iDF.function == func, :]; + funcDF = funcDF.drop(columns = ['function']); + + if (not perFuncDF.has_key(func)): + perFuncDF[func] = funcDF; + else: + perFuncDF[func] = pd.concat([perFuncDF[func], funcDF]); + + +# +# For each function, split the timeline into buckets. In each bucket +# show how many times this function took an unusually long time to +# execute. +# +# The parameter durationThreshold tells us when a function should be +# considered as unusually long. If this parameter is "-1" we count +# all functions whose duration exceeded the average by more than +# two standard deviations. +# +def createOutlierHistogramForFunction(func, funcDF, bucketFilenames): + + global firstTimeStamp; + global lastTimeStamp; + global plotWidth; + global pixelsPerWidthUnit; + global STDEV_MULT; + + durationThreshold = 0; + durationThresholdDescr = ""; + + # + # funcDF is a list of functions along with their start and end + # interval and durations. We need to create a new dataframe where + # we separate the entire timeline into a fixed number of periods + # and for each period compute how many outlier durations were + # observed. 
Then we create a histogram from this data. + + # Subtract the smallest timestamp from all the interval data. + funcDF['start'] = funcDF['start'] - firstTimeStamp; + funcDF['end'] = funcDF['end'] - firstTimeStamp; + + funcDF = funcDF.sort_values(by=['start']); + + averageDuration = funcDF['durations'].mean(); + maxDuration = funcDF['durations'].max(); + + if (outlierThresholdDict.has_key(func)): + durationThreshold = outlierThresholdDict[func]; + durationThresholdDescr = outlierPrettyNames[func]; + elif (outlierThresholdDict.has_key("*")): + durationThreshold = outlierThresholdDict["*"]; + durationThresholdDescr = outlierPrettyNames["*"]; + else: + # Signal that we will use standard deviation + durationThreshold = -STDEV_MULT; + + if (durationThreshold < 0): # this is a stdev multiplier + mult = -durationThreshold; + stdDev = funcDF['durations'].std(); + durationThreshold = averageDuration + mult * stdDev; + durationThresholdDescr = '{0:,.0f}'.format(durationThreshold) \ + + " measurement units (" + str(mult) + \ + " standard deviations)"; + + numBuckets = plotWidth / pixelsPerWidthUnit; + timeUnitsPerBucket = (lastTimeStamp - firstTimeStamp) / numBuckets; + lowerBounds = []; + upperBounds = []; + bucketHeights = []; + maxOutliers = 0; + + for i in range(numBuckets): + lowerBound = i * timeUnitsPerBucket; + upperBound = (i+1) * timeUnitsPerBucket; + + bucketDF = funcDF.loc[(funcDF['start'] >= lowerBound) + & (funcDF['start'] < upperBound) + & (funcDF['durations'] >= durationThreshold)]; + + numOutliers = bucketDF.size; + if (numOutliers > maxOutliers): + maxOutliers = numOutliers; + + lowerBounds.append(lowerBound); + upperBounds.append(upperBound); + bucketHeights.append(numOutliers); + + if (maxOutliers == 0): + return None; + + dict = {}; + dict['lowerbound'] = lowerBounds; + dict['upperbound'] = upperBounds; + dict['height'] = bucketHeights; + dict['bottom'] = [0] * len(lowerBounds); + dict['bucketfiles'] = bucketFilenames; + + dataframe = pd.DataFrame(data=dict); + + return plotOutlierHistogram(dataframe, maxOutliers, func, + durationThresholdDescr, averageDuration, + maxDuration); +# +# The configuration file tells us which functions should be considered +# outliers. All comment lines must begin with '#'. +# +# The first non-comment line of the file must tell us how to interpret +# the measurement units in the trace file. It must have a single number +# telling us how many time units are contained in a second. This should +# be the same time units used in the trace file. For example, if the trace +# file contains timestamps measured in milliseconds, the number would be 1000. +# If timestamps were measured in clock cycles, as is typically done, the number +# must tell us how many times the CPU clock ticks per second on the processor +# where the trace was gathered. +# +# The remaining lines must have the format: +# [units] +# +# For example, if you would like to flag as outliers all instances of +# __cursor_row_search that took longer than 200ms, you would specify this as: +# +# __cursor_row_search 200 ms +# +# You can use * as the wildcard for all function. No other wildcard options are +# supported at the moment. +# +# Acceptable units are: +# +# s -- for seconds +# ms -- for milliseconds +# us -- for microseconds +# ns -- for nanoseconds +# stdev -- for standard deviations. +# +# If no units are supplied, the same unit as the one used for the timestamp +# in the trace files is assumed. 
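# (Editorial illustration, not part of the imported file.)  A complete little
# configuration file might look like the following -- the first value assumes,
# purely for example, a 2 GHz processor whose timestamp counter ticks
# 2,000,000,000 times per second:
#
#     # outlier thresholds for the latency visualizer
#     2000000000
#     __cursor_row_search 200 ms
#     * 2 stdev
#
# With this file, __cursor_row_search instances longer than 200 ms are flagged
# using that explicit threshold, while every other function falls back to the
# wildcard rule of two standard deviations above its own average, because the
# per-function entry is consulted before the "*" entry.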
+# +# If there is a valid configuration file, but the function does not appear in +# it, we will not generate an outlier histogram for this function. Use the +# wildcard symbol to include all functions. +# +def parseConfigFile(fname): + + global outlierThresholdDict; + global outlierPrettyNames; + + configFile = None; + firstNonCommentLine = True; + unitsPerSecond = -1; + unitsPerMillisecond = 0.0; + unitsPerMicrosecond = 0.0; + unitsPerNanosecond = 0.0; + + try: + configFile = open(fname, "r"); + except: + print(color.BOLD + color.RED + + "Could not open " + fname + " for reading." + color.END); + return False; + + for line in configFile: + + if (line[0] == "#"): + continue; + elif (firstNonCommentLine): + try: + unitsPerSecond = int(line); + unitsPerMillisecond = unitsPerSecond / 1000; + unitsPerMicrosecond = unitsPerSecond / 1000000; + unitsPerNanosecond = unitsPerSecond / 1000000000; + + firstNonCommentLine = False; + except ValueError: + print(color.BOLD + color.RED + + "Could not parse the number of measurement units " + + "per second. This must be the first value in the " + + "config file." + color.END); + return False; + else: + func = ""; + number = 0; + threshold = 0.0; + units = ""; + + words = line.split(); + try: + func = words[0]; + number = int(words[1]); + units = words[2]; + except ValueError: + print(color.BOLD + color.RED + + "While parsing the config file, could not understand " + + "the following line: " + color.END); + print(line); + continue; + + # Now convert the number to the baseline units and record in the + # dictionary. + # + if (units == "s"): + threshold = unitsPerSecond * number; + elif (units == "ms"): + threshold = unitsPerMillisecond * number; + elif (units == "us"): + threshold = unitsPerMicrosecond * number; + elif (units == "ns"): + threshold = unitsPerNanosecond * number; + elif (units == "stdev"): + threshold = -units; + # We record it as negative, so that we know + # this is a standard deviation. We will compute + # the actual value once we know the average. + else: + print(color.BOLD + color.RED + + "While parsing the config file, could not understand " + + "the following line: " + color.END); + print(line); + continue; + + outlierThresholdDict[func] = threshold; + outlierPrettyNames[func] = str(number) + " " + units; + + # We were given an empty config file + if (firstNonCommentLine): + return False; + + print outlierThresholdDict; + return True; + + +def main(): + + global arrowLeftImg; + global arrowRightImg; + global bucketDir; + global perFuncDF; + + configSupplied = False; + figuresForAllFunctions = []; + + # Set up the argument parser + # + parser = argparse.ArgumentParser(description= + 'Visualize operation log'); + parser.add_argument('files', type=str, nargs='*', + help='log files to process'); + parser.add_argument('-c', '--config', dest='configFile', default=''); + args = parser.parse_args(); + + if (len(args.files) == 0): + parser.print_help(); + sys.exit(1); + + # Get names of standard CSS colors that we will use for the legend + initColorList(); + + # Read the configuration file, if supplied. + if (args.configFile != ''): + configSupplied = parseConfigFile(args.configFile); + + if (not configSupplied): + pluralSuffix = ""; + if (STDEV_MULT > 1): + pluralSuffix = "s"; + print(color.BLUE + color.BOLD + + "Will deem as outliers all function instances whose runtime " + + "was " + str(STDEV_MULT) + " standard deviation" + pluralSuffix + + " greater than the average runtime for that function." 
+ + color.END); + + + # Create a directory for the files that display the data summarized + # in each bucket of the outlier histogram. We call these "bucket files". + # + if not os.path.exists(bucketDir): + os.makedirs(bucketDir); + + # Copy the image files that we will need later into bucketDir + scriptLocation = os.path.dirname(os.path.realpath(__file__)); + os.system("cp " + scriptLocation + "/" + arrowLeftImg + " " + bucketDir + + "/" + arrowLeftImg); + os.system("cp " + scriptLocation + "/" + arrowRightImg + " " + bucketDir + + "/" + arrowRightImg); + + # Parallelize this later, so we are working on files in parallel. + for fname in args.files: + processFile(fname); + + # Normalize all intervals by subtracting the first timestamp. + normalizeIntervalData(); + + # Generate plots of time series slices across all files for each bucket + # in the outlier histogram. Save each cross-file slice to an HTML file. + # + fileNameList = generateTSSlicesForBuckets(); + + # Rewrite the files, so that they have links to one another. This way + # you can navigate from one slice to the next by clicking the link inside + # the file. + # + interlinkFiles(fileNameList); + + totalFuncs = len(perFuncDF.keys()); + i = 0; + # Generate a histogram of outlier durations + for func in sorted(perFuncDF.keys()): + funcDF = perFuncDF[func]; + figure = createOutlierHistogramForFunction(func, funcDF, fileNameList); + if (figure is not None): + figuresForAllFunctions.append(figure); + + i += 1; + percentComplete = float(i) / float(totalFuncs) * 100; + print(color.BLUE + color.BOLD + " Generating outlier histograms... "), + sys.stdout.write("%d%% complete \r" % (percentComplete) ); + sys.stdout.flush(); + + print(color.END); + reset_output(); + output_file(filename = "WT-outliers.html", title="Outlier histograms"); + show(column(figuresForAllFunctions)); + +if __name__ == '__main__': + main() + + + diff --git a/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py b/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py new file mode 100755 index 00000000000..ebfd4af05b7 --- /dev/null +++ b/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +import argparse +import colorsys +from multiprocessing import Process +import multiprocessing +import os +import os.path +import struct +import sys +import subprocess +import time +import traceback + +# +# This log version must be the same as that defined in ../src/include/optrack.h +# +currentLogVersion = 1; + +class color: + PURPLE = '\033[95m' + CYAN = '\033[96m' + DARKCYAN = '\033[36m' + BLUE = '\033[94m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + END = '\033[0m' + +functionMap = {}; + +def buildTranslationMap(mapFileName): + + mapFile = None; + + if not os.path.exists(mapFileName): + return False; + + try: + mapFile = open(mapFileName, "r"); + except: + print(color.BOLD + color.RED); + print("Could not open " + mapFileName + " for reading"); + print(color.END); + return; + + # Read lines from the map file and build an in-memory map + # of translations. Each line has a function ID followed by space and + # followed by the function name. + # + lines = mapFile.readlines(); # a map file is usually small + + for line in lines: + + words = line.split(" "); + if (len(words) < 2): + continue; + + try: + funcID = int(words[0]); + except: + continue; + + funcName = words[1].strip(); + + functionMap[funcID] = funcName; + + return True; + +def funcIDtoName(funcID): + + if (functionMap.has_key(funcID)): + return functionMap[funcID]; + else: + return "NULL"; + +# +# The format of the record is written down in src/include/optrack.h +# file in the WiredTiger source tree. The current implementation assumes +# a record of three fields. The first field is the 8-byte timestamp. +# The second field is the 2-byte function ID. The third field is the +# 2-byte operation type: '0' for function entry, '1' for function exit. +# The record size would be padded to 16 bytes in the C implementation by +# the compiler, because we keep an array of records, and each new record +# has to be 8-byte aligned, since the first field has the size 8 bytes. +# So we explicitly pad the track record structure in the implementation +# to make it clear what the record size is. 
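# --- Editorial sketch, not part of the imported file -----------------------
# What one 16-byte record looks like when packed and unpacked with the same
# 'Qhhxxxx' layout that parseOneRecord() below relies on: an 8-byte timestamp,
# a 2-byte function ID, a 2-byte operation type and four pad bytes.  The field
# values are made up for illustration.
import struct   # already imported above; repeated so the sketch stands alone

exampleBytes = struct.pack('Qhhxxxx', 123456789, 7, 0);    # entry into func 7
assert len(exampleBytes) == 16;                            # matches RECORD_SIZE

exampleTimestamp, exampleFuncID, exampleOpType = \
    struct.unpack('Qhhxxxx', exampleBytes);
# exampleTimestamp == 123456789, exampleFuncID == 7, exampleOpType == 0
# ----------------------------------------------------------------------------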
+# +def parseOneRecord(file): + + bytesRead = ""; + record = (); + RECORD_SIZE = 16; + + try: + bytesRead = file.read(RECORD_SIZE); + except: + return None; + + if (len(bytesRead) < RECORD_SIZE): + return None; + + record = struct.unpack('Qhhxxxx', bytesRead); + + return record; + +# +# HEADER_SIZE must be the same as the size of WT_OPTRACK_HEADER +# structure defined in ../src/include/optrack.h +# +def validateHeader(file): + + global currentLogVersion; + + bytesRead = ""; + HEADER_SIZE = 8; + + try: + bytesRead = file.read(HEADER_SIZE); + except: + return False, -1; + + if (len(bytesRead) < HEADER_SIZE): + return False, -1; + + version, threadType = struct.unpack('II', bytesRead); + + if (version == currentLogVersion): + return True, threadType; + else: + return False, -1; + +def getStringFromThreadType(threadType): + + if (threadType == 0): + return "external"; + elif (threadType == 1): + return "internal"; + else: + return unknown; + + +def parseFile(fileName): + + done = False; + file = None; + threadType = 0; + threadTypeString = None; + outputFile = None; + outputFileName = ""; + totalRecords = 0; + validVersion = False; + + print(color.BOLD + "Processing file " + fileName + color.END); + + # Open the log file for reading + try: + file = open(fileName, "r"); + except: + print(color.BOLD + color.RED + + "Could not open " + fileName + " for reading" + color.END); + return; + + # Read and validate log header + validVersion, threadType = validateHeader(file); + if (not validVersion): + return; + + threadTypeString = getStringFromThreadType(threadType); + + # Open the text file for writing + try: + outputFileName = fileName + "-" + threadTypeString + ".txt"; + outputFile = open(outputFileName, "w"); + except: + print(color.BOLD + color.RED + + "Could not open file " + outputfileName + ".txt for writing." + + color.END); + return; + + print(color.BOLD + color.PURPLE + + "Writing to output file " + outputFileName + "." 
+ color.END); + + while (not done): + record = parseOneRecord(file); + + if ((record is None) or len(record) < 3): + done = True; + else: + try: + time = record[0]; + funcName = funcIDtoName(record[1]); + opType = record[2]; + + outputFile.write(str(opType) + " " + funcName + " " + str(time) + + "\n"); + totalRecords += 1; + except: + exc_type, exc_value, exc_traceback = sys.exc_info() + traceback.print_exception(exc_type, exc_value, exc_traceback); + print(color.BOLD + color.RED); + print("Could not write record " + str(record) + + " to file " + fileName + ".txt."); + print(color.END); + done = True; + + print("Wrote " + str(totalRecords) + " records to " + outputFileName + "."); + file.close(); + outputFile.close(); + +def waitOnOneProcess(runningProcesses): + + success = False; + for fname, p in runningProcesses.items(): + if (not p.is_alive()): + del runningProcesses[fname]; + success = True; + + # If we have not found a terminated process, sleep for a while + if (not success): + time.sleep(5); + +def main(): + + runnableProcesses = {}; + returnValues = {}; + spawnedProcesses = {}; + successfullyProcessedFiles = []; + targetParallelism = multiprocessing.cpu_count(); + terminatedProcesses = {}; + + parser = argparse.ArgumentParser(description= + 'Convert WiredTiger operation \ + tracking logs from binary to \ + text format.'); + + parser.add_argument('files', type=str, nargs='*', + help='optrack log files to process'); + + parser.add_argument('-j', dest='jobParallelism', type=int, + default='0'); + + parser.add_argument('-m', '--mapfile', dest='mapFileName', type=str, + default='optrack-map'); + + args = parser.parse_args(); + + print("Running with the following parameters:"); + for key, value in vars(args).items(): + print ("\t" + key + ": " + str(value)); + + # Parse the map of function ID to name translations. + if (buildTranslationMap(args.mapFileName) is False): + print("Failed to locate or parse the map file " + + args.mapFileName); + print("Cannot proceed."); + return; + + # Determine the target job parallelism + if (args.jobParallelism > 0): + targetParallelism = args.jobParallelism; + if (targetParallelism == 0): + targetParallelism = len(args.files); + print(color.BLUE + color.BOLD + + "Will process " + str(targetParallelism) + " files in parallel." + + color.END); + + # Prepare the processes that will parse files, one per file + if (len(args.files) > 0): + for fname in args.files: + p = Process(target=parseFile, args=(fname,)); + runnableProcesses[fname] = p; + + # Spawn these processes, not exceeding the desired parallelism + while (len(runnableProcesses) > 0): + while (len(spawnedProcesses) < targetParallelism + and len(runnableProcesses) > 0): + + fname, p = runnableProcesses.popitem(); + p.start(); + spawnedProcesses[fname] = p; + + # Find at least one terminated process + waitOnOneProcess(spawnedProcesses); + + # Wait for all processes to terminate + while (len(spawnedProcesses) > 0): + waitOnOneProcess(spawnedProcesses); + +if __name__ == '__main__': + main() diff --git a/src/third_party/wiredtiger/tools/wt_optrack_decode.py b/src/third_party/wiredtiger/tools/wt_optrack_decode.py deleted file mode 100755 index ebfd4af05b7..00000000000 --- a/src/third_party/wiredtiger/tools/wt_optrack_decode.py +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env python -# -# Public Domain 2014-2018 MongoDB, Inc. -# Public Domain 2008-2014 WiredTiger, Inc. -# -# This is free and unencumbered software released into the public domain. 
-# -# Anyone is free to copy, modify, publish, use, compile, sell, or -# distribute this software, either in source code form or as a compiled -# binary, for any purpose, commercial or non-commercial, and by any -# means. -# -# In jurisdictions that recognize copyright laws, the author or authors -# of this software dedicate any and all copyright interest in the -# software to the public domain. We make this dedication for the benefit -# of the public at large and to the detriment of our heirs and -# successors. We intend this dedication to be an overt act of -# relinquishment in perpetuity of all present and future rights to this -# software under copyright law. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. - -import argparse -import colorsys -from multiprocessing import Process -import multiprocessing -import os -import os.path -import struct -import sys -import subprocess -import time -import traceback - -# -# This log version must be the same as that defined in ../src/include/optrack.h -# -currentLogVersion = 1; - -class color: - PURPLE = '\033[95m' - CYAN = '\033[96m' - DARKCYAN = '\033[36m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - END = '\033[0m' - -functionMap = {}; - -def buildTranslationMap(mapFileName): - - mapFile = None; - - if not os.path.exists(mapFileName): - return False; - - try: - mapFile = open(mapFileName, "r"); - except: - print(color.BOLD + color.RED); - print("Could not open " + mapFileName + " for reading"); - print(color.END); - return; - - # Read lines from the map file and build an in-memory map - # of translations. Each line has a function ID followed by space and - # followed by the function name. - # - lines = mapFile.readlines(); # a map file is usually small - - for line in lines: - - words = line.split(" "); - if (len(words) < 2): - continue; - - try: - funcID = int(words[0]); - except: - continue; - - funcName = words[1].strip(); - - functionMap[funcID] = funcName; - - return True; - -def funcIDtoName(funcID): - - if (functionMap.has_key(funcID)): - return functionMap[funcID]; - else: - return "NULL"; - -# -# The format of the record is written down in src/include/optrack.h -# file in the WiredTiger source tree. The current implementation assumes -# a record of three fields. The first field is the 8-byte timestamp. -# The second field is the 2-byte function ID. The third field is the -# 2-byte operation type: '0' for function entry, '1' for function exit. -# The record size would be padded to 16 bytes in the C implementation by -# the compiler, because we keep an array of records, and each new record -# has to be 8-byte aligned, since the first field has the size 8 bytes. -# So we explicitly pad the track record structure in the implementation -# to make it clear what the record size is. 
-# -def parseOneRecord(file): - - bytesRead = ""; - record = (); - RECORD_SIZE = 16; - - try: - bytesRead = file.read(RECORD_SIZE); - except: - return None; - - if (len(bytesRead) < RECORD_SIZE): - return None; - - record = struct.unpack('Qhhxxxx', bytesRead); - - return record; - -# -# HEADER_SIZE must be the same as the size of WT_OPTRACK_HEADER -# structure defined in ../src/include/optrack.h -# -def validateHeader(file): - - global currentLogVersion; - - bytesRead = ""; - HEADER_SIZE = 8; - - try: - bytesRead = file.read(HEADER_SIZE); - except: - return False, -1; - - if (len(bytesRead) < HEADER_SIZE): - return False, -1; - - version, threadType = struct.unpack('II', bytesRead); - - if (version == currentLogVersion): - return True, threadType; - else: - return False, -1; - -def getStringFromThreadType(threadType): - - if (threadType == 0): - return "external"; - elif (threadType == 1): - return "internal"; - else: - return unknown; - - -def parseFile(fileName): - - done = False; - file = None; - threadType = 0; - threadTypeString = None; - outputFile = None; - outputFileName = ""; - totalRecords = 0; - validVersion = False; - - print(color.BOLD + "Processing file " + fileName + color.END); - - # Open the log file for reading - try: - file = open(fileName, "r"); - except: - print(color.BOLD + color.RED + - "Could not open " + fileName + " for reading" + color.END); - return; - - # Read and validate log header - validVersion, threadType = validateHeader(file); - if (not validVersion): - return; - - threadTypeString = getStringFromThreadType(threadType); - - # Open the text file for writing - try: - outputFileName = fileName + "-" + threadTypeString + ".txt"; - outputFile = open(outputFileName, "w"); - except: - print(color.BOLD + color.RED + - "Could not open file " + outputfileName + ".txt for writing." + - color.END); - return; - - print(color.BOLD + color.PURPLE + - "Writing to output file " + outputFileName + "." 
+ color.END); - - while (not done): - record = parseOneRecord(file); - - if ((record is None) or len(record) < 3): - done = True; - else: - try: - time = record[0]; - funcName = funcIDtoName(record[1]); - opType = record[2]; - - outputFile.write(str(opType) + " " + funcName + " " + str(time) - + "\n"); - totalRecords += 1; - except: - exc_type, exc_value, exc_traceback = sys.exc_info() - traceback.print_exception(exc_type, exc_value, exc_traceback); - print(color.BOLD + color.RED); - print("Could not write record " + str(record) + - " to file " + fileName + ".txt."); - print(color.END); - done = True; - - print("Wrote " + str(totalRecords) + " records to " + outputFileName + "."); - file.close(); - outputFile.close(); - -def waitOnOneProcess(runningProcesses): - - success = False; - for fname, p in runningProcesses.items(): - if (not p.is_alive()): - del runningProcesses[fname]; - success = True; - - # If we have not found a terminated process, sleep for a while - if (not success): - time.sleep(5); - -def main(): - - runnableProcesses = {}; - returnValues = {}; - spawnedProcesses = {}; - successfullyProcessedFiles = []; - targetParallelism = multiprocessing.cpu_count(); - terminatedProcesses = {}; - - parser = argparse.ArgumentParser(description= - 'Convert WiredTiger operation \ - tracking logs from binary to \ - text format.'); - - parser.add_argument('files', type=str, nargs='*', - help='optrack log files to process'); - - parser.add_argument('-j', dest='jobParallelism', type=int, - default='0'); - - parser.add_argument('-m', '--mapfile', dest='mapFileName', type=str, - default='optrack-map'); - - args = parser.parse_args(); - - print("Running with the following parameters:"); - for key, value in vars(args).items(): - print ("\t" + key + ": " + str(value)); - - # Parse the map of function ID to name translations. - if (buildTranslationMap(args.mapFileName) is False): - print("Failed to locate or parse the map file " + - args.mapFileName); - print("Cannot proceed."); - return; - - # Determine the target job parallelism - if (args.jobParallelism > 0): - targetParallelism = args.jobParallelism; - if (targetParallelism == 0): - targetParallelism = len(args.files); - print(color.BLUE + color.BOLD + - "Will process " + str(targetParallelism) + " files in parallel." - + color.END); - - # Prepare the processes that will parse files, one per file - if (len(args.files) > 0): - for fname in args.files: - p = Process(target=parseFile, args=(fname,)); - runnableProcesses[fname] = p; - - # Spawn these processes, not exceeding the desired parallelism - while (len(runnableProcesses) > 0): - while (len(spawnedProcesses) < targetParallelism - and len(runnableProcesses) > 0): - - fname, p = runnableProcesses.popitem(); - p.start(); - spawnedProcesses[fname] = p; - - # Find at least one terminated process - waitOnOneProcess(spawnedProcesses); - - # Wait for all processes to terminate - while (len(spawnedProcesses) > 0): - waitOnOneProcess(spawnedProcesses); - -if __name__ == '__main__': - main() -- cgit v1.2.1