summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger
diff options
context:
space:
mode:
authorAlex Gorrod <alexander.gorrod@mongodb.com>2017-08-01 16:42:49 +1000
committerAlex Gorrod <alexander.gorrod@mongodb.com>2017-08-01 16:42:49 +1000
commit835bfb21d8e67663d84a40aa4f7370a4403725a9 (patch)
tree4f5edb231524f95272f834e31461ba4e17e52903 /src/third_party/wiredtiger
parent6300b3bd4ad9cd238a02bdb8ca681a447913f1af (diff)
downloadmongo-835bfb21d8e67663d84a40aa4f7370a4403725a9.tar.gz
Import wiredtiger: 2e9744d11a65c63ba7445060dc78371250f04051 from branch mongodb-3.6
ref: 6173a98979..2e9744d11a for: 3.5.11 WT-2309 Add yields and/or sleeps in #DIAGNOSTIC mode WT-3047 Add mode aimed at uncovering race conditions in split code WT-3308 Add statistics tracking around yield loops WT-3316 Add new engineering section to reference guide documentation WT-3338 Optimize cursor modify WT-3380 Special case 8-byte timestamps WT-3387 Add support for a stable timestamp WT-3389 Restructure split code to hold a split generation for the entire operation. WT-3406 Reconciliation is choosing reserved records for writing. WT-3410 Add developer documentation for table rename WT-3412 Add backoff logic to the btree delete and walk yield loops WT-3418 block manager object race WT-3422 WiredTiger upgrading documents out of date WT-3432 workgen needs braces around an "if" body WT-3433 session->alter method should not be supported in read-only mode WT-3439 lint/cleanup WT-3440 Add a log record when starting a checkpoint WT-3442 Coverity 1378213: false positive on diagnostic assignment. WT-3446 Temporarily disable timestamp testing in test/checkpoint WT-3447 test_stat_log02 can assert before table stats are printed WT-3461 Avoid long sleeps when the system clock is adjusted WT-3463 Add recovery of backup to test_timestamp03.py WT-3466 Track the first commit timestamp for each transaction WT-3467 Minor lint/cleanup
Diffstat (limited to 'src/third_party/wiredtiger')
-rw-r--r--src/third_party/wiredtiger/.gitignore2
-rw-r--r--src/third_party/wiredtiger/bench/workgen/workgen.cxx7
-rw-r--r--src/third_party/wiredtiger/build_posix/configure.ac.in40
-rw-r--r--src/third_party/wiredtiger/build_win/wiredtiger_config.h3
-rw-r--r--src/third_party/wiredtiger/dist/api_data.py33
-rw-r--r--src/third_party/wiredtiger/dist/filelist2
-rw-r--r--src/third_party/wiredtiger/dist/flags.py8
-rw-r--r--src/third_party/wiredtiger/dist/log.py66
-rw-r--r--src/third_party/wiredtiger/dist/log_data.py15
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_docs_plantuml58
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_longlines1
-rw-r--r--src/third_party/wiredtiger/dist/s_string.ok6
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_style5
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_whitespace1
-rw-r--r--src/third_party/wiredtiger/dist/stat_data.py8
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/lang/python/wiredtiger.i1
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curnext.c41
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curprev.c41
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c214
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c82
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_delete.c24
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_ovfl.c24
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_random.c7
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_read.c7
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_ret.c119
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c383
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_stat.c31
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_walk.c25
-rw-r--r--src/third_party/wiredtiger/src/btree/row_modify.c15
-rw-r--r--src/third_party/wiredtiger/src/config/config_def.c104
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_api.c16
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_dhandle.c122
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_log.c4
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_sweep.c2
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_file.c43
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_log.c14
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_std.c116
-rw-r--r--src/third_party/wiredtiger/src/docs/Doxyfile4
-rw-r--r--src/third_party/wiredtiger/src/docs/devdoc-index.dox12
-rw-r--r--src/third_party/wiredtiger/src/docs/devdoc-schema.dox208
-rw-r--r--src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_create.pngbin0 -> 124661 bytes
-rw-r--r--src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_generic.pngbin0 -> 13799 bytes
-rw-r--r--src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_rename.pngbin0 -> 113981 bytes
-rw-r--r--src/third_party/wiredtiger/src/docs/introduction.dox4
-rw-r--r--src/third_party/wiredtiger/src/docs/spell.ok7
-rw-r--r--src/third_party/wiredtiger/src/docs/transactions.dox9
-rw-r--r--src/third_party/wiredtiger/src/docs/upgrade.dox12
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c13
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_page.c2
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_stat.c2
-rw-r--r--src/third_party/wiredtiger/src/include/bitstring.i3
-rw-r--r--src/third_party/wiredtiger/src/include/btmem.h67
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i25
-rw-r--r--src/third_party/wiredtiger/src/include/buf.i24
-rw-r--r--src/third_party/wiredtiger/src/include/cell.i9
-rw-r--r--src/third_party/wiredtiger/src/include/cursor.i7
-rw-r--r--src/third_party/wiredtiger/src/include/dhandle.h6
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h31
-rw-r--r--src/third_party/wiredtiger/src/include/extern_posix.h2
-rw-r--r--src/third_party/wiredtiger/src/include/extern_win.h2
-rw-r--r--src/third_party/wiredtiger/src/include/flags.h2
-rw-r--r--src/third_party/wiredtiger/src/include/misc.h22
-rw-r--r--src/third_party/wiredtiger/src/include/misc.i39
-rw-r--r--src/third_party/wiredtiger/src/include/serial.i15
-rw-r--r--src/third_party/wiredtiger/src/include/stat.h8
-rw-r--r--src/third_party/wiredtiger/src/include/txn.h29
-rw-r--r--src/third_party/wiredtiger/src/include/txn.i100
-rw-r--r--src/third_party/wiredtiger/src/include/verify_build.h10
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in123
-rw-r--r--src/third_party/wiredtiger/src/include/wt_internal.h2
-rw-r--r--src/third_party/wiredtiger/src/log/log.c19
-rw-r--r--src/third_party/wiredtiger/src/log/log_auto.c310
-rw-r--r--src/third_party/wiredtiger/src/log/log_sys.c36
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_manager.c4
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_work_unit.c2
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c33
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_time.c27
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_time.c15
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c391
-rw-r--r--src/third_party/wiredtiger/src/session/session_api.c36
-rw-r--r--src/third_party/wiredtiger/src/session/session_dhandle.c17
-rw-r--r--src/third_party/wiredtiger/src/support/modify.c200
-rw-r--r--src/third_party/wiredtiger/src/support/stat.c29
-rw-r--r--src/third_party/wiredtiger/src/support/time.c69
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c42
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ckpt.c38
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_log.c95
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c38
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_timestamp.c270
-rwxr-xr-xsrc/third_party/wiredtiger/test/checkpoint/smoke.sh13
-rw-r--r--src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c14
-rw-r--r--src/third_party/wiredtiger/test/csuite/Makefile.am3
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c317
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c16
-rw-r--r--src/third_party/wiredtiger/test/format/bdb.c4
-rw-r--r--src/third_party/wiredtiger/test/format/bulk.c8
-rw-r--r--src/third_party/wiredtiger/test/format/config.c28
-rw-r--r--src/third_party/wiredtiger/test/format/format.h12
-rw-r--r--src/third_party/wiredtiger/test/format/lrt.c8
-rw-r--r--src/third_party/wiredtiger/test/format/ops.c374
-rw-r--r--src/third_party/wiredtiger/test/format/t.c5
-rw-r--r--src/third_party/wiredtiger/test/format/util.c101
-rw-r--r--src/third_party/wiredtiger/test/suite/test_compat01.py2
-rw-r--r--src/third_party/wiredtiger/test/suite/test_cursor12.py428
-rw-r--r--src/third_party/wiredtiger/test/suite/test_readonly03.py9
-rw-r--r--src/third_party/wiredtiger/test/suite/test_stat_log02.py23
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp02.py7
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp03.py271
109 files changed, 4276 insertions, 1529 deletions
diff --git a/src/third_party/wiredtiger/.gitignore b/src/third_party/wiredtiger/.gitignore
index 082145e66d0..204cd421fd1 100644
--- a/src/third_party/wiredtiger/.gitignore
+++ b/src/third_party/wiredtiger/.gitignore
@@ -54,6 +54,7 @@ build_posix/wt
tags
# Documentation
+/dist/plantuml.jar
/docs/
/src/docs/changelog.md
/src/docs/doxygen.log
@@ -119,6 +120,7 @@ _wiredtiger.pyd
**/test/csuite/test_wt3120_filesys
**/test/csuite/test_wt3135_search_near_collator
**/test/csuite/test_wt3184_dup_index_collator
+**/test/csuite/test_wt3338_partial_update
**/test/csuite/test_wt3363_checkpoint_op_races
**/test/cursor_order/cursor_order
**/test/fops/t
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.cxx b/src/third_party/wiredtiger/bench/workgen/workgen.cxx
index 1a0735f9adc..ce9debcca2f 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen.cxx
+++ b/src/third_party/wiredtiger/bench/workgen/workgen.cxx
@@ -925,8 +925,11 @@ void Operation::describe(std::ostream &os) const {
}
if (!_config.empty())
os << ", '" << _config;
- if (_transaction != NULL)
- os << ", ["; _transaction->describe(os); os << "]";
+ if (_transaction != NULL) {
+ os << ", [";
+ _transaction->describe(os);
+ os << "]";
+ }
if (_group != NULL) {
os << ", group[" << _repeatgroup << "]: {";
bool first = true;
diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in
index 4de12d5161e..c33d013f6df 100644
--- a/src/third_party/wiredtiger/build_posix/configure.ac.in
+++ b/src/third_party/wiredtiger/build_posix/configure.ac.in
@@ -75,7 +75,7 @@ AC_SUBST(AM_LIBTOOLFLAGS)
# WiredTiger uses anonymous unions to pad structures. It's part of C11, but
# some compilers require -std=c11 to support them. Turn on that flag for any
# compiler that supports it, except for Solaris, where gcc -std=c11 makes
-# some none-C11 prototypes unavailable.
+# some non-C11 prototypes unavailable.
if test "$wt_cv_solaris" = "no"; then
AX_CHECK_COMPILE_FLAG([-std=c11], [AM_CFLAGS="$AM_CFLAGS -std=c11"])
fi
@@ -170,6 +170,44 @@ AS_CASE([$host_os], [darwin*], [], [AC_CHECK_FUNCS([fdatasync])])
# the generic declaration in AC_CHECK_FUNCS is incompatible.
AX_FUNC_POSIX_MEMALIGN
+# Check for POSIX condition variables with monotonic clock support
+AC_CACHE_CHECK([for condition waits with monotonic clock support],
+ [wt_cv_pthread_cond_monotonic],
+ [AC_RUN_IFELSE([AC_LANG_SOURCE([[
+#include <errno.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <time.h>
+
+int main()
+{
+ int ret;
+ pthread_condattr_t condattr;
+ pthread_cond_t cond;
+ pthread_mutex_t mtx;
+ struct timespec ts;
+
+ if ((ret = pthread_condattr_init(&condattr)) != 0) exit(1);
+ if ((ret = pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC)) != 0) exit(1);
+ if ((ret = pthread_cond_init(&cond, &condattr)) != 0) exit(1);
+ if ((ret = pthread_mutex_init(&mtx, NULL)) != 0) exit(1);
+ if ((ret = clock_gettime(CLOCK_MONOTONIC, &ts)) != 0) exit(1);
+ ts.tv_sec += 1;
+ if ((ret = pthread_mutex_lock(&mtx)) != 0) exit(1);
+ if ((ret = pthread_cond_timedwait(&cond, &mtx, &ts)) != 0 && ret != EINTR && ret != ETIMEDOUT) exit(1);
+
+ exit(0);
+}
+ ]])],
+ [wt_pthread_cond_monotonic=yes],
+ [wt_pthread_cond_monotonic=no],
+ [wt_pthread_cond_monotonic=no])])
+AC_MSG_RESULT($wt_pthread_cond_monotonic)
+if test "$wt_pthread_cond_monotonic" = "yes" ; then
+ AC_DEFINE([HAVE_PTHREAD_COND_MONOTONIC], [1],
+ [Define to 1 if pthread condition variables support monotonic clocks.])
+fi
+
AC_SYS_LARGEFILE
AC_C_BIGENDIAN
diff --git a/src/third_party/wiredtiger/build_win/wiredtiger_config.h b/src/third_party/wiredtiger/build_win/wiredtiger_config.h
index b118cfa9882..fa8425c4936 100644
--- a/src/third_party/wiredtiger/build_win/wiredtiger_config.h
+++ b/src/third_party/wiredtiger/build_win/wiredtiger_config.h
@@ -79,6 +79,9 @@
/* Define to 1 if you have the <memory.h> header file. */
/* #undef HAVE_MEMORY_H */
+/* Define to 1 if pthread condition variables support monotonic clocks. */
+/* #undef HAVE_PTHREAD_COND_MONOTONIC */
+
/* Define to 1 if you have the `posix_fadvise' function. */
/* #undef HAVE_POSIX_FADVISE */
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 9e873198f33..9fd043d5b25 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -407,15 +407,6 @@ connection_runtime_config = [
Config('release', '', r'''
compatibility release version string'''),
]),
- Config('diagnostic_timing_stress', '', r'''
- enable insertion of code that interrupts the usual timing of
- operations with a goal of uncovering race conditions and unexpected
- blocking. This option is intended for use with internal stress
- testing of WiredTiger. Only available if WiredTiger is configured
- with --enable-diagnostic. Options are given as a list, such as
- <code>"diagnostic_timing_stress=[checkpoint_slow]"</code>''',
- type='list', undoc=True, choices=[
- 'checkpoint_slow']),
Config('error_prefix', '', r'''
prefix string for error messages'''),
Config('eviction', '', r'''
@@ -529,6 +520,15 @@ connection_runtime_config = [
@ref statistics for more information''',
type='list',
choices=['all', 'cache_walk', 'fast', 'none', 'clear', 'tree_walk']),
+ Config('timing_stress_for_test', '', r'''
+ enable code that interrupts the usual timing of operations with a
+ goal of uncovering race conditions and unexpected blocking.
+ This option is intended for use with internal stress
+ testing of WiredTiger. Options are given as a list, such as
+ <code>"timing_stress_for_test=[checkpoint_slow,
+ internal_page_split_race, page_split_race]"</code>''',
+ type='list', undoc=True, choices=[
+ 'checkpoint_slow', 'internal_page_split_race', 'page_split_race']),
Config('verbose', '', r'''
enable messages for various events. Only available if WiredTiger
is configured with --enable-verbose. Options are given as a
@@ -891,7 +891,7 @@ methods = {
not available immediately''',
type='boolean', undoc=True),
Config('remove_files', 'true', r'''
- should the underlying files be removed?''',
+ if the underlying files should be removed''',
type='boolean'),
]),
@@ -1151,6 +1151,12 @@ methods = {
undoc=True),
Config('target', '', r'''
if non-empty, checkpoint the list of objects''', type='list'),
+ Config('use_timestamp', 'true', r'''
+ by default, create the checkpoint as of the last stable timestamp
+ if timestamps are in use, or all current updates if there is no
+ stable timestamp set. If false, this option generates a checkpoint
+ with all updates including those later than the timestamp''',
+ type='boolean'),
]),
'WT_SESSION.snapshot' : Method([
@@ -1249,7 +1255,12 @@ methods = {
Config('oldest_timestamp', '', r'''
future commits and queries will be no earlier than the specified
timestamp. Supplied values must be monotonically increasing.
- see @ref transaction_timestamps'''),
+ See @ref transaction_timestamps'''),
+ Config('stable_timestamp', '', r'''
+ future checkpoints will be no later than the specified
+ timestamp. Supplied values must be monotonically increasing.
+ The stable timestamp data stability only applies to tables
+ that are not being logged. See @ref transaction_timestamps'''),
]),
'WT_SESSION.reconfigure' : Method(session_config),
diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist
index 309339ac877..f96bb8b6f2f 100644
--- a/src/third_party/wiredtiger/dist/filelist
+++ b/src/third_party/wiredtiger/dist/filelist
@@ -187,12 +187,14 @@ src/support/hash_fnv.c
src/support/hazard.c
src/support/hex.c
src/support/huffman.c
+src/support/modify.c
src/support/mtx_rw.c
src/support/pow.c
src/support/rand.c
src/support/scratch.c
src/support/stat.c
src/support/thread_group.c
+src/support/time.c
src/txn/txn.c
src/txn/txn_ckpt.c
src/txn/txn_ext.c
diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py
index 43db7a67054..48952768c18 100644
--- a/src/third_party/wiredtiger/dist/flags.py
+++ b/src/third_party/wiredtiger/dist/flags.py
@@ -8,9 +8,6 @@ flags = {
###################################################
# Internal routine flag declarations
###################################################
- 'diagnostic_timing_stress' : [
- 'TIMING_STRESS_CHECKPOINT_SLOW',
- ],
'log_scan' : [
'LOGSCAN_FIRST',
'LOGSCAN_FROM_CKP',
@@ -48,6 +45,11 @@ flags = {
'EVICT_UPDATE_RESTORE',
'VISIBILITY_ERR',
],
+ 'timing_stress_for_test' : [
+ 'TIMING_STRESS_CHECKPOINT_SLOW',
+ 'TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE',
+ 'TIMING_STRESS_PAGE_SPLIT_RACE',
+ ],
'txn_log_checkpoint' : [
'TXN_LOG_CKPT_CLEANUP',
'TXN_LOG_CKPT_PREPARE',
diff --git a/src/third_party/wiredtiger/dist/log.py b/src/third_party/wiredtiger/dist/log.py
index 81dc8bd35d7..2da8e5eae66 100644
--- a/src/third_party/wiredtiger/dist/log.py
+++ b/src/third_party/wiredtiger/dist/log.py
@@ -10,10 +10,10 @@ tmp_file = '__tmp'
# Map log record types to:
# (C type, pack type, printf format, printf arg(s), list of setup functions)
field_types = {
- 'WT_LSN' : ('WT_LSN *', 'II', '%" PRIu32 "%" PRIu32 "',
- 'arg->l.file, arg->l.offset', [ '' ]),
- 'string' : ('const char *', 'S', '%s', 'arg', [ '' ]),
- 'item' : ('WT_ITEM *', 'u', '%s', 'escaped',
+ 'WT_LSN' : ('WT_LSN *', 'II', '[%" PRIu32 ", %" PRIu32 "]',
+ 'arg.l.file, arg.l.offset', [ '' ]),
+ 'string' : ('const char *', 'S', '\\"%s\\"', 'arg', [ '' ]),
+ 'item' : ('WT_ITEM *', 'u', '\\"%s\\"', 'escaped',
[ 'WT_ERR(__logrec_make_json_str(session, &escaped, &arg));',
'WT_ERR(__logrec_make_hex_str(session, &escaped, &arg));']),
'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg', [ '' ]),
@@ -27,7 +27,7 @@ def cintype(f):
def couttype(f):
type = cintype(f)
# We already have a pointer to a WT_ITEM
- if f[0] == 'item':
+ if f[0] == 'item' or f[0] == 'WT_LSN':
return type
if type[-1] != '*':
type += ' '
@@ -35,8 +35,8 @@ def couttype(f):
def clocaltype(f):
type = cintype(f)
- # Allocate a WT_ITEM struct on the stack
- if f[0] == 'item':
+ # Allocate WT_ITEM and WT_LSN structs on the stack
+ if f[0] in ('item', 'WT_LSN'):
return type[:-2]
return type
@@ -62,10 +62,20 @@ def rec_pack_fmt(r):
def printf_fmt(f):
return field_types[f[0]][2]
+def pack_arg(f):
+ if f[0] == 'WT_LSN':
+ return '%s->l.file, %s->l.offset' % (f[1], f[1])
+ return f[1]
+
def printf_arg(f):
arg = field_types[f[0]][3].replace('arg', f[1])
return ' ' + arg
+def unpack_arg(f):
+ if f[0] == 'WT_LSN':
+ return '&%sp->l.file, &%sp->l.offset' % (f[1], f[1])
+ return f[1] + 'p'
+
def printf_setup(f, i, nl_indent):
stmt = field_types[f[0]][4][i].replace('arg', f[1])
return '' if stmt == '' else stmt + nl_indent
@@ -94,7 +104,7 @@ def printf_line(f, optype, i, ishex):
body = '%s%s(__wt_fprintf(session, WT_STDOUT(session),' % (
printf_setup(f, ishex, nl_indent),
'WT_ERR' if has_escape(optype.fields) else 'WT_RET') + \
- '%s "%s \\"%s\\": \\"%s\\"%s",%s));' % (
+ '%s "%s \\"%s\\": %s%s",%s));' % (
nl_indent, precomma, name, printf_fmt(f), postcomma,
printf_arg(f))
return ifbegin + body + ifend
@@ -201,13 +211,10 @@ __logrec_make_hex_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
# Emit code to read, write and print log operations (within a log record)
for optype in log_data.optypes:
- if not optype.fields:
- continue
-
tfile.write('''
int
__wt_logop_%(name)s_pack(
- WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ WT_SESSION_IMPL *session, WT_ITEM *logrec%(comma)s
%(arg_decls)s)
{
\tconst char *fmt = WT_UNCHECKED_STRING(%(fmt)s);
@@ -216,14 +223,14 @@ __wt_logop_%(name)s_pack(
\toptype = %(macro)s;
\tWT_RET(__wt_struct_size(session, &size, fmt,
-\t optype, 0%(arg_names)s));
+\t optype, 0%(pack_args)s));
\t__wt_struct_size_adjust(session, &size);
\tWT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
\trecsize = (uint32_t)size;
\tWT_RET(__wt_struct_pack(session,
\t (uint8_t *)logrec->data + logrec->size, size, fmt,
-\t optype, recsize%(arg_names)s));
+\t optype, recsize%(pack_args)s));
\tlogrec->size += (uint32_t)size;
\treturn (0);
@@ -231,17 +238,18 @@ __wt_logop_%(name)s_pack(
''' % {
'name' : optype.name,
'macro' : optype.macro_name(),
+ 'comma' : ',' if optype.fields else '',
'arg_decls' : ', '.join(
'%s%s%s' % (cintype(f), '' if cintype(f)[-1] == '*' else ' ', f[1])
for f in optype.fields),
- 'arg_names' : ''.join(', %s' % f[1] for f in optype.fields),
- 'fmt' : op_pack_fmt(optype)
+ 'pack_args' : ''.join(', %s' % pack_arg(f) for f in optype.fields),
+ 'fmt' : op_pack_fmt(optype),
})
tfile.write('''
int
__wt_logop_%(name)s_unpack(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end%(comma)s
%(arg_decls)s)
{
\tWT_DECL_RET;
@@ -249,7 +257,7 @@ __wt_logop_%(name)s_unpack(
\tuint32_t optype, size;
\tif ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
-\t &optype, &size%(arg_names)s)) != 0)
+\t &optype, &size%(unpack_args)s)) != 0)
\t\tWT_RET_MSG(session, ret, "logop_%(name)s: unpack failure");
\tWT_ASSERT(session, optype == %(macro)s);
@@ -259,42 +267,43 @@ __wt_logop_%(name)s_unpack(
''' % {
'name' : optype.name,
'macro' : optype.macro_name(),
+ 'comma' : ',' if optype.fields else '',
'arg_decls' : ', '.join(
'%s%sp' % (couttype(f), f[1]) for f in optype.fields),
- 'arg_names' : ''.join(', %sp' % f[1] for f in optype.fields),
+ 'unpack_args' : ''.join(', %s' % unpack_arg(f) for f in optype.fields),
'fmt' : op_pack_fmt(optype)
})
- last_field = optype.fields[-1]
tfile.write('''
int
__wt_logop_%(name)s_print(WT_SESSION_IMPL *session,
const uint8_t **pp, const uint8_t *end, uint32_t flags)
-{
-%(arg_ret)s\t%(arg_decls)s
+{%(arg_ret)s%(arg_decls)s
\t%(arg_unused)s%(arg_init)sWT_RET(__wt_logop_%(name)s_unpack(
\t session, pp, end%(arg_addrs)s));
\tWT_RET(__wt_fprintf(session, WT_STDOUT(session),
\t " \\"optype\\": \\"%(name)s\\",\\n"));
-\t%(print_args)s
+%(print_args)s
%(arg_fini)s
}
''' % {
'name' : optype.name,
- 'arg_ret' : ('\tWT_DECL_RET;\n' if has_escape(optype.fields) else ''),
- 'arg_decls' : ('\n\t'.join('%s%s%s;' %
+ 'arg_ret' : ('\n\tWT_DECL_RET;' if has_escape(optype.fields) else ''),
+ 'arg_decls' : (('\n\t' + '\n\t'.join('%s%s%s;' %
(clocaltype(f), '' if clocaltype(f)[-1] == '*' else ' ', f[1])
- for f in optype.fields)) + escape_decl(optype.fields),
+ for f in optype.fields)) + escape_decl(optype.fields)
+ if optype.fields else ''),
'arg_unused' : ('' if has_escape(optype.fields)
else 'WT_UNUSED(flags);\n\t'),
'arg_init' : ('escaped = NULL;\n\t' if has_escape(optype.fields) else ''),
'arg_fini' : ('\nerr:\t__wt_free(session, escaped);\n\treturn (ret);'
if has_escape(optype.fields) else '\treturn (0);'),
'arg_addrs' : ''.join(', &%s' % f[1] for f in optype.fields),
- 'print_args' : '\n\t'.join(printf_line(f, optype, i, s)
+ 'print_args' : ('\t' + '\n\t'.join(printf_line(f, optype, i, s)
for i,f in enumerate(optype.fields) for s in range(0, n_setup(f)))
+ if optype.fields else ''),
})
# Emit the printlog entry point
@@ -312,9 +321,6 @@ __wt_txn_op_printlog(WT_SESSION_IMPL *session,
\tswitch (optype) {''')
for optype in log_data.optypes:
- if not optype.fields:
- continue
-
tfile.write('''
\tcase %(macro)s:
\t\tWT_RET(%(print_func)s(session, pp, end, flags));
diff --git a/src/third_party/wiredtiger/dist/log_data.py b/src/third_party/wiredtiger/dist/log_data.py
index 1782d2aadf0..9e1538ccf04 100644
--- a/src/third_party/wiredtiger/dist/log_data.py
+++ b/src/third_party/wiredtiger/dist/log_data.py
@@ -41,9 +41,8 @@ rectypes = [
# Debugging message in the log
LogRecordType('message', 'message', [('string', 'message')]),
- # System record
- LogRecordType('system', 'system', [
- ('WT_LSN','prev_lsn'), ('item', 'unused')]),
+ # System (internal) log record
+ LogRecordType('system', 'system', []),
]
class LogOperationType:
@@ -61,12 +60,17 @@ class LogOperationType:
# never change after they're written in a log file.
#
optypes = [
+# commit operations
+ LogOperationType('col_modify', 'column modify',
+ [('uint32', 'fileid'), ('recno', 'recno'), ('item', 'value')]),
LogOperationType('col_put', 'column put',
[('uint32', 'fileid'), ('recno', 'recno'), ('item', 'value')]),
LogOperationType('col_remove', 'column remove',
[('uint32', 'fileid'), ('recno', 'recno')]),
LogOperationType('col_truncate', 'column truncate',
[('uint32', 'fileid'), ('recno', 'start'), ('recno', 'stop')]),
+ LogOperationType('row_modify', 'row modify',
+ [('uint32', 'fileid'), ('item', 'key'), ('item', 'value')]),
LogOperationType('row_put', 'row put',
[('uint32', 'fileid'), ('item', 'key'), ('item', 'value')]),
LogOperationType('row_remove', 'row remove',
@@ -74,4 +78,9 @@ optypes = [
LogOperationType('row_truncate', 'row truncate',
[('uint32', 'fileid'), ('item', 'start'), ('item', 'stop'),
('uint32', 'mode')]),
+
+# system operations
+ LogOperationType('checkpoint_start', 'checkpoint start', []),
+ LogOperationType('prev_lsn', 'previous LSN',
+ [('WT_LSN', 'prev_lsn')]),
]
diff --git a/src/third_party/wiredtiger/dist/s_docs_plantuml b/src/third_party/wiredtiger/dist/s_docs_plantuml
new file mode 100755
index 00000000000..c646739d7ba
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_docs_plantuml
@@ -0,0 +1,58 @@
+#! /bin/sh
+# Usage: ./s_docs_plantuml [-d|Auto download plantuml if doesn't exist in dist/]
+# This script checks for the existence of plantuml jar file (optionally
+# downloading it from sourceforge) and then generates uml images from the
+# plantuml templates embedded in the documentation.
+# Run this script after adding new plantuml templates to generate a fresh set of
+# uml images. All new images generated should be commited with the code.
+# For easy prototyping, following link can be used to generate these images
+# online:
+# http://www.plantuml.com/plantuml
+
+PLANTUML_URL="https://downloads.sourceforge.net/project/plantuml/plantuml.jar?r=&ts=1499750156&use_mirror=nchc"
+
+# We require java which may not be installed.
+type java > /dev/null 2>&1 || {
+ echo 'skipped: java not found'
+ exit 0
+}
+
+download_plantuml=0
+while :
+ do case "$1" in
+ -d) # Download plantuml if not already there
+ download_plantuml=1
+ shift;;
+ *)
+ break;;
+ esac
+done
+
+# plantuml is needed, check if already downloaded, else download if suggested
+# by an argument
+test -f "../dist/plantuml.jar" || {
+ echo 'dist/plantuml.jar not found. '
+ if [ $download_plantuml -eq 1 ]
+ then
+ echo 'Downloading plantuml:'
+ wget $PLANTUML_URL -O ../dist/plantuml.jar
+ else
+ echo 'plantuml can be downloaded from:'
+ echo 'https://sourceforge.net/projects/plantuml/files/plantuml.jar/download'
+ echo 'To download automatically pass -d argument to the script'
+ exit 1
+ fi
+}
+
+# Check plantuml works as expected
+java -jar ../dist/plantuml.jar -testdot > /dev/null || {
+ echo 'error: plantuml installation check failed'
+ exit 1
+}
+
+# Generate PlantUML docs. This command looks for plantuml template code embedded
+# in files at /src/docs/ with doc or dox extension.
+echo 'Generating plantuml images .. '
+mkdir -p ../docs/images/plantuml_gen_img
+java -Djava.awt.headless=true -jar ../dist/plantuml.jar -o ../docs/images/plantuml_gen_img "../src/docs/**.(doc|dox)" &&
+ echo 'Done'
diff --git a/src/third_party/wiredtiger/dist/s_longlines b/src/third_party/wiredtiger/dist/s_longlines
index bdb9811b4bd..736ea36e6d4 100755
--- a/src/third_party/wiredtiger/dist/s_longlines
+++ b/src/third_party/wiredtiger/dist/s_longlines
@@ -14,6 +14,7 @@ l=`(cd .. &&
-e '/include\/extern\.h/d' \
-e '/include\/extern_posix\.h/d' \
-e '/include\/extern_win\.h/d' \
+ -e '/log\/log_auto\.c/d' \
-e '/support\/stat\.c/d'`
for f in $l ; do
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 723f6d18858..c79dc5129a5 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -350,11 +350,14 @@ Syscall
TAILQ
TCMalloc
TESTUTIL
+TIMESTAMP
+TIMESTAMPS
TODO
TORTIOUS
TSO
TXN
TXNC
+Teardown
ThreadList
ThreadListWrapper
Timespec
@@ -992,6 +995,7 @@ oo
opcode
opendir
openfile
+oplist
oplog
optimizations
optype
@@ -1085,6 +1089,7 @@ refp
regionp
reinitialization
relocked
+repl
resize
resizing
ret
@@ -1320,3 +1325,4 @@ zseries
zstd
zstd's
zu
+zyxwvutsrqponmlkjihgfedcba
diff --git a/src/third_party/wiredtiger/dist/s_style b/src/third_party/wiredtiger/dist/s_style
index 69cf1f667fa..d5dc31ba37a 100755
--- a/src/third_party/wiredtiger/dist/s_style
+++ b/src/third_party/wiredtiger/dist/s_style
@@ -56,6 +56,11 @@ else
cat $t
fi
+ if grep 'sizeof(WT_UPDATE)' $f > $t; then
+ echo "$f: Use WT_UPDATE_SIZE rather than sizeof(WT_UPDATE)"
+ cat $t
+ fi
+
if ! expr "$f" : 'src/include/queue\.h' > /dev/null &&
egrep 'STAILQ_|SLIST_|\bLIST_' $f ; then
echo "$f: use TAILQ for all lists"
diff --git a/src/third_party/wiredtiger/dist/s_whitespace b/src/third_party/wiredtiger/dist/s_whitespace
index 874074dfb50..f4c96cdc91a 100755
--- a/src/third_party/wiredtiger/dist/s_whitespace
+++ b/src/third_party/wiredtiger/dist/s_whitespace
@@ -28,6 +28,7 @@ find bench dist examples ext src test \
-e '/checksum\/power8/d' \
-e '/3rdparty/d' \
-e '/docs\/tools/d' \
+ -e '/log\/log_auto/d' \
| while read f ; do
whitespace $f
done
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 2e344e0fbef..783fb452599 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -459,11 +459,19 @@ connection_stats = [
##########################################
YieldStat('application_cache_time', 'application thread time waiting for cache (usecs)'),
YieldStat('application_evict_time', 'application thread time evicting (usecs)'),
+ YieldStat('child_modify_blocked_page', 'page reconciliation yielded due to child modification'),
+ YieldStat('conn_close_blocked_lsm', 'connection close yielded for lsm manager shutdown'),
+ YieldStat('dhandle_lock_blocked', 'data handle lock yielded'),
+ YieldStat('log_server_sync_blocked', 'log server sync yielded for log write'),
YieldStat('page_busy_blocked', 'page acquire busy blocked'),
+ YieldStat('page_del_rollback_blocked', 'page delete rollback time sleeping for state change (usecs)'),
YieldStat('page_forcible_evict_blocked', 'page acquire eviction blocked'),
+ YieldStat('page_index_slot_ref_blocked', 'get reference for page index and slot time sleeping (usecs)'),
YieldStat('page_locked_blocked', 'page acquire locked blocked'),
YieldStat('page_read_blocked', 'page acquire read blocked'),
YieldStat('page_sleep', 'page acquire time sleeping (usecs)'),
+ YieldStat('tree_descend_blocked', 'tree descend one level yielded for split page index update'),
+ YieldStat('txn_release_blocked', 'connection close blocked waiting for transaction state stabilization'),
]
connection_stats = sorted(connection_stats, key=attrgetter('desc'))
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index ee82256992f..d99032d705c 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "6173a98979715ed727c432c1a31da64ea8a37048",
+ "commit": "2e9744d11a65c63ba7445060dc78371250f04051",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-3.6"
diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger.i b/src/third_party/wiredtiger/lang/python/wiredtiger.i
index 8c737341979..e976af6a284 100644
--- a/src/third_party/wiredtiger/lang/python/wiredtiger.i
+++ b/src/third_party/wiredtiger/lang/python/wiredtiger.i
@@ -489,6 +489,7 @@ NOTFOUND_OK(__wt_cursor::prev)
NOTFOUND_OK(__wt_cursor::remove)
NOTFOUND_OK(__wt_cursor::search)
NOTFOUND_OK(__wt_cursor::update)
+NOTFOUND_OK(__wt_cursor::_modify)
ANY_OK(__wt_modify::__wt_modify)
ANY_OK(__wt_modify::~__wt_modify)
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index cb59bff8f75..eb8a258d475 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -15,12 +15,10 @@
static inline int
__cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
- WT_ITEM *val;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- val = &cbt->iface.value;
if (newpage) {
if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
@@ -59,10 +57,10 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
if (cbt->recno < WT_INSERT_RECNO(cbt->ins) ||
(upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
cbt->v = 0;
- val->data = &cbt->v;
+ cbt->iface.value.data = &cbt->v;
} else
- val->data = WT_UPDATE_DATA(upd);
- val->size = 1;
+ cbt->iface.value.data = upd->data;
+ cbt->iface.value.size = 1;
return (0);
}
@@ -74,7 +72,6 @@ static inline int
__cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
WT_BTREE *btree;
- WT_ITEM *val;
WT_PAGE *page;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
@@ -82,7 +79,6 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
btree = S2BT(session);
page = cbt->ref->page;
- val = &cbt->iface.value;
/* Initialize for each new page. */
if (newpage) {
@@ -108,10 +104,10 @@ new_page:
upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
if (upd == NULL) {
cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
- val->data = &cbt->v;
+ cbt->iface.value.data = &cbt->v;
} else
- val->data = WT_UPDATE_DATA(upd);
- val->size = 1;
+ cbt->iface.value.data = upd->data;
+ cbt->iface.value.size = 1;
return (0);
}
@@ -122,12 +118,10 @@ new_page:
static inline int
__cursor_var_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
- WT_ITEM *val;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- val = &cbt->iface.value;
if (newpage) {
cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
@@ -147,9 +141,7 @@ new_page: if (cbt->ins == NULL)
++cbt->page_deleted_count;
continue;
}
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/* NOTREACHED */
}
@@ -164,7 +156,6 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
WT_CELL *cell;
WT_CELL_UNPACK unpack;
WT_COL *cip;
- WT_ITEM *val;
WT_INSERT *ins;
WT_PAGE *page;
WT_SESSION_IMPL *session;
@@ -173,7 +164,6 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
- val = &cbt->iface.value;
rle_start = 0; /* -Werror=maybe-uninitialized */
@@ -210,10 +200,7 @@ new_page: /* Find the matching WT_COL slot. */
++cbt->page_deleted_count;
continue;
}
-
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/*
@@ -267,8 +254,8 @@ new_page: /* Find the matching WT_COL slot. */
cbt->cip_saved = cip;
}
- val->data = cbt->tmp->data;
- val->size = cbt->tmp->size;
+ cbt->iface.value.data = cbt->tmp->data;
+ cbt->iface.value.size = cbt->tmp->size;
return (0);
}
/* NOTREACHED */
@@ -282,7 +269,7 @@ static inline int
__cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
WT_INSERT *ins;
- WT_ITEM *key, *val;
+ WT_ITEM *key;
WT_PAGE *page;
WT_ROW *rip;
WT_SESSION_IMPL *session;
@@ -291,7 +278,6 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
key = &cbt->iface.key;
- val = &cbt->iface.value;
/*
* For row-store pages, we need a single item that tells us the part
@@ -332,9 +318,7 @@ new_insert: if ((ins = cbt->ins) != NULL) {
}
key->data = WT_INSERT_KEY(ins);
key->size = WT_INSERT_KEY_SIZE(ins);
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/* Check for the end of the page. */
@@ -363,7 +347,6 @@ new_insert: if ((ins = cbt->ins) != NULL) {
++cbt->page_deleted_count;
continue;
}
-
return (__cursor_row_slot_return(cbt, rip, upd));
}
/* NOTREACHED */
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 6e49f4df68c..c1395ea9008 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -127,12 +127,10 @@ restart:
static inline int
__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
- WT_ITEM *val;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- val = &cbt->iface.value;
if (newpage) {
if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
@@ -205,10 +203,10 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
cbt->recno > WT_INSERT_RECNO(cbt->ins) ||
(upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
cbt->v = 0;
- val->data = &cbt->v;
+ cbt->iface.value.data = &cbt->v;
} else
- val->data = WT_UPDATE_DATA(upd);
- val->size = 1;
+ cbt->iface.value.data = upd->data;
+ cbt->iface.value.size = 1;
return (0);
}
@@ -220,7 +218,6 @@ static inline int
__cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
WT_BTREE *btree;
- WT_ITEM *val;
WT_PAGE *page;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
@@ -228,7 +225,6 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
btree = S2BT(session);
- val = &cbt->iface.value;
/* Initialize for each new page. */
if (newpage) {
@@ -254,10 +250,10 @@ new_page:
upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
if (upd == NULL) {
cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
- val->data = &cbt->v;
+ cbt->iface.value.data = &cbt->v;
} else
- val->data = WT_UPDATE_DATA(upd);
- val->size = 1;
+ cbt->iface.value.data = upd->data;
+ cbt->iface.value.size = 1;
return (0);
}
@@ -268,12 +264,10 @@ new_page:
static inline int
__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
- WT_ITEM *val;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- val = &cbt->iface.value;
if (newpage) {
cbt->ins = WT_SKIP_LAST(cbt->ins_head);
@@ -293,9 +287,7 @@ new_page: if (cbt->ins == NULL)
++cbt->page_deleted_count;
continue;
}
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/* NOTREACHED */
}
@@ -311,7 +303,6 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
WT_CELL_UNPACK unpack;
WT_COL *cip;
WT_INSERT *ins;
- WT_ITEM *val;
WT_PAGE *page;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
@@ -319,7 +310,6 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
- val = &cbt->iface.value;
rle_start = 0; /* -Werror=maybe-uninitialized */
@@ -357,10 +347,7 @@ new_page: if (cbt->recno < cbt->ref->ref_recno)
++cbt->page_deleted_count;
continue;
}
-
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/*
@@ -413,8 +400,8 @@ new_page: if (cbt->recno < cbt->ref->ref_recno)
cbt->cip_saved = cip;
}
- val->data = cbt->tmp->data;
- val->size = cbt->tmp->size;
+ cbt->iface.value.data = cbt->tmp->data;
+ cbt->iface.value.size = cbt->tmp->size;
return (0);
}
/* NOTREACHED */
@@ -428,7 +415,7 @@ static inline int
__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
WT_INSERT *ins;
- WT_ITEM *key, *val;
+ WT_ITEM *key;
WT_PAGE *page;
WT_ROW *rip;
WT_SESSION_IMPL *session;
@@ -437,7 +424,6 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
key = &cbt->iface.key;
- val = &cbt->iface.value;
/*
* For row-store pages, we need a single item that tells us the part
@@ -489,9 +475,7 @@ new_insert: if ((ins = cbt->ins) != NULL) {
}
key->data = WT_INSERT_KEY(ins);
key->size = WT_INSERT_KEY_SIZE(ins);
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/* Check for the beginning of the page. */
@@ -522,7 +506,6 @@ new_insert: if ((ins = cbt->ins) != NULL) {
++cbt->page_deleted_count;
continue;
}
-
return (__cursor_row_slot_return(cbt, rip, upd));
}
/* NOTREACHED */
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 52435eeefed..d58dc78fbed 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -308,8 +308,22 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
}
/*
+ * __cursor_kv_return --
+ * Return a page referenced key/value pair to the application.
+ */
+static inline int
+__cursor_kv_return(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ WT_RET(__wt_key_return(session, cbt));
+ WT_RET(__wt_value_return(session, cbt, upd));
+
+ return (0);
+}
+
+/*
* __cursor_col_search --
- * Column-store search from an application cursor.
+ * Column-store search from a cursor.
*/
static inline int
__cursor_col_search(
@@ -324,7 +338,7 @@ __cursor_col_search(
/*
* __cursor_row_search --
- * Row-store search from an application cursor.
+ * Row-store search from a cursor.
*/
static inline int
__cursor_row_search(
@@ -338,8 +352,32 @@ __cursor_row_search(
}
/*
+ * __cursor_col_modify_v --
+ * Column-store modify from a cursor, with a separate value.
+ */
+static inline int
+__cursor_col_modify_v(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
+{
+ return (__wt_col_modify(session, cbt,
+ cbt->iface.recno, value, NULL, modify_type, false));
+}
+
+/*
+ * __cursor_row_modify_v --
+ * Row-store modify from a cursor, with a separate value.
+ */
+static inline int
+__cursor_row_modify_v(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
+{
+ return (__wt_row_modify(session, cbt,
+ &cbt->iface.key, value, NULL, modify_type, false));
+}
+
+/*
* __cursor_col_modify --
- * Column-store delete, insert, and update from an application cursor.
+ * Column-store modify from a cursor.
*/
static inline int
__cursor_col_modify(
@@ -351,7 +389,7 @@ __cursor_col_modify(
/*
* __cursor_row_modify --
- * Row-store insert, update and delete from an application cursor.
+ * Row-store modify from a cursor.
*/
static inline int
__cursor_row_modify(
@@ -442,7 +480,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
}
if (valid)
- ret = __wt_kv_return(session, cbt, upd);
+ ret = __cursor_kv_return(session, cbt, upd);
else if (__cursor_fix_implicit(btree, cbt)) {
/*
* Creating a record past the end of the tree in a fixed-length
@@ -564,7 +602,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
*/
if (valid) {
exact = cbt->compare;
- ret = __wt_kv_return(session, cbt, upd);
+ ret = __cursor_kv_return(session, cbt, upd);
} else if (__cursor_fix_implicit(btree, cbt)) {
cbt->recno = cursor->recno;
cbt->v = 0;
@@ -582,7 +620,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
__cursor_col_search(session, cbt, NULL));
if (__wt_cursor_valid(cbt, &upd)) {
exact = cbt->compare;
- ret = __wt_kv_return(session, cbt, upd);
+ ret = __cursor_kv_return(session, cbt, upd);
} else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND)
exact = -1;
}
@@ -987,7 +1025,7 @@ done: /*
* Update a record in the tree.
*/
static int
-__btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type)
+__btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
{
WT_BTREE *btree;
WT_CURFILE_STATE state;
@@ -1015,6 +1053,7 @@ __btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type)
*/
if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
WT_ERR(__wt_txn_autocommit_check(session));
+
/*
* The cursor position may not be exact (the cursor's comparison
* value not equal to zero). Correct to an exact match so we can
@@ -1022,8 +1061,8 @@ __btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type)
*/
cbt->compare = 0;
ret = btree->type == BTREE_ROW ?
- __cursor_row_modify(session, cbt, modify_type) :
- __cursor_col_modify(session, cbt, modify_type);
+ __cursor_row_modify_v(session, cbt, value, modify_type) :
+ __cursor_col_modify_v(session, cbt, value, modify_type);
if (ret == 0)
goto done;
@@ -1052,6 +1091,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
if (btree->type == BTREE_ROW) {
WT_ERR(__cursor_row_search(session, cbt, NULL, true));
+
/*
* If not overwriting, check for conflicts and fail if the key
* does not exist.
@@ -1061,7 +1101,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
WT_ERR(WT_NOTFOUND);
}
- ret = __cursor_row_modify(session, cbt, modify_type);
+ ret = __cursor_row_modify_v(session, cbt, value, modify_type);
} else {
WT_ERR(__cursor_col_search(session, cbt, NULL));
@@ -1080,7 +1120,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
!__cursor_fix_implicit(btree, cbt))
WT_ERR(WT_NOTFOUND);
}
- ret = __cursor_col_modify(session, cbt, modify_type);
+ ret = __cursor_col_modify_v(session, cbt, value, modify_type);
}
err: if (ret == WT_RESTART) {
@@ -1097,14 +1137,33 @@ err: if (ret == WT_RESTART) {
* To make this work, we add a field to the btree cursor to pass back a
* pointer to the modify function's allocated update structure.
*/
-done: if (ret == 0) {
- if (modify_type == WT_UPDATE_RESERVED) {
+done: if (ret == 0)
+ switch (modify_type) {
+ case WT_UPDATE_STANDARD:
+ /*
+ * WT_CURSOR.update returns a key and a value.
+ */
+ WT_TRET(__cursor_kv_return(
+ session, cbt, cbt->modify_update));
+ break;
+ case WT_UPDATE_RESERVED:
+ /*
+ * WT_CURSOR.reserve doesn't return any value.
+ */
F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ /* FALLTHROUGH */
+ case WT_UPDATE_MODIFIED:
+ /*
+ * WT_CURSOR.modify has already created the return value
+ * and our job is to leave it untouched.
+ */
WT_TRET(__wt_key_return(session, cbt));
- } else
- WT_TRET(
- __wt_kv_return(session, cbt, cbt->modify_update));
- }
+ break;
+ case WT_UPDATE_DELETED:
+ default:
+ WT_TRET(__wt_illegal_value(session, NULL));
+ break;
+ }
if (ret != 0) {
WT_TRET(__cursor_reset(cbt));
@@ -1115,6 +1174,121 @@ done: if (ret == 0) {
}
/*
+ * __cursor_chain_exceeded --
+ * Return if the update chain has exceeded the limit. Deleted or standard
+ * updates are anticipated to be sufficient to base the modify (although that's
+ * not guaranteed, they may not be visible or might abort before we read them).
+ * Also, this is not a hard limit, threads can race modifying updates.
+ */
+static bool
+__cursor_chain_exceeded(WT_CURSOR_BTREE *cbt)
+{
+ WT_PAGE *page;
+ WT_UPDATE *upd;
+ int i;
+
+ page = cbt->ref->page;
+
+ upd = NULL;
+ if (cbt->ins != NULL)
+ upd = cbt->ins->upd;
+ else if (cbt->btree->type == BTREE_ROW &&
+ page->modify != NULL && page->modify->mod_row_update != NULL)
+ upd = page->modify->mod_row_update[cbt->slot];
+
+ for (i = 0; upd != NULL; ++i, upd = upd->next) {
+ if (upd->type == WT_UPDATE_DELETED ||
+ upd->type == WT_UPDATE_STANDARD)
+ return (false);
+ if (i >= WT_MAX_MODIFY_UPDATE)
+ return (true);
+ }
+ return (false);
+}
+
+/*
+ * __wt_btcur_modify --
+ * Modify a record in the tree.
+ */
+int
+__wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
+{
+ WT_CURFILE_STATE state;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(modify);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ size_t orig, new;
+ bool chain_exceeded, overwrite;
+
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_CONN_INCR(session, cursor_modify);
+ WT_STAT_DATA_INCR(session, cursor_modify);
+
+ /* Save the cursor state. */
+ __cursor_state_save(cursor, &state);
+
+ /*
+ * Get the current value and apply the modification to it, for a few
+ * reasons: first, we set the updated value so the application can
+ * retrieve the cursor's value; second, we use the updated value as
+ * the update if the update chain is too long; third, there's a check
+ * if the updated value is too large to store; fourth, to simplify the
+ * count of bytes being added/removed; fifth, we can get into serious
+ * trouble if we attempt to modify a value that doesn't exist. For the
+ * fifth reason, verify we're not in a read-uncommitted transaction,
+ * that implies a value that might disappear out from under us.
+ */
+ if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
+ WT_ERR_MSG(session, ENOTSUP,
+ "not supported in read-uncommitted transactions");
+
+ WT_ERR(__wt_btcur_search(cbt));
+ orig = cursor->value.size;
+ WT_ERR(__wt_modify_apply_api(
+ session, &cursor->value, entries, nentries));
+ new = cursor->value.size;
+ WT_ERR(__cursor_size_chk(session, &cursor->value));
+ if (new > orig)
+ WT_STAT_DATA_INCRV(session, cursor_update_bytes, new - orig);
+ else
+ WT_STAT_DATA_DECRV(session, cursor_update_bytes, orig - new);
+
+ /*
+ * WT_CURSOR.modify is update-without-overwrite.
+ *
+ * Use the modify buffer as the update if under the limit, else use the
+ * complete value.
+ */
+ overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
+ F_CLR(cursor, WT_CURSTD_OVERWRITE);
+ chain_exceeded = __cursor_chain_exceeded(cbt);
+ if (chain_exceeded)
+ ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD);
+ else if ((ret =
+ __wt_modify_pack(session, &modify, entries, nentries)) == 0)
+ ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFIED);
+ if (overwrite)
+ F_SET(cursor, WT_CURSTD_OVERWRITE);
+
+ /*
+ * We have our own cursor state restoration because we've modified the
+ * cursor before calling the underlying cursor update function and we
+ * need to restore it to its original state. This means multiple calls
+ * to reset the cursor, but that shouldn't be a problem.
+ */
+ if (ret != 0) {
+err: WT_TRET(__cursor_reset(cbt));
+ __cursor_state_restore(cursor, &state);
+ }
+
+ __wt_scr_free(session, &modify);
+ return (ret);
+}
+
+/*
* __wt_btcur_reserve --
* Reserve a record in the tree.
*/
@@ -1135,7 +1309,7 @@ __wt_btcur_reserve(WT_CURSOR_BTREE *cbt)
/* WT_CURSOR.reserve is update-without-overwrite and a special value. */
overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
F_CLR(cursor, WT_CURSTD_OVERWRITE);
- ret = __btcur_update(cbt, WT_UPDATE_RESERVED);
+ ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_RESERVED);
if (overwrite)
F_SET(cursor, WT_CURSTD_OVERWRITE);
return (ret);
@@ -1164,7 +1338,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt)
WT_RET(__cursor_size_chk(session, &cursor->key));
WT_RET(__cursor_size_chk(session, &cursor->value));
- return (__btcur_update(cbt, WT_UPDATE_STANDARD));
+ return (__btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD));
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index c0aaf3f42d9..b8d11be7b3e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -986,6 +986,35 @@ __debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
}
/*
+ * __debug_modified --
+ * Dump a modified update.
+ */
+static int
+__debug_modified(WT_DBG *ds, WT_UPDATE *upd)
+{
+ const size_t *p;
+ int nentries;
+ const uint8_t *data;
+ void *modify;
+
+ modify = upd->data;
+
+ p = modify;
+ nentries = (int)*p++;
+ data = (uint8_t *)modify +
+ sizeof(size_t) + ((size_t)nentries * 3 * sizeof(size_t));
+
+ WT_RET(ds->f(ds, "%d: ", nentries));
+ for (; nentries-- > 0; data += p[0], p += 3)
+ WT_RET(ds->f(ds,
+ "{%" WT_SIZET_FMT ", %" WT_SIZET_FMT ", %" WT_SIZET_FMT
+ ", %.*s}%s", p[0], p[1], p[2],
+ (int)p[2], data, nentries == 0 ? "" : ", "));
+
+ return (0);
+}
+
+/*
* __debug_update --
* Dump an update list.
*/
@@ -993,37 +1022,46 @@ static int
__debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte)
{
for (; upd != NULL; upd = upd->next) {
- if (upd->type == WT_UPDATE_DELETED)
+ switch (upd->type) {
+ case WT_UPDATE_DELETED:
WT_RET(ds->f(ds, "\tvalue {deleted}\n"));
- else if (upd->type == WT_UPDATE_RESERVED)
- WT_RET(ds->f(ds, "\tvalue {reserved}\n"));
- else if (hexbyte) {
- WT_RET(ds->f(ds, "\t{"));
- WT_RET(__debug_hex_byte(ds,
- *(uint8_t *)WT_UPDATE_DATA(upd)));
+ break;
+ case WT_UPDATE_MODIFIED:
+ WT_RET(ds->f(ds, "\tvalue {modified: "));
+ WT_RET(__debug_modified(ds, upd));
WT_RET(ds->f(ds, "}\n"));
- } else
- WT_RET(__debug_item(ds,
- "value", WT_UPDATE_DATA(upd), upd->size));
- WT_RET(ds->f(ds, "\t" "txn id %" PRIu64, upd->txnid));
+ break;
+ case WT_UPDATE_RESERVED:
+ WT_RET(ds->f(ds, "\tvalue {reserved}\n"));
+ break;
+ case WT_UPDATE_STANDARD:
+ if (hexbyte) {
+ WT_RET(ds->f(ds, "\t{"));
+ WT_RET(__debug_hex_byte(ds, *upd->data));
+ WT_RET(ds->f(ds, "}\n"));
+ } else
+ WT_RET(__debug_item(ds,
+ "value", upd->data, upd->size));
+ break;
+ }
+ if (upd->txnid == WT_TXN_ABORTED)
+ WT_RET(ds->f(ds, "\t" "txn aborted"));
+ else
+ WT_RET(ds->f(ds, "\t" "txn id %" PRIu64, upd->txnid));
#ifdef HAVE_TIMESTAMPS
- if (!__wt_timestamp_iszero(upd->timestamp)) {
+ if (!__wt_timestamp_iszero(
+ WT_TIMESTAMP_NULL(&upd->timestamp))) {
#if WT_TIMESTAMP_SIZE == 8
- {
- uint64_t ts;
- __wt_timestamp_set(
- (uint8_t *)&ts, (uint8_t *)&upd->timestamp[0]);
- ts = __wt_bswap64(ts);
- WT_RET(ds->f(ds, ", stamp %" PRIu64, ts));
- }
+ WT_RET(ds->f(ds,
+ ", stamp %" PRIu64, upd->timestamp.val));
#else
- {
int i;
+
WT_RET(ds->f(ds, ", stamp 0x"));
for (i = 0; i < WT_TIMESTAMP_SIZE; ++i)
- WT_RET(ds->f(ds, "%" PRIx8, upd->timestamp[i]));
- }
+ WT_RET(ds->f(ds,
+ "%" PRIx8, upd->timestamp.ts[i]));
#endif
}
#endif
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index eac8994a5a4..093192dbaa0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -153,6 +153,7 @@ void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_UPDATE **upd;
+ uint64_t sleep_count, yield_count;
/*
* If the page is still "deleted", it's as we left it, reset the state
@@ -160,7 +161,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
* instantiated or being instantiated. Loop because it's possible for
* the page to return to the deleted state if instantiation fails.
*/
- for (;; __wt_yield())
+ for (sleep_count = yield_count = 0;;) {
switch (ref->state) {
case WT_REF_DISK:
case WT_REF_READING:
@@ -205,6 +206,15 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_free(session, ref->page_del);
return;
}
+ /*
+ * We wait for the change in page state, yield before retrying,
+ * and if we've yielded enough times, start sleeping so we don't
+ * burn CPU to no purpose.
+ */
+ __wt_ref_state_yield_sleep(&yield_count, &sleep_count);
+ WT_STAT_CONN_INCRV(session, page_del_rollback_blocked,
+ sleep_count);
+ }
}
/*
@@ -242,10 +252,10 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
return (false);
skip = ref->page_del == NULL || (visible_all ?
- __wt_txn_visible_all(session,
- ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)):
- __wt_txn_visible(session,
- ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)));
+ __wt_txn_visible_all(session, ref->page_del->txnid,
+ WT_TIMESTAMP_NULL(&ref->page_del->timestamp)):
+ __wt_txn_visible(session, ref->page_del->txnid,
+ WT_TIMESTAMP_NULL(&ref->page_del->timestamp)));
/*
* The page_del structure can be freed as soon as the delete is stable:
@@ -254,8 +264,8 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
* no longer need synchronization to check the ref.
*/
if (skip && ref->page_del != NULL && (visible_all ||
- __wt_txn_visible_all(session,
- ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)))) {
+ __wt_txn_visible_all(session, ref->page_del->txnid,
+ WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) {
__wt_free(session, ref->page_del->update_list);
__wt_free(session, ref->page_del);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
index a0b1ff65006..f933245eaef 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -45,13 +45,15 @@ __ovfl_read(WT_SESSION_IMPL *session,
*/
int
__wt_ovfl_read(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+ WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded)
{
WT_DECL_RET;
WT_OVFL_TRACK *track;
WT_UPDATE *upd;
size_t i;
+ *decoded = false;
+
/*
* If no page specified, there's no need to lock and there's no cache
* to search, we don't care about WT_CELL_VALUE_OVFL_RM cells.
@@ -78,8 +80,9 @@ __wt_ovfl_read(WT_SESSION_IMPL *session,
break;
}
WT_ASSERT(session, i < track->remove_next);
- store->data = WT_UPDATE_DATA(upd);
+ store->data = upd->data;
store->size = upd->size;
+ *decoded = true;
} else
ret = __ovfl_read(session, unpack->data, unpack->size, store);
__wt_readunlock(session, &S2BT(session)->ovfl_lock);
@@ -147,7 +150,7 @@ __ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page,
/* Read the overflow value. */
WT_RET(__wt_scr_alloc(session, 1024, &tmp));
- WT_ERR(__ovfl_read(session, unpack->data, unpack->size, tmp));
+ WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, tmp));
/*
* Create an update entry with no transaction ID to ensure global
@@ -159,10 +162,23 @@ __ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page,
* involves atomic operations which will act as our barrier. Regardless,
* we update the page footprint as part of this operation, which acts as
* a barrier as well.
+ *
+ * The update transaction ID choice is tricky, to work around an issue
+ * in variable-length column store. Imagine an overflow value with an
+ * RLE greater than 1. We append a copy to the end of an update chain,
+ * but it's possible it's the overflow value for more than one record,
+ * and appending it to the end of one record's update chain means a
+ * subsequent enter of a globally visible value to one of the records
+ * would allow the truncation of the overflow chain that leaves other
+ * records without a value. If appending such an overflow record, set
+ * the transaction ID to the first possible transaction ID. That ID is
+ * old enough to be globally visible, but we can use it as a flag if an
+ * update record cannot be discarded when truncating an update chain.
*/
WT_ERR(__wt_update_alloc(
session, tmp, &append, &size, WT_UPDATE_STANDARD));
- append->txnid = WT_TXN_NONE;
+ append->txnid = page->type == WT_PAGE_COL_VAR &&
+ __wt_cell_rle(unpack) > 1 ? WT_TXN_FIRST : WT_TXN_NONE;
for (upd = upd_list; upd->next != NULL; upd = upd->next)
;
WT_PUBLISH(upd->next, append);
diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c
index 1bdf0fd1c8b..f28c4e10594 100644
--- a/src/third_party/wiredtiger/src/btree/bt_random.c
+++ b/src/third_party/wiredtiger/src/btree/bt_random.c
@@ -417,9 +417,10 @@ random_page_entry:
* the next entry, if that doesn't work, move to the previous entry.
*/
WT_ERR(__wt_row_random_leaf(session, cbt));
- if (__wt_cursor_valid(cbt, &upd))
- WT_ERR(__wt_kv_return(session, cbt, upd));
- else {
+ if (__wt_cursor_valid(cbt, &upd)) {
+ WT_ERR(__wt_key_return(session, cbt));
+ WT_ERR(__wt_value_return(session, cbt, upd));
+ } else {
if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
ret = __wt_btcur_prev(cbt, false);
WT_ERR(ret);
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 6a89f505c31..91c1499840e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -194,7 +194,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session,
upd->txnid = upd_txnid;
#ifdef HAVE_TIMESTAMPS
WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
- __wt_timestamp_set(upd->timestamp, las_timestamp.data);
+ __wt_timestamp_set(&upd->timestamp, las_timestamp.data);
#endif
switch (page->type) {
@@ -487,7 +487,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *page;
- u_int sleep_cnt, wait_cnt;
+ uint64_t sleep_cnt, wait_cnt;
bool busy, cache_work, evict_soon, stalled;
int force_attempts;
@@ -672,9 +672,8 @@ skip_evict:
if (cache_work)
continue;
}
- sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000);
+ __wt_ref_state_yield_sleep(&wait_cnt, &sleep_cnt);
WT_STAT_CONN_INCRV(session, page_sleep, sleep_cnt);
- __wt_sleep(0, sleep_cnt);
}
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
index 7212de72d6e..4452e6eb0c6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ret.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -75,10 +75,10 @@ __key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
/*
* __value_return --
- * Change the cursor to reference an internal return value.
+ * Change the cursor to reference an internal original-page return value.
*/
static inline int
-__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -93,13 +93,6 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
page = cbt->ref->page;
cursor = &cbt->iface;
- /* If the cursor references a WT_UPDATE item, return it. */
- if (upd != NULL) {
- cursor->value.data = WT_UPDATE_DATA(upd);
- cursor->value.size = upd->size;
- return (0);
- }
-
if (page->type == WT_PAGE_ROW_LEAF) {
rip = &page->pg_row[cbt->slot];
@@ -136,6 +129,99 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
}
/*
+ * __value_return_upd --
+ * Change the cursor to reference an internal update structure return
+ * value.
+ */
+static inline int
+__value_return_upd(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_UPDATE **listp, *list[WT_MAX_MODIFY_UPDATE];
+ u_int i;
+ size_t allocated_bytes;
+
+ cursor = &cbt->iface;
+ allocated_bytes = 0;
+
+ /*
+ * We're passed a "standard" or "modified" update that's visible to us.
+ * Our caller should have already checked for deleted items (we're too
+ * far down the call stack to return not-found).
+ *
+ * Fast path if it's a standard item, assert our caller's behavior.
+ */
+ if (upd->type == WT_UPDATE_STANDARD) {
+ cursor->value.data = upd->data;
+ cursor->value.size = upd->size;
+ return (0);
+ }
+ WT_ASSERT(session, upd->type == WT_UPDATE_MODIFIED);
+
+ /*
+ * Find a complete update that's visible to us, tracking modifications
+ * that are visible to us.
+ */
+ for (i = 0, listp = list; upd != NULL; upd = upd->next) {
+ if (!__wt_txn_upd_visible(session, upd))
+ continue;
+
+ if (WT_UPDATE_DATA_VALUE(upd))
+ break;
+
+ if (upd->type == WT_UPDATE_MODIFIED) {
+ /*
+ * Update lists are expected to be short, but it's not
+ * guaranteed. There's sufficient room on the stack to
+ * avoid memory allocation in normal cases, but we have
+ * to handle the edge cases too.
+ */
+ if (i >= WT_MAX_MODIFY_UPDATE) {
+ if (i == WT_MAX_MODIFY_UPDATE)
+ listp = NULL;
+ WT_ERR(__wt_realloc_def(
+ session, &allocated_bytes, i + 1, &listp));
+ if (i == WT_MAX_MODIFY_UPDATE)
+ memcpy(listp, list, sizeof(list));
+ }
+ listp[i++] = upd;
+ }
+ }
+
+ /*
+ * If we hit the end of the chain, roll forward from the update item we
+ * found, otherwise, from the original page's value.
+ */
+ if (upd == NULL) {
+ /*
+ * Callers of this function set the cursor slot to an impossible
+ * value to check we're not trying to return on-page values when
+ * the update list should have been sufficient (which happens,
+ * for example, if an update list was truncated, deleting some
+ * standard update required by a previous modify update). Assert
+ * the case.
+ */
+ WT_ASSERT(session, cbt->slot != UINT32_MAX);
+
+ WT_ERR(__value_return(session, cbt));
+ } else if (upd->type == WT_UPDATE_DELETED)
+ WT_ERR(__wt_buf_set(session, &cursor->value, "", 0));
+ else
+ WT_ERR(__wt_buf_set(session,
+ &cursor->value, upd->data, upd->size));
+
+ while (i > 0)
+ WT_ERR(__wt_modify_apply(
+ session, &cursor->value, listp[--i]->data));
+
+err: if (allocated_bytes)
+ __wt_free(session, listp);
+ return (ret);
+}
+
+/*
* __wt_key_return --
* Change the cursor to reference an internal return key.
*/
@@ -164,21 +250,22 @@ __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
}
/*
- * __wt_kv_return --
- * Return a page referenced key/value pair to the application.
+ * __wt_value_return --
+ * Change the cursor to reference an internal return value.
*/
int
-__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+__wt_value_return(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
WT_CURSOR *cursor;
cursor = &cbt->iface;
- WT_RET(__wt_key_return(session, cbt));
-
F_CLR(cursor, WT_CURSTD_VALUE_EXT);
- WT_RET(__value_return(session, cbt, upd));
+ if (upd == NULL)
+ WT_RET(__value_return(session, cbt));
+ else
+ WT_RET(__value_return_upd(session, cbt, upd));
F_SET(cursor, WT_CURSTD_VALUE_INT);
-
return (0);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index c1b7b6c4001..2862c7fb6d7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -31,6 +31,24 @@ typedef enum {
} WT_SPLIT_ERROR_PHASE;
/*
+ * __page_split_timing_stress --
+ * Optionally add delay to simulate the race conditions in page split for
+ * debug purposes. The purpose is to uncover the race conditions in page split.
+ */
+static void
+__page_split_timing_stress(WT_SESSION_IMPL *session,
+ uint32_t flag, uint64_t micro_seconds)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /* We only want to sleep when page split race flag is set. */
+ if (FLD_ISSET(conn->timing_stress_flags, flag))
+ __wt_sleep(0, micro_seconds);
+}
+
+/*
* __split_safe_free --
* Free a buffer if we can be sure no thread is accessing it, or schedule
* it to be freed otherwise.
@@ -308,8 +326,8 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
* Prepare a set of WT_REFs for a move.
*/
static void
-__split_ref_prepare(WT_SESSION_IMPL *session,
- WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first)
+__split_ref_prepare(
+ WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
WT_PAGE *child;
WT_REF *child_ref, *ref;
@@ -331,40 +349,12 @@ __split_ref_prepare(WT_SESSION_IMPL *session,
ref = pindex->index[i];
child = ref->page;
- /*
- * Block eviction in newly created pages.
- *
- * Once the split is live, newly created internal pages might be
- * evicted and their WT_REF structures freed. If that happened
- * before all threads exit the index of the page that previously
- * "owned" the WT_REF, a thread might see a freed WT_REF. To
- * ensure that doesn't happen, the newly created page contains
- * the current split generation and can't be evicted until
- * all readers have left the old generation.
- *
- * Historic, we also blocked splits in newly created pages
- * because we didn't update the WT_REF.home field until after
- * the split was live, so the WT_REF.home fields being updated
- * could split again before the update, there's a race between
- * splits as to which would update them first. The current code
- * updates the WT_REF.home fields before going live (in this
- * function), this isn't an issue.
- */
- child->pg_intl_split_gen = split_gen;
-
- /*
- * We use a page flag to prevent the child from splitting from
- * underneath us, but the split-generation error checks don't
- * know about that flag; use the standard macros to ensure that
- * reading the child's page index structure is safe.
- */
+ /* Switch the WT_REF's to their new page. */
j = 0;
- WT_ENTER_PAGE_INDEX(session);
WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
child_ref->home = child;
child_ref->pindex_hint = j++;
} WT_INTL_FOREACH_END;
- WT_LEAVE_PAGE_INDEX(session);
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
@@ -447,6 +437,18 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_calloc_one(session, alloc_refp));
root_incr += children * sizeof(WT_REF);
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page that previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * created pages are set to the current split generation and so can't be
+ * evicted until all readers have left the old generation.
+ *
+ * Our thread has a stable split generation, get a copy.
+ */
+ split_gen = __wt_session_gen(session, WT_GEN_SPLIT);
+
/* Allocate child pages, and connect them into the new page index. */
for (root_refp = pindex->index,
alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
@@ -471,10 +473,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
ref->ref_recno = (*root_refp)->ref_recno;
ref->state = WT_REF_MEM;
- /* Initialize the child page. */
+ /*
+ * Initialize the child page.
+ * Block eviction in newly created pages and mark them dirty.
+ */
child->pg_intl_parent_ref = ref;
-
- /* Mark it dirty. */
+ child->pg_intl_split_gen = split_gen;
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
@@ -504,13 +508,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /*
- * Prepare the WT_REFs for the move: this requires a stable split
- * generation to block splits in newly created pages, so get one.
- */
- WT_ENTER_PAGE_INDEX(session);
- __split_ref_prepare(session, alloc_index,
- __wt_session_gen(session, WT_GEN_SPLIT), false);
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_prepare(session, alloc_index, false);
+
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -520,12 +523,21 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_INTL_INDEX_SET(root, alloc_index);
alloc_index = NULL;
- WT_LEAVE_PAGE_INDEX(session);
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
/*
* Get a generation for this split, mark the root page. This must be
* after the new index is swapped into place in order to know that no
* readers are looking at the old index.
+ *
+ * Note: as the root page cannot currently be evicted, the root split
+ * generation isn't ever used. That said, it future proofs eviction
+ * and isn't expensive enough to special-case.
+ *
+ * Getting a new split generation implies a full barrier, no additional
+ * barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
root->pg_intl_split_gen = split_gen;
@@ -700,6 +712,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
+
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -708,10 +724,17 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_INTL_INDEX_SET(parent, alloc_index);
alloc_index = NULL;
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
+
/*
* Get a generation for this split, mark the page. This must be after
* the new index is swapped into place in order to know that no readers
* are looking at the old index.
+ *
+ * Getting a new split generation implies a full barrier, no additional
+ * barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
parent->pg_intl_split_gen = split_gen;
@@ -760,7 +783,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Swapping in the new page index released the page for eviction, we can
* no longer look inside the page.
*/
-
if (ref->page == NULL)
__wt_verbose(session, WT_VERB_SPLIT,
"%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32
@@ -779,8 +801,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/*
* The new page index is in place, free the WT_REF we were splitting and
* any deleted WT_REFs we found, modulo the usual safe free semantics.
- *
- * Acquire a new split generation.
*/
for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
next_ref = pindex->index[deleted_refs[i]];
@@ -976,6 +996,18 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_calloc_one(session, alloc_refp));
parent_incr += children * sizeof(WT_REF);
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page that previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * created pages are set to the current split generation and so can't be
+ * evicted until all readers have left the old generation.
+ *
+ * Our thread has a stable split generation, get a copy.
+ */
+ split_gen = __wt_session_gen(session, WT_GEN_SPLIT);
+
/* Allocate child pages, and connect them into the new page index. */
WT_ASSERT(session, page_refp == pindex->index + chunk);
for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
@@ -1000,10 +1032,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
ref->ref_recno = (*page_refp)->ref_recno;
ref->state = WT_REF_MEM;
- /* Initialize the child page. */
+ /*
+ * Initialize the child page.
+ * Block eviction in newly created pages and mark them dirty.
+ */
child->pg_intl_parent_ref = ref;
-
- /* Mark it dirty. */
+ child->pg_intl_split_gen = split_gen;
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
@@ -1033,32 +1067,35 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /*
- * Prepare the WT_REFs for the move: this requires a stable split
- * generation to block splits in newly created pages, so get one.
- */
- WT_ENTER_PAGE_INDEX(session);
- __split_ref_prepare(session, alloc_index,
- __wt_session_gen(session, WT_GEN_SPLIT), true);
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_prepare(session, alloc_index, true);
+
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
/* Split into the parent. */
- if ((ret = __split_parent(session, page_ref, alloc_index->index,
- alloc_index->entries, parent_incr, false, false)) == 0) {
- /*
- * Confirm the page's index hasn't moved, then update it, which
- * makes the split visible to threads descending the tree.
- */
- WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
- WT_INTL_INDEX_SET(page, replace_index);
- }
+ WT_ERR(__split_parent(session, page_ref, alloc_index->index,
+ alloc_index->entries, parent_incr, false, false));
- WT_LEAVE_PAGE_INDEX(session);
- WT_ERR(ret);
+ /*
+ * Confirm the page's index hasn't moved, then update it, which
+ * makes the split visible to threads descending the tree.
+ */
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
+ WT_INTL_INDEX_SET(page, replace_index);
+
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
/*
* Get a generation for this split, mark the parent page. This must be
* after the new index is swapped into place in order to know that no
* readers are looking at the old index.
+ *
+ * Getting a new split generation implies a full barrier, no additional
+ * barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
page->pg_intl_split_gen = split_gen;
@@ -1122,18 +1159,15 @@ err: switch (complete) {
}
/*
- * __split_internal_lock_worker --
+ * __split_internal_lock --
* Lock an internal page.
*/
static int
-__split_internal_lock_worker(WT_SESSION_IMPL *session,
- WT_REF *ref, bool trylock, WT_PAGE **parentp, bool *hazardp)
+__split_internal_lock(
+ WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, WT_PAGE **parentp)
{
- WT_DECL_RET;
WT_PAGE *parent;
- WT_REF *parent_ref;
- *hazardp = false;
*parentp = NULL;
/*
@@ -1166,10 +1200,11 @@ __split_internal_lock_worker(WT_SESSION_IMPL *session,
for (;;) {
parent = ref->home;
- /*
- * The page will be marked dirty, and we can only lock a page
- * with a modify structure.
- */
+ /* Encourage race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_PAGE_SPLIT_RACE, WT_THOUSAND);
+
+ /* Page locks live in the modify structure. */
WT_RET(__wt_page_modify_init(session, parent));
if (trylock)
@@ -1182,69 +1217,28 @@ __split_internal_lock_worker(WT_SESSION_IMPL *session,
}
/*
- * We have exclusive access to split the parent, and at this point, the
- * child prevents the parent from being evicted. However, once we
+ * This child has exclusive access to split its parent and the child's
+ * existence prevents the parent from being evicted. However, once we
* update the parent's index, it may no longer refer to the child, and
- * could conceivably be evicted. Get a hazard pointer on the parent
- * now, so that we can safely access it after updating the index.
- *
- * Take care getting the page doesn't trigger eviction work: we could
- * block trying to split a different child of our parent and deadlock
- * or we could be the eviction server relied upon by other threads to
- * populate the eviction queue.
- */
- if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
- WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
- *hazardp = true;
- }
+ * could conceivably be evicted. If the parent page is dirty, our page
+ * lock prevents eviction because reconciliation is blocked. However,
+ * if the page were clean, it could be evicted without encountering our
+ * page lock. That isn't possible because you cannot move a child page
+ * and still leave the parent page clean.
+ */
*parentp = parent;
return (0);
-
-err: WT_PAGE_UNLOCK(session, parent);
- return (ret);
-}
-
-/*
- * __split_internal_lock --
- * Lock an internal page.
- */
-static int
-__split_internal_lock(WT_SESSION_IMPL *session,
- WT_REF *ref, bool trylock, WT_PAGE **parentp, bool *hazardp)
-{
- WT_DECL_RET;
-
- /*
- * There's no lock on our parent page and we're about to acquire one,
- * which implies using the WT_REF.home field to reference our parent
- * page. As a child of the parent page, we prevent its eviction, but
- * that's a weak guarantee. If the parent page splits, and our WT_REF
- * were to move with the split, the WT_REF.home field might change
- * underneath us and we could race, and end up attempting to access
- * an evicted page. Set the session page-index generation so if the
- * parent splits, it still can't be evicted.
- */
- WT_WITH_PAGE_INDEX(session,
- ret = __split_internal_lock_worker(
- session, ref, trylock, parentp, hazardp));
- return (ret);
}
/*
* __split_internal_unlock --
* Unlock the parent page.
*/
-static int
-__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
+static void
+__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent)
{
- WT_DECL_RET;
-
- if (hazard)
- ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref);
-
WT_PAGE_UNLOCK(session, parent);
- return (ret);
}
/*
@@ -1297,13 +1291,12 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref)
* Check if we should split up the tree.
*/
static int
-__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
+__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *parent;
WT_REF *ref;
- bool parent_hazard;
btree = S2BT(session);
@@ -1317,8 +1310,10 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
* split chunk, but we'll write it upon finding it in a different part
* of the tree.
*/
- if (btree->checkpointing != WT_CKPT_OFF)
- return (__split_internal_unlock(session, page, page_hazard));
+ if (btree->checkpointing != WT_CKPT_OFF) {
+ __split_internal_unlock(session, page);
+ return (0);
+ }
/*
* Page splits trickle up the tree, that is, as leaf pages grow large
@@ -1340,7 +1335,6 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
*/
for (;;) {
parent = NULL;
- parent_hazard = false;
ref = page->pg_intl_parent_ref;
/* If we don't need to split the page, we're done. */
@@ -1360,22 +1354,18 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
* Lock the parent and split into it, then swap the parent/page
* locks, lock-coupling up the tree.
*/
- WT_ERR(__split_internal_lock(
- session, ref, true, &parent, &parent_hazard));
+ WT_ERR(__split_internal_lock(session, ref, true, &parent));
ret = __split_internal(session, parent, page);
- WT_TRET(__split_internal_unlock(session, page, page_hazard));
+ __split_internal_unlock(session, page);
page = parent;
- page_hazard = parent_hazard;
parent = NULL;
- parent_hazard = false;
WT_ERR(ret);
}
err: if (parent != NULL)
- WT_TRET(
- __split_internal_unlock(session, parent, parent_hazard));
- WT_TRET(__split_internal_unlock(session, page, page_hazard));
+ __split_internal_unlock(session, parent);
+ __split_internal_unlock(session, page);
/* A page may have been busy, in which case return without error. */
WT_RET_BUSY_OK(ret);
@@ -1462,11 +1452,11 @@ __split_multi_inmem(
case WT_PAGE_ROW_LEAF:
/* Build a key. */
if (supd->ins == NULL) {
- slot = WT_ROW_SLOT(orig, supd->rip);
+ slot = WT_ROW_SLOT(orig, supd->ripcip);
upd = orig->modify->mod_row_update[slot];
WT_ERR(__wt_row_leaf_key(
- session, orig, supd->rip, key, false));
+ session, orig, supd->ripcip, key, false));
} else {
upd = supd->ins->upd;
@@ -1530,7 +1520,7 @@ __split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi)
break;
case WT_PAGE_ROW_LEAF:
if (supd->ins == NULL) {
- slot = WT_ROW_SLOT(orig, supd->rip);
+ slot = WT_ROW_SLOT(orig, supd->ripcip);
orig->modify->mod_row_update[slot] = NULL;
} else
supd->ins->upd = NULL;
@@ -1986,21 +1976,19 @@ err: if (split_ref[0] != NULL) {
}
/*
- * __wt_split_insert --
- * Lock, then split.
+ * __split_insert_lock --
+ * Split a page's last insert list entries into a separate page.
*/
-int
-__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
+static int
+__split_insert_lock(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_DECL_RET;
WT_PAGE *parent;
- bool hazard;
-
- __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);
- WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard));
+ /* Lock the parent page, then proceed with the insert split. */
+ WT_RET(__split_internal_lock(session, ref, true, &parent));
if ((ret = __split_insert(session, ref)) != 0) {
- WT_TRET(__split_internal_unlock(session, parent, hazard));
+ __split_internal_unlock(session, parent);
return (ret);
}
@@ -2009,7 +1997,27 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* parent page locked, note the functions we call are responsible for
* releasing that lock.
*/
- return (__split_parent_climb(session, parent, hazard));
+ return (__split_parent_climb(session, parent));
+}
+
+/*
+ * __wt_split_insert --
+ * Split a page's last insert list entries into a separate page.
+ */
+int
+__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);
+
+ /*
+ * Set the session split generation to ensure underlying code isn't
+ * surprised by internal page eviction, then proceed with the insert
+ * split.
+ */
+ WT_WITH_PAGE_INDEX(session, ret = __split_insert_lock(session, ref));
+ return (ret);
}
/*
@@ -2077,21 +2085,19 @@ err: for (i = 0; i < new_entries; ++i)
}
/*
- * __wt_split_multi --
- * Lock, then split.
+ * __split_multi_lock --
+ * Split a page into multiple pages.
*/
-int
-__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
+static int
+__split_multi_lock(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
{
WT_DECL_RET;
WT_PAGE *parent;
- bool hazard;
- __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);
-
- WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
+ /* Lock the parent page, then proceed with the split. */
+ WT_RET(__split_internal_lock(session, ref, false, &parent));
if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
- WT_TRET(__split_internal_unlock(session, parent, hazard));
+ __split_internal_unlock(session, parent);
return (ret);
}
@@ -2100,26 +2106,63 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* parent page locked, note the functions we call are responsible for
* releasing that lock.
*/
- return (__split_parent_climb(session, parent, hazard));
+ return (__split_parent_climb(session, parent));
+}
+
+/*
+ * __wt_split_multi --
+ * Split a page into multiple pages.
+ */
+int
+__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
+{
+ WT_DECL_RET;
+
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);
+
+ /*
+ * Set the session split generation to ensure underlying code isn't
+ * surprised by internal page eviction, then proceed with the split.
+ */
+ WT_WITH_PAGE_INDEX(session,
+ ret = __split_multi_lock(session, ref, closing));
+ return (ret);
+}
+
+/*
+ * __split_reverse --
+ * Reverse split (rewrite a parent page's index to reflect an empty page).
+ */
+static int
+__split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+
+ /* Lock the parent page, then proceed with the reverse split. */
+ WT_RET(__split_internal_lock(session, ref, false, &parent));
+ ret = __split_parent(session, ref, NULL, 0, 0, false, true);
+ __split_internal_unlock(session, parent);
+ return (ret);
}
/*
* __wt_split_reverse --
- * We have a locked ref that is empty and we want to rewrite the index in
- * its parent.
+ * Reverse split (rewrite a parent page's index to reflect an empty page).
*/
int
__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_DECL_RET;
- WT_PAGE *parent;
- bool hazard;
__wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref);
- WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
- ret = __split_parent(session, ref, NULL, 0, 0, false, true);
- WT_TRET(__split_internal_unlock(session, parent, hazard));
+ /*
+ * Set the session split generation to ensure underlying code isn't
+ * surprised by internal page eviction, then proceed with the reverse
+ * split.
+ */
+ WT_WITH_PAGE_INDEX(session, ret = __split_reverse(session, ref));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
index e3b9bbced48..d7150859e8f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_stat.c
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -137,7 +137,6 @@ __stat_page_col_var(
WT_CELL_UNPACK *unpack, _unpack;
WT_COL *cip;
WT_INSERT *ins;
- WT_UPDATE *upd;
uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt;
uint32_t i;
bool orig_deleted;
@@ -177,31 +176,39 @@ __stat_page_col_var(
* we find, correct the original count based on its state.
*/
WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
- upd = ins->upd;
- if (upd->type == WT_UPDATE_RESERVED)
- continue;
- if (upd->type == WT_UPDATE_DELETED) {
+ switch (ins->upd->type) {
+ case WT_UPDATE_DELETED:
if (!orig_deleted) {
++deleted_cnt;
--entry_cnt;
}
- } else
+ break;
+ case WT_UPDATE_MODIFIED:
+ case WT_UPDATE_STANDARD:
if (orig_deleted) {
--deleted_cnt;
++entry_cnt;
}
+ break;
+ case WT_UPDATE_RESERVED:
+ break;
+ }
}
}
/* Walk any append list. */
- WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
- if (ins->upd->type == WT_UPDATE_RESERVED)
- continue;
- if (ins->upd->type == WT_UPDATE_DELETED)
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page))
+ switch (ins->upd->type) {
+ case WT_UPDATE_DELETED:
++deleted_cnt;
- else
+ break;
+ case WT_UPDATE_MODIFIED:
+ case WT_UPDATE_STANDARD:
++entry_cnt;
- }
+ break;
+ case WT_UPDATE_RESERVED:
+ break;
+ }
WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt);
WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt);
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index 225e6812aa1..d783f8f6e71 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -18,9 +18,16 @@ __ref_index_slot(WT_SESSION_IMPL *session,
{
WT_PAGE_INDEX *pindex;
WT_REF **start, **stop, **p, **t;
+ uint64_t sleep_count, yield_count;
uint32_t entries, slot;
- for (;;) {
+ /*
+ * If we don't find our reference, the page split and our home
+ * pointer references the wrong page. When internal pages
+ * split, their WT_REF structure home values are updated; yield
+ * and wait for that to happen.
+ */
+ for (sleep_count = yield_count = 0;;) {
/*
* Copy the parent page's index value: the page can split at
* any time, but the index's value is always valid, even if
@@ -58,14 +65,14 @@ __ref_index_slot(WT_SESSION_IMPL *session,
goto found;
}
}
-
/*
- * If we don't find our reference, the page split and our home
- * pointer references the wrong page. When internal pages
- * split, their WT_REF structure home values are updated; yield
- * and wait for that to happen.
+ * We failed to get the page index and slot reference, yield
+ * before retrying, and if we've yielded enough times, start
+ * sleeping so we don't burn CPU to no purpose.
*/
- __wt_yield();
+ __wt_ref_state_yield_sleep(&yield_count, &sleep_count);
+ WT_STAT_CONN_INCRV(session, page_index_slot_ref_blocked,
+ sleep_count);
}
found: WT_ASSERT(session, pindex->index[slot] == ref);
@@ -177,12 +184,13 @@ __ref_descend_prev(
WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
{
WT_PAGE_INDEX *pindex;
+ uint64_t yield_count;
/*
* We're passed a child page into which we're descending, and on which
* we have a hazard pointer.
*/
- for (;; __wt_yield()) {
+ for (yield_count = 0;; yield_count++, __wt_yield()) {
/*
* There's a split race when a cursor moving backwards through
* the tree descends the tree. If we're splitting an internal
@@ -242,6 +250,7 @@ __ref_descend_prev(
break;
}
*pindexp = pindex;
+ WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count);
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
index e2d19bf705b..a57a9c17edb 100644
--- a/src/third_party/wiredtiger/src/btree/row_modify.c
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -268,13 +268,13 @@ __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value,
*/
if (modify_type == WT_UPDATE_DELETED ||
modify_type == WT_UPDATE_RESERVED)
- WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE), &upd));
+ WT_RET(__wt_calloc(session, 1, WT_UPDATE_SIZE, &upd));
else {
WT_RET(__wt_calloc(
- session, 1, sizeof(WT_UPDATE) + value->size, &upd));
+ session, 1, WT_UPDATE_SIZE + value->size, &upd));
if (value->size != 0) {
upd->size = WT_STORE_SIZE(value->size);
- memcpy(WT_UPDATE_DATA(upd), value->data, value->size);
+ memcpy(upd->data, value->data, value->size);
}
}
upd->type = (uint8_t)modify_type;
@@ -302,9 +302,16 @@ __wt_update_obsolete_check(
* freeing the memory.
*
* Walk the list of updates, looking for obsolete updates at the end.
+ *
+ * Only updates with globally visible, self-contained data can terminate
+ * update chains, ignore modified and reserved updates. Special case the
+ * first transaction ID, it flags column-store overflow values which can
+ * never be discarded.
*/
for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++)
- if (__wt_txn_upd_visible_all(session, upd)) {
+ if (WT_UPDATE_DATA_VALUE(upd) &&
+ __wt_txn_upd_visible_all(session, upd) &&
+ upd->txnid != WT_TXN_FIRST) {
if (first == NULL)
first = upd;
} else if (upd->txnid != WT_TXN_ABORTED)
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index dda27fec57c..2fca9dcf69f 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -126,9 +126,6 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
{ "compatibility", "category",
NULL, NULL,
confchk_wiredtiger_open_compatibility_subconfigs, 1 },
- { "diagnostic_timing_stress", "list",
- NULL, "choices=[\"checkpoint_slow\"]",
- NULL, 0 },
{ "error_prefix", "string", NULL, NULL, NULL, 0 },
{ "eviction", "category",
NULL, NULL,
@@ -164,6 +161,10 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
{ "statistics_log", "category",
NULL, NULL,
confchk_WT_CONNECTION_reconfigure_statistics_log_subconfigs, 5 },
+ { "timing_stress_for_test", "list",
+ NULL, "choices=[\"checkpoint_slow\",\"internal_page_split_race\""
+ ",\"page_split_race\"]",
+ NULL, 0 },
{ "verbose", "list",
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\","
@@ -179,6 +180,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
static const WT_CONFIG_CHECK confchk_WT_CONNECTION_set_timestamp[] = {
{ "oldest_timestamp", "string", NULL, NULL, NULL, 0 },
+ { "stable_timestamp", "string", NULL, NULL, NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
@@ -224,6 +226,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_checkpoint[] = {
{ "name", "string", NULL, NULL, NULL, 0 },
{ "read_timestamp", "string", NULL, NULL, NULL, 0 },
{ "target", "list", NULL, NULL, NULL, 0 },
+ { "use_timestamp", "boolean", NULL, NULL, NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
@@ -729,9 +732,6 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
confchk_wiredtiger_open_compatibility_subconfigs, 1 },
{ "config_base", "boolean", NULL, NULL, NULL, 0 },
{ "create", "boolean", NULL, NULL, NULL, 0 },
- { "diagnostic_timing_stress", "list",
- NULL, "choices=[\"checkpoint_slow\"]",
- NULL, 0 },
{ "direct_io", "list",
NULL, "choices=[\"checkpoint\",\"data\",\"log\"]",
NULL, 0 },
@@ -785,6 +785,10 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "statistics_log", "category",
NULL, NULL,
confchk_wiredtiger_open_statistics_log_subconfigs, 6 },
+ { "timing_stress_for_test", "list",
+ NULL, "choices=[\"checkpoint_slow\",\"internal_page_split_race\""
+ ",\"page_split_race\"]",
+ NULL, 0 },
{ "transaction_sync", "category",
NULL, NULL,
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
@@ -823,9 +827,6 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
confchk_wiredtiger_open_compatibility_subconfigs, 1 },
{ "config_base", "boolean", NULL, NULL, NULL, 0 },
{ "create", "boolean", NULL, NULL, NULL, 0 },
- { "diagnostic_timing_stress", "list",
- NULL, "choices=[\"checkpoint_slow\"]",
- NULL, 0 },
{ "direct_io", "list",
NULL, "choices=[\"checkpoint\",\"data\",\"log\"]",
NULL, 0 },
@@ -879,6 +880,10 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "statistics_log", "category",
NULL, NULL,
confchk_wiredtiger_open_statistics_log_subconfigs, 6 },
+ { "timing_stress_for_test", "list",
+ NULL, "choices=[\"checkpoint_slow\",\"internal_page_split_race\""
+ ",\"page_split_race\"]",
+ NULL, 0 },
{ "transaction_sync", "category",
NULL, NULL,
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
@@ -916,9 +921,6 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{ "compatibility", "category",
NULL, NULL,
confchk_wiredtiger_open_compatibility_subconfigs, 1 },
- { "diagnostic_timing_stress", "list",
- NULL, "choices=[\"checkpoint_slow\"]",
- NULL, 0 },
{ "direct_io", "list",
NULL, "choices=[\"checkpoint\",\"data\",\"log\"]",
NULL, 0 },
@@ -970,6 +972,10 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{ "statistics_log", "category",
NULL, NULL,
confchk_wiredtiger_open_statistics_log_subconfigs, 6 },
+ { "timing_stress_for_test", "list",
+ NULL, "choices=[\"checkpoint_slow\",\"internal_page_split_race\""
+ ",\"page_split_race\"]",
+ NULL, 0 },
{ "transaction_sync", "category",
NULL, NULL,
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
@@ -1005,9 +1011,6 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{ "compatibility", "category",
NULL, NULL,
confchk_wiredtiger_open_compatibility_subconfigs, 1 },
- { "diagnostic_timing_stress", "list",
- NULL, "choices=[\"checkpoint_slow\"]",
- NULL, 0 },
{ "direct_io", "list",
NULL, "choices=[\"checkpoint\",\"data\",\"log\"]",
NULL, 0 },
@@ -1059,6 +1062,10 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{ "statistics_log", "category",
NULL, NULL,
confchk_wiredtiger_open_statistics_log_subconfigs, 6 },
+ { "timing_stress_for_test", "list",
+ NULL, "choices=[\"checkpoint_slow\",\"internal_page_split_race\""
+ ",\"page_split_race\"]",
+ NULL, 0 },
{ "transaction_sync", "category",
NULL, NULL,
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 },
@@ -1123,17 +1130,17 @@ static const WT_CONFIG_ENTRY config_entries[] = {
{ "WT_CONNECTION.reconfigure",
"async=(enabled=false,ops_max=1024,threads=2),cache_overhead=8,"
"cache_size=100MB,checkpoint=(log_size=0,wait=0),"
- "compatibility=(release=),diagnostic_timing_stress=,error_prefix="
- ",eviction=(threads_max=8,threads_min=1),"
- "eviction_checkpoint_target=5,eviction_dirty_target=5,"
- "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
- ",file_manager=(close_handle_minimum=250,close_idle_time=30,"
+ "compatibility=(release=),error_prefix=,eviction=(threads_max=8,"
+ "threads_min=1),eviction_checkpoint_target=5,"
+ "eviction_dirty_target=5,eviction_dirty_trigger=20,"
+ "eviction_target=80,eviction_trigger=95,"
+ "file_manager=(close_handle_minimum=250,close_idle_time=30,"
"close_scan_interval=10),log=(archive=true,prealloc=true,"
"zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4),"
"lsm_merge=true,shared_cache=(chunk=10MB,name=,quota=0,reserve=0,"
"size=500MB),statistics=none,statistics_log=(json=false,"
"on_close=false,sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
- "verbose=",
+ "timing_stress_for_test=,verbose=",
confchk_WT_CONNECTION_reconfigure, 21
},
{ "WT_CONNECTION.set_file_system",
@@ -1141,8 +1148,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
NULL, 0
},
{ "WT_CONNECTION.set_timestamp",
- "oldest_timestamp=",
- confchk_WT_CONNECTION_set_timestamp, 1
+ "oldest_timestamp=,stable_timestamp=",
+ confchk_WT_CONNECTION_set_timestamp, 2
},
{ "WT_CURSOR.close",
"",
@@ -1161,8 +1168,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
confchk_WT_SESSION_begin_transaction, 6
},
{ "WT_SESSION.checkpoint",
- "drop=,force=false,name=,read_timestamp=,target=",
- confchk_WT_SESSION_checkpoint, 5
+ "drop=,force=false,name=,read_timestamp=,target=,"
+ "use_timestamp=true",
+ confchk_WT_SESSION_checkpoint, 6
},
{ "WT_SESSION.close",
"",
@@ -1343,9 +1351,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
",builtin_extension_config=,cache_overhead=8,cache_size=100MB,"
"checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
"compatibility=(release=),config_base=true,create=false,"
- "diagnostic_timing_stress=,direct_io=,encryption=(keyid=,name=,"
- "secretkey=),error_prefix=,eviction=(threads_max=8,threads_min=1)"
- ",eviction_checkpoint_target=5,eviction_dirty_target=5,"
+ "direct_io=,encryption=(keyid=,name=,secretkey=),error_prefix=,"
+ "eviction=(threads_max=8,threads_min=1),"
+ "eviction_checkpoint_target=5,eviction_dirty_target=5,"
"eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
",exclusive=false,extensions=,file_extend=,"
"file_manager=(close_handle_minimum=250,close_idle_time=30,"
@@ -1357,9 +1365,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
"reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
- ",wait=0),transaction_sync=(enabled=false,method=fsync),"
- "use_environment=true,use_environment_priv=false,verbose=,"
- "write_through=",
+ ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
+ ",method=fsync),use_environment=true,use_environment_priv=false,"
+ "verbose=,write_through=",
confchk_wiredtiger_open, 42
},
{ "wiredtiger_open_all",
@@ -1367,9 +1375,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
",builtin_extension_config=,cache_overhead=8,cache_size=100MB,"
"checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
"compatibility=(release=),config_base=true,create=false,"
- "diagnostic_timing_stress=,direct_io=,encryption=(keyid=,name=,"
- "secretkey=),error_prefix=,eviction=(threads_max=8,threads_min=1)"
- ",eviction_checkpoint_target=5,eviction_dirty_target=5,"
+ "direct_io=,encryption=(keyid=,name=,secretkey=),error_prefix=,"
+ "eviction=(threads_max=8,threads_min=1),"
+ "eviction_checkpoint_target=5,eviction_dirty_target=5,"
"eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
",exclusive=false,extensions=,file_extend=,"
"file_manager=(close_handle_minimum=250,close_idle_time=30,"
@@ -1381,19 +1389,18 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
"reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
- ",wait=0),transaction_sync=(enabled=false,method=fsync),"
- "use_environment=true,use_environment_priv=false,verbose=,"
- "version=(major=0,minor=0),write_through=",
+ ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
+ ",method=fsync),use_environment=true,use_environment_priv=false,"
+ "verbose=,version=(major=0,minor=0),write_through=",
confchk_wiredtiger_open_all, 43
},
{ "wiredtiger_open_basecfg",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_overhead=8,cache_size=100MB,"
"checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
- "compatibility=(release=),diagnostic_timing_stress=,direct_io=,"
- "encryption=(keyid=,name=,secretkey=),error_prefix=,"
- "eviction=(threads_max=8,threads_min=1),"
- "eviction_checkpoint_target=5,eviction_dirty_target=5,"
+ "compatibility=(release=),direct_io=,encryption=(keyid=,name=,"
+ "secretkey=),error_prefix=,eviction=(threads_max=8,threads_min=1)"
+ ",eviction_checkpoint_target=5,eviction_dirty_target=5,"
"eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
",extensions=,file_extend=,file_manager=(close_handle_minimum=250"
",close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
@@ -1404,18 +1411,17 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
"reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
- ",wait=0),transaction_sync=(enabled=false,method=fsync),verbose=,"
- "version=(major=0,minor=0),write_through=",
+ ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
+ ",method=fsync),verbose=,version=(major=0,minor=0),write_through=",
confchk_wiredtiger_open_basecfg, 37
},
{ "wiredtiger_open_usercfg",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_overhead=8,cache_size=100MB,"
"checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
- "compatibility=(release=),diagnostic_timing_stress=,direct_io=,"
- "encryption=(keyid=,name=,secretkey=),error_prefix=,"
- "eviction=(threads_max=8,threads_min=1),"
- "eviction_checkpoint_target=5,eviction_dirty_target=5,"
+ "compatibility=(release=),direct_io=,encryption=(keyid=,name=,"
+ "secretkey=),error_prefix=,eviction=(threads_max=8,threads_min=1)"
+ ",eviction_checkpoint_target=5,eviction_dirty_target=5,"
"eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
",extensions=,file_extend=,file_manager=(close_handle_minimum=250"
",close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
@@ -1426,8 +1432,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
"reserve=0,size=500MB),statistics=none,statistics_log=(json=false"
",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\""
- ",wait=0),transaction_sync=(enabled=false,method=fsync),verbose=,"
- "write_through=",
+ ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false"
+ ",method=fsync),verbose=,write_through=",
confchk_wiredtiger_open_usercfg, 36
},
{ NULL, NULL, NULL, 0 }
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 09eceac0a3b..ded0e39b218 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1936,13 +1936,16 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
/*
* __wt_timing_stress_config --
- * Set diagnostic stress timing delay configuration.
+ * Set timing stress for test delay configuration.
*/
int
__wt_timing_stress_config(WT_SESSION_IMPL *session, const char *cfg[])
{
static const WT_NAME_FLAG stress_types[] = {
{ "checkpoint_slow", WT_TIMING_STRESS_CHECKPOINT_SLOW },
+ { "internal_page_split_race",
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE },
+ { "page_split_race", WT_TIMING_STRESS_PAGE_SPLIT_RACE },
{ NULL, 0 }
};
WT_CONFIG_ITEM cval, sval;
@@ -1954,22 +1957,13 @@ __wt_timing_stress_config(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
WT_RET(__wt_config_gets(
- session, cfg, "diagnostic_timing_stress", &cval));
+ session, cfg, "timing_stress_for_test", &cval));
flags = 0;
for (ft = stress_types; ft->name != NULL; ft++) {
if ((ret = __wt_config_subgets(
session, &cval, ft->name, &sval)) == 0 && sval.val != 0) {
-#ifdef HAVE_DIAGNOSTIC
LF_SET(ft->flag);
-#else
- WT_RET_MSG(session, EINVAL,
- "diagnostic_timing_stress option specified when "
- "WiredTiger built without diagnostic support. Add "
- "--enable-diagnostic to configure command and "
- "rebuild to include support for diagnostic stress "
- "timing delays");
-#endif
}
WT_RET_NOTFOUND_OK(ret);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 97fdc7557ee..45f04b66247 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -140,18 +140,20 @@ __wt_conn_dhandle_find(
* Sync and close the underlying btree handle.
*/
int
-__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
+__wt_conn_btree_sync_and_close(
+ WT_SESSION_IMPL *session, bool final, bool mark_dead)
{
WT_BM *bm;
WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- bool marked_dead, no_schema_lock;
+ bool discard, marked_dead, no_schema_lock;
+ conn = S2C(session);
btree = S2BT(session);
bm = btree->bm;
dhandle = session->dhandle;
- marked_dead = false;
if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
return (0);
@@ -180,45 +182,90 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
*/
__wt_spin_lock(session, &dhandle->close_lock);
- /*
- * The close can fail if an update cannot be written, return the EBUSY
- * error to our caller for eventual retry.
- *
- * If we are forcing the close, just mark the handle dead and the tree
- * will be discarded later. Don't do this for memory-mapped trees: we
- * have to close the file handle to allow the file to be removed, but
- * memory mapped trees contain pointers into memory that will become
- * invalid if the mapping is closed.
- */
+ /* Reset the tree's eviction priority (if any). */
+ __wt_evict_priority_clear(session);
+
+ discard = marked_dead = false;
if (!F_ISSET(btree,
WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
- if (force && (bm == NULL || !bm->is_mapped(bm, session))) {
- F_SET(session->dhandle, WT_DHANDLE_DEAD);
+ /*
+ * If the handle is already marked dead, we're just here to
+ * discard it.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_DEAD))
+ discard = true;
+
+ /*
+ * Mark the handle dead (letting the tree be discarded later) if
+ * it's not already marked dead, our caller allows it, it's not
+ * a final close, and it's not a memory-mapped tree. (We can't
+ * mark memory-mapped tree handles dead because we close the
+ * underlying file handle to allow the file to be removed and
+ * memory-mapped trees contain pointers into memory that become
+ * invalid if the mapping is closed.)
+ */
+ if (!discard && mark_dead && !final &&
+ (bm == NULL || !bm->is_mapped(bm, session)))
marked_dead = true;
- /* Reset the tree's eviction priority (if any). */
- __wt_evict_priority_clear(session);
- }
- if (!marked_dead || final) {
- if ((ret = __wt_checkpoint_close(
- session, final)) == EBUSY)
- WT_ERR(ret);
- else
- WT_TRET(ret);
+ /*
+ * Flush dirty data from any durable trees we couldn't mark
+ * dead. That involves writing a checkpoint, which can fail if
+ * an update cannot be written, causing the close to fail: if
+ * not the final close, return the EBUSY error to our caller
+ * for eventual retry.
+ *
+ * We can't discard non-durable trees yet: first we have to
+ * close the underlying btree handle, then we can mark the
+ * data handle dead.
+ */
+ if (!discard && !marked_dead) {
+ if (F_ISSET(conn, WT_CONN_IN_MEMORY) ||
+ F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ discard = true;
+ else {
+ WT_TRET(__wt_checkpoint_close(session, final));
+ if (!final && ret == EBUSY)
+ WT_ERR(ret);
+ }
}
}
+ /* Discard the underlying btree handle. */
WT_TRET(__wt_btree_close(session));
F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
/*
- * If we marked a handle dead it will be closed by sweep, via
- * another call to sync and close.
+ * If marking the handle dead, do so after closing the underlying btree.
+ * (Don't do it before that, the block manager asserts there are never
+ * two references to a block manager object, and re-opening the handle
+ * can succeed once we mark this handle dead.)
+ *
+ * Check discard too, code we call to clear the cache expects the data
+ * handle dead flag to be set when discarding modified pages.
+ */
+ if (marked_dead || discard)
+ F_SET(dhandle, WT_DHANDLE_DEAD);
+
+ /*
+ * Discard from cache any trees not marked dead in this call (that is,
+ * including trees previously marked dead). Done after marking the data
+ * handle dead for a couple reasons: first, we don't need to hold an
+ * exclusive handle to do it, second, code we call to clear the cache
+ * expects the data handle dead flag to be set when discarding modified
+ * pages.
+ */
+ if (discard)
+ WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD));
+
+ /*
+ * If we marked a handle dead it will be closed by sweep, via another
+ * call to this function. Otherwise, we're done with this handle.
*/
if (!marked_dead) {
F_CLR(dhandle, WT_DHANDLE_OPEN);
if (dhandle->checkpoint == NULL)
- --S2C(session)->open_btree_count;
+ --conn->open_btree_count;
}
WT_ASSERT(session,
F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
@@ -326,7 +373,7 @@ __wt_conn_btree_open(
* If the handle is already open, it has to be closed so it can be
* reopened with a new configuration.
*
- * This call can return EBUSY if there's an update in the object that's
+ * This call can return EBUSY if there's an update in the tree that's
* not yet globally visible. That's not a problem because it can only
* happen when we're switching from a normal handle to a "special" one,
* so we're returning EBUSY to an attempt to verify or do other special
@@ -486,7 +533,7 @@ err: WT_DHANDLE_RELEASE(dhandle);
*/
static int
__conn_dhandle_close_one(WT_SESSION_IMPL *session,
- const char *uri, const char *checkpoint, bool force)
+ const char *uri, const char *checkpoint, bool mark_dead)
{
WT_DECL_RET;
@@ -506,7 +553,7 @@ __conn_dhandle_close_one(WT_SESSION_IMPL *session,
*/
if (F_ISSET(session->dhandle, WT_DHANDLE_OPEN)) {
__wt_meta_track_sub_on(session);
- ret = __wt_conn_btree_sync_and_close(session, false, force);
+ ret = __wt_conn_btree_sync_and_close(session, false, mark_dead);
/*
* If the close succeeded, drop any locks it acquired. If
@@ -530,7 +577,7 @@ __conn_dhandle_close_one(WT_SESSION_IMPL *session,
*/
int
__wt_conn_dhandle_close_all(
- WT_SESSION_IMPL *session, const char *uri, bool force)
+ WT_SESSION_IMPL *session, const char *uri, bool mark_dead)
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
@@ -548,7 +595,7 @@ __wt_conn_dhandle_close_all(
* locking the live handle to fail fast if the tree is busy (e.g., with
* cursors open or in a checkpoint).
*/
- WT_ERR(__conn_dhandle_close_one(session, uri, NULL, force));
+ WT_ERR(__conn_dhandle_close_one(session, uri, NULL, mark_dead));
bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) {
@@ -558,7 +605,7 @@ __wt_conn_dhandle_close_all(
continue;
WT_ERR(__conn_dhandle_close_one(
- session, dhandle->name, dhandle->checkpoint, force));
+ session, dhandle->name, dhandle->checkpoint, mark_dead));
}
err: session->dhandle = NULL;
@@ -600,7 +647,7 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, bool final)
*/
int
__wt_conn_dhandle_discard_single(
- WT_SESSION_IMPL *session, bool final, bool force)
+ WT_SESSION_IMPL *session, bool final, bool mark_dead)
{
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
@@ -610,7 +657,8 @@ __wt_conn_dhandle_discard_single(
dhandle = session->dhandle;
if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
- tret = __wt_conn_btree_sync_and_close(session, final, force);
+ tret =
+ __wt_conn_btree_sync_and_close(session, final, mark_dead);
if (final && tret != 0) {
__wt_err(session, tret,
"Final close of %s failed", dhandle->name);
@@ -679,7 +727,7 @@ restart:
WT_WITH_DHANDLE(session, dhandle,
WT_TRET(__wt_conn_dhandle_discard_single(
- session, true, F_ISSET(conn, WT_CONN_IN_MEMORY))));
+ session, true, false)));
goto restart;
}
@@ -705,7 +753,7 @@ restart:
WT_TAILQ_SAFE_REMOVE_BEGIN(dhandle, &conn->dhqh, q, dhandle_tmp) {
WT_WITH_DHANDLE(session, dhandle,
WT_TRET(__wt_conn_dhandle_discard_single(
- session, true, F_ISSET(conn, WT_CONN_IN_MEMORY))));
+ session, true, false)));
} WT_TAILQ_SAFE_REMOVE_END
return (ret);
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index a6290e4ed92..fdf9bc8627e 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -484,6 +484,7 @@ __log_file_server(void *arg)
WT_LOG *log;
WT_LSN close_end_lsn, min_lsn;
WT_SESSION_IMPL *session;
+ uint64_t yield_count;
uint32_t filenum;
bool locked;
@@ -491,6 +492,7 @@ __log_file_server(void *arg)
conn = S2C(session);
log = conn->log;
locked = false;
+ yield_count = 0;
while (F_ISSET(conn, WT_CONN_SERVER_LOG)) {
/*
* If there is a log file to close, make sure any outstanding
@@ -619,6 +621,7 @@ __log_file_server(void *arg)
* thread a chance to run and try again in
* this case.
*/
+ yield_count++;
__wt_yield();
continue;
}
@@ -631,6 +634,7 @@ __log_file_server(void *arg)
if (0) {
err: WT_PANIC_MSG(session, ret, "log close server error");
}
+ WT_STAT_CONN_INCRV(session, log_server_sync_blocked, yield_count);
if (locked)
__wt_spin_unlock(session, &log->log_sync_lock);
return (WT_THREAD_RET_VALUE);
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index 592d66b5294..7236735715f 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -85,7 +85,7 @@ __sweep_expire_one(WT_SESSION_IMPL *session)
/* Only sweep clean trees where all updates are visible. */
if (btree->modified || !__wt_txn_visible_all(session,
- btree->rec_max_txn, WT_TIMESTAMP(btree->rec_max_timestamp)))
+ btree->rec_max_txn, WT_TIMESTAMP_NULL(&btree->rec_max_timestamp)))
goto err;
/*
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index 3b6328a2d93..b5a8e1353ca 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -275,6 +275,40 @@ err: CURSOR_UPDATE_API_END(session, ret);
}
/*
+ * __curfile_modify --
+ * WT_CURSOR->modify method for the btree cursor type.
+ */
+static int
+__curfile_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL_BTREE(cursor, session, modify, cbt->btree);
+ WT_ERR(__cursor_checkkey(cursor));
+
+ /* Check for a rational modify vector count. */
+ if (nentries <= 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Illegal modify vector with %d entries", nentries);
+
+ WT_ERR(__wt_btcur_modify(cbt, entries, nentries));
+
+ /*
+ * Modify maintains a position, key and value. Unlike update, it's not
+ * always an internal value.
+ */
+ WT_ASSERT(session,
+ F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
+ WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_VALUE_SET) != 0);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
* __curfile_update --
* WT_CURSOR->update method for the btree cursor type.
*/
@@ -513,6 +547,15 @@ __curfile_create(WT_SESSION_IMPL *session,
/* Underlying btree initialization. */
__wt_btcur_open(cbt);
+ /*
+ * WT_CURSOR.modify supported on 'u' value formats, but the fast-path
+ * through the btree code requires log file format changes, it's not
+ * available in all versions.
+ */
+ if (WT_STREQ(cursor->value_format, "u") &&
+ S2C(session)->compat_major >= WT_LOG_V2)
+ cursor->modify = __curfile_modify;
+
WT_ERR(__wt_cursor_init(
cursor, cursor->internal_uri, owner, cfg, cursorp));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c
index d1166c5d402..0ba3ce83e59 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_log.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_log.c
@@ -104,6 +104,13 @@ __curlog_op_read(WT_SESSION_IMPL *session,
pp = cl->stepp;
end = pp + opsize;
switch (optype) {
+ case WT_LOGOP_COL_MODIFY:
+ WT_RET(__wt_logop_col_modify_unpack(session, &pp, end,
+ fileid, &recno, &value));
+ WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
+ WT_RET(__wt_buf_set(session,
+ cl->opvalue, value.data, value.size));
+ break;
case WT_LOGOP_COL_PUT:
WT_RET(__wt_logop_col_put_unpack(session, &pp, end,
fileid, &recno, &value));
@@ -117,6 +124,13 @@ __curlog_op_read(WT_SESSION_IMPL *session,
WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0));
break;
+ case WT_LOGOP_ROW_MODIFY:
+ WT_RET(__wt_logop_row_modify_unpack(session, &pp, end,
+ fileid, &key, &value));
+ WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size));
+ WT_RET(__wt_buf_set(session,
+ cl->opvalue, value.data, value.size));
+ break;
case WT_LOGOP_ROW_PUT:
WT_RET(__wt_logop_row_put_unpack(session, &pp, end,
fileid, &key, &value));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
index 91995ab0e0a..28762f798c8 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_std.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -141,15 +141,16 @@ __wt_cursor_set_notsup(WT_CURSOR *cursor)
* in the future to change these configurations.
*/
cursor->compare = __wt_cursor_compare_notsup;
+ cursor->insert = __wt_cursor_notsup;
+ cursor->modify = __wt_cursor_modify_notsup;
cursor->next = __wt_cursor_notsup;
cursor->prev = __wt_cursor_notsup;
+ cursor->remove = __wt_cursor_notsup;
+ cursor->reserve = __wt_cursor_notsup;
cursor->reset = __wt_cursor_noop;
cursor->search = __wt_cursor_notsup;
cursor->search_near = __wt_cursor_search_near_notsup;
- cursor->insert = __wt_cursor_notsup;
cursor->update = __wt_cursor_notsup;
- cursor->remove = __wt_cursor_notsup;
- cursor->reserve = __wt_cursor_notsup;
}
/*
@@ -603,88 +604,36 @@ __cursor_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
- WT_DECL_ITEM(ta);
- WT_DECL_ITEM(tb);
- WT_DECL_ITEM(tmp);
- size_t len, size;
- int i;
CURSOR_UPDATE_API_CALL(cursor, session, modify);
- WT_ERR(__cursor_checkkey(cursor));
-
- /* Check for a rational modify vector count. */
- if (nentries <= 0)
- WT_ERR_MSG(
- session, EINVAL, "Illegal modify vector of %d", nentries);
WT_STAT_CONN_INCR(session, cursor_modify);
WT_STAT_DATA_INCR(session, cursor_modify);
- /* Acquire position and value. */
- WT_ERR(cursor->search(cursor));
+ /* Check for a rational modify vector count. */
+ if (nentries <= 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Illegal modify vector with %d entries", nentries);
/*
- * Process the entries to figure out how large a buffer we need. This is
- * a bit pessimistic because we're ignoring replacement bytes, but it's
- * a simpler calculation.
+ * The underlying btree code cannot support WT_CURSOR.modify within
+ * a read-uncommitted transaction. Disallow it here for consistency.
*/
- for (size = cursor->value.size, i = 0; i < nentries; ++i) {
- if (entries[i].offset >= size)
- size = entries[i].offset;
- size += entries[i].data.size;
- }
+ if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
+ WT_ERR_MSG(session, ENOTSUP,
+ "not supported in read-uncommitted transactions");
- /* Allocate a pair of buffers. */
- WT_ERR(__wt_scr_alloc(session, size, &ta));
- WT_ERR(__wt_scr_alloc(session, size, &tb));
-
- /* Apply the change vector to the value. */
- WT_ERR(__wt_buf_set(
- session, ta, cursor->value.data, cursor->value.size));
- for (i = 0; i < nentries; ++i) {
- /* Take leading bytes from the original, plus any gap bytes. */
- if (entries[i].offset >= ta->size) {
- memcpy(tb->mem, ta->mem, ta->size);
- if (entries[i].offset > ta->size)
- memset((uint8_t *)tb->mem + ta->size,
- '\0', entries[i].offset - ta->size);
- } else
- if (entries[i].offset > 0)
- memcpy(tb->mem, ta->mem, entries[i].offset);
- tb->size = entries[i].offset;
-
- /* Take replacement bytes. */
- if (entries[i].data.size > 0) {
- memcpy((uint8_t *)tb->mem + tb->size,
- entries[i].data.data, entries[i].data.size);
- tb->size += entries[i].data.size;
- }
-
- /* Take trailing bytes from the original. */
- len = entries[i].offset + entries[i].size;
- if (ta->size > len) {
- memcpy((uint8_t *)tb->mem + tb->size,
- (uint8_t *)ta->mem + len, ta->size - len);
- tb->size += ta->size - len;
- }
- WT_ASSERT(session, tb->size <= size);
-
- tmp = ta;
- ta = tb;
- tb = tmp;
- }
+ WT_ERR(__cursor_checkkey(cursor));
- /* Set the cursor's value. */
- ta->data = ta->mem;
- cursor->set_value(cursor, ta);
+ /* Get the current value, apply the modifications. */
+ WT_ERR(cursor->search(cursor));
+ WT_ERR(__wt_modify_apply_api(
+ session, &cursor->value, entries, nentries));
/* We know both key and value are set, "overwrite" doesn't matter. */
ret = cursor->update(cursor);
-err: __wt_scr_free(session, &ta);
- __wt_scr_free(session, &tb);
-
- CURSOR_UPDATE_API_END(session, ret);
+err: CURSOR_UPDATE_API_END(session, ret);
return (ret);
}
@@ -789,6 +738,7 @@ __wt_cursor_init(WT_CURSOR *cursor,
WT_CONFIG_ITEM cval;
WT_CURSOR *cdump;
WT_SESSION_IMPL *session;
+ bool readonly;
session = (WT_SESSION_IMPL *)cursor->session;
@@ -810,21 +760,23 @@ __wt_cursor_init(WT_CURSOR *cursor,
* Checkpoint cursors are permanently read-only, avoid the extra work
* of two configuration string checks.
*/
- WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
- if (cval.len != 0) {
+ readonly = F_ISSET(S2C(session), WT_CONN_READONLY);
+ if (!readonly) {
+ WT_RET(
+ __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ readonly = cval.len != 0;
+ }
+ if (!readonly) {
+ WT_RET(
+ __wt_config_gets_def(session, cfg, "readonly", 0, &cval));
+ readonly = cval.val != 0;
+ }
+ if (readonly) {
cursor->insert = __wt_cursor_notsup;
+ cursor->modify = __wt_cursor_modify_notsup;
cursor->remove = __wt_cursor_notsup;
cursor->reserve = __wt_cursor_notsup;
cursor->update = __wt_cursor_notsup;
- } else {
- WT_RET(
- __wt_config_gets_def(session, cfg, "readonly", 0, &cval));
- if (cval.val != 0 || F_ISSET(S2C(session), WT_CONN_READONLY)) {
- cursor->insert = __wt_cursor_notsup;
- cursor->remove = __wt_cursor_notsup;
- cursor->reserve = __wt_cursor_notsup;
- cursor->update = __wt_cursor_notsup;
- }
}
/*
@@ -865,7 +817,7 @@ __wt_cursor_init(WT_CURSOR *cursor,
/*
* WT_CURSOR.modify supported on 'u' value formats, but may have been
- * already initialized.
+ * already initialized (file cursors have a faster implementation).
*/
if (WT_STREQ(cursor->value_format, "u") &&
cursor->modify == __wt_cursor_modify_notsup)
diff --git a/src/third_party/wiredtiger/src/docs/Doxyfile b/src/third_party/wiredtiger/src/docs/Doxyfile
index e7382e2bc5e..8292df18e47 100644
--- a/src/third_party/wiredtiger/src/docs/Doxyfile
+++ b/src/third_party/wiredtiger/src/docs/Doxyfile
@@ -232,7 +232,9 @@ ALIASES = "notyet{1}=Note: <b>"\1"</b> not yet supported in Wired
"configstart{2}=@param config\n Configuration string, see @ref config_strings. Permitted values:\n <table>@hrow{Name,Effect,Values}" \
"config{3}= @row{<tt>\1</tt>,\2,\3}" \
"configend= </table>" \
- "configempty{2}=@param config\n Configuration string, see @ref config_strings. No values currently permitted."
+ "configempty{2}=@param config\n Configuration string, see @ref config_strings. No values currently permitted." \
+ "plantuml_start{1}=\image html \1\n\image latex \1\n<!-- PlantUML template begins" \
+ "plantuml_end=PlantUML template end -->"
# This tag can be used to specify a number of word-keyword mappings (TCL only).
# A mapping has the form "name=value". For example adding
diff --git a/src/third_party/wiredtiger/src/docs/devdoc-index.dox b/src/third_party/wiredtiger/src/docs/devdoc-index.dox
new file mode 100644
index 00000000000..ba809d7af43
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/devdoc-index.dox
@@ -0,0 +1,12 @@
+/*! @page devdoc-index Developer Documentation
+
+Most applications begin to make use of WiredTiger by creating a table (or other
+data object) to store their data in. Create is one of several schema operations
+available in WiredTiger.
+
+For more information on how schema operations are implemented in WiredTiger,
+see:
+
+- @subpage devdoc-schema
+
+*/
diff --git a/src/third_party/wiredtiger/src/docs/devdoc-schema.dox b/src/third_party/wiredtiger/src/docs/devdoc-schema.dox
new file mode 100644
index 00000000000..509fbf16d67
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/devdoc-schema.dox
@@ -0,0 +1,208 @@
+/*! @page devdoc-schema Schema operations
+
+A schema defines the format of the application data in WiredTiger. WiredTiger
+supports various types of schemas (See @ref schema for more
+information), operated upon through a WT_SESSION reference. This section details
+the internals of these various schema operations.
+
+Schema operations cause an update to the metadata and are performed under a
+schema lock to avoid concurrent operations on the same object. The following
+sequence of steps define a generic schema operation:
+
+@plantuml_start{schema_generic.png}
+@startuml{schema_generic.png}
+:A schema operation;
+partition with-schema-lock {
+ :perform operation on underlying data-object;
+ :update metadata-file;
+ :checkpoint and sync metadata;
+}
+stop
+@enduml
+@plantuml_end
+
+@section schema_create Schema Create
+
+The create schema operation is responsible for creating the underlying data
+object on the filesystem with the right parameters and then creating an entry
+for this new object into the metadata. The sequence of operations involved in a
+create for various schema types are as follows:
+
+@plantuml_start{schema_create.png}
+@startuml{schema_create.png}
+:WT_SESSION->create(.,name,.)
+(__session_create());
+
+partition session-API-call {
+ :API session init with NULL dhandle;
+ :exit if PANIC flag set;
+ :exit if invalid configuration;
+}
+
+:validate "name" and if passed "type" config parameter;
+note right
+ "name" parameter is called as "uri" internally.
+ "type" is usually not passed and generally
+ implied from the uri.
+end note
+
+partition with-schema-lock {
+ partition with-table-lock {
+ :turn on meta tracking;
+ :check uri}
+
+ split
+ :uri matches "file:"
+ ("file" is the underlying
+ type for all the objects);
+ split again
+ :uri matches "colgroup:";
+ :__create_colgroup();
+ split again
+ :uri matches "table:";
+ :__create_table();
+ split again
+ :uri matches "lsm:";
+ :__wt_lsm_tree_create();
+ split again
+ :uri matches "index:";
+ :__create_index();
+ split again
+ :matches a named data-source;
+ :__create_data_source();
+ end split
+
+ partition __create_file() {
+ :exit if file exists;
+ :validate allocation size;
+ :block manager creates the file:
+ 1.create file using __wt_open()
+ 2.write an initial descriptor to file
+ 3.fsync and close the file handle;
+ if (metadata-file?) then (yes)
+ else (no)
+ :update metadata with file
+ configuration and version;
+ endif
+ :check if file setup correctly by
+ getting btree handle with
+ WT_DHANDLE_EXCLUSIVE set;
+ if (metatracking on?) then (yes)
+ :track locked handle*;
+ else (no)
+ :release btree -
+ sync and close;
+ endif
+ }
+
+ partition turn-off-meta-tracking {
+ if (errors?) then (yes)
+ :unroll operations;
+ else (no)
+ if (logging?) then (yes)
+ :sync log;
+ else (no)
+ endif
+ :checkpoint and sync;
+ endif
+ :apply post-commit ops:
+ release tracked (handle) btree* -
+ sync and close;
+ note right
+ if meta tracking is on, this btree
+ was being tracked as locked. As part
+ of tuning off meta tracking, we sync
+ and close this btree
+ end note
+ }
+ }
+}
+
+:API-end;
+
+stop
+@enduml
+@plantuml_end
+
+@section schema_rename Schema Rename
+
+The rename schema operation is responsible for renaming the underlying data
+object on the filesystem and updating the metadata accordingly. The sequence of
+operations involved in a rename for various schema types are as follows:
+
+@plantuml_start{schema_rename.png}
+@startuml{schema_rename.png}
+:WT_SESSION->rename(old-uri, new-uri, .)
+(__session_rename());
+:session-API-call;
+
+partition with-checkpoint-lock {
+ partition with-schema-lock {
+ partition with-table-write-lock {
+ :validate new uri-type to match the old type;
+ :turn on meta tracking;
+ :check uri}
+
+ split
+ :uri matches "file:"
+ ("file" is the underlying
+ type for all the objects);
+ split again
+ :uri matches "lsm:";
+ :__wt_lsm_tree_rename();
+ split again
+ :matches a named data-source;
+ :WT_DATA_SOURCE::rename();
+ split again
+ :uri matches "table:";
+ partition __rename_table() {
+ :rename colgroups and indices represented by the table:
+ 1. extract names from the uri
+ 2. create new uri with existing types and configuration
+ 3. recursive call the rename operation on individual
+ colgroup and index with the old and the new uri
+ 4. remove old entry for colgroups and indices from
+ the metadata table and add the new ones;
+ :close and remove table handle from the session;
+ :remove old table entry from the metadata table
+ and add a new one;
+ }
+ end split
+
+ partition __rename_file() {
+ :fail if backup cursor open and schema operations will conflict;
+ :close btree handles in the file;
+ :fail if file with the old name doesn't exist or with the new
+ name exists;
+ :remove old file entries and add new in the metadata;
+ :rename the underlying file;
+ if (meta-tracking?) then (yes)
+ :track filesystem op;
+ else (no)
+ endif
+ }
+
+ :bump schema generation number to ignore stale data;
+
+ partition turn-off-meta-tracking {
+ if (errors?) then (yes)
+ :unroll operations;
+ else (no)
+ if (logging?) then (yes)
+ :sync log;
+ else (no)
+ endif
+ :checkpoint and sync;
+ endif
+ }
+ }
+ }
+}
+
+:API-end;
+
+stop
+@enduml
+@plantuml_end
+
+*/
diff --git a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_create.png b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_create.png
new file mode 100644
index 00000000000..d2699646e9c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_create.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_generic.png b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_generic.png
new file mode 100644
index 00000000000..790584b3f7b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_generic.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_rename.png b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_rename.png
new file mode 100644
index 00000000000..b0aa560946e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_rename.png
Binary files differ
diff --git a/src/third_party/wiredtiger/src/docs/introduction.dox b/src/third_party/wiredtiger/src/docs/introduction.dox
index 52936a04d3c..3a6886869b0 100644
--- a/src/third_party/wiredtiger/src/docs/introduction.dox
+++ b/src/third_party/wiredtiger/src/docs/introduction.dox
@@ -59,4 +59,8 @@ To browse the WiredTiger source code repository or contact us, see:
- @subpage community
+For more information on WiredTiger internals, see:
+
+- @subpage devdoc-index
+
*/
diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok
index 5d629f4c49f..112e5e815fe 100644
--- a/src/third_party/wiredtiger/src/docs/spell.ok
+++ b/src/third_party/wiredtiger/src/docs/spell.ok
@@ -15,6 +15,7 @@ Coverity
Coverity's
DB's
DBTs
+DHANDLE
DONTNEED
Datastore
DbCursor
@@ -185,6 +186,8 @@ desc
destructor
destructors
dev
+devdoc
+dhandle
disjunction
disjunctions
distclean
@@ -208,6 +211,7 @@ endcond
endian
endif
endinternal
+enduml
english
env
eof
@@ -317,6 +321,7 @@ memalloc
memfree
memp
metadata
+metatracking
minkey
mixin
mixins
@@ -375,6 +380,7 @@ perf
petabyte
pget
php
+plantuml
png
posix
pre
@@ -440,6 +446,7 @@ sql
src
ssd
startsync
+startuml
statlog
stderr
str
diff --git a/src/third_party/wiredtiger/src/docs/transactions.dox b/src/third_party/wiredtiger/src/docs/transactions.dox
index 8a05de9b5f5..d9cc72dcf24 100644
--- a/src/third_party/wiredtiger/src/docs/transactions.dox
+++ b/src/third_party/wiredtiger/src/docs/transactions.dox
@@ -164,7 +164,7 @@ timestamp size is 8 bytes (i.e., 64 bits). Setting a size of zero disables
transaction timestamp functionality.
Applications can assign explicit commit timestamps to transactions, then read
-"as of" a timestamp. Timestamps and are communicated to WiredTiger using a
+"as of" a timestamp. Timestamps are communicated to WiredTiger using a
lower case hexadecimal encoding, so the encoded value can be twice as long as
the raw timestamp value.
@@ -177,6 +177,13 @@ WiredTiger can discard history before the specified point. It is critical
that the oldest timestamp update frequently or the cache can become full of
updates, reducing performance.
+Setting a stable timestamp in WT_CONNECTION::set_timestamp indicates a
+known stable location that is sufficient for durability. During a checkpoint
+the state of a table will be saved only as of the stable timestamp. Newer
+updates after that stable timestamp will not be included in the checkpoint.
+That can be overridden in the call to WT_SESSION::checkpoint. It is expected
+that the stable timestamp is updated frequently.
+
Commit timestamps cannot be set in the past of any read timestamp that has
been used. This is enforced by assertions in diagnostic builds, if
applications violate this rule, data consistency can be violated.
diff --git a/src/third_party/wiredtiger/src/docs/upgrade.dox b/src/third_party/wiredtiger/src/docs/upgrade.dox
index 7952efa0343..9f8331c9b1f 100644
--- a/src/third_party/wiredtiger/src/docs/upgrade.dox
+++ b/src/third_party/wiredtiger/src/docs/upgrade.dox
@@ -60,11 +60,17 @@ the WT_SESSION::upgrade method.
Applications wanting the ability to downgrade to previous releases of
WiredTiger are constrained in some important ways:
-- The underlying data file formats must not have changed between the
+- The underlying table file formats must not have changed between the
release currently being run and the release to which you are
-downgrading. In other words, data file format changes are not backward
-compatible, and an upgraded data file cannot be downgraded without being
+downgrading. In other words, table file format changes are not backward
+compatible, and an upgraded table file cannot be downgraded without being
dumped and re-loaded.
+- If the underlying log file format changed and you have logging enabled
+in the database, in order to retain the ability to downgrade to earlier
+releases you must use \c compatibility option in the call to
+::wiredtiger_open or WT_CONNECTION::reconfigure. When running at an
+earlier compatibility release setting, the log files generated will be
+compatible with earlier binaries.
<br><br>
- Applications concerned with downgrading should configure the
::wiredtiger_open \c config_base value to \c false, so WiredTiger does
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 1f26949c94f..6b341a85df8 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -964,7 +964,7 @@ __evict_tune_workers(WT_SESSION_IMPL *session)
* If we have a fixed number of eviction threads, there is no value in
* calculating if we should do any tuning.
*/
- if (conn->evict_threads_max == conn->evict_threads_min)
+ if (conn->evict_threads_max == conn->evict_threads_min)
return;
WT_ASSERT(session, conn->evict_threads.threads[0]->session == session);
@@ -1506,10 +1506,8 @@ retry: while (slot < max_entries) {
goto retry;
}
-err: if (dhandle_locked) {
+err: if (dhandle_locked)
__wt_readunlock(session, &conn->dhandle_lock);
- dhandle_locked = false;
- }
/*
* If we didn't find any entries on a walk when we weren't interrupted,
@@ -1604,7 +1602,6 @@ __evict_walk_file(WT_SESSION_IMPL *session,
start = queue->evict_queue + *slotp;
remaining_slots = max_entries - *slotp;
total_slots = max_entries - queue->evict_entries;
- btree_inuse = cache_inuse = 0;
target_pages_clean = target_pages_dirty = 0;
/*
@@ -1813,10 +1810,8 @@ __evict_walk_file(WT_SESSION_IMPL *session,
}
if (ref == NULL) {
- WT_STAT_CONN_INCR(
- session, cache_eviction_walks_ended);
- WT_STAT_DATA_INCR(
- session, cache_eviction_walks_ended);
+ WT_STAT_CONN_INCR(session, cache_eviction_walks_ended);
+ WT_STAT_DATA_INCR(session, cache_eviction_walks_ended);
if (++restarts == 2) {
WT_STAT_CONN_INCR(
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index a12590dedbc..03d6e9ab503 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -597,7 +597,7 @@ __evict_review(
LF_ISSET(WT_EVICT_LOOKASIDE) ||
F_ISSET(S2BT(session), WT_BTREE_LOOKASIDE) ||
__wt_txn_visible_all(session, page->modify->rec_max_txn,
- WT_TIMESTAMP(page->modify->rec_max_timestamp)));
+ WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp)));
return (0);
}
diff --git a/src/third_party/wiredtiger/src/evict/evict_stat.c b/src/third_party/wiredtiger/src/evict/evict_stat.c
index 63d0aff60f1..b8878f1ce2f 100644
--- a/src/third_party/wiredtiger/src/evict/evict_stat.c
+++ b/src/third_party/wiredtiger/src/evict/evict_stat.c
@@ -29,7 +29,7 @@ __evict_stat_walk(WT_SESSION_IMPL *session)
btree = S2BT(session);
cache = S2C(session)->cache;
next_walk = NULL;
- dsk_size = gen_gap = gen_gap_max = gen_gap_sum = max_pagesize = 0;
+ gen_gap_max = gen_gap_sum = max_pagesize = 0;
num_memory = num_not_queueable = num_queued = 0;
num_smaller_allocsz = pages_clean = pages_dirty = pages_internal = 0;
pages_leaf = seen_count = size = visited_count = 0;
diff --git a/src/third_party/wiredtiger/src/include/bitstring.i b/src/third_party/wiredtiger/src/include/bitstring.i
index a9ec91d49ff..bd14fa613a8 100644
--- a/src/third_party/wiredtiger/src/include/bitstring.i
+++ b/src/third_party/wiredtiger/src/include/bitstring.i
@@ -166,8 +166,6 @@ __bit_ffc(uint8_t *bitf, uint64_t nbits, uint64_t *retp)
uint8_t lb;
uint64_t byte, stopbyte, value;
- value = 0; /* -Wuninitialized */
-
if (nbits == 0)
return (-1);
@@ -199,7 +197,6 @@ __bit_ffs(uint8_t *bitf, uint64_t nbits, uint64_t *retp)
uint8_t lb;
uint64_t byte, stopbyte, value;
- value = 0;
if (nbits == 0)
return (-1);
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index e8d3307b013..01a9179aedc 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -263,17 +263,17 @@ struct __wt_page_modify {
void *disk_image;
/*
- * List of unresolved updates. Updates are either a WT_INSERT
- * or a row-store leaf page entry; when creating lookaside
- * records, there is an additional value, the committed item's
- * transaction ID.
+ * List of unresolved updates. Updates are either a row-store
+ * insert or update list, or column-store insert list. When
+ * creating lookaside records, there is an additional value,
+ * the committed item's transaction information.
*
* If there are unresolved updates, the block wasn't written and
* there will always be a disk image.
*/
struct __wt_save_upd {
- WT_INSERT *ins;
- WT_ROW *rip;
+ WT_INSERT *ins; /* Insert list reference */
+ WT_ROW *ripcip; /* Original on-page reference */
uint64_t onpage_txn;
WT_DECL_TIMESTAMP(onpage_timestamp)
} *supd;
@@ -695,7 +695,7 @@ struct __wt_page {
* Related information for fast-delete, on-disk pages.
*/
struct __wt_page_deleted {
- uint64_t txnid; /* Transaction ID */
+ volatile uint64_t txnid; /* Transaction ID */
WT_DECL_TIMESTAMP(timestamp)
WT_UPDATE **update_list; /* List of updates for abort */
@@ -885,43 +885,60 @@ struct __wt_ikey {
* is done for an entry, WT_UPDATE structures are formed into a forward-linked
* list.
*/
-WT_PACKED_STRUCT_BEGIN(__wt_update)
- uint64_t txnid; /* transaction */
- WT_DECL_TIMESTAMP(timestamp)
+struct __wt_update {
+ volatile uint64_t txnid; /* transaction ID */
+#if WT_TIMESTAMP_SIZE == 8
+ WT_DECL_TIMESTAMP(timestamp) /* aligned uint64_t timestamp */
+#endif
WT_UPDATE *next; /* forward-linked list */
uint32_t size; /* data length */
-#define WT_UPDATE_STANDARD 0
-#define WT_UPDATE_DELETED 1
-#define WT_UPDATE_RESERVED 2
+#define WT_UPDATE_DELETED 0 /* deleted */
+#define WT_UPDATE_MODIFIED 1 /* partial-update modify value */
+#define WT_UPDATE_RESERVED 2 /* reserved */
+#define WT_UPDATE_STANDARD 3 /* complete value */
uint8_t type; /* type (one byte to conserve memory) */
- /* The update includes a complete value. */
+ /* If the update includes a complete value. */
#define WT_UPDATE_DATA_VALUE(upd) \
((upd)->type == WT_UPDATE_STANDARD || (upd)->type == WT_UPDATE_DELETED)
- /* The untyped value immediately follows the WT_UPDATE structure. */
-#define WT_UPDATE_DATA(upd) \
- ((void *)((uint8_t *)(upd) + sizeof(WT_UPDATE)))
+#if WT_TIMESTAMP_SIZE != 8
+ WT_DECL_TIMESTAMP(timestamp) /* unaligned uint8_t array timestamp */
+#endif
/*
- * The memory size of an update: include some padding because this is
- * such a common case that overhead of tiny allocations can swamp our
- * cache overhead calculation.
+ * Zero or more bytes of value (the payload) immediately follows the
+ * WT_UPDATE structure. We use a C99 flexible array member which has
+ * the semantics we want.
*/
-#define WT_UPDATE_MEMSIZE(upd) \
- WT_ALIGN(sizeof(WT_UPDATE) + (upd)->size, 32)
-WT_PACKED_STRUCT_END
+ uint8_t data[]; /* start of the data */
+};
/*
- * WT_UPDATE_SIZE is the expected structure size -- we verify the build to
- * ensure the compiler hasn't inserted padding.
+ * WT_UPDATE_SIZE is the expected structure size excluding the payload data --
+ * we verify the build to ensure the compiler hasn't inserted padding.
*/
#define WT_UPDATE_SIZE (21 + WT_TIMESTAMP_SIZE)
/*
+ * The memory size of an update: include some padding because this is such a
+ * common case that overhead of tiny allocations can swamp our cache overhead
+ * calculation.
+ */
+#define WT_UPDATE_MEMSIZE(upd) \
+ WT_ALIGN(WT_UPDATE_SIZE + (upd)->size, 32)
+
+/*
+ * WT_MAX_MODIFY_UPDATE --
+ * Limit update chains to a small value to avoid penalizing reads and
+ * permit truncation.
+ */
+#define WT_MAX_MODIFY_UPDATE 100
+
+/*
* WT_INSERT --
*
* Row-store leaf pages support inserts of new K/V pairs. When the first K/V
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 305de509424..216c99b1d9e 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -1341,8 +1341,8 @@ __wt_page_can_evict(
* If the page is clean but has modifications that appear too new to
* evict, skip it.
*/
- if (!modified && !__wt_txn_visible_all(
- session, mod->rec_max_txn, WT_TIMESTAMP(mod->rec_max_timestamp)))
+ if (!modified && !__wt_txn_visible_all(session,
+ mod->rec_max_txn, WT_TIMESTAMP_NULL(&mod->rec_max_timestamp)))
return (false);
return (true);
@@ -1602,3 +1602,24 @@ __wt_split_descent_race(
WT_INTL_INDEX_GET(session, ref->home, pindex);
return (pindex != saved_pindex);
}
+
+/*
+ * __wt_ref_state_yield_sleep --
+ * sleep while waiting for the wt_ref state after THOUSAND yields.
+ */
+static inline void
+__wt_ref_state_yield_sleep(uint64_t *yield_count, uint64_t *sleep_count)
+{
+ /*
+ * We yield before retrying, and if we've yielded enough times, start
+ * sleeping so we don't burn CPU to no purpose.
+ */
+ if ((*yield_count) < WT_THOUSAND) {
+ (*yield_count)++;
+ __wt_yield();
+ return;
+ }
+
+ (*sleep_count) = WT_MIN((*sleep_count) + WT_THOUSAND, 10 * WT_THOUSAND);
+ __wt_sleep(0, (*sleep_count));
+}
diff --git a/src/third_party/wiredtiger/src/include/buf.i b/src/third_party/wiredtiger/src/include/buf.i
index 17f67afefce..8ff52f86ced 100644
--- a/src/third_party/wiredtiger/src/include/buf.i
+++ b/src/third_party/wiredtiger/src/include/buf.i
@@ -116,18 +116,18 @@ __wt_scr_free(WT_SESSION_IMPL *session, WT_ITEM **bufp)
{
WT_ITEM *buf;
- if ((buf = *bufp) != NULL) {
- *bufp = NULL;
+ if ((buf = *bufp) == NULL)
+ return;
+ *bufp = NULL;
- if (session->scratch_cached + buf->memsize >=
- S2C(session)->session_scratch_max) {
- __wt_free(session, buf->mem);
- buf->memsize = 0;
- } else
- session->scratch_cached += buf->memsize;
+ if (session->scratch_cached + buf->memsize >=
+ S2C(session)->session_scratch_max) {
+ __wt_free(session, buf->mem);
+ buf->memsize = 0;
+ } else
+ session->scratch_cached += buf->memsize;
- buf->data = NULL;
- buf->size = 0;
- F_CLR(buf, WT_ITEM_INUSE);
- }
+ buf->data = NULL;
+ buf->size = 0;
+ F_CLR(buf, WT_ITEM_INUSE);
}
diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i
index 0dbf29d21c3..52e9f3c9637 100644
--- a/src/third_party/wiredtiger/src/include/cell.i
+++ b/src/third_party/wiredtiger/src/include/cell.i
@@ -730,6 +730,7 @@ __cell_data_ref(WT_SESSION_IMPL *session,
{
WT_BTREE *btree;
void *huffman;
+ bool decoded;
btree = S2BT(session);
@@ -749,14 +750,16 @@ __cell_data_ref(WT_SESSION_IMPL *session,
huffman = btree->huffman_value;
break;
case WT_CELL_KEY_OVFL:
- WT_RET(__wt_ovfl_read(session, page, unpack, store));
- if (page_type == WT_PAGE_ROW_INT)
+ WT_RET(__wt_ovfl_read(session, page, unpack, store, &decoded));
+ if (page_type == WT_PAGE_ROW_INT || decoded)
return (0);
huffman = btree->huffman_key;
break;
case WT_CELL_VALUE_OVFL:
- WT_RET(__wt_ovfl_read(session, page, unpack, store));
+ WT_RET(__wt_ovfl_read(session, page, unpack, store, &decoded));
+ if (decoded)
+ return (0);
huffman = btree->huffman_value;
break;
WT_ILLEGAL_VALUE(session);
diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i
index 75fd935fc91..e49a9258329 100644
--- a/src/third_party/wiredtiger/src/include/cursor.i
+++ b/src/third_party/wiredtiger/src/include/cursor.i
@@ -441,11 +441,8 @@ value:
* caller passes us the update: it has already resolved which one
* (if any) is visible.
*/
- if (upd != NULL) {
- vb->data = WT_UPDATE_DATA(upd);
- vb->size = upd->size;
- return (0);
- }
+ if (upd != NULL)
+ return (__wt_value_return(session, cbt, upd));
/* Else, simple values have their location encoded in the WT_ROW. */
if (__wt_row_leaf_value(page, rip, vb))
diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h
index 0db59d45691..32574f05fa1 100644
--- a/src/third_party/wiredtiger/src/include/dhandle.h
+++ b/src/third_party/wiredtiger/src/include/dhandle.h
@@ -99,9 +99,9 @@ struct __wt_data_handle {
/* Flags values over 0xff are reserved for WT_BTREE_* */
#define WT_DHANDLE_DEAD 0x01 /* Dead, awaiting discard */
-#define WT_DHANDLE_DISCARD 0x02 /* Discard on release */
-#define WT_DHANDLE_DISCARD_FORCE 0x04 /* Force discard on release */
-#define WT_DHANDLE_EXCLUSIVE 0x08 /* Need exclusive access */
+#define WT_DHANDLE_DISCARD 0x02 /* Close on release */
+#define WT_DHANDLE_DISCARD_KILL 0x04 /* Mark dead on release */
+#define WT_DHANDLE_EXCLUSIVE 0x08 /* Exclusive access */
#define WT_DHANDLE_IS_METADATA 0x10 /* Metadata handle */
#define WT_DHANDLE_LOCK_ONLY 0x20 /* Handle only used as a lock */
#define WT_DHANDLE_OPEN 0x40 /* Handle is open */
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 8b48fd587bd..7c9806788bb 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -105,6 +105,7 @@ extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC
extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_reserve(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -147,7 +148,7 @@ extern const char *__wt_page_type_string(u_int type) WT_GCC_FUNC_DECL_ATTRIBUTE(
extern const char *__wt_cell_type_string(uint8_t type);
extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf);
extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf);
-extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -164,7 +165,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
);
extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_value_return( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -258,11 +259,11 @@ extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_
extern void __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize);
extern int __wt_conn_dhandle_alloc( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_conn_btree_sync_and_close( WT_SESSION_IMPL *session, bool final, bool mark_dead) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *uri, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *uri, bool mark_dead) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool mark_dead) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
@@ -362,6 +363,7 @@ extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_AT
extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session);
+extern int __wt_log_printf(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
extern int __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn);
@@ -388,6 +390,9 @@ extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **lo
extern void __wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp);
extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *rectypep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_col_modify_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_col_modify_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_col_modify_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_col_put_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -397,6 +402,9 @@ extern int __wt_logop_col_remove_print(WT_SESSION_IMPL *session, const uint8_t *
extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_col_truncate_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_row_modify_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_row_modify_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_row_modify_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_row_put_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -406,6 +414,12 @@ extern int __wt_logop_row_remove_print(WT_SESSION_IMPL *session, const uint8_t *
extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_row_truncate_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_checkpoint_start_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec ) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_checkpoint_start_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end ) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_checkpoint_start_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_prev_lsn_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *prev_lsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_prev_lsn_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *prev_lsnp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_prev_lsn_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_op_printlog(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -691,6 +705,9 @@ extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg);
extern void __wt_print_huffman_code(void *huffman_arg, uint16_t symbol);
extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_modify_pack(WT_SESSION_IMPL *session, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_modify_apply_api( WT_SESSION_IMPL *session, WT_ITEM *value, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_modify_apply(WT_SESSION_IMPL *session, WT_ITEM *value, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -747,6 +764,8 @@ extern int __wt_thread_group_create( WT_SESSION_IMPL *session, WT_THREAD_GROUP *
extern int __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_thread_group_start_one( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool is_locked);
extern void __wt_thread_group_stop_one(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group);
+extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
+extern void __wt_seconds(WT_SESSION_IMPL *session, time_t *timep);
extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
extern int __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -786,7 +805,7 @@ extern int __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM
extern int __wt_txn_named_snapshot_config(WT_SESSION_IMPL *session, const char *cfg[], bool *has_create, bool *has_drops) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session);
extern int __wt_txn_recover(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, uint8_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_global_query_timestamp( WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/third_party/wiredtiger/src/include/extern_posix.h b/src/third_party/wiredtiger/src/include/extern_posix.h
index b6b5ac51f73..864a40aa325 100644
--- a/src/third_party/wiredtiger/src/include/extern_posix.h
+++ b/src/third_party/wiredtiger/src/include/extern_posix.h
@@ -28,5 +28,5 @@ extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, co
extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
+extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp);
extern void __wt_yield(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
diff --git a/src/third_party/wiredtiger/src/include/extern_win.h b/src/third_party/wiredtiger/src/include/extern_win.h
index d548ee0b2ec..85db8175615 100644
--- a/src/third_party/wiredtiger/src/include/extern_win.h
+++ b/src/third_party/wiredtiger/src/include/extern_win.h
@@ -26,7 +26,7 @@ extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, co
extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp);
+extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp);
extern int __wt_to_utf16_string( WT_SESSION_IMPL *session, const char*utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_to_utf8_string( WT_SESSION_IMPL *session, const wchar_t*wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern DWORD __wt_getlasterror(void);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
index ef66a186fa4..4f7b59c7849 100644
--- a/src/third_party/wiredtiger/src/include/flags.h
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -79,6 +79,8 @@
#define WT_STAT_TYPE_SIZE 0x00000040
#define WT_STAT_TYPE_TREE_WALK 0x00000080
#define WT_TIMING_STRESS_CHECKPOINT_SLOW 0x00000001
+#define WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE 0x00000002
+#define WT_TIMING_STRESS_PAGE_SPLIT_RACE 0x00000004
#define WT_TXN_LOG_CKPT_CLEANUP 0x00000001
#define WT_TXN_LOG_CKPT_PREPARE 0x00000002
#define WT_TXN_LOG_CKPT_START 0x00000004
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index 838086c2ced..bf7d36e19ca 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -251,22 +251,28 @@
/* Timestamp type and helper macros. */
#if WT_TIMESTAMP_SIZE > 0
-#define HAVE_TIMESTAMPS 1
+#define HAVE_TIMESTAMPS
#else
-#undef HAVE_TIMESTAMPS
+#undef HAVE_TIMESTAMPS
#endif
#ifdef HAVE_TIMESTAMPS
-#define WT_TIMESTAMP(x) (x)
-typedef uint8_t wt_timestamp_t[WT_TIMESTAMP_SIZE];
-#define WT_DECL_TIMESTAMP(x) wt_timestamp_t x;
+struct __wt_timestamp_t {
+#if WT_TIMESTAMP_SIZE == 8
+ uint64_t val;
#else
-#define WT_TIMESTAMP(x) (NULL)
+ uint8_t ts[WT_TIMESTAMP_SIZE];
+#endif
+};
+typedef struct __wt_timestamp_t wt_timestamp_t;
+#define WT_DECL_TIMESTAMP(x) wt_timestamp_t x;
+#define WT_TIMESTAMP_NULL(x) (x)
+#else
+typedef void wt_timestamp_t;
+#define WT_TIMESTAMP_NULL(x) (NULL)
#define WT_DECL_TIMESTAMP(x)
#endif
-#define WT_GET_TIMESTAMP(x) WT_TIMESTAMP((x)->timestamp)
-
/*
* In diagnostic mode we track the locations from which hazard pointers and
* scratch buffers were acquired.
diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i
index bb785a63072..dbb921f0946 100644
--- a/src/third_party/wiredtiger/src/include/misc.i
+++ b/src/third_party/wiredtiger/src/include/misc.i
@@ -41,45 +41,6 @@ __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp)
}
/*
- * __wt_seconds --
- * Return the seconds since the Epoch.
- */
-static inline void
-__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
-{
- struct timespec t;
-
- __wt_epoch(session, &t);
-
- *timep = t.tv_sec;
-}
-
-/*
- * __wt_time_check_monotonic --
- * Check and prevent time running backward. If we detect that it has, we
- * set the time structure to the previous values, making time stand still
- * until we see a time in the future of the highest value seen so far.
- */
-static inline void
-__wt_time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp)
-{
- /*
- * Detect time going backward. If so, use the last
- * saved timestamp.
- */
- if (session == NULL)
- return;
-
- if (tsp->tv_sec < session->last_epoch.tv_sec ||
- (tsp->tv_sec == session->last_epoch.tv_sec &&
- tsp->tv_nsec < session->last_epoch.tv_nsec)) {
- WT_STAT_CONN_INCR(session, time_travel);
- *tsp = session->last_epoch;
- } else
- session->last_epoch = *tsp;
-}
-
-/*
* __wt_snprintf --
* snprintf convenience function, ignoring the returned size.
*/
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
index 15d159192f9..d9c72cd2bad 100644
--- a/src/third_party/wiredtiger/src/include/serial.i
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -259,8 +259,7 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
*/
static inline int
__wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
- WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size,
- bool exclusive)
+ WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size, bool exclusive)
{
WT_DECL_RET;
WT_UPDATE *obsolete, *upd = *updp;
@@ -290,19 +289,17 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
}
/*
- * Increment in-memory footprint after releasing the mutex: that's safe
- * because the structures we added cannot be discarded while visible to
- * any running transaction, and we're a running transaction, which means
- * there can be no corresponding delete until we complete.
+ * Increment in-memory footprint after swapping the update into place.
+ * Safe because the structures we added cannot be discarded while
+ * visible to any running transaction, and we're a running transaction,
+ * which means there can be no corresponding delete until we complete.
*/
__wt_cache_page_inmem_incr(session, page, upd_size);
/* Mark the page dirty after updating the footprint. */
__wt_page_modify_set(session, page);
- /*
- * If there are no subsequent WT_UPDATE structures we are done here.
- */
+ /* If there are no subsequent WT_UPDATE structures we are done here. */
if (upd->next == NULL || exclusive)
return (0);
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index b340b278684..7ffc1b69c12 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -502,11 +502,19 @@ struct __wt_connection_stats {
int64_t thread_write_active;
int64_t application_evict_time;
int64_t application_cache_time;
+ int64_t txn_release_blocked;
+ int64_t conn_close_blocked_lsm;
+ int64_t dhandle_lock_blocked;
+ int64_t page_index_slot_ref_blocked;
+ int64_t log_server_sync_blocked;
int64_t page_busy_blocked;
int64_t page_forcible_evict_blocked;
int64_t page_locked_blocked;
int64_t page_read_blocked;
int64_t page_sleep;
+ int64_t page_del_rollback_blocked;
+ int64_t child_modify_blocked_page;
+ int64_t tree_descend_blocked;
int64_t txn_snapshots_created;
int64_t txn_snapshots_dropped;
int64_t txn_begin;
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index e4cc0b04046..61ab343151c 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -92,8 +92,13 @@ struct __wt_txn_global {
WT_DECL_TIMESTAMP(commit_timestamp)
WT_DECL_TIMESTAMP(oldest_timestamp)
WT_DECL_TIMESTAMP(pinned_timestamp)
- bool has_commit_timestamp, has_oldest_timestamp, has_pinned_timestamp;
+ WT_DECL_TIMESTAMP(stable_timestamp)
+ bool has_commit_timestamp;
+ bool has_oldest_timestamp;
+ bool has_pinned_timestamp;
+ bool has_stable_timestamp;
bool oldest_is_pinned;
+ bool stable_is_pinned;
WT_SPINLOCK id_lock;
@@ -200,7 +205,21 @@ struct __wt_txn {
uint32_t snapshot_count;
uint32_t txn_logsync; /* Log sync configuration */
+ /*
+ * Timestamp copied into updates created by this transaction.
+ *
+ * In some use cases, this can be updated while the transaction is
+ * running.
+ */
WT_DECL_TIMESTAMP(commit_timestamp)
+
+ /*
+ * Set to the first commit timestamp used in the transaction and fixed
+ * while the transaction is on the public list of committed timestamps.
+ */
+ WT_DECL_TIMESTAMP(first_commit_timestamp)
+
+ /* Read updates committed as of this timestamp. */
WT_DECL_TIMESTAMP(read_timestamp)
TAILQ_ENTRY(__wt_txn) commit_timestampq;
@@ -230,8 +249,10 @@ struct __wt_txn {
#define WT_TXN_HAS_TS_COMMIT 0x010
#define WT_TXN_HAS_TS_READ 0x020
#define WT_TXN_NAMED_SNAPSHOT 0x040
-#define WT_TXN_READONLY 0x080
-#define WT_TXN_RUNNING 0x100
-#define WT_TXN_SYNC_SET 0x200
+#define WT_TXN_PUBLIC_TS_COMMIT 0x080
+#define WT_TXN_PUBLIC_TS_READ 0x100
+#define WT_TXN_READONLY 0x200
+#define WT_TXN_RUNNING 0x400
+#define WT_TXN_SYNC_SET 0x800
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 6de86eb0aaf..d693633fabe 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -10,14 +10,15 @@ static inline int __wt_txn_id_check(WT_SESSION_IMPL *session);
static inline void __wt_txn_read_last(WT_SESSION_IMPL *session);
#ifdef HAVE_TIMESTAMPS
+#if WT_TIMESTAMP_SIZE == 8
/*
* __wt_timestamp_cmp --
* Compare two timestamps.
*/
static inline int
-__wt_timestamp_cmp(const uint8_t *ts1, const uint8_t *ts2)
+__wt_timestamp_cmp(const wt_timestamp_t *ts1, const wt_timestamp_t *ts2)
{
- return (memcmp(ts1, ts2, WT_TIMESTAMP_SIZE));
+ return (ts1->val == ts2->val ? 0 : (ts1->val > ts2->val ? 1 : -1));
}
/*
@@ -25,9 +26,9 @@ __wt_timestamp_cmp(const uint8_t *ts1, const uint8_t *ts2)
* Set a timestamp.
*/
static inline void
-__wt_timestamp_set(uint8_t *dest, const uint8_t *src)
+__wt_timestamp_set(wt_timestamp_t *dest, const wt_timestamp_t *src)
{
- (void)memcpy(dest, src, WT_TIMESTAMP_SIZE);
+ dest->val = src->val;
}
/*
@@ -35,11 +36,62 @@ __wt_timestamp_set(uint8_t *dest, const uint8_t *src)
* Check if a timestamp is equal to the special "zero" time.
*/
static inline bool
-__wt_timestamp_iszero(const uint8_t *ts)
+__wt_timestamp_iszero(wt_timestamp_t *ts)
+{
+ return (ts->val == 0);
+}
+
+/*
+ * __wt_timestamp_set_inf --
+ * Set a timestamp to the maximum value.
+ */
+static inline void
+__wt_timestamp_set_inf(wt_timestamp_t *ts)
+{
+ ts->val = UINT64_MAX;
+}
+
+/*
+ * __wt_timestamp_set_zero --
+ * Zero out a timestamp.
+ */
+static inline void
+__wt_timestamp_set_zero(wt_timestamp_t *ts)
+{
+ ts->val = 0;
+}
+#else
+/*
+ * __wt_timestamp_cmp --
+ * Compare two timestamps.
+ */
+static inline int
+__wt_timestamp_cmp(const wt_timestamp_t *ts1, const wt_timestamp_t *ts2)
+{
+ return (memcmp(ts1->ts, ts2->ts, WT_TIMESTAMP_SIZE));
+}
+
+/*
+ * __wt_timestamp_set --
+ * Set a timestamp.
+ */
+static inline void
+__wt_timestamp_set(wt_timestamp_t *dest, const wt_timestamp_t *src)
+{
+ (void)memcpy(dest->ts, src->ts, WT_TIMESTAMP_SIZE);
+}
+
+/*
+ * __wt_timestamp_iszero --
+ * Check if a timestamp is equal to the special "zero" time.
+ */
+static inline bool
+__wt_timestamp_iszero(wt_timestamp_t *ts)
{
static const wt_timestamp_t zero_timestamp;
- return (memcmp(ts, zero_timestamp, WT_TIMESTAMP_SIZE) == 0);
+ return (memcmp(ts->ts,
+ WT_TIMESTAMP_NULL(&zero_timestamp), WT_TIMESTAMP_SIZE) == 0);
}
/*
@@ -47,9 +99,9 @@ __wt_timestamp_iszero(const uint8_t *ts)
* Set a timestamp to the maximum value.
*/
static inline void
-__wt_timestamp_set_inf(uint8_t *ts)
+__wt_timestamp_set_inf(wt_timestamp_t *ts)
{
- memset(ts, 0xff, WT_TIMESTAMP_SIZE);
+ memset(ts->ts, 0xff, WT_TIMESTAMP_SIZE);
}
/*
@@ -57,11 +109,12 @@ __wt_timestamp_set_inf(uint8_t *ts)
* Zero out a timestamp.
*/
static inline void
-__wt_timestamp_set_zero(uint8_t *ts)
+__wt_timestamp_set_zero(wt_timestamp_t *ts)
{
- memset(ts, 0x00, WT_TIMESTAMP_SIZE);
+ memset(ts->ts, 0x00, WT_TIMESTAMP_SIZE);
}
-#endif
+#endif /* WT_TIMESTAMP_SIZE == 8 */
+#endif /* HAVE_TIMESTAMPS */
/*
* __txn_next_op --
@@ -130,7 +183,7 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd)
WT_TXN_OP_INMEM : WT_TXN_OP_BASIC;
#ifdef HAVE_TIMESTAMPS
if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) {
- __wt_timestamp_set(upd->timestamp, txn->commit_timestamp);
+ __wt_timestamp_set(&upd->timestamp, &txn->commit_timestamp);
if (!F_ISSET(session, WT_SESSION_LOGGING_INMEM))
op->type = WT_TXN_OP_BASIC_TS;
}
@@ -215,16 +268,6 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
}
/*
- * __wt_txn_committed --
- * Return if a transaction has been committed.
- */
-static inline bool
-__wt_txn_committed(WT_SESSION_IMPL *session, uint64_t id)
-{
- return (WT_TXNID_LT(id, S2C(session)->txn_global.last_running));
-}
-
-/*
* __txn_visible_all_id --
* Check if a given transaction ID is "globally visible". This is, if
* all sessions in the system will see the transaction ID including the
@@ -248,7 +291,7 @@ __txn_visible_all_id(WT_SESSION_IMPL *session, uint64_t id)
*/
static inline bool
__wt_txn_visible_all(
- WT_SESSION_IMPL *session, uint64_t id, const uint8_t *timestamp)
+ WT_SESSION_IMPL *session, uint64_t id, const wt_timestamp_t *timestamp)
{
if (!__txn_visible_all_id(session, id))
return (false);
@@ -263,7 +306,7 @@ __wt_txn_visible_all(
return (true);
__wt_readlock(session, &txn_global->rwlock);
- cmp = __wt_timestamp_cmp(timestamp, txn_global->pinned_timestamp);
+ cmp = __wt_timestamp_cmp(timestamp, &txn_global->pinned_timestamp);
__wt_readunlock(session, &txn_global->rwlock);
/*
@@ -289,7 +332,7 @@ static inline bool
__wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
return (__wt_txn_visible_all(
- session, upd->txnid, WT_GET_TIMESTAMP(upd)));
+ session, upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp)));
}
/*
@@ -351,7 +394,7 @@ __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
*/
static inline bool
__wt_txn_visible(
- WT_SESSION_IMPL *session, uint64_t id, const uint8_t *timestamp)
+ WT_SESSION_IMPL *session, uint64_t id, const wt_timestamp_t *timestamp)
{
if (!__txn_visible_id(session, id))
return (false);
@@ -364,7 +407,7 @@ __wt_txn_visible(
if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) || timestamp == NULL)
return (true);
- return (memcmp(timestamp, txn->read_timestamp, WT_TIMESTAMP_SIZE) <= 0);
+ return (__wt_timestamp_cmp(timestamp, &txn->read_timestamp) <= 0);
}
#else
WT_UNUSED(timestamp);
@@ -379,7 +422,8 @@ __wt_txn_visible(
static inline bool
__wt_txn_upd_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
- return (__wt_txn_visible(session, upd->txnid, WT_GET_TIMESTAMP(upd)));
+ return (__wt_txn_visible(session,
+ upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp)));
}
/*
diff --git a/src/third_party/wiredtiger/src/include/verify_build.h b/src/third_party/wiredtiger/src/include/verify_build.h
index 57189b5c2b2..3973f786a90 100644
--- a/src/third_party/wiredtiger/src/include/verify_build.h
+++ b/src/third_party/wiredtiger/src/include/verify_build.h
@@ -52,7 +52,15 @@ __wt_verify_build(void)
/* Check specific structures weren't padded. */
WT_SIZE_CHECK(WT_BLOCK_DESC, WT_BLOCK_DESC_SIZE);
WT_SIZE_CHECK(WT_REF, WT_REF_SIZE);
- WT_SIZE_CHECK(WT_UPDATE, WT_UPDATE_SIZE);
+
+ /*
+ * WT_UPDATE is special: we arrange fields to avoid padding within the
+ * structure but it could be padded at the end depending on the
+ * timestamp size. Further check that the data field in the update
+ * structure is where we expect it.
+ */
+ WT_SIZE_CHECK(WT_UPDATE, WT_ALIGN(WT_UPDATE_SIZE, 8));
+ WT_STATIC_ASSERT(offsetof(WT_UPDATE, data) == WT_UPDATE_SIZE);
/* Check specific structures were padded. */
#define WT_PADDING_CHECK(s) \
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index d8d8b864766..f9993fbcca3 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -489,6 +489,11 @@ struct __wt_cursor {
* The modify method is only supported on raw byte arrays accessed using
* a WT_ITEM structure, that is, a format type of \c u.
*
+ * Calling the WT_CURSOR::modify method outside of snapshot isolation
+ * can lead to unexpected results. While \c read-committed isolation
+ * is supported with the WT_CURSOR::modify method, \c read-uncommitted
+ * isolation is not.
+ *
* @snippet ex_all.c Modify an existing record
*
* On success, the cursor ends positioned at the modified record; to
@@ -1387,7 +1392,7 @@ struct __wt_session {
* @configstart{WT_SESSION.drop, see dist/api_data.py}
* @config{force, return success if the object does not exist., a
* boolean flag; default \c false.}
- * @config{remove_files, should the underlying files be removed?., a
+ * @config{remove_files, if the underlying files should be removed., a
* boolean flag; default \c true.}
* @configend
* @ebusy_errors
@@ -1778,6 +1783,11 @@ struct __wt_session {
* empty.}
* @config{target, if non-empty\, checkpoint the list of objects., a
* list of strings; default empty.}
+ * @config{use_timestamp, by default\, create the checkpoint as of the
+ * last stable timestamp if timestamps are in use\, or all current
+ * updates if there is no stable timestamp set. If false\, this option
+ * generates a checkpoint with all updates including those later than
+ * the timestamp., a boolean flag; default \c true.}
* @configend
* @errors
*/
@@ -2239,8 +2249,13 @@ struct __wt_connection {
* @configstart{WT_CONNECTION.set_timestamp, see dist/api_data.py}
* @config{oldest_timestamp, future commits and queries will be no
* earlier than the specified timestamp. Supplied values must be
- * monotonically increasing. see @ref transaction_timestamps., a
+ * monotonically increasing. See @ref transaction_timestamps., a
* string; default empty.}
+ * @config{stable_timestamp, future checkpoints will be no later than
+ * the specified timestamp. Supplied values must be monotonically
+ * increasing. The stable timestamp data stability only applies to
+ * tables that are not being logged. See @ref transaction_timestamps.,
+ * a string; default empty.}
* @configend
* @errors
*/
@@ -4491,18 +4506,26 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_LOGREC_SYSTEM 4
/*! invalid operation */
#define WT_LOGOP_INVALID 0
-/*! column put */
+/*! column-store put */
#define WT_LOGOP_COL_PUT 1
-/*! column remove */
+/*! column-store remove */
#define WT_LOGOP_COL_REMOVE 2
-/*! column truncate */
+/*! column-store truncate */
#define WT_LOGOP_COL_TRUNCATE 3
-/*! row put */
+/*! row-store put */
#define WT_LOGOP_ROW_PUT 4
-/*! row remove */
+/*! row-store remove */
#define WT_LOGOP_ROW_REMOVE 5
-/*! row truncate */
+/*! row-store truncate */
#define WT_LOGOP_ROW_TRUNCATE 6
+/*! checkpoint start */
+#define WT_LOGOP_CHECKPOINT_START 7
+/*! previous LSN */
+#define WT_LOGOP_PREV_LSN 8
+/*! column-store modify */
+#define WT_LOGOP_COL_MODIFY 9
+/*! row-store modify */
+#define WT_LOGOP_ROW_MODIFY 10
/*! @} */
/*******************************************
@@ -5045,74 +5068,102 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1243
/*! thread-yield: application thread time waiting for cache (usecs) */
#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1244
+/*!
+ * thread-yield: connection close blocked waiting for transaction state
+ * stabilization
+ */
+#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1245
+/*! thread-yield: connection close yielded for lsm manager shutdown */
+#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1246
+/*! thread-yield: data handle lock yielded */
+#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1247
+/*!
+ * thread-yield: get reference for page index and slot time sleeping
+ * (usecs)
+ */
+#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1248
+/*! thread-yield: log server sync yielded for log write */
+#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1249
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1245
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1250
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1246
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1251
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1247
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1252
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1248
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1253
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1249
+#define WT_STAT_CONN_PAGE_SLEEP 1254
+/*!
+ * thread-yield: page delete rollback time sleeping for state change
+ * (usecs)
+ */
+#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1255
+/*! thread-yield: page reconciliation yielded due to child modification */
+#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1256
+/*!
+ * thread-yield: tree descend one level yielded for split page index
+ * update
+ */
+#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1257
/*! transaction: number of named snapshots created */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1250
+#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1258
/*! transaction: number of named snapshots dropped */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1251
+#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1259
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1252
+#define WT_STAT_CONN_TXN_BEGIN 1260
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1253
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1261
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1254
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1262
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1255
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1263
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1256
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1264
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1257
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1265
/*! transaction: transaction checkpoint scrub dirty target */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1258
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1266
/*! transaction: transaction checkpoint scrub time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1259
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1267
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1260
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1268
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1261
+#define WT_STAT_CONN_TXN_CHECKPOINT 1269
/*!
* transaction: transaction checkpoints skipped because database was
* clean
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1262
+#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1270
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1263
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1271
/*!
* transaction: transaction fsync calls for checkpoint after allocating
* the transaction ID
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1264
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1272
/*!
* transaction: transaction fsync duration for checkpoint after
* allocating the transaction ID (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1265
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1273
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1266
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1274
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1267
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1275
/*!
* transaction: transaction range of IDs currently pinned by named
* snapshots
*/
-#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1268
+#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1276
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1269
+#define WT_STAT_CONN_TXN_SYNC 1277
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1270
+#define WT_STAT_CONN_TXN_COMMIT 1278
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1271
+#define WT_STAT_CONN_TXN_ROLLBACK 1279
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1272
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1280
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index 74fdc4c3925..84617dfcab8 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -290,6 +290,8 @@ struct __wt_thread;
typedef struct __wt_thread WT_THREAD;
struct __wt_thread_group;
typedef struct __wt_thread_group WT_THREAD_GROUP;
+struct __wt_timestamp_t;
+ typedef struct __wt_timestamp_t WT_TIMESTAMP_T;
struct __wt_txn;
typedef struct __wt_txn WT_TXN;
struct __wt_txn_global;
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index 2a912e6568f..3929938618e 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -20,11 +20,11 @@ static int __log_write_internal(
#define WT_LOG_OPEN_CREATE_OK 0x01
/*
- * __log_printf_internal --
- * Internal call to write a log message.
+ * __wt_log_printf --
+ * Write a text message to the log.
*/
-static int
-__log_printf_internal(WT_SESSION_IMPL *session, const char *fmt, ...)
+int
+__wt_log_printf(WT_SESSION_IMPL *session, const char *fmt, ...)
{
WT_DECL_RET;
va_list ap;
@@ -54,10 +54,7 @@ __log_checksum_match(WT_SESSION_IMPL *session, WT_ITEM *buf, uint32_t reclen)
checksum_calculate = __wt_bswap32(checksum_calculate);
#endif
logrec->checksum = checksum_tmp;
- if (logrec->checksum != checksum_calculate)
- return (false);
- else
- return (true);
+ return (logrec->checksum == checksum_calculate);
}
/*
@@ -1214,8 +1211,8 @@ __log_set_version(WT_SESSION_IMPL *session, uint16_t version,
F_SET(log, WT_LOG_FORCE_NEWFILE);
if (!F_ISSET(conn, WT_CONN_READONLY))
return (__log_prealloc_remove(session));
- else
- return (0);
+
+ return (0);
}
/*
@@ -1259,7 +1256,7 @@ __wt_log_set_version(WT_SESSION_IMPL *session, uint16_t version,
* an archive correctly removes all earlier logs.
* Write an internal printf record.
*/
- WT_ERR(__log_printf_internal(session,
+ WT_ERR(__wt_log_printf(session,
"COMPATIBILITY: Version now %" PRIu16, log->log_version));
if (lognump != NULL)
*lognump = log->alloc_lsn.l.file;
diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c
index c2b38184405..703a87b09d4 100644
--- a/src/third_party/wiredtiger/src/log/log_auto.c
+++ b/src/third_party/wiredtiger/src/log/log_auto.c
@@ -91,6 +91,81 @@ __logrec_make_hex_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
}
int
+__wt_logop_col_modify_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, uint64_t recno, WT_ITEM *value)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIru);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_COL_MODIFY;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, recno, value));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, recno, value));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_col_modify_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep)
+{
+ WT_DECL_RET;
+ const char *fmt = WT_UNCHECKED_STRING(IIIru);
+ uint32_t optype, size;
+
+ if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, recnop, valuep)) != 0)
+ WT_RET_MSG(session, ret, "logop_col_modify: unpack failure");
+ WT_ASSERT(session, optype == WT_LOGOP_COL_MODIFY);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_col_modify_print(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end, uint32_t flags)
+{
+ WT_DECL_RET;
+ uint32_t fileid;
+ uint64_t recno;
+ WT_ITEM value;
+ char *escaped;
+
+ escaped = NULL;
+ WT_RET(__wt_logop_col_modify_unpack(
+ session, pp, end, &fileid, &recno, &value));
+
+ WT_RET(__wt_fprintf(session, WT_STDOUT(session),
+ " \"optype\": \"col_modify\",\n"));
+ WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
+ " \"fileid\": %" PRIu32 ",\n", fileid));
+ WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
+ " \"recno\": %" PRIu64 ",\n", recno));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &value));
+ WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
+ " \"value\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &value));
+ WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
+ ",\n \"value-hex\": \"%s\"", escaped));
+ }
+
+err: __wt_free(session, escaped);
+ return (ret);
+}
+
+int
__wt_logop_col_put_pack(
WT_SESSION_IMPL *session, WT_ITEM *logrec,
uint32_t fileid, uint64_t recno, WT_ITEM *value)
@@ -149,9 +224,9 @@ __wt_logop_col_put_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
" \"optype\": \"col_put\",\n"));
WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
- " \"fileid\": \"%" PRIu32 "\",\n", fileid));
+ " \"fileid\": %" PRIu32 ",\n", fileid));
WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
- " \"recno\": \"%" PRIu64 "\",\n", recno));
+ " \"recno\": %" PRIu64 ",\n", recno));
WT_ERR(__logrec_make_json_str(session, &escaped, &value));
WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
" \"value\": \"%s\"", escaped));
@@ -221,9 +296,9 @@ __wt_logop_col_remove_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
" \"optype\": \"col_remove\",\n"));
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
- " \"fileid\": \"%" PRIu32 "\",\n", fileid));
+ " \"fileid\": %" PRIu32 ",\n", fileid));
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
- " \"recno\": \"%" PRIu64 "\"", recno));
+ " \"recno\": %" PRIu64 "", recno));
return (0);
}
@@ -284,15 +359,96 @@ __wt_logop_col_truncate_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
" \"optype\": \"col_truncate\",\n"));
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
- " \"fileid\": \"%" PRIu32 "\",\n", fileid));
+ " \"fileid\": %" PRIu32 ",\n", fileid));
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
- " \"start\": \"%" PRIu64 "\",\n", start));
+ " \"start\": %" PRIu64 ",\n", start));
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
- " \"stop\": \"%" PRIu64 "\"", stop));
+ " \"stop\": %" PRIu64 "", stop));
+ return (0);
+}
+
+int
+__wt_logop_row_modify_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, WT_ITEM *key, WT_ITEM *value)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuu);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_ROW_MODIFY;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, key, value));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, key, value));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_row_modify_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep)
+{
+ WT_DECL_RET;
+ const char *fmt = WT_UNCHECKED_STRING(IIIuu);
+ uint32_t optype, size;
+
+ if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, keyp, valuep)) != 0)
+ WT_RET_MSG(session, ret, "logop_row_modify: unpack failure");
+ WT_ASSERT(session, optype == WT_LOGOP_ROW_MODIFY);
+
+ *pp += size;
return (0);
}
int
+__wt_logop_row_modify_print(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end, uint32_t flags)
+{
+ WT_DECL_RET;
+ uint32_t fileid;
+ WT_ITEM key;
+ WT_ITEM value;
+ char *escaped;
+
+ escaped = NULL;
+ WT_RET(__wt_logop_row_modify_unpack(
+ session, pp, end, &fileid, &key, &value));
+
+ WT_RET(__wt_fprintf(session, WT_STDOUT(session),
+ " \"optype\": \"row_modify\",\n"));
+ WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
+ " \"fileid\": %" PRIu32 ",\n", fileid));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &key));
+ WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
+ " \"key\": \"%s\",\n", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &key));
+ WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
+ " \"key-hex\": \"%s\",\n", escaped));
+ }
+ WT_ERR(__logrec_make_json_str(session, &escaped, &value));
+ WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
+ " \"value\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &value));
+ WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
+ ",\n \"value-hex\": \"%s\"", escaped));
+ }
+
+err: __wt_free(session, escaped);
+ return (ret);
+}
+
+int
__wt_logop_row_put_pack(
WT_SESSION_IMPL *session, WT_ITEM *logrec,
uint32_t fileid, WT_ITEM *key, WT_ITEM *value)
@@ -351,7 +507,7 @@ __wt_logop_row_put_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
" \"optype\": \"row_put\",\n"));
WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
- " \"fileid\": \"%" PRIu32 "\",\n", fileid));
+ " \"fileid\": %" PRIu32 ",\n", fileid));
WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
" \"key\": \"%s\",\n", escaped));
@@ -431,7 +587,7 @@ __wt_logop_row_remove_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
" \"optype\": \"row_remove\",\n"));
WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
- " \"fileid\": \"%" PRIu32 "\",\n", fileid));
+ " \"fileid\": %" PRIu32 ",\n", fileid));
WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
" \"key\": \"%s\"", escaped));
@@ -505,7 +661,7 @@ __wt_logop_row_truncate_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
" \"optype\": \"row_truncate\",\n"));
WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
- " \"fileid\": \"%" PRIu32 "\",\n", fileid));
+ " \"fileid\": %" PRIu32 ",\n", fileid));
WT_ERR(__logrec_make_json_str(session, &escaped, &start));
WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
" \"start\": \"%s\",\n", escaped));
@@ -523,13 +679,129 @@ __wt_logop_row_truncate_print(WT_SESSION_IMPL *session,
" \"stop-hex\": \"%s\",\n", escaped));
}
WT_ERR(__wt_fprintf(session, WT_STDOUT(session),
- " \"mode\": \"%" PRIu32 "\"", mode));
+ " \"mode\": %" PRIu32 "", mode));
err: __wt_free(session, escaped);
return (ret);
}
int
+__wt_logop_checkpoint_start_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec
+ )
+{
+ const char *fmt = WT_UNCHECKED_STRING(II);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_CHECKPOINT_START;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_checkpoint_start_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end
+ )
+{
+ WT_DECL_RET;
+ const char *fmt = WT_UNCHECKED_STRING(II);
+ uint32_t optype, size;
+
+ if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size)) != 0)
+ WT_RET_MSG(session, ret, "logop_checkpoint_start: unpack failure");
+ WT_ASSERT(session, optype == WT_LOGOP_CHECKPOINT_START);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_checkpoint_start_print(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end, uint32_t flags)
+{
+
+ WT_UNUSED(flags);
+ WT_RET(__wt_logop_checkpoint_start_unpack(
+ session, pp, end));
+
+ WT_RET(__wt_fprintf(session, WT_STDOUT(session),
+ " \"optype\": \"checkpoint_start\",\n"));
+
+ return (0);
+}
+
+int
+__wt_logop_prev_lsn_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ WT_LSN *prev_lsn)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIII);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_PREV_LSN;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, prev_lsn->l.file, prev_lsn->l.offset));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, prev_lsn->l.file, prev_lsn->l.offset));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_prev_lsn_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ WT_LSN *prev_lsnp)
+{
+ WT_DECL_RET;
+ const char *fmt = WT_UNCHECKED_STRING(IIII);
+ uint32_t optype, size;
+
+ if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, &prev_lsnp->l.file, &prev_lsnp->l.offset)) != 0)
+ WT_RET_MSG(session, ret, "logop_prev_lsn: unpack failure");
+ WT_ASSERT(session, optype == WT_LOGOP_PREV_LSN);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_prev_lsn_print(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end, uint32_t flags)
+{
+ WT_LSN prev_lsn;
+
+ WT_UNUSED(flags);
+ WT_RET(__wt_logop_prev_lsn_unpack(
+ session, pp, end, &prev_lsn));
+
+ WT_RET(__wt_fprintf(session, WT_STDOUT(session),
+ " \"optype\": \"prev_lsn\",\n"));
+ WT_RET(__wt_fprintf(session, WT_STDOUT(session),
+ " \"prev_lsn\": [%" PRIu32 ", %" PRIu32 "]", prev_lsn.l.file, prev_lsn.l.offset));
+ return (0);
+}
+
+int
__wt_txn_op_printlog(WT_SESSION_IMPL *session,
const uint8_t **pp, const uint8_t *end, uint32_t flags)
{
@@ -540,6 +812,10 @@ __wt_txn_op_printlog(WT_SESSION_IMPL *session,
end = *pp + opsize;
switch (optype) {
+ case WT_LOGOP_COL_MODIFY:
+ WT_RET(__wt_logop_col_modify_print(session, pp, end, flags));
+ break;
+
case WT_LOGOP_COL_PUT:
WT_RET(__wt_logop_col_put_print(session, pp, end, flags));
break;
@@ -552,6 +828,10 @@ __wt_txn_op_printlog(WT_SESSION_IMPL *session,
WT_RET(__wt_logop_col_truncate_print(session, pp, end, flags));
break;
+ case WT_LOGOP_ROW_MODIFY:
+ WT_RET(__wt_logop_row_modify_print(session, pp, end, flags));
+ break;
+
case WT_LOGOP_ROW_PUT:
WT_RET(__wt_logop_row_put_print(session, pp, end, flags));
break;
@@ -564,6 +844,14 @@ __wt_txn_op_printlog(WT_SESSION_IMPL *session,
WT_RET(__wt_logop_row_truncate_print(session, pp, end, flags));
break;
+ case WT_LOGOP_CHECKPOINT_START:
+ WT_RET(__wt_logop_checkpoint_start_print(session, pp, end, flags));
+ break;
+
+ case WT_LOGOP_PREV_LSN:
+ WT_RET(__wt_logop_prev_lsn_print(session, pp, end, flags));
+ break;
+
WT_ILLEGAL_VALUE(session);
}
diff --git a/src/third_party/wiredtiger/src/log/log_sys.c b/src/third_party/wiredtiger/src/log/log_sys.c
index 0f3cfdbc14b..4eb2a8e23d2 100644
--- a/src/third_party/wiredtiger/src/log/log_sys.c
+++ b/src/third_party/wiredtiger/src/log/log_sys.c
@@ -18,31 +18,28 @@ __wt_log_system_record(
{
WT_DECL_ITEM(logrec_buf);
WT_DECL_RET;
- WT_ITEM *dummy, empty;
WT_LOG *log;
WT_LOG_RECORD *logrec;
WT_LOGSLOT tmp;
WT_MYSLOT myslot;
- const char *fmt = WT_UNCHECKED_STRING(IIIU);
+ const char *fmt = WT_UNCHECKED_STRING(I);
uint32_t rectype = WT_LOGREC_SYSTEM;
size_t recsize;
log = S2C(session)->log;
WT_RET(__wt_logrec_alloc(session, log->allocsize, &logrec_buf));
memset((uint8_t *)logrec_buf->mem, 0, log->allocsize);
- WT_CLEAR(empty);
- dummy = &empty;
- /*
- * There is currently an unused portion of the system record for
- * future use. Send in a NULL entry.
- */
- WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype,
- lsn->l.file, lsn->l.offset, dummy));
- WT_ASSERT(session, recsize <= log->allocsize);
+
+ WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype));
WT_ERR(__wt_struct_pack(session,
(uint8_t *)logrec_buf->data + logrec_buf->size, recsize, fmt,
- rectype, lsn->l.file, lsn->l.offset, dummy));
+ rectype));
+ logrec_buf->size += recsize;
+ WT_ERR(__wt_logop_prev_lsn_pack(session, logrec_buf, lsn));
+ WT_ASSERT(session, logrec_buf->size <= log->allocsize);
+
logrec = (WT_LOG_RECORD *)logrec_buf->mem;
+
/*
* We know system records are this size. And we have to adjust
* the size now because we're not going through the normal log
@@ -50,9 +47,8 @@ __wt_log_system_record(
* earlier.
*/
logrec_buf->size = logrec->len = log->allocsize;
- /*
- * We do not compress nor encrypt this record.
- */
+
+ /* We do not compress nor encrypt this record. */
logrec->checksum = 0;
logrec->flags = 0;
__wt_log_record_byteswap(logrec);
@@ -82,16 +78,10 @@ __wt_log_recover_system(WT_SESSION_IMPL *session,
const uint8_t **pp, const uint8_t *end, WT_LSN *lsnp)
{
WT_DECL_RET;
- WT_ITEM unused;
- uint32_t prev_file, prev_offset;
- const char *fmt = WT_UNCHECKED_STRING(IIU);
- if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
- &prev_file, &prev_offset, &unused)) != 0)
+ if ((ret = __wt_logop_prev_lsn_unpack(session, pp, end, lsnp)) != 0)
WT_RET_MSG(session, ret,
"log_recover_prevlsn: unpack failure");
- if (lsnp != NULL)
- WT_SET_LSN(lsnp, prev_file, prev_offset);
- *pp = end;
+
return (0);
}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
index b3e13870c95..24a0429a184 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -291,8 +291,10 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
manager->lsm_workers == 0);
if (manager->lsm_workers > 0) {
/* Wait for the main LSM manager thread to finish. */
- while (!F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN))
+ while (!F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN)) {
+ WT_STAT_CONN_INCR(session, conn_close_blocked_lsm);
__wt_yield();
+ }
/* Clean up open LSM handles. */
ret = __wt_lsm_tree_close_all(session);
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
index 9798bd0cf50..2f21e8acdc3 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -487,7 +487,7 @@ __lsm_discard_handle(
WT_RET(__wt_session_get_btree(session, uri, checkpoint, NULL,
WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
- F_SET(session->dhandle, WT_DHANDLE_DISCARD_FORCE);
+ F_SET(session->dhandle, WT_DHANDLE_DISCARD_KILL);
return (__wt_session_release_btree(session));
}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
index 1018bf860d6..5d0295d94ce 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
@@ -19,11 +19,19 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp)
WT_DECL_RET;
WT_RET(__wt_calloc_one(session, &cond));
-
WT_ERR(pthread_mutex_init(&cond->mtx, NULL));
- /* Initialize the condition variable to permit self-blocking. */
+#ifdef HAVE_PTHREAD_COND_MONOTONIC
+ {
+ pthread_condattr_t condattr;
+
+ WT_ERR(pthread_condattr_init(&condattr));
+ WT_ERR(pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC));
+ WT_ERR(pthread_cond_init(&cond->cond, &condattr));
+ }
+#else
WT_ERR(pthread_cond_init(&cond->cond, NULL));
+#endif
cond->name = name;
cond->waiters = 0;
@@ -79,7 +87,26 @@ __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond,
goto skipping;
if (usecs > 0) {
- __wt_epoch(session, &ts);
+ /*
+ * Get the current time as the basis for calculating when the
+ * wait should end. Prefer a monotonic clock source to avoid
+ * unexpectedly long sleeps when the system clock is adjusted.
+ *
+ * Failing that, query the time directly and don't attempt to
+ * correct for the clock moving backwards, which would result
+ * in a sleep that is too long by however much the clock is
+ * updated. This isn't as good as a monotonic clock source but
+ * makes the window of vulnerability smaller (i.e., the
+ * calculated time is only incorrect if the system clock
+ * changes in between us querying it and waiting).
+ */
+#ifdef HAVE_PTHREAD_COND_MONOTONIC
+ WT_SYSCALL_RETRY(clock_gettime(CLOCK_MONOTONIC, &ts), ret);
+ if (ret != 0)
+ WT_PANIC_MSG(session, ret, "clock_gettime");
+#else
+ __wt_epoch_raw(session, &ts);
+#endif
ts.tv_sec += (time_t)
(((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) / WT_BILLION);
ts.tv_nsec = (long)
diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c
index cc9516468aa..1b7a9359531 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_time.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_time.c
@@ -9,14 +9,12 @@
#include "wt_internal.h"
/*
- * __wt_epoch --
- * Return the time since the Epoch.
+ * __wt_epoch_raw --
+ * Return the time since the Epoch as reported by a system call.
*/
void
-__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
- WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+__wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp)
{
- struct timespec tmp;
WT_DECL_RET;
/*
@@ -28,19 +26,10 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
tsp->tv_sec = 0;
tsp->tv_nsec = 0;
- /*
- * Read into a local variable so that we're comparing the correct
- * value when we check for monotonic increasing time. There are
- * many places we read into an unlocked global variable.
- */
#if defined(HAVE_CLOCK_GETTIME)
- WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, &tmp), ret);
- if (ret == 0) {
- __wt_time_check_monotonic(session, &tmp);
- tsp->tv_sec = tmp.tv_sec;
- tsp->tv_nsec = tmp.tv_nsec;
+ WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret);
+ if (ret == 0)
return;
- }
WT_PANIC_MSG(session, ret, "clock_gettime");
#elif defined(HAVE_GETTIMEOFDAY)
{
@@ -48,10 +37,8 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret);
if (ret == 0) {
- tmp.tv_sec = v.tv_sec;
- tmp.tv_nsec = v.tv_usec * WT_THOUSAND;
- __wt_time_check_monotonic(session, &tmp);
- *tsp = tmp;
+ tsp->tv_sec = v.tv_sec;
+ tsp->tv_nsec = v.tv_usec * WT_THOUSAND;
return;
}
WT_PANIC_MSG(session, ret, "gettimeofday");
diff --git a/src/third_party/wiredtiger/src/os_win/os_time.c b/src/third_party/wiredtiger/src/os_win/os_time.c
index 038c1d78d21..4e9d4595fae 100644
--- a/src/third_party/wiredtiger/src/os_win/os_time.c
+++ b/src/third_party/wiredtiger/src/os_win/os_time.c
@@ -9,24 +9,23 @@
#include "wt_internal.h"
/*
- * __wt_epoch --
- * Return the time since the Epoch.
+ * __wt_epoch_raw --
+ * Return the time since the Epoch as reported by the system.
*/
void
-__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+__wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp)
{
- struct timespec tmp;
FILETIME time;
uint64_t ns100;
+ WT_UNUSED(session);
+
GetSystemTimeAsFileTime(&time);
ns100 = (((int64_t)time.dwHighDateTime << 32) + time.dwLowDateTime)
- 116444736000000000LL;
- tmp.tv_sec = ns100 / 10000000;
- tmp.tv_nsec = (long)((ns100 % 10000000) * 100);
- __wt_time_check_monotonic(session, &tmp);
- *tsp = tmp;
+ tsp->tv_sec = ns100 / 10000000;
+ tsp->tv_nsec = (long)((ns100 % 10000000) * 100);
}
/*
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 970f760f4ca..3bad922cd5f 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -58,6 +58,9 @@ typedef struct {
uint64_t orig_btree_checkpoint_gen;
uint64_t orig_txn_checkpoint_gen;
+ /* Track the oldest transaction running when reconciliation starts. */
+ uint64_t last_running;
+
/* Track the page's maximum transaction. */
uint64_t max_txn;
WT_DECL_TIMESTAMP(max_timestamp)
@@ -294,6 +297,15 @@ typedef struct {
bool cache_write_restore; /* Used update/restoration */
uint32_t tested_ref_state; /* Debugging information */
+
+ /*
+ * XXX
+ * In the case of a modified update, we may need a copy of the current
+ * value as a set of bytes. We call back into the btree code using a
+ * fake cursor to do that work. This a layering violation and fragile,
+ * we need a better solution.
+ */
+ WT_CURSOR_BTREE update_modify_cbt;
} WT_RECONCILE;
#define WT_CROSSING_MIN_BND(r, next_len) \
@@ -324,6 +336,8 @@ static int __rec_col_var(WT_SESSION_IMPL *,
static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *,
WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t);
static int __rec_destroy_session(WT_SESSION_IMPL *);
+static int __rec_init(WT_SESSION_IMPL *,
+ WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
static uint32_t __rec_min_split_page_size(WT_BTREE *, uint32_t);
static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
@@ -343,8 +357,6 @@ static int __rec_update_las(
WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *);
static int __rec_write_check_complete(
WT_SESSION_IMPL *, WT_RECONCILE *, bool *);
-static int __rec_write_init(WT_SESSION_IMPL *,
- WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup_err(
@@ -408,7 +420,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
#endif
/* Initialize the reconciliation structure for each new run. */
- if ((ret = __rec_write_init(
+ if ((ret = __rec_init(
session, ref, flags, salvage, &session->reconcile)) != 0) {
WT_PAGE_UNLOCK(session, page);
return (ret);
@@ -684,7 +696,7 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
*/
mod->rec_max_txn = r->max_txn;
#ifdef HAVE_TIMESTAMPS
- __wt_timestamp_set(mod->rec_max_timestamp, r->max_timestamp);
+ __wt_timestamp_set(&mod->rec_max_timestamp, &r->max_timestamp);
#endif
/*
@@ -700,9 +712,9 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
btree->rec_max_txn = r->max_txn;
#ifdef HAVE_TIMESTAMPS
if (__wt_timestamp_cmp(
- btree->rec_max_timestamp, r->max_timestamp) < 0)
- __wt_timestamp_set(
- btree->rec_max_timestamp, r->max_timestamp);
+ &btree->rec_max_timestamp, &r->max_timestamp) < 0)
+ __wt_timestamp_set(&btree->rec_max_timestamp,
+ &r->max_timestamp);
#endif
}
@@ -859,11 +871,11 @@ __rec_raw_compression_config(
}
/*
- * __rec_write_init --
+ * __rec_init --
* Initialize the reconciliation structure.
*/
static int
-__rec_write_init(WT_SESSION_IMPL *session,
+__rec_init(WT_SESSION_IMPL *session,
WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep)
{
WT_BTREE *btree;
@@ -905,6 +917,16 @@ __rec_write_init(WT_SESSION_IMPL *session,
WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
/*
+ * Cache the oldest running transaction ID. This is used to check
+ * whether updates seen by reconciliation have committed. We keep a
+ * cached copy to avoid races where a concurrent transaction could
+ * abort while reconciliation is examining its updates. This way, any
+ * transaction running when reconciliation starts is considered
+ * uncommitted.
+ */
+ WT_ORDERED_READ(r->last_running, S2C(session)->txn_global.last_running);
+
+ /*
* Lookaside table eviction is configured when eviction gets aggressive,
* adjust the flags for cases we don't support.
*/
@@ -1014,6 +1036,12 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->cache_write_lookaside = r->cache_write_restore = false;
+ /*
+ * The fake cursor used to figure out modified update values points to
+ * the enclosing WT_REF as a way to access the page.
+ */
+ r->update_modify_cbt.ref = ref;
+
return (0);
}
@@ -1047,6 +1075,8 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
__wt_buf_free(session, &r->_cur);
__wt_buf_free(session, &r->_last);
+ __wt_buf_free(session, &r->update_modify_cbt.iface.value);
+
__rec_dictionary_free(session, r);
__wt_free(session, r);
@@ -1128,18 +1158,18 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy)
*/
static int
__rec_update_save(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, WT_UPDATE *upd)
+ WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_UPDATE *upd)
{
WT_RET(__wt_realloc_def(
session, &r->supd_allocated, r->supd_next + 1, &r->supd));
r->supd[r->supd_next].ins = ins;
- r->supd[r->supd_next].rip = rip;
+ r->supd[r->supd_next].ripcip = ripcip;
r->supd[r->supd_next].onpage_txn =
upd == NULL ? WT_TXN_NONE : upd->txnid;
#ifdef HAVE_TIMESTAMPS
if (upd != NULL)
__wt_timestamp_set(
- r->supd[r->supd_next].onpage_timestamp, upd->timestamp);
+ &r->supd[r->supd_next].onpage_timestamp, &upd->timestamp);
#endif
++r->supd_next;
return (0);
@@ -1159,7 +1189,7 @@ __rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd)
++bnd->supd_next;
supd->ins = NULL;
- supd->rip = NULL;
+ supd->ripcip = NULL;
return (0);
}
@@ -1170,7 +1200,7 @@ __rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd)
*/
static int
__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
+ WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
WT_BTREE *btree;
WT_DECL_RET;
@@ -1195,7 +1225,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* (which may not exist). Return immediately if the item has no updates.
*/
if (ins == NULL) {
- if ((upd_list = WT_ROW_UPDATE(page, rip)) == NULL)
+ if ((upd_list = WT_ROW_UPDATE(page, ripcip)) == NULL)
return (0);
} else
upd_list = ins->upd;
@@ -1204,8 +1234,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
update_mem = 0;
max_txn = WT_TXN_NONE;
#ifdef HAVE_TIMESTAMPS
- __wt_timestamp_set_zero(max_timestamp);
- __wt_timestamp_set_inf(min_timestamp);
+ __wt_timestamp_set_zero(&max_timestamp);
+ __wt_timestamp_set_inf(&min_timestamp);
#endif
min_txn = UINT64_MAX;
@@ -1231,36 +1261,42 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (WT_TXNID_LT(txnid, min_txn))
min_txn = txnid;
-#ifdef HAVE_TIMESTAMPS
- /* Similarly for the oldest timestamp. */
- if (__wt_timestamp_cmp(
- min_timestamp, upd->timestamp) > 0)
- __wt_timestamp_set(
- min_timestamp, upd->timestamp);
-#endif
-
/*
* Find the first update we can use.
*
- * Eviction can write any committed update.
+ * Check whether the update was committed before
+ * reconciliation started. The global commit point can
+ * move forward during reconciliation so we use a
+ * cached copy to avoid races when a concurrent
+ * transaction commits or rolls back while we are
+ * examining its updates.
+ *
+ * Lookaside eviction can cope with any committed
+ * update. Other eviction modes check that the maximum
+ * transaction ID and timestamp seen are stable.
*
* When reconciling for eviction, track whether any
* uncommitted updates are found.
- *
- * When reconciling for eviction, track the memory held
- * by the update chain.
*/
- if (__wt_txn_committed(session, txnid)) {
- if (*updp == NULL)
- *updp = upd;
+ if (WT_TXNID_LE(r->last_running, txnid)) {
+ skipped = true;
+ continue;
+ }
+
+ if (*updp == NULL)
+ *updp = upd;
#ifdef HAVE_TIMESTAMPS
- if (__wt_timestamp_cmp(
- max_timestamp, upd->timestamp) < 0)
- __wt_timestamp_set(
- max_timestamp, upd->timestamp);
+ /* Track min/max timestamps. */
+ if (__wt_timestamp_cmp(
+ &max_timestamp, &upd->timestamp) < 0)
+ __wt_timestamp_set(
+ &max_timestamp, &upd->timestamp);
+
+ if (__wt_timestamp_cmp(
+ &min_timestamp, &upd->timestamp) > 0)
+ __wt_timestamp_set(
+ &min_timestamp, &upd->timestamp);
#endif
- } else
- skipped = true;
}
} else
for (upd = upd_list; upd != NULL; upd = upd->next) {
@@ -1290,8 +1326,9 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
}
/* Reconciliation should never see a reserved update. */
- WT_ASSERT(session,
- *updp == NULL || (*updp)->type != WT_UPDATE_RESERVED);
+ WT_ASSERT(session, *updp == NULL ||
+ ((*updp)->txnid != WT_TXN_ABORTED &&
+ (*updp)->type != WT_UPDATE_RESERVED));
r->update_mem_all += update_mem;
@@ -1314,8 +1351,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (WT_TXNID_LT(r->max_txn, max_txn))
r->max_txn = max_txn;
#ifdef HAVE_TIMESTAMPS
- if (__wt_timestamp_cmp(r->max_timestamp, max_timestamp) < 0)
- __wt_timestamp_set(r->max_timestamp, max_timestamp);
+ if (__wt_timestamp_cmp(&r->max_timestamp, &max_timestamp) < 0)
+ __wt_timestamp_set(&r->max_timestamp, &max_timestamp);
#endif
/*
@@ -1332,7 +1369,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
*/
if (!skipped && (F_ISSET(btree, WT_BTREE_LOOKASIDE) ||
__wt_txn_visible_all(session,
- max_txn, WT_TIMESTAMP(max_timestamp)))) {
+ max_txn, WT_TIMESTAMP_NULL(&max_timestamp)))) {
#ifdef HAVE_DIAGNOSTIC
/*
* The checkpoint transaction is special. Make sure we never
@@ -1428,7 +1465,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* globally visible, readers require the page's original value.
*/
if (!__wt_txn_visible_all(
- session, min_txn, WT_TIMESTAMP(min_timestamp)))
+ session, min_txn, WT_TIMESTAMP_NULL(&min_timestamp)))
append_origv = true;
}
@@ -1484,7 +1521,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* that transaction ID is globally visible, we know we no longer need
* the lookaside table records, allowing them to be discarded.
*/
- return (__rec_update_save(session, r, ins, rip, *updp));
+ return (__rec_update_save(session, r, ins, ripcip, *updp));
}
/*
@@ -1538,7 +1575,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
*/
if (F_ISSET(r, WT_VISIBILITY_ERR) && page_del != NULL &&
!__wt_txn_visible(session,
- page_del->txnid, WT_GET_TIMESTAMP(page_del)))
+ page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp)))
WT_PANIC_RET(session, EINVAL,
"reconciliation illegally skipped an update");
@@ -1568,7 +1605,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
*/
if (ref->addr != NULL &&
(page_del == NULL || __wt_txn_visible_all(
- session, page_del->txnid, WT_GET_TIMESTAMP(page_del))))
+ session, page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp))))
WT_RET(__wt_ref_block_free(session, ref));
/*
@@ -1619,7 +1656,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
* address normally. Otherwise, we have to write a proxy record.
*/
if (__wt_txn_visible(
- session, page_del->txnid, WT_GET_TIMESTAMP(page_del)))
+ session, page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp)))
*statep = WT_CHILD_PROXY;
return (0);
@@ -1654,7 +1691,7 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* not reserved for our exclusive use, there are other page states that
* must be considered.
*/
- for (;; __wt_yield())
+ for (;; __wt_yield()) {
switch (r->tested_ref_state = ref->state) {
case WT_REF_DISK:
/* On disk, not modified by definition. */
@@ -1765,6 +1802,8 @@ __rec_child_modify(WT_SESSION_IMPL *session,
WT_ILLEGAL_VALUE(session);
}
+ WT_STAT_CONN_INCR(session, child_modify_blocked_page);
+ }
in_memory:
/*
@@ -2436,7 +2475,7 @@ __rec_split_row_promote(
supd = &r->supd[i - 1];
if (supd->ins == NULL)
WT_ERR(__wt_row_leaf_key(session,
- r->page, supd->rip, update, false));
+ r->page, supd->ripcip, update, false));
else {
update->data = WT_INSERT_KEY(supd->ins);
update->size = WT_INSERT_KEY_SIZE(supd->ins);
@@ -3484,7 +3523,7 @@ __rec_split_write(WT_SESSION_IMPL *session,
case WT_PAGE_ROW_LEAF:
if (supd->ins == NULL)
WT_ERR(__wt_row_leaf_key(
- session, page, supd->rip, key, false));
+ session, page, supd->ripcip, key, false));
else {
key->data = WT_INSERT_KEY(supd->ins);
key->size = WT_INSERT_KEY_SIZE(supd->ins);
@@ -3651,6 +3690,7 @@ __rec_update_las(WT_SESSION_IMPL *session,
WT_RECONCILE *r, uint32_t btree_id, WT_BOUNDARY *bnd)
{
WT_CURSOR *cursor;
+ WT_CURSOR_BTREE *cbt;
WT_DECL_ITEM(key);
WT_DECL_RET;
WT_ITEM las_addr, las_timestamp, las_value;
@@ -3662,6 +3702,7 @@ __rec_update_las(WT_SESSION_IMPL *session,
uint8_t *p;
cursor = NULL;
+ cbt = &r->update_modify_cbt;
WT_CLEAR(las_addr);
WT_CLEAR(las_timestamp);
WT_CLEAR(las_value);
@@ -3715,12 +3756,11 @@ __rec_update_las(WT_SESSION_IMPL *session,
WT_ERR(
__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins)));
key->size = WT_PTRDIFF(p, key->data);
-
break;
case WT_PAGE_ROW_LEAF:
if (list->ins == NULL)
WT_ERR(__wt_row_leaf_key(
- session, page, list->rip, key, false));
+ session, page, list->ripcip, key, false));
else {
key->data = WT_INSERT_KEY(list->ins);
key->size = WT_INSERT_KEY_SIZE(list->ins);
@@ -3729,47 +3769,65 @@ __rec_update_las(WT_SESSION_IMPL *session,
WT_ILLEGAL_VALUE_ERR(session);
}
- /* Lookaside table value component: update reference. */
- switch (page->type) {
- case WT_PAGE_COL_FIX:
- case WT_PAGE_COL_VAR:
- upd = list->ins->upd;
- break;
- case WT_PAGE_ROW_LEAF:
- if (list->ins == NULL) {
- slot = WT_ROW_SLOT(page, list->rip);
- upd = page->modify->mod_row_update[slot];
- } else
- upd = list->ins->upd;
- break;
- WT_ILLEGAL_VALUE_ERR(session);
- }
+ /*
+ * Lookaside table value component: update reference. Updates
+ * come from the row-store insert list (an inserted item), or
+ * update array (an update to an original on-page item), or from
+ * a column-store insert list (column-store format has no update
+ * array, the insert list contains both inserted items and
+ * updates to original on-page items). When rolling forward a
+ * modify update from an original on-page item, we need an
+ * on-page slot so we can find the original on-page item. When
+ * rolling forward from an inserted item, no on-page slot is
+ * possible.
+ */
+ slot = UINT32_MAX; /* Impossible slot */
+ if (list->ripcip != NULL)
+ slot = page->type == WT_PAGE_ROW_LEAF ?
+ WT_ROW_SLOT(page, list->ripcip) :
+ WT_COL_SLOT(page, list->ripcip);
+ upd = list->ins == NULL ?
+ page->modify->mod_row_update[slot] : list->ins->upd;
/*
* Walk the list of updates, storing each key/value pair into
- * the lookaside table. Skipped reserved items, they're never
- * restored, obviously.
+ * the lookaside table. Skip aborted items (there's no point
+ * to restoring them), and assert we never see a reserved item.
*/
do {
- if (upd->type == WT_UPDATE_RESERVED)
+ if (upd->txnid == WT_TXN_ABORTED)
+ continue;
+
+ switch (upd->type) {
+ case WT_UPDATE_DELETED:
+ las_value.size = 0;
+ break;
+ case WT_UPDATE_MODIFIED:
+ cbt->slot = slot;
+ WT_ERR(__wt_value_return(session, cbt, upd));
+ las_value.data = cbt->iface.value.data;
+ las_value.size = cbt->iface.value.size;
+ break;
+ case WT_UPDATE_RESERVED:
+ WT_ASSERT(session,
+ upd->type != WT_UPDATE_RESERVED);
continue;
+ case WT_UPDATE_STANDARD:
+ las_value.data = upd->data;
+ las_value.size = upd->size;
+ break;
+ }
#ifdef HAVE_TIMESTAMPS
- las_timestamp.data = list->onpage_timestamp;
+ las_timestamp.data = &list->onpage_timestamp;
las_timestamp.size = WT_TIMESTAMP_SIZE;
#endif
cursor->set_key(cursor,
btree_id, &las_addr, ++las_counter,
list->onpage_txn, &las_timestamp, key);
- if (upd->type == WT_UPDATE_DELETED)
- las_value.size = 0;
- else {
- las_value.data = WT_UPDATE_DATA(upd);
- las_value.size = upd->size;
- }
#ifdef HAVE_TIMESTAMPS
- las_timestamp.data = upd->timestamp;
+ las_timestamp.data = &upd->timestamp;
las_timestamp.size = WT_TIMESTAMP_SIZE;
#endif
cursor->set_value(cursor,
@@ -3822,8 +3880,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
cbulk->ref = pindex->index[0];
cbulk->leaf = cbulk->ref->page;
- WT_RET(
- __rec_write_init(session, cbulk->ref, 0, NULL, &cbulk->reconcile));
+ WT_RET(__rec_init(session, cbulk->ref, 0, NULL, &cbulk->reconcile));
r = cbulk->reconcile;
r->is_bulk_load = true;
@@ -4303,7 +4360,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
if (upd != NULL)
__bit_setv(r->first_free,
WT_INSERT_RECNO(ins) - pageref->ref_recno,
- btree->bitcnt, *(uint8_t *)WT_UPDATE_DATA(upd));
+ btree->bitcnt, *upd->data);
}
/* Calculate the number of entries per page remainder. */
@@ -4360,8 +4417,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
if (nrecs > 0) {
__bit_setv(r->first_free, entry, btree->bitcnt,
- upd == NULL ? 0 :
- *(uint8_t *)WT_UPDATE_DATA(upd));
+ upd == NULL ? 0 : *upd->data);
--nrecs;
++entry;
++r->recno;
@@ -4564,6 +4620,7 @@ __rec_col_var(WT_SESSION_IMPL *session,
WT_CELL *cell;
WT_CELL_UNPACK *vpack, _vpack;
WT_COL *cip;
+ WT_CURSOR_BTREE *cbt;
WT_DECL_ITEM(orig);
WT_DECL_RET;
WT_INSERT *ins;
@@ -4579,6 +4636,7 @@ __rec_col_var(WT_SESSION_IMPL *session,
page = pageref->page;
last = r->last;
vpack = &_vpack;
+ cbt = &r->update_modify_cbt;
WT_RET(__rec_split_init(
session, r, page, pageref->ref_recno, btree->maxleafpage));
@@ -4694,24 +4752,34 @@ record_loop: /*
upd = NULL;
if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
WT_ERR(__rec_txn_read(
- session, r, ins, NULL, vpack, &upd));
+ session, r, ins, cip, vpack, &upd));
ins = WT_SKIP_NEXT(ins);
}
- if (upd != NULL) {
- update_no_copy = true; /* No data copy */
- repeat_count = 1; /* Single record */
- deleted = upd->type == WT_UPDATE_DELETED;
- if (!deleted) {
- data = WT_UPDATE_DATA(upd);
+ update_no_copy = true; /* No data copy */
+ repeat_count = 1; /* Single record */
+ deleted = false;
+
+ if (upd != NULL) {
+ switch (upd->type) {
+ case WT_UPDATE_DELETED:
+ deleted = true;
+ break;
+ case WT_UPDATE_MODIFIED:
+ cbt->slot = WT_COL_SLOT(page, cip);
+ WT_ERR(__wt_value_return(
+ session, cbt, upd));
+ data = cbt->iface.value.data;
+ size = (uint32_t)cbt->iface.value.size;
+ update_no_copy = false;
+ break;
+ case WT_UPDATE_STANDARD:
+ data = upd->data;
size = upd->size;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
}
} else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
- update_no_copy = true; /* No data copy */
- repeat_count = 1; /* Single record */
-
- deleted = false;
-
/*
* If doing an update save and restore, and the
* underlying value is a removed overflow value,
@@ -4905,6 +4973,9 @@ compare: /*
n = WT_INSERT_RECNO(ins);
}
while (src_recno <= n) {
+ deleted = false;
+ update_no_copy = true;
+
/*
* The application may have inserted records which left
* gaps in the name space, and these gaps can be huge.
@@ -4926,14 +4997,31 @@ compare: /*
rle += skip;
src_recno += skip;
}
- } else {
- deleted = upd == NULL ||
- upd->type == WT_UPDATE_DELETED;
- if (!deleted) {
- data = WT_UPDATE_DATA(upd);
+ } else if (upd == NULL)
+ deleted = true;
+ else
+ switch (upd->type) {
+ case WT_UPDATE_DELETED:
+ deleted = true;
+ break;
+ case WT_UPDATE_MODIFIED:
+ /*
+ * Impossible slot, there's no backing
+ * on-page item.
+ */
+ cbt->slot = UINT32_MAX;
+ WT_ERR(__wt_value_return(
+ session, cbt, upd));
+ data = cbt->iface.value.data;
+ size = (uint32_t)cbt->iface.value.size;
+ update_no_copy = false;
+ break;
+ case WT_UPDATE_STANDARD:
+ data = upd->data;
size = upd->size;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
}
- }
/*
* Handle RLE accounting and comparisons -- see comment
@@ -4952,16 +5040,24 @@ compare: /*
}
/*
- * Swap the current/last state. We always assign the
- * data values to the buffer because they can only be
- * the data from a WT_UPDATE structure.
- *
- * Reset RLE counter and turn on comparisons.
+ * Swap the current/last state. We can't simply assign
+ * the data values into the last buffer because they may
+ * be a temporary copy built from a chain of modified
+ * updates and creating the next record will overwrite
+ * that memory. Check, we'd like to avoid the copy. If
+ * data was taken from an update structure, we can just
+ * use the pointers, they're not moving.
*/
if (!deleted) {
- last->data = data;
- last->size = size;
+ if (update_no_copy) {
+ last->data = data;
+ last->size = size;
+ } else
+ WT_ERR(__wt_buf_set(
+ session, last, data, size));
}
+
+ /* Ready for the next loop, reset the RLE counter. */
last_deleted = deleted;
rle = 1;
@@ -5293,6 +5389,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
WT_BTREE *btree;
WT_CELL *cell, *val_cell;
WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_CURSOR_BTREE *cbt;
WT_DECL_ITEM(tmpkey);
WT_DECL_ITEM(tmpval);
WT_DECL_RET;
@@ -5309,6 +5406,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
void *copy;
btree = S2BT(session);
+ cbt = &r->update_modify_cbt;
slvg_skip = salvage == NULL ? 0 : salvage->skip;
key = &r->k;
@@ -5466,9 +5564,12 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
WT_ERR(__wt_ovfl_remove(
session, page, upd, vpack));
- /* If this key/value pair was deleted, we're done. */
- if (upd->type == WT_UPDATE_DELETED) {
+ switch (upd->type) {
+ case WT_UPDATE_DELETED:
/*
+ * If this key/value pair was deleted, we're
+ * done.
+ *
* Overflow keys referencing discarded values
* are no longer useful, discard the backing
* blocks. Don't worry about reuse, reusing
@@ -5503,21 +5604,32 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
/* Proceed with appended key/value pairs. */
goto leaf_insert;
- }
-
- /*
- * If no value, nothing needs to be copied. Otherwise,
- * build the value's WT_CELL chunk from the most recent
- * update value.
- */
- if (upd->size == 0) {
- val->buf.data = NULL;
- val->cell_len = val->len = val->buf.size = 0;
- } else {
+ case WT_UPDATE_MODIFIED:
+ cbt->slot = WT_ROW_SLOT(page, rip);
+ WT_ERR(__wt_value_return(session, cbt, upd));
WT_ERR(__rec_cell_build_val(session, r,
- WT_UPDATE_DATA(upd), upd->size,
- (uint64_t)0));
+ cbt->iface.value.data,
+ cbt->iface.value.size, (uint64_t)0));
dictionary = true;
+ break;
+ case WT_UPDATE_STANDARD:
+ /*
+ * If no value, nothing needs to be copied.
+ * Otherwise, build the value's chunk from the
+ * update value.
+ */
+ if (upd->size == 0) {
+ val->buf.data = NULL;
+ val->cell_len =
+ val->len = val->buf.size = 0;
+ } else {
+ WT_ERR(__rec_cell_build_val(session, r,
+ upd->data, upd->size,
+ (uint64_t)0));
+ dictionary = true;
+ }
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
}
}
@@ -5665,27 +5777,44 @@ static int
__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
{
WT_BTREE *btree;
+ WT_CURSOR_BTREE *cbt;
WT_KV *key, *val;
WT_UPDATE *upd;
bool ovfl_key;
btree = S2BT(session);
+ cbt = &r->update_modify_cbt;
key = &r->k;
val = &r->v;
for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
- /* Look for an update. */
+ /* Look for an update, if nothing is visible, we're done. */
WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
- if (upd == NULL || upd->type == WT_UPDATE_DELETED)
+ if (upd == NULL)
continue;
- if (upd->size == 0) /* Build value cell. */
- val->len = 0;
- else
+ switch (upd->type) {
+ case WT_UPDATE_DELETED:
+ continue;
+ case WT_UPDATE_MODIFIED:
+ /* Impossible slot, there's no backing on-page item. */
+ cbt->slot = UINT32_MAX;
+ WT_RET(__wt_value_return(session, cbt, upd));
WT_RET(__rec_cell_build_val(session, r,
- WT_UPDATE_DATA(upd), upd->size, (uint64_t)0));
-
+ cbt->iface.value.data,
+ cbt->iface.value.size, (uint64_t)0));
+ break;
+ case WT_UPDATE_STANDARD:
+ if (upd->size == 0)
+ val->len = 0;
+ else
+ WT_RET(__rec_cell_build_val(session, r,
+ upd->data, upd->size,
+ (uint64_t)0));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
/* Build key cell. */
WT_RET(__rec_cell_build_leaf_key(session, r,
WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 2ad2380dc2c..599d2ced103 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -166,8 +166,7 @@ __session_alter(WT_SESSION *wt_session, const char *uri, const char *config)
WT_WITH_CHECKPOINT_LOCK(session,
WT_WITH_SCHEMA_LOCK(session,
ret = __wt_schema_worker(session, uri, __wt_alter, NULL, cfg,
- WT_BTREE_ALTER |
- WT_DHANDLE_DISCARD_FORCE | WT_DHANDLE_EXCLUSIVE)));
+ WT_BTREE_ALTER | WT_DHANDLE_EXCLUSIVE)));
err: if (ret != 0)
WT_STAT_CONN_INCR(session, session_table_alter_fail);
@@ -177,6 +176,28 @@ err: if (ret != 0)
}
/*
+ * __session_alter_readonly --
+ * WT_SESSION->alter method; readonly version.
+ */
+static int
+__session_alter_readonly(
+ WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(uri);
+ WT_UNUSED(config);
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL_NOCONF(session, alter);
+
+ WT_STAT_CONN_INCR(session, session_table_alter_fail);
+ ret = __wt_session_notsup(session);
+err: API_END_RET(session, ret);
+}
+
+/*
* __session_close --
* WT_SESSION->close method.
*/
@@ -1142,11 +1163,11 @@ __wt_session_range_truncate(WT_SESSION_IMPL *session,
*
* Rather happily, the compare routine will also confirm the cursors
* reference the same object and the keys are set.
+ *
+ * The test for a NULL start comparison function isn't necessary (we
+ * checked it above), but it quiets clang static analysis complaints.
*/
- if (start != NULL && stop != NULL) {
- /* quiet clang scan-build */
- WT_ASSERT(session, start->compare != NULL);
-
+ if (start != NULL && stop != NULL && start->compare != NULL) {
WT_ERR(start->compare(start, stop, &cmp));
if (cmp > 0)
WT_ERR_MSG(session, EINVAL,
@@ -1553,7 +1574,6 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
WT_ERR_MSG(session, EINVAL, "logging not enabled");
log = conn->log;
- timeout_ms = waited_ms = 0;
/*
* If there is no background sync LSN in this session, there
@@ -1775,7 +1795,7 @@ __open_session(WT_CONNECTION_IMPL *conn,
}, stds_readonly = {
NULL,
NULL,
- __session_alter,
+ __session_alter_readonly,
__session_close,
__session_reconfigure,
__wt_session_strerror,
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
index dd2b6ef30ff..54fbceb65d7 100644
--- a/src/third_party/wiredtiger/src/session/session_dhandle.c
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -235,6 +235,7 @@ __wt_session_lock_dhandle(
lock_busy = true;
/* Give other threads a chance to make progress. */
+ WT_STAT_CONN_INCR(session, dhandle_lock_blocked);
__wt_yield();
}
}
@@ -261,17 +262,14 @@ __wt_session_release_btree(WT_SESSION_IMPL *session)
* If we had special flags set, close the handle so that future access
* can get a handle without special flags.
*/
- if (F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_FORCE)) {
+ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_KILL)) {
WT_SAVE_DHANDLE(session, __session_find_dhandle(session,
dhandle->name, dhandle->checkpoint, &dhandle_cache));
if (dhandle_cache != NULL)
__session_discard_dhandle(session, dhandle_cache);
}
- if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_FORCE)) {
- ret = __wt_conn_btree_sync_and_close(session, false, true);
- F_CLR(dhandle, WT_DHANDLE_DISCARD_FORCE);
- } else if (F_ISSET(btree, WT_BTREE_BULK)) {
+ if (F_ISSET(btree, WT_BTREE_BULK)) {
WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) &&
!F_ISSET(dhandle, WT_DHANDLE_DISCARD));
/*
@@ -281,11 +279,12 @@ __wt_session_release_btree(WT_SESSION_IMPL *session)
*/
WT_WITH_SCHEMA_LOCK(session, ret =
__wt_conn_btree_sync_and_close(session, false, false));
- } else if (F_ISSET(dhandle, WT_DHANDLE_DISCARD) ||
- F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) {
+ } else if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS) ||
+ F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_KILL)) {
WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
- ret = __wt_conn_btree_sync_and_close(session, false, false);
- F_CLR(dhandle, WT_DHANDLE_DISCARD);
+ ret = __wt_conn_btree_sync_and_close(session, false,
+ F_ISSET(dhandle, WT_DHANDLE_DISCARD_KILL));
+ F_CLR(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_KILL);
}
if (session == dhandle->excl_session) {
diff --git a/src/third_party/wiredtiger/src/support/modify.c b/src/third_party/wiredtiger/src/support/modify.c
new file mode 100644
index 00000000000..71c06997859
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/modify.c
@@ -0,0 +1,200 @@
+/*-
+ * Copyright (c) 2014-2017 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_modify_pack --
+ * Pack a modify structure into a buffer.
+ */
+int
+__wt_modify_pack(WT_SESSION_IMPL *session,
+ WT_ITEM **modifyp, WT_MODIFY *entries, int nentries)
+{
+ WT_ITEM *modify;
+ size_t len, *p;
+ int i;
+ uint8_t *data;
+
+ /*
+ * Build the in-memory modify value. It's the entries count, followed
+ * by the modify structure offsets written in order, followed by the
+ * data (data at the end to avoid unaligned reads/writes).
+ */
+ len = sizeof(size_t); /* nentries */
+ for (i = 0; i < nentries; ++i) {
+ len += 3 * sizeof(size_t); /* WT_MODIFY fields */
+ len += entries[i].data.size; /* data */
+ }
+
+ WT_RET(__wt_scr_alloc(session, len, &modify));
+
+ data = (uint8_t *)modify->mem +
+ sizeof(size_t) + ((size_t)nentries * 3 * sizeof(size_t));
+ p = modify->mem;
+ *p++ = (size_t)nentries;
+ for (i = 0; i < nentries; ++i) {
+ *p++ = entries[i].data.size;
+ *p++ = entries[i].offset;
+ *p++ = entries[i].size;
+
+ memcpy(data, entries[i].data.data, entries[i].data.size);
+ data += entries[i].data.size;
+ }
+ modify->size = WT_PTRDIFF(data, modify->data);
+ *modifyp = modify;
+ return (0);
+}
+
+/*
+ * __modify_apply_one --
+ * Apply a single modify structure change to the buffer.
+ */
+static int
+__modify_apply_one(WT_SESSION_IMPL *session, WT_ITEM *value,
+ size_t data_size, size_t offset, size_t size, const uint8_t *data)
+{
+ uint8_t *from, *to;
+ size_t len;
+
+ /*
+ * Grow the buffer to the maximum size we'll need. This is pessimistic
+ * because it ignores replacement bytes, but it's a simpler calculation.
+ *
+ * Grow the buffer before we fast-path the expected case. This function
+ * is often called using a cursor buffer referencing on-page memory and
+ * it's easy to overwrite a page. A side-effect of growing the buffer is
+ * to ensure the buffer's value is in buffer-local memory.
+ *
+ * Because the buffer may reference an overflow item, the data may not
+ * start at the start of the buffer's memory and we have to correct for
+ * that.
+ */
+ len = WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0;
+ WT_RET(__wt_buf_grow(session, value,
+ len + WT_MAX(value->size, offset) + data_size));
+
+ /*
+ * Fast-path the expected case, where we're overwriting a set of bytes
+ * that already exist in the buffer.
+ */
+ if (value->size > offset + data_size && data_size == size) {
+ memmove((uint8_t *)value->data + offset, data, data_size);
+ return (0);
+ }
+
+ /*
+ * If appending bytes past the end of the value, initialize gap bytes
+ * and copy the new bytes into place.
+ */
+ if (value->size <= offset) {
+ if (value->size < offset)
+ memset((uint8_t *)value->data +
+ value->size, 0, offset - value->size);
+ memmove((uint8_t *)value->data + offset, data, data_size);
+ value->size = offset + data_size;
+ return (0);
+ }
+
+ /*
+ * Correct the replacement size if it's nonsense, we can't replace more
+ * bytes than remain in the value. (Nonsense sizes are permitted in the
+ * API because we don't want to handle the errors.)
+ */
+ if (value->size < offset + size)
+ size = value->size - offset;
+
+ if (data_size == size) { /* Overwrite */
+ /* Copy in the new data. */
+ memmove((uint8_t *)value->data + offset, data, data_size);
+
+ /*
+ * The new data must overlap the buffer's end (else, we'd use
+ * the fast-path code above). Set the buffer size to include
+ * the new data.
+ */
+ value->size = offset + data_size;
+ } else { /* Shrink or grow */
+ /* Move trailing data forward/backward to its new location. */
+ from = (uint8_t *)value->data + (offset + size);
+ WT_ASSERT(session, WT_DATA_IN_ITEM(value) &&
+ from + (value->size - (offset + size)) <=
+ (uint8_t *)value->mem + value->memsize);
+ to = (uint8_t *)value->data + (offset + data_size);
+ WT_ASSERT(session, WT_DATA_IN_ITEM(value) &&
+ to + (value->size - (offset + size)) <=
+ (uint8_t *)value->mem + value->memsize);
+ memmove(to, from, value->size - (offset + size));
+
+ /* Copy in the new data. */
+ memmove((uint8_t *)value->data + offset, data, data_size);
+
+ /*
+ * Correct the size. This works because of how the C standard
+ * defines unsigned arithmetic, and gcc7 complains about more
+ * verbose forms:
+ *
+ * if (data_size > size)
+ * value->size += (data_size - size);
+ * else
+ * value->size -= (size - data_size);
+ *
+ * because the branches are identical.
+ */
+ value->size += (data_size - size);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_modify_apply_api --
+ * Apply a single set of WT_MODIFY changes to a buffer, the cursor API
+ * interface.
+ */
+int
+__wt_modify_apply_api(
+ WT_SESSION_IMPL *session, WT_ITEM *value, WT_MODIFY *entries, int nentries)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+{
+ int i;
+
+ for (i = 0; i < nentries; ++i)
+ WT_RET(__modify_apply_one(session, value, entries[i].data.size,
+ entries[i].offset, entries[i].size, entries[i].data.data));
+
+ return (0);
+}
+
+/*
+ * __wt_modify_apply --
+ * Apply a single set of WT_MODIFY changes to a buffer.
+ */
+int
+__wt_modify_apply(WT_SESSION_IMPL *session, WT_ITEM *value, const void *modify)
+{
+ const size_t *p;
+ int nentries;
+ const uint8_t *data;
+
+ /*
+ * Get the number of entries, and set a second pointer to reference the
+ * change data.
+ */
+ p = modify;
+ nentries = (int)*p++;
+ data = (uint8_t *)modify +
+ sizeof(size_t) + ((size_t)nentries * 3 * sizeof(size_t));
+
+ /* Step through the list of entries, applying them in order. */
+ for (; nentries-- > 0; data += p[0], p += 3)
+ WT_RET(__modify_apply_one(
+ session, value, p[0], p[1], p[2], data));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 14a1d2a3b86..ae029ad0a98 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -970,11 +970,19 @@ static const char * const __stats_connection_desc[] = {
"thread-state: active filesystem write calls",
"thread-yield: application thread time evicting (usecs)",
"thread-yield: application thread time waiting for cache (usecs)",
+ "thread-yield: connection close blocked waiting for transaction state stabilization",
+ "thread-yield: connection close yielded for lsm manager shutdown",
+ "thread-yield: data handle lock yielded",
+ "thread-yield: get reference for page index and slot time sleeping (usecs)",
+ "thread-yield: log server sync yielded for log write",
"thread-yield: page acquire busy blocked",
"thread-yield: page acquire eviction blocked",
"thread-yield: page acquire locked blocked",
"thread-yield: page acquire read blocked",
"thread-yield: page acquire time sleeping (usecs)",
+ "thread-yield: page delete rollback time sleeping for state change (usecs)",
+ "thread-yield: page reconciliation yielded due to child modification",
+ "thread-yield: tree descend one level yielded for split page index update",
"transaction: number of named snapshots created",
"transaction: number of named snapshots dropped",
"transaction: transaction begins",
@@ -1285,11 +1293,19 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing thread_write_active */
stats->application_evict_time = 0;
stats->application_cache_time = 0;
+ stats->txn_release_blocked = 0;
+ stats->conn_close_blocked_lsm = 0;
+ stats->dhandle_lock_blocked = 0;
+ stats->page_index_slot_ref_blocked = 0;
+ stats->log_server_sync_blocked = 0;
stats->page_busy_blocked = 0;
stats->page_forcible_evict_blocked = 0;
stats->page_locked_blocked = 0;
stats->page_read_blocked = 0;
stats->page_sleep = 0;
+ stats->page_del_rollback_blocked = 0;
+ stats->child_modify_blocked_page = 0;
+ stats->tree_descend_blocked = 0;
stats->txn_snapshots_created = 0;
stats->txn_snapshots_dropped = 0;
stats->txn_begin = 0;
@@ -1680,12 +1696,25 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, application_evict_time);
to->application_cache_time +=
WT_STAT_READ(from, application_cache_time);
+ to->txn_release_blocked += WT_STAT_READ(from, txn_release_blocked);
+ to->conn_close_blocked_lsm +=
+ WT_STAT_READ(from, conn_close_blocked_lsm);
+ to->dhandle_lock_blocked += WT_STAT_READ(from, dhandle_lock_blocked);
+ to->page_index_slot_ref_blocked +=
+ WT_STAT_READ(from, page_index_slot_ref_blocked);
+ to->log_server_sync_blocked +=
+ WT_STAT_READ(from, log_server_sync_blocked);
to->page_busy_blocked += WT_STAT_READ(from, page_busy_blocked);
to->page_forcible_evict_blocked +=
WT_STAT_READ(from, page_forcible_evict_blocked);
to->page_locked_blocked += WT_STAT_READ(from, page_locked_blocked);
to->page_read_blocked += WT_STAT_READ(from, page_read_blocked);
to->page_sleep += WT_STAT_READ(from, page_sleep);
+ to->page_del_rollback_blocked +=
+ WT_STAT_READ(from, page_del_rollback_blocked);
+ to->child_modify_blocked_page +=
+ WT_STAT_READ(from, child_modify_blocked_page);
+ to->tree_descend_blocked += WT_STAT_READ(from, tree_descend_blocked);
to->txn_snapshots_created +=
WT_STAT_READ(from, txn_snapshots_created);
to->txn_snapshots_dropped +=
diff --git a/src/third_party/wiredtiger/src/support/time.c b/src/third_party/wiredtiger/src/support/time.c
new file mode 100644
index 00000000000..233bc871e06
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/time.c
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 2014-2017 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __time_check_monotonic --
+ * Check and prevent time running backward. If we detect that it has, we
+ * set the time structure to the previous values, making time stand still
+ * until we see a time in the future of the highest value seen so far.
+ */
+static void
+__time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp)
+{
+ /*
+ * Detect time going backward. If so, use the last
+ * saved timestamp.
+ */
+ if (session == NULL)
+ return;
+
+ if (tsp->tv_sec < session->last_epoch.tv_sec ||
+ (tsp->tv_sec == session->last_epoch.tv_sec &&
+ tsp->tv_nsec < session->last_epoch.tv_nsec)) {
+ WT_STAT_CONN_INCR(session, time_travel);
+ *tsp = session->last_epoch;
+ } else
+ session->last_epoch = *tsp;
+}
+
+/*
+ * __wt_epoch --
+ * Return the time since the Epoch, adjusted so it never appears to go
+ * backwards.
+ */
+void
+__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+{
+ struct timespec tmp;
+
+ /*
+ * Read into a local variable so that we're comparing the correct
+ * value when we check for monotonic increasing time. There are
+ * many places we read into an unlocked global variable.
+ */
+ __wt_epoch_raw(session, &tmp);
+ __time_check_monotonic(session, &tmp);
+ *tsp = tmp;
+}
+
+/*
+ * __wt_seconds --
+ * Return the seconds since the Epoch.
+ */
+void
+__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
+{
+ struct timespec t;
+
+ __wt_epoch(session, &t);
+
+ *timep = t.tv_sec;
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 18a3c0021f0..c7e7999d887 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -441,19 +441,27 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
if (cval.len > 0) {
#ifdef HAVE_TIMESTAMPS
WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
- wt_timestamp_t oldest_timestamp;
+ wt_timestamp_t oldest_timestamp, stable_timestamp;
WT_RET(__wt_txn_parse_timestamp(
- session, "read", txn->read_timestamp, &cval));
+ session, "read", &txn->read_timestamp, &cval));
__wt_readlock(session, &txn_global->rwlock);
__wt_timestamp_set(
- oldest_timestamp, txn_global->oldest_timestamp);
+ &oldest_timestamp, &txn_global->oldest_timestamp);
+ __wt_timestamp_set(
+ &stable_timestamp, &txn_global->stable_timestamp);
__wt_readunlock(session, &txn_global->rwlock);
if (__wt_timestamp_cmp(
- txn->read_timestamp, oldest_timestamp) < 0)
+ &txn->read_timestamp, &oldest_timestamp) < 0)
WT_RET_MSG(session, EINVAL,
"read timestamp %.*s older than oldest timestamp",
(int)cval.len, cval.str);
+ if (!__wt_timestamp_iszero(&stable_timestamp) &&
+ __wt_timestamp_cmp(
+ &txn->read_timestamp, &stable_timestamp) > 0)
+ WT_RET_MSG(session, EINVAL,
+ "read timestamp %.*s newer than stable timestamp",
+ (int)cval.len, cval.str);
__wt_txn_set_read_timestamp(session);
txn->isolation = WT_ISO_SNAPSHOT;
@@ -590,7 +598,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
if (cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
WT_ERR(__wt_txn_parse_timestamp(
- session, "commit", txn->commit_timestamp, &cval));
+ session, "commit", &txn->commit_timestamp, &cval));
__wt_txn_set_commit_timestamp(session);
#else
WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
@@ -686,8 +694,8 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
#ifdef HAVE_TIMESTAMPS
if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
op->type != WT_TXN_OP_BASIC_TS)
- __wt_timestamp_set(op->u.upd->timestamp,
- txn->commit_timestamp);
+ __wt_timestamp_set(&op->u.upd->timestamp,
+ &txn->commit_timestamp);
#endif
break;
@@ -695,8 +703,8 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
#ifdef HAVE_TIMESTAMPS
if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
__wt_timestamp_set(
- op->u.ref->page_del->timestamp,
- txn->commit_timestamp);
+ &op->u.ref->page_del->timestamp,
+ &txn->commit_timestamp);
#endif
break;
@@ -728,10 +736,10 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
if (update_timestamp) {
__wt_readlock(session, &txn_global->rwlock);
__wt_timestamp_set(
- prev_commit_timestamp, txn_global->commit_timestamp);
+ &prev_commit_timestamp, &txn_global->commit_timestamp);
__wt_readunlock(session, &txn_global->rwlock);
update_timestamp = __wt_timestamp_cmp(
- txn->commit_timestamp, prev_commit_timestamp) > 0;
+ &txn->commit_timestamp, &prev_commit_timestamp) > 0;
}
/*
@@ -740,10 +748,10 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
*/
if (update_timestamp) {
__wt_writelock(session, &txn_global->rwlock);
- if (__wt_timestamp_cmp(txn->commit_timestamp,
- txn_global->commit_timestamp) > 0) {
- __wt_timestamp_set(txn_global->commit_timestamp,
- txn->commit_timestamp);
+ if (__wt_timestamp_cmp(&txn->commit_timestamp,
+ &txn_global->commit_timestamp) > 0) {
+ __wt_timestamp_set(&txn_global->commit_timestamp,
+ &txn->commit_timestamp);
txn_global->has_commit_timestamp = true;
}
__wt_writeunlock(session, &txn_global->rwlock);
@@ -998,6 +1006,8 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session)
if (txn_global->oldest_id == txn_global->current &&
txn_global->metadata_pinned == txn_global->current)
break;
+
+ WT_STAT_CONN_INCR(session, txn_release_blocked);
__wt_yield();
}
@@ -1006,7 +1016,7 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session)
* Now that all transactions have completed, no timestamps should be
* pinned.
*/
- memset(txn_global->pinned_timestamp, 0xff, WT_TIMESTAMP_SIZE);
+ __wt_timestamp_set_inf(&txn_global->pinned_timestamp);
#endif
return (ret);
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 519d3469865..8ea6bf609c4 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -571,7 +571,9 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *txn_state;
- char timestamp_config[100];
+ char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1], timestamp_config[100];
+ const char *query_cfg[] = { WT_CONFIG_BASE(session,
+ WT_CONNECTION_query_timestamp), "get=stable", NULL };
const char *txn_cfg[] = { WT_CONFIG_BASE(session,
WT_SESSION_begin_transaction), "isolation=snapshot", NULL, NULL };
@@ -580,11 +582,31 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[])
txn_global = &conn->txn_global;
txn_state = WT_SESSION_TXN_STATE(session);
+ /*
+ * Someone giving us a specific timestamp overrides the general
+ * use_timestamp.
+ */
WT_RET(__wt_config_gets(session, cfg, "read_timestamp", &cval));
if (cval.len > 0) {
WT_RET(__wt_snprintf(timestamp_config, sizeof(timestamp_config),
"read_timestamp=%.*s", (int)cval.len, cval.str));
txn_cfg[2] = timestamp_config;
+ } else if (txn_global->has_stable_timestamp) {
+ WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
+ /*
+ * Get the stable timestamp currently set. Then set that as
+ * the read timestamp for the transaction.
+ */
+ if (cval.val != 0) {
+ if ((ret = __wt_txn_global_query_timestamp(session,
+ timestamp_buf, query_cfg)) != 0 &&
+ ret != WT_NOTFOUND)
+ return (ret);
+ WT_RET(__wt_snprintf(timestamp_config,
+ sizeof(timestamp_config),
+ "read_timestamp=%s", timestamp_buf));
+ txn_cfg[2] = timestamp_config;
+ }
}
/*
@@ -1675,18 +1697,6 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
bulk = F_ISSET(btree, WT_BTREE_BULK);
/*
- * If the handle is already dead or the file isn't durable, force the
- * discard.
- *
- * If the file isn't durable, mark the handle dead, there are asserts
- * later on that only dead handles can have modified pages.
- */
- if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
- F_SET(session->dhandle, WT_DHANDLE_DEAD);
- if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
- return (__wt_cache_op(session, WT_SYNC_DISCARD));
-
- /*
* If closing an unmodified file, check that no update is required
* for active readers.
*/
@@ -1694,7 +1704,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
WT_RET(__wt_txn_update_oldest(
session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
return (__wt_txn_visible_all(session, btree->rec_max_txn,
- WT_TIMESTAMP(btree->rec_max_timestamp)) ?
+ WT_TIMESTAMP_NULL(&btree->rec_max_timestamp)) ?
__wt_cache_op(session, WT_SYNC_DISCARD) : EBUSY);
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
index d291139284a..1fc74fb53a1 100644
--- a/src/third_party/wiredtiger/src/txn/txn_log.c
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -32,9 +32,9 @@ __txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
memset(&key, 0, sizeof(key));
/*
- * We used to take the key for row-store logging from the page
- * referenced by the cursor, when we switched to taking it from the
- * cursor itself. Check that they are the same.
+ * We used to take the row-store logging key from the page referenced by
+ * the cursor, then switched to taking it from the cursor itself. Check
+ * they are the same.
*
* If the cursor references a WT_INSERT item, take the key from there,
* else take the key from the original page.
@@ -50,8 +50,7 @@ __txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
key.size = WT_INSERT_KEY_SIZE(cbt->ins);
}
- WT_ASSERT(session,
- key.size == cursor->key.size &&
+ WT_ASSERT(session, key.size == cursor->key.size &&
memcmp(key.data, cursor->key.data, key.size) == 0);
__wt_buf_free(session, &key);
@@ -74,46 +73,62 @@ __txn_op_log(WT_SESSION_IMPL *session,
cursor = &cbt->iface;
upd = op->u.upd;
- value.data = WT_UPDATE_DATA(upd);
+ value.data = upd->data;
value.size = upd->size;
/*
- * Log the operation. It must be a row- or column-store insert, remove
- * or update, all of which require log records. We shouldn't ever log
- * reserve operations.
+ * Log the row- or column-store insert, modify, remove or update. Our
+ * caller doesn't log reserve operations, we shouldn't see them here.
*/
- WT_ASSERT(session, upd->type != WT_UPDATE_RESERVED);
if (cbt->btree->type == BTREE_ROW) {
#ifdef HAVE_DIAGNOSTIC
__txn_op_log_row_key_check(session, cbt);
#endif
- if (upd->type == WT_UPDATE_DELETED)
+ switch (upd->type) {
+ case WT_UPDATE_DELETED:
WT_RET(__wt_logop_row_remove_pack(
session, logrec, op->fileid, &cursor->key));
- else
+ break;
+ case WT_UPDATE_MODIFIED:
+ WT_RET(__wt_logop_row_modify_pack(
+ session, logrec, op->fileid, &cursor->key, &value));
+ break;
+ case WT_UPDATE_STANDARD:
WT_RET(__wt_logop_row_put_pack(
session, logrec, op->fileid, &cursor->key, &value));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
} else {
recno = WT_INSERT_RECNO(cbt->ins);
WT_ASSERT(session, recno != WT_RECNO_OOB);
- if (upd->type == WT_UPDATE_DELETED)
+ switch (upd->type) {
+ case WT_UPDATE_DELETED:
WT_RET(__wt_logop_col_remove_pack(
session, logrec, op->fileid, recno));
- else
+ break;
+ case WT_UPDATE_MODIFIED:
+ WT_RET(__wt_logop_col_modify_pack(
+ session, logrec, op->fileid, recno, &value));
+ break;
+ case WT_UPDATE_STANDARD:
WT_RET(__wt_logop_col_put_pack(
session, logrec, op->fileid, recno, &value));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
}
return (0);
}
/*
- * __txn_commit_printlog --
- * Print a commit log record.
+ * __txn_oplist_printlog --
+ * Print a list of operations from a log record.
*/
static int
-__txn_commit_printlog(WT_SESSION_IMPL *session,
+__txn_oplist_printlog(WT_SESSION_IMPL *session,
const uint8_t **pp, const uint8_t *end, uint32_t flags)
{
bool firstrecord;
@@ -344,8 +359,8 @@ __wt_txn_checkpoint_log(
WT_TXN *txn;
uint8_t *end, *p;
size_t recsize;
- uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
- const char *fmt = WT_UNCHECKED_STRING(IIIIu);
+ uint32_t i, rectype;
+ const char *fmt;
conn = S2C(session);
txn = &session->txn;
@@ -367,7 +382,31 @@ __wt_txn_checkpoint_log(
switch (flags) {
case WT_TXN_LOG_CKPT_PREPARE:
txn->full_ckpt = true;
- WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
+
+ if (conn->compat_major >= WT_LOG_V2) {
+ /*
+ * Write the system log record containing a checkpoint
+ * start operation.
+ */
+ rectype = WT_LOGREC_SYSTEM;
+ fmt = WT_UNCHECKED_STRING(I);
+ WT_ERR(__wt_struct_size(
+ session, &recsize, fmt, rectype));
+ WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
+
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, recsize,
+ fmt, rectype));
+ logrec->size += (uint32_t)recsize;
+ WT_ERR(__wt_logop_checkpoint_start_pack(
+ session, logrec));
+ WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0));
+ } else {
+ WT_ERR(__wt_log_printf(session,
+ "CHECKPOINT: Starting record"));
+ WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
+ }
+
/*
* We need to make sure that the log records in the checkpoint
* LSN are on disk. In particular to make sure that the
@@ -401,14 +440,16 @@ __wt_txn_checkpoint_log(
ckpt_snapshot = txn->ckpt_snapshot;
/* Write the checkpoint log record. */
- WT_ERR(__wt_struct_size(session, &recsize, fmt,
- rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
+ rectype = WT_LOGREC_CHECKPOINT;
+ fmt = WT_UNCHECKED_STRING(IIIIu);
+ WT_ERR(__wt_struct_size(session, &recsize,
+ fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
txn->ckpt_nsnapshot, ckpt_snapshot));
WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
WT_ERR(__wt_struct_pack(session,
- (uint8_t *)logrec->data + logrec->size, recsize, fmt,
- rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
+ (uint8_t *)logrec->data + logrec->size, recsize,
+ fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
txn->ckpt_nsnapshot, ckpt_snapshot));
logrec->size += (uint32_t)recsize;
WT_ERR(__wt_log_write(session, logrec, lsnp,
@@ -568,7 +609,7 @@ __txn_printlog(WT_SESSION_IMPL *session,
" \"type\" : \"commit\",\n"));
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
" \"txnid\" : %" PRIu64 ",\n", txnid));
- WT_RET(__txn_commit_printlog(session, &p, end, args->flags));
+ WT_RET(__txn_oplist_printlog(session, &p, end, args->flags));
break;
case WT_LOGREC_FILE_SYNC:
@@ -596,9 +637,7 @@ __txn_printlog(WT_SESSION_IMPL *session,
WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset));
WT_RET(__wt_fprintf(session, WT_STDOUT(session),
" \"type\" : \"system\",\n"));
- WT_RET(__wt_fprintf(session, WT_STDOUT(session),
- " \"prev_lsn\" : [%" PRIu32 ",%" PRIu32 "]\n",
- lsnfile, lsnoffset));
+ WT_RET(__txn_oplist_printlog(session, &p, end, args->flags));
break;
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index d08c2717f7a..fd02963a769 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -121,6 +121,25 @@ __txn_op_apply(
end = *pp + opsize;
switch (optype) {
+ case WT_LOGOP_COL_MODIFY:
+ WT_ERR(__wt_logop_col_modify_unpack(session, pp, end,
+ &fileid, &recno, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ if ((ret = cursor->search(cursor)) != 0)
+ WT_ERR_NOTFOUND_OK(ret);
+ else {
+ /*
+ * Build/insert a complete value during recovery rather
+ * than using cursor modify to create a partial update
+ * (for no particular reason than simplicity).
+ */
+ WT_ERR(__wt_modify_apply(
+ session, &cursor->value, value.data));
+ WT_ERR(cursor->insert(cursor));
+ }
+ break;
+
case WT_LOGOP_COL_PUT:
WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
&fileid, &recno, &value));
@@ -170,6 +189,25 @@ __txn_op_apply(
WT_ERR(ret);
break;
+ case WT_LOGOP_ROW_MODIFY:
+ WT_ERR(__wt_logop_row_modify_unpack(session, pp, end,
+ &fileid, &key, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ if ((ret = cursor->search(cursor)) != 0)
+ WT_ERR_NOTFOUND_OK(ret);
+ else {
+ /*
+ * Build/insert a complete value during recovery rather
+ * than using cursor modify to create a partial update
+ * (for no particular reason than simplicity).
+ */
+ WT_ERR(__wt_modify_apply(
+ session, &cursor->value, value.data));
+ WT_ERR(cursor->insert(cursor));
+ }
+ break;
+
case WT_LOGOP_ROW_PUT:
WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
&fileid, &key, &value));
diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
index a975341c189..4caf0102e3c 100644
--- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c
+++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
@@ -15,15 +15,8 @@
*/
int
__wt_txn_parse_timestamp(WT_SESSION_IMPL *session,
- const char *name, uint8_t *timestamp, WT_CONFIG_ITEM *cval)
+ const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval)
{
- WT_DECL_RET;
- WT_ITEM ts;
- wt_timestamp_t tsbuf;
- size_t hexlen;
- const char *hexts;
- char padbuf[2 * WT_TIMESTAMP_SIZE + 1];
-
__wt_timestamp_set_zero(timestamp);
if (cval->len == 0)
@@ -35,6 +28,40 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session,
"Failed to parse %s timestamp '%.*s': too long",
name, (int)cval->len, cval->str);
+#if WT_TIMESTAMP_SIZE == 8
+ {
+ static const u_char hextable[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 0, 0, 0, 0, 0, 0,
+ 0, 10, 11, 12, 13, 14, 15, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 10, 11, 12, 13, 14, 15
+ };
+ wt_timestamp_t ts;
+ size_t len;
+ const char *hex;
+
+ for (ts.val = 0, hex = cval->str, len = cval->len; len > 0; --len)
+ ts.val = (ts.val << 4) | hextable[(int)*hex++];
+ __wt_timestamp_set(timestamp, &ts);
+ }
+#else
+ {
+ WT_DECL_RET;
+ WT_ITEM ts;
+ wt_timestamp_t tsbuf;
+ size_t hexlen;
+ const char *hexts;
+ char padbuf[2 * WT_TIMESTAMP_SIZE + 1];
+
/*
* The decoding function assumes it is decoding data produced by dump
* and so requires an even number of hex digits.
@@ -50,8 +77,8 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session,
}
/* Avoid memory allocation to decode timestamps. */
- ts.data = ts.mem = tsbuf;
- ts.memsize = sizeof(tsbuf);
+ ts.data = ts.mem = tsbuf.ts;
+ ts.memsize = sizeof(tsbuf.ts);
if ((ret = __wt_nhex_to_raw(session, hexts, hexlen, &ts)) != 0)
WT_RET_MSG(session, ret, "Failed to parse %s timestamp '%.*s'",
@@ -59,15 +86,16 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session,
WT_ASSERT(session, ts.size <= WT_TIMESTAMP_SIZE);
/* Copy the raw value to the end of the timestamp. */
- memcpy(timestamp + WT_TIMESTAMP_SIZE - ts.size,
+ memcpy(timestamp->ts + WT_TIMESTAMP_SIZE - ts.size,
ts.data, ts.size);
-
+ }
+#endif
if (__wt_timestamp_iszero(timestamp))
WT_RET_MSG(session, EINVAL,
"Failed to parse %s timestamp '%.*s': zero not permitted",
name, (int)cval->len, cval->str);
- return (ret);
+ return (0);
}
/*
@@ -76,12 +104,13 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session,
*/
static int
__txn_global_query_timestamp(
- WT_SESSION_IMPL *session, uint8_t *ts, const char *cfg[])
+ WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[])
{
WT_CONNECTION_IMPL *conn;
WT_CONFIG_ITEM cval;
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t ts;
conn = S2C(session);
txn_global = &conn->txn_global;
@@ -91,40 +120,51 @@ __txn_global_query_timestamp(
if (!txn_global->has_commit_timestamp)
return (WT_NOTFOUND);
__wt_readlock(session, &txn_global->rwlock);
- __wt_timestamp_set(ts, txn_global->commit_timestamp);
+ __wt_timestamp_set(&ts, &txn_global->commit_timestamp);
+ WT_ASSERT(session, !__wt_timestamp_iszero(&ts));
__wt_readunlock(session, &txn_global->rwlock);
/* Compare with the oldest running transaction. */
__wt_readlock(session, &txn_global->commit_timestamp_rwlock);
txn = TAILQ_FIRST(&txn_global->commit_timestamph);
if (txn != NULL &&
- __wt_timestamp_cmp(txn->commit_timestamp, ts) < 0)
- __wt_timestamp_set(ts, txn->commit_timestamp);
+ __wt_timestamp_cmp(&txn->first_commit_timestamp, &ts) < 0) {
+ __wt_timestamp_set(&ts, &txn->first_commit_timestamp);
+ WT_ASSERT(session, !__wt_timestamp_iszero(&ts));
+ }
__wt_readunlock(session, &txn_global->commit_timestamp_rwlock);
} else if (WT_STRING_MATCH("oldest_reader", cval.str, cval.len)) {
if (!txn_global->has_oldest_timestamp)
return (WT_NOTFOUND);
__wt_readlock(session, &txn_global->rwlock);
- __wt_timestamp_set(ts, txn_global->oldest_timestamp);
+ __wt_timestamp_set(&ts, &txn_global->oldest_timestamp);
/* Check for a running checkpoint */
txn = txn_global->checkpoint_txn;
if (txn_global->checkpoint_state.pinned_id != WT_TXN_NONE &&
- !__wt_timestamp_iszero(txn->read_timestamp) &&
- __wt_timestamp_cmp(txn->read_timestamp, ts) < 0)
- __wt_timestamp_set(ts, txn->read_timestamp);
+ !__wt_timestamp_iszero(&txn->read_timestamp) &&
+ __wt_timestamp_cmp(&txn->read_timestamp, &ts) < 0)
+ __wt_timestamp_set(&ts, &txn->read_timestamp);
__wt_readunlock(session, &txn_global->rwlock);
/* Look for the oldest ordinary reader. */
__wt_readlock(session, &txn_global->read_timestamp_rwlock);
txn = TAILQ_FIRST(&txn_global->read_timestamph);
if (txn != NULL &&
- __wt_timestamp_cmp(txn->read_timestamp, ts) < 0)
- __wt_timestamp_set(ts, txn->read_timestamp);
+ __wt_timestamp_cmp(&txn->read_timestamp, &ts) < 0)
+ __wt_timestamp_set(&ts, &txn->read_timestamp);
__wt_readunlock(session, &txn_global->read_timestamp_rwlock);
+ } else if (WT_STRING_MATCH("stable", cval.str, cval.len)) {
+ if (!txn_global->has_stable_timestamp)
+ return (WT_NOTFOUND);
+ __wt_readlock(session, &txn_global->rwlock);
+ __wt_timestamp_set(&ts, &txn_global->stable_timestamp);
+ __wt_readunlock(session, &txn_global->rwlock);
} else
- return (__wt_illegal_value(session, NULL));
+ WT_RET_MSG(session, EINVAL,
+ "unknown timestamp query %.*s", (int)cval.len, cval.str);
+ __wt_timestamp_set(tsp, &ts);
return (0);
}
#endif
@@ -138,8 +178,28 @@ __wt_txn_global_query_timestamp(
WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[])
{
#ifdef HAVE_TIMESTAMPS
- WT_ITEM hexts;
wt_timestamp_t ts;
+
+ WT_RET(__txn_global_query_timestamp(session, &ts, cfg));
+
+#if WT_TIMESTAMP_SIZE == 8
+ {
+ char *p, v;
+
+ for (p = hex_timestamp; ts.val != 0; ts.val >>= 4)
+ *p++ = (char)__wt_hex((u_char)(ts.val & 0x0f));
+ *p = '\0';
+
+ /* Reverse the string. */
+ for (--p; p > hex_timestamp;) {
+ v = *p;
+ *p-- = *hex_timestamp;
+ *hex_timestamp++ = v;
+ }
+ }
+#else
+ {
+ WT_ITEM hexts;
size_t len;
uint8_t *tsp;
@@ -147,25 +207,28 @@ __wt_txn_global_query_timestamp(
* Keep clang-analyzer happy: it can't tell that ts will be set
* whenever the call below succeeds.
*/
- WT_CLEAR(ts);
- WT_RET(__txn_global_query_timestamp(session, ts, cfg));
+ __wt_timestamp_set_zero(&ts);
+ WT_RET(__txn_global_query_timestamp(session, &ts, cfg));
/* Avoid memory allocation: set up an item guaranteed large enough. */
hexts.data = hexts.mem = hex_timestamp;
hexts.memsize = 2 * WT_TIMESTAMP_SIZE + 1;
/* Trim leading zeros. */
- for (tsp = ts, len = WT_TIMESTAMP_SIZE;
+ for (tsp = ts.ts, len = WT_TIMESTAMP_SIZE;
len > 0 && *tsp == 0;
++tsp, --len)
;
WT_RET(__wt_raw_to_hex(session, tsp, len, &hexts));
+ }
+#endif
return (0);
#else
- WT_UNUSED(session);
WT_UNUSED(hex_timestamp);
WT_UNUSED(cfg);
- return (ENOTSUP);
+ WT_RET_MSG(session, ENOTSUP,
+ "WT_CONNECTION.query_timestamp requires a version of WiredTiger "
+ "built with timestamp support");
#endif
}
@@ -191,28 +254,28 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session)
return (0);
__wt_readlock(session, &txn_global->rwlock);
- __wt_timestamp_set(oldest_timestamp, txn_global->oldest_timestamp);
+ __wt_timestamp_set(&oldest_timestamp, &txn_global->oldest_timestamp);
__wt_readunlock(session, &txn_global->rwlock);
/* Scan to find the global pinned timestamp. */
if ((ret = __txn_global_query_timestamp(
- session, active_timestamp, query_cfg)) != 0)
+ session, &active_timestamp, query_cfg)) != 0)
return (ret == WT_NOTFOUND ? 0 : ret);
- if (__wt_timestamp_cmp(oldest_timestamp, active_timestamp) < 0) {
- __wt_timestamp_set(pinned_timestamp, oldest_timestamp);
+ if (__wt_timestamp_cmp(&oldest_timestamp, &active_timestamp) < 0) {
+ __wt_timestamp_set(&pinned_timestamp, &oldest_timestamp);
} else
- __wt_timestamp_set(pinned_timestamp, active_timestamp);
+ __wt_timestamp_set(&pinned_timestamp, &active_timestamp);
__wt_writelock(session, &txn_global->rwlock);
if (!txn_global->has_pinned_timestamp || __wt_timestamp_cmp(
- txn_global->pinned_timestamp, pinned_timestamp) < 0) {
+ &txn_global->pinned_timestamp, &pinned_timestamp) < 0) {
__wt_timestamp_set(
- txn_global->pinned_timestamp, pinned_timestamp);
+ &txn_global->pinned_timestamp, &pinned_timestamp);
txn_global->has_pinned_timestamp = true;
txn_global->oldest_is_pinned = __wt_timestamp_cmp(
- txn_global->pinned_timestamp,
- txn_global->oldest_timestamp) == 0;
+ &txn_global->pinned_timestamp,
+ &txn_global->oldest_timestamp) == 0;
}
__wt_writeunlock(session, &txn_global->rwlock);
@@ -227,43 +290,98 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session)
int
__wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_CONFIG_ITEM cval;
+ WT_CONFIG_ITEM oldest_cval, stable_cval;
+ bool has_oldest, has_stable;
/*
* Look for a commit timestamp.
*/
- WT_RET(
- __wt_config_gets_def(session, cfg, "oldest_timestamp", 0, &cval));
- if (cval.len != 0) {
+ WT_RET(__wt_config_gets_def(session,
+ cfg, "oldest_timestamp", 0, &oldest_cval));
+ WT_RET(__wt_config_gets_def(session,
+ cfg, "stable_timestamp", 0, &stable_cval));
+ if (oldest_cval.len != 0)
+ has_oldest = true;
+ else
+ has_oldest = false;
+ if (stable_cval.len != 0)
+ has_stable = true;
+ else
+ has_stable = false;
+ if (has_oldest || has_stable) {
#ifdef HAVE_TIMESTAMPS
WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t oldest_timestamp;
-
- WT_RET(__wt_txn_parse_timestamp(
- session, "oldest", oldest_timestamp, &cval));
+ wt_timestamp_t oldest_ts, stable_ts;
+ txn_global = &S2C(session)->txn_global;
/*
- * This method can be called from multiple threads, check that
- * we are moving the global oldest timestamp forwards.
+ * Parsing will initialize the timestamp to zero even if
+ * it is not configured.
*/
- txn_global = &S2C(session)->txn_global;
+ WT_RET(__wt_txn_parse_timestamp(
+ session, "oldest", &oldest_ts, &oldest_cval));
+ WT_RET(__wt_txn_parse_timestamp(
+ session, "stable", &stable_ts, &stable_cval));
__wt_writelock(session, &txn_global->rwlock);
- if (!txn_global->has_oldest_timestamp || __wt_timestamp_cmp(
- txn_global->oldest_timestamp, oldest_timestamp) < 0) {
- __wt_timestamp_set(
- txn_global->oldest_timestamp, oldest_timestamp);
- txn_global->has_oldest_timestamp = true;
- txn_global->oldest_is_pinned = false;
+ /*
+ * First do error checking on the timestamp values. The
+ * oldest timestamp must always be less than or equal to
+ * the stable timestamp. If we're only setting one
+ * then compare against the system timestamp. If we're
+ * setting both then compare the passed in values.
+ */
+ if ((has_oldest && !has_stable && /* only oldest given */
+ txn_global->has_stable_timestamp &&
+ __wt_timestamp_cmp(&oldest_ts,
+ &txn_global->stable_timestamp) > 0) ||
+ (has_stable && !has_oldest && /* only stable given */
+ txn_global->has_oldest_timestamp &&
+ __wt_timestamp_cmp(&stable_ts,
+ &txn_global->oldest_timestamp) < 0) ||
+ (has_oldest && has_stable && /* both given */
+ __wt_timestamp_cmp(&oldest_ts, &stable_ts) > 0)) {
+ __wt_writeunlock(session, &txn_global->rwlock);
+ WT_RET_MSG(session, EINVAL,
+ "set_timestamp: oldest timestamp must not be "
+ "later than stable timestamp");
+ }
+ if (has_oldest) {
+ /*
+ * This method can be called from multiple threads,
+ * check that we are moving the global oldest
+ * timestamp forwards.
+ */
+ if (!txn_global->has_oldest_timestamp ||
+ __wt_timestamp_cmp(&txn_global->oldest_timestamp,
+ &oldest_ts) < 0) {
+ __wt_timestamp_set(
+ &txn_global->oldest_timestamp, &oldest_ts);
+ txn_global->has_oldest_timestamp = true;
+ txn_global->oldest_is_pinned = false;
+ }
+ }
+ if (has_stable) {
+ /*
+ * This method can be called from multiple threads,
+ * check that we are moving the global stable
+ * timestamp forwards.
+ */
+ if (!txn_global->has_stable_timestamp ||
+ __wt_timestamp_cmp(&txn_global->stable_timestamp,
+ &stable_ts) < 0) {
+ __wt_timestamp_set(
+ &txn_global->stable_timestamp, &stable_ts);
+ txn_global->has_stable_timestamp = true;
+ txn_global->stable_is_pinned = false;
+ }
}
__wt_writeunlock(session, &txn_global->rwlock);
-
WT_RET(__wt_txn_update_pinned_timestamp(session));
#else
- WT_RET_MSG(session, EINVAL, "oldest_timestamp requires a "
+ WT_RET_MSG(session, EINVAL, "set_timestamp requires a "
"version of WiredTiger built with timestamp support");
#endif
}
-
return (0);
}
@@ -286,7 +404,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN *txn = &session->txn;
WT_RET(__wt_txn_parse_timestamp(
- session, "commit", txn->commit_timestamp, &cval));
+ session, "commit", &txn->commit_timestamp, &cval));
__wt_txn_set_commit_timestamp(session);
#else
WT_RET_MSG(session, EINVAL, "commit_timestamp requires a "
@@ -298,6 +416,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
return (0);
}
+#ifdef HAVE_TIMESTAMPS
/*
* __wt_txn_set_commit_timestamp --
* Publish a transaction's commit timestamp.
@@ -305,19 +424,28 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
void
__wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session)
{
+ wt_timestamp_t ts;
WT_TXN *prev, *txn;
WT_TXN_GLOBAL *txn_global;
txn = &session->txn;
txn_global = &S2C(session)->txn_global;
- if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
+ if (F_ISSET(txn, WT_TXN_PUBLIC_TS_COMMIT))
return;
+ /*
+ * Copy the current commit timestamp (which can change while the
+ * transaction is running) into the first_commit_timestamp, which is
+ * fixed.
+ */
+ __wt_timestamp_set(&ts, &txn->commit_timestamp);
+ __wt_timestamp_set(&txn->first_commit_timestamp, &ts);
+
__wt_writelock(session, &txn_global->commit_timestamp_rwlock);
for (prev = TAILQ_LAST(&txn_global->commit_timestamph, __wt_txn_cts_qh);
- prev != NULL && __wt_timestamp_cmp(
- prev->commit_timestamp, txn->commit_timestamp) > 0;
+ prev != NULL &&
+ __wt_timestamp_cmp(&prev->first_commit_timestamp, &ts) > 0;
prev = TAILQ_PREV(prev, __wt_txn_cts_qh, commit_timestampq))
;
if (prev == NULL)
@@ -327,7 +455,7 @@ __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session)
TAILQ_INSERT_AFTER(&txn_global->commit_timestamph,
prev, txn, commit_timestampq);
__wt_writeunlock(session, &txn_global->commit_timestamp_rwlock);
- F_SET(txn, WT_TXN_HAS_TS_COMMIT);
+ F_SET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_PUBLIC_TS_COMMIT);
}
/*
@@ -343,12 +471,13 @@ __wt_txn_clear_commit_timestamp(WT_SESSION_IMPL *session)
txn = &session->txn;
txn_global = &S2C(session)->txn_global;
- if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
+ if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_COMMIT))
return;
__wt_writelock(session, &txn_global->commit_timestamp_rwlock);
TAILQ_REMOVE(&txn_global->commit_timestamph, txn, commit_timestampq);
__wt_writeunlock(session, &txn_global->commit_timestamp_rwlock);
+ F_CLR(txn, WT_TXN_PUBLIC_TS_COMMIT);
}
/*
@@ -364,13 +493,13 @@ __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session)
txn = &session->txn;
txn_global = &S2C(session)->txn_global;
- if (F_ISSET(txn, WT_TXN_HAS_TS_READ))
+ if (F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
return;
__wt_writelock(session, &txn_global->read_timestamp_rwlock);
for (prev = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh);
prev != NULL && __wt_timestamp_cmp(
- prev->read_timestamp, txn->read_timestamp) > 0;
+ &prev->read_timestamp, &txn->read_timestamp) > 0;
prev = TAILQ_PREV(prev, __wt_txn_rts_qh, read_timestampq))
;
if (prev == NULL)
@@ -380,7 +509,7 @@ __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session)
TAILQ_INSERT_AFTER(
&txn_global->read_timestamph, prev, txn, read_timestampq);
__wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
- F_SET(txn, WT_TXN_HAS_TS_READ);
+ F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_PUBLIC_TS_READ);
}
/*
@@ -396,11 +525,12 @@ __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session)
txn = &session->txn;
txn_global = &S2C(session)->txn_global;
- if (!F_ISSET(txn, WT_TXN_HAS_TS_READ))
+ if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
return;
__wt_writelock(session, &txn_global->read_timestamp_rwlock);
TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq);
__wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
- F_CLR(txn, WT_TXN_HAS_TS_READ);
+ F_CLR(txn, WT_TXN_PUBLIC_TS_READ);
}
+#endif
diff --git a/src/third_party/wiredtiger/test/checkpoint/smoke.sh b/src/third_party/wiredtiger/test/checkpoint/smoke.sh
index 3cf0b62094f..18ae42af49b 100755
--- a/src/third_party/wiredtiger/test/checkpoint/smoke.sh
+++ b/src/third_party/wiredtiger/test/checkpoint/smoke.sh
@@ -7,8 +7,9 @@ echo "checkpoint: 3 mixed tables"
$TEST_WRAPPER ./t -T 3 -t m
# Smoke-test timestamps
-echo "checkpoint: 3 mixed tables with timestamps"
-$TEST_WRAPPER ./t -T 3 -t m -s
+# Timestamp testing is commented as part of WT-3446
+# echo "checkpoint: 3 mixed tables with timestamps"
+# $TEST_WRAPPER ./t -T 3 -t m -s
# We are done unless long tests are enabled.
test "$TESTUTIL_ENABLE_LONG_TESTS" = "1" || exit 0
@@ -22,8 +23,8 @@ $TEST_WRAPPER ./t -T 6 -t l
echo "checkpoint: 6 mixed tables"
$TEST_WRAPPER ./t -T 6 -t m
-echo "checkpoint: 6 mixed tables with timestamps"
-$TEST_WRAPPER ./t -T 6 -t m -s
+# echo "checkpoint: 6 mixed tables with timestamps"
+# $TEST_WRAPPER ./t -T 6 -t m -s
echo "checkpoint: 6 row-store tables"
$TEST_WRAPPER ./t -T 6 -t r
@@ -31,5 +32,5 @@ $TEST_WRAPPER ./t -T 6 -t r
echo "checkpoint: 6 row-store tables, named checkpoint"
$TEST_WRAPPER ./t -c 'TeSt' -T 6 -t r
-echo "checkpoint: 6 row-store tables, named checkpoint"
-$TEST_WRAPPER ./t -c 'TeSt' -T 6 -t r -s
+# echo "checkpoint: 6 row-store tables, named checkpoint"
+# $TEST_WRAPPER ./t -c 'TeSt' -T 6 -t r -s
diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
index 2ee5aa912e4..bf5e645bb51 100644
--- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
+++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
@@ -93,10 +93,24 @@ main(int argc, char *argv[])
runs = atoi(__wt_optarg);
break;
case 's':
+ /*
+ * disabled below block temporarily to avoid spurious
+ * test failures as per ticket WT-3446 and
+ * to be reverted when WT-3386 is merged.
+ */
+#if 0
#ifdef HAVE_TIMESTAMPS
g.use_timestamps = true;
#endif
break;
+#endif
+ /*
+ * The below code segment to be deleted as part of
+ * reverting the above block i.e. WT-3386
+ */
+ fprintf(stderr,
+ "Checkpoint Timestamp testing is not supported\n");
+ return (EXIT_FAILURE);
case 't':
switch (__wt_optarg[0]) {
case 'c':
diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am
index 0b117a6588b..c3fadc8674b 100644
--- a/src/third_party/wiredtiger/test/csuite/Makefile.am
+++ b/src/third_party/wiredtiger/test/csuite/Makefile.am
@@ -57,6 +57,9 @@ noinst_PROGRAMS += test_wt3135_search_near_collator
test_wt3184_dup_index_collator_SOURCES = wt3184_dup_index_collator/main.c
noinst_PROGRAMS += test_wt3184_dup_index_collator
+test_wt3338_partial_update_SOURCES = wt3338_partial_update/main.c
+noinst_PROGRAMS += test_wt3338_partial_update
+
test_wt3363_checkpoint_op_races_SOURCES = wt3363_checkpoint_op_races/main.c
noinst_PROGRAMS += test_wt3363_checkpoint_op_races
diff --git a/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c b/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c
new file mode 100644
index 00000000000..9c8153a9881
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c
@@ -0,0 +1,317 @@
+/*-
+ * Public Domain 2014-2017 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "test_util.h"
+
+/*
+ * JIRA ticket reference: WT-3338
+ * Test case description: Smoke-test the partial update construction.
+ */
+
+#define DEBUG 0
+
+#define DATASIZE 1024
+
+#define MAX_MODIFY_ENTRIES 37
+static WT_MODIFY entries[MAX_MODIFY_ENTRIES]; /* Entries vector */
+static int nentries; /* Entries count */
+
+/*
+ * The replacement bytes array is 2x the maximum replacement string so we can
+ * offset into it by the maximum replacement string and still take a maximum
+ * replacement string without going past the end of the buffer.
+ */
+#define MAX_REPL_BYTES 17
+static char modify_repl[MAX_REPL_BYTES * 2]; /* Replacement bytes */
+
+static WT_RAND_STATE rnd; /* RNG state */
+
+#if DEBUG
+/*
+ * show --
+ * Dump out a buffer.
+ */
+static void
+show(WT_ITEM *buf, const char *tag)
+{
+ size_t i;
+ const uint8_t *a;
+
+ fprintf(stderr, "%s: %" WT_SIZET_FMT " bytes\n\t", tag, buf->size);
+ for (a = buf->data, i = 0; i < buf->size; ++i, ++a) {
+ if (isprint(*a))
+ fprintf(stderr, " %c", *a);
+ else
+ fprintf(stderr, " %#x", *a);
+ }
+ fprintf(stderr, "\n");
+}
+#endif
+
+/*
+ * modify_repl_init --
+ * Initialize the replacement information.
+ */
+static void
+modify_repl_init(void)
+{
+ size_t i;
+
+ for (i = 0; i < sizeof(modify_repl); ++i)
+ modify_repl[i] = "zyxwvutsrqponmlkjihgfedcba"[i % 26];
+}
+
+/*
+ * modify_build --
+ * Generate a set of modify vectors.
+ */
+static void
+modify_build(void)
+{
+ int i;
+
+ /* Mess up the entries. */
+ memset(entries, 0xff, MAX_MODIFY_ENTRIES * sizeof(entries[0]));
+
+ /*
+ * Randomly select a number of byte changes, offsets and lengths.
+ * Allow a value of 0, the API should accept it.
+ */
+ nentries = (int)(__wt_random(&rnd) % MAX_MODIFY_ENTRIES);
+ for (i = 0; i < nentries; ++i) {
+ entries[i].data.data =
+ modify_repl + __wt_random(&rnd) % MAX_REPL_BYTES;
+ entries[i].data.size =
+ (size_t)(__wt_random(&rnd) % MAX_REPL_BYTES);
+ entries[i].offset = (size_t)(__wt_random(&rnd) % DATASIZE);
+ entries[i].size = (size_t)(__wt_random(&rnd) % MAX_REPL_BYTES);
+ }
+#if DEBUG
+ for (i = 0; i < nentries; ++i)
+ printf(
+ "%d: {%.*s} %" WT_SIZET_FMT " bytes replacing %"
+ WT_SIZET_FMT " bytes @ %" WT_SIZET_FMT "\n",
+ i, (int)entries[i].data.size, entries[i].data.data,
+ entries[i].data.size, entries[i].size, entries[i].offset);
+#endif
+}
+
+/*
+ * slow_apply_api --
+ * Apply a set of modification changes using a different algorithm.
+ */
+static void
+slow_apply_api(WT_ITEM *orig)
+{
+ static WT_ITEM _tb;
+ WT_ITEM *ta, *tb, *tmp, _tmp;
+ size_t len, size;
+ int i;
+
+ ta = orig;
+ tb = &_tb;
+
+ /* Mess up anything not initialized in the buffers. */
+ memset((uint8_t *)ta->mem + ta->size, 0xff, ta->memsize - ta->size);
+ memset((uint8_t *)tb->mem, 0xff, tb->memsize);
+
+ /*
+ * Process the entries to figure out how large a buffer we need. This is
+ * a bit pessimistic because we're ignoring replacement bytes, but it's
+ * a simpler calculation.
+ */
+ for (size = ta->size, i = 0; i < nentries; ++i) {
+ if (entries[i].offset >= size)
+ size = entries[i].offset;
+ size += entries[i].data.size;
+ }
+
+ testutil_check(__wt_buf_grow(NULL, ta, size));
+ testutil_check(__wt_buf_grow(NULL, tb, size));
+
+#if DEBUG
+ show(ta, "slow-apply start");
+#endif
+ /*
+ * From the starting buffer, create a new buffer b based on changes
+ * in the entries array. We're doing a brute force solution here to
+ * test the faster solution implemented in the library.
+ */
+ for (i = 0; i < nentries; ++i) {
+ /* Take leading bytes from the original, plus any gap bytes. */
+ if (entries[i].offset >= ta->size) {
+ memcpy(tb->mem, ta->mem, ta->size);
+ if (entries[i].offset > ta->size)
+ memset((uint8_t *)tb->mem + ta->size,
+ '\0', entries[i].offset - ta->size);
+ } else
+ if (entries[i].offset > 0)
+ memcpy(tb->mem, ta->mem, entries[i].offset);
+ tb->size = entries[i].offset;
+
+ /* Take replacement bytes. */
+ if (entries[i].data.size > 0) {
+ memcpy((uint8_t *)tb->mem + tb->size,
+ entries[i].data.data, entries[i].data.size);
+ tb->size += entries[i].data.size;
+ }
+
+ /* Take trailing bytes from the original. */
+ len = entries[i].offset + entries[i].size;
+ if (ta->size > len) {
+ memcpy((uint8_t *)tb->mem + tb->size,
+ (uint8_t *)ta->mem + len, ta->size - len);
+ tb->size += ta->size - len;
+ }
+ testutil_assert(tb->size <= size);
+
+ /* Swap the buffers and do it again. */
+ tmp = ta;
+ ta = tb;
+ tb = tmp;
+ }
+ ta->data = ta->mem;
+ tb->data = tb->mem;
+
+ /*
+ * The final results may not be in the original buffer, in which case
+ * we swap them back around.
+ */
+ if (ta != orig) {
+ _tmp = *ta;
+ *ta = *tb;
+ *tb = _tmp;
+ }
+
+#if DEBUG
+ show(ta, "slow-apply finish");
+#endif
+}
+
+/*
+ * diff --
+ * Diff the two results.
+ */
+static void
+diff(WT_ITEM *local, WT_ITEM *library)
+{
+#if DEBUG
+ if (local->size != library->size ||
+ memcmp(local->data, library->data, local->size) != 0) {
+ fprintf(stderr, "results differ\n");
+ show(local, "local results");
+ show(library, "library results");
+ }
+#endif
+ testutil_assert(
+ local->size == library->size && memcmp(
+ local->data, library->data, local->size) == 0);
+}
+
+/*
+ * modify_init --
+ * Initialize the buffers to a known state.
+ */
+static void
+modify_init(WT_ITEM *local, WT_ITEM *library)
+{
+ size_t len;
+
+ len = (size_t)(__wt_random(&rnd) % MAX_REPL_BYTES);
+ testutil_check(__wt_buf_set(NULL, local, modify_repl, len));
+ testutil_check(__wt_buf_set(NULL, library, modify_repl, len));
+}
+
+static int nruns = 1000;
+
+/*
+ * modify_run
+ * Run some tests.
+ */
+static void
+modify_run(WT_ITEM *local, WT_ITEM *library, bool verbose)
+{
+ int i, j;
+
+ for (i = 0; i < nruns; ++i) {
+ modify_init(local, library);
+
+ for (j = 0; j < 1000; ++j) {
+ modify_build();
+
+ slow_apply_api(local);
+ testutil_check(__wt_modify_apply_api(
+ NULL, library, entries, nentries));
+
+ diff(local, library);
+ }
+ if (verbose) {
+ printf("%d (%d%%)\r", i, (i * 100) / nruns);
+ fflush(stdout);
+ }
+ }
+ if (verbose)
+ printf("\n");
+}
+
+int
+main(int argc, char *argv[])
+{
+ TEST_OPTS *opts, _opts;
+ WT_ITEM *local, _local, *library, _library;
+
+ if (testutil_is_flag_set("TESTUTIL_ENABLE_LONG_TESTS"))
+ nruns = 10000;
+
+ opts = &_opts;
+ memset(opts, 0, sizeof(*opts));
+ testutil_check(testutil_parse_opts(argc, argv, opts));
+ testutil_make_work_dir(opts->home);
+ testutil_check(
+ wiredtiger_open(opts->home, NULL, "create", &opts->conn));
+
+ /* Initialize the RNG. */
+ __wt_random_init_seed(NULL, &rnd);
+
+ /* Set up replacement information. */
+ modify_repl_init();
+
+ /* Allocate a pair of buffers. */
+ local = &_local;
+ memset(&_local, 0, sizeof(_local));
+ library = &_library;
+ memset(&_library, 0, sizeof(_library));
+
+ /* Run the test. */
+ modify_run(local, library, opts->verbose);
+
+ __wt_buf_free(NULL, local);
+ __wt_buf_free(NULL, library);
+
+ testutil_cleanup(opts);
+ return (EXIT_SUCCESS);
+}
diff --git a/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c b/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c
index ffef7f5fa9f..d007eb65382 100644
--- a/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c
@@ -78,19 +78,9 @@ main(int argc, char *argv[])
int i;
/*
- * This test should not run unless we have compiled with diagnostic
- * support and the long tests flag is set. The test will fail when
- * attempting to set the option to add the delays to checkpoints if
- * diagnostic mode is not enable and runs for 15 minutes.
+ * This test should not run unless long tests flag is set. The test
+ * runs for 15 minutes.
*/
-#if !defined(HAVE_DIAGNOSTIC)
- /*
- * Put the return in a conditional, otherwise some compilers will
- * complain that code beyond the return is unreachable.
- */
- if (true)
- return (EXIT_SUCCESS);
-#endif
if (!testutil_is_flag_set("WT3363_CHECKPOINT_OP_RACES"))
return (EXIT_SUCCESS);
@@ -102,7 +92,7 @@ main(int argc, char *argv[])
testutil_make_work_dir(opts->home);
testutil_check(wiredtiger_open(opts->home, &event_handler,
- "create,cache_size=1G,diagnostic_timing_stress=[checkpoint_slow]",
+ "create,cache_size=1G,timing_stress_for_test=[checkpoint_slow]",
&opts->conn));
testutil_check(pthread_create(
diff --git a/src/third_party/wiredtiger/test/format/bdb.c b/src/third_party/wiredtiger/test/format/bdb.c
index 6ee3e063cad..f3dd9e44f17 100644
--- a/src/third_party/wiredtiger/test/format/bdb.c
+++ b/src/third_party/wiredtiger/test/format/bdb.c
@@ -78,7 +78,7 @@ bdb_open(void)
assert(db->cursor(db, NULL, &dbc, 0) == 0);
g.dbc = dbc;
- key_gen_setup(&keyitem);
+ key_gen_init(&keyitem);
}
void
@@ -95,7 +95,7 @@ bdb_close(void)
assert(db->close(db, 0) == 0);
assert(dbenv->close(dbenv, 0) == 0);
- free(keyitem.mem);
+ key_gen_teardown(&keyitem);
}
void
diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c
index 5d236ec5b42..398471786e6 100644
--- a/src/third_party/wiredtiger/test/format/bulk.c
+++ b/src/third_party/wiredtiger/test/format/bulk.c
@@ -68,8 +68,8 @@ wts_load(void)
testutil_check(ret);
/* Set up the key/value buffers. */
- key_gen_setup(&key);
- val_gen_setup(NULL, &value);
+ key_gen_init(&key);
+ val_gen_init(&value);
for (;;) {
if (++g.key_cnt > g.c_rows) {
@@ -158,6 +158,6 @@ wts_load(void)
testutil_check(session->close(session, NULL));
- free(key.mem);
- free(value.mem);
+ key_gen_teardown(&key);
+ val_gen_teardown(&value);
}
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index f7aad2ae0a7..8d39e99c050 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -74,8 +74,11 @@ config_setup(void)
else
switch (mmrand(NULL, 1, 10)) {
case 1: /* 10% */
- config_single("file_type=fix", 0);
- break;
+ if (!config_is_perm("modify_pct")) {
+ config_single("file_type=fix", 0);
+ break;
+ }
+ /* FALLTHROUGH */
case 2: case 3: case 4: /* 30% */
config_single("file_type=var", 0);
break; /* 60% */
@@ -545,19 +548,36 @@ config_pct(void)
list[i].order = mmrand(NULL, 1, 1000);
if (pct > 100)
testutil_die(EINVAL,
- "operation percentages total to more than 100%%");
+ "operation percentages do not total to 100%%");
/* Cursor modify isn't possible for fixed-length column store. */
if (g.type == FIX) {
if (config_is_perm("modify_pct"))
testutil_die(EINVAL,
"WT_CURSOR.modify not supported by fixed-length "
- "column store or LSM");
+ "column store");
list[CONFIG_MODIFY_ENTRY].order = 0;
*list[CONFIG_MODIFY_ENTRY].vp = 0;
}
/*
+ * Cursor modify isn't possible for read-uncommitted transactions.
+ * If both forced, it's an error, else, prefer the forced one, else,
+ * prefer modify operations.
+ */
+ if (g.c_isolation_flag == ISOLATION_READ_UNCOMMITTED) {
+ if (config_is_perm("isolation")) {
+ if (config_is_perm("modify_pct"))
+ testutil_die(EINVAL,
+ "WT_CURSOR.modify not supported with "
+ "read-uncommitted transactions");
+ list[CONFIG_MODIFY_ENTRY].order = 0;
+ *list[CONFIG_MODIFY_ENTRY].vp = 0;
+ } else
+ config_single("isolation=random", 0);
+ }
+
+ /*
* If the delete percentage isn't nailed down, periodically set it to
* 0 so salvage gets run. Don't do it on the first run, all our smoke
* tests would hit it.
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index 30246ce69a1..7ba36a700c6 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -300,16 +300,22 @@ void config_setup(void);
void config_single(const char *, int);
void fclose_and_clear(FILE **);
void key_gen(WT_ITEM *, uint64_t);
+void key_gen_init(WT_ITEM *);
void key_gen_insert(WT_RAND_STATE *, WT_ITEM *, uint64_t);
-void key_gen_setup(WT_ITEM *);
-void key_len_setup(void);
+void key_gen_teardown(WT_ITEM *);
+void key_init(void);
WT_THREAD_RET lrt(void *);
void path_setup(const char *);
+void print_item(const char *, WT_ITEM *);
+void print_item_data(const char *, const uint8_t *, size_t);
int read_row(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
uint32_t rng(WT_RAND_STATE *);
void track(const char *, uint64_t, TINFO *);
void val_gen(WT_RAND_STATE *, WT_ITEM *, uint64_t);
-void val_gen_setup(WT_RAND_STATE *, WT_ITEM *);
+void val_gen_init(WT_ITEM *);
+void val_gen_teardown(WT_ITEM *);
+void val_init(void);
+void val_teardown(void);
void wts_close(void);
void wts_dump(const char *, int);
void wts_init(void);
diff --git a/src/third_party/wiredtiger/test/format/lrt.c b/src/third_party/wiredtiger/test/format/lrt.c
index 5073d5aad03..480db6e2c87 100644
--- a/src/third_party/wiredtiger/test/format/lrt.c
+++ b/src/third_party/wiredtiger/test/format/lrt.c
@@ -50,8 +50,8 @@ lrt(void *arg)
saved_keyno = 0; /* [-Werror=maybe-uninitialized] */
- key_gen_setup(&key);
- val_gen_setup(NULL, &value);
+ key_gen_init(&key);
+ val_gen_init(&value);
buf = NULL;
buf_len = buf_size = 0;
@@ -184,8 +184,8 @@ lrt(void *arg)
testutil_check(session->close(session, NULL));
- free(key.mem);
- free(value.mem);
+ key_gen_teardown(&key);
+ val_gen_teardown(&value);
free(buf);
return (WT_THREAD_RET_VALUE);
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index f8c275bb67d..8725243ba05 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -49,9 +49,23 @@ static void table_append_init(void);
#ifdef HAVE_BERKELEY_DB
static int notfound_chk(const char *, int, int, uint64_t);
-static void print_item(const char *, WT_ITEM *);
#endif
+static char modify_repl[256];
+
+/*
+ * modify_repl_init --
+ * Initialize the replacement information.
+ */
+static void
+modify_repl_init(void)
+{
+ size_t i;
+
+ for (i = 0; i < sizeof(modify_repl); ++i)
+ modify_repl[i] = "zyxwvutsrqponmlkjihgfedcba"[i % 26];
+}
+
/*
* wts_ops --
* Perform a number of operations in a set of threads.
@@ -76,6 +90,8 @@ wts_ops(int lastrun)
memset(&compat_tid, 0, sizeof(compat_tid));
memset(&lrt_tid, 0, sizeof(lrt_tid));
+ modify_repl_init();
+
/*
* There are two mechanisms to specify the length of the run, a number
* of operations and a timer, when either expire the run terminates.
@@ -226,25 +242,29 @@ wts_ops(int lastrun)
* isolation_config --
* Return an isolation configuration.
*/
-static inline const char *
-isolation_config(WT_RAND_STATE *rnd, bool *iso_snapshotp)
+static inline u_int
+isolation_config(WT_RAND_STATE *rnd, WT_SESSION *session)
{
+ const char *config;
u_int v;
if ((v = g.c_isolation_flag) == ISOLATION_RANDOM)
v = mmrand(rnd, 2, 4);
switch (v) {
case ISOLATION_READ_UNCOMMITTED:
- *iso_snapshotp = false;
- return ("isolation=read-uncommitted");
+ config = "isolation=read-uncommitted";
+ break;
case ISOLATION_READ_COMMITTED:
- *iso_snapshotp = false;
- return ("isolation=read-committed");
+ config = "isolation=read-committed";
+ break;
case ISOLATION_SNAPSHOT:
default:
- *iso_snapshotp = true;
- return ("isolation=snapshot");
+ v = ISOLATION_SNAPSHOT;
+ config = "isolation=snapshot";
+ break;
}
+ testutil_check(session->reconfigure(session, config));
+ return (v);
}
typedef struct {
@@ -382,28 +402,44 @@ snap_check(WT_CURSOR *cursor,
ret == WT_NOTFOUND ? 0 : *(uint8_t *)value->data);
/* NOTREACHED */
case ROW:
+ fprintf(stderr,
+ "snapshot-isolation %.*s search mismatch\n",
+ (int)key->size, (const char *)key->data);
+
+ if (start->deleted)
+ fprintf(stderr, "expected {deleted}\n");
+ else
+ print_item_data(
+ "expected", start->vdata, start->vsize);
+ if (ret == WT_NOTFOUND)
+ fprintf(stderr, "found {deleted}\n");
+ else
+ print_item_data(
+ " found", value->data, value->size);
+
testutil_die(ret,
- "snapshot-isolation: %.*s search: "
- "expected {%.*s}, found {%.*s}",
- (int)key->size, key->data,
- start->deleted ?
- (int)strlen("deleted") : (int)start->vsize,
- start->deleted ? "deleted" : start->vdata,
- ret == WT_NOTFOUND ?
- (int)strlen("deleted") : (int)value->size,
- ret == WT_NOTFOUND ? "deleted" : value->data);
+ "snapshot-isolation: %.*s search mismatch",
+ (int)key->size, key->data);
/* NOTREACHED */
case VAR:
+ fprintf(stderr,
+ "snapshot-isolation %" PRIu64 " search mismatch\n",
+ start->keyno);
+
+ if (start->deleted)
+ fprintf(stderr, "expected {deleted}\n");
+ else
+ print_item_data(
+ "expected", start->vdata, start->vsize);
+ if (ret == WT_NOTFOUND)
+ fprintf(stderr, "found {deleted}\n");
+ else
+ print_item_data(
+ " found", value->data, value->size);
+
testutil_die(ret,
- "snapshot-isolation: %" PRIu64 " search: "
- "expected {%.*s}, found {%.*s}",
- start->keyno,
- start->deleted ?
- (int)strlen("deleted") : (int)start->vsize,
- start->deleted ? "deleted" : start->vdata,
- ret == WT_NOTFOUND ?
- (int)strlen("deleted") : (int)value->size,
- ret == WT_NOTFOUND ? "deleted" : value->data);
+ "snapshot-isolation: %" PRIu64 " search mismatch",
+ start->keyno);
/* NOTREACHED */
}
}
@@ -462,10 +498,10 @@ ops(void *arg)
WT_SESSION *session;
uint64_t keyno, ckpt_op, reset_op, session_op;
uint32_t rnd;
- u_int i;
+ u_int i, iso_config;
int dir;
char *ckpt_config, ckpt_name[64];
- bool ckpt_available, intxn, iso_snapshot, positioned, readonly;
+ bool ckpt_available, intxn, positioned, readonly;
tinfo = arg;
@@ -474,7 +510,7 @@ ops(void *arg)
/* Initialize tracking of snapshot isolation transaction returns. */
snap = NULL;
- iso_snapshot = false;
+ iso_config = 0;
memset(snap_list, 0, sizeof(snap_list));
/* Initialize the per-thread random number generator. */
@@ -482,9 +518,9 @@ ops(void *arg)
/* Set up the default key and value buffers. */
key = &_key;
- key_gen_setup(key);
+ key_gen_init(key);
value = &_value;
- val_gen_setup(&tinfo->rnd, value);
+ val_gen_init(value);
/* Set the first operation where we'll create sessions and cursors. */
cursor = NULL;
@@ -645,13 +681,12 @@ skip_checkpoint: /* Pick the next checkpoint operation. */
*/
if (!SINGLETHREADED &&
!intxn && mmrand(&tinfo->rnd, 1, 100) >= g.c_txn_freq) {
- testutil_check(
- session->reconfigure(session,
- isolation_config(&tinfo->rnd, &iso_snapshot)));
+ iso_config = isolation_config(&tinfo->rnd, session);
testutil_check(
session->begin_transaction(session, NULL));
- snap = iso_snapshot ? snap_list : NULL;
+ snap =
+ iso_config == ISOLATION_SNAPSHOT ? snap_list : NULL;
intxn = true;
}
@@ -736,7 +771,7 @@ skip_checkpoint: /* Pick the next checkpoint operation. */
* of inserting.
*/
if (g.append_cnt >= g.append_max)
- goto update_instead_of_insert;
+ goto update_instead_of_chosen_op;
ret = col_insert(
tinfo, cursor, key, value, &keyno);
@@ -753,10 +788,17 @@ skip_checkpoint: /* Pick the next checkpoint operation. */
} else {
if (ret == WT_ROLLBACK && intxn)
goto deadlock;
- testutil_assert(ret == 0 || ret == WT_ROLLBACK);
+ testutil_assert(ret == WT_ROLLBACK);
}
break;
case MODIFY:
+ /*
+ * Change modify into update if in a read-uncommitted
+ * transaction, modify isn't supported in that case.
+ */
+ if (iso_config == ISOLATION_READ_UNCOMMITTED)
+ goto update_instead_of_chosen_op;
+
++tinfo->update;
switch (g.type) {
case ROW:
@@ -776,7 +818,7 @@ skip_checkpoint: /* Pick the next checkpoint operation. */
positioned = false;
if (ret == WT_ROLLBACK && intxn)
goto deadlock;
- testutil_assert(ret == 0 ||
+ testutil_assert(
ret == WT_NOTFOUND || ret == WT_ROLLBACK);
}
break;
@@ -822,7 +864,7 @@ skip_checkpoint: /* Pick the next checkpoint operation. */
}
break;
case UPDATE:
-update_instead_of_insert:
+update_instead_of_chosen_op:
++tinfo->update;
switch (g.type) {
case ROW:
@@ -843,7 +885,7 @@ update_instead_of_insert:
positioned = false;
if (ret == WT_ROLLBACK && intxn)
goto deadlock;
- testutil_assert(ret == 0 || ret == WT_ROLLBACK);
+ testutil_assert(ret == WT_ROLLBACK);
}
break;
}
@@ -860,6 +902,7 @@ update_instead_of_insert:
continue;
if (ret == WT_ROLLBACK && intxn)
goto deadlock;
+ testutil_assert(ret == WT_NOTFOUND);
break;
}
}
@@ -911,8 +954,8 @@ deadlock: ++tinfo->deadlock;
free(snap_list[i].kdata);
free(snap_list[i].vdata);
}
- free(key->mem);
- free(value->mem);
+ key_gen_teardown(key);
+ val_gen_teardown(value);
tinfo->state = TINFO_COMPLETE;
return (WT_THREAD_RET_VALUE);
@@ -935,8 +978,8 @@ wts_read_scan(void)
conn = g.wts_conn;
/* Set up the default key/value buffers. */
- key_gen_setup(&key);
- val_gen_setup(NULL, &value);
+ key_gen_init(&key);
+ val_gen_init(&value);
/* Open a session and cursor pair. */
testutil_check(conn->open_session(conn, NULL, NULL, &session));
@@ -972,8 +1015,8 @@ wts_read_scan(void)
testutil_check(session->close(session, NULL));
- free(key.mem);
- free(value.mem);
+ key_gen_teardown(&key);
+ val_gen_teardown(&value);
}
/*
@@ -1130,7 +1173,7 @@ nextprev(WT_CURSOR *cursor, int next)
session = cursor->session;
- /* Retrieve the BDB value. */
+ /* Retrieve the BDB key/value. */
bdb_np(next, &bdb_key.data, &bdb_key.size,
&bdb_value.data, &bdb_value.size, &notfound);
if (notfound_chk(
@@ -1138,28 +1181,25 @@ nextprev(WT_CURSOR *cursor, int next)
return (ret);
/* Compare the two. */
- if (g.type == ROW) {
- if (key.size != bdb_key.size ||
- memcmp(key.data, bdb_key.data, key.size) != 0) {
- fprintf(stderr, "nextprev: %s key mismatch:\n", which);
+ if ((g.type == ROW &&
+ (key.size != bdb_key.size ||
+ memcmp(key.data, bdb_key.data, key.size) != 0)) ||
+ (g.type != ROW && keyno != (uint64_t)atoll(bdb_key.data))) {
+ fprintf(stderr, "nextprev: %s KEY mismatch:\n", which);
+ goto mismatch;
+ }
+ if (value.size != bdb_value.size ||
+ memcmp(value.data, bdb_value.data, value.size) != 0) {
+ fprintf(stderr, "nextprev: %s VALUE mismatch:\n", which);
+mismatch: if (g.type == ROW) {
print_item("bdb-key", &bdb_key);
print_item(" wt-key", &key);
- testutil_die(0, NULL);
- }
- } else {
- if (keyno != (uint64_t)atoll(bdb_key.data)) {
- if ((p = strchr((char *)bdb_key.data, '.')) != NULL)
+ } else {
+ if ((p = (char *)strchr(bdb_key.data, '.')) != NULL)
*p = '\0';
- fprintf(stderr,
- "nextprev: %s key mismatch: %.*s != %" PRIu64 "\n",
- which,
+ fprintf(stderr, "\t%.*s != %" PRIu64 "\n",
(int)bdb_key.size, (char *)bdb_key.data, keyno);
- testutil_die(0, NULL);
}
- }
- if (value.size != bdb_value.size ||
- memcmp(value.data, bdb_value.data, value.size) != 0) {
- fprintf(stderr, "nextprev: %s value mismatch:\n", which);
print_item("bdb-value", &bdb_value);
print_item(" wt-value", &value);
testutil_die(0, NULL);
@@ -1174,7 +1214,7 @@ nextprev(WT_CURSOR *cursor, int next)
break;
case ROW:
(void)g.wt_api->msg_printf(
- g.wt_api, session, "%-10s{%.*s/%.*s}", which,
+ g.wt_api, session, "%-10s{%.*s}, {%.*s}", which,
(int)key.size, (char *)key.data,
(int)value.size, (char *)value.data);
break;
@@ -1254,110 +1294,28 @@ col_reserve(WT_CURSOR *cursor, uint64_t keyno, bool positioned)
/*
* modify_build --
- * Generate a set of modify vectors, and copy what the final result
- * should be into the value buffer.
+ * Generate a set of modify vectors.
*/
-static bool
-modify_build(TINFO *tinfo,
- WT_CURSOR *cursor, WT_MODIFY *entries, int *nentriesp, WT_ITEM *value)
+static void
+modify_build(TINFO *tinfo, WT_MODIFY *entries, int *nentriesp)
{
- static char repl[64];
- size_t len, size;
- u_int i, nentries;
- WT_ITEM *ta, _ta, *tb, _tb, *tmp;
-
- if (repl[0] == '\0')
- memset(repl, '+', sizeof(repl));
+ int i, nentries;
- ta = &_ta;
- memset(ta, 0, sizeof(*ta));
- tb = &_tb;
- memset(tb, 0, sizeof(*tb));
-
- testutil_check(cursor->get_value(cursor, value));
-
- /*
- * Randomly select a number of byte changes, offsets and lengths. Start
- * at least 11 bytes in so we skip the leading key information.
- */
- nentries = mmrand(&tinfo->rnd, 1, MAX_MODIFY_ENTRIES);
+ /* Randomly select a number of byte changes, offsets and lengths. */
+ nentries = (int)mmrand(&tinfo->rnd, 1, MAX_MODIFY_ENTRIES);
for (i = 0; i < nentries; ++i) {
- entries[i].data.data = repl;
+ entries[i].data.data = modify_repl +
+ mmrand(&tinfo->rnd, 1, sizeof(modify_repl) - 10);
entries[i].data.size = (size_t)mmrand(&tinfo->rnd, 0, 10);
+ /*
+ * Start at least 11 bytes into the buffer so we skip leading
+ * key information.
+ */
entries[i].offset = (size_t)mmrand(&tinfo->rnd, 20, 40);
entries[i].size = (size_t)mmrand(&tinfo->rnd, 0, 10);
}
- /*
- * Process the entries to figure out how large a buffer we need. This is
- * a bit pessimistic because we're ignoring replacement bytes, but it's
- * a simpler calculation.
- */
- for (size = cursor->value.size, i = 0; i < nentries; ++i) {
- if (entries[i].offset >= size)
- size = entries[i].offset;
- size += entries[i].data.size;
- }
-
- /* If size is larger than the available buffer size, skip this one. */
- if (size >= value->memsize)
- return (false);
-
- /* Allocate a pair of buffers. */
- ta->mem = dcalloc(size, sizeof(uint8_t));
- tb->mem = dcalloc(size, sizeof(uint8_t));
-
- /*
- * Use a brute-force process to create the value WiredTiger will create
- * from this change vector. Don't do anything tricky to speed it up, we
- * want to use a different algorithm from WiredTiger's, the idea is to
- * bug-check the library.
- */
- memcpy(ta->mem, value->data, value->size);
- ta->size = value->size;
- for (i = 0; i < nentries; ++i) {
- /* Take leading bytes from the original, plus any gap bytes. */
- if (entries[i].offset >= ta->size) {
- memcpy(tb->mem, ta->mem, ta->size);
- if (entries[i].offset > ta->size)
- memset((uint8_t *)tb->mem + ta->size,
- '\0', entries[i].offset - ta->size);
- } else
- if (entries[i].offset > 0)
- memcpy(tb->mem, ta->mem, entries[i].offset);
- tb->size = entries[i].offset;
-
- /* Take replacement bytes. */
- if (entries[i].data.size > 0) {
- memcpy((uint8_t *)tb->mem + tb->size,
- entries[i].data.data, entries[i].data.size);
- tb->size += entries[i].data.size;
- }
-
- /* Take trailing bytes from the original. */
- len = entries[i].offset + entries[i].size;
- if (ta->size > len) {
- memcpy((uint8_t *)tb->mem + tb->size,
- (uint8_t *)ta->mem + len, ta->size - len);
- tb->size += ta->size - len;
- }
- testutil_assert(tb->size <= size);
-
- tmp = ta;
- ta = tb;
- tb = tmp;
- }
-
- /* Copy the expected result into the value structure. */
- memcpy(value->mem, ta->mem, ta->size);
- value->data = value->mem;
- value->size = ta->size;
-
- free(ta->mem);
- free(tb->mem);
-
*nentriesp = (int)nentries;
- return (true);
}
/*
@@ -1375,31 +1333,12 @@ row_modify(TINFO *tinfo, WT_CURSOR *cursor,
if (!positioned) {
key_gen(key, keyno);
cursor->set_key(cursor, key);
- switch (ret = cursor->search(cursor)) {
- case 0:
- break;
- case WT_CACHE_FULL:
- case WT_ROLLBACK:
- return (WT_ROLLBACK);
- case WT_NOTFOUND:
- return (WT_NOTFOUND);
- default:
- testutil_die(ret,
- "row_modify: read row %" PRIu64 " by key", keyno);
- }
}
- /*
- * Generate a set of change vectors and copy the expected result into
- * the value buffer. If the return value is non-zero, there wasn't a
- * big enough value to work with, or for some reason we couldn't build
- * a reasonable change vector.
- */
- ret = WT_NOTFOUND;
- if (modify_build(tinfo, cursor, entries, &nentries, value))
- ret = cursor->modify(cursor, entries, nentries);
- switch (ret) {
+ modify_build(tinfo, entries, &nentries);
+ switch (ret = cursor->modify(cursor, entries, nentries)) {
case 0:
+ testutil_check(cursor->get_value(cursor, value));
break;
case WT_CACHE_FULL:
case WT_ROLLBACK:
@@ -1411,6 +1350,12 @@ row_modify(TINFO *tinfo, WT_CURSOR *cursor,
"row_modify: modify row %" PRIu64 " by key", keyno);
}
+ if (g.logging == LOG_OPS)
+ (void)g.wt_api->msg_printf(g.wt_api, cursor->session,
+ "%-10s{%.*s}, {%.*s}",
+ "modify",
+ (int)key->size, key->data, (int)value->size, value->data);
+
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
return (0);
@@ -1432,33 +1377,13 @@ col_modify(TINFO *tinfo, WT_CURSOR *cursor,
WT_MODIFY entries[MAX_MODIFY_ENTRIES];
int nentries;
- if (!positioned) {
+ if (!positioned)
cursor->set_key(cursor, keyno);
- switch (ret = cursor->search(cursor)) {
- case 0:
- break;
- case WT_CACHE_FULL:
- case WT_ROLLBACK:
- return (WT_ROLLBACK);
- case WT_NOTFOUND:
- return (WT_NOTFOUND);
- default:
- testutil_die(ret,
- "col_modify: read row %" PRIu64, keyno);
- }
- }
- /*
- * Generate a set of change vectors and copy the expected result into
- * the value buffer. If the return value is non-zero, there wasn't a
- * big enough value to work with, or for some reason we couldn't build
- * a reasonable change vector.
- */
- ret = WT_NOTFOUND;
- if (modify_build(tinfo, cursor, entries, &nentries, value))
- ret = cursor->modify(cursor, entries, nentries);
- switch (ret) {
+ modify_build(tinfo, entries, &nentries);
+ switch (ret = cursor->modify(cursor, entries, nentries)) {
case 0:
+ testutil_check(cursor->get_value(cursor, value));
break;
case WT_CACHE_FULL:
case WT_ROLLBACK:
@@ -1469,6 +1394,12 @@ col_modify(TINFO *tinfo, WT_CURSOR *cursor,
testutil_die(ret, "col_modify: modify row %" PRIu64, keyno);
}
+ if (g.logging == LOG_OPS)
+ (void)g.wt_api->msg_printf(g.wt_api, cursor->session,
+ "%-10s{%.*s}, {%.*s}",
+ "modify",
+ (int)key->size, key->data, (int)value->size, value->data);
+
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
return (0);
@@ -1910,35 +1841,4 @@ notfound_chk(const char *f, int wt_ret, int bdb_notfound, uint64_t keyno)
}
return (0);
}
-
-/*
- * print_item --
- * Display a single data/size pair, with a tag.
- */
-static void
-print_item(const char *tag, WT_ITEM *item)
-{
- static const char hex[] = "0123456789abcdef";
- const uint8_t *data;
- size_t size;
- u_char ch;
-
- data = item->data;
- size = item->size;
-
- fprintf(stderr, "\t%s {", tag);
- if (g.type == FIX)
- fprintf(stderr, "0x%02x", data[0]);
- else
- for (; size > 0; --size, ++data) {
- ch = data[0];
- if (__wt_isprint(ch))
- fprintf(stderr, "%c", (int)ch);
- else
- fprintf(stderr, "%x%x",
- hex[(data[0] & 0xf0) >> 4],
- hex[data[0] & 0x0f]);
- }
- fprintf(stderr, "}\n");
-}
#endif
diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c
index c70f6facfc5..baff2b7304c 100644
--- a/src/third_party/wiredtiger/test/format/t.c
+++ b/src/third_party/wiredtiger/test/format/t.c
@@ -178,7 +178,8 @@ main(int argc, char *argv[])
config_setup(); /* Run configuration */
config_print(0); /* Dump run configuration */
- key_len_setup(); /* Setup keys */
+ key_init(); /* Setup keys/values */
+ val_init();
start = time(NULL);
track("starting up", 0ULL, NULL);
@@ -254,6 +255,8 @@ main(int argc, char *argv[])
g.run_cnt, g.c_data_source,
g.c_file_type, difftime(time(NULL), start));
fflush(stdout);
+
+ val_teardown(); /* Teardown keys/values */
}
/* Flush/close any logging information. */
diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c
index 6d24073da1a..b350daf4dd7 100644
--- a/src/third_party/wiredtiger/test/format/util.c
+++ b/src/third_party/wiredtiger/test/format/util.c
@@ -33,7 +33,7 @@
#endif
void
-key_len_setup(void)
+key_init(void)
{
size_t i;
uint32_t max;
@@ -61,7 +61,7 @@ key_len_setup(void)
}
void
-key_gen_setup(WT_ITEM *key)
+key_gen_init(WT_ITEM *key)
{
size_t i, len;
char *p;
@@ -77,6 +77,13 @@ key_gen_setup(WT_ITEM *key)
key->size = 0;
}
+void
+key_gen_teardown(WT_ITEM *key)
+{
+ free(key->mem);
+ memset(key, 0, sizeof(*key));
+}
+
static void
key_gen_common(WT_ITEM *key, uint64_t keyno, const char * const suffix)
{
@@ -137,7 +144,9 @@ key_gen_insert(WT_RAND_STATE *rnd, WT_ITEM *key, uint64_t keyno)
key_gen_common(key, keyno, suffix[mmrand(rnd, 0, 14)]);
}
-static uint32_t val_dup_data_len; /* Length of duplicate data items */
+static char *val_base; /* Base/original value */
+static uint32_t val_dup_data_len; /* Length of duplicate data items */
+static uint32_t val_len; /* Length of data items */
static inline uint32_t
value_len(WT_RAND_STATE *rnd, uint64_t keyno, uint32_t min, uint32_t max)
@@ -157,12 +166,9 @@ value_len(WT_RAND_STATE *rnd, uint64_t keyno, uint32_t min, uint32_t max)
}
void
-val_gen_setup(WT_RAND_STATE *rnd, WT_ITEM *value)
+val_init(void)
{
- size_t i, len;
- char *p;
-
- memset(value, 0, sizeof(WT_ITEM));
+ size_t i;
/*
* Set initial buffer contents to recognizable text.
@@ -171,18 +177,37 @@ val_gen_setup(WT_RAND_STATE *rnd, WT_ITEM *value)
* into the buffer by a few extra bytes, used to generate different
* data for column-store run-length encoded files.
*/
- len = MAX(KILOBYTE(100), g.c_value_max) + 20;
- p = dmalloc(len);
- for (i = 0; i < len; ++i)
- p[i] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26];
+ val_len = MAX(KILOBYTE(100), g.c_value_max) + 20;
+ val_base = dmalloc(val_len);
+ for (i = 0; i < val_len; ++i)
+ val_base[i] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26];
- value->mem = p;
- value->memsize = len;
+ val_dup_data_len = value_len(NULL,
+ (uint64_t)mmrand(NULL, 1, 20), g.c_value_min, g.c_value_max);
+}
+
+void
+val_teardown(void)
+{
+ free(val_base);
+ val_base = NULL;
+ val_dup_data_len = val_len = 0;
+}
+
+void
+val_gen_init(WT_ITEM *value)
+{
+ value->mem = dmalloc(val_len);
+ value->memsize = val_len;
value->data = value->mem;
value->size = 0;
+}
- val_dup_data_len = value_len(rnd,
- (uint64_t)mmrand(rnd, 1, 20), g.c_value_min, g.c_value_max);
+void
+val_gen_teardown(WT_ITEM *value)
+{
+ free(value->mem);
+ memset(value, 0, sizeof(*value));
}
void
@@ -227,14 +252,16 @@ val_gen(WT_RAND_STATE *rnd, WT_ITEM *value, uint64_t keyno)
* variable-length column-stores use a duplicate data value to test RLE.
*/
if (g.type == VAR && mmrand(rnd, 1, 100) < g.c_repeat_data_pct) {
+ value->size = val_dup_data_len;
+ memcpy(p, val_base, value->size);
(void)strcpy(p, "DUPLICATEV");
p[10] = '/';
- value->size = val_dup_data_len;
} else {
- u64_to_string_zf(keyno, p, 11);
- p[10] = '/';
value->size =
value_len(rnd, keyno, g.c_value_min, g.c_value_max);
+ memcpy(p, val_base, value->size);
+ u64_to_string_zf(keyno, p, 11);
+ p[10] = '/';
}
}
@@ -563,3 +590,39 @@ compat(void *arg)
}
return (WT_THREAD_RET_VALUE);
}
+
+/*
+ * print_item_data --
+ * Display a single data/size pair, with a tag.
+ */
+void
+print_item_data(const char *tag, const uint8_t *data, size_t size)
+{
+ static const char hex[] = "0123456789abcdef";
+ u_char ch;
+
+ fprintf(stderr, "\t%s {", tag);
+ if (g.type == FIX)
+ fprintf(stderr, "0x%02x", data[0]);
+ else
+ for (; size > 0; --size, ++data) {
+ ch = data[0];
+ if (__wt_isprint(ch))
+ fprintf(stderr, "%c", (int)ch);
+ else
+ fprintf(stderr, "%x%x",
+ (u_int)hex[(data[0] & 0xf0) >> 4],
+ (u_int)hex[data[0] & 0x0f]);
+ }
+ fprintf(stderr, "}\n");
+}
+
+/*
+ * print_item --
+ * Display a single data/size pair, with a tag.
+ */
+void
+print_item(const char *tag, WT_ITEM *item)
+{
+ print_item_data(tag, item->data, item->size);
+}
diff --git a/src/third_party/wiredtiger/test/suite/test_compat01.py b/src/third_party/wiredtiger/test/suite/test_compat01.py
index 460d38d5e08..1c9f07c02f2 100644
--- a/src/third_party/wiredtiger/test/suite/test_compat01.py
+++ b/src/third_party/wiredtiger/test/suite/test_compat01.py
@@ -102,7 +102,7 @@ class test_compat01(wttest.WiredTigerTestCase, suite_subprocess):
contains = 0
with open('printlog.out') as logfile:
for line in logfile:
- if 'prev_lsn' in line:
+ if 'optype' in line and 'prev_lsn' in line:
contains += 1
self.assertEqual(prev_lsn_count, contains)
diff --git a/src/third_party/wiredtiger/test/suite/test_cursor12.py b/src/third_party/wiredtiger/test/suite/test_cursor12.py
index 827f37cfcef..cd1c3f29b87 100644
--- a/src/third_party/wiredtiger/test/suite/test_cursor12.py
+++ b/src/third_party/wiredtiger/test/suite/test_cursor12.py
@@ -26,140 +26,352 @@
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
+import random, string
import wiredtiger, wttest
+from helper import copy_wiredtiger_home
+from wtdataset import SimpleDataSet
from wtscenario import make_scenarios
# test_cursor12.py
# Test cursor modify call
class test_cursor12(wttest.WiredTigerTestCase):
+ keyfmt = [
+ ('recno', dict(keyfmt='r')),
+ ('string', dict(keyfmt='S')),
+ ]
types = [
('file', dict(uri='file:modify')),
('lsm', dict(uri='lsm:modify')),
('table', dict(uri='table:modify')),
]
- scenarios = make_scenarios(types)
+ scenarios = make_scenarios(types, keyfmt)
- # Smoke-test the modify API.
- def test_modify_smoke(self):
- # List with original value, final value, and modifications to get
- # there.
- list = [
- {
- 'o' : 'ABCDEFGH', # no operation
- 'f' : 'ABCDEFGH',
- 'mods' : [['', 0, 0]]
- },{
- 'o' : 'ABCDEFGH', # no operation with offset
- 'f' : 'ABCDEFGH',
- 'mods' : [['', 4, 0]]
- },{
- 'o' : 'ABCDEFGH', # rewrite beginning
- 'f' : '--CDEFGH',
- 'mods' : [['--', 0, 2]]
- },{
- 'o' : 'ABCDEFGH', # rewrite end
- 'f' : 'ABCDEF--',
- 'mods' : [['--', 6, 2]]
- },{
- 'o' : 'ABCDEFGH', # append
- 'f' : 'ABCDEFGH--',
- 'mods' : [['--', 8, 2]]
- },{
- 'o' : 'ABCDEFGH', # append with gap
- 'f' : 'ABCDEFGH\00\00--',
- 'mods' : [['--', 10, 2]]
- },{
- 'o' : 'ABCDEFGH', # multiple replacements
- 'f' : 'A-C-E-G-',
- 'mods' : [['-', 1, 1], ['-', 3, 1], ['-', 5, 1], ['-', 7, 1]]
- },{
- 'o' : 'ABCDEFGH', # multiple overlapping replacements
- 'f' : 'A-CDEFGH',
- 'mods' : [['+', 1, 1], ['+', 1, 1], ['+', 1, 1], ['-', 1, 1]]
- },{
- 'o' : 'ABCDEFGH', # multiple overlapping gap replacements
- 'f' : 'ABCDEFGH\00\00--',
- 'mods' : [['+', 10, 1], ['+', 10, 1], ['+', 10, 1], ['--', 10, 2]]
- },{
- 'o' : 'ABCDEFGH', # shrink beginning
- 'f' : '--EFGH',
- 'mods' : [['--', 0, 4]]
- },{
- 'o' : 'ABCDEFGH', # shrink middle
- 'f' : 'AB--GH',
- 'mods' : [['--', 2, 4]]
- },{
- 'o' : 'ABCDEFGH', # shrink end
- 'f' : 'ABCD--',
- 'mods' : [['--', 4, 4]]
- },{
- 'o' : 'ABCDEFGH', # grow beginning
- 'f' : '--ABCDEFGH',
- 'mods' : [['--', 0, 0]]
- },{
- 'o' : 'ABCDEFGH', # grow middle
- 'f' : 'ABCD--EFGH',
- 'mods' : [['--', 4, 0]]
- },{
- 'o' : 'ABCDEFGH', # grow end
- 'f' : 'ABCDEFGH--',
- 'mods' : [['--', 8, 0]]
- },{
- 'o' : 'ABCDEFGH', # discard beginning
- 'f' : 'EFGH',
- 'mods' : [['', 0, 4]]
- },{
- 'o' : 'ABCDEFGH', # discard middle
- 'f' : 'ABGH',
- 'mods' : [['', 2, 4]]
- },{
- 'o' : 'ABCDEFGH', # discard end
- 'f' : 'ABCD',
- 'mods' : [['', 4, 4]]
- },{
- 'o' : 'ABCDEFGH', # overlap the end and append
- 'f' : 'ABCDEF--XX',
- 'mods' : [['--XX', 6, 2]]
- },{
- 'o' : 'ABCDEFGH', # overlap the end with incorrect size
- 'f' : 'ABCDEFG01234567',
- 'mods' : [['01234567', 7, 2000]]
- }
- ]
-
- self.session.create(self.uri, 'key_format=S,value_format=u')
- cursor = self.session.open_cursor(self.uri, None, None)
-
- # For each test in the list, set the original value, apply modifications
- # in order, then confirm the final state.
- for i in list:
- cursor['ABC'] = i['o']
+ # List with original value, final value, and modifications to get
+ # there.
+ list = [
+ {
+ 'o' : 'ABCDEFGH', # no operation
+ 'f' : 'ABCDEFGH',
+ 'mods' : [['', 0, 0]]
+ },{
+ 'o' : 'ABCDEFGH', # no operation with offset
+ 'f' : 'ABCDEFGH',
+ 'mods' : [['', 4, 0]]
+ },{
+ 'o' : 'ABCDEFGH', # rewrite beginning
+ 'f' : '--CDEFGH',
+ 'mods' : [['--', 0, 2]]
+ },{
+ 'o' : 'ABCDEFGH', # rewrite end
+ 'f' : 'ABCDEF--',
+ 'mods' : [['--', 6, 2]]
+ },{
+ 'o' : 'ABCDEFGH', # append
+ 'f' : 'ABCDEFGH--',
+ 'mods' : [['--', 8, 2]]
+ },{
+ 'o' : 'ABCDEFGH', # append with gap
+ 'f' : 'ABCDEFGH\00\00--',
+ 'mods' : [['--', 10, 2]]
+ },{
+ 'o' : 'ABCDEFGH', # multiple replacements
+ 'f' : 'A-C-E-G-',
+ 'mods' : [['-', 1, 1], ['-', 3, 1], ['-', 5, 1], ['-', 7, 1]]
+ },{
+ 'o' : 'ABCDEFGH', # multiple overlapping replacements
+ 'f' : 'A-CDEFGH',
+ 'mods' : [['+', 1, 1], ['+', 1, 1], ['+', 1, 1], ['-', 1, 1]]
+ },{
+ 'o' : 'ABCDEFGH', # multiple overlapping gap replacements
+ 'f' : 'ABCDEFGH\00\00--',
+ 'mods' : [['+', 10, 1], ['+', 10, 1], ['+', 10, 1], ['--', 10, 2]]
+ },{
+ 'o' : 'ABCDEFGH', # shrink beginning
+ 'f' : '--EFGH',
+ 'mods' : [['--', 0, 4]]
+ },{
+ 'o' : 'ABCDEFGH', # shrink middle
+ 'f' : 'AB--GH',
+ 'mods' : [['--', 2, 4]]
+ },{
+ 'o' : 'ABCDEFGH', # shrink end
+ 'f' : 'ABCD--',
+ 'mods' : [['--', 4, 4]]
+ },{
+ 'o' : 'ABCDEFGH', # grow beginning
+ 'f' : '--ABCDEFGH',
+ 'mods' : [['--', 0, 0]]
+ },{
+ 'o' : 'ABCDEFGH', # grow middle
+ 'f' : 'ABCD--EFGH',
+ 'mods' : [['--', 4, 0]]
+ },{
+ 'o' : 'ABCDEFGH', # grow end
+ 'f' : 'ABCDEFGH--',
+ 'mods' : [['--', 8, 0]]
+ },{
+ 'o' : 'ABCDEFGH', # discard beginning
+ 'f' : 'EFGH',
+ 'mods' : [['', 0, 4]]
+ },{
+ 'o' : 'ABCDEFGH', # discard middle
+ 'f' : 'ABGH',
+ 'mods' : [['', 2, 4]]
+ },{
+ 'o' : 'ABCDEFGH', # discard end
+ 'f' : 'ABCD',
+ 'mods' : [['', 4, 4]]
+ },{
+ 'o' : 'ABCDEFGH', # discard everything
+ 'f' : '',
+ 'mods' : [['', 0, 8]]
+ },{
+ 'o' : 'ABCDEFGH', # overlap the end and append
+ 'f' : 'ABCDEF--XX',
+ 'mods' : [['--XX', 6, 2]]
+ },{
+ 'o' : 'ABCDEFGH', # overlap the end with incorrect size
+ 'f' : 'ABCDEFG01234567',
+ 'mods' : [['01234567', 7, 2000]]
+ },{ # many updates
+ 'o' : '-ABCDEFGHIJKLMNOPQRSTUVWXYZ-',
+ 'f' : '-eeeeeeeeeeeeeeeeeeeeeeeeee-',
+ 'mods' : [['a', 1, 1], ['a', 2, 1], ['a', 3, 1], ['a', 4, 1],
+ ['a', 5, 1], ['a', 6, 1], ['a', 7, 1], ['a', 8, 1],
+ ['a', 9, 1], ['a', 10, 1], ['a', 11, 1], ['a', 12, 1],
+ ['a', 13, 1], ['a', 14, 1], ['a', 15, 1], ['a', 16, 1],
+ ['a', 17, 1], ['a', 18, 1], ['a', 19, 1], ['a', 20, 1],
+ ['a', 21, 1], ['a', 22, 1], ['a', 23, 1], ['a', 24, 1],
+ ['a', 25, 1], ['a', 26, 1],
+ ['b', 1, 1], ['b', 2, 1], ['b', 3, 1], ['b', 4, 1],
+ ['b', 5, 1], ['b', 6, 1], ['b', 7, 1], ['b', 8, 1],
+ ['b', 9, 1], ['b', 10, 1], ['b', 11, 1], ['b', 12, 1],
+ ['b', 13, 1], ['b', 14, 1], ['b', 15, 1], ['b', 16, 1],
+ ['b', 17, 1], ['b', 18, 1], ['b', 19, 1], ['b', 20, 1],
+ ['b', 21, 1], ['b', 22, 1], ['b', 23, 1], ['b', 24, 1],
+ ['b', 25, 1], ['b', 26, 1],
+ ['c', 1, 1], ['c', 2, 1], ['c', 3, 1], ['c', 4, 1],
+ ['c', 5, 1], ['c', 6, 1], ['c', 7, 1], ['c', 8, 1],
+ ['c', 9, 1], ['c', 10, 1], ['c', 11, 1], ['c', 12, 1],
+ ['c', 13, 1], ['c', 14, 1], ['c', 15, 1], ['c', 16, 1],
+ ['c', 17, 1], ['c', 18, 1], ['c', 19, 1], ['c', 20, 1],
+ ['c', 21, 1], ['c', 22, 1], ['c', 23, 1], ['c', 24, 1],
+ ['c', 25, 1], ['c', 26, 1],
+ ['d', 1, 1], ['d', 2, 1], ['d', 3, 1], ['d', 4, 1],
+ ['d', 5, 1], ['d', 6, 1], ['d', 7, 1], ['d', 8, 1],
+ ['d', 9, 1], ['d', 10, 1], ['d', 11, 1], ['d', 12, 1],
+ ['d', 13, 1], ['d', 14, 1], ['d', 15, 1], ['d', 16, 1],
+ ['d', 17, 1], ['d', 18, 1], ['d', 19, 1], ['d', 20, 1],
+ ['d', 21, 1], ['d', 22, 1], ['d', 23, 1], ['d', 24, 1],
+ ['d', 25, 1], ['d', 26, 1],
+ ['e', 1, 1], ['e', 2, 1], ['e', 3, 1], ['e', 4, 1],
+ ['e', 5, 1], ['e', 6, 1], ['e', 7, 1], ['e', 8, 1],
+ ['e', 9, 1], ['e', 10, 1], ['e', 11, 1], ['e', 12, 1],
+ ['e', 13, 1], ['e', 14, 1], ['e', 15, 1], ['e', 16, 1],
+ ['e', 17, 1], ['e', 18, 1], ['e', 19, 1], ['e', 20, 1],
+ ['e', 21, 1], ['e', 22, 1], ['e', 23, 1], ['e', 24, 1],
+ ['e', 25, 1], ['e', 26, 1]]
+ }
+ ]
+ # Skip record number keys with LSM.
+ def skip(self):
+ return self.keyfmt == 'r' and 'lsm' in self.uri
+
+ # Create a set of modified records and verify in-memory reads.
+ def modify_load(self, ds, single):
+ # For each test in the list:
+ # set the original value,
+ # apply modifications in order,
+ # confirm the final state
+ row = 10
+ c = self.session.open_cursor(self.uri, None)
+ for i in self.list:
+ c.set_key(ds.key(row))
+ c.set_value(i['o'])
+ self.assertEquals(c.update(), 0)
+ c.reset()
+
+ c.set_key(ds.key(row))
mods = []
for j in i['mods']:
mod = wiredtiger.Modify(j[0], j[1], j[2])
mods.append(mod)
+ self.assertEquals(c.modify(mods), 0)
+ c.reset()
+
+ c.set_key(ds.key(row))
+ self.assertEquals(c.search(), 0)
+ self.assertEquals(c.get_value(), i['f'])
+
+ if not single:
+ row = row + 1
+ c.close()
+
+ # Confirm the modified records are correct.
+ def modify_confirm(self, ds, single):
+ # For each test in the list:
+ # confirm the final state is there.
+ row = 10
+ c = self.session.open_cursor(self.uri, None)
+ for i in self.list:
+ c.set_key(ds.key(row))
+ self.assertEquals(c.search(), 0)
+ self.assertEquals(c.get_value(), i['f'])
+
+ if not single:
+ row = row + 1
+ c.close()
+
+ # Smoke-test the modify API, operating on a group of records.
+ def test_modify_smoke(self):
+ if self.skip():
+ return
+
+ ds = SimpleDataSet(self,
+ self.uri, 100, key_format=self.keyfmt, value_format='u')
+ ds.populate()
+ self.modify_load(ds, False)
+
+ # Smoke-test the modify API, operating on a single record
+ def test_modify_smoke_single(self):
+ if self.skip():
+ return
+
+ ds = SimpleDataSet(self,
+ self.uri, 100, key_format=self.keyfmt, value_format='u')
+ ds.populate()
+ self.modify_load(ds, True)
+
+ # Smoke-test the modify API, closing and re-opening the database.
+ def test_modify_smoke_reopen(self):
+ if self.skip():
+ return
+
+ ds = SimpleDataSet(self,
+ self.uri, 100, key_format=self.keyfmt, value_format='u')
+ ds.populate()
+ self.modify_load(ds, False)
+
+ # Flush to disk, forcing reconciliation.
+ self.reopen_conn()
+
+ self.modify_confirm(ds, False)
+
+ # Smoke-test the modify API, recovering the database.
+ def test_modify_smoke_recover(self):
+ if self.skip():
+ return
+
+ # Close the original database.
+ self.conn.close()
+
+ # Open a new database with logging configured.
+ self.conn_config = \
+ 'log=(enabled=true),transaction_sync=(method=dsync,enabled)'
+ self.conn = self.setUpConnectionOpen(".")
+ self.session = self.setUpSessionOpen(self.conn)
+
+ # Populate a database, and checkpoint it so it exists after recovery.
+ ds = SimpleDataSet(self,
+ self.uri, 100, key_format=self.keyfmt, value_format='u')
+ ds.populate()
+ self.session.checkpoint()
+ self.modify_load(ds, False)
- cursor.set_key('ABC')
- cursor.modify(mods)
- self.assertEquals(str(cursor['ABC']), i['f'])
+ # Crash and recover in a new directory.
+ newdir = 'RESTART'
+ copy_wiredtiger_home('.', newdir)
+ self.conn.close()
+ self.conn = self.setUpConnectionOpen(newdir)
+ self.session = self.setUpSessionOpen(self.conn)
+ self.session.verify(self.uri)
+
+ self.modify_confirm(ds, False)
+
+ # Check that we can perform a large number of modifications to a record.
+ def test_modify_many(self):
+ ds = SimpleDataSet(self,
+ self.uri, 20, key_format=self.keyfmt, value_format='u')
+ ds.populate()
+
+ c = self.session.open_cursor(self.uri, None)
+ c.set_key(ds.key(10))
+ orig = 'abcdefghijklmnopqrstuvwxyz'
+ c.set_value(orig)
+ self.assertEquals(c.update(), 0)
+ for i in range(0, 50000):
+ new = "".join([random.choice(string.digits) for i in xrange(5)])
+ orig = orig[:10] + new + orig[15:]
+ mods = []
+ mod = wiredtiger.Modify(new, 10, 5)
+ mods.append(mod)
+ self.assertEquals(c.modify(mods), 0)
+
+ c.set_key(ds.key(10))
+ self.assertEquals(c.search(), 0)
+ self.assertEquals(c.get_value(), orig)
# Check that modify returns not-found after a delete.
def test_modify_delete(self):
- self.session.create(self.uri, 'key_format=S,value_format=u')
- cursor = self.session.open_cursor(self.uri, None, None)
- cursor['ABC'] = 'ABCDEFGH'
- cursor.set_key('ABC')
- cursor.remove()
+ ds = SimpleDataSet(self,
+ self.uri, 20, key_format=self.keyfmt, value_format='u')
+ ds.populate()
+
+ c = self.session.open_cursor(self.uri, None)
+ c.set_key(ds.key(10))
+ self.assertEquals(c.remove(), 0)
+
+ mods = []
+ mod = wiredtiger.Modify('ABCD', 3, 3)
+ mods.append(mod)
+
+ c.set_key(ds.key(10))
+ self.assertEqual(c.modify(mods), wiredtiger.WT_NOTFOUND)
+ # Check that modify returns not-found when an insert is not yet committed
+ # and after it's aborted.
+ def test_modify_abort(self):
+ ds = SimpleDataSet(self,
+ self.uri, 20, key_format=self.keyfmt, value_format='u')
+ ds.populate()
+
+ # Start a transaction.
+ self.session.begin_transaction()
+
+ # Insert a new record.
+ c = self.session.open_cursor(self.uri, None)
+ c.set_key(ds.key(30))
+ c.set_value(ds.value(30))
+ self.assertEquals(c.insert(), 0)
+
+ # Test that we can successfully modify our own record.
mods = []
mod = wiredtiger.Modify('ABCD', 3, 3)
mods.append(mod)
+ c.set_key(ds.key(30))
+ self.assertEqual(c.modify(mods), 0)
- cursor.set_key('ABC')
- #self.assertEqual(cursor.modify(mods), wiredtiger.WT_NOTFOUND)
- self.assertRaises(
- wiredtiger.WiredTigerError, lambda:cursor.modify(mods))
+ # Test that another transaction cannot modify our uncommitted record.
+ xs = self.conn.open_session()
+ xc = xs.open_cursor(self.uri, None)
+ xc.set_key(ds.key(30))
+ xc.set_value(ds.value(30))
+ mods = []
+ mod = wiredtiger.Modify('ABCD', 3, 3)
+ mods.append(mod)
+ xc.set_key(ds.key(30))
+ self.assertEqual(xc.modify(mods), wiredtiger.WT_NOTFOUND)
+
+ # Rollback our transaction.
+ self.session.rollback_transaction()
+
+ # Test that we can't modify our aborted insert.
+ mods = []
+ mod = wiredtiger.Modify('ABCD', 3, 3)
+ mods.append(mod)
+ c.set_key(ds.key(30))
+ self.assertEqual(c.modify(mods), wiredtiger.WT_NOTFOUND)
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_readonly03.py b/src/third_party/wiredtiger/test/suite/test_readonly03.py
index 6fe2942ca18..474e23981a2 100644
--- a/src/third_party/wiredtiger/test/suite/test_readonly03.py
+++ b/src/third_party/wiredtiger/test/suite/test_readonly03.py
@@ -43,8 +43,8 @@ class test_readonly03(wttest.WiredTigerTestCase, suite_subprocess):
conn_params = 'create,log=(enabled),'
conn_params_rd = 'readonly=true'
- session_ops = [ 'create', 'compact', 'drop', 'log_flush', 'log_printf',
- 'rebalance', 'rename', 'salvage', 'truncate', 'upgrade', ]
+ session_ops = [ 'alter', 'create', 'compact', 'drop', 'log_flush',
+ 'log_printf', 'rebalance', 'rename', 'salvage', 'truncate', 'upgrade', ]
cursor_ops = [ 'insert', 'remove', 'update', ]
def setUpConnectionOpen(self, dir):
@@ -86,7 +86,10 @@ class test_readonly03(wttest.WiredTigerTestCase, suite_subprocess):
self.fail('Unknown cursor operation: ' + op)
c.close()
for op in self.session_ops:
- if op == 'create':
+ if op == 'alter':
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.alter(self.uri, None), msg)
+ elif op == 'create':
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
lambda: self.session.create(self.uri2, create_params),
msg)
diff --git a/src/third_party/wiredtiger/test/suite/test_stat_log02.py b/src/third_party/wiredtiger/test/suite/test_stat_log02.py
index 322092c8190..e85b64721df 100644
--- a/src/third_party/wiredtiger/test/suite/test_stat_log02.py
+++ b/src/third_party/wiredtiger/test/suite/test_stat_log02.py
@@ -96,16 +96,21 @@ class test_stat_log02(wttest.WiredTigerTestCase):
json.loads(line)
def check_file_contains_tables(self, dir):
- files = glob.glob(dir + '/' + 'WiredTigerStat.[0-9]*')
- f = open(files[0], 'r')
- has_tables = False
- for line in f:
- data = json.loads(line)
- if "wiredTigerTables" in data:
- if "file:foo.wt" in data["wiredTigerTables"]:
- has_tables = True
+ # We wait for another 30 sleeps here to avoid erroring in the case where
+ # the stat log has only made the first pass and not yet printed the
+ # individual table stats.
+ number_sleeps = 0
+ while True:
+ files = glob.glob(dir + '/' + 'WiredTigerStat.[0-9]*')
+ f = open(files[0], 'r')
+ for line in f:
+ data = json.loads(line)
+ if "wiredTigerTables" in data:
+ if "file:foo.wt" in data["wiredTigerTables"]:
+ return
- self.assertTrue(has_tables)
+ number_sleeps += 1
+ self.assertLess(number_sleeps, 30)
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp02.py b/src/third_party/wiredtiger/test/suite/test_timestamp02.py
index 0ad007ec8e2..735e954fc7f 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp02.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp02.py
@@ -89,6 +89,8 @@ class test_timestamp02(wttest.WiredTigerTestCase, suite_subprocess):
c[k] = 1
self.session.commit_transaction('commit_timestamp=' + timestamp_str(k))
+ # Don't set a stable timestamp yet. Make sure we can read with
+ # a timestamp before the stable timestamp has been set.
# Now check that we see the expected state when reading at each
# timestamp
for i, t in enumerate(orig_keys):
@@ -106,6 +108,9 @@ class test_timestamp02(wttest.WiredTigerTestCase, suite_subprocess):
c[k] = 2
self.session.commit_transaction('commit_timestamp=' + timestamp_str(k + 100))
+ # Now the stable timestamp before we read.
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(200))
+
for i, t in enumerate(orig_keys):
self.check(self.session, 'read_timestamp=' + timestamp_str(t + 100),
dict((k, (2 if j <= i else 1)) for j, k in enumerate(orig_keys)))
@@ -121,6 +126,8 @@ class test_timestamp02(wttest.WiredTigerTestCase, suite_subprocess):
del c[k]
self.session.commit_transaction('commit_timestamp=' + timestamp_str(k + 200))
+ # We have to continue to advance the stable timestamp before reading.
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(300))
for i, t in enumerate(orig_keys):
self.check(self.session, 'read_timestamp=' + timestamp_str(t + 200),
dict((k, 2) for k in orig_keys[i+1:]))
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp03.py b/src/third_party/wiredtiger/test/suite/test_timestamp03.py
new file mode 100644
index 00000000000..9eb2359fedb
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp03.py
@@ -0,0 +1,271 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2017 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_timestamp03.py
+# Timestamps: checkpoints
+#
+
+from helper import copy_wiredtiger_home
+import random
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+def timestamp_str(t):
+ return '%x' % t
+
+def timestamp_ret_str(t):
+ s = timestamp_str(t)
+ if len(s) % 2 == 1:
+ s = '0' + s
+ return s
+
+class test_timestamp03(wttest.WiredTigerTestCase, suite_subprocess):
+ tablename = 'ts03_ts_nologged'
+ tablename2 = 'ts03_nots_logged'
+ tablename3 = 'ts03_ts_logged'
+
+ types = [
+ ('file', dict(uri='file:', use_cg=False, use_index=False)),
+ ('lsm', dict(uri='lsm:', use_cg=False, use_index=False)),
+ ('table-cg', dict(uri='table:', use_cg=True, use_index=False)),
+ ('table-index', dict(uri='table:', use_cg=False, use_index=True)),
+ ('table-simple', dict(uri='table:', use_cg=False, use_index=False)),
+ ]
+
+ ckpt = [
+ ('use_ts_def', dict(ckptcfg='', val='none')),
+ ('use_ts_false', dict(ckptcfg='use_timestamp=false', val='all')),
+ ('use_ts_true', dict(ckptcfg='use_timestamp=true', val='none')),
+ ('read_ts', dict(ckptcfg='read_timestamp', val='none')),
+ ]
+
+ conncfg = [
+ ('nolog', dict(conn_config='create', using_log=False)),
+ ('V1', dict(conn_config='create,log=(enabled),compatibility=(release="2.9")', using_log=True)),
+ ('V2', dict(conn_config='create,log=(enabled)', using_log=True)),
+ ]
+
+ scenarios = make_scenarios(types, ckpt, conncfg)
+
+ # Binary values.
+ value = u'\u0001\u0002abcd\u0003\u0004'
+ value2 = u'\u0001\u0002dcba\u0003\u0004'
+ value3 = u'\u0001\u0002cdef\u0003\u0004'
+
+ # Check that a cursor (optionally started in a new transaction), sees the
+ # expected values.
+ def check(self, session, txn_config, expected):
+ if txn_config:
+ session.begin_transaction(txn_config)
+ c = session.open_cursor(self.uri + self.tablename, None)
+ actual = dict((k, v) for k, v in c if v != 0)
+ self.assertEqual(actual, expected)
+ # Search for the expected items as well as iterating
+ for k, v in expected.iteritems():
+ self.assertEqual(c[k], v, "for key " + str(k))
+ c.close()
+ if txn_config:
+ session.commit_transaction()
+ #
+ # Take a backup of the database and verify that the value we want to
+ # check exists in the tables the expected number of times.
+ #
+ def backup_check(self, check_value, valcnt, valcnt2, valcnt3):
+ newdir = "BACKUP"
+ copy_wiredtiger_home('.', newdir, True)
+
+ conn = self.setUpConnectionOpen(newdir)
+ session = self.setUpSessionOpen(conn)
+ c = session.open_cursor(self.uri + self.tablename, None)
+ c2 = session.open_cursor(self.uri + self.tablename2, None)
+ c3 = session.open_cursor(self.uri + self.tablename3, None)
+ # Count how many times the second value is present
+ count = 0
+ for k, v in c:
+ if check_value in str(v):
+ # print "check_value found in key " + str(k)
+ count += 1
+ c.close()
+ # Count how many times the second value is present in the
+ # non-timestamp table.
+ count2 = 0
+ for k, v in c2:
+ if check_value in str(v):
+ # print "check_value found in key " + str(k)
+ count2 += 1
+ c2.close()
+ # Count how many times the second value is present in the
+ # logged timestamp table.
+ count3 = 0
+ for k, v in c3:
+ if check_value in str(v):
+ count3 += 1
+ c3.close()
+ conn.close()
+ # print "CHECK BACKUP: Count " + str(count) + " Count2 " + str(count2) + " Count3 " + str(count3)
+ # print "CHECK BACKUP: Expect value2 count " + str(valcnt)
+ # print "CHECK BACKUP: 2nd table Expect value2 count " + str(valcnt2)
+ # print "CHECK BACKUP: 3rd table Expect value2 count " + str(valcnt3)
+ # print "CHECK BACKUP: config " + str(self.ckptcfg)
+ self.assertEqual(count, valcnt)
+ self.assertEqual(count2, valcnt2)
+ self.assertEqual(count3, valcnt3)
+
+ # Check that a cursor sees the expected values after a checkpoint.
+ def ckpt_backup(self, check_value, valcnt, valcnt2, valcnt3):
+
+ # Take a checkpoint. Make a copy of the database. Open the
+ # copy and verify whether or not the expected data is in there.
+ self.pr("CKPT: " + self.ckptcfg)
+ ckptcfg = self.ckptcfg
+ if ckptcfg == 'read_timestamp':
+ ckptcfg = self.ckptcfg + '=' + self.oldts
+ # print "CKPT: " + ckptcfg
+ self.session.checkpoint(ckptcfg)
+ self.backup_check(check_value, valcnt, valcnt2, valcnt3)
+
+ def test_timestamp03(self):
+ if not wiredtiger.timestamp_build():
+ self.skipTest('requires a timestamp build')
+
+ uri = self.uri + self.tablename
+ uri2 = self.uri + self.tablename2
+ uri3 = self.uri + self.tablename3
+ #
+ # Open three tables:
+ # 1. Table is not logged and uses timestamps.
+ # 2. Table is logged and does not use timestamps.
+ # 3. Table is logged and uses timestamps.
+ #
+ self.session.create(uri, 'key_format=i,value_format=S,log=(enabled=false)')
+ c = self.session.open_cursor(uri)
+ self.session.create(uri2, 'key_format=i,value_format=S')
+ c2 = self.session.open_cursor(uri2)
+ self.session.create(uri3, 'key_format=i,value_format=S')
+ c3 = self.session.open_cursor(uri3)
+
+ # Insert keys 1..100 each with timestamp=key, in some order
+ nkeys = 100
+ orig_keys = range(1, nkeys+1)
+ keys = orig_keys[:]
+ random.shuffle(keys)
+
+ for k in keys:
+ c2[k] = self.value
+ self.session.begin_transaction()
+ c[k] = self.value
+ c3[k] = self.value
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(k))
+
+ # Now check that we see the expected state when reading at each
+ # timestamp
+ for i, t in enumerate(orig_keys):
+ self.check(self.session, 'read_timestamp=' + timestamp_str(t),
+ dict((k, self.value) for k in orig_keys[:i+1]))
+
+ # Bump the oldest timestamp, we're not going back...
+ self.assertEqual(self.conn.query_timestamp(), timestamp_ret_str(100))
+ self.oldts = timestamp_str(100)
+ self.conn.set_timestamp('oldest_timestamp=' + self.oldts)
+ self.conn.set_timestamp('stable_timestamp=' + self.oldts)
+ # print "Oldest " + self.oldts
+
+ # Update them and retry.
+ random.shuffle(keys)
+ count = 0
+ for k in keys:
+ # Make sure a timestamp cursor is the last one to update. This
+ # tests the scenario for a bug we found where recovery replayed
+ # the last record written into the log.
+ #
+ # print "Key " + str(k) + " to value2"
+ c2[k] = self.value2
+ self.session.begin_transaction()
+ c[k] = self.value2
+ c3[k] = self.value2
+ ts = timestamp_str(k + 100)
+ self.session.commit_transaction('commit_timestamp=' + ts)
+ # print "Commit key " + str(k) + " ts " + ts
+ count += 1
+ # print "Updated " + str(count) + " keys to value2"
+
+ # Take a checkpoint using the given configuration. Then verify
+ # whether value2 appears in a copy of that data or not.
+ valcnt2 = nkeys
+ if self.val == 'all':
+ valcnt = nkeys
+ else:
+ valcnt = 0
+ # XXX adjust when logged + timestamps is fixed and defined.
+ valcnt3 = valcnt
+ self.ckpt_backup(self.value2, valcnt, valcnt2, valcnt3)
+ if self.ckptcfg != 'read_timestamp':
+ # Update the stable timestamp to the latest, but not the oldest
+ # timestamp and make sure we can see the data. Once the stable
+ # timestamp is moved we should see all keys with value2.
+ self.conn.set_timestamp('stable_timestamp=' + \
+ timestamp_str(100+nkeys))
+ self.ckpt_backup(self.value2, nkeys, nkeys, nkeys)
+
+ # If we're not using the log we're done.
+ if not self.using_log:
+ return
+
+ # Update them and retry. This time take a backup and recover.
+ random.shuffle(keys)
+ count = 0
+ for k in keys:
+ # Make sure a timestamp cursor is the last one to update. This
+ # tests the scenario for a bug we found where recovery replayed
+ # the last record written into the log.
+ #
+ # print "Key " + str(k) + " to value3"
+ c2[k] = self.value3
+ self.session.begin_transaction()
+ c[k] = self.value3
+ c3[k] = self.value3
+ ts = timestamp_str(k + 200)
+ self.session.commit_transaction('commit_timestamp=' + ts)
+ # print "Commit key " + str(k) + " ts " + ts
+ count += 1
+ # print "Updated " + str(count) + " keys to value3"
+
+ # Flush the log but don't checkpoint
+ self.session.log_flush('sync=on')
+
+ # Take a backup and then verify whether value3 appears in a copy
+ # of that data or not. Both tables that are logged should see
+ # all the data regardless of timestamps. The table that is not
+ # logged should not see any of it.
+ valcnt = 0
+ valcnt2 = valcnt3 = nkeys
+ self.backup_check(self.value3, valcnt, valcnt2, valcnt3)
+
+if __name__ == '__main__':
+ wttest.run()