diff options
Diffstat (limited to 'src/third_party')
65 files changed, 1856 insertions, 519 deletions
diff --git a/src/third_party/wiredtiger/README b/src/third_party/wiredtiger/README index 5f7849fdf38..7463219341f 100644 --- a/src/third_party/wiredtiger/README +++ b/src/third_party/wiredtiger/README @@ -1,4 +1,4 @@ -WiredTiger 2.6.1: (May 15, 2015) +WiredTiger 2.6.1: (May 28, 2015) This is version 2.6.1 of WiredTiger. @@ -6,22 +6,24 @@ WiredTiger release packages and documentation can be found at: http://source.wiredtiger.com/ +The documentation for this specific release can be found at: + + http://source.wiredtiger.com/2.6.1/index.html + +The WiredTiger source code can be found at: + + http://github.com/wiredtiger/wiredtiger + WiredTiger uses JIRA for issue management: http://jira.mongodb.org/browse/WT Please do not report issues through GitHub. -Information on configuring, building and installing WiredTiger can be -found at: - - http://source.wiredtiger.com/2.6.1/install.html - WiredTiger licensing information can be found at: http://source.wiredtiger.com/license.html -For general questions and discussion, please use the WiredTiger mailing -list: +For general questions and discussion, there's a WiredTiger group: http://groups.google.com/group/wiredtiger-users diff --git a/src/third_party/wiredtiger/build_posix/Make.subdirs b/src/third_party/wiredtiger/build_posix/Make.subdirs index 33d3cbb8bc1..0fa9e030659 100644 --- a/src/third_party/wiredtiger/build_posix/Make.subdirs +++ b/src/third_party/wiredtiger/build_posix/Make.subdirs @@ -14,6 +14,7 @@ ext/compressors/zlib ZLIB ext/datasources/helium HAVE_HELIUM ext/encryptors/nop ext/encryptors/rotn +ext/extractors/csv ext/test/kvs_bdb HAVE_BERKELEY_DB . api/leveldb LEVELDB diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_func_posix_memalign.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_func_posix_memalign.m4 new file mode 100644 index 00000000000..bd60adcbc81 --- /dev/null +++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_func_posix_memalign.m4 @@ -0,0 +1,50 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_func_posix_memalign.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_FUNC_POSIX_MEMALIGN +# +# DESCRIPTION +# +# Some versions of posix_memalign (notably glibc 2.2.5) incorrectly apply +# their power-of-two check to the size argument, not the alignment +# argument. AX_FUNC_POSIX_MEMALIGN defines HAVE_POSIX_MEMALIGN if the +# power-of-two check is correctly applied to the alignment argument. +# +# LICENSE +# +# Copyright (c) 2008 Scott Pakin <pakin@uiuc.edu> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 7 + +AC_DEFUN([AX_FUNC_POSIX_MEMALIGN], +[AC_CACHE_CHECK([for working posix_memalign], + [ax_cv_func_posix_memalign_works], + [AC_TRY_RUN([ +#include <stdlib.h> + +int +main () +{ + void *buffer; + + /* Some versions of glibc incorrectly perform the alignment check on + * the size word. */ + exit (posix_memalign (&buffer, sizeof(void *), 123) != 0); +} + ], + [ax_cv_func_posix_memalign_works=yes], + [ax_cv_func_posix_memalign_works=no], + [ax_cv_func_posix_memalign_works=no])]) +if test "$ax_cv_func_posix_memalign_works" = "yes" ; then + AC_DEFINE([HAVE_POSIX_MEMALIGN], [1], + [Define to 1 if `posix_memalign' works.]) +fi +]) diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in index d93793a997b..4bfb4df7fa2 100644 --- a/src/third_party/wiredtiger/build_posix/configure.ac.in +++ b/src/third_party/wiredtiger/build_posix/configure.ac.in @@ -80,12 +80,16 @@ AC_CHECK_LIB(rt, sched_yield) AC_CHECK_FUNCS([\ clock_gettime fallocate fcntl fread_unlocked ftruncate gettimeofday\ - posix_fadvise posix_fallocate posix_madvise posix_memalign\ + posix_fadvise posix_fallocate posix_madvise\ strtouq sync_file_range]) # OS X wrongly reports that it has fdatasync AS_CASE([$host_os], [darwin*], [], [AC_CHECK_FUNCS([fdatasync])]) +# Check for posix_memalign explicitly: it is a builtin in some compilers and +# the generic declaration in AC_CHECK_FUNCS is incompatible. +AX_FUNC_POSIX_MEMALIGN + AC_SYS_LARGEFILE AC_C_BIGENDIAN diff --git a/src/third_party/wiredtiger/build_win/filelist.win b/src/third_party/wiredtiger/build_win/filelist.win index 4ec5d51ce51..099451e418d 100644 --- a/src/third_party/wiredtiger/build_win/filelist.win +++ b/src/third_party/wiredtiger/build_win/filelist.win @@ -168,4 +168,5 @@ src/txn/txn.c src/txn/txn_ckpt.c src/txn/txn_ext.c src/txn/txn_log.c +src/txn/txn_nsnap.c src/txn/txn_recover.c diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index fbf71581fe9..db1bc85add4 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -790,6 +790,14 @@ methods = { type='boolean'), ]), 'WT_SESSION.strerror' : Method([]), +'WT_SESSION.transaction_sync' : Method([ + Config('timeout_ms', '', r''' + maximum amount of time to wait for background sync to complete in + milliseconds. A value of zero disables the timeout and returns + immediately. The default waits forever.''', + type='int'), +]), + 'WT_SESSION.truncate' : Method([]), 'WT_SESSION.upgrade' : Method([]), 'WT_SESSION.verify' : Method([ @@ -826,13 +834,21 @@ methods = { priority of the transaction for resolving conflicts. Transactions with higher values are less likely to abort''', min='-100', max='100'), + Config('snapshot', '', r''' + use a named, in-memory snapshot, see + @ref transaction_named_snapshots'''), Config('sync', '', r''' whether to sync log records when the transaction commits, inherited from ::wiredtiger_open \c transaction_sync''', type='boolean'), ]), -'WT_SESSION.commit_transaction' : Method([]), +'WT_SESSION.commit_transaction' : Method([ + Config('sync', '', r''' + override whether to sync log records when the transaction commits, + inherited from ::wiredtiger_open \c transaction_sync''', + choices=['background', 'off', 'on']), +]), 'WT_SESSION.rollback_transaction' : Method([]), 'WT_SESSION.checkpoint' : Method([ @@ -857,6 +873,24 @@ methods = { if non-empty, checkpoint the list of objects''', type='list'), ]), +'WT_SESSION.snapshot' : Method([ + Config('drop', '', r''' + if non-empty, specifies which snapshots to drop. Where a group + of snapshots are being dropped, the order is based on snapshot + creation order not alphanumeric name order''', + type='category', subconfig=[ + Config('all', 'false', r''' + drop all named snapshots''', type='boolean'), + Config('before', '', r''' + drop all snapshots up to but not including the specified name'''), + Config('names', '', r''' + drop specific named snapshots''', type='list'), + Config('to', '', r''' + drop all snapshots up to and including the specified name.'''), + ]), + Config('name', '', r'''specify a name for the snapshot'''), +]), + 'WT_CONNECTION.add_collator' : Method([]), 'WT_CONNECTION.add_compressor' : Method([]), 'WT_CONNECTION.add_data_source' : Method([]), diff --git a/src/third_party/wiredtiger/dist/extlist b/src/third_party/wiredtiger/dist/extlist index fb4540653b2..874d21289d2 100644 --- a/src/third_party/wiredtiger/dist/extlist +++ b/src/third_party/wiredtiger/dist/extlist @@ -9,3 +9,4 @@ ext/compressors/zlib/zlib_compress.c ext/datasources/helium/helium.c ext/encryptors/nop/nop_encrypt.c ext/encryptors/rotn/rotn_encrypt.c +ext/extractors/csv/csv_extractor.c diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 8d797c36e47..c3321cf845d 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -166,4 +166,5 @@ src/txn/txn.c src/txn/txn_ckpt.c src/txn/txn_ext.c src/txn/txn_log.c +src/txn/txn_nsnap.c src/txn/txn_recover.c diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index 544e3b5d549..ae97740073d 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -29,6 +29,7 @@ flags = { 'LOGSCAN_RECOVER', ], 'log_write' : [ + 'LOG_BACKGROUND', 'LOG_DSYNC', 'LOG_FLUSH', 'LOG_FSYNC', @@ -55,7 +56,7 @@ flags = { 'SKIP_UPDATE_RESTORE', ], 'txn_log_checkpoint' : [ - 'TXN_LOG_CKPT_FAIL', + 'TXN_LOG_CKPT_CLEANUP', 'TXN_LOG_CKPT_PREPARE', 'TXN_LOG_CKPT_START', 'TXN_LOG_CKPT_STOP', diff --git a/src/third_party/wiredtiger/dist/log.py b/src/third_party/wiredtiger/dist/log.py index abe72cea5c4..da4a9c10727 100644 --- a/src/third_party/wiredtiger/dist/log.py +++ b/src/third_party/wiredtiger/dist/log.py @@ -12,7 +12,7 @@ tmp_file = '__tmp' field_types = { 'string' : ('const char *', 'S', '%s', 'arg', ''), 'item' : ('WT_ITEM *', 'u', '%s', 'escaped', - 'WT_RET(__logrec_jsonify_str(session, &escaped, &arg));'), + 'WT_ERR(__logrec_jsonify_str(session, &escaped, &arg));'), 'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg', ''), 'uint32' : ('uint32_t', 'I', '%" PRIu32 "', 'arg', ''), 'uint64' : ('uint64_t', 'Q', '%" PRIu64 "', 'arg', ''), @@ -256,31 +256,35 @@ int __wt_logop_%(name)s_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) { -\t%(arg_decls)s +%(arg_ret)s\t%(arg_decls)s \t%(arg_init)sWT_RET(__wt_logop_%(name)s_unpack( \t session, pp, end%(arg_addrs)s)); -\tfprintf(out, " \\"optype\\": \\"%(name)s\\",\\n"); +\tWT_RET(__wt_fprintf(out, " \\"optype\\": \\"%(name)s\\",\\n")); \t%(print_args)s -\t%(arg_fini)sreturn (0); +%(arg_fini)s } ''' % { 'name' : optype.name, + 'arg_ret' : ('\tWT_DECL_RET;\n' if has_escape(optype.fields) else ''), 'arg_decls' : ('\n\t'.join('%s%s%s;' % (clocaltype(f), '' if clocaltype(f)[-1] == '*' else ' ', f[1]) for f in optype.fields)) + escape_decl(optype.fields), 'arg_init' : ('escaped = NULL;\n\t' if has_escape(optype.fields) else ''), - 'arg_fini' : ('__wt_free(session, escaped);\n\t' - if has_escape(optype.fields) else ''), + 'arg_fini' : ('\nerr:\t__wt_free(session, escaped);\n\treturn (ret);' + if has_escape(optype.fields) else '\treturn (0);'), 'arg_addrs' : ''.join(', &%s' % f[1] for f in optype.fields), 'print_args' : '\n\t'.join( - '%sfprintf(out, " \\"%s\\": \\"%s\\",\\n",%s);' % - (printf_setup(f), f[1], printf_fmt(f), printf_arg(f)) + '%s%s(__wt_fprintf(out,\n\t " \\"%s\\": \\"%s\\",\\n",%s));' % + (printf_setup(f), + 'WT_ERR' if has_escape(optype.fields) else 'WT_RET', + f[1], printf_fmt(f), printf_arg(f)) for f in optype.fields[:-1]) + str( - '\n\t%sfprintf(out, " \\"%s\\": \\"%s\\"",%s);' % - (printf_setup(last_field), last_field[1], - printf_fmt(last_field), printf_arg(last_field))), + '\n\t%s%s(__wt_fprintf(out,\n\t " \\"%s\\": \\"%s\\"",%s));' % + (printf_setup(last_field), + 'WT_ERR' if has_escape(optype.fields) else 'WT_RET', + last_field[1], printf_fmt(last_field), printf_arg(last_field))), }) # Emit the printlog entry point diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index c9f2db406bf..623a34447a8 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -20,9 +20,6 @@ WT_ATOMIC_ADD1 WT_ATOMIC_ADD2 WT_ATOMIC_CAS1 WT_ATOMIC_CAS2 -WT_ATOMIC_CAS_VAL1 -WT_ATOMIC_CAS_VAL2 -WT_ATOMIC_CAS_VAL4 WT_ATOMIC_FETCH_ADD1 WT_ATOMIC_FETCH_ADD2 WT_ATOMIC_FETCH_ADD4 @@ -68,7 +65,6 @@ __WIREDTIGER_EXT_H_ __WIREDTIGER_H_ __WT_ATOMIC_ADD __WT_ATOMIC_CAS -__WT_ATOMIC_CAS_VAL __WT_ATOMIC_FETCH_ADD __WT_ATOMIC_STORE __WT_ATOMIC_SUB diff --git a/src/third_party/wiredtiger/dist/s_readme b/src/third_party/wiredtiger/dist/s_readme index f497a04fa1d..be809a6455c 100644 --- a/src/third_party/wiredtiger/dist/s_readme +++ b/src/third_party/wiredtiger/dist/s_readme @@ -35,23 +35,25 @@ WiredTiger release packages and documentation can be found at: http://source.wiredtiger.com/ +The documentation for this specific release can be found at: + + http://source.wiredtiger.com/$WIREDTIGER_VERSION/index.html + +The WiredTiger source code can be found at: + + http://github.com/wiredtiger/wiredtiger + WiredTiger uses JIRA for issue management: http://jira.mongodb.org/browse/WT Please do not report issues through GitHub. -Information on configuring, building and installing WiredTiger can be -found at: - - http://source.wiredtiger.com/$WIREDTIGER_VERSION/install.html - WiredTiger licensing information can be found at: http://source.wiredtiger.com/license.html -For general questions and discussion, please use the WiredTiger mailing -list: +For general questions and discussion, there's a WiredTiger group: http://groups.google.com/group/wiredtiger-users END_TEXT diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index d57b19aa029..fa87c0086b6 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -44,6 +44,7 @@ CAS CCCCCCCCCCCCCCCCCC CELL's CELLs +CET CFLAGS CHECKKEY CKPT @@ -53,6 +54,7 @@ CONFIG COVERITY CPUs CRC +CSV CURSORs CURSTD CallsCustDate @@ -440,6 +442,7 @@ crc create's crypto cryptobad +csv ctime ctype curbackup @@ -736,6 +739,7 @@ noraw notfound notsup nset +nsnap nul nuls numSymbols diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index ae43ce91fb4..791cb30e99c 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -285,6 +285,7 @@ connection_stats = [ 'no_clear,no_scale'), TxnStat('txn_pinned_range', 'transaction range of IDs currently pinned', 'no_clear,no_scale'), + TxnStat('txn_sync', 'transaction sync calls'), TxnStat('txn_commit', 'transactions committed'), TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'), TxnStat('txn_rollback', 'transactions rolled back'), diff --git a/src/third_party/wiredtiger/ext/extractors/csv/Makefile.am b/src/third_party/wiredtiger/ext/extractors/csv/Makefile.am new file mode 100644 index 00000000000..bb2a35bbf8e --- /dev/null +++ b/src/third_party/wiredtiger/ext/extractors/csv/Makefile.am @@ -0,0 +1,10 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include + +noinst_LTLIBRARIES = libwiredtiger_csv_extractor.la +libwiredtiger_csv_extractor_la_SOURCES = csv_extractor.c + +# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well +# as installation, it will only build static libraries. As far as I can tell, +# the "approved" libtool way to turn them back on is by adding -rpath. +libwiredtiger_csv_extractor_la_LDFLAGS = \ + -avoid-version -module -rpath /nowhere diff --git a/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c b/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c new file mode 100644 index 00000000000..0dd110955ad --- /dev/null +++ b/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c @@ -0,0 +1,171 @@ +/*- + * Public Domain 2014-2015 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <string.h> +#include <limits.h> +#include <errno.h> +#include <stdlib.h> + +#include <wiredtiger_ext.h> + +/* + * A simple WiredTiger extractor that separates a single string field, + * interpreted as column separated values (CSV), into component pieces. + * When an index is configured with this extractor and app_metadata + * set to a number N, the Nth field is returned as a string. + * + * For example, if a value in the primary table is + * "Paris,France,CET,2273305" + * and this extractor is configured with app_metadata=2, then + * the extractor for this value would return "CET". + */ + +/* Local extractor structure. */ +typedef struct { + WT_EXTRACTOR extractor; /* Must come first */ + WT_EXTENSION_API *wt_api; /* Extension API */ + int field_num; /* Field to extract */ +} CSV_EXTRACTOR; + +/* + * csv_extract -- + * WiredTiger CSV extraction. + */ +static int +csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session, + const WT_ITEM *key, const WT_ITEM *value, WT_CURSOR *result_cursor) +{ + char *copy, *p, *pend, *valstr; + const CSV_EXTRACTOR *cvs_extractor; + int i, ret; + size_t len; + WT_EXTENSION_API *wtapi; + + (void)key; /* Unused parameters */ + + cvs_extractor = (const CSV_EXTRACTOR *)extractor; + wtapi = cvs_extractor->wt_api; + + /* Unpack the value. */ + if ((ret = wtapi->struct_unpack(wtapi, + session, value->data, value->size, "S", &valstr)) != 0) + return (ret); + + p = valstr; + pend = strchr(p, ','); + for (i = 0; i < cvs_extractor->field_num && pend != NULL; i++) { + p = pend + 1; + pend = strchr(p, ','); + } + if (i == cvs_extractor->field_num) { + if (pend == NULL) + pend = p + strlen(p); + /* + * The key we must return is a null terminated string, but p + * is not necessarily NULL-terminated. So make a copy, just + * for the duration of the insert. + */ + len = (size_t)(pend - p); + if ((copy = malloc(len + 1)) == NULL) + return (errno); + strncpy(copy, p, len); + copy[len] = '\0'; + result_cursor->set_key(result_cursor, copy); + ret = result_cursor->insert(result_cursor); + free(copy); + if (ret != 0) + return (ret); + } + return (0); +} + +/* + * csv_customize -- + * The customize function creates a customized extractor, + * needed to save the field number. + */ +static int +csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session, + const char *uri, WT_CONFIG_ITEM *appcfg, WT_EXTRACTOR **customp) +{ + const CSV_EXTRACTOR *orig; + CSV_EXTRACTOR *csv_extractor; + long field_num; + + (void)session; /* Unused parameters */ + (void)uri; /* Unused parameters */ + + orig = (const CSV_EXTRACTOR *)extractor; + field_num = strtol(appcfg->str, NULL, 10); + if (field_num < 0 || field_num > INT_MAX) + return (EINVAL); + if ((csv_extractor = calloc(1, sizeof(CSV_EXTRACTOR))) == NULL) + return (errno); + + *csv_extractor = *orig; + csv_extractor->field_num = field_num; + *customp = (WT_EXTRACTOR *)csv_extractor; + return (0); +} + +/* + * csv_terminate -- + * Terminate is called to free the CSV and any associated memory. + */ +static int +csv_terminate(WT_EXTRACTOR *extractor, WT_SESSION *session) +{ + (void)session; /* Unused parameters */ + + /* Free the allocated memory. */ + free(extractor); + return (0); +} + +/* + * wiredtiger_extension_init -- + * WiredTiger CSV extraction extension. + */ +int +wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) +{ + CSV_EXTRACTOR *csv_extractor; + + (void)config; /* Unused parameters */ + + if ((csv_extractor = calloc(1, sizeof(CSV_EXTRACTOR))) == NULL) + return (errno); + + csv_extractor->extractor.extract = csv_extract; + csv_extractor->extractor.customize = csv_customize; + csv_extractor->extractor.terminate = csv_terminate; + csv_extractor->wt_api = connection->get_extension_api(connection); + + return (connection->add_extractor( + connection, "csv", (WT_EXTRACTOR *)csv_extractor, NULL)); +} diff --git a/src/third_party/wiredtiger/lang/java/java_doc.i b/src/third_party/wiredtiger/lang/java/java_doc.i index 2129c89a409..ea80b80df2d 100644 --- a/src/third_party/wiredtiger/lang/java/java_doc.i +++ b/src/third_party/wiredtiger/lang/java/java_doc.i @@ -43,7 +43,9 @@ COPYDOC(__wt_session, WT_SESSION, begin_transaction) COPYDOC(__wt_session, WT_SESSION, commit_transaction) COPYDOC(__wt_session, WT_SESSION, rollback_transaction) COPYDOC(__wt_session, WT_SESSION, checkpoint) +COPYDOC(__wt_session, WT_SESSION, snapshot) COPYDOC(__wt_session, WT_SESSION, transaction_pinned_range) +COPYDOC(__wt_session, WT_SESSION, transaction_sync) COPYDOC(__wt_connection, WT_CONNECTION, async_flush) COPYDOC(__wt_connection, WT_CONNECTION, async_new_op) COPYDOC(__wt_connection, WT_CONNECTION, close) diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index 64bd86294f2..5a882f0fb7c 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -51,11 +51,41 @@ __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize) { WT_DECL_RET; + WT_DECL_ITEM(tmp); WT_FH *fh; + int exists, suffix; char *path; - /* Create the underlying file and open a handle. */ - WT_RET(__wt_open(session, filename, 1, 1, WT_FILE_TYPE_DATA, &fh)); + /* + * Create the underlying file and open a handle. + * + * Since WiredTiger schema operations are (currently) non-transactional, + * it's possible to see a partially-created file left from a previous + * create. Further, there's nothing to prevent users from creating files + * in our space. Move any existing files out of the way and complain. + */ + for (;;) { + if ((ret = __wt_open( + session, filename, 1, 1, WT_FILE_TYPE_DATA, &fh)) == 0) + break; + WT_ERR_TEST(ret != EEXIST, ret); + + if (tmp == NULL) + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + for (suffix = 1;; ++suffix) { + WT_ERR(__wt_buf_fmt( + session, tmp, "%s.%d", filename, suffix)); + WT_ERR(__wt_exist(session, tmp->data, &exists)); + if (!exists) { + WT_ERR( + __wt_rename(session, filename, tmp->data)); + WT_ERR(__wt_msg(session, + "unexpected file %s found, renamed to %s", + filename, (char *)tmp->data)); + break; + } + } + } /* Write out the file's meta-data. */ ret = __wt_desc_init(session, fh, allocsize); @@ -83,6 +113,8 @@ __wt_block_manager_create( if (ret != 0) WT_TRET(__wt_remove(session, filename)); +err: __wt_scr_free(session, &tmp); + return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index b1fa5ce6178..751ec8581c2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -188,7 +188,7 @@ __dmsg(WT_DBG *ds, const char *fmt, ...) } } else { va_start(ap, fmt); - (void)vfprintf(ds->fp, fmt, ap); + (void)__wt_vfprintf(ds->fp, fmt, ap); va_end(ap); } } @@ -202,13 +202,14 @@ __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) { WT_DECL_ITEM(buf); + WT_DECL_RET; WT_RET(__wt_scr_alloc(session, 128, &buf)); - fprintf(stderr, "%s\n", - __wt_addr_string(session, addr, addr_size, buf)); + ret = __wt_fprintf(stderr, + "%s\n", __wt_addr_string(session, addr, addr_size, buf)); __wt_scr_free(session, &buf); - return (0); + return (ret); } /* @@ -437,7 +438,7 @@ __debug_tree_shape_worker(WT_DBG *ds, WT_PAGE *page, int level) session = ds->session; - if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) { + if (WT_PAGE_IS_INTERNAL(page)) { __dmsg(ds, "%*s" "I" "%d %s\n", level * 3, " ", level, __debug_tree_shape_info(page)); WT_INTL_FOREACH_BEGIN(session, page, ref) { diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index d4c8cf1b92d..c316be6f908 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -170,15 +170,12 @@ __split_safe_free(WT_SESSION_IMPL *session, * Return if we should deepen the tree. */ static int -__split_should_deepen( - WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *childrenp) +__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_PAGE *page; WT_PAGE_INDEX *pindex; - *childrenp = 0; - btree = S2BT(session); page = ref->page; @@ -202,10 +199,8 @@ __split_should_deepen( * we get a significant payback (in the case of a set of large keys, * splitting won't help). */ - if (pindex->entries > btree->split_deepen_min_child) { - *childrenp = pindex->entries / btree->split_deepen_per_child; + if (pindex->entries > btree->split_deepen_min_child) return (1); - } /* * Don't allow a single page to put pressure on cache usage. The root @@ -216,10 +211,8 @@ __split_should_deepen( */ if (pindex->entries >= 100 && (__wt_ref_is_root(ref) || - page->memory_footprint >= S2C(session)->cache_size / 4)) { - *childrenp = pindex->entries / 10; + page->memory_footprint >= S2C(session)->cache_size / 4)) return (1); - } return (0); } @@ -385,8 +378,9 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) * Split an internal page in-memory, deepening the tree. */ static int -__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) +__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) { + WT_BTREE *btree; WT_DECL_RET; WT_PAGE *child; WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; @@ -394,10 +388,15 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref; size_t child_incr, parent_decr, parent_incr, size; uint64_t split_gen; - uint32_t chunk, i, j, remain, slots; + uint32_t children, chunk, i, j, moved_entries, new_entries, remain; + uint32_t skip_leading, slots; int panic; void *p; + WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); + + btree = S2BT(session); alloc_index = NULL; parent_incr = parent_decr = 0; panic = 0; @@ -409,53 +408,76 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) */ pindex = WT_INTL_INDEX_GET_SAFE(parent); - WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); - WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); - WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, - "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children", - parent, pindex->entries, children)); - /* - * If the workload is prepending/appending to the tree, we could deepen - * without bound. Don't let that happen, keep the first/last pages of - * the tree at their current level. + * A prepending/appending workload will repeatedly deepen parts of the + * tree that aren't changing, and appending workloads are not uncommon. + * First, keep the first/last pages of the tree at their current level, + * to catch simple workloads. Second, track the number of entries which + * resulted from the last time we deepened this page, and if we refilled + * this page without splitting into those slots, ignore them for this + * split. It's not exact because an eviction might split into any part + * of the page: if 80% of the splits are at the end of the page, assume + * an append-style workload. Of course, the plan eventually fails: when + * repeatedly deepening this page for an append-only workload, we will + * progressively ignore more and more of the slots. When ignoring 90% of + * the slots, deepen the entire page again. * - * XXX - * To improve this, we could track which pages were last merged into - * this page by eviction, and leave those pages alone, to prevent any - * sustained insert into the tree from deepening a single location. + * Figure out how many slots we're leaving at this level and how many + * child pages we're creating. + */ +#undef skip_trailing +#define skip_trailing 1 + skip_leading = 1; + new_entries = pindex->entries - parent->pg_intl_deepen_split_last; + if (parent->pg_intl_deepen_split_append > (new_entries * 8) / 10) + skip_leading = parent->pg_intl_deepen_split_last; + if (skip_leading > (pindex->entries * 9) * 10) + skip_leading = 1; + + /* + * In a few (rare) cases we split pages with only a few entries, and in + * those cases we keep it simple, 10 children, skip only first and last + * entries. Otherwise, split into a lot of child pages. */ -#undef SPLIT_CORRECT_1 -#define SPLIT_CORRECT_1 1 /* First page correction */ -#undef SPLIT_CORRECT_2 -#define SPLIT_CORRECT_2 2 /* First/last page correction */ + moved_entries = pindex->entries - (skip_leading + skip_trailing); + children = moved_entries / btree->split_deepen_per_child; + if (children < 10) { + children = 10; + skip_leading = 1; + moved_entries = + pindex->entries - (skip_leading + skip_trailing); + } + + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children", + parent, pindex->entries, children)); /* - * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize - * the first/last slots of the allocated WT_PAGE_INDEX to point to the - * first/last pages we're keeping at the current level, and the rest of - * the slots to point to new WT_REF objects. + * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize + * the slots of the allocated WT_PAGE_INDEX to point to the pages we're + * keeping at the current level, and the rest of the slots to point to + * new WT_REF objects. */ size = sizeof(WT_PAGE_INDEX) + - (children + SPLIT_CORRECT_2) * sizeof(WT_REF *); + (children + skip_leading + skip_trailing) * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); - alloc_index->entries = children + SPLIT_CORRECT_2; - alloc_index->index[0] = pindex->index[0]; + alloc_index->entries = children + skip_leading + skip_trailing; + for (alloc_refp = alloc_index->index, + i = 0; i < skip_leading; ++alloc_refp, ++i) + alloc_index->index[i] = pindex->index[i]; + for (i = 0; i < children; ++alloc_refp, ++i) + WT_ERR(__wt_calloc_one(session, alloc_refp)); + parent_incr += children * sizeof(WT_REF); alloc_index->index[alloc_index->entries - 1] = pindex->index[pindex->entries - 1]; - for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1, - i = 0; i < children; ++alloc_refp, ++i) { - WT_ERR(__wt_calloc_one(session, alloc_refp)); - parent_incr += sizeof(WT_REF); - } /* Allocate child pages, and connect them into the new page index. */ - chunk = (pindex->entries - SPLIT_CORRECT_2) / children; - remain = (pindex->entries - SPLIT_CORRECT_2) - chunk * (children - 1); - for (parent_refp = pindex->index + SPLIT_CORRECT_1, - alloc_refp = alloc_index->index + SPLIT_CORRECT_1, + chunk = moved_entries / children; + remain = moved_entries - chunk * (children - 1); + for (parent_refp = pindex->index + skip_leading, + alloc_refp = alloc_index->index + skip_leading, i = 0; i < children; ++i) { slots = i == children - 1 ? remain : chunk; WT_ERR(__wt_page_alloc( @@ -513,10 +535,11 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) } __wt_cache_page_inmem_incr(session, child, child_incr); } - WT_ASSERT(session, alloc_refp - - alloc_index->index == alloc_index->entries - SPLIT_CORRECT_1); WT_ASSERT(session, - parent_refp - pindex->index == pindex->entries - SPLIT_CORRECT_1); + alloc_refp - alloc_index->index == + alloc_index->entries - skip_trailing); + WT_ASSERT(session, + parent_refp - pindex->index == pindex->entries - skip_trailing); /* * Confirm the parent page's index hasn't moved, then update it, which @@ -541,6 +564,12 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent)); #endif + /* + * Save the number of entries created by deepening the tree and reset + * the count of splits into this page after that point. + */ + parent->pg_intl_deepen_split_append = 0; + parent->pg_intl_deepen_split_last = alloc_index->entries; /* * The moved reference structures now reference the wrong parent page, @@ -823,7 +852,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **alloc_refp, *next_ref, *parent_ref; size_t parent_decr, size; uint64_t split_gen; - uint32_t children, i, j; + uint32_t i, j; uint32_t deleted_entries, parent_entries, result_entries; int complete, hazard; @@ -907,8 +936,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, } /* - * The final entry count consists of: The original count, plus any - * new pages, less any refs we are removing. + * The final entry count consists of the original count, plus any new + * pages, less any WT_REFs we're removing. */ result_entries = (parent_entries + new_entries) - deleted_entries; @@ -924,7 +953,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, alloc_index->entries = result_entries; for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; - if (next_ref == ref) + if (next_ref == ref) { for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; *alloc_refp++ = ref_new[j]; @@ -936,7 +965,22 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, */ ref_new[j] = NULL; } - else if (next_ref->state != WT_REF_SPLIT) + + /* + * We detect append-style workloads to avoid repeatedly + * deepening parts of the tree where no work is being + * done by tracking if we're splitting after the slots + * created by the last split to deepen this parent. + * + * Note the calculation: i is a 0-based array offset and + * split-last is a count of entries, also either or both + * i and split-last might be unsigned 0, don't decrement + * either one. + */ + if (i > parent->pg_intl_deepen_split_last) + parent-> + pg_intl_deepen_split_append += new_entries; + } else if (next_ref->state != WT_REF_SPLIT) /* Skip refs we have marked for deletion. */ *alloc_refp++ = next_ref; } @@ -1070,7 +1114,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, */ if (ret == 0 && !LF_ISSET(WT_EVICT_EXCLUSIVE) && !F_ISSET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN) && - __split_should_deepen(session, parent_ref, &children)) { + __split_should_deepen(session, parent_ref)) { /* * XXX * Temporary hack to avoid a bug where the root page is split @@ -1078,7 +1122,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, */ uint64_t __a, __b; __a = parent->memory_footprint; - ret = __split_deepen(session, parent, children); + ret = __split_deepen(session, parent); __b = parent->memory_footprint; if (__b * 2 >= __a) F_SET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN); diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index 45c2029f6ed..0ded95c0d8c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -118,15 +118,22 @@ __verify_config_offsets(WT_SESSION_IMPL *session, const char *cfg[], int *quitp) static int __verify_tree_shape(WT_SESSION_IMPL *session, WT_VSTUFF *vs) { + uint32_t total; size_t i; - WT_RET(__wt_msg(session, "Internal page tree-depth:")); + for (i = 0, total = 0; i < WT_ELEMENTS(vs->depth_internal); ++i) + total += vs->depth_internal[i]; + WT_RET(__wt_msg( + session, "Internal page tree-depth (total %" PRIu32 "):", total)); for (i = 0; i < WT_ELEMENTS(vs->depth_internal); ++i) if (vs->depth_internal[i] != 0) WT_RET(__wt_msg(session, "\t%03zu: %u", i, vs->depth_internal[i])); - WT_RET(__wt_msg(session, "Leaf page tree-depth:")); + for (i = 0, total = 0; i < WT_ELEMENTS(vs->depth_leaf); ++i) + total += vs->depth_leaf[i]; + WT_RET(__wt_msg( + session, "Leaf page tree-depth (total %" PRIu32 "):", total)); for (i = 0; i < WT_ELEMENTS(vs->depth_leaf); ++i) if (vs->depth_leaf[i] != 0) WT_RET(__wt_msg(session, diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 845b4e65825..64fc802160c 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -148,6 +148,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_begin_transaction[] = { NULL, 0 }, { "name", "string", NULL, NULL, NULL, 0 }, { "priority", "int", NULL, "min=-100,max=100", NULL, 0 }, + { "snapshot", "string", NULL, NULL, NULL, 0 }, { "sync", "boolean", NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -160,6 +161,13 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_checkpoint[] = { { NULL, NULL, NULL, NULL, NULL, 0 } }; +static const WT_CONFIG_CHECK confchk_WT_SESSION_commit_transaction[] = { + { "sync", "string", + NULL, "choices=[\"background\",\"off\",\"on\"]", + NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + static const WT_CONFIG_CHECK confchk_WT_SESSION_compact[] = { { "timeout", "int", NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } @@ -287,6 +295,28 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_salvage[] = { { NULL, NULL, NULL, NULL, NULL, 0 } }; +static const WT_CONFIG_CHECK + confchk_WT_SESSION_snapshot_drop_subconfigs[] = { + { "all", "boolean", NULL, NULL, NULL, 0 }, + { "before", "string", NULL, NULL, NULL, 0 }, + { "names", "list", NULL, NULL, NULL, 0 }, + { "to", "string", NULL, NULL, NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + +static const WT_CONFIG_CHECK confchk_WT_SESSION_snapshot[] = { + { "drop", "category", + NULL, NULL, + confchk_WT_SESSION_snapshot_drop_subconfigs, 4 }, + { "name", "string", NULL, NULL, NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + +static const WT_CONFIG_CHECK confchk_WT_SESSION_transaction_sync[] = { + { "timeout_ms", "int", NULL, NULL, NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + static const WT_CONFIG_CHECK confchk_WT_SESSION_verify[] = { { "dump_address", "boolean", NULL, NULL, NULL, 0 }, { "dump_blocks", "boolean", NULL, NULL, NULL, 0 }, @@ -759,8 +789,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { confchk_WT_CURSOR_reconfigure, 2 }, { "WT_SESSION.begin_transaction", - "isolation=,name=,priority=0,sync=", - confchk_WT_SESSION_begin_transaction, 4 + "isolation=,name=,priority=0,snapshot=,sync=", + confchk_WT_SESSION_begin_transaction, 5 }, { "WT_SESSION.checkpoint", "drop=,force=0,name=,target=", @@ -771,8 +801,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { NULL, 0 }, { "WT_SESSION.commit_transaction", - "", - NULL, 0 + "sync=", + confchk_WT_SESSION_commit_transaction, 1 }, { "WT_SESSION.compact", "timeout=1200", @@ -824,10 +854,18 @@ static const WT_CONFIG_ENTRY config_entries[] = { "force=0", confchk_WT_SESSION_salvage, 1 }, + { "WT_SESSION.snapshot", + "drop=(all=0,before=,names=,to=),name=", + confchk_WT_SESSION_snapshot, 2 + }, { "WT_SESSION.strerror", "", NULL, 0 }, + { "WT_SESSION.transaction_sync", + "timeout_ms=", + confchk_WT_SESSION_transaction_sync, 1 + }, { "WT_SESSION.truncate", "", NULL, 0 diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index c57be1a9423..d42287497a5 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -406,15 +406,14 @@ __wt_encryptor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval, WT_KEYED_ENCRYPTOR *kenc; WT_NAMED_ENCRYPTOR *nenc; uint64_t bucket, hash; - int locked; *kencryptorp = NULL; + kenc = NULL; conn = S2C(session); - locked = 0; __wt_spin_lock(session, &conn->encryptor_lock); - locked = 1; + WT_ERR(__encryptor_confchk(session, cval, &nenc)); if (nenc == NULL) { if (keyid->len != 0) @@ -462,8 +461,7 @@ err: if (kenc != NULL) { __wt_free(session, kenc->keyid); __wt_free(session, kenc); } - if (locked) - __wt_spin_unlock(session, &conn->encryptor_lock); + __wt_spin_unlock(session, &conn->encryptor_lock); return (ret); } @@ -637,7 +635,7 @@ __extractor_confchk( */ int __wt_extractor_config(WT_SESSION_IMPL *session, - const char *config, WT_EXTRACTOR **extractorp, int *ownp) + const char *uri, const char *config, WT_EXTRACTOR **extractorp, int *ownp) { WT_CONFIG_ITEM cname; WT_EXTRACTOR *extractor; @@ -658,7 +656,7 @@ __wt_extractor_config(WT_SESSION_IMPL *session, WT_RET(__wt_config_getones(session, config, "app_metadata", &cname)); WT_RET(extractor->customize(extractor, &session->iface, - session->dhandle->name, &cname, extractorp)); + uri, &cname, extractorp)); } if (*extractorp == NULL) @@ -987,6 +985,9 @@ err: /* wt_session, NULL)); } + /* Release all named snapshots. */ + WT_TRET(__wt_txn_named_snapshot_destroy(session)); + /* Close open, external sessions. */ for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i) if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL)) { @@ -1372,11 +1373,15 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) WT_FH *fh; size_t len; wt_off_t size; + int exist, is_create; char buf[256]; conn = S2C(session); fh = NULL; + WT_RET(__wt_config_gets(session, cfg, "create", &cval)); + is_create = cval.val == 0 ? 0 : 1; + __wt_spin_lock(session, &__wt_process.spinlock); /* @@ -1410,10 +1415,6 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) * WiredTiger file and system utilities on Windows can't copy locked * files. * - * For this reason, we don't use the lock file's existence to decide if - * we're creating the database or not, use the WiredTiger file instead, - * it has existed in every version of WiredTiger. - * * Additionally, avoid an upgrade race: a 2.3.1 release process might * have the WiredTiger file locked, and we're going to create the lock * file and lock it instead. For this reason, first acquire a lock on @@ -1424,12 +1425,22 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) * then successfully lock the WiredTiger file, but I can't think of any * way to fix that.) * - * Open the WiredTiger lock file, creating it if it doesn't exist. (I'm - * not removing the lock file if we create it and subsequently fail, it - * isn't simple to detect that case, and there's no risk other than a - * useless file being left in the directory.) + * Open the WiredTiger lock file, optionally creating it if it doesn't + * exist. The "optional" part of that statement is tricky: we don't want + * to create the lock file in random directories when users mistype the + * database home directory path, so we only create the lock file in two + * cases: First, applications creating databases will configure create, + * create the lock file. Second, after a hot backup, all of the standard + * files will have been copied into place except for the lock file (see + * above, locked files cannot be copied on Windows). If the WiredTiger + * file exists in the directory, create the lock file, covering the case + * of a hot backup. */ - WT_ERR(__wt_open(session, WT_SINGLETHREAD, 1, 0, 0, &conn->lock_fh)); + exist = 0; + if (!is_create) + WT_ERR(__wt_exist(session, WT_WIREDTIGER, &exist)); + WT_ERR(__wt_open(session, + WT_SINGLETHREAD, is_create || exist, 0, 0, &conn->lock_fh)); /* * Lock a byte of the file: if we don't get the lock, some other process @@ -1442,22 +1453,22 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) "process"); /* - * If the size of the lock file is 0, we created it (or we won a locking - * race with the thread that created it, it doesn't matter). + * If the size of the lock file is non-zero, we created it (or won a + * locking race with the thread that created it, it doesn't matter). * * Write something into the file, zero-length files make me nervous. + * + * The test against the expected length is sheer paranoia (the length + * should be 0 or correct), but it shouldn't hurt. */ - WT_ERR(__wt_filesize(session, conn->lock_fh, &size)); - if (size == 0) { #define WT_SINGLETHREAD_STRING "WiredTiger lock file\n" + WT_ERR(__wt_filesize(session, conn->lock_fh, &size)); + if (size != strlen(WT_SINGLETHREAD_STRING)) WT_ERR(__wt_write(session, conn->lock_fh, (wt_off_t)0, strlen(WT_SINGLETHREAD_STRING), WT_SINGLETHREAD_STRING)); - } /* We own the lock file, optionally create the WiredTiger file. */ - WT_ERR(__wt_config_gets(session, cfg, "create", &cval)); - WT_ERR(__wt_open(session, - WT_WIREDTIGER, cval.val == 0 ? 0 : 1, 0, 0, &fh)); + WT_ERR(__wt_open(session, WT_WIREDTIGER, is_create, 0, 0, &fh)); /* * Lock the WiredTiger file (for backward compatibility reasons as @@ -1471,25 +1482,27 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_bytelock(fh, (wt_off_t)0, 0)); /* - * If the size of the file is zero, we created it, fill it in. If the - * size of the file is non-zero, fail if configured for exclusivity. + * We own the database home, figure out if we're creating it. There are + * a few files created when initializing the database home and we could + * crash in-between any of them, so there's no simple test. The last + * thing we do during initialization is rename a turtle file into place, + * and there's never a database home after that point without a turtle + * file. If the turtle file doesn't exist, it's a create. */ - WT_ERR(__wt_filesize(session, fh, &size)); - if (size == 0) { + WT_ERR(__wt_exist(session, WT_METADATA_TURTLE, &exist)); + conn->is_new = exist ? 0 : 1; + + if (conn->is_new) { len = (size_t)snprintf(buf, sizeof(buf), "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING); WT_ERR(__wt_write(session, fh, (wt_off_t)0, len, buf)); WT_ERR(__wt_fsync(session, fh)); - - conn->is_new = 1; } else { WT_ERR(__wt_config_gets(session, cfg, "exclusive", &cval)); if (cval.val != 0) WT_ERR_MSG(session, EEXIST, "WiredTiger database already exists and exclusive " "option configured"); - - conn->is_new = 0; } err: /* @@ -1632,16 +1645,17 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) * Save the base configuration used to create a database. */ static int -__conn_write_base_config( - WT_SESSION_IMPL *session, const char *cfg[], const char *base_config) +__conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[]) { FILE *fp; WT_CONFIG parser; WT_CONFIG_ITEM cval, k, v; WT_DECL_RET; int exist; + const char *base_config; fp = NULL; + base_config = NULL; /* * Discard any base configuration setup file left-over from previous @@ -1650,7 +1664,12 @@ __conn_write_base_config( */ WT_RET(__wt_remove_if_exists(session, WT_BASECONFIG_SET)); - /* The base configuration file is optional, check the configuration. */ + /* + * The base configuration file is only written if creating the database, + * and even then, a base configuration file is optional. + */ + if (!S2C(session)->is_new) + return (0); WT_RET(__wt_config_gets(session, cfg, "config_base", &cval)); if (!cval.val) return (0); @@ -1670,13 +1689,13 @@ __conn_write_base_config( WT_RET(__wt_fopen(session, WT_BASECONFIG_SET, WT_FHANDLE_WRITE, 0, &fp)); - fprintf(fp, "%s\n\n", + WT_ERR(__wt_fprintf(fp, "%s\n\n", "# Do not modify this file.\n" "#\n" "# WiredTiger created this file when the database was created,\n" "# to store persistent database settings. Instead of changing\n" "# these settings, set a WIREDTIGER_CONFIG environment variable\n" - "# or create a WiredTiger.config file to override them."); + "# or create a WiredTiger.config file to override them.")); /* * The base configuration file contains all changes to default settings @@ -1690,10 +1709,18 @@ __conn_write_base_config( * file in that protection. * * We were passed the configuration items specified by the application. - * That list includes configuring the default setting, presumably if + * That list includes configuring the default settings, presumably if * the application configured it explicitly, that setting should survive * even if the default changes. + * + * When writing the base configuration file, we write the version and + * any configuration information set by the application (in other words, + * the stack except for cfg[0]). However, some configuration values need + * to be stripped out from the base configuration file; do that now, and + * merge the rest to be written. */ + WT_ERR(__wt_config_merge(session, cfg + 1, + "create=,encryption=(secretkey=),log=(recover=)", &base_config)); WT_ERR(__wt_config_init(session, &parser, base_config)); while ((ret = __wt_config_next(&parser, &k, &v)) == 0) { /* Fix quoting for non-trivial settings. */ @@ -1701,18 +1728,23 @@ __conn_write_base_config( --v.str; v.len += 2; } - fprintf(fp, - "%.*s=%.*s\n", (int)k.len, k.str, (int)v.len, v.str); + WT_ERR(__wt_fprintf(fp, + "%.*s=%.*s\n", (int)k.len, k.str, (int)v.len, v.str)); } WT_ERR_NOTFOUND_OK(ret); /* Flush the handle and rename the file into place. */ - return (__wt_sync_and_rename_fp( - session, &fp, WT_BASECONFIG_SET, WT_BASECONFIG)); + ret = __wt_sync_and_rename_fp( + session, &fp, WT_BASECONFIG_SET, WT_BASECONFIG); + + if (0) { + /* Close open file handle, remove any temporary file. */ +err: if (fp != NULL) + WT_TRET(__wt_fclose(&fp, WT_FHANDLE_WRITE)); + WT_TRET(__wt_remove_if_exists(session, WT_BASECONFIG_SET)); + } - /* Close any file handle left open, remove any temporary file. */ -err: WT_TRET(__wt_fclose(&fp, WT_FHANDLE_WRITE)); - WT_TRET(__wt_remove_if_exists(session, WT_BASECONFIG_SET)); + __wt_free(session, base_config); return (ret); } @@ -1759,7 +1791,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_DECL_RET; const WT_NAME_FLAG *ft; WT_SESSION_IMPL *session; - const char *base_merge, *enc_cfg[] = { NULL, NULL }; + const char *enc_cfg[] = { NULL, NULL }; char version[64]; /* Leave lots of space for optional additional configuration. */ @@ -1770,7 +1802,6 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, conn = NULL; session = NULL; - base_merge = NULL; WT_RET(__wt_library_init()); @@ -1972,29 +2003,23 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, (WT_CONFIG_ARG *)enc_cfg, &conn->kencryptor)); /* + * Configuration completed; optionally write a base configuration file. + */ + WT_ERR(__conn_write_base_config(session, cfg)); + + /* * Check on the turtle and metadata files, creating them if necessary * (which avoids application threads racing to create the metadata file * later). Once the metadata file exists, get a reference to it in * the connection's session. + * + * THE TURTLE FILE MUST BE THE LAST FILE CREATED WHEN INITIALIZING THE + * DATABASE HOME, IT'S WHAT WE USE TO DECIDE IF WE'RE CREATING OR NOT. */ WT_ERR(__wt_turtle_init(session)); WT_ERR(__wt_metadata_open(session)); /* - * Configuration completed; optionally write the base configuration file - * if it doesn't already exist. - * - * When writing the base configuration file, we write the version and - * any configuration information set by the application (in other words, - * the stack except for cfg[0]). However, some configuration values need - * to be stripped out from the base configuration file; do that now, and - * merge the rest to be written. - */ - WT_ERR(__wt_config_merge(session, - cfg + 1, "create=,encryption=(secretkey=)", &base_merge)); - WT_ERR(__conn_write_base_config(session, cfg, base_merge)); - - /* * Start the worker threads last. */ WT_ERR(__wt_connection_workers(session, cfg)); @@ -2008,8 +2033,6 @@ err: /* Discard the scratch buffers. */ __wt_scr_free(session, &i2); __wt_scr_free(session, &i3); - __wt_free(session, base_merge); - /* * We may have allocated scratch memory when using the dummy session or * the subsequently created real session, and we don't want to tie down diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 29ab25e0226..bd57cd4b438 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -270,17 +270,18 @@ err: } /* - * __log_close_server -- - * The log close server thread. + * __log_file_server -- + * The log file server thread. This worker thread manages + * log file operations such as closing and syncing. */ static WT_THREAD_RET -__log_close_server(void *arg) +__log_file_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *close_fh; WT_LOG *log; - WT_LSN close_end_lsn, close_lsn; + WT_LSN close_end_lsn, close_lsn, min_lsn; WT_SESSION_IMPL *session; int locked; @@ -321,10 +322,40 @@ __log_close_server(void *arg) WT_ERR(__wt_cond_signal(session, log->log_sync_cond)); locked = 0; __wt_spin_unlock(session, &log->log_sync_lock); - } else - /* Wait until the next event. */ - WT_ERR(__wt_cond_wait(session, - conn->log_close_cond, WT_MILLION)); + } + /* + * If a later thread asked for a background sync, do it now. + */ + if (WT_LOG_CMP(&log->bg_sync_lsn, &log->sync_lsn) > 0) { + /* + * Save the latest write LSN which is the minimum + * we will have written to disk. + */ + min_lsn = log->write_lsn; + /* + * The sync LSN we asked for better be smaller than + * the current written LSN. + */ + WT_ASSERT(session, + WT_LOG_CMP(&log->bg_sync_lsn, &min_lsn) <= 0); + WT_ERR(__wt_fsync(session, log->log_fh)); + __wt_spin_lock(session, &log->log_sync_lock); + locked = 1; + /* + * The sync LSN could have advanced while we were + * writing to disk. + */ + if (WT_LOG_CMP(&log->sync_lsn, &min_lsn) <= 0) { + log->sync_lsn = min_lsn; + WT_ERR(__wt_cond_signal( + session, log->log_sync_cond)); + } + locked = 0; + __wt_spin_unlock(session, &log->log_sync_lock); + } + /* Wait until the next event. */ + WT_ERR(__wt_cond_wait( + session, conn->log_file_cond, WT_MILLION)); } if (0) { @@ -362,7 +393,7 @@ __log_wrlsn_server(void *arg) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; - WT_LOG_WRLSN_ENTRY written[SLOT_POOL]; + WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL]; WT_LOGSLOT *slot; WT_SESSION_IMPL *session; size_t written_i; @@ -385,7 +416,7 @@ __log_wrlsn_server(void *arg) * Walk the array once saving any slots that are in the * WT_LOG_SLOT_WRITTEN state. */ - while (i < SLOT_POOL) { + while (i < WT_SLOT_POOL) { save_i = i; slot = &log->slot_pool[i++]; if (slot->slot_state != WT_LOG_SLOT_WRITTEN) @@ -427,9 +458,9 @@ __log_wrlsn_server(void *arg) /* * Signal the close thread if needed. */ - if (F_ISSET(slot, SLOT_CLOSEFH)) + if (F_ISSET(slot, WT_SLOT_CLOSEFH)) WT_ERR(__wt_cond_signal(session, - conn->log_close_cond)); + conn->log_file_cond)); WT_ERR(__wt_log_slot_free(session, slot)); } } @@ -579,16 +610,16 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) * If logging is enabled, this thread runs. */ WT_RET(__wt_open_internal_session( - conn, "log-close-server", 0, 0, &conn->log_close_session)); - WT_RET(__wt_cond_alloc(conn->log_close_session, - "log close server", 0, &conn->log_close_cond)); + conn, "log-close-server", 0, 0, &conn->log_file_session)); + WT_RET(__wt_cond_alloc(conn->log_file_session, + "log close server", 0, &conn->log_file_cond)); /* * Start the log file close thread. */ - WT_RET(__wt_thread_create(conn->log_close_session, - &conn->log_close_tid, __log_close_server, conn->log_close_session)); - conn->log_close_tid_set = 1; + WT_RET(__wt_thread_create(conn->log_file_session, + &conn->log_file_tid, __log_file_server, conn->log_file_session)); + conn->log_file_tid_set = 1; /* * Start the log write LSN thread. It is not configurable. @@ -663,16 +694,16 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_tid_set = 0; } WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); - if (conn->log_close_tid_set) { - WT_TRET(__wt_cond_signal(session, conn->log_close_cond)); - WT_TRET(__wt_thread_join(session, conn->log_close_tid)); - conn->log_close_tid_set = 0; + if (conn->log_file_tid_set) { + WT_TRET(__wt_cond_signal(session, conn->log_file_cond)); + WT_TRET(__wt_thread_join(session, conn->log_file_tid)); + conn->log_file_tid_set = 0; } - WT_TRET(__wt_cond_destroy(session, &conn->log_close_cond)); - if (conn->log_close_session != NULL) { - wt_session = &conn->log_close_session->iface; + WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); + if (conn->log_file_session != NULL) { + wt_session = &conn->log_file_session->iface; WT_TRET(wt_session->close(wt_session, NULL)); - conn->log_close_session = NULL; + conn->log_file_session = NULL; } if (conn->log_wrlsn_tid_set) { WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond)); diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index 9cae1c5c4d1..c4350d90adb 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -162,7 +162,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) WT_TRET(__wt_cache_destroy(session)); /* Discard transaction state. */ - __wt_txn_global_destroy(session); + WT_TRET(__wt_txn_global_destroy(session)); /* Close extensions, first calling any unload entry point. */ while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c index 4b72a472cb7..3376f2a3166 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_log.c +++ b/src/third_party/wiredtiger/src/cursor/cur_log.c @@ -31,7 +31,7 @@ __curlog_logrec(WT_SESSION_IMPL *session, * Read the log header. Set up the step pointers to walk the * operations inside the record. Get the record type. */ - cl->stepp = LOG_SKIP_HEADER(cl->logrec->data); + cl->stepp = WT_LOG_SKIP_HEADER(cl->logrec->data); cl->stepp_end = (uint8_t *)cl->logrec->data + logrec->size; WT_RET(__wt_logrec_read(session, &cl->stepp, cl->stepp_end, &cl->rectype)); @@ -174,8 +174,8 @@ __curlog_kv(WT_SESSION_IMPL *session, WT_CURSOR *cursor) * header and the adjusted size. Add one to skip over the type * which is normally consumed by __wt_logrec_read. */ - cl->opvalue->data = LOG_SKIP_HEADER(cl->logrec->data) + 1; - cl->opvalue->size = LOG_REC_SIZE(cl->logrec->size) - 1; + cl->opvalue->data = WT_LOG_SKIP_HEADER(cl->logrec->data) + 1; + cl->opvalue->size = WT_LOG_REC_SIZE(cl->logrec->size) - 1; } /* * The log cursor sets the LSN and step count as the cursor key and @@ -187,24 +187,25 @@ __curlog_kv(WT_SESSION_IMPL *session, WT_CURSOR *cursor) if (FLD_ISSET(cursor->flags, WT_CURSTD_RAW)) { memset(&item, 0, sizeof(item)); WT_RET(wiredtiger_struct_size((WT_SESSION *)session, - &item.size, LOGC_KEY_FORMAT, cl->cur_lsn->file, + &item.size, WT_LOGC_KEY_FORMAT, cl->cur_lsn->file, cl->cur_lsn->offset, key_count)); WT_RET(__wt_realloc(session, NULL, item.size, &cl->packed_key)); item.data = cl->packed_key; WT_RET(wiredtiger_struct_pack((WT_SESSION *)session, - cl->packed_key, item.size, LOGC_KEY_FORMAT, + cl->packed_key, item.size, WT_LOGC_KEY_FORMAT, cl->cur_lsn->file, cl->cur_lsn->offset, key_count)); __wt_cursor_set_key(cursor, &item); WT_RET(wiredtiger_struct_size((WT_SESSION *)session, - &item.size, LOGC_VALUE_FORMAT, cl->txnid, cl->rectype, + &item.size, WT_LOGC_VALUE_FORMAT, cl->txnid, cl->rectype, optype, fileid, cl->opkey, cl->opvalue)); WT_RET(__wt_realloc(session, NULL, item.size, &cl->packed_value)); item.data = cl->packed_value; WT_RET(wiredtiger_struct_pack((WT_SESSION *)session, - cl->packed_value, item.size, LOGC_VALUE_FORMAT, cl->txnid, - cl->rectype, optype, fileid, cl->opkey, cl->opvalue)); + cl->packed_value, item.size, WT_LOGC_VALUE_FORMAT, + cl->txnid, cl->rectype, optype, fileid, cl->opkey, + cl->opvalue)); __wt_cursor_set_value(cursor, &item); } else { __wt_cursor_set_key(cursor, cl->cur_lsn->file, @@ -237,8 +238,11 @@ __curlog_next(WT_CURSOR *cursor) */ if (cl->stepp == NULL || cl->stepp >= cl->stepp_end || !*cl->stepp) { cl->txnid = 0; - WT_ERR(__wt_log_scan(session, cl->next_lsn, WT_LOGSCAN_ONE, - __curlog_logrec, cl)); + ret = __wt_log_scan(session, cl->next_lsn, WT_LOGSCAN_ONE, + __curlog_logrec, cl); + if (ret == ENOENT) + ret = WT_NOTFOUND; + WT_ERR(ret); } WT_ASSERT(session, cl->logrec->data != NULL); WT_ERR(__curlog_kv(session, cursor)); @@ -271,8 +275,11 @@ __curlog_search(WT_CURSOR *cursor) */ WT_ERR(__wt_cursor_get_key((WT_CURSOR *)cl, &key.file, &key.offset, &counter)); - WT_ERR(__wt_log_scan(session, &key, WT_LOGSCAN_ONE, - __curlog_logrec, cl)); + ret = __wt_log_scan(session, &key, WT_LOGSCAN_ONE, + __curlog_logrec, cl); + if (ret == ENOENT) + ret = WT_NOTFOUND; + WT_ERR(ret); WT_ERR(__curlog_kv(session, cursor)); WT_STAT_FAST_CONN_INCR(session, cursor_search); WT_STAT_FAST_DATA_INCR(session, cursor_search); @@ -377,8 +384,8 @@ __wt_curlog_open(WT_SESSION_IMPL *session, WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec)); WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey)); WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue)); - cursor->key_format = LOGC_KEY_FORMAT; - cursor->value_format = LOGC_VALUE_FORMAT; + cursor->key_format = WT_LOGC_KEY_FORMAT; + cursor->value_format = WT_LOGC_VALUE_FORMAT; WT_INIT_LSN(cl->cur_lsn); WT_INIT_LSN(cl->next_lsn); diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index e54ed0ff8e7..ac95032748d 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -277,8 +277,6 @@ __evict_review( WT_PAGE_MODIFY *mod; uint32_t reconcile_flags; - reconcile_flags = WT_EVICTING; - /* * Get exclusive access to the page if our caller doesn't have the tree * locked down. @@ -347,6 +345,7 @@ __evict_review( * Don't set the update-restore flag for internal pages, they don't have * updates that can be saved and restored. */ + reconcile_flags = WT_EVICTING; if (__wt_page_is_modified(page)) { if (LF_ISSET(WT_EVICT_EXCLUSIVE)) FLD_SET(reconcile_flags, WT_SKIP_UPDATE_ERR); @@ -365,7 +364,7 @@ __evict_review( */ if (!LF_ISSET(WT_EVICT_EXCLUSIVE) && mod != NULL && !__wt_txn_visible_all(session, mod->rec_max_txn) && - !LF_ISSET(WT_SKIP_UPDATE_RESTORE)) + !FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE)) return (EBUSY); return (0); diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index ebdd5187dbc..ad1f4c096df 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -374,9 +374,8 @@ struct __wt_page { /* * Internal pages (both column- and row-store). * - * The page record number is only used by column-store, but it - * makes some things simpler and it doesn't cost us any memory, - * other structures in this union are still as large. + * The page record number is only used by column-store, but it's + * simpler having only one kind of internal page. * * In-memory internal pages have an array of pointers to child * structures, maintained in collated order. When a page is @@ -410,10 +409,24 @@ struct __wt_page { uint32_t entries; WT_REF **index; } * volatile __index; /* Collated children */ + + /* + * When splitting to deepen the tree, track the number + * of entries in the newly created parent, and how many + * subsequent splits follow the initial set of entries. + * If future splits into the page are generally after + * the initial set of items, perform future deepening + * splits in this page to optimize for an append-style + * workload. + */ + uint32_t deepen_split_append; + uint32_t deepen_split_last; } intl; #undef pg_intl_recno #define pg_intl_recno u.intl.recno #define pg_intl_parent_ref u.intl.parent_ref +#define pg_intl_deepen_split_append u.intl.deepen_split_append +#define pg_intl_deepen_split_last u.intl.deepen_split_last /* * Macros to copy/set the index because the name is obscured to ensure diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index 2db14e293ed..76a25639ffd 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -148,8 +148,9 @@ struct __wt_btree { #define WT_BTREE_IN_MEMORY 0x00200 /* Cache-resident object */ #define WT_BTREE_NO_EVICTION 0x00400 /* Disable eviction */ #define WT_BTREE_SALVAGE 0x00800 /* Handle is for salvage */ -#define WT_BTREE_UPGRADE 0x01000 /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x02000 /* Handle is for verify */ +#define WT_BTREE_SKIP_CKPT 0x01000 /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x02000 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x04000 /* Handle is for verify */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 2ff89c1bdd5..06d41b89036 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1136,10 +1136,10 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) * between. */ locked = WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED); - WT_TRET(__wt_hazard_clear(session, page)); - if (!locked) { - WT_TRET(EBUSY); - return (ret); + if ((ret = __wt_hazard_clear(session, page)) != 0 || !locked) { + if (locked) + ref->state = WT_REF_MEM; + return (ret == 0 ? EBUSY : ret); } (void)WT_ATOMIC_ADD4(btree->evict_busy, 1); @@ -1156,9 +1156,9 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) */ WT_STAT_FAST_CONN_INCR( session, cache_eviction_force_delete); - } else { + } else WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail); - } + (void)WT_ATOMIC_SUB4(btree->evict_busy, 1); return (ret); @@ -1180,7 +1180,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * Discard our hazard pointer. Ignore pages we don't have and the root * page, which sticks in memory, regardless. */ - if (ref == NULL || __wt_ref_is_root(ref)) + if (ref == NULL || ref->page == NULL || __wt_ref_is_root(ref)) return (0); /* @@ -1205,8 +1205,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) page = ref->page; if (F_ISSET(btree, WT_BTREE_NO_EVICTION) || LF_ISSET(WT_READ_NO_EVICT) || - page->read_gen != WT_READGEN_OLDEST || !__wt_page_can_evict( - session, page, WT_EVICT_CHECK_SPLITS, NULL)) + page->read_gen != WT_READGEN_OLDEST || + !__wt_page_can_evict(session, page, WT_EVICT_CHECK_SPLITS, NULL)) return (__wt_hazard_clear(session, page)); WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); diff --git a/src/third_party/wiredtiger/src/include/config.h b/src/third_party/wiredtiger/src/include/config.h index 8ec9048e861..161eea5bc80 100644 --- a/src/third_party/wiredtiger/src/include/config.h +++ b/src/third_party/wiredtiger/src/include/config.h @@ -74,18 +74,20 @@ struct __wt_config_parser_impl { #define WT_CONFIG_ENTRY_WT_SESSION_rename 22 #define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 23 #define WT_CONFIG_ENTRY_WT_SESSION_salvage 24 -#define WT_CONFIG_ENTRY_WT_SESSION_strerror 25 -#define WT_CONFIG_ENTRY_WT_SESSION_truncate 26 -#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 27 -#define WT_CONFIG_ENTRY_WT_SESSION_verify 28 -#define WT_CONFIG_ENTRY_colgroup_meta 29 -#define WT_CONFIG_ENTRY_file_meta 30 -#define WT_CONFIG_ENTRY_index_meta 31 -#define WT_CONFIG_ENTRY_table_meta 32 -#define WT_CONFIG_ENTRY_wiredtiger_open 33 -#define WT_CONFIG_ENTRY_wiredtiger_open_all 34 -#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 35 -#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 36 +#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 25 +#define WT_CONFIG_ENTRY_WT_SESSION_strerror 26 +#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 27 +#define WT_CONFIG_ENTRY_WT_SESSION_truncate 28 +#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 29 +#define WT_CONFIG_ENTRY_WT_SESSION_verify 30 +#define WT_CONFIG_ENTRY_colgroup_meta 31 +#define WT_CONFIG_ENTRY_file_meta 32 +#define WT_CONFIG_ENTRY_index_meta 33 +#define WT_CONFIG_ENTRY_table_meta 34 +#define WT_CONFIG_ENTRY_wiredtiger_open 35 +#define WT_CONFIG_ENTRY_wiredtiger_open_all 36 +#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 37 +#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 38 /* * configuration section: END * DO NOT EDIT: automatically built by dist/flags.py. diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index a039edb7493..209eabea91c 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -330,10 +330,10 @@ struct __wt_connection_impl { WT_SESSION_IMPL *log_session; /* Log server session */ wt_thread_t log_tid; /* Log server thread */ int log_tid_set; /* Log server thread set */ - WT_CONDVAR *log_close_cond;/* Log close thread wait mutex */ - WT_SESSION_IMPL *log_close_session;/* Log close thread session */ - wt_thread_t log_close_tid; /* Log close thread thread */ - int log_close_tid_set;/* Log close thread set */ + WT_CONDVAR *log_file_cond; /* Log file thread wait mutex */ + WT_SESSION_IMPL *log_file_session;/* Log file thread session */ + wt_thread_t log_file_tid; /* Log file thread thread */ + int log_file_tid_set;/* Log file thread set */ WT_CONDVAR *log_wrlsn_cond;/* Log write lsn thread wait mutex */ WT_SESSION_IMPL *log_wrlsn_session;/* Log write lsn thread session */ wt_thread_t log_wrlsn_tid; /* Log write lsn thread thread */ diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 57c19f50417..780198b4bb0 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -105,6 +105,7 @@ __curfile_enter(WT_CURSOR_BTREE *cbt) static inline int __curfile_leave(WT_CURSOR_BTREE *cbt) { + WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)cbt->iface.session; @@ -125,13 +126,16 @@ __curfile_leave(WT_CURSOR_BTREE *cbt) cbt->page_deleted_count = 0; /* - * Release any page references we're holding. This can trigger - * eviction (e.g., forced eviction of big pages), so it is important to - * do it after releasing our snapshot above. + * Release any page references we're holding. This can trigger eviction + * (e.g., forced eviction of big pages), so it's important to do after + * releasing our snapshot above. + * + * Clear the reference regardless, so we don't try the release twice. */ - WT_RET(__wt_page_release(session, cbt->ref, 0)); + ret = __wt_page_release(session, cbt->ref, 0); cbt->ref = NULL; - return (0); + + return (ret); } /* diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 7406bd471e2..3d3c851daad 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -210,7 +210,7 @@ extern int __wt_conn_remove_compressor(WT_SESSION_IMPL *session); extern int __wt_conn_remove_data_source(WT_SESSION_IMPL *session); extern int __wt_encryptor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval, WT_CONFIG_ITEM *keyid, WT_CONFIG_ARG *cfg_arg, WT_KEYED_ENCRYPTOR **kencryptorp); extern int __wt_conn_remove_encryptor(WT_SESSION_IMPL *session); -extern int __wt_extractor_config(WT_SESSION_IMPL *session, const char *config, WT_EXTRACTOR **extractorp, int *ownp); +extern int __wt_extractor_config(WT_SESSION_IMPL *session, const char *uri, const char *config, WT_EXTRACTOR **extractorp, int *ownp); extern int __wt_conn_remove_extractor(WT_SESSION_IMPL *session); extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_cache_config(WT_SESSION_IMPL *session, int reconfigure, const char *cfg[]); @@ -315,6 +315,8 @@ extern void __wt_cache_dump(WT_SESSION_IMPL *session); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags); extern void __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); +extern int __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn); +extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn); extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec); extern void __wt_log_written_reset(WT_SESSION_IMPL *session); extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, int active_only); @@ -668,7 +670,7 @@ extern int __wt_txn_init(WT_SESSION_IMPL *session); extern void __wt_txn_stats_update(WT_SESSION_IMPL *session); extern void __wt_txn_destroy(WT_SESSION_IMPL *session); extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]); -extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session); +extern int __wt_txn_global_destroy(WT_SESSION_IMPL *session); extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len); extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); @@ -688,4 +690,9 @@ extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, int full, uint32_t extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop); extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session); extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out); +extern int __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_txn_named_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval); +extern int __wt_txn_named_snapshot_config(WT_SESSION_IMPL *session, const char *cfg[], int *has_create, int *has_drops); +extern int __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session); extern int __wt_txn_recover(WT_SESSION_IMPL *session); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 95aa6f9809d..faada258c09 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -30,9 +30,10 @@ #define WT_LOGSCAN_FROM_CKP 0x00000002 #define WT_LOGSCAN_ONE 0x00000004 #define WT_LOGSCAN_RECOVER 0x00000008 -#define WT_LOG_DSYNC 0x00000001 -#define WT_LOG_FLUSH 0x00000002 -#define WT_LOG_FSYNC 0x00000004 +#define WT_LOG_BACKGROUND 0x00000001 +#define WT_LOG_DSYNC 0x00000002 +#define WT_LOG_FLUSH 0x00000004 +#define WT_LOG_FSYNC 0x00000008 #define WT_READ_CACHE 0x00000001 #define WT_READ_COMPACT 0x00000002 #define WT_READ_NO_EVICT 0x00000004 @@ -65,7 +66,7 @@ #define WT_SYNC_DISCARD 0x00000004 #define WT_SYNC_DISCARD_FORCE 0x00000008 #define WT_SYNC_WRITE_LEAVES 0x00000010 -#define WT_TXN_LOG_CKPT_FAIL 0x00000001 +#define WT_TXN_LOG_CKPT_CLEANUP 0x00000001 #define WT_TXN_LOG_CKPT_PREPARE 0x00000002 #define WT_TXN_LOG_CKPT_START 0x00000004 #define WT_TXN_LOG_CKPT_STOP 0x00000008 diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h index 2efbb20b39a..cb9d3d5e212 100644 --- a/src/third_party/wiredtiger/src/include/gcc.h +++ b/src/third_party/wiredtiger/src/include/gcc.h @@ -87,21 +87,25 @@ * To avoid locking shared data structures such as statistics and to permit * atomic state changes, we rely on the WT_ATOMIC_ADD and WT_ATOMIC_CAS * (compare and swap) operations. - * - * Note that we avoid __sync_bool_compare_and_swap due to problems with - * optimization with some versions of clang. See - * http://llvm.org/bugs/show_bug.cgi?id=21499 for details. */ #define __WT_ATOMIC_ADD(v, val, n) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_add_and_fetch(&(v), val)) #define __WT_ATOMIC_FETCH_ADD(v, val, n) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_fetch_and_add(&(v), val)) +#ifdef __clang__ +/* + * We avoid __sync_bool_compare_and_swap with due to problems with + * optimization with some versions of clang. See + * http://llvm.org/bugs/show_bug.cgi?id=21499 for details. + */ #define __WT_ATOMIC_CAS(v, old, new, n) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), \ __sync_val_compare_and_swap(&(v), old, new) == (old)) -#define __WT_ATOMIC_CAS_VAL(v, old, new, n) \ +#else +#define __WT_ATOMIC_CAS(v, old, new, n) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), \ - __sync_val_compare_and_swap(&(v), old, new)) + __sync_bool_compare_and_swap(&(v), old, new)) +#endif #define __WT_ATOMIC_STORE(v, val, n) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), \ __sync_lock_test_and_set(&(v), val)) @@ -111,28 +115,24 @@ #define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1) #define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 1) #define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1) -#define WT_ATOMIC_CAS_VAL1(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 1) #define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1) #define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1) #define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2) #define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 2) #define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new, 2) -#define WT_ATOMIC_CAS_VAL2(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 2) #define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2) #define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2) #define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4) #define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4) #define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4) -#define WT_ATOMIC_CAS_VAL4(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 4) #define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4) #define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4) #define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8) #define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 8) #define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new, 8) -#define WT_ATOMIC_CAS_VAL8(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 8) #define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val, 8) #define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8) diff --git a/src/third_party/wiredtiger/src/include/lint.h b/src/third_party/wiredtiger/src/include/lint.h index 631f00cb5cd..964aa5c118f 100644 --- a/src/third_party/wiredtiger/src/include/lint.h +++ b/src/third_party/wiredtiger/src/include/lint.h @@ -24,8 +24,6 @@ ((v) += (val), (v)) #define __WT_ATOMIC_CAS(v, old, new) \ ((v) = ((v) == (old) ? (new) : (old)), (v) == (old)) -#define __WT_ATOMIC_CAS_VAL(v, old, new) \ - ((v) = ((v) == (old) ? (new) : (old)), (v) == (old)) #define __WT_ATOMIC_STORE(v, val) \ ((v) = (val)) #define __WT_ATOMIC_SUB(v, val) \ @@ -34,28 +32,24 @@ #define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val) #define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val) #define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new) -#define WT_ATOMIC_CAS_VAL1(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new) #define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val) #define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val) #define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val) #define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val) #define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new) -#define WT_ATOMIC_CAS_VAL2(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new) #define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val) #define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val) #define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val) #define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val) #define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new) -#define WT_ATOMIC_CAS_VAL4(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new) #define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val) #define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val) #define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val) #define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val) #define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new) -#define WT_ATOMIC_CAS_VAL8(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new) #define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val) #define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val) diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index b11d514d99a..3de72b8b9a6 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -39,12 +39,12 @@ * ever changes. The value is the following: * txnid, record type, operation type, file id, operation key, operation value */ -#define LOGC_KEY_FORMAT WT_UNCHECKED_STRING(IqI) -#define LOGC_VALUE_FORMAT WT_UNCHECKED_STRING(qIIIuu) +#define WT_LOGC_KEY_FORMAT WT_UNCHECKED_STRING(IqI) +#define WT_LOGC_VALUE_FORMAT WT_UNCHECKED_STRING(qIIIuu) -#define LOG_SKIP_HEADER(data) \ +#define WT_LOG_SKIP_HEADER(data) \ ((const uint8_t *)(data) + offsetof(WT_LOG_RECORD, record)) -#define LOG_REC_SIZE(size) \ +#define WT_LOG_REC_SIZE(size) \ ((size) - offsetof(WT_LOG_RECORD, record)) /* @@ -68,6 +68,9 @@ * WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker. * WT_LOG_SLOT_READY - slot is ready for threads to join. * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot. + * + * The slot state must be volatile: threads loop checking the state and can't + * cache the first value they see. */ #define WT_LOG_SLOT_DONE 0 #define WT_LOG_SLOT_FREE 1 @@ -75,10 +78,10 @@ #define WT_LOG_SLOT_WRITTEN 3 #define WT_LOG_SLOT_READY 4 typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { - int64_t slot_state; /* Slot state */ + volatile int64_t slot_state; /* Slot state */ uint64_t slot_group_size; /* Group size */ int32_t slot_error; /* Error value */ -#define SLOT_INVALID_INDEX 0xffffffff +#define WT_SLOT_INVALID_INDEX 0xffffffff uint32_t slot_index; /* Active slot index */ wt_off_t slot_start_offset; /* Starting file offset */ WT_LSN slot_release_lsn; /* Slot release LSN */ @@ -88,15 +91,15 @@ typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { WT_ITEM slot_buf; /* Buffer for grouped writes */ int32_t slot_churn; /* Active slots are scarce. */ -#define SLOT_BUF_GROW 0x01 /* Grow buffer on release */ -#define SLOT_BUFFERED 0x02 /* Buffer writes */ -#define SLOT_CLOSEFH 0x04 /* Close old fh on release */ -#define SLOT_SYNC 0x08 /* Needs sync on release */ -#define SLOT_SYNC_DIR 0x10 /* Directory sync on release */ +#define WT_SLOT_BUF_GROW 0x01 /* Grow buffer on release */ +#define WT_SLOT_BUFFERED 0x02 /* Buffer writes */ +#define WT_SLOT_CLOSEFH 0x04 /* Close old fh on release */ +#define WT_SLOT_SYNC 0x08 /* Needs sync on release */ +#define WT_SLOT_SYNC_DIR 0x10 /* Directory sync on release */ uint32_t flags; /* Flags */ } WT_LOGSLOT; -#define SLOT_INIT_FLAGS (SLOT_BUFFERED) +#define WT_SLOT_INIT_FLAGS (WT_SLOT_BUFFERED) typedef struct { WT_LOGSLOT *slot; @@ -104,7 +107,7 @@ typedef struct { } WT_MYSLOT; /* Offset of first record */ -#define LOG_FIRST_RECORD log->allocsize +#define WT_LOG_FIRST_RECORD log->allocsize typedef struct { uint32_t allocsize; /* Allocation alignment size */ @@ -123,6 +126,7 @@ typedef struct { * System LSNs */ WT_LSN alloc_lsn; /* Next LSN for allocation */ + WT_LSN bg_sync_lsn; /* Latest background sync LSN */ WT_LSN ckpt_lsn; /* Last checkpoint LSN */ WT_LSN first_lsn; /* First LSN */ WT_LSN sync_dir_lsn; /* LSN of the last directory sync */ @@ -147,16 +151,16 @@ typedef struct { /* * Consolidation array information - * SLOT_ACTIVE must be less than SLOT_POOL. + * WT_SLOT_ACTIVE must be less than WT_SLOT_POOL. * Our testing shows that the more consolidation we generate the * better the performance we see which equates to an active slot * slot count of one. */ -#define SLOT_ACTIVE 1 -#define SLOT_POOL 16 +#define WT_SLOT_ACTIVE 1 +#define WT_SLOT_POOL 16 uint32_t pool_index; /* Global pool index */ - WT_LOGSLOT *slot_array[SLOT_ACTIVE]; /* Active slots */ - WT_LOGSLOT slot_pool[SLOT_POOL]; /* Pool of all slots */ + WT_LOGSLOT *slot_array[WT_SLOT_ACTIVE]; /* Active slots */ + WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ #define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */ uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/msvc.h b/src/third_party/wiredtiger/src/include/msvc.h index fa5b2d848e8..bc72ddf8193 100644 --- a/src/third_party/wiredtiger/src/include/msvc.h +++ b/src/third_party/wiredtiger/src/include/msvc.h @@ -41,9 +41,6 @@ (WT_STATIC_ASSERT(sizeof(v) == (n)), \ _InterlockedCompareExchange ## s \ ((t*)&(v), (t)(new), (t)(old)) == (t)(old)) -#define __WT_ATOMIC_CAS_VAL(v, old, new, n, s, t) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), \ - _InterlockedCompareExchange ## s((t*)&(v), (t)(new), (t)(old))) #define __WT_ATOMIC_STORE(v, val, n, s, t) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), \ _InterlockedExchange ## s((t*)&(v), (t)(val))) @@ -55,8 +52,6 @@ #define WT_ATOMIC_FETCH_ADD1(v, val) \ __WT_ATOMIC_FETCH_ADD(v, val, 1, 8, char) #define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1, 8, char) -#define WT_ATOMIC_CAS_VAL1(v, old, new) \ - __WT_ATOMIC_CAS_VAL(v, old, new, 1, 8, char) #define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1, 8, char) #define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1, 8, char) @@ -65,16 +60,12 @@ __WT_ATOMIC_FETCH_ADD(v, val, 2, 16, short) #define WT_ATOMIC_CAS2(v, old, new) \ __WT_ATOMIC_CAS(v, old, new, 2, 16, short) -#define WT_ATOMIC_CAS_VAL2(v, old, new) \ - __WT_ATOMIC_CAS_VAL(v, old, new, 2, 16, short) #define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2, 16, short) #define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2, 16, short) #define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4, , long) #define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4, , long) #define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4, , long) -#define WT_ATOMIC_CAS_VAL4(v, old, new) \ - __WT_ATOMIC_CAS_VAL(v, old, new, 4, , long) #define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4, , long) #define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4, , long) @@ -83,8 +74,6 @@ __WT_ATOMIC_FETCH_ADD(v, val, 8, 64, __int64) #define WT_ATOMIC_CAS8(v, old, new) \ __WT_ATOMIC_CAS(v, old, new, 8, 64, __int64) -#define WT_ATOMIC_CAS_VAL8(v, old, new) \ - __WT_ATOMIC_CAS_VAL(v, old, new, 8, 64, __int64) #define WT_ATOMIC_STORE8(v, val) \ __WT_ATOMIC_STORE(v, val, 8, 64, __int64) #define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8, 64, __int64) diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index daa47d6e776..8a8b229dbc0 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -112,6 +112,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { WT_TXN_ISOLATION isolation; WT_TXN txn; /* Transaction state */ + WT_LSN bg_sync_lsn; /* Background sync operation LSN. */ u_int ncursors; /* Count of active file cursors. */ void *block_manager; /* Block-manager support */ diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 85d66b97e11..728c8c9fe8e 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -273,6 +273,7 @@ struct __wt_connection_stats { WT_STATS txn_pinned_checkpoint_range; WT_STATS txn_pinned_range; WT_STATS txn_rollback; + WT_STATS txn_sync; WT_STATS write_io; }; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 9f600ac95c1..f44a02cc332 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -25,6 +25,16 @@ #define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id]) +struct __wt_named_snapshot { + const char *name; + + STAILQ_ENTRY(__wt_named_snapshot) q; + + uint64_t snap_min, snap_max; + uint64_t *snapshot; + uint32_t snapshot_count; +}; + struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state { volatile uint64_t id; volatile uint64_t snap_min; @@ -56,6 +66,11 @@ struct __wt_txn_global { volatile uint64_t checkpoint_id; volatile uint64_t checkpoint_snap_min; + /* Named snapshot state. */ + WT_RWLOCK *nsnap_rwlock; + volatile uint64_t nsnap_oldest_id; + STAILQ_HEAD(__wt_nsnap_qh, __wt_named_snapshot) nsnaph; + WT_TXN_STATE *states; /* Per-session transaction states */ }; @@ -144,6 +159,9 @@ struct __wt_txn { #define WT_TXN_ERROR 0x02 #define WT_TXN_HAS_ID 0x04 #define WT_TXN_HAS_SNAPSHOT 0x08 -#define WT_TXN_RUNNING 0x10 +#define WT_TXN_NAMED_SNAPSHOT 0x10 +#define WT_TXN_READONLY 0x20 +#define WT_TXN_RUNNING 0x40 +#define WT_TXN_SYNC_SET 0x80 uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index b06062fc483..bcb2d084df8 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -64,6 +64,13 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd) { WT_DECL_RET; WT_TXN_OP *op; + WT_TXN *txn; + + txn = &session->txn; + + if (F_ISSET(txn, WT_TXN_READONLY)) + WT_RET_MSG(session, WT_ROLLBACK, + "Attempt to update in a read only transaction"); WT_RET(__txn_next_op(session, &op)); op->type = F_ISSET(session, WT_SESSION_LOGGING_INMEM) ? @@ -89,13 +96,12 @@ __wt_txn_modify_ref(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * __wt_txn_visible_all -- - * Check if a given transaction ID is "globally visible". This is, if - * all sessions in the system will see the transaction ID including the - * ID that belongs to a running checkpoint. + * __wt_txn_oldest_id -- + * Return the oldest transaction ID that has to be kept for the current + * tree. */ -static inline int -__wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id) +static inline uint64_t +__wt_txn_oldest_id(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_TXN_GLOBAL *txn_global; @@ -125,6 +131,22 @@ __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id) */ oldest_id = checkpoint_snap_min; + return (oldest_id); +} + +/* + * __wt_txn_visible_all -- + * Check if a given transaction ID is "globally visible". This is, if + * all sessions in the system will see the transaction ID including the + * ID that belongs to a running checkpoint. + */ +static inline int +__wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id) +{ + uint64_t oldest_id; + + oldest_id = __wt_txn_oldest_id(session); + return (WT_TXNID_LT(id, oldest_id)); } @@ -219,7 +241,12 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) if (cfg != NULL) WT_RET(__wt_txn_config(session, cfg)); - if (txn->isolation == WT_ISO_SNAPSHOT) { + /* + * Allocate a snapshot if required. Named snapshot transactions already + * have an ID setup. + */ + if (txn->isolation == WT_ISO_SNAPSHOT && + !F_ISSET(txn, WT_TXN_NAMED_SNAPSHOT)) { if (session->ncursors > 0) WT_RET(__wt_session_copy_values(session)); diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index a6f80039c10..fe27c4f1c62 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -1389,6 +1389,8 @@ struct __wt_session { * @config{priority, priority of the transaction for resolving * conflicts. Transactions with higher values are less likely to * abort., an integer between -100 and 100; default \c 0.} + * @config{snapshot, use a named\, in-memory snapshot\, see @ref + * transaction_named_snapshots., a string; default empty.} * @config{sync, whether to sync log records when the transaction * commits\, inherited from ::wiredtiger_open \c transaction_sync., a * boolean flag; default empty.} @@ -1408,7 +1410,12 @@ struct __wt_session { * @snippet ex_all.c transaction commit/rollback * * @param session the session handle - * @configempty{WT_SESSION.commit_transaction, see dist/api_data.py} + * @configstart{WT_SESSION.commit_transaction, see dist/api_data.py} + * @config{sync, override whether to sync log records when the + * transaction commits\, inherited from ::wiredtiger_open \c + * transaction_sync., a string\, chosen from the following options: \c + * "background"\, \c "off"\, \c "on"; default empty.} + * @configend * @errors */ int __F(commit_transaction)(WT_SESSION *session, const char *config); @@ -1459,6 +1466,36 @@ struct __wt_session { int __F(checkpoint)(WT_SESSION *session, const char *config); /*! + * Manage named snapshot transactions. Use this API to create and drop + * named snapshots. Named snapshot transactions can be accessed via + * WT_CURSOR::open. See @ref transaction_named_snapshots. + * + * @snippet ex_all.c Snapshot examples + * + * @param session the session handle + * @configstart{WT_SESSION.snapshot, see dist/api_data.py} + * @config{drop = (, if non-empty\, specifies which snapshots to drop. + * Where a group of snapshots are being dropped\, the order is based on + * snapshot creation order not alphanumeric name order., a set of + * related configuration options defined below.} + * @config{ all, drop all named snapshots., a + * boolean flag; default \c false.} + * @config{ before, drop all snapshots up to but + * not including the specified name., a string; default empty.} + * @config{ names, drop specific named + * snapshots., a list of strings; default empty.} + * @config{ to, drop all snapshots up to and + * including the specified name., a string; default empty.} + * @config{ + * ),,} + * @config{name, specify a name for the snapshot., a string; default + * empty.} + * @configend + * @errors + */ + int __F(snapshot)(WT_SESSION *session, const char *config); + + /*! * Return the transaction ID range pinned by the session handle. * * The ID range is approximate and is calculated based on the oldest @@ -1474,6 +1511,24 @@ struct __wt_session { */ int __F(transaction_pinned_range)(WT_SESSION* session, uint64_t *range); + /*! + * Wait for a transaction to become synchronized. This method is + * only useful when ::wiredtiger_open is configured with the + * \c transaction_sync setting disabled. This method must be called + * when no transactions are active in the session. + * + * @snippet ex_all.c Transaction sync + * + * @param session the session handle + * @configstart{WT_SESSION.transaction_sync, see dist/api_data.py} + * @config{timeout_ms, maximum amount of time to wait for background + * sync to complete in milliseconds. A value of zero disables the + * timeout and returns immediately. The default waits forever., an + * integer; default \c .} + * @configend + * @errors + */ + int __F(transaction_sync)(WT_SESSION *session, const char *config); /*! @} */ }; @@ -3730,8 +3785,10 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_TXN_PINNED_RANGE 1139 /*! transaction: transactions rolled back */ #define WT_STAT_CONN_TXN_ROLLBACK 1140 +/*! transaction: transaction sync calls */ +#define WT_STAT_CONN_TXN_SYNC 1141 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1141 +#define WT_STAT_CONN_WRITE_IO 1142 /*! * @} diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 6a5c04d9c7c..b876a2d032d 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -214,6 +214,8 @@ struct __wt_named_encryptor; typedef struct __wt_named_encryptor WT_NAMED_ENCRYPTOR; struct __wt_named_extractor; typedef struct __wt_named_extractor WT_NAMED_EXTRACTOR; +struct __wt_named_snapshot; + typedef struct __wt_named_snapshot WT_NAMED_SNAPSHOT; struct __wt_ovfl_reuse; typedef struct __wt_ovfl_reuse WT_OVFL_REUSE; struct __wt_ovfl_track; diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 71312849036..8bfb42821af 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -34,6 +34,75 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) } /* + * __wt_log_background -- + * Record the given LSN as the background LSN and signal the + * thread as needed. + */ +int +__wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + session->bg_sync_lsn = *lsn; + + /* + * Advance the logging subsystem background sync LSN if + * needed. + */ + __wt_spin_lock(session, &log->log_sync_lock); + if (WT_LOG_CMP(lsn, &log->bg_sync_lsn) > 0) + log->bg_sync_lsn = *lsn; + __wt_spin_unlock(session, &log->log_sync_lock); + return (__wt_cond_signal(session, conn->log_file_cond)); +} + +/* + * __wt_log_force_sync -- + * Force a sync of the log and files. + */ +int +__wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) +{ + WT_LOG *log; + WT_DECL_RET; + + log = S2C(session)->log; + + __wt_spin_lock(session, &log->log_sync_lock); + WT_ASSERT(session, log->log_dir_fh != NULL); + /* + * Sync the directory if the log file entry hasn't been written + * into the directory. + */ + if (log->sync_dir_lsn.file < min_lsn->file) { + WT_ERR(__wt_verbose(session, WT_VERB_LOG, + "log_force_sync: sync directory %s", + log->log_dir_fh->name)); + WT_ERR(__wt_directory_sync_fh(session, log->log_dir_fh)); + log->sync_dir_lsn = *min_lsn; + WT_STAT_FAST_CONN_INCR(session, log_sync_dir); + } + /* + * Sync the log file if needed. + */ + if (WT_LOG_CMP(&log->sync_lsn, min_lsn) < 0) { + WT_ERR(__wt_verbose(session, WT_VERB_LOG, + "log_force_sync: sync to LSN %d/%lu", + min_lsn->file, min_lsn->offset)); + WT_ERR(__wt_fsync(session, log->log_fh)); + log->sync_lsn = *min_lsn; + WT_STAT_FAST_CONN_INCR(session, log_sync); + WT_ERR(__wt_cond_signal(session, log->log_sync_cond)); + } +err: + __wt_spin_unlock(session, &log->log_sync_lock); + return (ret); +} + +/* * __wt_log_needs_recovery -- * Return 0 if we encounter a clean shutdown and 1 if recovery * must be run in the given variable. @@ -259,9 +328,9 @@ __log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh) ret = 0; if (fh->fallocate_available == WT_FALLOCATE_NOT_AVAILABLE || (ret = __wt_fallocate(session, fh, - LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP) + WT_LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP) ret = __wt_ftruncate(session, fh, - LOG_FIRST_RECORD + conn->log_file_max); + WT_LOG_FIRST_RECORD + conn->log_file_max); return (ret); } @@ -303,7 +372,7 @@ __log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) if (!__log_size_fit(session, &log->alloc_lsn, recsize)) { WT_RET(__wt_log_newfile(session, 0, &created_log)); if (log->log_close_fh != NULL) - F_SET(slot, SLOT_CLOSEFH); + F_SET(slot, WT_SLOT_CLOSEFH); } /* @@ -329,7 +398,7 @@ __log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) * Pre-allocate on the first real write into the log file, if it * was just created (i.e. not pre-allocated). */ - if (log->alloc_lsn.offset == LOG_FIRST_RECORD && created_log) + if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log) WT_RET(__log_prealloc(session, log->log_fh)); log->alloc_lsn.offset += (wt_off_t)recsize; @@ -670,7 +739,7 @@ __log_truncate(WT_SESSION_IMPL *session, * truncate them to the end of the log file header. */ WT_ERR(__wt_ftruncate(session, - log_fh, LOG_FIRST_RECORD)); + log_fh, WT_LOG_FIRST_RECORD)); WT_ERR(__wt_fsync(session, log_fh)); WT_ERR(__wt_close(session, &log_fh)); } @@ -716,7 +785,7 @@ __wt_log_allocfile( */ WT_ERR(__log_openfile(session, 1, &log_fh, WT_LOG_TMPNAME, lognum)); WT_ERR(__log_file_header(session, log_fh, NULL, 1)); - WT_ERR(__wt_ftruncate(session, log_fh, LOG_FIRST_RECORD)); + WT_ERR(__wt_ftruncate(session, log_fh, WT_LOG_FIRST_RECORD)); if (prealloc) WT_ERR(__log_prealloc(session, log_fh)); WT_ERR(__wt_fsync(session, log_fh)); @@ -991,7 +1060,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) *freep = 1; /* Write the buffered records */ - if (F_ISSET(slot, SLOT_BUFFERED)) { + if (F_ISSET(slot, WT_SLOT_BUFFERED)) { write_size = (size_t) (slot->slot_end_lsn.offset - slot->slot_start_offset); WT_ERR(__wt_write(session, slot->slot_fh, @@ -1005,8 +1074,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * off to the worker thread. The caller is responsible for freeing * the slot in that case. Otherwise the worker thread will free it. */ - if (F_ISSET(slot, SLOT_BUFFERED) && - !F_ISSET(slot, SLOT_SYNC | SLOT_SYNC_DIR)) { + if (F_ISSET(slot, WT_SLOT_BUFFERED) && + !F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { *freep = 0; slot->slot_state = WT_LOG_SLOT_WRITTEN; /* @@ -1036,15 +1105,15 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) /* * Signal the close thread if needed. */ - if (F_ISSET(slot, SLOT_CLOSEFH)) - WT_ERR(__wt_cond_signal(session, conn->log_close_cond)); + if (F_ISSET(slot, WT_SLOT_CLOSEFH)) + WT_ERR(__wt_cond_signal(session, conn->log_file_cond)); /* * Try to consolidate calls to fsync to wait less. Acquire a spin lock * so that threads finishing writing to the log will wait while the * current fsync completes and advance log->sync_lsn. */ - while (F_ISSET(slot, SLOT_SYNC | SLOT_SYNC_DIR)) { + while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { /* * We have to wait until earlier log files have finished their * sync operations. The most recent one will set the LSN to the @@ -1069,7 +1138,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * not yet stable in its parent directory. Do that * now if needed. */ - if (F_ISSET(slot, SLOT_SYNC_DIR) && + if (F_ISSET(slot, WT_SLOT_SYNC_DIR) && (log->sync_dir_lsn.file < sync_lsn.file)) { WT_ASSERT(session, log->log_dir_fh != NULL); WT_ERR(__wt_verbose(session, WT_VERB_LOG, @@ -1084,7 +1153,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) /* * Sync the log file if needed. */ - if (F_ISSET(slot, SLOT_SYNC) && + if (F_ISSET(slot, WT_SLOT_SYNC) && WT_LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_release: sync log %s", log->log_fh->name)); @@ -1096,7 +1165,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) /* * Clear the flags before leaving the loop. */ - F_CLR(slot, SLOT_SYNC | SLOT_SYNC_DIR); + F_CLR(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR); locked = 0; __wt_spin_unlock(session, &log->log_sync_lock); break; @@ -1174,7 +1243,7 @@ __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created) * the end of the header. */ log->alloc_lsn.file = log->fileid; - log->alloc_lsn.offset = LOG_FIRST_RECORD; + log->alloc_lsn.offset = WT_LOG_FIRST_RECORD; end_lsn = log->alloc_lsn; /* @@ -1467,8 +1536,6 @@ err: WT_STAT_FAST_CONN_INCR(session, log_scans); */ if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0) ret = WT_NOTFOUND; - if (ret == ENOENT) - ret = 0; WT_TRET(__wt_close(session, &log_fh)); return (ret); } @@ -1499,9 +1566,9 @@ __log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, locked = 1; if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) - F_SET(&tmp, SLOT_SYNC_DIR); + F_SET(&tmp, WT_SLOT_SYNC_DIR); if (LF_ISSET(WT_LOG_FSYNC)) - F_SET(&tmp, SLOT_SYNC); + F_SET(&tmp, WT_SLOT_SYNC); WT_ERR(__log_acquire(session, record->size, &tmp)); __wt_spin_unlock(session, &log->log_slot_lock); locked = 0; @@ -1697,8 +1764,16 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ret = __log_direct_write(session, record, &lsn, flags); if (ret == 0 && lsnp != NULL) *lsnp = lsn; - if (ret == 0) - return (0); + /* + * All needed syncing will be handled directly except + * a background sync. Handle that here. + */ + if (ret == 0) { + if (LF_ISSET(WT_LOG_BACKGROUND)) + goto bg; + else + return (0); + } if (ret != EAGAIN) WT_ERR(ret); /* @@ -1761,8 +1836,15 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, (void)__wt_cond_wait( session, log->log_write_cond, 10000); } -err: - if (locked) + + /* + * Advance the background sync LSN if needed. + */ +bg: if (LF_ISSET(WT_LOG_BACKGROUND) && + WT_LOG_CMP(&session->bg_sync_lsn, &lsn) <= 0) + WT_ERR(__wt_log_background(session, &lsn)); + +err: if (locked) __wt_spin_unlock(session, &log->log_slot_lock); if (ret == 0 && lsnp != NULL) *lsnp = lsn; @@ -1776,6 +1858,7 @@ err: if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 && myslot.slot != NULL) ret = myslot.slot->slot_error; + return (ret); } diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c index bd830687df2..4f1dd1c0baf 100644 --- a/src/third_party/wiredtiger/src/log/log_auto.c +++ b/src/third_party/wiredtiger/src/log/log_auto.c @@ -122,6 +122,7 @@ int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) { + WT_DECL_RET; uint32_t fileid; uint64_t recno; WT_ITEM value; @@ -131,13 +132,17 @@ __wt_logop_col_put_print( WT_RET(__wt_logop_col_put_unpack( session, pp, end, &fileid, &recno, &value)); - fprintf(out, " \"optype\": \"col_put\",\n"); - fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); - fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno); - WT_RET(__logrec_jsonify_str(session, &escaped, &value)); - fprintf(out, " \"value\": \"%s\"", escaped); - __wt_free(session, escaped); - return (0); + WT_RET(__wt_fprintf(out, " \"optype\": \"col_put\",\n")); + WT_ERR(__wt_fprintf(out, + " \"fileid\": \"%" PRIu32 "\",\n", fileid)); + WT_ERR(__wt_fprintf(out, + " \"recno\": \"%" PRIu64 "\",\n", recno)); + WT_ERR(__logrec_jsonify_str(session, &escaped, &value)); + WT_ERR(__wt_fprintf(out, + " \"value\": \"%s\"", escaped)); + +err: __wt_free(session, escaped); + return (ret); } int @@ -190,9 +195,11 @@ __wt_logop_col_remove_print( WT_RET(__wt_logop_col_remove_unpack( session, pp, end, &fileid, &recno)); - fprintf(out, " \"optype\": \"col_remove\",\n"); - fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); - fprintf(out, " \"recno\": \"%" PRIu64 "\"", recno); + WT_RET(__wt_fprintf(out, " \"optype\": \"col_remove\",\n")); + WT_RET(__wt_fprintf(out, + " \"fileid\": \"%" PRIu32 "\",\n", fileid)); + WT_RET(__wt_fprintf(out, + " \"recno\": \"%" PRIu64 "\"", recno)); return (0); } @@ -247,10 +254,13 @@ __wt_logop_col_truncate_print( WT_RET(__wt_logop_col_truncate_unpack( session, pp, end, &fileid, &start, &stop)); - fprintf(out, " \"optype\": \"col_truncate\",\n"); - fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); - fprintf(out, " \"start\": \"%" PRIu64 "\",\n", start); - fprintf(out, " \"stop\": \"%" PRIu64 "\"", stop); + WT_RET(__wt_fprintf(out, " \"optype\": \"col_truncate\",\n")); + WT_RET(__wt_fprintf(out, + " \"fileid\": \"%" PRIu32 "\",\n", fileid)); + WT_RET(__wt_fprintf(out, + " \"start\": \"%" PRIu64 "\",\n", start)); + WT_RET(__wt_fprintf(out, + " \"stop\": \"%" PRIu64 "\"", stop)); return (0); } @@ -298,6 +308,7 @@ int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) { + WT_DECL_RET; uint32_t fileid; WT_ITEM key; WT_ITEM value; @@ -307,14 +318,18 @@ __wt_logop_row_put_print( WT_RET(__wt_logop_row_put_unpack( session, pp, end, &fileid, &key, &value)); - fprintf(out, " \"optype\": \"row_put\",\n"); - fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); - WT_RET(__logrec_jsonify_str(session, &escaped, &key)); - fprintf(out, " \"key\": \"%s\",\n", escaped); - WT_RET(__logrec_jsonify_str(session, &escaped, &value)); - fprintf(out, " \"value\": \"%s\"", escaped); - __wt_free(session, escaped); - return (0); + WT_RET(__wt_fprintf(out, " \"optype\": \"row_put\",\n")); + WT_ERR(__wt_fprintf(out, + " \"fileid\": \"%" PRIu32 "\",\n", fileid)); + WT_ERR(__logrec_jsonify_str(session, &escaped, &key)); + WT_ERR(__wt_fprintf(out, + " \"key\": \"%s\",\n", escaped)); + WT_ERR(__logrec_jsonify_str(session, &escaped, &value)); + WT_ERR(__wt_fprintf(out, + " \"value\": \"%s\"", escaped)); + +err: __wt_free(session, escaped); + return (ret); } int @@ -361,6 +376,7 @@ int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) { + WT_DECL_RET; uint32_t fileid; WT_ITEM key; char *escaped; @@ -369,12 +385,15 @@ __wt_logop_row_remove_print( WT_RET(__wt_logop_row_remove_unpack( session, pp, end, &fileid, &key)); - fprintf(out, " \"optype\": \"row_remove\",\n"); - fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); - WT_RET(__logrec_jsonify_str(session, &escaped, &key)); - fprintf(out, " \"key\": \"%s\"", escaped); - __wt_free(session, escaped); - return (0); + WT_RET(__wt_fprintf(out, " \"optype\": \"row_remove\",\n")); + WT_ERR(__wt_fprintf(out, + " \"fileid\": \"%" PRIu32 "\",\n", fileid)); + WT_ERR(__logrec_jsonify_str(session, &escaped, &key)); + WT_ERR(__wt_fprintf(out, + " \"key\": \"%s\"", escaped)); + +err: __wt_free(session, escaped); + return (ret); } int @@ -421,6 +440,7 @@ int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) { + WT_DECL_RET; uint32_t fileid; WT_ITEM start; WT_ITEM stop; @@ -431,15 +451,20 @@ __wt_logop_row_truncate_print( WT_RET(__wt_logop_row_truncate_unpack( session, pp, end, &fileid, &start, &stop, &mode)); - fprintf(out, " \"optype\": \"row_truncate\",\n"); - fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid); - WT_RET(__logrec_jsonify_str(session, &escaped, &start)); - fprintf(out, " \"start\": \"%s\",\n", escaped); - WT_RET(__logrec_jsonify_str(session, &escaped, &stop)); - fprintf(out, " \"stop\": \"%s\",\n", escaped); - fprintf(out, " \"mode\": \"%" PRIu32 "\"", mode); - __wt_free(session, escaped); - return (0); + WT_RET(__wt_fprintf(out, " \"optype\": \"row_truncate\",\n")); + WT_ERR(__wt_fprintf(out, + " \"fileid\": \"%" PRIu32 "\",\n", fileid)); + WT_ERR(__logrec_jsonify_str(session, &escaped, &start)); + WT_ERR(__wt_fprintf(out, + " \"start\": \"%s\",\n", escaped)); + WT_ERR(__logrec_jsonify_str(session, &escaped, &stop)); + WT_ERR(__wt_fprintf(out, + " \"stop\": \"%s\",\n", escaped)); + WT_ERR(__wt_fprintf(out, + " \"mode\": \"%" PRIu32 "\"", mode)); + +err: __wt_free(session, escaped); + return (ret); } int diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index 02b3056be6f..45455b59e6b 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -35,15 +35,15 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) conn = S2C(session); log = conn->log; - for (i = 0; i < SLOT_POOL; i++) { + for (i = 0; i < WT_SLOT_POOL; i++) { log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE; - log->slot_pool[i].slot_index = SLOT_INVALID_INDEX; + log->slot_pool[i].slot_index = WT_SLOT_INVALID_INDEX; } /* * Set up the available slots from the pool the first time. */ - for (i = 0; i < SLOT_ACTIVE; i++) { + for (i = 0; i < WT_SLOT_ACTIVE; i++) { slot = &log->slot_pool[i]; slot->slot_index = (uint32_t)i; slot->slot_state = WT_LOG_SLOT_READY; @@ -54,13 +54,13 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) * Allocate memory for buffers now that the arrays are setup. Split * this out to make error handling simpler. */ - for (i = 0; i < SLOT_POOL; i++) { + for (i = 0; i < WT_SLOT_POOL; i++) { WT_ERR(__wt_buf_init(session, &log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE)); - F_SET(&log->slot_pool[i], SLOT_INIT_FLAGS); + F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); } WT_STAT_FAST_CONN_INCRV(session, - log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL); + log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * WT_SLOT_POOL); if (0) { err: while (--i >= 0) __wt_buf_free(session, &log->slot_pool[i].slot_buf); @@ -82,7 +82,7 @@ __wt_log_slot_destroy(WT_SESSION_IMPL *session) conn = S2C(session); log = conn->log; - for (i = 0; i < SLOT_POOL; i++) + for (i = 0; i < WT_SLOT_POOL; i++) __wt_buf_free(session, &log->slot_pool[i].slot_buf); return (0); } @@ -100,18 +100,29 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LOGSLOT *slot; - int64_t cur_state, new_state, old_state; + int64_t new_state, old_state; uint32_t allocated_slot, slot_grow_attempts; conn = S2C(session); log = conn->log; slot_grow_attempts = 0; find_slot: - allocated_slot = __wt_random(session->rnd) % SLOT_ACTIVE; + allocated_slot = WT_SLOT_ACTIVE == 1 ? 0 : + __wt_random(session->rnd) % WT_SLOT_ACTIVE; + /* + * Get the selected slot. Use a barrier to prevent the compiler from + * caching this read. + */ + WT_BARRIER(); slot = log->slot_array[allocated_slot]; - old_state = slot->slot_state; join_slot: /* + * Read the current slot state. Use a barrier to prevent the compiler + * from caching this read. + */ + WT_BARRIER(); + old_state = slot->slot_state; + /* * WT_LOG_SLOT_READY and higher means the slot is available for * joining. Any other state means it is in use and transitioning * from the active array. @@ -135,20 +146,18 @@ join_slot: * the slot for a buffer size increase and find another slot. */ if (new_state > (int64_t)slot->slot_buf.memsize) { - F_SET(slot, SLOT_BUF_GROW); + F_SET(slot, WT_SLOT_BUF_GROW); if (++slot_grow_attempts > 5) { WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall); return (ENOMEM); } goto find_slot; } - cur_state = WT_ATOMIC_CAS_VAL8(slot->slot_state, old_state, new_state); /* * We lost a race to add our size into this slot. Check the state * and try again. */ - if (cur_state != old_state) { - old_state = cur_state; + if (!WT_ATOMIC_CAS8(slot->slot_state, old_state, new_state)) { WT_STAT_FAST_CONN_INCR(session, log_slot_races); goto join_slot; } @@ -159,9 +168,9 @@ join_slot: */ WT_STAT_FAST_CONN_INCR(session, log_slot_joins); if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) - F_SET(slot, SLOT_SYNC_DIR); + F_SET(slot, WT_SLOT_SYNC_DIR); if (LF_ISSET(WT_LOG_FSYNC)) - F_SET(slot, SLOT_SYNC); + F_SET(slot, WT_SLOT_SYNC); myslotp->slot = slot; myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY; return (0); @@ -193,7 +202,7 @@ retry: */ pool_i = log->pool_index; newslot = &log->slot_pool[pool_i]; - if (++log->pool_index >= SLOT_POOL) + if (++log->pool_index >= WT_SLOT_POOL) log->pool_index = 0; if (newslot->slot_state != WT_LOG_SLOT_FREE) { WT_STAT_FAST_CONN_INCR(session, log_slot_switch_fails); @@ -203,7 +212,7 @@ retry: * churn is used to change how long we pause before closing * the slot - which leads to more consolidation and less churn. */ - if (++switch_fails % SLOT_POOL == 0 && slot->slot_churn < 5) + if (++switch_fails % WT_SLOT_POOL == 0 && slot->slot_churn < 5) ++slot->slot_churn; __wt_yield(); goto retry; @@ -303,7 +312,7 @@ __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) /* * Grow the buffer if needed before returning it to the pool. */ - if (F_ISSET(slot, SLOT_BUF_GROW)) { + if (F_ISSET(slot, WT_SLOT_BUF_GROW)) { WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, slot->slot_buf.memsize); @@ -320,7 +329,7 @@ err: * We have to reset them them here because multiple threads may * change the flags when joining the slot. */ - slot->flags = SLOT_INIT_FLAGS; + slot->flags = WT_SLOT_INIT_FLAGS; slot->slot_state = WT_LOG_SLOT_FREE; return (ret); } @@ -353,28 +362,25 @@ __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize) * a separate lock if there is contention. */ __wt_spin_lock(session, &log->log_slot_lock); - for (i = 0; i < SLOT_POOL; i++) { + for (i = 0; i < WT_SLOT_POOL; i++) { slot = &log->slot_pool[i]; - /* Avoid atomic operations if they won't succeed. */ - if (slot->slot_state != WT_LOG_SLOT_FREE && - slot->slot_state != WT_LOG_SLOT_READY) - continue; + /* Don't keep growing unrelated buffers. */ if (slot->slot_buf.memsize > (10 * newsize) && - !F_ISSET(slot, SLOT_BUF_GROW)) + !F_ISSET(slot, WT_SLOT_BUF_GROW)) + continue; + + /* Avoid atomic operations if they won't succeed. */ + orig_state = slot->slot_state; + if ((orig_state != WT_LOG_SLOT_FREE && + orig_state != WT_LOG_SLOT_READY) || + !WT_ATOMIC_CAS8( + slot->slot_state, orig_state, WT_LOG_SLOT_PENDING)) continue; - orig_state = WT_ATOMIC_CAS_VAL8( - slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING); - if (orig_state != WT_LOG_SLOT_FREE) { - orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state, - WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING); - if (orig_state != WT_LOG_SLOT_READY) - continue; - } /* We have a slot - now go ahead and grow the buffer. */ old_size = slot->slot_buf.memsize; - F_CLR(slot, SLOT_BUF_GROW); + F_CLR(slot, WT_SLOT_BUF_GROW); WT_ERR(__wt_buf_grow(session, &slot->slot_buf, WT_MAX(slot->slot_buf.memsize * 2, newsize))); slot->slot_state = orig_state; diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c index d67eb33c9e8..ef1c124111f 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c @@ -17,11 +17,14 @@ __clsm_close_bulk(WT_CURSOR *cursor) { WT_CURSOR_LSM *clsm; WT_CURSOR *bulk_cursor; + WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; + uint64_t avg_chunks, total_chunks; clsm = (WT_CURSOR_LSM *)cursor; lsm_tree = clsm->lsm_tree; + chunk = lsm_tree->chunk[0]; session = (WT_SESSION_IMPL *)clsm->iface.session; /* Close the bulk cursor to ensure the chunk is written to disk. */ @@ -31,7 +34,18 @@ __clsm_close_bulk(WT_CURSOR *cursor) clsm->nchunks = 0; /* Set ondisk, and flush the metadata */ - F_SET(lsm_tree->chunk[0], WT_LSM_CHUNK_ONDISK); + F_SET(chunk, WT_LSM_CHUNK_ONDISK); + /* + * Setup a generation in our chunk based on how many chunk_size + * pieces fit into a chunk of a given generation. This allows future + * LSM merges choose reasonable sets of chunks. + */ + avg_chunks = (lsm_tree->merge_min + lsm_tree->merge_max) / 2; + for (total_chunks = chunk->size / lsm_tree->chunk_size; + total_chunks > 1; + total_chunks /= avg_chunks) + ++chunk->generation; + WT_RET(__wt_lsm_meta_write(session, lsm_tree)); ++lsm_tree->dsk_gen; @@ -49,15 +63,18 @@ __clsm_insert_bulk(WT_CURSOR *cursor) { WT_CURSOR *bulk_cursor; WT_CURSOR_LSM *clsm; + WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; clsm = (WT_CURSOR_LSM *)cursor; lsm_tree = clsm->lsm_tree; + chunk = lsm_tree->chunk[0]; session = (WT_SESSION_IMPL *)clsm->iface.session; WT_ASSERT(session, lsm_tree->nchunks == 1 && clsm->nchunks == 1); - ++lsm_tree->chunk[0]->count; + ++chunk->count; + chunk->size += cursor->key.size + cursor->value.size; bulk_cursor = *clsm->cursors; bulk_cursor->set_key(bulk_cursor, &cursor->key); bulk_cursor->set_value(bulk_cursor, &cursor->value); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index 4be0cd2a09c..0d3ce5da2d8 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -652,8 +652,10 @@ int __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, uint32_t type, uint32_t flags, WT_LSM_TREE *lsm_tree) { + WT_DECL_RET; WT_LSM_MANAGER *manager; WT_LSM_WORK_UNIT *entry; + int pushed; manager = &S2C(session)->lsm_manager; @@ -672,13 +674,27 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, break; } - WT_RET(__wt_epoch(session, &lsm_tree->work_push_ts)); + /* + * Don't allow any work units unless a tree is active, this avoids + * races on shutdown between clearing out queues and pushing new + * work units. + * + * Increment the queue reference before checking the flag since + * on close, the flag is cleared and then the queue reference count + * is checked. + */ + (void)WT_ATOMIC_ADD4(lsm_tree->queue_ref, 1); + if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) { + (void)WT_ATOMIC_SUB4(lsm_tree->queue_ref, 1); + return (0); + } - WT_RET(__wt_calloc_one(session, &entry)); + pushed = 0; + WT_ERR(__wt_epoch(session, &lsm_tree->work_push_ts)); + WT_ERR(__wt_calloc_one(session, &entry)); entry->type = type; entry->flags = flags; entry->lsm_tree = lsm_tree; - (void)WT_ATOMIC_ADD4(lsm_tree->queue_ref, 1); WT_STAT_FAST_CONN_INCR(session, lsm_work_units_created); if (type == WT_LSM_WORK_SWITCH) @@ -690,8 +706,12 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, else LSM_PUSH_ENTRY(&manager->appqh, &manager->app_lock, lsm_work_queue_app); + pushed = 1; - WT_RET(__wt_cond_signal(session, manager->work_cond)); - + WT_ERR(__wt_cond_signal(session, manager->work_cond)); return (0); +err: + if (!pushed) + (void)WT_ATOMIC_SUB4(lsm_tree->queue_ref, 1); + return (ret); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index b8aecfc89b6..6c6b185f821 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -27,6 +27,13 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int final) WT_UNUSED(final); /* Only used in diagnostic builds */ + /* + * The work unit queue should be empty, but it's worth checking + * since work units use a different locking scheme to regular tree + * operations. + */ + WT_ASSERT(session, lsm_tree->queue_ref == 0); + /* We may be destroying an lsm_tree before it was added. */ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) { WT_ASSERT(session, final || @@ -1223,13 +1230,13 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) "LSM compaction requires active merge threads"); /* - * We are done if there is a single chunk in the tree and we have - * already created a bloom filter for it or we are configured not to. + * There is no work to do if there is only a single chunk in the tree + * and it has a bloom filter or is configured to never have a bloom + * filter. */ if (lsm_tree->nchunks == 1 && - ((FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) && - F_ISSET(lsm_tree->chunk[0], WT_LSM_CHUNK_BLOOM)) || - !FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))) { + (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) || + F_ISSET(lsm_tree->chunk[0], WT_LSM_CHUNK_BLOOM))) { __wt_lsm_tree_release(session, lsm_tree); return (0); } diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c index 79a555d0922..a44e647ff08 100644 --- a/src/third_party/wiredtiger/src/meta/meta_turtle.c +++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c @@ -175,10 +175,10 @@ __wt_turtle_init(WT_SESSION_IMPL *session) * creation doesn't fully complete, we won't have a turtle file and we * will repeat the process until we succeed. * - * Incremental backups can occur only run recovery is run and it becomes - * live. So if there is a turtle file and an incremental backup file - * that is an error. Otherwise, if there's already a turtle file, - * we're done. + * Incremental backups can occur only if recovery is run and it becomes + * live. So, if there is a turtle file and an incremental backup file, + * that is an error. Otherwise, if there's already a turtle file, we're + * done. */ WT_RET(__wt_exist(session, WT_INCREMENTAL_BACKUP, &exist_incr)); WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist)); @@ -187,23 +187,23 @@ __wt_turtle_init(WT_SESSION_IMPL *session) WT_RET_MSG(session, EINVAL, "Incremental backup after running recovery " "is not allowed."); - return (0); - } - if (exist_incr) - F_SET(S2C(session), WT_CONN_WAS_BACKUP); + } else { + if (exist_incr) + F_SET(S2C(session), WT_CONN_WAS_BACKUP); - /* Create the metadata file. */ - WT_RET(__metadata_init(session)); + /* Create the metadata file. */ + WT_RET(__metadata_init(session)); - /* Load any hot-backup information. */ - WT_RET(__metadata_load_hot_backup(session)); + /* Load any hot-backup information. */ + WT_RET(__metadata_load_hot_backup(session)); - /* Create any bulk-loaded file stubs. */ - WT_RET(__metadata_load_bulk(session)); + /* Create any bulk-loaded file stubs. */ + WT_RET(__metadata_load_bulk(session)); - /* Create the turtle file. */ - WT_RET(__metadata_config(session, &metaconf)); - WT_ERR(__wt_turtle_update(session, WT_METAFILE_URI, metaconf)); + /* Create the turtle file. */ + WT_RET(__metadata_config(session, &metaconf)); + WT_ERR(__wt_turtle_update(session, WT_METAFILE_URI, metaconf)); + } /* Remove the backup files, we'll never read them again. */ WT_ERR(__wt_backup_file_remove(session)); diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c index 3a3b0e0d74f..19183ed9030 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c @@ -82,7 +82,7 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) writers = l->s.writers; old = (pad << 48) + (users << 32) + (users << 16) + writers; new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers; - return (WT_ATOMIC_CAS_VAL8(l->u, old, new) == old ? 0 : EBUSY); + return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY); } /* @@ -163,7 +163,7 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) users = l->s.users; old = (pad << 48) + (users << 32) + (readers << 16) + users; new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users; - return (WT_ATOMIC_CAS_VAL8(l->u, old, new) == old ? 0 : EBUSY); + return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY); } /* diff --git a/src/third_party/wiredtiger/src/os_posix/os_open.c b/src/third_party/wiredtiger/src/os_posix/os_open.c index 2116efc6ea8..7a4f5fdb38d 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_open.c +++ b/src/third_party/wiredtiger/src/os_posix/os_open.c @@ -16,16 +16,9 @@ static int __open_directory(WT_SESSION_IMPL *session, char *path, int *fd) { WT_DECL_RET; - char *dir; - if ((dir = strrchr(path, '/')) == NULL) - path = (char *)"."; - else - *dir = '\0'; WT_SYSCALL_RETRY(((*fd = open(path, O_RDONLY, 0444)) == -1 ? 1 : 0), ret); - if (dir != NULL) - *dir = '/'; if (ret != 0) WT_RET_MSG(session, ret, "%s: open_directory", path); return (ret); diff --git a/src/third_party/wiredtiger/src/os_posix/os_thread.c b/src/third_party/wiredtiger/src/os_posix/os_thread.c index c70a04c8df7..e4f24cdb44e 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_thread.c +++ b/src/third_party/wiredtiger/src/os_posix/os_thread.c @@ -49,12 +49,15 @@ __wt_thread_id(char *buf, size_t buflen) pthread_t self; /* - * POSIX 1003.1 allows pthread_t to be an opaque type, but on systems - * where it's a pointer, we'd rather print out the pointer and match - * gdb output. Since we don't yet run on any systems where pthread_t - * is not a pointer, do it that way for now. + * POSIX 1003.1 allows pthread_t to be an opaque type; on systems where + * it's a pointer, print the pointer to match gdb output. */ self = pthread_self(); +#ifdef __sun (void)snprintf(buf, buflen, - "%" PRIu64 ":%p", (uint64_t)getpid(), (void *)self); + "%" PRIuMAX ":%u", (uintmax_t)getpid(), self); +#else + (void)snprintf(buf, buflen, + "%" PRIuMAX ":%p", (uintmax_t)getpid(), (void *)self); +#endif } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_track.c b/src/third_party/wiredtiger/src/reconcile/rec_track.c index e417dbfaa83..36e85713421 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_track.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_track.c @@ -711,7 +711,7 @@ __ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) * visibility check could give different results as the global ID moves * forward. */ - oldest_txn = S2C(session)->txn_global.oldest_id; + oldest_txn = __wt_txn_oldest_id(session); /* * Discard any transaction-cache records with transaction IDs earlier diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c index eb706f5ff27..bd4f2012480 100644 --- a/src/third_party/wiredtiger/src/schema/schema_open.c +++ b/src/third_party/wiredtiger/src/schema/schema_open.c @@ -155,7 +155,8 @@ __open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx) } WT_ERR(__wt_extractor_config( - session, idx->config, &idx->extractor, &idx->extractor_owned)); + session, idx->name, idx->config, &idx->extractor, + &idx->extractor_owned)); WT_ERR(__wt_config_getones(session, idx->config, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &idx->key_format)); diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 7951b3ff50d..7e331c530fd 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -9,6 +9,7 @@ #include "wt_internal.h" static int __session_checkpoint(WT_SESSION *, const char *); +static int __session_snapshot(WT_SESSION *, const char *); static int __session_rollback_transaction(WT_SESSION *, const char *); /* @@ -74,6 +75,7 @@ __session_clear(WT_SESSION_IMPL *session) memset(session, 0, WT_SESSION_CLEAR_SIZE(session)); session->hazard_size = 0; session->nhazard = 0; + WT_INIT_LSN(&session->bg_sync_lsn); } /* @@ -850,6 +852,89 @@ err: API_END_RET(session, ret); } /* + * __session_transaction_sync -- + * WT_SESSION->transaction_sync method. + */ +static int +__session_transaction_sync(WT_SESSION *wt_session, const char *config) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_SESSION_IMPL *session; + WT_TXN *txn; + struct timespec now, start; + uint64_t timeout_ms, waited_ms; + int forever; + + session = (WT_SESSION_IMPL *)wt_session; + conn = S2C(session); + txn = &session->txn; + if (F_ISSET(txn, WT_TXN_RUNNING)) + WT_RET_MSG(session, EINVAL, "transaction in progress"); + + /* + * If logging is not enabled there is nothing to do. + */ + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + return (0); + SESSION_API_CALL(session, transaction_sync, config, cfg); + WT_STAT_FAST_CONN_INCR(session, txn_sync); + + log = conn->log; + ret = 0; + timeout_ms = waited_ms = 0; + forever = 1; + + /* + * If there is no background sync LSN in this session, there + * is nothing to do. + */ + if (WT_IS_INIT_LSN(&session->bg_sync_lsn)) + goto err; + + /* + * If our LSN is smaller than the current sync LSN then our + * transaction is stable. We're done. + */ + if (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) <= 0) + goto err; + + /* + * Our LSN is not yet stable. Wait and check again depending on the + * timeout. + */ + WT_ERR(__wt_config_gets_def( + session, cfg, "timeout_ms", (int)UINT_MAX, &cval)); + if ((unsigned int)cval.len != UINT_MAX) { + timeout_ms = (uint64_t)cval.val; + forever = 0; + } + + if (timeout_ms == 0) + WT_ERR(ETIMEDOUT); + + WT_ERR(__wt_epoch(session, &start)); + /* + * Keep checking the LSNs until we find it is stable or we reach + * our timeout. + */ + while (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) > 0) { + WT_ERR(__wt_cond_signal(session, conn->log_file_cond)); + WT_ERR(__wt_epoch(session, &now)); + waited_ms = WT_TIMEDIFF(now, start) / WT_MILLION; + if (forever || waited_ms < timeout_ms) + WT_ERR(__wt_cond_wait( + session, log->log_sync_cond, waited_ms)); + else + WT_ERR(ETIMEDOUT); + } + +err: API_END_RET(session, ret); +} + +/* * __session_checkpoint -- * WT_SESSION->checkpoint method. */ @@ -920,6 +1005,42 @@ err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK); } /* + * __session_snapshot -- + * WT_SESSION->snapshot method. + */ +static int +__session_snapshot(WT_SESSION *wt_session, const char *config) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + WT_TXN_GLOBAL *txn_global; + int has_create, has_drop; + + has_create = has_drop = 0; + session = (WT_SESSION_IMPL *)wt_session; + txn_global = &S2C(session)->txn_global; + + SESSION_API_CALL(session, snapshot, config, cfg); + + WT_ERR(__wt_txn_named_snapshot_config( + session, cfg, &has_create, &has_drop)); + + WT_ERR(__wt_writelock(session, txn_global->nsnap_rwlock)); + + /* Drop any snapshots to be removed first. */ + if (has_drop) + WT_ERR(__wt_txn_named_snapshot_drop(session, cfg)); + + /* Start the named snapshot if requested. */ + if (has_create) + WT_ERR(__wt_txn_named_snapshot_begin(session, cfg)); + +err: WT_TRET(__wt_writeunlock(session, txn_global->nsnap_rwlock)); + + API_END_RET_NOTFOUND_MAP(session, ret); +} + +/* * __session_strerror -- * WT_SESSION->strerror method. */ @@ -997,7 +1118,9 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, __session_commit_transaction, __session_rollback_transaction, __session_checkpoint, - __session_transaction_pinned_range + __session_snapshot, + __session_transaction_pinned_range, + __session_transaction_sync }; WT_DECL_RET; WT_SESSION_IMPL *session, *session_ret; diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c index 720f40e8d11..6648dd5761f 100644 --- a/src/third_party/wiredtiger/src/session/session_dhandle.c +++ b/src/third_party/wiredtiger/src/session/session_dhandle.c @@ -114,9 +114,8 @@ __wt_session_lock_dhandle( WT_RET(__wt_readlock(session, dhandle->rwlock)); if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { *is_deadp = 1; - WT_RET( + return ( __wt_readunlock(session, dhandle->rwlock)); - return (0); } is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN) ? 1 : 0; @@ -135,9 +134,8 @@ __wt_session_lock_dhandle( if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) { if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { *is_deadp = 1; - WT_RET( + return ( __wt_writeunlock(session, dhandle->rwlock)); - return (0); } /* diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index c30d4b2f029..44c2daa3802 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -549,6 +549,7 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) "transaction: transaction range of IDs currently pinned"; stats->txn_pinned_checkpoint_range.desc = "transaction: transaction range of IDs currently pinned by a checkpoint"; + stats->txn_sync.desc = "transaction: transaction sync calls"; stats->txn_commit.desc = "transaction: transactions committed"; stats->txn_rollback.desc = "transaction: transactions rolled back"; } @@ -669,6 +670,7 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->txn_begin.v = 0; stats->txn_checkpoint.v = 0; stats->txn_fail_cache.v = 0; + stats->txn_sync.v = 0; stats->txn_commit.v = 0; stats->txn_rollback.v = 0; } diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 0492e39342f..63ef4db5092 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -258,6 +258,11 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) if (WT_TXNID_LT(snap_min, oldest_id)) oldest_id = snap_min; + /* The oldest ID can't move past any named snapshots. */ + if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && + WT_TXNID_LT(id, oldest_id)) + oldest_id = id; + /* Update the last running ID. */ if (WT_TXNID_LT(txn_global->last_running, snap_min)) { txn_global->last_running = snap_min; @@ -329,15 +334,30 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) * The default sync setting is inherited from the connection, but can * be overridden by an explicit "sync" setting for this transaction. * - * !!! This is an unusual use of the config code: the "default" value - * we pass in is inherited from the connection. If flush is not set in - * the connection-wide flag and not overridden here, we end up clearing - * all flags. + * We want to distinguish between inheriting implicitly and explicitly. */ - WT_RET(__wt_config_gets_def(session, cfg, "sync", - FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH) ? 1 : 0, &cval)); - if (!cval.val) - txn->txn_logsync = 0; + F_CLR(txn, WT_TXN_SYNC_SET); + WT_RET(__wt_config_gets_def( + session, cfg, "sync", (int)UINT_MAX, &cval)); + if (cval.val == 0 || cval.val == 1) + /* + * This is an explicit setting of sync. Set the flag so + * that we know not to overwrite it in commit_transaction. + */ + F_SET(txn, WT_TXN_SYNC_SET); + + if (cval.val == 0) + FLD_CLR(txn->txn_logsync, WT_LOG_FLUSH); + + WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval)); + if (cval.len > 0) + /* + * The layering here isn't ideal - the named snapshot get + * function does both validation and setup. Otherwise we'd + * need to walk the list of named snapshots twice during + * transaction open. + */ + WT_RET(__wt_txn_named_snapshot_get(session, &cval)); return (0); } @@ -381,7 +401,8 @@ __wt_txn_release(WT_SESSION_IMPL *session) */ __wt_txn_release_snapshot(session); txn->isolation = session->isolation; - F_CLR(txn, WT_TXN_ERROR | WT_TXN_HAS_ID | WT_TXN_RUNNING); + /* Ensure the transaction flags are cleared on exit */ + txn->flags = 0; } /* @@ -391,17 +412,60 @@ __wt_txn_release(WT_SESSION_IMPL *session) int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) { + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN *txn; WT_TXN_OP *op; u_int i; txn = &session->txn; + conn = S2C(session); WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR)); if (!F_ISSET(txn, WT_TXN_RUNNING)) WT_RET_MSG(session, EINVAL, "No transaction is active"); + /* + * The default sync setting is inherited from the connection, but can + * be overridden by an explicit "sync" setting for this transaction. + */ + WT_RET(__wt_config_gets_def(session, cfg, "sync", 0, &cval)); + + /* + * If the user chose the default setting, check whether sync is enabled + * for this transaction (either inherited or via begin_transaction). + * If sync is disabled, clear the field to avoid the log write being + * flushed. + * + * Otherwise check for specific settings. We don't need to check for + * "on" because that is the default inherited from the connection. If + * the user set anything in begin_transaction, we only override with an + * explicit setting. + */ + if (cval.len == 0) { + if (!FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH) && + !F_ISSET(txn, WT_TXN_SYNC_SET)) + txn->txn_logsync = 0; + } else { + /* + * If the caller already set sync on begin_transaction then + * they should not be using sync on commit_transaction. + * Flag that as an error. + */ + if (F_ISSET(txn, WT_TXN_SYNC_SET)) + WT_RET_MSG(session, EINVAL, + "Sync already set during begin_transaction."); + if (WT_STRING_MATCH("background", cval.str, cval.len)) + txn->txn_logsync = WT_LOG_BACKGROUND; + else if (WT_STRING_MATCH("off", cval.str, cval.len)) + txn->txn_logsync = 0; + /* + * We don't need to check for "on" here because that is the + * default to inherit from the connection setting. + */ + } + /* Commit notification. */ if (txn->notify != NULL) WT_TRET(txn->notify->notify(txn->notify, @@ -409,7 +473,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) /* If we are logging, write a commit log record. */ if (ret == 0 && txn->mod_count > 0 && - FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) && + FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && !F_ISSET(session, WT_SESSION_NO_LOGGING)) { /* * We are about to block on I/O writing the log. @@ -596,8 +660,14 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) txn_global->current = txn_global->last_running = txn_global->oldest_id = WT_TXN_FIRST; + WT_RET(__wt_rwlock_alloc(session, + &txn_global->nsnap_rwlock, "named snapshot lock")); + txn_global->nsnap_oldest_id = WT_TXN_NONE; + STAILQ_INIT(&txn_global->nsnaph); + WT_RET(__wt_calloc_def( session, conn->session_size, &txn_global->states)); + for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++) s->id = s->snap_min = WT_TXN_NONE; @@ -608,15 +678,21 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) * __wt_txn_global_destroy -- * Destroy the global transaction state. */ -void +int __wt_txn_global_destroy(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_TXN_GLOBAL *txn_global; conn = S2C(session); txn_global = &conn->txn_global; - if (txn_global != NULL) - __wt_free(session, txn_global->states); + if (txn_global == NULL) + return (0); + + WT_TRET(__wt_rwlock_destroy(session, &txn_global->nsnap_rwlock)); + __wt_free(session, txn_global->states); + + return (ret); } diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 60c427c56b6..d8032b49b17 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -352,14 +352,14 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; void *saved_meta_next; - int full, logging, tracking; + int full, idle, logging, tracking; u_int i; conn = S2C(session); txn_global = &conn->txn_global; saved_isolation = session->isolation; txn = &session->txn; - full = logging = tracking = 0; + full = idle = logging = tracking = 0; /* Ensure the metadata table is open before taking any locks. */ WT_RET(__wt_metadata_open(session)); @@ -499,6 +499,10 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_txn_commit(session, NULL)); /* + * If any tree was dirty, we will have updated the metadata with the + * new checkpoint information. If the metadata is clean, all other + * trees must have been clean. + * * Disable metadata tracking during the metadata checkpoint. * * We don't lock old checkpoints in the metadata file: there is no way @@ -553,11 +557,19 @@ err: /* txn_global->checkpoint_id = WT_TXN_NONE; txn_global->checkpoint_snap_min = WT_TXN_NONE; - /* Tell logging that we have finished a database checkpoint. */ - if (logging) + /* + * Tell logging that we have finished a database checkpoint. Do not + * write a log record if the database was idle. + */ + if (logging) { + if (ret == 0 && full && + F_ISSET((WT_BTREE *)session->meta_dhandle->handle, + WT_BTREE_SKIP_CKPT)) + idle = 1; WT_TRET(__wt_txn_checkpoint_log(session, full, - (ret == 0) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_FAIL, - NULL)); + (ret == 0 && !idle) ? + WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL)); + } for (i = 0; i < session->ckpt_handle_next; ++i) { if (session->ckpt_handle[i].dhandle == NULL) { @@ -577,7 +589,6 @@ err: /* } session->isolation = txn->isolation = saved_isolation; - return (ret); } @@ -788,6 +799,7 @@ __checkpoint_worker( * means it must exist. */ force = 0; + F_CLR(btree, WT_BTREE_SKIP_CKPT); if (!btree->modified && cfg != NULL) { ret = __wt_config_gets(session, cfg, "force", &cval); if (ret != 0 && ret != WT_NOTFOUND) @@ -796,8 +808,10 @@ __checkpoint_worker( force = 1; } if (!btree->modified && !force) { - if (!is_checkpoint) + if (!is_checkpoint) { + F_SET(btree, WT_BTREE_SKIP_CKPT); goto done; + } deleted = 0; WT_CKPT_FOREACH(ckptbase, ckpt) @@ -815,8 +829,10 @@ __checkpoint_worker( (strcmp(name, (ckpt - 1)->name) == 0 || (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) && - deleted < 2) + deleted < 2) { + F_SET(btree, WT_BTREE_SKIP_CKPT); goto done; + } } /* Add a new checkpoint entry at the end of the list. */ diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index 9b6a3d46014..63e4c50aff5 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -70,21 +70,21 @@ __txn_commit_printlog( int firstrecord; firstrecord = 1; - fprintf(out, " \"ops\": [\n"); + WT_RET(__wt_fprintf(out, " \"ops\": [\n")); /* The logging subsystem zero-pads records. */ while (*pp < end && **pp) { if (!firstrecord) - fprintf(out, ",\n"); - fprintf(out, " {"); + WT_RET(__wt_fprintf(out, ",\n")); + WT_RET(__wt_fprintf(out, " {")); firstrecord = 0; WT_RET(__wt_txn_op_printlog(session, pp, end, out)); - fprintf(out, "\n }"); + WT_RET(__wt_fprintf(out, "\n }")); } - fprintf(out, "\n ]\n"); + WT_RET(__wt_fprintf(out, "\n ]\n")); return (0); } @@ -155,12 +155,12 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_TXN *txn; WT_TXN_OP *op; + txn = &session->txn; + if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) || F_ISSET(session, WT_SESSION_NO_LOGGING)) return (0); - txn = &session->txn; - /* We'd better have a transaction. */ WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING) && F_ISSET(txn, WT_TXN_HAS_ID)); @@ -303,6 +303,12 @@ __wt_txn_checkpoint_log( case WT_TXN_LOG_CKPT_PREPARE: txn->full_ckpt = 1; *ckpt_lsn = S2C(session)->log->write_start_lsn; + /* + * We need to make sure that the log records in the checkpoint + * LSN are on disk. In particular to make sure that the + * current log file exists. + */ + WT_ERR(__wt_log_force_sync(session, ckpt_lsn)); break; case WT_TXN_LOG_CKPT_START: @@ -355,7 +361,7 @@ __wt_txn_checkpoint_log( WT_ERR(__wt_log_ckpt(session, ckpt_lsn)); /* FALLTHROUGH */ - case WT_TXN_LOG_CKPT_FAIL: + case WT_TXN_LOG_CKPT_CLEANUP: /* Cleanup any allocated resources */ WT_INIT_LSN(ckpt_lsn); txn->ckpt_nsnapshot = 0; @@ -456,7 +462,7 @@ __txn_printlog(WT_SESSION_IMPL *session, WT_UNUSED(next_lsnp); out = cookie; - p = LOG_SKIP_HEADER(rawrec->data); + p = WT_LOG_SKIP_HEADER(rawrec->data); end = (const uint8_t *)rawrec->data + rawrec->size; logrec = (WT_LOG_RECORD *)rawrec->data; compressed = F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED); @@ -465,57 +471,56 @@ __txn_printlog(WT_SESSION_IMPL *session, WT_RET(__wt_logrec_read(session, &p, end, &rectype)); if (!firstrecord) - fprintf(out, ",\n"); - - if (fprintf(out, " { \"lsn\" : [%" PRIu32 ",%" PRId64 "],\n", - lsnp->file, lsnp->offset) < 0 || - fprintf(out, " \"hdr_flags\" : \"%s\",\n", - compressed ? "compressed" : "") < 0 || - fprintf(out, " \"rec_len\" : %" PRIu32 ",\n", logrec->len) < 0 || - fprintf(out, " \"mem_len\" : %" PRIu32 ",\n", - compressed ? logrec->mem_len : logrec->len) < 0) - return (errno); + WT_RET(__wt_fprintf(out, ",\n")); + + WT_RET(__wt_fprintf(out, + " { \"lsn\" : [%" PRIu32 ",%" PRId64 "],\n", + lsnp->file, lsnp->offset)); + WT_RET(__wt_fprintf(out, + " \"hdr_flags\" : \"%s\",\n", compressed ? "compressed" : "")); + WT_RET(__wt_fprintf(out, + " \"rec_len\" : %" PRIu32 ",\n", logrec->len)); + WT_RET(__wt_fprintf(out, + " \"mem_len\" : %" PRIu32 ",\n", + compressed ? logrec->mem_len : logrec->len)); switch (rectype) { case WT_LOGREC_CHECKPOINT: WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(IQ), &ckpt_lsn.file, &ckpt_lsn.offset)); - if (fprintf(out, " \"type\" : \"checkpoint\",\n") < 0 || - fprintf( - out, " \"ckpt_lsn\" : [%" PRIu32 ",%" PRId64 "]\n", - ckpt_lsn.file, ckpt_lsn.offset) < 0) - return (errno); + WT_RET(__wt_fprintf(out, " \"type\" : \"checkpoint\",\n")); + WT_RET(__wt_fprintf(out, + " \"ckpt_lsn\" : [%" PRIu32 ",%" PRId64 "]\n", + ckpt_lsn.file, ckpt_lsn.offset)); break; case WT_LOGREC_COMMIT: WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid)); - if (fprintf(out, " \"type\" : \"commit\",\n") < 0 || - fprintf(out, " \"txnid\" : %" PRIu64 ",\n", txnid) < 0) - return (errno); + WT_RET(__wt_fprintf(out, " \"type\" : \"commit\",\n")); + WT_RET(__wt_fprintf(out, + " \"txnid\" : %" PRIu64 ",\n", txnid)); WT_RET(__txn_commit_printlog(session, &p, end, out)); break; case WT_LOGREC_FILE_SYNC: WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(Ii), &fileid, &start)); - if (fprintf(out, " \"type\" : \"file_sync\",\n") < 0 || - fprintf(out, " \"fileid\" : %" PRIu32 ",\n", - fileid) < 0 || - fprintf(out, " \"start\" : %" PRId32 "\n", start) < 0) - return (errno); + WT_RET(__wt_fprintf(out, " \"type\" : \"file_sync\",\n")); + WT_RET(__wt_fprintf(out, + " \"fileid\" : %" PRIu32 ",\n", fileid)); + WT_RET(__wt_fprintf(out, + " \"start\" : %" PRId32 "\n", start)); break; case WT_LOGREC_MESSAGE: WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(S), &msg)); - if (fprintf(out, " \"type\" : \"message\",\n") < 0 || - fprintf(out, " \"message\" : \"%s\"\n", msg) < 0) - return (errno); + WT_RET(__wt_fprintf(out, " \"type\" : \"message\",\n")); + WT_RET(__wt_fprintf(out, " \"message\" : \"%s\"\n", msg)); break; } - if (fprintf(out, " }") < 0) - return (errno); + WT_RET(__wt_fprintf(out, " }")); return (0); } @@ -531,12 +536,10 @@ __wt_txn_printlog(WT_SESSION *wt_session, FILE *out) session = (WT_SESSION_IMPL *)wt_session; - if (fprintf(out, "[\n") < 0) - return (errno); + WT_RET(__wt_fprintf(out, "[\n")); WT_RET(__wt_log_scan( session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out)); - if (fprintf(out, "\n]\n") < 0) - return (errno); + WT_RET(__wt_fprintf(out, "\n]\n")); return (0); } diff --git a/src/third_party/wiredtiger/src/txn/txn_nsnap.c b/src/third_party/wiredtiger/src/txn/txn_nsnap.c new file mode 100644 index 00000000000..7997d707de5 --- /dev/null +++ b/src/third_party/wiredtiger/src/txn/txn_nsnap.c @@ -0,0 +1,369 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __nsnap_destroy -- + * Destroy a named snapshot structure. + */ +static void +__nsnap_destroy(WT_SESSION_IMPL *session, WT_NAMED_SNAPSHOT *nsnap) +{ + __wt_free(session, nsnap->name); + __wt_free(session, nsnap->snapshot); + __wt_free(session, nsnap); +} + +/* + * __nsnap_drop_one -- + * Drop a single named snapshot. The named snapshot lock must be held + * write locked. + */ +static int +__nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name) +{ + WT_DECL_RET; + WT_NAMED_SNAPSHOT *found; + WT_TXN_GLOBAL *txn_global; + + txn_global = &S2C(session)->txn_global; + + STAILQ_FOREACH(found, &txn_global->nsnaph, q) + if (WT_STRING_MATCH(found->name, name->str, name->len)) + break; + + if (found == NULL) + return (WT_NOTFOUND); + + /* Bump the global ID if we are removing the first entry */ + if (found == STAILQ_FIRST(&txn_global->nsnaph)) + txn_global->nsnap_oldest_id = (STAILQ_NEXT(found, q) != NULL) ? + STAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE; + STAILQ_REMOVE(&txn_global->nsnaph, found, __wt_named_snapshot, q); + __nsnap_destroy(session, found); + + return (ret); +} + +/* + * __nsnap_drop_to -- + * Drop named snapshots, if the name is NULL all snapshots will be + * dropped. The named snapshot lock must be held write locked. + */ +static int +__nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, int inclusive) +{ + WT_DECL_RET; + WT_NAMED_SNAPSHOT *last, *nsnap, *prev; + WT_TXN_GLOBAL *txn_global; + uint64_t new_nsnap_oldest; + + last = nsnap = prev = NULL; + txn_global = &S2C(session)->txn_global; + + if (STAILQ_EMPTY(&txn_global->nsnaph)) { + if (name == NULL) + return (0); + /* + * Dropping specific snapshots when there aren't any it's an + * error. + */ + WT_RET_MSG(session, EINVAL, + "Named snapshot '%.*s' for drop not found", + (int)name->len, name->str); + } + + /* + * The new ID will be none if we are removing all named snapshots + * which is the default behavior of this loop. + */ + new_nsnap_oldest = WT_TXN_NONE; + if (name != NULL) { + STAILQ_FOREACH(last, &txn_global->nsnaph, q) { + if (WT_STRING_MATCH(last->name, name->str, name->len)) + break; + prev = last; + } + if (last == NULL) + WT_RET_MSG(session, EINVAL, + "Named snapshot '%.*s' for drop not found", + (int)name->len, name->str); + + if (!inclusive) { + /* We are done if a drop before points to the head */ + if (prev == 0) + return (0); + last = prev; + } + + if (STAILQ_NEXT(last, q) != NULL) + new_nsnap_oldest = STAILQ_NEXT(last, q)->snap_min; + } + + do { + nsnap = STAILQ_FIRST(&txn_global->nsnaph); + WT_ASSERT(session, nsnap != NULL); + STAILQ_REMOVE_HEAD(&txn_global->nsnaph, q); + __nsnap_destroy(session, nsnap); + /* Last will be NULL in the all case so it will never match */ + } while (nsnap != last && !STAILQ_EMPTY(&txn_global->nsnaph)); + + /* Now that the queue of named snapshots is updated, update the ID */ + txn_global->nsnap_oldest_id = new_nsnap_oldest; + + return (ret); +} + +/* + * __wt_txn_named_snapshot_begin -- + * Begin an named in-memory snapshot. + */ +int +__wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + WT_NAMED_SNAPSHOT *nsnap, *nsnap_new; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + const char *txn_cfg[] = + { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), + "isolation=snapshot", NULL }; + int started_txn; + + started_txn = 0; + nsnap_new = NULL; + txn_global = &S2C(session)->txn_global; + txn = &session->txn; + + WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval)); + WT_ASSERT(session, cval.len != 0); + + if (!F_ISSET(txn, WT_TXN_RUNNING)) { + WT_RET(__wt_txn_begin(session, txn_cfg)); + started_txn = 1; + } + F_SET(txn, WT_TXN_READONLY); + + /* Save a copy of the transaction's snapshot. */ + WT_ERR(__wt_calloc_one(session, &nsnap_new)); + nsnap = nsnap_new; + WT_ERR(__wt_strndup(session, cval.str, cval.len, &nsnap->name)); + nsnap->snap_min = txn->snap_min; + nsnap->snap_max = txn->snap_max; + if (txn->snapshot_count > 0) { + WT_ERR(__wt_calloc_def( + session, txn->snapshot_count, &nsnap->snapshot)); + memcpy(nsnap->snapshot, txn->snapshot, + txn->snapshot_count * sizeof(*nsnap->snapshot)); + } + nsnap->snapshot_count = txn->snapshot_count; + + /* Update the list. */ + + /* + * The semantic is that a new snapshot with the same name as an + * existing snapshot will replace the old one. + */ + WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval)); + + if (STAILQ_EMPTY(&txn_global->nsnaph)) + txn_global->nsnap_oldest_id = nsnap_new->snap_min; + STAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); + nsnap_new = NULL; + +err: if (started_txn) + WT_TRET(__wt_txn_rollback(session, NULL)); + else if (ret == 0) + F_SET(txn, WT_TXN_NAMED_SNAPSHOT); + + if (nsnap_new != NULL) + __nsnap_destroy(session, nsnap_new); + + return (ret); +} + +/* + * __wt_txn_named_snapshot_drop -- + * Drop named snapshots + */ +int +__wt_txn_named_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONFIG objectconf; + WT_CONFIG_ITEM all_config, k, names_config, to_config, before_config, v; + WT_DECL_RET; + + WT_RET(__wt_config_gets_def(session, cfg, "drop.all", 0, &all_config)); + WT_RET(__wt_config_gets_def( + session, cfg, "drop.names", 0, &names_config)); + WT_RET(__wt_config_gets_def(session, cfg, "drop.to", 0, &to_config)); + WT_RET(__wt_config_gets_def( + session, cfg, "drop.before", 0, &before_config)); + + if (all_config.val != 0) + WT_RET(__nsnap_drop_to(session, NULL, 1)); + else if (before_config.len != 0) + WT_RET(__nsnap_drop_to(session, &before_config, 0)); + else if (to_config.len != 0) + WT_RET(__nsnap_drop_to(session, &to_config, 1)); + + /* We are done if there are no named drops */ + + if (names_config.len != 0) { + WT_RET(__wt_config_subinit( + session, &objectconf, &names_config)); + while ((ret = __wt_config_next(&objectconf, &k, &v)) == 0) { + ret = __nsnap_drop_one(session, &k); + if (ret != 0) + WT_RET_MSG(session, EINVAL, + "Named snapshot '%.*s' for drop not found", + (int)k.len, k.str); + } + if (ret == WT_NOTFOUND) + ret = 0; + } + + return (ret); +} + +/* + * __wt_txn_named_snapshot_get -- + * Lookup a named snapshot for a transaction. + */ +int +__wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) +{ + WT_NAMED_SNAPSHOT *nsnap; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; + + txn = &session->txn; + txn_global = &S2C(session)->txn_global; + txn_state = &S2C(session)->txn_global.states[session->id]; + + txn->isolation = WT_ISO_SNAPSHOT; + if (session->ncursors > 0) + WT_RET(__wt_session_copy_values(session)); + + WT_RET(__wt_readlock(session, txn_global->nsnap_rwlock)); + STAILQ_FOREACH(nsnap, &txn_global->nsnaph, q) + if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) { + txn->snap_min = txn_state->snap_min = nsnap->snap_min; + txn->snap_max = nsnap->snap_max; + if ((txn->snapshot_count = nsnap->snapshot_count) != 0) + memcpy(txn->snapshot, nsnap->snapshot, + nsnap->snapshot_count * + sizeof(*nsnap->snapshot)); + F_SET(txn, WT_TXN_HAS_SNAPSHOT); + break; + } + WT_RET(__wt_readunlock(session, txn_global->nsnap_rwlock)); + + if (nsnap == NULL) + WT_RET_MSG(session, EINVAL, + "Named snapshot '%.*s' not found", + (int)nameval->len, nameval->str); + + /* Flag that this transaction is opened on a named snapshot */ + F_SET(txn, WT_TXN_NAMED_SNAPSHOT); + + return (0); +} + +/* + * __wt_txn_named_snapshot_config -- + * Check the configuration for a named snapshot + */ +int +__wt_txn_named_snapshot_config(WT_SESSION_IMPL *session, + const char *cfg[], int *has_create, int *has_drops) +{ + WT_CONFIG_ITEM cval; + WT_CONFIG_ITEM all_config, names_config, to_config, before_config; + WT_TXN *txn; + + txn = &session->txn; + *has_create = *has_drops = 0; + + /* Verify that the name is legal. */ + WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval)); + if (cval.len != 0) { + if (WT_STRING_MATCH("all", cval.str, cval.len)) + WT_RET_MSG(session, EINVAL, + "Can't create snapshot with reserved \"all\" name"); + + WT_RET(__wt_name_check(session, cval.str, cval.len)); + + if (F_ISSET(txn, WT_TXN_RUNNING) && + txn->isolation != WT_ISO_SNAPSHOT) + WT_RET_MSG(session, EINVAL, + "Can't create a named snapshot from a running " + "transaction that isn't snapshot isolation"); + else if (F_ISSET(txn, WT_TXN_RUNNING) && txn->mod_count != 0) + WT_RET_MSG(session, EINVAL, + "Can't create a named snapshot from a running " + "transaction that has made updates"); + *has_create = 1; + } + + /* Verify that the drop configuration is sane. */ + WT_RET(__wt_config_gets_def(session, cfg, "drop.all", 0, &all_config)); + WT_RET(__wt_config_gets_def( + session, cfg, "drop.names", 0, &names_config)); + WT_RET(__wt_config_gets_def(session, cfg, "drop.to", 0, &to_config)); + WT_RET(__wt_config_gets_def( + session, cfg, "drop.before", 0, &before_config)); + + /* Avoid more work if no drops are configured. */ + if (all_config.val != 0 || names_config.len != 0 || + before_config.len != 0 || to_config.len != 0) { + if (before_config.len != 0 && to_config.len != 0) + WT_RET_MSG(session, EINVAL, + "Illegal configuration; named snapshot drop can't " + "specify both before and to options"); + if (all_config.val != 0 && (names_config.len != 0 || + to_config.len != 0 || before_config.len != 0)) + WT_RET_MSG(session, EINVAL, + "Illegal configuration; named snapshot drop can't " + "specify all and any other options"); + *has_drops = 1; + } + + if (!*has_create && !*has_drops) + WT_RET_MSG(session, EINVAL, + "WT_SESSION::snapshot API called without any drop or " + "name option."); + + return (0); +} + +/* + * __wt_txn_named_snapshot_destroy -- + * Destroy all named snapshots on connection close + */ +int +__wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session) +{ + WT_NAMED_SNAPSHOT *nsnap; + WT_TXN_GLOBAL *txn_global; + + txn_global = &S2C(session)->txn_global; + txn_global->nsnap_oldest_id = WT_TXN_NONE; + + while (!STAILQ_EMPTY(&txn_global->nsnaph)) { + nsnap = STAILQ_FIRST(&txn_global->nsnaph); + WT_ASSERT(session, nsnap != NULL); + STAILQ_REMOVE_HEAD(&txn_global->nsnaph, q); + __nsnap_destroy(session, nsnap); + } + + return (0); +} diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index f11b585da1c..0eadcbf3b01 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -273,7 +273,7 @@ __txn_log_recover(WT_SESSION_IMPL *session, WT_UNUSED(next_lsnp); r = cookie; - p = LOG_SKIP_HEADER(logrec->data); + p = WT_LOG_SKIP_HEADER(logrec->data); end = (const uint8_t *)logrec->data + logrec->size; WT_UNUSED(firstrecord); @@ -463,8 +463,11 @@ __wt_txn_recover(WT_SESSION_IMPL *session) * there. */ r.ckpt_lsn = metafile->ckpt_lsn; - WT_ERR(__wt_log_scan(session, - &metafile->ckpt_lsn, 0, __txn_log_recover, &r)); + ret = __wt_log_scan(session, + &metafile->ckpt_lsn, 0, __txn_log_recover, &r); + if (ret == ENOENT) + ret = 0; + WT_ERR(ret); } } @@ -502,9 +505,13 @@ __wt_txn_recover(WT_SESSION_IMPL *session) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); - else - WT_ERR(__wt_log_scan(session, &r.ckpt_lsn, - WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); + else { + ret = __wt_log_scan(session, &r.ckpt_lsn, + WT_LOGSCAN_RECOVER, __txn_log_recover, &r); + if (ret == ENOENT) + ret = 0; + WT_ERR(ret); + } conn->next_file_id = r.max_fileid; |