From 3c0c55340f577b847990247dfc47063930b5c50b Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Mon, 19 Dec 2016 00:43:52 -0500 Subject: WT-3080 Python test suite: add elapsed time for tests (#3201) For tests that are non-trivial (> 0.001 seconds) show the elapsed time if our verboseness is high enough. This will increase the amount of console output. --- test/suite/wttest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/suite/wttest.py b/test/suite/wttest.py index 4d6df0bc8bd..bd6d2005cd9 100644 --- a/test/suite/wttest.py +++ b/test/suite/wttest.py @@ -287,6 +287,7 @@ class WiredTigerTestCase(unittest.TestCase): self.testsubdir = self.className() + '.' + str(self.__class__.wt_ntests) self.testdir = os.path.join(WiredTigerTestCase._parentTestdir, self.testsubdir) self.__class__.wt_ntests += 1 + self.starttime = time.time() if WiredTigerTestCase._verbose > 2: self.prhead('started in ' + self.testdir, True) # tearDown needs connections list, set it here in case the open fails. @@ -355,6 +356,9 @@ class WiredTigerTestCase(unittest.TestCase): else: self.pr('preserving directory ' + self.testdir) + elapsed = time.time() - self.starttime + if elapsed > 0.001 and WiredTigerTestCase._verbose >= 2: + print "%s: %.2f seconds" % (str(self), elapsed) if not passed and not skipped: print "ERROR in " + str(self) self.pr('FAIL') -- cgit v1.2.1 From c0bae91eff62d1545f5c38e8adf83926607e736e Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Mon, 19 Dec 2016 00:44:33 -0500 Subject: WT-3082 Python test suite: shorten default run to avoid timeouts. (#3206) Tests that have lots of scenarios will have the number of scenarios pruned in the default case. Running with --long restores the previous larger number of scenarios. 
--- test/suite/test_backup03.py | 2 +- test/suite/test_config03.py | 2 +- test/suite/test_intpack.py | 4 ++-- test/suite/test_join01.py | 2 +- test/suite/test_lsm01.py | 2 +- test/suite/test_truncate01.py | 9 +++++---- test/suite/test_truncate02.py | 3 ++- test/suite/test_txn07.py | 3 ++- 8 files changed, 15 insertions(+), 12 deletions(-) diff --git a/test/suite/test_backup03.py b/test/suite/test_backup03.py index 73d05f0b0a1..c1ed3cc9e1a 100644 --- a/test/suite/test_backup03.py +++ b/test/suite/test_backup03.py @@ -74,7 +74,7 @@ class test_backup_target(wttest.WiredTigerTestCase, suite_subprocess): ('backup_9', dict(big=3,list=[])), # Backup everything ] - scenarios = make_scenarios(list) + scenarios = make_scenarios(list, prune=3, prunelong=1000) # Create a large cache, otherwise this test runs quite slowly. conn_config = 'cache_size=1G' diff --git a/test/suite/test_config03.py b/test/suite/test_config03.py index 6699f7d2650..89038d71319 100644 --- a/test/suite/test_config03.py +++ b/test/suite/test_config03.py @@ -71,7 +71,7 @@ class test_config03(test_base03.test_base03): cache_size_scenarios, create_scenarios, error_prefix_scenarios, eviction_target_scenarios, eviction_trigger_scenarios, multiprocess_scenarios, session_max_scenarios, - transactional_scenarios, verbose_scenarios, prune=1000) + transactional_scenarios, verbose_scenarios, prune=100, prunelong=1000) #wttest.WiredTigerTestCase.printVerbose(2, 'test_config03: running ' + \ # str(len(scenarios)) + ' of ' + \ diff --git a/test/suite/test_intpack.py b/test/suite/test_intpack.py index b0cece09494..ae391e68fca 100644 --- a/test/suite/test_intpack.py +++ b/test/suite/test_intpack.py @@ -126,8 +126,8 @@ class PackTester: class test_intpack(wttest.WiredTigerTestCase): name = 'test_intpack' - # We have to be a bit verbose here with naming, as there can be problems with - # case insensitive test names:w + # We have to be a bit verbose here with naming, scenario names are + # case insensitive and must be 
unique. scenarios = make_scenarios([ ('int8_t_b', dict(formatcode='b', low=-128, high=127, nbits=8)), diff --git a/test/suite/test_join01.py b/test/suite/test_join01.py index 2c4328dc7d3..bdd86a06d4f 100644 --- a/test/suite/test_join01.py +++ b/test/suite/test_join01.py @@ -69,7 +69,7 @@ class test_join01(wttest.WiredTigerTestCase): ] scenarios = make_scenarios(type_scen, bloom0_scen, bloom1_scen, projection_scen, nested_scen, stats_scen, - order_scen) + order_scen, prune=50, prunelong=1000) # We need statistics for these tests. conn_config = 'statistics=(all)' diff --git a/test/suite/test_lsm01.py b/test/suite/test_lsm01.py index b44df4bae14..f705b09b0a4 100644 --- a/test/suite/test_lsm01.py +++ b/test/suite/test_lsm01.py @@ -57,7 +57,7 @@ class test_lsm01(wttest.WiredTigerTestCase): scenarios = wtscenario.make_scenarios( chunk_size_scenarios, merge_max_scenarios, bloom_scenarios, bloom_bit_scenarios, bloom_hash_scenarios, record_count_scenarios, - prune=500) + prune=100, prunelong=500) # Test drop of an object. def test_lsm(self): diff --git a/test/suite/test_truncate01.py b/test/suite/test_truncate01.py index 2319eeddbef..7d2b3862568 100644 --- a/test/suite/test_truncate01.py +++ b/test/suite/test_truncate01.py @@ -183,11 +183,11 @@ class test_truncate_cursor(wttest.WiredTigerTestCase): # those tests to file objects. 
types = [ ('file', dict(type='file:', valuefmt='S', - config='allocation_size=512,leaf_page_max=512')), + config='allocation_size=512,leaf_page_max=512', P=0.25)), ('file8t', dict(type='file:', valuefmt='8t', - config='allocation_size=512,leaf_page_max=512')), + config='allocation_size=512,leaf_page_max=512', P=0.25)), ('table', dict(type='table:', valuefmt='S', - config='allocation_size=512,leaf_page_max=512')), + config='allocation_size=512,leaf_page_max=512', P=0.5)), ] keyfmt = [ ('integer', dict(keyfmt='i')), @@ -203,7 +203,8 @@ class test_truncate_cursor(wttest.WiredTigerTestCase): ('big', dict(nentries=1000,skip=37)), ] - scenarios = make_scenarios(types, keyfmt, size, reopen) + scenarios = make_scenarios(types, keyfmt, size, reopen, + prune=10, prunelong=1000) # Set a cursor key. def cursorKey(self, ds, uri, key): diff --git a/test/suite/test_truncate02.py b/test/suite/test_truncate02.py index 73fed362354..729825b26d4 100644 --- a/test/suite/test_truncate02.py +++ b/test/suite/test_truncate02.py @@ -85,7 +85,8 @@ class test_truncate_fast_delete(wttest.WiredTigerTestCase): ('txn2', dict(commit=False)), ] - scenarios = make_scenarios(types, keyfmt, overflow, reads, writes, txn) + scenarios = make_scenarios(types, keyfmt, overflow, reads, writes, txn, + prune=20, prunelong=1000) # Return the number of records visible to the cursor; test both forward # and backward iteration, they are different code paths in this case. 
diff --git a/test/suite/test_txn07.py b/test/suite/test_txn07.py index f9577bad7f2..a08d68f88aa 100644 --- a/test/suite/test_txn07.py +++ b/test/suite/test_txn07.py @@ -70,7 +70,8 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess): ('none', dict(compress='')), ] - scenarios = make_scenarios(types, op1s, txn1s, compress) + scenarios = make_scenarios(types, op1s, txn1s, compress, + prune=30, prunelong=1000) # Overrides WiredTigerTestCase def setUpConnectionOpen(self, dir): self.home = dir -- cgit v1.2.1 From d7f6c43a465eb43feabddf24d6cd1860ff08c10b Mon Sep 17 00:00:00 2001 From: David Hows Date: Mon, 19 Dec 2016 17:08:03 +1100 Subject: Revert "WT-3075 Document and enforce that WiredTiger now depends on Python 2.7 (#3196)" This reverts commit 8ae0338420c8902bdaf323f66ff09d44d4e6eb0c. --- build_posix/configure.ac.in | 2 +- src/docs/build-pydoc.sh | 2 +- src/docs/testing.dox | 2 +- src/docs/upgrading.dox | 11 +++-------- 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in index b7c39b5da8b..952c9ae607d 100644 --- a/build_posix/configure.ac.in +++ b/build_posix/configure.ac.in @@ -133,7 +133,7 @@ if test "$wt_cv_enable_java" = "yes"; then fi if test "$wt_cv_enable_python" = "yes"; then - AM_PATH_PYTHON([2.7]) + AM_PATH_PYTHON([2.6]) if test -n "$with_python_prefix" ; then PYTHON_INSTALL_ARG="-d $with_python_prefix" fi diff --git a/src/docs/build-pydoc.sh b/src/docs/build-pydoc.sh index aef88fd4c97..5e6e3635be5 100755 --- a/src/docs/build-pydoc.sh +++ b/src/docs/build-pydoc.sh @@ -3,4 +3,4 @@ TOP=$DOCS/.. . 
$TOP/config.sh cd python -PYTHONPATH=../../lang/python/src:$THRIFT_HOME/lib/python2.7/site-packages pydoc -w wiredtiger +PYTHONPATH=../../lang/python/src:$THRIFT_HOME/lib/python2.6/site-packages pydoc -w wiredtiger diff --git a/src/docs/testing.dox b/src/docs/testing.dox index cf280e8f3ff..7d454d54212 100644 --- a/src/docs/testing.dox +++ b/src/docs/testing.dox @@ -27,7 +27,7 @@ The WiredTiger unit test suite includes tests that cover: The WiredTiger Python test suite is built using the WiredTiger Python API and the Python unittest functionality (the test suite requires at -least Python version 2.7). +least Python version 2.6). The WiredTiger test suite automatically runs as part of every commit into the WiredTiger GitHub source tree. diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 1e0e2eaf99a..0b0826f2646 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -1,19 +1,14 @@ /*! @page upgrading Upgrading WiredTiger applications + @section version_291 Upgrading to Version 2.9.1 -
-
WiredTiger now requires Python 2.7 at minimum
-
-The minimum version of Python supported by WiredTiger is now 2.7 up from the -previous version of 2.6. This is due to extra unit tests added in this release -that depend on 2.7. This is not due to a change in the Python API. -
+
Changes to hazard pointer configuration
The \c hazard_max parameter to ::wiredtiger_open is now ignored. Memory is allocated for hazard pointers as required by each session.
-

+
@section version_290 Upgrading to Version 2.9.0
-- cgit v1.2.1 From 84e44d4d729d0ff0c23a7dda98d9ed72b0e49fc0 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Mon, 19 Dec 2016 18:10:31 -0500 Subject: WT-2833 Add projections to wt dump utility (#3192) When dumping with a projection, the dumped metadata associated with the table is modified so that the list of columns matches the list from the projection, and the value format corresponds to projected format. We use an open cursor using the projection to obtain the value format to show, that requires a slight reordering of when cursors are opened in the dump utility. Also fix a problem in the JSON dump cursor which did not handle projections. Added tests for dump with projections, including tables and indices using the normal dump formats, and tables dumped using JSON dump format. Tables are also dumped with projections and then loaded into the modified table format to be checked. --- src/cursor/cur_index.c | 4 +- src/cursor/cur_json.c | 16 +++- src/cursor/cur_table.c | 2 +- src/include/extern.h | 2 +- src/utilities/util_dump.c | 200 +++++++++++++++++++++++++++++++++++------- test/suite/test_dump.py | 54 +++++++++++- test/suite/test_jsondump02.py | 18 ++++ test/suite/wtdataset.py | 92 ++++++++++++++++++- 8 files changed, 348 insertions(+), 40 deletions(-) diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 0ab992bc88c..4786b0524bc 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -520,8 +520,8 @@ __wt_curindex_open(WT_SESSION_IMPL *session, WT_ERR(__curindex_open_colgroups(session, cindex, cfg)); if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) - __wt_json_column_init( - cursor, table->key_format, &idx->colconf, &table->colconf); + __wt_json_column_init(cursor, uri, table->key_format, + &idx->colconf, &table->colconf); if (0) { err: WT_TRET(__curindex_close(cursor)); diff --git a/src/cursor/cur_json.c b/src/cursor/cur_json.c index a0a3ffdd974..5870d14273e 100644 --- a/src/cursor/cur_json.c +++ b/src/cursor/cur_json.c @@ -369,11 +369,11 @@ 
__wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode) * of column names. */ void -__wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, +__wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) { WT_CURSOR_JSON *json; - const char *p, *end, *beginkey; + const char *beginkey, *end, *lparen, *p; uint32_t keycnt, nkeys; json = (WT_CURSOR_JSON *)cursor->json_private; @@ -400,8 +400,16 @@ __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, keycnt++; p++; } - json->value_names.str = p; - json->value_names.len = WT_PTRDIFF(end, p); + if ((lparen = strchr(uri, '(')) != NULL) { + /* This cursor is a projection. */ + json->value_names.str = lparen; + json->value_names.len = strlen(lparen) - 1; + WT_ASSERT((WT_SESSION_IMPL *)cursor->session, + json->value_names.str[json->value_names.len] == ')'); + } else { + json->value_names.str = p; + json->value_names.len = WT_PTRDIFF(end, p); + } if (idxconf == NULL) { if (p > beginkey) p--; diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index fae7667e44f..76f7fc5865f 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -951,7 +951,7 @@ __wt_curtable_open(WT_SESSION_IMPL *session, if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) __wt_json_column_init( - cursor, table->key_format, NULL, &table->colconf); + cursor, uri, table->key_format, NULL, &table->colconf); /* * Open the colgroup cursors immediately: we're going to need them for diff --git a/src/include/extern.h b/src/include/extern.h index be042bcd6cb..4824dc93d96 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -294,7 +294,7 @@ extern int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, bool iskey, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern size_t __wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); -extern void __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, const char **tokstart, size_t *toklen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern const char *__wt_json_tokname(int toktype) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const char *format, WT_CURSOR_JSON *json, bool iskey, WT_ITEM *item) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index 7dde13ee837..651cc7acf9c 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -6,10 +6,14 @@ * See the file LICENSE for redistribution information. 
*/ +#include #include "util.h" #include "util_dump.h" -static int dump_config(WT_SESSION *, const char *, bool, bool); +#define STRING_MATCH_CONFIG(s, item) \ + (strncmp(s, (item).str, (item).len) == 0 && (s)[(item).len] == '\0') + +static int dump_config(WT_SESSION *, const char *, WT_CURSOR *, bool, bool); static int dump_json_begin(WT_SESSION *); static int dump_json_end(WT_SESSION *); static int dump_json_separator(WT_SESSION *); @@ -17,7 +21,8 @@ static int dump_json_table_end(WT_SESSION *); static int dump_prefix(WT_SESSION *, bool, bool); static int dump_record(WT_CURSOR *, bool, bool); static int dump_suffix(WT_SESSION *, bool); -static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *, bool); +static int dump_table_config( + WT_SESSION *, WT_CURSOR *, WT_CURSOR *, const char *, bool); static int dump_table_parts_config( WT_SESSION *, WT_CURSOR *, const char *, const char *, bool); static int dup_json_string(const char *, char **); @@ -32,10 +37,11 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) size_t len; int ch, i; bool hex, json, reverse; - char *checkpoint, *config, *name; + char *checkpoint, *config, *name, *p, *simplename; hex = json = reverse = false; - checkpoint = config = name = NULL; + checkpoint = config = name = simplename = NULL; + cursor = NULL; while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF) switch (ch) { case 'c': @@ -84,14 +90,12 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) if ((ret = dump_json_separator(session)) != 0) goto err; free(name); - name = NULL; + free(simplename); + name = simplename = NULL; if ((name = util_name(session, argv[i], "table")) == NULL) goto err; - if (dump_config(session, name, hex, json) != 0) - goto err; - len = checkpoint == NULL ? 
0 : strlen("checkpoint=") + strlen(checkpoint) + 1; @@ -115,10 +119,26 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) goto err; } + if ((simplename = strdup(name)) == NULL) { + ret = util_err(session, errno, NULL); + goto err; + } + if ((p = strchr(simplename, '(')) != NULL) + *p = '\0'; + if (dump_config(session, simplename, cursor, hex, json) != 0) + goto err; + if ((ret = dump_record(cursor, reverse, json)) != 0) goto err; if (json && (ret = dump_json_table_end(session)) != 0) goto err; + + ret = cursor->close(cursor); + cursor = NULL; + if (ret != 0) { + ret = util_err(session, ret, NULL); + goto err; + } } if (json && ((ret = dump_json_end(session)) != 0)) goto err; @@ -129,7 +149,11 @@ err: ret = 1; free(config); free(name); - + free(simplename); + if (cursor != NULL && (ret = cursor->close(cursor)) != 0) { + (void)util_err(session, ret, NULL); + ret = 1; + } return (ret); } @@ -138,15 +162,16 @@ err: ret = 1; * Dump the config for the uri. */ static int -dump_config(WT_SESSION *session, const char *uri, bool hex, bool json) +dump_config(WT_SESSION *session, const char *uri, WT_CURSOR *cursor, bool hex, + bool json) { - WT_CURSOR *cursor; + WT_CURSOR *mcursor; WT_DECL_RET; int tret; /* Open a metadata cursor. */ if ((ret = session->open_cursor( - session, "metadata:create", NULL, NULL, &cursor)) != 0) { + session, "metadata:create", NULL, NULL, &mcursor)) != 0) { fprintf(stderr, "%s: %s: session.open_cursor: %s\n", progname, "metadata:create", session->strerror(session, ret)); return (1); @@ -156,10 +181,11 @@ dump_config(WT_SESSION *session, const char *uri, bool hex, bool json) * want to output a header if the user entered the wrong name. This is * where we find out a table doesn't exist, use a simple error message. 
*/ - cursor->set_key(cursor, uri); - if ((ret = cursor->search(cursor)) == 0) { + mcursor->set_key(mcursor, uri); + if ((ret = mcursor->search(mcursor)) == 0) { if ((!json && dump_prefix(session, hex, json) != 0) || - dump_table_config(session, cursor, uri, json) != 0 || + dump_table_config(session, mcursor, cursor, + uri, json) != 0 || dump_suffix(session, json) != 0) ret = 1; } else if (ret == WT_NOTFOUND) @@ -167,8 +193,8 @@ dump_config(WT_SESSION *session, const char *uri, bool hex, bool json) else ret = util_err(session, ret, "%s", uri); - if ((tret = cursor->close(cursor)) != 0) { - tret = util_cerr(cursor, "close", tret); + if ((tret = mcursor->close(mcursor)) != 0) { + tret = util_cerr(mcursor, "close", tret); if (ret == 0) ret = tret; } @@ -224,17 +250,126 @@ dump_json_table_end(WT_SESSION *session) return (0); } +/* + * dump_add_config + * Add a formatted config string to an output buffer. + */ +static int +dump_add_config(WT_SESSION *session, char **bufp, size_t *leftp, + const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 4, 5))) +{ + int n; + va_list ap; + + va_start(ap, fmt); + n = vsnprintf(*bufp, *leftp, fmt, ap); + va_end(ap); + if (n < 0) + return (util_err(session, EINVAL, NULL)); + *bufp += n; + *leftp -= n; + return (0); +} + +/* + * dump_projection -- + * Create a new config containing projection information. 
+ */ +static int +dump_projection(WT_SESSION *session, const char *config, WT_CURSOR *cursor, + char **newconfigp) +{ + WT_DECL_RET; + WT_CONFIG_ITEM key, value; + WT_CONFIG_PARSER *parser; + WT_EXTENSION_API *wt_api; + size_t len, vallen; + int nkeys; + char *newconfig; + const char *keyformat, *p; + + len = strlen(config) + strlen(cursor->value_format) + + strlen(cursor->uri) + 20; + if ((newconfig = malloc(len)) == NULL) + return util_err(session, errno, NULL); + *newconfigp = newconfig; + wt_api = session->connection->get_extension_api(session->connection); + if ((ret = wt_api->config_parser_open(wt_api, session, config, + strlen(config), &parser)) != 0) + return (util_err( + session, ret, "WT_EXTENSION_API.config_parser_open")); + keyformat = cursor->key_format; + for (nkeys = 0; *keyformat; keyformat++) + if (!__wt_isdigit((u_char)*keyformat)) + nkeys++; + + /* + * Copy the configuration, replacing some fields to match the + * projection. + */ + while ((ret = parser->next(parser, &key, &value)) == 0) { + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s=", (int)key.len, key.str)); + if (STRING_MATCH_CONFIG("value_format", key)) + WT_RET(dump_add_config(session, &newconfig, &len, + "%s", cursor->value_format)); + else if (STRING_MATCH_CONFIG("columns", key)) { + /* copy names of keys */ + p = value.str; + vallen = value.len; + while (vallen > 0) { + if ((*p == ',' || *p == ')') && --nkeys == 0) + break; + p++; + vallen--; + } + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s", (int)(p - value.str), value.str)); + + /* copy names of projected values */ + p = strchr(cursor->uri, '('); + assert(p != NULL); + assert(p[strlen(p) - 1] == ')'); + p++; + if (*p != ')') + WT_RET(dump_add_config(session, &newconfig, + &len, "%s", ",")); + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s),", (int)(strlen(p) - 1), p)); + } else if (value.type == WT_CONFIG_ITEM_STRING && + value.len != 0) + WT_RET(dump_add_config(session, &newconfig, &len, + 
"\"%.*s\",", (int)value.len, value.str)); + else + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s,", (int)value.len, value.str)); + } + if (ret != WT_NOTFOUND) + return (util_err(session, ret, "WT_CONFIG_PARSER.next")); + + assert(len > 0); + if ((ret = parser->close(parser)) != 0) + return (util_err( + session, ret, "WT_CONFIG_PARSER.close")); + + return (0); +} + /* * dump_table_config -- * Dump the config for a table. */ static int dump_table_config( - WT_SESSION *session, WT_CURSOR *cursor, const char *uri, bool json) + WT_SESSION *session, WT_CURSOR *mcursor, WT_CURSOR *cursor, + const char *uri, bool json) { WT_DECL_RET; + char *proj_config; const char *name, *v; + proj_config = NULL; /* Get the table name. */ if ((name = strchr(uri, ':')) == NULL) { fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri); @@ -246,20 +381,25 @@ dump_table_config( * Dump out the config information: first, dump the uri entry itself, * it overrides all subsequent configurations. */ - cursor->set_key(cursor, uri); - if ((ret = cursor->search(cursor)) != 0) - return (util_cerr(cursor, "search", ret)); - if ((ret = cursor->get_value(cursor, &v)) != 0) - return (util_cerr(cursor, "get_value", ret)); - - WT_RET(print_config(session, uri, v, json, true)); + mcursor->set_key(mcursor, uri); + if ((ret = mcursor->search(mcursor)) != 0) + return (util_cerr(mcursor, "search", ret)); + if ((ret = mcursor->get_value(mcursor, &v)) != 0) + return (util_cerr(mcursor, "get_value", ret)); + + if (strchr(cursor->uri, '(') != NULL) { + WT_ERR(dump_projection(session, v, cursor, &proj_config)); + v = proj_config; + } + WT_ERR(print_config(session, uri, v, json, true)); - WT_RET(dump_table_parts_config( - session, cursor, name, "colgroup:", json)); - WT_RET(dump_table_parts_config( - session, cursor, name, "index:", json)); + WT_ERR(dump_table_parts_config( + session, mcursor, name, "colgroup:", json)); + WT_ERR(dump_table_parts_config( + session, mcursor, name, "index:", json)); - 
return (0); +err: free(proj_config); + return (ret); } /* diff --git a/test/suite/test_dump.py b/test/suite/test_dump.py index f6a83c32489..3127c7aef00 100644 --- a/test/suite/test_dump.py +++ b/test/suite/test_dump.py @@ -32,7 +32,7 @@ import wiredtiger, wttest from suite_subprocess import suite_subprocess from wtscenario import make_scenarios from wtdataset import SimpleDataSet, SimpleIndexDataSet, SimpleLSMDataSet, \ - ComplexDataSet, ComplexLSMDataSet + ComplexDataSet, ComplexLSMDataSet, ProjectionDataSet, ProjectionIndexDataSet # test_dump.py # Utilities: wt dump @@ -62,6 +62,10 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): ('table-simple-lsm', dict(uri='table:', dataset=SimpleLSMDataSet)), ('table-complex', dict(uri='table:', dataset=ComplexDataSet)), ('table-complex-lsm', dict(uri='table:', dataset=ComplexLSMDataSet)), + ('table-simple-proj', dict(uri='table:', + dataset=ProjectionDataSet, projection=True)), + ('table-index-proj', dict(uri='table:', + dataset=ProjectionIndexDataSet, projection=True)), ] scenarios = make_scenarios(types, keyfmt, dumpfmt) @@ -158,5 +162,53 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): pop = self.dataset(self, uri2, self.nentries, key_format=self.keyfmt) pop.check() +# test_dump_projection +# Utilities: wt dump +# Test the dump utility with projections +class test_dump_projection(wttest.WiredTigerTestCase, suite_subprocess): + dir = 'dump.dir' # Backup directory name + + name = 'test_dump' + nentries = 2500 + uri = 'table:' + + # Dump, re-load and do a content comparison. + def test_dump(self): + + # Create the object. + uri = self.uri + self.name + pop = ProjectionDataSet(self, uri, self.nentries, key_format='S') + pop.populate() + + # Check some cases with invalid projections. 
+ self.runWt(['dump', '-x', uri + '('], \ + outfilename='bad1.out', errfilename='err1.out', failure=True) + self.check_non_empty_file('err1.out') + self.runWt(['dump', '-x', uri + '(xx)'], \ + outfilename='bad2.out', errfilename='err2.out', failure=True) + self.check_non_empty_file('err2.out') + self.runWt(['dump', '-x', uri + pop.projection[:-1]], \ + outfilename='bad3.out', errfilename='err3.out', failure=True) + self.check_non_empty_file('err3.out') + + # Dump the object with a valid projection. + self.runWt(['dump', '-x', uri + pop.projection], outfilename='dump.out') + + # Re-load the object in a new home. + os.mkdir(self.dir) + self.runWt(['-h', self.dir, 'load', '-f', 'dump.out']) + + # Check the database contents. + self.runWt(['list'], outfilename='list.out') + self.runWt(['-h', self.dir, 'list'], outfilename='list.out.new') + s1 = set(open('list.out').read().split()) + s2 = set(open('list.out.new').read().split()) + self.assertEqual(not s1.symmetric_difference(s2), True) + + # Check the object's contents. + self.reopen_conn(self.dir) + pop_reload = ProjectionDataSet(self, uri, self.nentries, key_format='S') + pop_reload.check() + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_jsondump02.py b/test/suite/test_jsondump02.py index 8482851fb94..60863c4aa97 100644 --- a/test/suite/test_jsondump02.py +++ b/test/suite/test_jsondump02.py @@ -234,6 +234,24 @@ class test_jsondump02(wttest.WiredTigerTestCase, suite_subprocess): ('"ikey" : 4,\n"Skey" : "key4"', '"S1" : "val16",\n"i2" : 16,\n"S3" : "val64",\n"i4" : 64')) self.check_json(self.table_uri4, table4_json) + # This projection has 3 value fields reversed with a key at the end. 
+ table4_json_projection = ( + ('"ikey" : 1,\n"Skey" : "key1"', + '"i4" : 1,\n"S3" : "val1",\n"i2" : 1,\n"ikey" : 1'), + ('"ikey" : 2,\n"Skey" : "key2"', + '"i4" : 8,\n"S3" : "val8",\n"i2" : 4,\n"ikey" : 2'), + ('"ikey" : 3,\n"Skey" : "key3"', + '"i4" : 27,\n"S3" : "val27",\n"i2" : 9,\n"ikey" : 3'), + ('"ikey" : 4,\n"Skey" : "key4"', + '"i4" : 64,\n"S3" : "val64",\n"i2" : 16,\n"ikey" : 4')) + # bad projection URI + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.check_json(self.table_uri4 + '(i4,S3,i2,ikey', + table4_json_projection), + '/Unbalanced brackets/') + # This projection should work. + self.check_json(self.table_uri4 + '(i4,S3,i2,ikey)', + table4_json_projection) # The dump config currently is not supported for the index type. self.check_json(uri4index1, ( ('"Skey" : "key1"', diff --git a/test/suite/wtdataset.py b/test/suite/wtdataset.py index 74e07e24e93..946b97d995f 100644 --- a/test/suite/wtdataset.py +++ b/test/suite/wtdataset.py @@ -41,6 +41,7 @@ class BaseDataSet(object): self.key_format = kwargs.get('key_format', 'S') self.value_format = kwargs.get('value_format', 'S') self.config = kwargs.get('config', '') + self.projection = kwargs.get('projection', '') def create(self): self.testcase.session.create(self.uri, 'key_format=' + self.key_format @@ -103,7 +104,8 @@ class BaseDataSet(object): def check(self): self.testcase.pr('check: ' + self.uri) - cursor = self.testcase.session.open_cursor(self.uri, None) + cursor = self.testcase.session.open_cursor( + self.uri + self.projection, None, None) self.check_cursor(cursor) cursor.close() @@ -289,6 +291,94 @@ class ComplexLSMDataSet(ComplexDataSet): def is_lsm(cls): return True +class ProjectionDataSet(SimpleDataSet): + """ + ProjectionDataSet creates a table with predefined data identical to + SimpleDataSet (single key and value), but when checking it, uses + a cursor with a projection. 
+ """ + def __init__(self, testcase, uri, rows, **kwargs): + kwargs['config'] = kwargs.get('config', '') + ',columns=(k,v0)' + kwargs['projection'] = '(v0,v0,v0)' + super(ProjectionDataSet, self).__init__(testcase, uri, rows, **kwargs) + + # A value suitable for checking the value returned by a cursor. + def comparable_value(self, i): + v0 = self.value(i) + return [v0, v0, v0] + + def check_cursor(self, cursor): + i = 0 + for key, got0, got1, got2 in cursor: + i += 1 + self.testcase.assertEqual(key, self.key(i)) + if cursor.value_format == '8t' and got0 == 0: # deleted + continue + self.testcase.assertEqual([got0, got1, got2], + self.comparable_value(i)) + self.testcase.assertEqual(i, self.rows) + +class ProjectionIndexDataSet(BaseDataSet): + """ + ProjectionIndexDataSet creates a table with three values and + an index. Checks are made against a projection of the main table + and a projection of the index. + """ + def __init__(self, testcase, uri, rows, **kwargs): + self.origconfig = kwargs.get('config', '') + self.indexname = 'index:' + uri.split(":")[1] + ':index0' + kwargs['config'] = self.origconfig + ',columns=(k,v0,v1,v2)' + kwargs['value_format'] = kwargs.get('value_format', 'SiS') + kwargs['projection'] = '(v1,v2,v0)' + super(ProjectionIndexDataSet, self).__init__( + testcase, uri, rows, **kwargs) + + def value(self, i): + return ('v0:' + str(i), i*i, 'v2:' + str(i)) + + # Suitable for checking the value returned by a cursor using a projection. 
+ def comparable_value(self, i): + return [i*i, 'v2:' + str(i), 'v0:' + str(i)] + + def create(self): + super(ProjectionIndexDataSet, self).create() + self.testcase.session.create(self.indexname, 'columns=(v2,v1),' + + self.origconfig) + + def check_cursor(self, cursor): + i = 0 + for key, got0, got1, got2 in cursor: + i += 1 + self.testcase.assertEqual(key, self.key(i)) + if cursor.value_format == '8t' and got0 == 0: # deleted + continue + self.testcase.assertEqual([got0, got1, got2], + self.comparable_value(i)) + self.testcase.assertEqual(i, self.rows) + + def check_index_cursor(self, cursor): + for i in xrange(1, self.rows + 1): + k = self.key(i) + v = self.value(i) + ik = (v[2], v[1]) # The index key is (v2,v2) + expect = [v[1],k,v[2],v[0]] + self.testcase.assertEqual(expect, cursor[ik]) + + def check(self): + BaseDataSet.check(self) + + # Check values in the index. + idxcursor = self.testcase.session.open_cursor( + self.indexname + '(v1,k,v2,v0)') + self.check_index_cursor(idxcursor) + idxcursor.close() + + def index_count(self): + return 1 + + def index_name(self, i): + return self.indexname + # create a key based on a cursor as a shortcut to creating a SimpleDataSet def simple_key(cursor, i): return BaseDataSet.key_by_format(i, cursor.key_format) -- cgit v1.2.1 From 9a3d212c6f94bf8fbf6be39ed63e35a7d0424104 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Tue, 20 Dec 2016 10:42:37 +1100 Subject: WT-2771 Add a statistic to track per-btree dirty cache usage. 
(#3207) --- dist/stat_data.py | 1 + src/btree/bt_stat.c | 2 + src/include/btree.h | 1 + src/include/btree.i | 44 +++++++++++++---- src/include/stat.h | 1 + src/include/wiredtiger.in | 118 +++++++++++++++++++++++---------------------- src/support/stat.c | 4 ++ tools/wtstats/stat_data.py | 2 + 8 files changed, 105 insertions(+), 68 deletions(-) diff --git a/dist/stat_data.py b/dist/stat_data.py index 022810d5c49..c481382dafc 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -477,6 +477,7 @@ dsrc_stats = [ ########################################## # Cache and eviction statistics ########################################## + CacheStat('cache_bytes_dirty', 'tracked dirty bytes in the cache', 'no_clear,no_scale,size'), CacheStat('cache_bytes_inuse', 'bytes currently in the cache', 'no_clear,no_scale,size'), CacheStat('cache_bytes_read', 'bytes read into cache', 'size'), CacheStat('cache_bytes_write', 'bytes written from cache', 'size'), diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 06428b87f6e..f4701a858d5 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -40,6 +40,8 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue); + WT_STAT_SET(session, stats, cache_bytes_dirty, + __wt_btree_dirty_inuse(session)); WT_STAT_SET(session, stats, cache_bytes_inuse, __wt_btree_bytes_inuse(session)); diff --git a/src/include/btree.h b/src/include/btree.h index 595afc453c8..c89e3c36c20 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -131,6 +131,7 @@ struct __wt_btree { uint64_t write_gen; /* Write generation */ uint64_t bytes_inmem; /* Cache bytes in memory. */ + uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. 
*/ WT_REF *evict_ref; /* Eviction thread's location */ diff --git a/src/include/btree.i b/src/include/btree.i index 4f69c258621..fba6ee8e38a 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -70,6 +70,23 @@ __wt_btree_bytes_inuse(WT_SESSION_IMPL *session) return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_inmem)); } +/* + * __wt_btree_dirty_inuse -- + * Return the number of dirty bytes in use. + */ +static inline uint64_t +__wt_btree_dirty_inuse(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + + btree = S2BT(session); + cache = S2C(session)->cache; + + return (__wt_cache_bytes_plus_overhead(cache, + btree->bytes_dirty_intl + btree->bytes_dirty_leaf)); +} + /* * __wt_btree_dirty_leaf_inuse -- * Return the number of bytes in use by dirty leaf pages. @@ -105,11 +122,12 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) (void)__wt_atomic_addsize(&page->memory_footprint, size); if (__wt_page_is_modified(page)) { (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size); - if (WT_PAGE_IS_INTERNAL(page)) + if (WT_PAGE_IS_INTERNAL(page)) { + (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); - else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { - (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); + } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); + (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } } /* Track internal size in cache. 
*/ @@ -238,10 +256,12 @@ __wt_cache_page_byte_dirty_decr( if (i == 5) return; - if (WT_PAGE_IS_INTERNAL(page)) + if (WT_PAGE_IS_INTERNAL(page)) { + __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_intl, + decr, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl, decr, "WT_CACHE.bytes_dirty_intl"); - else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { + } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf, decr, "WT_BTREE.bytes_dirty_leaf"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_leaf, @@ -297,6 +317,7 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) */ size = page->memory_footprint; if (WT_PAGE_IS_INTERNAL(page)) { + (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->pages_dirty_intl, 1); } else { @@ -392,17 +413,20 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) /* Update the cache's dirty-byte count. 
*/ if (modify != NULL && modify->bytes_dirty != 0) { - if (WT_PAGE_IS_INTERNAL(page)) + if (WT_PAGE_IS_INTERNAL(page)) { + __wt_cache_decr_zero_uint64(session, + &btree->bytes_dirty_intl, + modify->bytes_dirty, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_zero_uint64(session, &cache->bytes_dirty_intl, modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl"); - else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { - __wt_cache_decr_zero_uint64(session, - &cache->bytes_dirty_leaf, - modify->bytes_dirty, "WT_CACHE.bytes_dirty_leaf"); + } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { __wt_cache_decr_zero_uint64(session, &btree->bytes_dirty_leaf, modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf"); + __wt_cache_decr_zero_uint64(session, + &cache->bytes_dirty_leaf, + modify->bytes_dirty, "WT_CACHE.bytes_dirty_leaf"); } } diff --git a/src/include/stat.h b/src/include/stat.h index 0daab83e166..3dcdf68b8d5 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -564,6 +564,7 @@ struct __wt_dsrc_stats { int64_t cache_pages_requested; int64_t cache_write; int64_t cache_write_restore; + int64_t cache_bytes_dirty; int64_t cache_eviction_clean; int64_t cache_state_gen_avg_gap; int64_t cache_state_avg_written_size; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index a6deed7e14e..f9e232e0310 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -4978,181 +4978,183 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CACHE_WRITE 2059 /*! cache: pages written requiring in-memory restoration */ #define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2060 +/*! cache: tracked dirty bytes in the cache */ +#define WT_STAT_DSRC_CACHE_BYTES_DIRTY 2061 /*! cache: unmodified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2061 +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2062 /*! 
* cache_walk: Average difference between current eviction generation * when the page was last considered, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_AVG_GAP 2062 +#define WT_STAT_DSRC_CACHE_STATE_GEN_AVG_GAP 2063 /*! * cache_walk: Average on-disk page image size seen, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_AVG_WRITTEN_SIZE 2063 +#define WT_STAT_DSRC_CACHE_STATE_AVG_WRITTEN_SIZE 2064 /*! * cache_walk: Clean pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_CLEAN 2064 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_CLEAN 2065 /*! * cache_walk: Current eviction generation, only reported if cache_walk * or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_CURRENT 2065 +#define WT_STAT_DSRC_CACHE_STATE_GEN_CURRENT 2066 /*! * cache_walk: Dirty pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_DIRTY 2066 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_DIRTY 2067 /*! * cache_walk: Entries in the root page, only reported if cache_walk or * all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_ROOT_ENTRIES 2067 +#define WT_STAT_DSRC_CACHE_STATE_ROOT_ENTRIES 2068 /*! * cache_walk: Internal pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_INTERNAL 2068 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_INTERNAL 2069 /*! * cache_walk: Leaf pages currently in cache, only reported if cache_walk * or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_LEAF 2069 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_LEAF 2070 /*! 
* cache_walk: Maximum difference between current eviction generation * when the page was last considered, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_MAX_GAP 2070 +#define WT_STAT_DSRC_CACHE_STATE_GEN_MAX_GAP 2071 /*! * cache_walk: Maximum page size seen, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MAX_PAGESIZE 2071 +#define WT_STAT_DSRC_CACHE_STATE_MAX_PAGESIZE 2072 /*! * cache_walk: Minimum on-disk page image size seen, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MIN_WRITTEN_SIZE 2072 +#define WT_STAT_DSRC_CACHE_STATE_MIN_WRITTEN_SIZE 2073 /*! * cache_walk: On-disk page image sizes smaller than a single allocation * unit, only reported if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_SMALLER_ALLOC_SIZE 2073 +#define WT_STAT_DSRC_CACHE_STATE_SMALLER_ALLOC_SIZE 2074 /*! * cache_walk: Pages created in memory and never written, only reported * if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MEMORY 2074 +#define WT_STAT_DSRC_CACHE_STATE_MEMORY 2075 /*! * cache_walk: Pages currently queued for eviction, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_QUEUED 2075 +#define WT_STAT_DSRC_CACHE_STATE_QUEUED 2076 /*! * cache_walk: Pages that could not be queued for eviction, only reported * if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_NOT_QUEUEABLE 2076 +#define WT_STAT_DSRC_CACHE_STATE_NOT_QUEUEABLE 2077 /*! * cache_walk: Refs skipped during cache traversal, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_REFS_SKIPPED 2077 +#define WT_STAT_DSRC_CACHE_STATE_REFS_SKIPPED 2078 /*! 
* cache_walk: Size of the root page, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_ROOT_SIZE 2078 +#define WT_STAT_DSRC_CACHE_STATE_ROOT_SIZE 2079 /*! * cache_walk: Total number of pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES 2079 +#define WT_STAT_DSRC_CACHE_STATE_PAGES 2080 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2080 +#define WT_STAT_DSRC_COMPRESS_READ 2081 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2081 +#define WT_STAT_DSRC_COMPRESS_WRITE 2082 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2082 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2083 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2083 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2084 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2084 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2085 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2085 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2086 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2086 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2087 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2087 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2088 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2088 +#define WT_STAT_DSRC_CURSOR_CREATE 2089 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2089 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2090 /*! 
cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2090 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2091 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2091 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2092 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2092 +#define WT_STAT_DSRC_CURSOR_INSERT 2093 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2093 +#define WT_STAT_DSRC_CURSOR_NEXT 2094 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2094 +#define WT_STAT_DSRC_CURSOR_PREV 2095 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2095 +#define WT_STAT_DSRC_CURSOR_REMOVE 2096 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2096 +#define WT_STAT_DSRC_CURSOR_RESET 2097 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2097 +#define WT_STAT_DSRC_CURSOR_RESTART 2098 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2098 +#define WT_STAT_DSRC_CURSOR_SEARCH 2099 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2099 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2100 /*! cursor: truncate calls */ -#define WT_STAT_DSRC_CURSOR_TRUNCATE 2100 +#define WT_STAT_DSRC_CURSOR_TRUNCATE 2101 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2101 +#define WT_STAT_DSRC_CURSOR_UPDATE 2102 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2102 +#define WT_STAT_DSRC_REC_DICTIONARY 2103 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2103 +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2104 /*! * reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2104 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2105 /*! 
reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2105 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2106 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2106 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2107 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2107 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2108 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2108 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2109 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2109 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2110 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2110 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2111 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2111 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2112 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2112 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2113 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2113 +#define WT_STAT_DSRC_REC_PAGES 2114 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2114 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2115 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2115 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2116 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2116 +#define WT_STAT_DSRC_SESSION_COMPACT 2117 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2117 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2118 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2118 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2119 /*! 
* @} diff --git a/src/support/stat.c b/src/support/stat.c index a9c0b24ef29..66710473ab9 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -64,6 +64,7 @@ static const char * const __stats_dsrc_desc[] = { "cache: pages requested from the cache", "cache: pages written from cache", "cache: pages written requiring in-memory restoration", + "cache: tracked dirty bytes in the cache", "cache: unmodified pages evicted", "cache_walk: Average difference between current eviction generation when the page was last considered", "cache_walk: Average on-disk page image size seen", @@ -225,6 +226,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_pages_requested = 0; stats->cache_write = 0; stats->cache_write_restore = 0; + /* not clearing cache_bytes_dirty */ stats->cache_eviction_clean = 0; /* not clearing cache_state_gen_avg_gap */ /* not clearing cache_state_avg_written_size */ @@ -372,6 +374,7 @@ __wt_stat_dsrc_aggregate_single( to->cache_pages_requested += from->cache_pages_requested; to->cache_write += from->cache_write; to->cache_write_restore += from->cache_write_restore; + to->cache_bytes_dirty += from->cache_bytes_dirty; to->cache_eviction_clean += from->cache_eviction_clean; to->cache_state_gen_avg_gap += from->cache_state_gen_avg_gap; to->cache_state_avg_written_size += @@ -535,6 +538,7 @@ __wt_stat_dsrc_aggregate( WT_STAT_READ(from, cache_pages_requested); to->cache_write += WT_STAT_READ(from, cache_write); to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); + to->cache_bytes_dirty += WT_STAT_READ(from, cache_bytes_dirty); to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); to->cache_state_gen_avg_gap += WT_STAT_READ(from, cache_state_gen_avg_gap); diff --git a/tools/wtstats/stat_data.py b/tools/wtstats/stat_data.py index d925dd67b80..5d385cda705 100644 --- a/tools/wtstats/stat_data.py +++ b/tools/wtstats/stat_data.py @@ -94,6 +94,7 @@ no_scale_per_second_list = [ 'btree: row-store leaf pages', 'cache: bytes 
currently in the cache', 'cache: overflow values cached in memory', + 'cache: tracked dirty bytes in the cache', 'cache_walk: Average difference between current eviction generation when the page was last considered', 'cache_walk: Average on-disk page image size seen', 'cache_walk: Clean pages currently in cache', @@ -186,6 +187,7 @@ no_clear_list = [ 'transaction: transaction range of IDs currently pinned by named snapshots', 'btree: btree checkpoint generation', 'cache: bytes currently in the cache', + 'cache: tracked dirty bytes in the cache', 'cache_walk: Average difference between current eviction generation when the page was last considered', 'cache_walk: Average on-disk page image size seen', 'cache_walk: Clean pages currently in cache', -- cgit v1.2.1 From 6da10a59c9b30f2bc3a6a0b3587d161db3326ad8 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Tue, 20 Dec 2016 11:06:59 +1100 Subject: WT-2833 Fix a compiler warning (#3208) --- src/utilities/util_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index 651cc7acf9c..95cd39322c4 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -268,7 +268,7 @@ dump_add_config(WT_SESSION *session, char **bufp, size_t *leftp, if (n < 0) return (util_err(session, EINVAL, NULL)); *bufp += n; - *leftp -= n; + *leftp -= (size_t)n; return (0); } -- cgit v1.2.1 From 7742cd7a80a7ce76c6ab1cc5eb62ac2d1f4f0afd Mon Sep 17 00:00:00 2001 From: David Hows Date: Tue, 20 Dec 2016 11:29:40 +1100 Subject: WT-2402 Fix setting the Autoconf options too late for strict checking (#3209) Fix setting the Autoconf options too late for strict checking. 
--- build_posix/configure.ac.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in index 952c9ae607d..0fef587b4b8 100644 --- a/build_posix/configure.ac.in +++ b/build_posix/configure.ac.in @@ -91,6 +91,9 @@ fi # Linux requires _GNU_SOURCE to be defined AS_CASE([$host_os], [linux*], [AM_CFLAGS="$AM_CFLAGS -D_GNU_SOURCE"]) +# Configure options. +AM_OPTIONS + # If enable-strict is configured, turn on as much error checking as we can for # this compiler. Intended for developers, and only works for gcc/clang, but it # fills a need. @@ -109,9 +112,6 @@ if test "$wt_cv_enable_strict" = "yes"; then AM_CFLAGS="$AM_CFLAGS $wt_cv_strict_warnings" fi -# Configure options. -AM_OPTIONS - # Java and Python APIs if test "$wt_cv_enable_java" = "yes" -o "$wt_cv_enable_python" = "yes"; then # Only a warning, we need to build release packages without SWIG. -- cgit v1.2.1 From a06d7cae9030429a49f00a9c1825d632593e8652 Mon Sep 17 00:00:00 2001 From: David Hows Date: Tue, 20 Dec 2016 16:52:05 +1100 Subject: WT-3091Add stats to test_perf001 test, so we can investigate what happened when it failed. 
(#3210) --- test/suite/test_perf001.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/suite/test_perf001.py b/test/suite/test_perf001.py index b22ed2baeb0..6331a3f64d6 100644 --- a/test/suite/test_perf001.py +++ b/test/suite/test_perf001.py @@ -40,7 +40,8 @@ class test_perf001(wttest.WiredTigerTestCase): scenarios = make_scenarios([ #('file-file', dict(tabletype='file',indextype='file')), - ('file-lsm', dict(tabletype='file',indextype='lsm')), + ('file-lsm', dict(tabletype='file',indextype='lsm', cfg='', + conn_config="statistics=(fast),statistics_log=(wait=1)")), #('lsm-file', dict(tabletype='lsm',indextype='file')), #('lsm-lsm', dict(tabletype='lsm',indextype='lsm')), ]) -- cgit v1.2.1 From 13d9445453ff5e4727040ceed972dc0923bb8ae5 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Fri, 23 Dec 2016 10:27:17 +1100 Subject: WT-0000 Cut WiredTiger WT-2.9.1 release (#3215) --- NEWS | 50 ++++++++++++++++++++++++++++++++++++++ README | 2 +- build_posix/aclocal/version-set.m4 | 2 +- dist/s_string.ok | 3 +++ src/docs/top/main.dox | 8 +++--- src/docs/upgrading.dox | 10 ++++++++ 6 files changed, 69 insertions(+), 6 deletions(-) diff --git a/NEWS b/NEWS index bdc84ed6ef5..268949b119f 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,53 @@ +WiredTiger release 2.9.1, 2016-12-22 +------------------------------------ + +New features and API changes; refer to the API documentation for full details: + +* SERVER-26545 Remove fixed-size limitation on WiredTiger hazard pointers. See the upgrading documentation for details +* WT-283 Add a new WT_SESSION::alter method that can be used to reconfigure table metadata +* WT-2670 Change the default file system access pattern advice for data files from random to no advice. Add access_pattern_hint configuration option for WT_SESSION::create API that can be used to advise the file system of expected access semantics. See the upgrading documentation for details. 
+* WT-3034 Add support for including updates when reading from named snapshots + +Significant changes and bug fixes: + +* WT-2960 Reduce likelihood of using the lookaside file, especially when inserting multi-megabyte values +* WT-3056 Allow projected table and join cursors to use primary keys +* WT-3070 Fix a bug in search_near on indexes + +Other noteworthy changes since the previous release: + +* WT-2336 Add a test validating schema operations via file system call monitoring +* WT-2402 Pad structures to avoid cache line sharing +* WT-2771 Add a statistic to track per-btree dirty cache usage +* WT-2833 Add projections to wt dump utility +* WT-2969 Possible snapshot corruption during compaction +* WT-3014 Add GCC/clang support for ELF symbol visibility +* WT-3021 Fixes for java log example, raw mode in java, and raw mode in log cursors +* WT-3025 Fix error path in log_force_sync +* WT-3028 Don't check for blocked eviction with in-memory workloads +* WT-3030 Fix a race between scans and splits reading the index hint +* WT-3037 Clean up some log slot comments +* WT-3048 WiredTiger maximum size warning uses the wrong format +* WT-3051 Remove external __wt_hex symbol +* WT-3052 Improve search if index hint is wrong +* WT-3053 Make Python use internal memory allocation again +* WT-3054 Make a PackOutputStream constructor that is compatible with the previous interface. 
+* WT-3055 When an AsyncOp is created, cache whether the cursor is "raw" +* WT-3057 WiredTiger hazard pointers should use the WT_REF, not the WT_PAGE +* WT-3061 Syscall testing should support pwrite64 on Linux +* WT-3064 Minor tree cleanups: .gitignore, NEWS misspelling +* WT-3066 Minor code cleanups +* WT-3068 Copy artifacts of test runs in wtperf_run script +* WT-3068 Have Jenkins include specific files for copy rather than exclude +* WT-3069 Fix LevelDB APIs build failures +* WT-3071 Fixed sign-conversion compiler errors in Java and Python SWIG code +* WT-3075 Document and enforce that WiredTiger now depends on Python 2.7 +* WT-3078 Test reconfiguration hang in the statlog server +* WT-3080 Python test suite: add elapsed time for tests +* WT-3082 Python test suite: shorten default run to avoid timeouts +* WT-3084 Fix Coverity resource leak complaint +* WT-3091 Add stats to test_perf001 test, so we can investigate what happened when it failed + WiredTiger release 2.9.0, 2016-09-06 ------------------------------------ diff --git a/README b/README index 55e9058826a..4b25a42f4eb 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -WiredTiger 2.9.1: (December 7, 2016) +WiredTiger 2.9.1: (December 23, 2016) This is version 2.9.1 of WiredTiger. 
diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4 index ecb45b5e73e..b3f2c50fad8 100644 --- a/build_posix/aclocal/version-set.m4 +++ b/build_posix/aclocal/version-set.m4 @@ -3,7 +3,7 @@ dnl build by dist/s_version VERSION_MAJOR=2 VERSION_MINOR=9 VERSION_PATCH=1 -VERSION_STRING='"WiredTiger 2.9.1: (December 7, 2016)"' +VERSION_STRING='"WiredTiger 2.9.1: (December 23, 2016)"' AC_SUBST(VERSION_MAJOR) AC_SUBST(VERSION_MINOR) diff --git a/dist/s_string.ok b/dist/s_string.ok index f2429237f21..2b998c27813 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -21,6 +21,7 @@ Alakuijala Alexandrescu's Alloc Async +AsyncOp Athanassoulis Athlon BBBBB @@ -276,6 +277,7 @@ PRNG PTHREAD PTR PackInputStream +PackOutputStream Pandis Phong PlatformSDK @@ -339,6 +341,7 @@ Split's Stoica StoreLoad StoreStore +Syscall TAILQ TCMalloc TESTUTIL diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox index 01acc849d50..84487c13174 100644 --- a/src/docs/top/main.dox +++ b/src/docs/top/main.dox @@ -6,12 +6,12 @@ WiredTiger is an high performance, scalable, production quality, NoSQL, @section releases Releases -@row{WiredTiger 2.9.0 (current), +@row{WiredTiger 2.9.1 (current), + [Release package], + [Documentation]} +@row{WiredTiger 2.9.0 (previous), [Release package], [Documentation]} -@row{WiredTiger 2.8.0 (previous), - [Release package], - [Documentation]} @row{Development branch, [Source code], [Documentation]} diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 0b0826f2646..fea0a4a8364 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -8,6 +8,16 @@ The \c hazard_max parameter to ::wiredtiger_open is now ignored. Memory is allocated for hazard pointers as required by each session. + +
Change to the default fadvise behavior for data files
+
+The old default behavior was to advise the file system that access would be +random for data files, and there was no way to alter that. We no longer +advise the file system of expected access patterns by default, and +have added a new \c access_pattern_hint configuration option available for +WT_SESSION::create that can be used to restore the old default by setting +the value to "random". +
@section version_290 Upgrading to Version 2.9.0 -- cgit v1.2.1 From 2efe896a63d36c49b18d7fba093b3052d565cb55 Mon Sep 17 00:00:00 2001 From: David Hows Date: Fri, 23 Dec 2016 11:22:20 +1100 Subject: Bump release version on develop to 2.9.2 --- README | 6 +++--- RELEASE_INFO | 2 +- build_posix/aclocal/version-set.m4 | 4 ++-- build_posix/aclocal/version.m4 | 2 +- dist/package/wiredtiger.spec | 2 +- src/docs/upgrading.dox | 4 ++++ 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/README b/README index 4b25a42f4eb..f7edae2835d 100644 --- a/README +++ b/README @@ -1,6 +1,6 @@ -WiredTiger 2.9.1: (December 23, 2016) +WiredTiger 2.9.2: (December 23, 2016) -This is version 2.9.1 of WiredTiger. +This is version 2.9.2 of WiredTiger. WiredTiger release packages and documentation can be found at: @@ -8,7 +8,7 @@ WiredTiger release packages and documentation can be found at: The documentation for this specific release can be found at: - http://source.wiredtiger.com/2.9.1/index.html + http://source.wiredtiger.com/2.9.2/index.html The WiredTiger source code can be found at: diff --git a/RELEASE_INFO b/RELEASE_INFO index 502b17188ce..b7145aa2cb3 100644 --- a/RELEASE_INFO +++ b/RELEASE_INFO @@ -1,6 +1,6 @@ WIREDTIGER_VERSION_MAJOR=2 WIREDTIGER_VERSION_MINOR=9 -WIREDTIGER_VERSION_PATCH=1 +WIREDTIGER_VERSION_PATCH=2 WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH" WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"` diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4 index b3f2c50fad8..c677ce41192 100644 --- a/build_posix/aclocal/version-set.m4 +++ b/build_posix/aclocal/version-set.m4 @@ -2,8 +2,8 @@ dnl build by dist/s_version VERSION_MAJOR=2 VERSION_MINOR=9 -VERSION_PATCH=1 -VERSION_STRING='"WiredTiger 2.9.1: (December 23, 2016)"' +VERSION_PATCH=2 +VERSION_STRING='"WiredTiger 2.9.2: (December 23, 2016)"' AC_SUBST(VERSION_MAJOR) AC_SUBST(VERSION_MINOR) diff --git a/build_posix/aclocal/version.m4 
b/build_posix/aclocal/version.m4 index a75ba93e405..29782a22f82 100644 --- a/build_posix/aclocal/version.m4 +++ b/build_posix/aclocal/version.m4 @@ -1,2 +1,2 @@ dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version -2.9.1 +2.9.2 diff --git a/dist/package/wiredtiger.spec b/dist/package/wiredtiger.spec index ca88f76b06b..aacdf327c98 100644 --- a/dist/package/wiredtiger.spec +++ b/dist/package/wiredtiger.spec @@ -1,5 +1,5 @@ Name: wiredtiger -Version: 2.9.1 +Version: 2.9.2 Release: 1%{?dist} Summary: WiredTiger data storage engine diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index fea0a4a8364..b73bd984abd 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -1,5 +1,9 @@ /*! @page upgrading Upgrading WiredTiger applications +@section version_291 Upgrading to Version 2.9.2 +
+
+ @section version_291 Upgrading to Version 2.9.1
-- cgit v1.2.1 From 190acd85f1183b11b2b2d9f90e6272f5a58fce71 Mon Sep 17 00:00:00 2001 From: David Hows Date: Fri, 23 Dec 2016 11:53:07 +1100 Subject: Fix 2.9.2 documentation stub issues --- src/docs/upgrading.dox | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index b73bd984abd..af612fb0aad 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -1,8 +1,6 @@ /*! @page upgrading Upgrading WiredTiger applications -@section version_291 Upgrading to Version 2.9.2 -
-
+@section version_292 Upgrading to Version 2.9.2 @section version_291 Upgrading to Version 2.9.1 -- cgit v1.2.1 From 20348a7afc0fb5a8dc888fd4c9885f07d70109ee Mon Sep 17 00:00:00 2001 From: Sulabh Mahajan Date: Fri, 23 Dec 2016 14:44:23 +1100 Subject: WT-2994 Create documentation describing page sizes and relationships (#3204) --- src/docs/Doxyfile | 8 + src/docs/file-formats.dox | 6 +- src/docs/programming.dox | 3 +- src/docs/spell.ok | 1 + src/docs/tune-compression.dox | 62 ----- src/docs/tune-page-size-and-comp.dox | 426 +++++++++++++++++++++++++++++++++++ src/docs/tune-page-sizes.dox | 142 ------------ src/docs/upgrading.dox | 2 +- 8 files changed, 440 insertions(+), 210 deletions(-) delete mode 100644 src/docs/tune-compression.dox create mode 100644 src/docs/tune-page-size-and-comp.dox delete mode 100644 src/docs/tune-page-sizes.dox diff --git a/src/docs/Doxyfile b/src/docs/Doxyfile index 69e9716b425..3d8c46962f1 100644 --- a/src/docs/Doxyfile +++ b/src/docs/Doxyfile @@ -216,11 +216,19 @@ ALIASES = "notyet{1}=Note: "\1" not yet supported in Wired "hrow{3}=
" \ "hrow{4}=" \ "hrow{5}=" \ + "hrow{6}=" \ + "hrow{7}=" \ + "hrow{8}=" \ + "hrow{9}=" \ "row{1}=" \ "row{2}=" \ "row{3}=" \ "row{4}=" \ "row{5}=" \ + "row{6}=" \ + "row{7}=" \ + "row{8}=" \ + "row{9}=" \ "configstart{2}=@param config\n Configuration string, see @ref config_strings. Permitted values:\n
\1\2\3
\1\2\3\4
\1\2\3\4\5
\1\2\3\4\5\6
\1\2\3\4\5\6\7
\1\2\3\4\5\6\7\8
\1\2\3\4\5\6\7\8\9
\1
\1\2
\1\2\3
\1\2\3\4
\1\2\3\4\5
\1\2\3\4\5\6
\1\2\3\4\5\6\7
\1\2\3\4\5\6\7\8
\1\2\3\4\5\6\7\8\9
@hrow{Name,Effect,Values}" \ "config{3}= @row{\1,\2,\3}" \ "configend=
" \ diff --git a/src/docs/file-formats.dox b/src/docs/file-formats.dox index d8990aca7a6..21dc4580bc2 100644 --- a/src/docs/file-formats.dox +++ b/src/docs/file-formats.dox @@ -110,7 +110,7 @@ considered. (See @subpage_single huffman for details.) compressing blocks of the backing object's file. The cost is additional CPU and memory use when reading and writing pages to disk. Note the additional CPU cost of block compression can be high, and should be -considered. (See @x_ref compression_formats for details.) +considered. (See @x_ref compression_considerations for details.) Block compression is disabled by default. @@ -146,7 +146,7 @@ Huffman encoding can be high, and should be considered. compressing blocks of the backing object's file. The cost is additional CPU and memory use when reading and writing pages to disk. Note the additional CPU cost of block compression can be high, and should be -considered. (See @x_ref compression_formats for details.) +considered. (See @x_ref compression_considerations for details.) Block compression is disabled by default. @@ -157,7 +157,7 @@ compression: block compression. compressing blocks of the backing object's file. The cost is additional CPU and memory use when reading and writing pages to disk. Note the additional CPU cost of block compression can be high, and should be -considered. (See @x_ref compression_formats for details.) +considered. (See @x_ref compression_considerations for details.) Block compression is disabled by default. diff --git a/src/docs/programming.dox b/src/docs/programming.dox index 81e612e8ee8..aa76bef4614 100644 --- a/src/docs/programming.dox +++ b/src/docs/programming.dox @@ -66,14 +66,13 @@ each of which is ordered by one or more columns. - @subpage_single wtstats

- @subpage_single tune_memory_allocator -- @subpage_single tune_page_sizes +- @subpage_single tune_page_size_and_comp - @subpage_single tune_cache - @subpage_single tune_bulk_load - @subpage_single tune_cursor_persist - @subpage_single tune_read_only - @subpage_single tune_durability - @subpage_single tune_checksum -- @subpage_single tune_compression - @subpage_single tune_file_alloc - @subpage_single tune_system_buffer_cache - @subpage_single tune_transparent_huge_pages diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 2413cbc93fb..f87f24cef5c 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -51,6 +51,7 @@ LIBS LLVM LOGREC LRVv +LRU LSB LSM LZ diff --git a/src/docs/tune-compression.dox b/src/docs/tune-compression.dox deleted file mode 100644 index 8db2151aa76..00000000000 --- a/src/docs/tune-compression.dox +++ /dev/null @@ -1,62 +0,0 @@ -/*! @page tune_compression Compression - -WiredTiger includes a number of optional compression techniques. Configuring -compression generally decreases on-disk and in-memory resource requirements -and the amount of I/O, and increases CPU cost when data are read and written. - -Configuring compression may change application throughput. For example, -in applications using solid-state drives (where I/O is less expensive), -turning off compression may increase application performance by reducing -CPU costs; in applications where I/O costs are more expensive, turning on -compression may increase application performance by reducing the overall -number of I/O operations. - -An example of turning on row-store key prefix compression: - -@snippet ex_all.c Configure key prefix compression on - -An example of turning on row-store or column-store dictionary compression: - -@snippet ex_all.c Configure dictionary compression on - -@section compression_formats Block Compression Formats -WiredTiger provides two methods of compressing your data when using block -compression: the raw and noraw methods. 
These methods change how WiredTiger -works to fit data into the blocks that are stored on disk. - -@subsection noraw_compression Noraw Compression -Noraw compression is the traditional compression model where a fixed -amount of data is given to the compression system, then turned into a -compressed block of data. The amount of data chosen to compress is the -data needed to fill the uncompressed block. Thus when compressed, the block will -be smaller than the normal data size and the sizes written to disk will often -vary depending on how compressible the data being stored is. Algorithms -using noraw compression include zlib-noraw, lz4-noraw and snappy. - -@subsection raw_compression Raw Compression -WiredTiger's raw compression takes advantage of compressors that provide a -streaming compression API. Using the streaming API WiredTiger will try to fit -as much data as possible into one block. This means that blocks created -with raw compression should be of similar size. Using a streaming compression -method should also make for less overhead in compression, as the setup and -initial work for compressing is done fewer times compared to the amount of -data stored. Algorithms using raw compression include zlib, lz4. - -@subsection to_raw_or_noraw Choosing between Raw and Noraw Compression -When looking at which compression method to use the biggest consideration is -that raw compression will normally provide higher compression levels while -using more CPU for compression. - -An additional consideration is that raw compression may provide a performance -advantage in workloads where data is accessed sequentially. That is because -more data is generally packed into each block on disk. Conversely, noraw -compression may perform better for workloads with random access patterns -because each block will tend to be smaller and require less work to read and -decompress. - -See @ref file_formats_compression for more information on available -compression techniques. 
- -See @ref compression for information on how to configure and enable compression. - - */ diff --git a/src/docs/tune-page-size-and-comp.dox b/src/docs/tune-page-size-and-comp.dox new file mode 100644 index 00000000000..70e9875bcc4 --- /dev/null +++ b/src/docs/tune-page-size-and-comp.dox @@ -0,0 +1,426 @@ +/*! @page tune_page_size_and_comp Tuning page size and compression + +This document aims to explain the role played by different page sizes in +WiredTiger. It also details motivation behind an application wanting to modify +these page sizes from their default values and the procedure to do so. +Applications commonly configure page sizes based on their workload's typical key +and value size. Once a page size has been chosen, appropriate defaults for the +other configuration values are derived by WiredTiger from the page sizes, and +relatively few applications will need to modify the other page and key/value +size configuration options. WiredTiger also offers several compression options +that have an impact on the size of the data both in-memory and on-disk. Hence +while selecting page sizes, an application must also look at its desired +compression needs. Since the data and workload for a table differs from one +table to another in the database, an application can choose to set page sizes +and compression options on a per-table basis. + +@section data_life_cycle Data life cycle +Before detailing each page size, here is a review of how data gets stored inside +WiredTiger: + - WiredTiger uses the physical disks to store data durably, creating on-disk +files for the tables in the database directory. It also caches the portion of +the table being currently accessed by the application for reading or writing in +main memory. + - WiredTiger maintains a table's data in memory using a data structure called a +B-Tree ( +B+ Tree to be specific), +referring to the nodes of a B-Tree as pages. Internal pages carry only keys. The +leaf pages store both keys and values. 
+ - The format of the in-memory pages is not the same as the format of the +on-disk pages. Therefore, the in-memory pages regularly go through a process +called reconciliation to create data structures appropriate for storage on the +disk. These data structures are referred to as on-disk pages. An application can +set a maximum size separately for the internal and leaf on-disk pages otherwise +WiredTiger uses a default value. If reconciliation of an in-memory page is +leading to an on-disk page size greater than this maximum, WiredTiger creates +multiple smaller on-disk pages. + - A component of WiredTiger called the Block Manager divides the on-disk pages +into smaller chunks called blocks, which then get written to the disk. The size +of these blocks is defined by a parameter called allocation_size, which is the +underlying unit of allocation for the file the data gets stored in. An +application might choose to have data compressed before it gets stored to disk +by enabling block compression. + - A database’s tables are usually much larger than the main memory available. +Not all of the data can be kept in memory at any given time. A process called +eviction takes care of making space for new data by freeing the memory of data +infrequently accessed. An eviction server regularly finds in-memory pages that +have not been accessed in a while (following an LRU algorithm). Several +background eviction threads continuously process these pages, reconcile them to +disk and remove them from the main memory. + - When an application does an insert or an update of a key/value pair, the +associated key is used to refer to an in-memory page. In the case of this page +not being in memory, appropriate on-disk page(s) are read and an in-memory page +constructed (the opposite of reconciliation). A data structure is maintained on +every in-memory page to store any insertions or modifications to the data done +on that page. 
As more and more data gets written to this page, the page’s memory +footprint keeps growing. + - An application can choose to set the maximum size a page is allowed to grow +in-memory. A default size is set by WiredTiger if the application doesn't +specify one. To keep page management efficient, as a page grows larger in-memory +and approaches this maximum size, if possible, it is split into smaller +in-memory pages. + - When doing an insert or an update, if a page grows larger than the maximum, +the application thread is used to forcefully evict this page. This is done to +split the growing page into smaller in-memory pages and reconcile them into +on-disk pages. Once written to the disk they are removed from the main memory, +making space for more data to be written. When an application gets involved in +forced eviction, it might take longer than usual to do these inserts and +updates. It is not always possible to (force) evict a page from memory and this +page can temporarily grow larger in size than the configured maximum. This page +then remains marked to be evicted and reattempts are made as the application +puts more data in it. + +@section configurable_page_struct Configurable page structures in WiredTiger +There are three page sizes that the user can configure: + 1. The maximum page size of any type of in-memory page in the WiredTiger cache, +memory_page_max. + 2. The maximum size of the on-disk page for an internal page, internal_page_max. + 3. The maximum size of the on-disk leaf page, leaf_page_max. + +There are additional configuration settings that tune more esoteric and +specialized data. Those are included for completeness but are rarely changed. + +@subsection memory_page_max memory_page_max +The maximum size a table’s page is allowed to grow to in memory before being +reconciled to disk. 
+ - An integer, with acceptable values between 512B and 10TB + - Default size: 5 MB + - Additionally constrained by the condition: + leaf_page_max <= memory_page_max <= cache_size/10 + - Motivation to tune the value: +\n memory_page_max is significant for applications wanting to tune for +consistency in write intensive workloads. + - This is the parameter to start with for tuning and trying different values +to find the correct balance between overall throughput and individual operation +latency for each table. + - Splitting a growing in-memory page into smaller pages and reconciliation +both require exclusive access to the page which makes an application's write +operations wait. Having a large memory_page_max means that the pages will need +to be split and reconciled less often. But when that happens, the duration that +an exclusive access to the page is required is longer, increasing the latency of +an application’s insert or update operations. Conversely, having a smaller +memory_page_max reduces the time taken for splitting and reconciling the pages, +but causes it to happen more frequently, forcing more frequent but shorter +exclusive accesses to the pages. + - Applications should choose the memory_page_max value considering the +trade-off between frequency of exclusive access to the pages (for reconciliation +or splitting pages into smaller pages) versus the duration that the exclusive +access is required. + - Configuration: +\n Specified as memory_page_max configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +

+     "key_format=S,value_format=S,memory_page_max=10MB"
+
+ +@subsection internal_page_max internal_page_max +The maximum page size for the reconciled on-disk internal pages of the B-Tree, +in bytes. When an internal page grows past this size, it splits into multiple +pages. + - An integer, with acceptable values between 512B and 512MB + - Default size: 4 KB (*appropriate for applications with relatively small keys) + - Additionally constrained by the condition: the size must be a multiple of the +allocation size + - Motivation to tune the value: +\n internal_page_max is significant for applications wanting to avoid excessive +L2 cache misses while searching the tree. + - Recall that only keys are stored on internal pages, so the type and size of +the key values for a table help drive the setting for this parameter. + - Should be sized to fit into on-chip caches. + - Applications doing full-table scans with out-of-memory workloads might +increase internal_page_max to transfer more data per I/O. + - Influences the shape of the B-Tree, i.e. depth and the number of children +each page in the B-Tree has. To iterate to the desired key/value pair in the B-Tree, +WiredTiger has to binary search the key-range in a page to determine the child +page to proceed to and continue down the depth until it reaches the correct leaf +page. Having an unusually deep B-Tree, or having too many children per page can +negatively impact time taken to iterate the B-Tree, slowing down the application. +The number of children per page and, hence, the tree depth depends upon the +number of keys that can be stored in an internal page, which is +internal_page_max divided by key size. Applications should choose an appropriate +internal_page_max size that keeps the B-Tree from getting too deep. + - Configuration: +\n Specified as internal_page_max configuration option to WT_SESSION::create(). +An example of such a configuration string is as follows: + +
+     "key_format=S,value_format=S,internal_page_max=16KB,leaf_page_max=1MB"
+
+ +@subsection leaf_page_max leaf_page_max +The maximum page size for the reconciled on-disk leaf pages of the B-Tree, in +bytes. When a leaf page grows past this size, it splits into multiple pages. + - An integer, with acceptable values between 512B and 512MB + - Default size: 32 KB (*appropriate for applications with relatively small keys +and values) + - Additionally constrained by the condition: must be a multiple of the +allocation size + - Motivation to tune the value: +\n leaf_page_max is significant for applications wanting to maximize sequential +data transfer from a storage device. + - Should be sized to maximize I/O performance (when reading from disk, it is +usually desirable to read a large amount of data, assuming some locality of +reference in the application's access pattern). + - Applications doing full-table scans through out-of-cache workloads might +increase leaf_page_max to transfer more data per I/O. + - Applications focused on read/write amplification might decrease the page +size to better match the underlying storage block size. + - Configuration: +\n Specified as leaf_page_max configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +
+     "key_format=S,value_format=S,internal_page_max=16KB,leaf_page_max=1MB"
+
+ +The following configuration items are rarely used. They are described +for completeness: + +@subsection allocation_size allocation_size +This is the underlying unit of allocation for the file. As the unit of file +allocation, it sets the minimum page size and how much space is wasted when +storing small amounts of data and overflow items. + - an integer between 512B and 128 MB + - must be a power-of-two + - default : 4 KB + - Motivation to tune the value: +\n Most applications should not need to tune the allocation size. + - To be compatible with virtual memory page sizes and direct I/O requirements +on the platform (4KB for most common server platforms) + - Smaller values decrease the file space required by overflow items. + - For example, if the allocation size is set to 4KB, an overflow item of +18,000 bytes requires 5 allocation units and wastes about 2KB of space. If the +allocation size is 16KB, the same overflow item would waste more than 10KB. + - Configuration: +\n Specified as allocation_size configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +
+     "key_format=S,value_format=S,allocation_size=4KB"
+
+ +@subsection key_val_max internal/leaf key/value max + - Overflow items +\n Overflow items are keys and values too large to easily store on a page. Overflow +items are stored separately in the file from the page where the item logically +appears, and so reading or writing an overflow item is more expensive than an +on-page item, normally requiring additional I/O. Additionally, overflow values +are not cached in memory. This means overflow items won't affect the caching +behavior of the application. It also means that each time an overflow value is +read, it is re-read from disk. + - internal_key_max +\n The largest key stored in an internal page, in bytes. If set, keys larger than +the specified size are stored as overflow items. + - The default and the maximum allowed value are both one-tenth the size of a +newly split internal page. + - leaf_key_max +\n The largest key stored in a leaf page, in bytes. If set, keys larger than the +specified size are stored as overflow items. + - The default value is one-tenth the size of a newly split leaf page. + - leaf_value_max +\n The largest value stored in a leaf page, in bytes. If set, values larger than +the specified size are stored as overflow items. + - The default is one-half the size of a newly split leaf page. + - If the size is larger than the maximum leaf page size, the page size is +temporarily ignored when large values are written. + - Motivation to tune the values: +\n Most applications should not need to tune the maximum key and value sizes. +Applications requiring a small page size, but also having latency concerns such +that the additional work to retrieve an overflow item is an issue, may find +modifying these values useful. +\n Since overflow items are separately stored in the on-disk file, aren't cached +and require additional I/O to access (read or write), applications should avoid +creating overflow items.
+ - Since page sizes also determine the default size of overflow items, i.e., +keys and values too large to easily store on a page, they can be configured to +avoid performance penalties working with overflow items: + - Applications with large keys and values, and concerned with latency, +might increase the page size to avoid creating overflow items, in order to avoid +the additional cost of retrieving them. + - Applications with large keys and values, doing random searches, might +decrease the page size to avoid wasting cache space on overflow items that +aren't likely to be needed. + - Applications with large keys and values, doing table scans, might +increase the page size to avoid creating overflow items, as the overflow items +must be read into memory in all cases, anyway. + - internal_key_max, leaf_key_max and leaf_value_max configuration values +allow applications to change the size at which a key or value will be treated +as an overflow item. + - Most applications should not need to tune the maximum key and value +sizes. + - The value of internal_key_max is relative to the maximum internal page +size. Because the number of keys on an internal page determines the depth of the +tree, the internal_key_max value can only be adjusted within a certain range, +and the configured value will be automatically adjusted by WiredTiger, if +necessary, to ensure a reasonable number of keys fit on an internal page. + - The values of leaf_key_max and leaf_value_max are not relative to the +maximum leaf page size. If either is larger than the maximum page size, the page +size will be ignored when the larger keys and values are being written, and a +larger page will be created as necessary. + - Configuration: +\n Specified as internal_key_max, leaf_key_max and leaf_value_max configuration +options to WT_SESSION::create(). An example of configuration string for a large +leaf overflow value: + +
+     "key_format=S,value_format=S,leaf_page_max=16KB,leaf_value_max=256KB"
+
+ +@subsection split_pct split_pct (split percentage) +The size (specified as percentage of internal/leaf page_max) at which the +reconciled page must be split into multiple smaller pages before being sent for +compression and then be written to the disk. If the reconciled page can fit into +a single on-disk page without the page growing beyond its set max size, +split_pct is ignored and the page isn't split. + - an integer between 25 and 100 + - default : 75 + - Motivation to tune the value: +\n Most applications should not need to tune the split percentage size. + - This value should be selected to avoid creating a large number of tiny +pages or repeatedly splitting whenever new entries are inserted. +\n For example, if the maximum page size is 1MB, a split_pct value of 10% +would potentially result in creating a large number of 100KB pages, which may +not be optimal for future I/O. Or, if the maximum page size is 1MB, a split_pct +value of 90% would potentially result in repeatedly splitting pages as the split +pages grow to 1MB over and over. The default value for split_pct is 75%, +intended to keep large pages relatively large, while still giving split pages +room to grow. + - Configuration: +\n Specified as split_pct configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +
+     "key_format=S,value_format=S,split_pct=60"
+
+ +@section compression_considerations Compression considerations +WiredTiger compresses data at several stages to preserve memory and disk space. +Applications can configure these different compression algorithms to tailor +their requirements between memory, disk and CPU consumption. Compression +algorithms other than block compression work by modifying how the keys and +values are represented, and hence reduce data size in-memory and on-disk. Block +compression, on the other hand, compresses the data in its binary representation +while saving it on the disk. + +Configuring compression may change application throughput. For example, in +applications using solid-state drives (where I/O is less expensive), turning +off compression may increase application performance by reducing CPU costs; in +applications where I/O costs are more expensive, turning on compression may +increase application performance by reducing the overall number of I/O +operations. + +WiredTiger uses some internal algorithms to compress the amount of data stored +that are not configurable, but always on. For example, run-length encoding reduces the +size requirement by storing sequential, duplicate values in the store only a +single time (with an associated count). + +Different compression options available with WiredTiger: + - Key-prefix + - Reduces the size requirement by storing any identical key prefix only once +per page. The cost is additional CPU and memory when operating on the in-memory +tree. Specifically, reverse sequential cursor movement (but not forward) through +a prefix-compressed page or the random lookup of a key/value pair will allocate +sufficient memory to hold some number of uncompressed keys. So, for example, if +key prefix compression only saves a small number of bytes per key, the +additional memory cost of instantiating the uncompressed key may mean prefix +compression is not worthwhile.
Further, in cases where the on-disk cost is the +primary concern, block compression may mean prefix compression is less useful. + - Configuration: +\n Specified as prefix_compression configuration option to +WT_SESSION::create(). Applications may limit the use of prefix compression by +configuring the minimum number of bytes that must be gained before prefix +compression is used with prefix_compression_min configuration option. An example +of such a configuration string is as follows: + +
+          "key_format=S,value_format=S,prefix_compression=true,prefix_compression_min=7"
+
+ + - Dictionary + - Reduces the size requirement by storing any identical value only once per +page. + - Configuration: +\n Specified as dictionary configuration option to +WT_SESSION::create(), which specifies the maximum number of unique values +remembered in the B-Tree row-store leaf page value dictionary. An example of +such a configuration string is as follows: + +
+          "key_format=S,value_format=S,dictionary=1000"
+
+ + - Huffman + - Reduces the size requirement by compressing individual key/value items, and +can be separately configured for either or both keys and values. The additional CPU +cost of Huffman encoding can be high, and should be considered. (See Huffman +Encoding for details.) + - Configuration: +\n Specified as huffman_key and/or huffman_value configuration option to +WT_SESSION::create(). These options can take values of "english" (to use a +built-in English language frequency table), "utf8" or "utf16" (to +use a custom utf8 or utf16 symbol frequency table file). An example of such a +configuration string is as follows: + +
+          "key_format=S,value_format=S,huffman_key=english,huffman_value=english"
+
+ + - Block Compression + - Reduces the size requirement of on-disk objects by compressing blocks of +the backing object's file. The additional CPU cost of block compression can be +high, and should be considered. When block compression has been configured, +configured page sizes will not match the actual size of the page on disk. + - WiredTiger provides two methods of compressing your data when using block +compression: the raw and noraw methods. These methods change how WiredTiger +works to fit data into the blocks that are stored on disk. Applications needing +to write specific sized blocks may want to consider implementing a +WT_COMPRESSOR::compress_raw function. + - Noraw compression: +\n A fixed amount of data is given to the compression system, then turned into +a compressed block of data. The amount of data chosen to compress is the data +needed to fill the uncompressed block. Thus when compressed, the block will be +smaller than the normal data size and the sizes written to disk will often vary +depending on how compressible the data being stored is. Algorithms using noraw +compression include zlib-noraw, lz4-noraw and snappy. +Noraw compression is better suited for workloads with random access patterns +because each block will tend to be smaller and require less work to read and +decompress. + - Raw compression: +\n WiredTiger's raw compression takes advantage of compressors that provide a +streaming compression API. Using the streaming API WiredTiger will try to fit as +much data as possible into one block. This means that blocks created with raw +compression should be of similar size. Using a streaming compression method +should also make for less overhead in compression, as the setup and initial work +for compressing is done fewer times compared to the amount of data stored. +Algorithms using raw compression include zlib, lz4. +Compared to noraw, raw compression provides more compression while using more +CPU. 
Raw compression may provide a performance advantage in workloads where data +is accessed sequentially. That is because more data is generally packed into +each block on disk. + - Configuration: +\n Specified as the block_compressor configuration option to +WT_SESSION::create(). If WiredTiger has builtin support for "lz4", "snappy", +"zlib" or "zstd" compression, these names are available as the value to the +option. An example of such a configuration string is as follows: + +
+          "key_format=S,value_format=S,block_compressor=snappy"
+
+ +See @ref compression for further information on how to configure and enable +different compression options. + +@subsection table_compress Table summarizing compression in WiredTiger + + +@hrow{Compression Type, Supported by row-store, Supported by variable col-store, + Supported by fixed col-store, Default config, Reduces in-mem size, + Reduces on-disk size, CPU and Memory cost} +@row{Key-prefix, yes, no, no, disabled, yes, yes, minor} +@row{Dictionary, yes, yes, no, disabled, yes, yes, minor} +@row{Huffman, yes, yes, no, disabled, yes, yes, can be high} +@row{Block, yes, yes, yes, disabled, no, yes, can be high} +
+ +*/ diff --git a/src/docs/tune-page-sizes.dox b/src/docs/tune-page-sizes.dox deleted file mode 100644 index 130e047a02d..00000000000 --- a/src/docs/tune-page-sizes.dox +++ /dev/null @@ -1,142 +0,0 @@ -/*! @page tune_page_sizes Page and overflow key/value sizes - -There are seven page and key/value size configuration strings: - -- allocation size (\c allocation_size), -- page sizes (\c internal_page_max and \c leaf_page_max), -- key and value sizes (\c internal_key_max, \c leaf_key_max and \c leaf_value_max), and the -- page-split percentage (\c split_pct). - -All seven are specified to the WT_SESSION::create method, in other -words, they are configurable on a per-file basis. - -Applications commonly configure page sizes, based on their workload's -typical key and value size. Once the correct page size has been chosen, -appropriate defaults for the other configuration values are derived from -the page sizes, and relatively few applications will need to modify the -other page and key/value size configuration options. - -An example of configuring page and key/value sizes: - -@snippet ex_all.c Create a table and configure the page size - -@section tune_page_sizes_sizes Page, key and value sizes - -The \c internal_page_max and \c leaf_page_max configuration values -specify a maximum size for Btree internal and leaf pages. That is, when -an internal or leaf page grows past that size, it splits into multiple -pages. Generally, internal pages should be sized to fit into on-chip -caches in order to minimize cache misses when searching the tree, while -leaf pages should be sized to maximize I/O performance (if reading from -disk is necessary, it is usually desirable to read a large amount of -data, assuming some locality of reference in the application's access -pattern). - -The default page size configurations (2KB for \c internal_page_max, 32KB -for \c leaf_page_max), are appropriate for applications with relatively -small keys and values. 
- -- Applications doing full-table scans through out-of-memory workloads -might increase both internal and leaf page sizes to transfer more data -per I/O. -- Applications focused on read/write amplification might decrease the page -size to better match the underlying storage block size. - -When block compression has been configured, configured page sizes will -not match the actual size of the page on disk. Block compression in -WiredTiger happens within the I/O subsystem, and so a page might split -even if subsequent compression would result in a resulting page size -small enough to leave as a single page. In other words, page sizes are -based on in-memory sizes, not on-disk sizes. Applications needing to -write specific sized blocks may want to consider implementing a -WT_COMPRESSOR::compress_raw function. - -The page sizes also determine the default size of overflow items, that -is, keys and values too large to easily store on a page. Overflow items -are stored separately in the file from the page where the item logically -appears, and so reading or writing an overflow item is more expensive -than an on-page item, normally requiring additional I/O. Additionally, -overflow values are not cached in memory. This means overflow items -won't affect the caching behavior of the application, but it also means -that each time an overflow value is read, it is re-read from disk. - -For both of these reasons, applications should avoid creating large -numbers of commonly referenced overflow items. This is especially -important for keys, as keys on internal pages are referenced during -random searches, not just during data retrieval. Generally, -applications should make every attempt to avoid creating overflow keys. - -- Applications with large keys and values, and concerned with latency, -might increase the page size to avoid creating overflow items, in order -to avoid the additional cost of retrieving them. 
- -- Applications with large keys and values, doing random searches, might -decrease the page size to avoid wasting cache space on overflow items -that aren't likely to be needed. - -- Applications with large keys and values, doing table scans, might -increase the page size to avoid creating overflow items, as the overflow -items must be read into memory in all cases, anyway. - -The \c internal_key_max, \c leaf_key_max and \c leaf_value_max -configuration values allow applications to change the size at which a -key or value will be treated as an overflow item. - -The value of \c internal_key_max is relative to the maximum internal -page size. Because the number of keys on an internal page determines -the depth of the tree, the \c internal_key_max value can only be -adjusted within a certain range, and the configured value will be -automatically adjusted by WiredTiger, if necessary to ensure a -reasonable number of keys fit on an internal page. - -The values of \c leaf_key_max and \c leaf_value_max are not relative to -the maximum leaf page size. If either is larger than the maximum page -size, the page size will be ignored when the larger keys and values are -being written, and a larger page will be created as necessary. - -Most applications should not need to tune the maximum key and value -sizes. Applications requiring a small page size, but also having -latency concerns such that the additional work to retrieve an overflow -item is an issue, may find them useful. - -An example of configuring a large leaf overflow value: - -@snippet ex_all.c Create a table and configure a large leaf value max - -@section tune_page_sizes_split_percentage Split percentage - -The \c split_pct configuration string configures the size of a split -page. When a page grows sufficiently large that it must be written as -multiple disk blocks, the newly written block size is \c split_pct -percent of the maximum page size. 
This value should be selected to -avoid creating a large number of tiny pages or repeatedly splitting -whenever new entries are inserted. For example, if the maximum page -size is 1MB, a \c split_pct value of 10% would potentially result in -creating a large number of 100KB pages, which may not be optimal for -future I/O. Or, if the maximum page size is 1MB, a \c split_pct value -of 90% would potentially result in repeatedly splitting pages as the -split pages grow to 1MB over and over. The default value for \c -split_pct is 75%, intended to keep large pages relatively large, while -still giving split pages room to grow. - -Most applications should not need to tune the split percentage size. - -@section tune_page_sizes_allocation_size Allocation size - -The \c allocation_size configuration value is the underlying unit of -allocation for the file. As the unit of file allocation, it sets the -minimum page size and how much space is wasted when storing small -amounts of data and overflow items. For example, if the allocation size -is set to 4KB, an overflow item of 18,000 bytes requires 5 allocation -units and wastes about 2KB of space. If the allocation size is 16KB, -the same overflow item would waste more than 10KB. - -The default allocation size is 4KB, chosen for compatibility with -virtual memory page sizes and direct I/O requirements on common server -platforms. - -Most applications should not need to tune the allocation size; it is -primarily intended for applications coping with the specific -requirements some file systems make to support features like direct I/O. - -*/ diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index af612fb0aad..59a299d48a1 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -321,7 +321,7 @@ be updated. The WT_SESSION::create \c internal_item_max and \c leaf_item_max configuration strings are now deprecated in favor of the \c internal_key_max, \c leaf_key_max, and \c leaf_value_max -configuration strings. 
See @ref tune_page_sizes for more information. +configuration strings. See @ref tune_page_size_and_comp for more information.

-- cgit v1.2.1 From 0605d628342faaaeb5fea7c6f816dda151c412f4 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 22 Dec 2016 22:58:28 -0500 Subject: WT-3092 Quiet a warning from autogen.sh (#3211) --- build_posix/aclocal/options.m4 | 2 +- lang/python/Makefile.am | 3 ++- src/cursor/cur_std.c | 1 + src/include/extern.h | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/build_posix/aclocal/options.m4 b/build_posix/aclocal/options.m4 index 7043430a6d6..bc4b31dfee3 100644 --- a/build_posix/aclocal/options.m4 +++ b/build_posix/aclocal/options.m4 @@ -57,7 +57,7 @@ AH_TEMPLATE( HAVE_CRC32_HARDWARE, [Define to 1 to configure CRC32 hardware support.]) AC_MSG_CHECKING(if --enable-crc32-hardware option specified) AC_ARG_ENABLE(crc32-hardware, - AC_HELP_STRING([--enable-crc32-hardware], + AS_HELP_STRING([--enable-crc32-hardware], [Enable CRC32 hardware support.]), r=$enableval, r=yes) case "$r" in no) wt_cv_enable_crc32_hardware=no;; diff --git a/lang/python/Makefile.am b/lang/python/Makefile.am index 03c65a57028..b32d0321194 100644 --- a/lang/python/Makefile.am +++ b/lang/python/Makefile.am @@ -17,7 +17,8 @@ install-exec-local: (cd $(PYSRC) && \ $(PYTHON) setup.py build_py -d $(abs_builddir)/build && \ $(PYTHON) setup.py build_ext -f -b $(abs_builddir)/build $(PYDIRS) && \ - $(PYTHON) setup.py install_lib -b $(abs_builddir)/build --skip-build $(PYTHON_INSTALL_ARG)) + $(PYTHON) setup.py install_lib -b $(abs_builddir)/build --skip-build $(PYTHON_INSTALL_ARG) && \ + rm -rf $(abs_builddir)/build) # We build in different places for an install vs running from the tree: # clean up both. 
Don't rely on "setup.py clean" -- everything that should diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index 6264de89df9..7ace6d49cf0 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -144,6 +144,7 @@ __wt_cursor_set_notsup(WT_CURSOR *cursor) */ int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) + WT_GCC_FUNC_ATTRIBUTE((cold)) { WT_SESSION_IMPL *session; diff --git a/src/include/extern.h b/src/include/extern.h index 4824dc93d96..2fb92c5faf0 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -315,7 +315,7 @@ extern int __wt_cursor_equals_notsup(WT_CURSOR *cursor, WT_CURSOR *other, int *e extern int __wt_cursor_search_near_notsup(WT_CURSOR *cursor, int *exact) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cursor_reconfigure_notsup(WT_CURSOR *cursor, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cursor_set_notsup(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cursor_set_key(WT_CURSOR *cursor, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -- cgit v1.2.1 From 3eaa4ea8d458f1a57d3aac916e2bc8a59450af97 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Fri, 23 Dec 2016 15:03:07 +1100 Subject: WT-3086 Add information about transaction state to cache stuck diagnostics (#3214) --- src/evict/evict_lru.c | 188 ++++++++++++++++++++++++++++++++++++++++++-------- src/include/extern.h | 2 +- 2 files changed, 159 insertions(+), 31 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 6fa728916de..0a2a9d28402 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -365,7 +365,7 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) ret = ETIMEDOUT; __wt_err(session, ret, "Cache stuck for too long, giving up"); - WT_TRET(__wt_cache_dump(session, NULL)); + WT_TRET(__wt_dump_stuck_info(session, NULL)); return (ret); } #endif @@ -1974,15 +1974,116 @@ __wt_evict_priority_clear(WT_SESSION_IMPL *session) } #ifdef HAVE_DIAGNOSTIC +static int __dump_txn_state(WT_SESSION_IMPL *, FILE *fp); +static int __dump_cache(WT_SESSION_IMPL *, FILE *fp); /* - * __wt_cache_dump -- - * Dump debugging information to a file (default stderr) about the size of - * the files in the cache. + * __dump_txn_state -- + * Output debugging information about the global transaction state. */ int -__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) +__dump_txn_state(WT_SESSION_IMPL *session, FILE *fp) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + WT_TXN *txn; + WT_TXN_STATE *s; + const char *iso_tag; + uint64_t id; + uint32_t i, session_cnt; + + conn = S2C(session); + txn_global = &conn->txn_global; + WT_ORDERED_READ(session_cnt, conn->session_cnt); + + /* Note: odd string concatenation avoids spelling errors. 
*/ + if (fprintf(fp, "==========\n" "transaction state dump\n") < 0) + return (EIO); + + if (fprintf(fp, + "current ID: %" PRIu64 "\n" + "last running ID: %" PRIu64 "\n" + "oldest ID: %" PRIu64 "\n" + "oldest named snapshot ID: %" PRIu64 "\n", + txn_global->current, txn_global->last_running, + txn_global->oldest_id, txn_global->nsnap_oldest_id) < 0) + return (EIO); + + if (fprintf(fp, + "checkpoint running? %s\n" + "checkpoint generation: %" PRIu64 "\n" + "checkpoint pinned ID: %" PRIu64 "\n" + "checkpoint txn ID: %" PRIu64 "\n" + "session count: %" PRIu32 "\n", + txn_global->checkpoint_running ? "yes" : "no", + txn_global->checkpoint_gen, + txn_global->checkpoint_pinned, + txn_global->checkpoint_txnid, + session_cnt) < 0) + return (EIO); + + if (fprintf(fp, "Dumping transaction state of active sessions\n") < 0) + return (EIO); + + /* + * Walk each session transaction state and dump information. Accessing + * the content of session handles is not thread safe, so some + * information may change while traversing if other threads are active + * at the same time, which is OK since this is diagnostic code. 
+ */ + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + /* Skip sessions with no active transaction */ + if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE) + continue; + + txn = &conn->sessions[i].txn; + switch (txn->isolation) { + case WT_ISO_READ_COMMITTED: + iso_tag = "WT_ISO_READ_COMMITTED"; + break; + case WT_ISO_READ_UNCOMMITTED: + iso_tag = "WT_ISO_READ_UNCOMMITTED"; + break; + case WT_ISO_SNAPSHOT: + iso_tag = "WT_ISO_SNAPSHOT"; + break; + default: + iso_tag = "INVALID"; + break; + } + + if (fprintf(fp, + "ID: %6" PRIu64 + ", mod count: %u" + ", pinned ID: %" PRIu64 + ", snap min: %" PRIu64 + ", snap max: %" PRIu64 + ", metadata pinned ID: %" PRIu64 + ", flags: 0x%08" PRIx32 + ", name: %s" + ", isolation: %s" "\n", + id, + txn->mod_count, + s->pinned_id, + txn->snap_min, + txn->snap_max, + s->metadata_pinned, + txn->flags, + conn->sessions[i].name == NULL ? + "EMPTY" : conn->sessions[i].name, + iso_tag) < 0) + return (EIO); + } + + return (0); +} + +/* + * __dump_cache -- + * Output debugging information about the size of the files in cache. + */ +int +__dump_cache(WT_SESSION_IMPL *session, FILE *fp) { - FILE *fp; WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle, *saved_dhandle; WT_PAGE *page; @@ -1997,13 +2098,9 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) conn = S2C(session); total_bytes = total_dirty_bytes = 0; - if (ofile == NULL) - fp = stderr; - else if ((fp = fopen(ofile, "w")) == NULL) - return (EIO); - /* Note: odd string concatenation avoids spelling errors. 
*/ - (void)fprintf(fp, "==========\n" "cache dump\n"); + if (fprintf(fp, "==========\n" "cache dump\n") < 0) + return (EIO); saved_dhandle = session->dhandle; TAILQ_FOREACH(dhandle, &conn->dhqh, q) { @@ -2048,13 +2145,17 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) } session->dhandle = NULL; - if (dhandle->checkpoint == NULL) - (void)fprintf(fp, "%s(): \n", dhandle->name); - else - (void)fprintf(fp, "%s(checkpoint=%s): \n", - dhandle->name, dhandle->checkpoint); - if (intl_pages != 0) - (void)fprintf(fp, + if (dhandle->checkpoint == NULL) { + if (fprintf(fp, + "%s(): \n", dhandle->name) < 0) + return (EIO); + } else { + if (fprintf(fp, "%s(checkpoint=%s): \n", + dhandle->name, dhandle->checkpoint) < 0) + return (EIO); + } + if (intl_pages != 0) { + if (fprintf(fp, "\t" "internal: " "%" PRIu64 " pages, " "%" PRIu64 "MB, " @@ -2069,9 +2170,11 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) (intl_bytes - intl_dirty_bytes) >> 20, intl_dirty_bytes >> 20, intl_bytes_max >> 20, - intl_dirty_bytes_max >> 20); - if (leaf_pages != 0) - (void)fprintf(fp, + intl_dirty_bytes_max >> 20) < 0) + return (EIO); + } + if (leaf_pages != 0) { + if (fprintf(fp, "\t" "leaf: " "%" PRIu64 " pages, " "%" PRIu64 "MB, " @@ -2086,7 +2189,9 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) (leaf_bytes - leaf_dirty_bytes) >> 20, leaf_dirty_bytes >> 20, leaf_bytes_max >> 20, - leaf_dirty_bytes_max >> 20); + leaf_dirty_bytes_max >> 20) < 0) + return (EIO); + } total_bytes += intl_bytes + leaf_bytes; total_dirty_bytes += intl_dirty_bytes + leaf_dirty_bytes; @@ -2099,16 +2204,39 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) */ total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes); - (void)fprintf(fp, + if (fprintf(fp, "cache dump: " - "total found = %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB\n" - "total dirty bytes = %" PRIu64 "MB\n", + "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB\n" + "total dirty 
bytes: %" PRIu64 "MB\n", total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20, - total_dirty_bytes >> 20); - (void)fprintf(fp, "==========\n"); - - if (ofile != NULL && fclose(fp) != 0) + total_dirty_bytes >> 20) < 0) return (EIO); + if (fprintf(fp, "==========\n") < 0) + return (EIO); + return (0); } + +/* + * __wt_dump_stuck_info -- + * Dump debugging information to a file (default stderr) about the state + * of WiredTiger when we have determined that the cache is stuck full. + */ +int +__wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile) +{ + FILE *fp; + WT_DECL_RET; + + if (ofile == NULL) + fp = stderr; + else if ((fp = fopen(ofile, "w")) == NULL) + return (EIO); + + WT_ERR(__dump_txn_state(session, fp)); + WT_ERR(__dump_cache(session, fp)); +err: if (ofile != NULL && fclose(fp) != 0) + return (EIO); + return (ret); +} #endif diff --git a/src/include/extern.h b/src/include/extern.h index 2fb92c5faf0..bb7fbddcae5 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -352,7 +352,7 @@ extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -- cgit v1.2.1 From 9216a5b64ec51bc1e381b96fe85345915d8fcaeb Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 22 Dec 2016 23:12:30 -0500 Subject: WT-3093 Reduce the size of WT_PAGE. (#3212) * Inline read/write locks in their structures the same way we handle spinlocks. WiredTiger currently has no need for a separate allocation, that was left over from the original POSIX pthread implementation. * Remove the lock name field from the read/write lock structure, shrinking the lock from 16B to 8B, the name field was never used, and it should be easy to identify the read/write lock's purpose from the enclosing structure. This means we no longer need two separate structures (the lock and the lock plus name), which simplifies the actual implementation. * Reduce the WT_PAGE size by pushing all of the variable-length column-store RLE array off-page into a separate allocation (instead of just the array itself), and moving the number-of-entries for the leaf pages out of the per page-type union. The latter change simplifies a bunch of stuff, row-store and fixed-length column-store no longer require a structure in the union at all, and lots of the #define's to handle that go away. * Move WT_ITEM.flags to the end of the structure, there's no reason to leave it in the middle anymore, and it's stylistically odd. 
--- src/block/block_write.c | 4 +- src/btree/bt_curnext.c | 4 +- src/btree/bt_curprev.c | 8 +-- src/btree/bt_cursor.c | 15 +++--- src/btree/bt_debug.c | 6 +-- src/btree/bt_delete.c | 7 ++- src/btree/bt_discard.c | 9 ++-- src/btree/bt_handle.c | 3 +- src/btree/bt_ovfl.c | 8 +-- src/btree/bt_page.c | 28 ++++++----- src/btree/bt_rebalance.c | 2 +- src/btree/bt_ret.c | 4 +- src/btree/bt_slvg.c | 26 +++++----- src/btree/bt_split.c | 6 +-- src/btree/bt_stat.c | 3 +- src/btree/bt_vrfy.c | 8 +-- src/btree/col_modify.c | 5 +- src/btree/col_srch.c | 7 ++- src/btree/row_key.c | 12 ++--- src/btree/row_modify.c | 9 ++-- src/btree/row_srch.c | 24 ++++----- src/conn/conn_dhandle.c | 2 +- src/conn/conn_handle.c | 3 +- src/conn/conn_log.c | 26 +++++----- src/conn/conn_sweep.c | 8 +-- src/cursor/cur_backup.c | 16 +++--- src/cursor/cur_log.c | 4 +- src/evict/evict_lru.c | 2 +- src/include/btmem.h | 110 +++++++++++++++++++++--------------------- src/include/btree.h | 2 +- src/include/btree.i | 6 +-- src/include/column.i | 23 ++++----- src/include/connection.h | 2 +- src/include/cursor.h | 4 +- src/include/dhandle.h | 2 +- src/include/extern.h | 20 ++++---- src/include/log.h | 2 +- src/include/lsm.h | 2 +- src/include/mutex.h | 18 ++----- src/include/thread_group.h | 2 +- src/include/txn.h | 4 +- src/include/verify_build.h | 1 - src/include/wiredtiger.in | 12 ++--- src/include/wt_internal.h | 6 ++- src/log/log.c | 12 ++--- src/lsm/lsm_cursor.c | 2 +- src/lsm/lsm_tree.c | 10 ++-- src/reconcile/rec_track.c | 8 +-- src/reconcile/rec_write.c | 9 ++-- src/schema/schema_util.c | 6 +-- src/session/session_api.c | 4 +- src/session/session_dhandle.c | 21 ++++---- src/support/mtx_rw.c | 93 +++++++++++------------------------ src/support/thread_group.c | 26 +++++----- src/txn/txn.c | 22 ++++----- src/txn/txn_ckpt.c | 10 ++-- src/txn/txn_nsnap.c | 12 ++--- 57 files changed, 328 insertions(+), 382 deletions(-) diff --git a/src/block/block_write.c b/src/block/block_write.c index 
d08aba45920..ea7859d6a38 100644 --- a/src/block/block_write.c +++ b/src/block/block_write.c @@ -43,10 +43,10 @@ __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len) * more targeted solution at some point. */ if (!conn->hot_backup) { - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (!conn->hot_backup) ret = __wt_ftruncate(session, block->fh, len); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); } /* diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index 4d3976f9647..ba5fceae7c7 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -338,7 +338,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { } /* Check for the end of the page. */ - if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1) + if (cbt->row_iteration_slot >= page->entries * 2 + 1) return (WT_NOTFOUND); ++cbt->row_iteration_slot; @@ -356,7 +356,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->ins = NULL; cbt->slot = cbt->row_iteration_slot / 2 - 1; - rip = &page->pg_row_d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index 2dd443ffac1..602c01b60eb 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -458,13 +458,13 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) WT_RET(__wt_row_leaf_keys(session, page)); - if (page->pg_row_entries == 0) + if (page->entries == 0) cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); else cbt->ins_head = - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + WT_ROW_INSERT_SLOT(page, page->entries - 1); cbt->ins = WT_SKIP_LAST(cbt->ins_head); - cbt->row_iteration_slot = page->pg_row_entries * 2 + 1; + cbt->row_iteration_slot = 
page->entries * 2 + 1; cbt->rip_saved = NULL; goto new_insert; } @@ -515,7 +515,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->ins = NULL; cbt->slot = cbt->row_iteration_slot / 2 - 1; - rip = &page->pg_row_d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 650289f2cd8..d18b9b76992 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -163,7 +163,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * column-store pages don't have slots, but map one-to-one to * keys, check for retrieval past the end of the page. */ - if (cbt->recno >= cbt->ref->ref_recno + page->pg_fix_entries) + if (cbt->recno >= cbt->ref->ref_recno + page->entries) return (false); /* @@ -173,9 +173,9 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) break; case BTREE_COL_VAR: /* The search function doesn't check for empty pages. */ - if (page->pg_var_entries == 0) + if (page->entries == 0) return (false); - WT_ASSERT(session, cbt->slot < page->pg_var_entries); + WT_ASSERT(session, cbt->slot < page->entries); /* * Column-store updates are stored as "insert" objects. If @@ -191,16 +191,16 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * backing store; check the cell for a record already deleted * when read. */ - cip = &page->pg_var_d[cbt->slot]; + cip = &page->pg_var[cbt->slot]; if ((cell = WT_COL_PTR(page, cip)) == NULL || __wt_cell_type(cell) == WT_CELL_DEL) return (false); break; case BTREE_ROW: /* The search function doesn't check for empty pages. 
*/ - if (page->pg_row_entries == 0) + if (page->entries == 0) return (false); - WT_ASSERT(session, cbt->slot < page->pg_row_entries); + WT_ASSERT(session, cbt->slot < page->entries); /* * See above: for row-store, no insert object can have the same @@ -418,8 +418,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * might be legitimately positioned after the last page slot). * Ignore those cases, it makes things too complicated. */ - if (cbt->slot != 0 && - cbt->slot != cbt->ref->page->pg_row_entries - 1) + if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1) valid = __cursor_valid(cbt, &upd); } if (!valid) { diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index d507cc0e396..957ccdbea1a 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -662,18 +662,18 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) break; case WT_PAGE_COL_FIX: WT_RET(ds->f(ds, " recno %" PRIu64, ref->ref_recno)); - entries = page->pg_fix_entries; + entries = page->entries; break; case WT_PAGE_COL_VAR: WT_RET(ds->f(ds, " recno %" PRIu64, ref->ref_recno)); - entries = page->pg_var_entries; + entries = page->entries; break; case WT_PAGE_ROW_INT: WT_INTL_INDEX_GET(session, page, pindex); entries = pindex->entries; break; case WT_PAGE_ROW_LEAF: - entries = page->pg_row_entries; + entries = page->entries; break; WT_ILLEGAL_VALUE(session); } diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index 00e41475de9..b55ad291c5e 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -318,13 +318,12 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * hard case is if a page splits: the update structures might be moved * to different pages, and we still have to find them all for an abort. */ - if (page_del != NULL) WT_RET(__wt_calloc_def( - session, page->pg_row_entries + 1, &page_del->update_list)); + session, page->entries + 1, &page_del->update_list)); /* Allocate the per-page update array. 
*/ - WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array)); + WT_ERR(__wt_calloc_def(session, page->entries, &upd_array)); page->modify->mod_row_update = upd_array; /* @@ -332,7 +331,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * structures, fill in the per-page update array with references to * deleted items. */ - for (i = 0, size = 0; i < page->pg_row_entries; ++i) { + for (i = 0, size = 0; i < page->entries; ++i) { WT_ERR(__wt_calloc_one(session, &upd)); WT_UPDATE_DELETED_SET(upd); diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index c2733d6567b..d2beb84fee9 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -206,8 +206,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) if (mod->mod_col_update != NULL) __free_skip_array(session, mod->mod_col_update, page->type == - WT_PAGE_COL_FIX ? 1 : page->pg_var_entries, - update_ignore); + WT_PAGE_COL_FIX ? 1 : page->entries, update_ignore); break; case WT_PAGE_ROW_LEAF: /* @@ -219,12 +218,12 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) */ if (mod->mod_row_insert != NULL) __free_skip_array(session, mod->mod_row_insert, - page->pg_row_entries + 1, update_ignore); + page->entries + 1, update_ignore); /* Free the update array. */ if (mod->mod_row_update != NULL) __free_update(session, mod->mod_row_update, - page->pg_row_entries, update_ignore); + page->entries, update_ignore); break; } @@ -332,7 +331,7 @@ static void __free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) { /* Free the RLE lookup array. */ - __wt_free(session, page->pg_var_repeats); + __wt_free(session, page->u.col_var.repeats); } /* diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 47c7972dd57..6ed70788759 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -359,8 +359,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) } /* Initialize locks. 
*/ - WT_RET(__wt_rwlock_alloc( - session, &btree->ovfl_lock, "btree overflow lock")); + __wt_rwlock_init(session, &btree->ovfl_lock); WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush")); btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */ diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c index 29ea561db3a..ae0da62af57 100644 --- a/src/btree/bt_ovfl.c +++ b/src/btree/bt_ovfl.c @@ -67,11 +67,11 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, * Acquire the overflow lock, and retest the on-page cell's value inside * the lock. */ - __wt_readlock(session, S2BT(session)->ovfl_lock); + __wt_readlock(session, &S2BT(session)->ovfl_lock); ret = __wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM ? __wt_ovfl_txnc_search(page, unpack->data, unpack->size, store) : __ovfl_read(session, unpack->data, unpack->size, store); - __wt_readunlock(session, S2BT(session)->ovfl_lock); + __wt_readunlock(session, &S2BT(session)->ovfl_lock); return (ret); } @@ -249,7 +249,7 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) * Acquire the overflow lock to avoid racing with a thread reading the * backing overflow blocks. */ - __wt_writelock(session, btree->ovfl_lock); + __wt_writelock(session, &btree->ovfl_lock); switch (unpack->raw) { case WT_CELL_KEY_OVFL: @@ -263,7 +263,7 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_ILLEGAL_VALUE(session); } - __wt_writeunlock(session, btree->ovfl_lock); + __wt_writeunlock(session, &btree->ovfl_lock); /* Free the backing disk blocks. 
*/ return (bm->free(bm, session, unpack->data, unpack->size)); diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 7bac7079fe8..f20f6398e37 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -67,7 +67,7 @@ __wt_page_alloc(WT_SESSION_IMPL *session, switch (type) { case WT_PAGE_COL_FIX: - page->pg_fix_entries = alloc_entries; + page->entries = alloc_entries; break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: @@ -102,12 +102,12 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { } break; case WT_PAGE_COL_VAR: - page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); - page->pg_var_entries = alloc_entries; + page->pg_var = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); + page->entries = alloc_entries; break; case WT_PAGE_ROW_LEAF: - page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE)); - page->pg_row_entries = alloc_entries; + page->pg_row = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE)); + page->entries = alloc_entries; break; WT_ILLEGAL_VALUE(session); } @@ -333,9 +333,10 @@ __inmem_col_var( WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; const WT_PAGE_HEADER *dsk; + size_t size; uint64_t rle; - size_t bytes_allocated; uint32_t i, indx, n, repeat_off; + void *p; btree = S2BT(session); dsk = page->dsk; @@ -343,7 +344,6 @@ __inmem_col_var( repeats = NULL; repeat_off = 0; unpack = &_unpack; - bytes_allocated = 0; /* * Walk the page, building references: the page contains unsorted value @@ -351,7 +351,7 @@ __inmem_col_var( * (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL). 
*/ indx = 0; - cip = page->pg_var_d; + cip = page->pg_var; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { __wt_cell_unpack(cell, unpack); WT_COL_PTR_SET(cip, WT_PAGE_DISK_OFFSET(page, cell)); @@ -367,12 +367,14 @@ __inmem_col_var( if (rle > 1) { if (repeats == NULL) { __inmem_col_var_repeats(session, page, &n); - WT_RET(__wt_realloc_def(session, - &bytes_allocated, n + 1, &repeats)); + size = sizeof(WT_COL_VAR_REPEAT) + + (n + 1) * sizeof(WT_COL_RLE); + WT_RET(__wt_calloc(session, 1, size, &p)); + *sizep += size; - page->pg_var_repeats = repeats; + page->u.col_var.repeats = p; page->pg_var_nrepeats = n; - *sizep += bytes_allocated; + repeats = page->pg_var_repeats; } repeats[repeat_off].indx = indx; repeats[repeat_off].recno = recno; @@ -569,7 +571,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) unpack = &_unpack; /* Walk the page, building indices. */ - rip = page->pg_row_d; + rip = page->pg_row; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { __wt_cell_unpack(cell, unpack); switch (unpack->type) { diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c index 29380459b94..24b4f7bb33d 100644 --- a/src/btree/bt_rebalance.c +++ b/src/btree/bt_rebalance.c @@ -265,7 +265,7 @@ __rebalance_row_leaf_key(WT_SESSION_IMPL *session, */ WT_RET(__wt_bt_read(session, rs->tmp1, addr, addr_len)); WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, 0, &page)); - ret = __wt_row_leaf_key_copy(session, page, &page->pg_row_d[0], key); + ret = __wt_row_leaf_key_copy(session, page, &page->pg_row[0], key); __wt_page_out(session, &page); return (ret); } diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index 8ef2db67e7b..6409a1a180c 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -64,10 +64,10 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) } /* Take the value from the original page cell. 
*/ - cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]); + cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]); break; case WT_PAGE_ROW_LEAF: - rip = &page->pg_row_d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; /* * If the cursor references a WT_INSERT item, take its key. diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index fde4d4fb9de..a8243eba17f 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -603,9 +603,9 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, */ WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, 0, &page)); WT_ERR(__wt_row_leaf_key_copy(session, - page, &page->pg_row_d[0], &trk->row_start)); - WT_ERR(__wt_row_leaf_key_copy(session, page, - &page->pg_row_d[page->pg_row_entries - 1], &trk->row_stop)); + page, &page->pg_row[0], &trk->row_start)); + WT_ERR(__wt_row_leaf_key_copy(session, + page, &page->pg_row[page->entries - 1], &trk->row_stop)); __wt_verbose(session, WT_VERB_SALVAGE, "%s start key %s", @@ -1244,10 +1244,10 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_RET(__wt_page_in(session, ref, 0)); page = ref->page; - entriesp = page->type == WT_PAGE_COL_VAR ? - &page->pg_var_entries : &page->pg_fix_entries; + entriesp = + page->type == WT_PAGE_COL_VAR ? &page->entries : &page->entries; - save_col_var = page->pg_var_d; + save_col_var = page->pg_var; save_entries = *entriesp; /* @@ -1303,7 +1303,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); /* Reset the page. */ - page->pg_var_d = save_col_var; + page->pg_var = save_col_var; *entriesp = save_entries; ret = __wt_page_release(session, ref, 0); @@ -1973,14 +1973,14 @@ __slvg_row_build_leaf( /* We should have selected some entries, but not the entire page. 
*/ WT_ASSERT(session, skip_start + skip_stop > 0 && - skip_start + skip_stop < page->pg_row_entries); + skip_start + skip_stop < page->entries); /* * Take a copy of this page's first key to define the start of * its range. The key may require processing, otherwise, it's * a copy from the page. */ - rip = page->pg_row_d + skip_start; + rip = page->pg_row + skip_start; WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); WT_ERR(__wt_row_ikey_incr( session, ref->home, 0, key->data, key->size, ref)); @@ -1988,14 +1988,14 @@ __slvg_row_build_leaf( /* Set the referenced flag on overflow pages we're using. */ if (trk->trk_ovfl_cnt != 0) WT_ERR(__slvg_row_ovfl(session, - trk, page, skip_start, page->pg_row_entries - skip_stop)); + trk, page, skip_start, page->entries - skip_stop)); /* * Change the page to reflect the correct record count: there is no * need to copy anything on the page itself, the entries value limits * the number of page items. */ - page->pg_row_entries -= skip_stop; + page->entries -= skip_stop; cookie->skip = skip_start; /* @@ -2014,7 +2014,7 @@ __slvg_row_build_leaf( WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); /* Reset the page. */ - page->pg_row_entries += skip_stop; + page->entries += skip_stop; /* * Discard our hazard pointer and evict the page, updating the @@ -2081,7 +2081,7 @@ __slvg_row_ovfl(WT_SESSION_IMPL *session, * We're merging a row-store page, and we took some number of records, * figure out which (if any) overflow records we used. 
*/ - for (rip = page->pg_row_d + start; start < stop; ++start, ++rip) { + for (rip = page->pg_row + start; start < stop; ++start, ++rip) { copy = WT_ROW_KEY_COPY(rip); (void)__wt_row_leaf_key_info( page, copy, NULL, &cell, NULL, NULL); diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index fe49f937719..6b0b8a08c02 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1770,9 +1770,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) /* Find the last item on the page. */ if (type == WT_PAGE_ROW_LEAF) - ins_head = page->pg_row_entries == 0 ? + ins_head = page->entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + WT_ROW_INSERT_SLOT(page, page->entries - 1); else ins_head = WT_COL_APPEND(page); moved_ins = WT_SKIP_LAST(ins_head); @@ -1822,7 +1822,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) key->size = WT_INSERT_KEY_SIZE(ins); } else WT_ERR(__wt_row_leaf_key( - session, page, &page->pg_row_d[0], key, true)); + session, page, &page->pg_row[0], key, true)); WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); parent_incr += sizeof(WT_IKEY) + key->size; __wt_scr_free(session, &key); diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index f4701a858d5..0da0e0807bd 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -106,8 +106,7 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) switch (page->type) { case WT_PAGE_COL_FIX: WT_STAT_INCR(session, stats, btree_column_fix); - WT_STAT_INCRV( - session, stats, btree_entries, page->pg_fix_entries); + WT_STAT_INCRV(session, stats, btree_entries, page->entries); break; case WT_PAGE_COL_INT: WT_STAT_INCR(session, stats, btree_column_internal); diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index 340f9bb6f0e..05990918215 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -386,7 +386,7 @@ recno_chk: if (recno != vs->record_total + 1) } switch (page->type) { case WT_PAGE_COL_FIX: 
- vs->record_total += page->pg_fix_entries; + vs->record_total += page->entries; break; case WT_PAGE_COL_VAR: recno = 0; @@ -614,7 +614,7 @@ __verify_row_leaf_key_order( * If a tree is empty (just created), it won't have keys; if there * are no keys, we're done. */ - if (page->pg_row_entries == 0) + if (page->entries == 0) return (0); /* @@ -624,7 +624,7 @@ __verify_row_leaf_key_order( */ if (vs->max_addr->size != 0) { WT_RET(__wt_row_leaf_key_copy( - session, page, page->pg_row_d, vs->tmp1)); + session, page, page->pg_row, vs->tmp1)); /* * Compare the key against the largest key we've seen so far. @@ -653,7 +653,7 @@ __verify_row_leaf_key_order( /* Update the largest key we've seen to the last key on this page. */ WT_RET(__wt_row_leaf_key_copy(session, page, - page->pg_row_d + (page->pg_row_entries - 1), vs->max_key)); + page->pg_row + (page->entries - 1), vs->max_key)); (void)__wt_page_addr_string(session, ref, vs->max_addr); return (0); diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index a7920da5267..9ccb9728189 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -115,9 +115,8 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, page, mod->mod_col_update, ins_headp, 1); ins_headp = &mod->mod_col_update[0]; } else { - WT_PAGE_ALLOC_AND_SWAP(session, - page, mod->mod_col_update, ins_headp, - page->pg_var_entries); + WT_PAGE_ALLOC_AND_SWAP(session, page, + mod->mod_col_update, ins_headp, page->entries); ins_headp = &mod->mod_col_update[cbt->slot]; } diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index 64ee9e94f4c..c72d66f8796 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -240,8 +240,8 @@ leaf_only: cbt->compare = 1; return (0); } - if (recno >= current->ref_recno + page->pg_fix_entries) { - cbt->recno = current->ref_recno + page->pg_fix_entries; + if (recno >= current->ref_recno + page->entries) { + cbt->recno = current->ref_recno + page->entries; goto past_end; } else { cbt->recno = recno; @@ 
-257,8 +257,7 @@ leaf_only: } if ((cip = __col_var_search(current, recno, NULL)) == NULL) { cbt->recno = __col_var_last_recno(current); - cbt->slot = page->pg_var_entries == 0 ? - 0 : page->pg_var_entries - 1; + cbt->slot = page->entries == 0 ? 0 : page->entries - 1; goto past_end; } else { cbt->recno = recno; diff --git a/src/btree/row_key.c b/src/btree/row_key.c index 99ee34a6c5d..032fdf7d897 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -26,7 +26,7 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) btree = S2BT(session); - if (page->pg_row_entries == 0) { /* Just checking... */ + if (page->entries == 0) { /* Just checking... */ F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS); return (0); } @@ -51,15 +51,15 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) */ WT_RET(__wt_scr_alloc(session, 0, &key)); WT_RET(__wt_scr_alloc(session, - (uint32_t)__bitstr_size(page->pg_row_entries), &tmp)); + (uint32_t)__bitstr_size(page->entries), &tmp)); memset(tmp->mem, 0, tmp->memsize); if ((gap = btree->key_gap) == 0) gap = 1; - __inmem_row_leaf_slots(tmp->mem, 0, page->pg_row_entries, gap); + __inmem_row_leaf_slots(tmp->mem, 0, page->entries, gap); /* Instantiate the keys. */ - for (rip = page->pg_row_d, i = 0; i < page->pg_row_entries; ++rip, ++i) + for (rip = page->pg_row, i = 0; i < page->entries; ++rip, ++i) if (__bit_test(tmp->mem, i)) WT_ERR(__wt_row_leaf_key_work( session, page, rip, key, true)); @@ -282,7 +282,7 @@ switch_and_jump: /* Switching to a forward roll. */ * the tracking cache. */ if (slot_offset == 0) { - __wt_readlock(session, btree->ovfl_lock); + __wt_readlock(session, &btree->ovfl_lock); copy = WT_ROW_KEY_COPY(rip); if (!__wt_row_leaf_key_info(page, copy, NULL, &cell, &keyb->data, &keyb->size)) { @@ -290,7 +290,7 @@ switch_and_jump: /* Switching to a forward roll. 
*/ ret = __wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb); } - __wt_readunlock(session, btree->ovfl_lock); + __wt_readunlock(session, &btree->ovfl_lock); WT_ERR(ret); break; } diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index a1c214e5b8b..b1a81ca3d9f 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -85,9 +85,8 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, if (cbt->compare == 0) { if (cbt->ins == NULL) { /* Allocate an update array as necessary. */ - WT_PAGE_ALLOC_AND_SWAP(session, - page, mod->mod_row_update, - upd_entry, page->pg_row_entries); + WT_PAGE_ALLOC_AND_SWAP(session, page, + mod->mod_row_update, upd_entry, page->entries); /* Set the WT_UPDATE array reference. */ upd_entry = &mod->mod_row_update[cbt->slot]; @@ -147,10 +146,10 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * slot. That's hard, so we set a flag. */ WT_PAGE_ALLOC_AND_SWAP(session, page, - mod->mod_row_insert, ins_headp, page->pg_row_entries + 1); + mod->mod_row_insert, ins_headp, page->entries + 1); ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ? - page->pg_row_entries: cbt->slot; + page->entries: cbt->slot; ins_headp = &mod->mod_row_insert[ins_slot]; /* Allocate the WT_INSERT_HEAD structure as necessary. 
*/ diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index d4e82c458d4..aa299a161da 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -486,14 +486,14 @@ leaf_only: if (insert && descend_right) { cbt->append_tree = 1; - if (page->pg_row_entries == 0) { - cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); + if (page->entries == 0) { + cbt->slot = WT_ROW_SLOT(page, page->pg_row); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->slot = WT_ROW_SLOT(page, - page->pg_row_d + (page->pg_row_entries - 1)); + page->pg_row + (page->entries - 1)); ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } @@ -511,11 +511,11 @@ leaf_only: * doing the tests and error handling inside the loop costs about 5%. */ base = 0; - limit = page->pg_row_entries; + limit = page->entries; if (collator == NULL && srch_key->size <= WT_COMPARE_SHORT_MAXLEN) for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - rip = page->pg_row_d + indx; + rip = page->pg_row + indx; WT_ERR( __wt_row_leaf_key(session, page, rip, item, true)); @@ -529,7 +529,7 @@ leaf_only: else if (collator == NULL) for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - rip = page->pg_row_d + indx; + rip = page->pg_row + indx; WT_ERR( __wt_row_leaf_key(session, page, rip, item, true)); @@ -547,7 +547,7 @@ leaf_only: else for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - rip = page->pg_row_d + indx; + rip = page->pg_row + indx; WT_ERR( __wt_row_leaf_key(session, page, rip, item, true)); @@ -591,13 +591,13 @@ leaf_match: cbt->compare = 0; */ if (base == 0) { cbt->compare = 1; - cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); + cbt->slot = WT_ROW_SLOT(page, page->pg_row); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->compare = -1; - cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1)); + cbt->slot = WT_ROW_SLOT(page, page->pg_row + (base - 1)); ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } @@ 
-645,16 +645,16 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) __cursor_pos_clear(cbt); /* If the page has disk-based entries, select from them. */ - if (page->pg_row_entries != 0) { + if (page->entries != 0) { cbt->compare = 0; - cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries; + cbt->slot = __wt_random(&session->rnd) % page->entries; /* * The real row-store search function builds the key, so we * have to as well. */ return (__wt_row_leaf_key(session, - page, page->pg_row_d + cbt->slot, cbt->tmp, false)); + page, page->pg_row + cbt->slot, cbt->tmp, false)); } /* diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index e9e3925c57e..b2f4bb04ce4 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -42,7 +42,7 @@ __conn_dhandle_alloc(WT_SESSION_IMPL *session, WT_RET(__wt_calloc_one(session, &dhandle)); - WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle")); + __wt_rwlock_init(session, &dhandle->rwlock); dhandle->name_hash = __wt_hash_city64(uri, strlen(uri)); WT_ERR(__wt_strdup(session, uri, &dhandle->name)); WT_ERR(__wt_strdup(session, checkpoint, &dhandle->checkpoint)); diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 02182daa7dc..3f7fc9bb2a7 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -64,8 +64,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file")); /* Read-write locks */ - WT_RET(__wt_rwlock_alloc( - session, &conn->hot_backup_lock, "hot backup")); + __wt_rwlock_init(session, &conn->hot_backup_lock); WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); for (i = 0; i < WT_PAGE_LOCKS; ++i) diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 8198b3a1a02..8f8f8614ba8 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -237,7 +237,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) * We can only archive files if a hot backup is not 
in progress or * if we are the backup. */ - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); locked = true; if (!conn->hot_backup || backup_file != 0) { for (i = 0; i < logcount; i++) { @@ -248,7 +248,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) session, WT_LOG_FILENAME, lognum)); } } - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); locked = false; /* @@ -260,7 +260,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) if (0) err: __wt_err(session, ret, "log archive server error"); if (locked) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); return (ret); } @@ -355,9 +355,9 @@ __wt_log_truncate_files( __wt_verbose(session, WT_VERB_LOG, "log_truncate_files: Archive once up to %" PRIu32, backup_file); - __wt_writelock(session, log->log_archive_lock); + __wt_writelock(session, &log->log_archive_lock); ret = __log_archive_once(session, backup_file); - __wt_writeunlock(session, log->log_archive_lock); + __wt_writeunlock(session, &log->log_archive_lock); return (ret); } @@ -433,7 +433,7 @@ __log_file_server(void *arg) */ if (!conn->hot_backup) { __wt_readlock( - session, conn->hot_backup_lock); + session, &conn->hot_backup_lock); if (!conn->hot_backup) WT_ERR_ERROR_OK( __wt_ftruncate(session, @@ -441,7 +441,7 @@ __log_file_server(void *arg) close_end_lsn.l.offset), ENOTSUP); __wt_readunlock( - session, conn->hot_backup_lock); + session, &conn->hot_backup_lock); } WT_SET_LSN(&close_end_lsn, close_end_lsn.l.file + 1, 0); @@ -814,10 +814,11 @@ __log_server(void *arg) * agreed not to rename or remove any files in * the database directory. 
*/ - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (!conn->hot_backup) ret = __log_prealloc_once(session); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock( + session, &conn->hot_backup_lock); WT_ERR(ret); } @@ -826,10 +827,10 @@ __log_server(void *arg) */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) { if (__wt_try_writelock( - session, log->log_archive_lock) == 0) { + session, &log->log_archive_lock) == 0) { ret = __log_archive_once(session, 0); __wt_writeunlock( - session, log->log_archive_lock); + session, &log->log_archive_lock); WT_ERR(ret); } else __wt_verbose(session, WT_VERB_LOG, @@ -884,8 +885,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync")); WT_RET(__wt_spin_init(session, &log->log_writelsn_lock, "log write LSN")); - WT_RET(__wt_rwlock_alloc(session, - &log->log_archive_lock, "log archive lock")); + __wt_rwlock_init(session, &log->log_archive_lock); if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG)) log->allocsize = (uint32_t) WT_MAX(conn->buffer_alignment, WT_LOG_ALIGN); diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index d1254d8afcc..7d5cb7d7c72 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -81,7 +81,7 @@ __sweep_expire_one(WT_SESSION_IMPL *session) * handle list lock so that connection-level handle searches * never need to retry. */ - WT_RET(__wt_try_writelock(session, dhandle->rwlock)); + WT_RET(__wt_try_writelock(session, &dhandle->rwlock)); /* Only sweep clean trees where all updates are visible. 
*/ if (btree->modified || @@ -95,7 +95,7 @@ __sweep_expire_one(WT_SESSION_IMPL *session) */ ret = __wt_conn_btree_sync_and_close(session, false, true); -err: __wt_writeunlock(session, dhandle->rwlock); +err: __wt_writeunlock(session, &dhandle->rwlock); return (ret); } @@ -188,7 +188,7 @@ __sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) WT_DECL_RET; /* Try to get exclusive access. */ - WT_RET(__wt_try_writelock(session, dhandle->rwlock)); + WT_RET(__wt_try_writelock(session, &dhandle->rwlock)); /* * If there are no longer any references to the handle in any @@ -205,7 +205,7 @@ __sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) * don't retry the discard until it times out again. */ if (ret != 0) { -err: __wt_writeunlock(session, dhandle->rwlock); +err: __wt_writeunlock(session, &dhandle->rwlock); } return (ret); diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index 456aa2e0f02..08b15e6ca5e 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -230,10 +230,10 @@ __backup_start( * We are holding the checkpoint and schema locks so schema operations * will not see the backup file list until it is complete and valid. */ - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup = true; conn->hot_backup_list = NULL; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); /* We're the lock holder, we own cleanup. */ F_SET(cb, WT_CURBACKUP_LOCKER); @@ -297,9 +297,9 @@ err: /* Close the hot backup file. 
*/ if (ret == 0) { WT_ASSERT(session, dest != NULL); WT_TRET(__wt_fs_rename(session, WT_BACKUP_TMP, dest, false)); - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup_list = cb->list; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); } return (ret); @@ -319,9 +319,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) conn = S2C(session); /* Release all btree names held by the backup. */ - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup_list = NULL; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); if (cb->list != NULL) { for (i = 0; cb->list[i] != NULL; ++i) __wt_free(session, cb->list[i]); @@ -332,9 +332,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) WT_TRET(__wt_backup_file_remove(session)); /* Checkpoint deletion can proceed, as can the next hot backup. */ - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup = false; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); return (ret); } diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c index 3ee6554b3c0..e5b56aa406f 100644 --- a/src/cursor/cur_log.c +++ b/src/cursor/cur_log.c @@ -305,7 +305,7 @@ __curlog_close(WT_CURSOR *cursor) WT_ASSERT(session, FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)); if (F_ISSET(cl, WT_CURLOG_ARCHIVE_LOCK)) - __wt_readunlock(session, conn->log->log_archive_lock); + __wt_readunlock(session, &conn->log->log_archive_lock); __wt_free(session, cl->cur_lsn); __wt_free(session, cl->next_lsn); @@ -383,7 +383,7 @@ __wt_curlog_open(WT_SESSION_IMPL *session, WT_ERR(__wt_log_force_write(session, 1, NULL)); /* Log cursors block archiving. 
*/ - __wt_readlock(session, log->log_archive_lock); + __wt_readlock(session, &log->log_archive_lock); F_SET(cl, WT_CURLOG_ARCHIVE_LOCK); if (0) { diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 0a2a9d28402..214b5c007cb 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -420,7 +420,7 @@ __wt_evict_destroy(WT_SESSION_IMPL *session) return (0); /* Wait for any eviction thread group changes to stabilize. */ - __wt_writelock(session, conn->evict_threads.lock); + __wt_writelock(session, &conn->evict_threads.lock); /* * Signal the threads to finish and stop populating the queue. diff --git a/src/include/btmem.h b/src/include/btmem.h index 9bd835f5d09..43c1a309d52 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -434,6 +434,19 @@ struct __wt_page_modify { uint8_t update_restored; /* Page created by restoring updates */ }; +/* + * WT_COL_RLE -- + * Variable-length column-store pages have an array of page entries with RLE + * counts greater than 1 when reading the page, so it's not necessary to walk + * the page counting records to find a specific entry. We can do a binary search + * in this array, then an offset calculation to find the cell. + */ +WT_PACKED_STRUCT_BEGIN(__wt_col_rle) + uint64_t recno; /* Record number of first repeat. */ + uint64_t rle; /* Repeat count. */ + uint32_t indx; /* Slot of entry in col_var. */ +WT_PACKED_STRUCT_END + /* * WT_PAGE -- * The WT_PAGE structure describes the in-memory page information. @@ -515,53 +528,54 @@ struct __wt_page { } while (0) /* Row-store leaf page. */ - struct { - WT_ROW *d; /* Key/value pairs */ - uint32_t entries; /* Entries */ - } row; -#undef pg_row_d -#define pg_row_d u.row.d -#undef pg_row_entries -#define pg_row_entries u.row.entries + WT_ROW *row; /* Key/value pairs */ +#undef pg_row +#define pg_row u.row /* Fixed-length column-store leaf page. 
*/ - struct { - uint8_t *bitf; /* Values */ - uint32_t entries; /* Entries */ - } col_fix; + uint8_t *fix_bitf; /* Values */ #undef pg_fix_bitf -#define pg_fix_bitf u.col_fix.bitf -#undef pg_fix_entries -#define pg_fix_entries u.col_fix.entries +#define pg_fix_bitf u.fix_bitf /* Variable-length column-store leaf page. */ struct { - WT_COL *d; /* Values */ + WT_COL *col_var; /* Values */ /* - * Variable-length column-store files maintain a list of - * RLE entries on the page so it's unnecessary to walk - * the page counting records to find a specific entry. + * Variable-length column-store pages have an array + * of page entries with RLE counts greater than 1 when + * reading the page, so it's not necessary to walk the + * page counting records to find a specific entry. We + * can do a binary search in this array, then an offset + * calculation to find the cell. + * + * It's a separate structure to keep the page structure + * as small as possible. */ - WT_COL_RLE *repeats; /* RLE array for lookups */ - uint32_t nrepeats; /* Number of repeat slots */ - - uint32_t entries; /* Entries */ + struct __wt_col_var_repeat { + uint32_t nrepeats; /* repeat slots */ + WT_COL_RLE repeats[0]; /* lookup RLE array */ + } *repeats; +#define WT_COL_VAR_REPEAT_SET(page) \ + ((page)->u.col_var.repeats != NULL) } col_var; -#undef pg_var_d -#define pg_var_d u.col_var.d +#undef pg_var +#define pg_var u.col_var.col_var #undef pg_var_repeats -#define pg_var_repeats u.col_var.repeats +#define pg_var_repeats u.col_var.repeats->repeats #undef pg_var_nrepeats -#define pg_var_nrepeats u.col_var.nrepeats -#undef pg_var_entries -#define pg_var_entries u.col_var.entries +#define pg_var_nrepeats u.col_var.repeats->nrepeats } u; /* - * The page's type and flags are positioned at the end of the WT_PAGE - * union, it reduces cache misses in the row-store search function. 
+ * Page entries, type and flags are positioned at the end of the WT_PAGE + * union to reduce cache misses in the row-store search function. + * + * The entries field only applies to leaf pages, internal pages use the + * page-index entries instead. */ + uint32_t entries; /* Leaf page entries */ + #define WT_PAGE_IS_INTERNAL(page) \ ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT) #define WT_PAGE_INVALID 0 /* Invalid page */ @@ -618,8 +632,8 @@ struct __wt_page { #define WT_READGEN_START_VALUE 100 #define WT_READGEN_STEP 100 uint64_t read_gen; - /* The evict pass generation for the page */ - uint64_t evict_pass_gen; + + uint64_t evict_pass_gen; /* Eviction pass generation */ size_t memory_footprint; /* Memory attached to the page */ @@ -792,11 +806,11 @@ struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */ * Walk the entries of an in-memory row-store leaf page. */ #define WT_ROW_FOREACH(page, rip, i) \ - for ((i) = (page)->pg_row_entries, \ - (rip) = (page)->pg_row_d; (i) > 0; ++(rip), --(i)) + for ((i) = (page)->entries, \ + (rip) = (page)->pg_row; (i) > 0; ++(rip), --(i)) #define WT_ROW_FOREACH_REVERSE(page, rip, i) \ - for ((i) = (page)->pg_row_entries, \ - (rip) = (page)->pg_row_d + ((page)->pg_row_entries - 1); \ + for ((i) = (page)->entries, \ + (rip) = (page)->pg_row + ((page)->entries - 1); \ (i) > 0; --(rip), --(i)) /* @@ -804,7 +818,7 @@ struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */ * Return the 0-based array offset based on a WT_ROW reference. */ #define WT_ROW_SLOT(page, rip) \ - ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row_d)) + ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row)) /* * WT_COL -- @@ -828,18 +842,6 @@ struct __wt_col { uint32_t __col_value; }; -/* - * WT_COL_RLE -- - * In variable-length column store leaf pages, we build an array of entries - * with RLE counts greater than 1 when reading the page. 
We can do a binary - * search in this array, then an offset calculation to find the cell. - */ -WT_PACKED_STRUCT_BEGIN(__wt_col_rle) - uint64_t recno; /* Record number of first repeat. */ - uint64_t rle; /* Repeat count. */ - uint32_t indx; /* Slot of entry in col_var.d */ -WT_PACKED_STRUCT_END - /* * WT_COL_PTR, WT_COL_PTR_SET -- * Return/Set a pointer corresponding to the data offset. (If the item does @@ -856,15 +858,15 @@ WT_PACKED_STRUCT_END * Walk the entries of variable-length column-store leaf page. */ #define WT_COL_FOREACH(page, cip, i) \ - for ((i) = (page)->pg_var_entries, \ - (cip) = (page)->pg_var_d; (i) > 0; ++(cip), --(i)) + for ((i) = (page)->entries, \ + (cip) = (page)->pg_var; (i) > 0; ++(cip), --(i)) /* * WT_COL_SLOT -- * Return the 0-based array offset based on a WT_COL reference. */ #define WT_COL_SLOT(page, cip) \ - ((uint32_t)(((WT_COL *)cip) - (page)->pg_var_d)) + ((uint32_t)(((WT_COL *)cip) - (page)->pg_var)) /* * WT_IKEY -- @@ -1041,7 +1043,7 @@ struct __wt_insert_head { #define WT_ROW_INSERT_SMALLEST(page) \ ((page)->modify == NULL || \ (page)->modify->mod_row_insert == NULL ? 
\ - NULL : (page)->modify->mod_row_insert[(page)->pg_row_entries]) + NULL : (page)->modify->mod_row_insert[(page)->entries]) /* * The column-store leaf page update lists are arrays of pointers to structures, diff --git a/src/include/btree.h b/src/include/btree.h index c89e3c36c20..d742310bf8f 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -114,7 +114,7 @@ struct __wt_btree { int split_pct; /* Split page percent */ WT_COMPRESSOR *compressor; /* Page compressor */ WT_KEYED_ENCRYPTOR *kencryptor; /* Page encryptor */ - WT_RWLOCK *ovfl_lock; /* Overflow lock */ + WT_RWLOCK ovfl_lock; /* Overflow lock */ uint64_t last_recno; /* Column-store last record number */ diff --git a/src/include/btree.i b/src/include/btree.i index fba6ee8e38a..09fa8df8c56 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1008,7 +1008,7 @@ __wt_cursor_row_leaf_key(WT_CURSOR_BTREE *cbt, WT_ITEM *key) if (cbt->ins == NULL) { session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; - rip = &page->u.row.d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; WT_RET(__wt_row_leaf_key(session, page, rip, key, false)); } else { key->data = WT_INSERT_KEY(cbt->ins); @@ -1207,9 +1207,9 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) */ ins_head = page->type == WT_PAGE_ROW_LEAF ? - (page->pg_row_entries == 0 ? + (page->entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1)) : + WT_ROW_INSERT_SLOT(page, page->entries - 1)) : WT_COL_APPEND(page); if (ins_head == NULL) return (false); diff --git a/src/include/column.i b/src/include/column.i index d15f874b281..c1b45a1f4e0 100644 --- a/src/include/column.i +++ b/src/include/column.i @@ -221,13 +221,13 @@ __col_var_last_recno(WT_REF *ref) * This function ignores those records, our callers must handle that * explicitly, if they care. */ - if (page->pg_var_nrepeats == 0) - return (page->pg_var_entries == 0 ? 
0 : - ref->ref_recno + (page->pg_var_entries - 1)); + if (!WT_COL_VAR_REPEAT_SET(page)) + return (page->entries == 0 ? 0 : + ref->ref_recno + (page->entries - 1)); repeat = &page->pg_var_repeats[page->pg_var_nrepeats - 1]; return ((repeat->recno + repeat->rle) - 1 + - (page->pg_var_entries - (repeat->indx + 1))); + (page->entries - (repeat->indx + 1))); } /* @@ -246,8 +246,7 @@ __col_fix_last_recno(WT_REF *ref) * This function ignores those records, our callers must handle that * explicitly, if they care. */ - return (page->pg_fix_entries == 0 ? - 0 : ref->ref_recno + (page->pg_fix_entries - 1)); + return (page->entries == 0 ? 0 : ref->ref_recno + (page->entries - 1)); } /* @@ -273,7 +272,9 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop) * slot for this record number, because we know any intervening records * have repeat counts of 1. */ - for (base = 0, limit = page->pg_var_nrepeats; limit != 0; limit >>= 1) { + for (base = 0, + limit = WT_COL_VAR_REPEAT_SET(page) ? page->pg_var_nrepeats : 0; + limit != 0; limit >>= 1) { indx = base + (limit >> 1); repeat = page->pg_var_repeats + indx; @@ -281,7 +282,7 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop) recno < repeat->recno + repeat->rle) { if (start_recnop != NULL) *start_recnop = repeat->recno; - return (page->pg_var_d + repeat->indx); + return (page->pg_var + repeat->indx); } if (recno < repeat->recno) continue; @@ -306,14 +307,14 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop) * !!! * The test could be written more simply as: * - * (recno >= start_recno + (page->pg_var_entries - start_indx)) + * (recno >= start_recno + (page->entries - start_indx)) * * It's split into two parts because the simpler test will overflow if * searching for large record numbers. 
*/ if (recno >= start_recno && - recno - start_recno >= page->pg_var_entries - start_indx) + recno - start_recno >= page->entries - start_indx) return (NULL); - return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno)); + return (page->pg_var + start_indx + (uint32_t)(recno - start_recno)); } diff --git a/src/include/connection.h b/src/include/connection.h index 60ce5f55234..6818633d816 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -262,7 +262,7 @@ struct __wt_connection_impl { WT_TXN_GLOBAL txn_global; /* Global transaction state */ - WT_RWLOCK *hot_backup_lock; /* Hot backup serialization */ + WT_RWLOCK hot_backup_lock; /* Hot backup serialization */ bool hot_backup; /* Hot backup in progress */ char **hot_backup_list; /* Hot backup file list */ diff --git a/src/include/cursor.h b/src/include/cursor.h index d522abc2a56..31c8963a486 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -52,8 +52,8 @@ { 0 }, /* recno raw buffer */ \ NULL, /* json_private */ \ NULL, /* lang_private */ \ - { NULL, 0, 0, NULL, 0 }, /* WT_ITEM key */ \ - { NULL, 0, 0, NULL, 0 }, /* WT_ITEM value */ \ + { NULL, 0, NULL, 0, 0 }, /* WT_ITEM key */ \ + { NULL, 0, NULL, 0, 0 }, /* WT_ITEM value */ \ 0, /* int saved_err */ \ NULL, /* internal_uri */ \ 0 /* uint32_t flags */ \ diff --git a/src/include/dhandle.h b/src/include/dhandle.h index d7802bb319b..dcc788f0839 100644 --- a/src/include/dhandle.h +++ b/src/include/dhandle.h @@ -42,7 +42,7 @@ * A handle for a generic named data source. 
*/ struct __wt_data_handle { - WT_RWLOCK *rwlock; /* Lock for shared/exclusive ops */ + WT_RWLOCK rwlock; /* Lock for shared/exclusive ops */ TAILQ_ENTRY(__wt_data_handle) q; TAILQ_ENTRY(__wt_data_handle) hashq; diff --git a/src/include/extern.h b/src/include/extern.h index bb7fbddcae5..bcad3580e25 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -671,16 +671,16 @@ extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg) WT_G extern void __wt_print_huffman_code(void *huffman_arg, uint16_t symbol) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern 
void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern uint32_t __wt_nlpo2_round(uint32_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern uint32_t __wt_nlpo2(uint32_t v) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern uint32_t __wt_log2_int(uint32_t n) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/log.h b/src/include/log.h index 3f2cb2ba8e6..d9fea892c68 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -235,7 +235,7 @@ struct __wt_log { WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */ WT_SPINLOCK log_writelsn_lock; /* Locked: write LSN */ - WT_RWLOCK *log_archive_lock; /* Archive and log cursors */ + WT_RWLOCK log_archive_lock;/* Archive and log cursors */ /* Notify any waiting threads when sync_lsn is updated. */ WT_CONDVAR *log_sync_cond; diff --git a/src/include/lsm.h b/src/include/lsm.h index fefed9daa81..2bbb813bad2 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -189,7 +189,7 @@ struct __wt_lsm_tree { #define LSM_TREE_MAX_QUEUE 100 uint32_t queue_ref; - WT_RWLOCK *rwlock; + WT_RWLOCK rwlock; TAILQ_ENTRY(__wt_lsm_tree) q; uint64_t dsk_gen; diff --git a/src/include/mutex.h b/src/include/mutex.h index 6b81b1a6265..727a690bb1c 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -30,11 +30,14 @@ struct __wt_condvar { }; /* + * Read/write locks: + * + * WiredTiger uses read/write locks for shared/exclusive access to resources. * !!! * Don't modify this structure without understanding the read/write locking * functions. */ -typedef union { /* Read/write lock */ +union __wt_rwlock { /* Read/write lock */ uint64_t u; struct { uint32_t wr; /* Writers and readers */ @@ -45,19 +48,6 @@ typedef union { /* Read/write lock */ uint16_t next; /* Next available ticket number */ uint16_t writers_active;/* Count of active writers */ } s; -} wt_rwlock_t; - -/* - * Read/write locks: - * - * WiredTiger uses read/write locks for shared/exclusive access to resources. 
- */ -struct __wt_rwlock { - WT_CACHE_LINE_PAD_BEGIN - const char *name; /* Lock name for debugging */ - - wt_rwlock_t rwlock; /* Read/write lock */ - WT_CACHE_LINE_PAD_END }; /* diff --git a/src/include/thread_group.h b/src/include/thread_group.h index 76758a090c4..77cff00dc8d 100644 --- a/src/include/thread_group.h +++ b/src/include/thread_group.h @@ -40,7 +40,7 @@ struct __wt_thread_group { const char *name; /* Name */ - WT_RWLOCK *lock; /* Protects group changes */ + WT_RWLOCK lock; /* Protects group changes */ /* * Condition signalled when wanting to wake up threads that are diff --git a/src/include/txn.h b/src/include/txn.h index 12fc2a0a5b7..7e802c188ab 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -92,7 +92,7 @@ struct __wt_txn_global { * Prevents the oldest ID moving forwards while threads are scanning * the global transaction state. */ - WT_RWLOCK *scan_rwlock; + WT_RWLOCK scan_rwlock; /* * Track information about the running checkpoint. The transaction @@ -114,7 +114,7 @@ struct __wt_txn_global { volatile uint64_t metadata_pinned; /* Oldest ID for metadata */ /* Named snapshot state. 
*/ - WT_RWLOCK *nsnap_rwlock; + WT_RWLOCK nsnap_rwlock; volatile uint64_t nsnap_oldest_id; TAILQ_HEAD(__wt_nsnap_qh, __wt_named_snapshot) nsnaph; diff --git a/src/include/verify_build.h b/src/include/verify_build.h index 8abc192892e..640f5e4cf5f 100644 --- a/src/include/verify_build.h +++ b/src/include/verify_build.h @@ -59,7 +59,6 @@ __wt_verify_build(void) sizeof(s) > WT_CACHE_LINE_ALIGNMENT || \ sizeof(s) % WT_CACHE_LINE_ALIGNMENT == 0) WT_PADDING_CHECK(WT_LOGSLOT); - WT_PADDING_CHECK(WT_RWLOCK); WT_PADDING_CHECK(WT_SPINLOCK); WT_PADDING_CHECK(WT_TXN_STATE); diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index f9e232e0310..9ee28317bc4 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -114,16 +114,16 @@ struct __wt_item { size_t size; #ifndef DOXYGEN -#define WT_ITEM_ALIGNED 0x00000001 -#define WT_ITEM_INUSE 0x00000002 - /* This appears in the middle of the struct to avoid padding. */ - /*! Object flags (internal use). */ - uint32_t flags; - /*! Managed memory chunk (internal use). */ void *mem; + /*! Managed memory size (internal use). */ size_t memsize; + +#define WT_ITEM_ALIGNED 0x00000001 +#define WT_ITEM_INUSE 0x00000002 + /*! Object flags (internal use). 
*/ + uint32_t flags; #endif }; diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index e18563dd2d2..da318ad8a86 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -106,6 +106,8 @@ struct __wt_col; typedef struct __wt_col WT_COL; struct __wt_col_rle; typedef struct __wt_col_rle WT_COL_RLE; +struct __wt_col_var_repeat; + typedef struct __wt_col_var_repeat WT_COL_VAR_REPEAT; struct __wt_colgroup; typedef struct __wt_colgroup WT_COLGROUP; struct __wt_compact_state; @@ -266,8 +268,6 @@ struct __wt_ref; typedef struct __wt_ref WT_REF; struct __wt_row; typedef struct __wt_row WT_ROW; -struct __wt_rwlock; - typedef struct __wt_rwlock WT_RWLOCK; struct __wt_salvage_cookie; typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE; struct __wt_save_upd; @@ -302,6 +302,8 @@ union __wt_lsn; typedef union __wt_lsn WT_LSN; union __wt_rand_state; typedef union __wt_rand_state WT_RAND_STATE; +union __wt_rwlock; + typedef union __wt_rwlock WT_RWLOCK; /* * Forward type declarations for internal types: END * DO NOT EDIT: automatically built by dist/s_typedef. diff --git a/src/log/log.c b/src/log/log.c index 413df312a15..fb3935abf81 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -895,12 +895,12 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) */ create_log = true; if (conn->log_prealloc > 0 && !conn->hot_backup) { - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (conn->hot_backup) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); else { ret = __log_alloc_prealloc(session, log->fileid); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); /* * If ret is 0 it means we found a pre-allocated file. 
@@ -1029,12 +1029,12 @@ __log_truncate_file(WT_SESSION_IMPL *session, WT_FH *log_fh, wt_off_t offset) log = conn->log; if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && !conn->hot_backup) { - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (conn->hot_backup) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); else { ret = __wt_ftruncate(session, log_fh, offset); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); if (ret != ENOTSUP) return (ret); F_SET(log, WT_LOG_TRUNCATE_NOTSUP); diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 839648b97d7..a2511f48e2b 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -304,7 +304,7 @@ __clsm_leave(WT_CURSOR_LSM *clsm) * byte, if the application uses two leading DC4 byte for some reason, we'll do * a wasted data copy each time a new value is inserted into the object. */ -static const WT_ITEM __tombstone = { "\x14\x14", 2, 0, NULL, 0 }; +static const WT_ITEM __tombstone = { "\x14\x14", 2, NULL, 0, 0 }; /* * __clsm_deleted -- diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 38d87dd852b..71a981a6284 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -469,7 +469,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, /* Try to open the tree. 
*/ WT_RET(__wt_calloc_one(session, &lsm_tree)); - WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree")); + __wt_rwlock_init(session, &lsm_tree->rwlock); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); @@ -1082,7 +1082,7 @@ err: if (locked) void __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { - __wt_readlock(session, lsm_tree->rwlock); + __wt_readlock(session, &lsm_tree->rwlock); /* * Diagnostic: avoid deadlocks with the schema lock: if we need it for @@ -1100,7 +1100,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); - __wt_readunlock(session, lsm_tree->rwlock); + __wt_readunlock(session, &lsm_tree->rwlock); } /* @@ -1110,7 +1110,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) void __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { - __wt_writelock(session, lsm_tree->rwlock); + __wt_writelock(session, &lsm_tree->rwlock); /* * Diagnostic: avoid deadlocks with the schema lock: if we need it for @@ -1128,7 +1128,7 @@ __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); - __wt_writeunlock(session, lsm_tree->rwlock); + __wt_writeunlock(session, &lsm_tree->rwlock); } /* diff --git a/src/reconcile/rec_track.c b/src/reconcile/rec_track.c index 3795b6e5ae8..5bf425b1b21 100644 --- a/src/reconcile/rec_track.c +++ b/src/reconcile/rec_track.c @@ -875,9 +875,9 @@ __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__ovfl_reuse_wrapup(session, page)); if (track->ovfl_txnc[0] != NULL) { - __wt_writelock(session, S2BT(session)->ovfl_lock); + __wt_writelock(session, &S2BT(session)->ovfl_lock); ret = __ovfl_txnc_wrapup(session, page); - __wt_writeunlock(session, S2BT(session)->ovfl_lock); + __wt_writeunlock(session, &S2BT(session)->ovfl_lock); } return (ret); } @@ -903,9 +903,9 @@ 
__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__ovfl_reuse_wrapup_err(session, page)); if (track->ovfl_txnc[0] != NULL) { - __wt_writelock(session, S2BT(session)->ovfl_lock); + __wt_writelock(session, &S2BT(session)->ovfl_lock); ret = __ovfl_txnc_wrapup(session, page); - __wt_writeunlock(session, S2BT(session)->ovfl_lock); + __wt_writeunlock(session, &S2BT(session)->ovfl_lock); } return (ret); } diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index e82f449a50d..a667a288187 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -4069,7 +4069,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) /* Copy the original, disk-image bytes into place. */ memcpy(r->first_free, page->pg_fix_bitf, - __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt)); + __bitstr_size((size_t)page->entries * btree->bitcnt)); /* Update any changes to the original on-page data items. */ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) { @@ -4081,9 +4081,8 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) } /* Calculate the number of entries per page remainder. */ - entry = page->pg_fix_entries; - nrecs = WT_FIX_BYTES_TO_ENTRIES( - btree, r->space_avail) - page->pg_fix_entries; + entry = page->entries; + nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail) - page->entries; r->recno += entry; /* Walk any append list. */ @@ -4206,7 +4205,7 @@ __rec_col_fix_slvg(WT_SESSION_IMPL *session, session, r, page, pageref->ref_recno, btree->maxleafpage)); /* We may not be taking all of the entries on the original page. */ - page_take = salvage->take == 0 ? page->pg_fix_entries : salvage->take; + page_take = salvage->take == 0 ? page->entries : salvage->take; page_start = salvage->skip == 0 ? 0 : salvage->skip; /* Calculate the number of entries per page. 
*/ diff --git a/src/schema/schema_util.c b/src/schema/schema_util.c index 433224a868e..9de4b916a79 100644 --- a/src/schema/schema_util.c +++ b/src/schema/schema_util.c @@ -26,7 +26,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) conn = S2C(session); if (!conn->hot_backup) return (0); - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); /* * There is a window at the end of a backup where the list has been * cleared from the connection but the flag is still set. It is safe @@ -34,7 +34,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) */ if (!conn->hot_backup || (backup_list = conn->hot_backup_list) == NULL) { - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); return (0); } for (i = 0; backup_list[i] != NULL; ++i) { @@ -43,7 +43,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) break; } } - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); return (ret); } diff --git a/src/session/session_api.c b/src/session/session_api.c index fe1bf821d3b..fcbfa8809b3 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -1686,7 +1686,7 @@ __session_snapshot(WT_SESSION *wt_session, const char *config) WT_ERR(__wt_txn_named_snapshot_config( session, cfg, &has_create, &has_drop)); - __wt_writelock(session, txn_global->nsnap_rwlock); + __wt_writelock(session, &txn_global->nsnap_rwlock); /* Drop any snapshots to be removed first. 
*/ if (has_drop) @@ -1696,7 +1696,7 @@ __session_snapshot(WT_SESSION *wt_session, const char *config) if (has_create) WT_ERR(__wt_txn_named_snapshot_begin(session, cfg)); -err: __wt_writeunlock(session, txn_global->nsnap_rwlock); +err: __wt_writeunlock(session, &txn_global->nsnap_rwlock); API_END_RET_NOTFOUND_MAP(session, ret); } diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index 732dc797b6d..f1251794b89 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -181,17 +181,17 @@ __wt_session_lock_dhandle( */ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && (!want_exclusive || lock_busy)) { - __wt_readlock(session, dhandle->rwlock); + __wt_readlock(session, &dhandle->rwlock); if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { *is_deadp = 1; - __wt_readunlock(session, dhandle->rwlock); + __wt_readunlock(session, &dhandle->rwlock); return (0); } is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN); if (is_open && !want_exclusive) return (0); - __wt_readunlock(session, dhandle->rwlock); + __wt_readunlock(session, &dhandle->rwlock); } else is_open = false; @@ -201,10 +201,11 @@ __wt_session_lock_dhandle( * with another thread that successfully opens the file, we * don't want to block waiting to get exclusive access. 
*/ - if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) { + if ((ret = + __wt_try_writelock(session, &dhandle->rwlock)) == 0) { if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { *is_deadp = 1; - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); return (0); } @@ -215,7 +216,7 @@ __wt_session_lock_dhandle( if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && !want_exclusive) { lock_busy = false; - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); continue; } @@ -286,9 +287,9 @@ __wt_session_release_btree(WT_SESSION_IMPL *session) if (locked) { if (write_locked) { F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); } else - __wt_readunlock(session, dhandle->rwlock); + __wt_readunlock(session, &dhandle->rwlock); } session->dhandle = NULL; @@ -509,7 +510,7 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, dhandle->excl_session = NULL; dhandle->excl_ref = 0; F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); WT_WITH_SCHEMA_LOCK(session, WT_WITH_HANDLE_LIST_LOCK(session, @@ -531,7 +532,7 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, dhandle->excl_session = NULL; dhandle->excl_ref = 0; F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); WT_RET(ret); } diff --git a/src/support/mtx_rw.c b/src/support/mtx_rw.c index ea18f556257..35ad5da23f2 100644 --- a/src/support/mtx_rw.c +++ b/src/support/mtx_rw.c @@ -115,23 +115,27 @@ #include "wt_internal.h" /* - * __wt_rwlock_alloc -- - * Allocate and initialize a read/write lock. + * __wt_rwlock_init -- + * Initialize a read/write lock. 
*/ -int -__wt_rwlock_alloc( - WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name) +void +__wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - WT_RWLOCK *rwlock; - - __wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name); + WT_UNUSED(session); - WT_RET(__wt_calloc_one(session, &rwlock)); + l->u = 0; +} - rwlock->name = name; +/* + * __wt_rwlock_destroy -- + * Destroy a read/write lock. + */ +void +__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l) +{ + WT_UNUSED(session); - *rwlockp = rwlock; - return (0); + l->u = 0; } /* @@ -139,13 +143,12 @@ __wt_rwlock_alloc( * Try to get a shared lock, fail immediately if unavailable. */ int -__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l, new, old; + WT_RWLOCK new, old; WT_STAT_CONN_INCR(session, rwlock_read); - l = &rwlock->rwlock; new = old = *l; /* @@ -172,19 +175,15 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * exclusive. */ void -__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; - - l = &rwlock->rwlock; - /* * Try to get the lock in a single operation if it is available to * readers. This avoids the situation where multiple readers arrive * concurrently and have to line up in order to enter the lock. For * read-heavy workloads it can make a significant difference. */ - while (__wt_try_readlock(session, rwlock) != 0) { + while (__wt_try_readlock(session, l) != 0) { if (l->s.writers_active > 0) __wt_yield(); else @@ -197,9 +196,8 @@ __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Get a shared lock. 
*/ void -__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; uint16_t ticket; int pause_cnt; @@ -207,8 +205,6 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_DIAGNOSTIC_YIELD; - l = &rwlock->rwlock; - /* * Possibly wrap: if we have more than 64K lockers waiting, the ticket * value will wrap and two lockers will simultaneously be granted the @@ -246,14 +242,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Release a shared lock. */ void -__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; - WT_UNUSED(session); - l = &rwlock->rwlock; - /* * Increment the writers value (other readers are doing the same, make * sure we don't race). @@ -266,13 +258,12 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Try to get an exclusive lock, fail immediately if unavailable. */ int -__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l, new, old; + WT_RWLOCK new, old; WT_STAT_CONN_INCR(session, rwlock_write); - l = &rwlock->rwlock; old = new = *l; /* @@ -296,16 +287,13 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Wait to get an exclusive lock. */ void -__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; uint16_t ticket; int pause_cnt; WT_STAT_CONN_INCR(session, rwlock_write); - l = &rwlock->rwlock; - /* * Possibly wrap: if we have more than 64K lockers waiting, the ticket * value will wrap and two lockers will simultaneously be granted the @@ -338,13 +326,12 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Release an exclusive lock. 
*/ void -__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l, new; + WT_RWLOCK new; WT_UNUSED(session); - l = &rwlock->rwlock; (void)__wt_atomic_sub16(&l->s.writers_active, 1); /* @@ -368,40 +355,16 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_DIAGNOSTIC_YIELD; } -/* - * __wt_rwlock_destroy -- - * Destroy a read/write lock. - */ -void -__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp) -{ - WT_RWLOCK *rwlock; - - rwlock = *rwlockp; /* Clear our caller's reference. */ - if (rwlock == NULL) - return; - *rwlockp = NULL; - - __wt_verbose( - session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name); - - __wt_free(session, rwlock); -} - #ifdef HAVE_DIAGNOSTIC /* * __wt_rwlock_islocked -- * Return if a read/write lock is currently locked for reading or writing. */ bool -__wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; - WT_UNUSED(session); - l = &rwlock->rwlock; - return (l->s.writers != l->s.next || l->s.readers != l->s.next); } #endif diff --git a/src/support/thread_group.c b/src/support/thread_group.c index a866d2d01c5..a89468c367a 100644 --- a/src/support/thread_group.c +++ b/src/support/thread_group.c @@ -50,8 +50,7 @@ __thread_group_grow( { WT_THREAD *thread; - WT_ASSERT(session, - __wt_rwlock_islocked(session, group->lock)); + WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); /* * Any bounds checking is done by the caller so we know that @@ -84,8 +83,7 @@ __thread_group_shrink(WT_SESSION_IMPL *session, WT_THREAD *thread; uint32_t current_slot; - WT_ASSERT(session, - __wt_rwlock_islocked(session, group->lock)); + WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); for (current_slot = group->alloc; current_slot > new_count; ) { /* @@ -142,7 +140,7 @@ __thread_group_resize( WT_ASSERT(session, group->current_threads <= 
group->alloc && - __wt_rwlock_islocked(session, group->lock)); + __wt_rwlock_islocked(session, &group->lock)); if (new_min == group->min && new_max == group->max) return (0); @@ -227,9 +225,9 @@ __wt_thread_group_resize( " from max: %" PRIu32 " -> %" PRIu32, (void *)group, group->min, new_min, group->max, new_max); - __wt_writelock(session, group->lock); + __wt_writelock(session, &group->lock); WT_TRET(__thread_group_resize(session, group, new_min, new_max, flags)); - __wt_writeunlock(session, group->lock); + __wt_writeunlock(session, &group->lock); return (ret); } @@ -255,17 +253,17 @@ __wt_thread_group_create( __wt_verbose(session, WT_VERB_THREAD_GROUP, "Creating thread group: %p", (void *)group); - WT_RET(__wt_rwlock_alloc(session, &group->lock, "Thread group")); + __wt_rwlock_init(session, &group->lock); WT_ERR(__wt_cond_alloc( session, "Thread group cond", false, &group->wait_cond)); cond_alloced = true; - __wt_writelock(session, group->lock); + __wt_writelock(session, &group->lock); group->run_func = run_func; group->name = name; WT_TRET(__thread_group_resize(session, group, min, max, flags)); - __wt_writeunlock(session, group->lock); + __wt_writeunlock(session, &group->lock); /* Cleanup on error to avoid leaking resources */ err: if (ret != 0) { @@ -288,7 +286,7 @@ __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) __wt_verbose(session, WT_VERB_THREAD_GROUP, "Destroying thread group: %p", (void *)group); - WT_ASSERT(session, __wt_rwlock_islocked(session, group->lock)); + WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); /* Shut down all threads and free associated resources. 
*/ WT_TRET(__thread_group_shrink(session, group, 0)); @@ -322,15 +320,15 @@ __wt_thread_group_start_one( return (0); if (wait) - __wt_writelock(session, group->lock); - else if (__wt_try_writelock(session, group->lock) != 0) + __wt_writelock(session, &group->lock); + else if (__wt_try_writelock(session, &group->lock) != 0) return (0); /* Recheck the bounds now that we hold the lock */ if (group->current_threads < group->max) WT_TRET(__thread_group_grow( session, group, group->current_threads + 1)); - __wt_writeunlock(session, group->lock); + __wt_writeunlock(session, &group->lock); return (ret); } diff --git a/src/txn/txn.c b/src/txn/txn.c index 26a0ed679e2..660d37b17d5 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -126,7 +126,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) n = 0; /* We're going to scan the table: wait for the lock. */ - __wt_readlock_spin(session, txn_global->scan_rwlock); + __wt_readlock_spin(session, &txn_global->scan_rwlock); current_id = pinned_id = txn_global->current; prev_oldest_id = txn_global->oldest_id; @@ -180,7 +180,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); txn_state->pinned_id = pinned_id; -done: __wt_readunlock(session, txn_global->scan_rwlock); +done: __wt_readunlock(session, &txn_global->scan_rwlock); __txn_sort_snapshot(session, n, current_id); } @@ -293,13 +293,13 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) /* First do a read-only scan. */ if (wait) - __wt_readlock_spin(session, txn_global->scan_rwlock); + __wt_readlock_spin(session, &txn_global->scan_rwlock); else if ((ret = - __wt_try_readlock(session, txn_global->scan_rwlock)) != 0) + __wt_try_readlock(session, &txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 
0 : ret); __txn_oldest_scan(session, &oldest_id, &last_running, &metadata_pinned, &oldest_session); - __wt_readunlock(session, txn_global->scan_rwlock); + __wt_readunlock(session, &txn_global->scan_rwlock); /* * If the state hasn't changed (or hasn't moved far enough for @@ -314,9 +314,9 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) /* It looks like an update is necessary, wait for exclusive access. */ if (wait) - __wt_writelock(session, txn_global->scan_rwlock); + __wt_writelock(session, &txn_global->scan_rwlock); else if ((ret = - __wt_try_writelock(session, txn_global->scan_rwlock)) != 0) + __wt_try_writelock(session, &txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 0 : ret); /* @@ -375,7 +375,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) #endif } -done: __wt_writeunlock(session, txn_global->scan_rwlock); +done: __wt_writeunlock(session, &txn_global->scan_rwlock); return (ret); } @@ -768,10 +768,8 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &txn_global->id_lock, "transaction id lock")); - WT_RET(__wt_rwlock_alloc(session, - &txn_global->scan_rwlock, "transaction scan lock")); - WT_RET(__wt_rwlock_alloc(session, - &txn_global->nsnap_rwlock, "named snapshot lock")); + __wt_rwlock_init(session, &txn_global->scan_rwlock); + __wt_rwlock_init(session, &txn_global->nsnap_rwlock); txn_global->nsnap_oldest_id = WT_TXN_NONE; TAILQ_INIT(&txn_global->nsnaph); diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 399d9187d82..3b19162fd3d 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -679,7 +679,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * This allows ordinary visibility checks to move forward because * checkpoints often take a long time and only write to the metadata. 
*/ - __wt_writelock(session, txn_global->scan_rwlock); + __wt_writelock(session, &txn_global->scan_rwlock); txn_global->checkpoint_txnid = txn->id; txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min); @@ -700,7 +700,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ txn_state->id = txn_state->pinned_id = txn_state->metadata_pinned = WT_TXN_NONE; - __wt_writeunlock(session, txn_global->scan_rwlock); + __wt_writeunlock(session, &txn_global->scan_rwlock); /* * Unblock updates -- we can figure out that any updates to clean pages @@ -1159,7 +1159,7 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, * Hold the lock until we're done (blocking hot backups from starting), * we don't want to race with a future hot backup. */ - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); hot_backup_locked = true; if (conn->hot_backup) WT_CKPT_FOREACH(ckptbase, ckpt) { @@ -1233,7 +1233,7 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, WT_ASSERT(session, !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); WT_ASSERT(session, btree->ckpt == NULL); btree->ckpt = ckptbase; @@ -1241,7 +1241,7 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, return (0); err: if (hot_backup_locked) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); __wt_meta_ckptlist_free(session, ckptbase); __wt_free(session, name_alloc); diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c index 65ec1a6662f..659570dbcd9 100644 --- a/src/txn/txn_nsnap.c +++ b/src/txn/txn_nsnap.c @@ -211,9 +211,9 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) if (TAILQ_EMPTY(&txn_global->nsnaph)) { WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE && !__wt_txn_visible_all(session, nsnap_new->pinned_id)); - __wt_readlock(session, txn_global->scan_rwlock); + 
__wt_readlock(session, &txn_global->scan_rwlock); txn_global->nsnap_oldest_id = nsnap_new->pinned_id; - __wt_readunlock(session, txn_global->scan_rwlock); + __wt_readunlock(session, &txn_global->scan_rwlock); } TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); WT_STAT_CONN_INCR(session, txn_snapshots_created); @@ -297,16 +297,16 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) if (session->ncursors > 0) WT_RET(__wt_session_copy_values(session)); - __wt_readlock(session, txn_global->nsnap_rwlock); + __wt_readlock(session, &txn_global->nsnap_rwlock); TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q) if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) { /* * Acquire the scan lock so the oldest ID can't move * forward without seeing our pinned ID. */ - __wt_readlock(session, txn_global->scan_rwlock); + __wt_readlock(session, &txn_global->scan_rwlock); txn_state->pinned_id = nsnap->pinned_id; - __wt_readunlock(session, txn_global->scan_rwlock); + __wt_readunlock(session, &txn_global->scan_rwlock); WT_ASSERT(session, !__wt_txn_visible_all( session, txn_state->pinned_id) && @@ -327,7 +327,7 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) F_SET(txn, WT_TXN_HAS_SNAPSHOT); break; } - __wt_readunlock(session, txn_global->nsnap_rwlock); + __wt_readunlock(session, &txn_global->nsnap_rwlock); if (nsnap == NULL) WT_RET_MSG(session, EINVAL, -- cgit v1.2.1 From b99a91ec0fa042c867158f51cfd3a0106d7ac535 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Fri, 23 Dec 2016 17:59:45 +1100 Subject: WT-3086 lint. 
(#3217) --- dist/s_funcs.list | 1 - src/evict/evict_lru.c | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/dist/s_funcs.list b/dist/s_funcs.list index 01835390997..b73767cad13 100644 --- a/dist/s_funcs.list +++ b/dist/s_funcs.list @@ -13,7 +13,6 @@ __wt_bloom_get __wt_bulk_insert_fix __wt_bulk_insert_row __wt_bulk_insert_var -__wt_cache_dump __wt_config_getone __wt_cursor_get_raw_value __wt_debug_addr diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 214b5c007cb..b4cb2cc229a 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -2036,6 +2036,7 @@ __dump_txn_state(WT_SESSION_IMPL *session, FILE *fp) continue; txn = &conn->sessions[i].txn; + iso_tag = "INVALID"; switch (txn->isolation) { case WT_ISO_READ_COMMITTED: iso_tag = "WT_ISO_READ_COMMITTED"; @@ -2046,9 +2047,6 @@ __dump_txn_state(WT_SESSION_IMPL *session, FILE *fp) case WT_ISO_SNAPSHOT: iso_tag = "WT_ISO_SNAPSHOT"; break; - default: - iso_tag = "INVALID"; - break; } if (fprintf(fp, -- cgit v1.2.1 From 4d0b97a7f138f4079024c23ce9cfb70827bc133c Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Sat, 24 Dec 2016 01:47:19 +1100 Subject: WT-3093 Coverity lint. (#3216) --- src/btree/bt_slvg.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index a8243eba17f..fea979cac6e 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1235,7 +1235,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_PAGE *page; WT_SALVAGE_COOKIE *cookie, _cookie; uint64_t recno, skip, take; - uint32_t *entriesp, save_entries; + uint32_t save_entries; cookie = &_cookie; WT_CLEAR(*cookie); @@ -1244,11 +1244,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_RET(__wt_page_in(session, ref, 0)); page = ref->page; - entriesp = - page->type == WT_PAGE_COL_VAR ? 
&page->entries : &page->entries; - save_col_var = page->pg_var; - save_entries = *entriesp; + save_entries = page->entries; /* * Calculate the number of K/V entries we are going to skip, and @@ -1304,7 +1301,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) /* Reset the page. */ page->pg_var = save_col_var; - *entriesp = save_entries; + page->entries = save_entries; ret = __wt_page_release(session, ref, 0); if (ret == 0) -- cgit v1.2.1 From 74cc96ce14d386d9f81a45cca7adddaaab5fb9d5 Mon Sep 17 00:00:00 2001 From: "Alexandra (Sasha) Fedorova" Date: Wed, 28 Dec 2016 08:07:35 -0800 Subject: WT-2898 evict dynamic workers (#3039) * WT-2898. NOT ready for review. Initial implementation of dynamically tuning the number of eviction workers. * Not ready for review. All the code is there. Still need to test/tune on different machines. * Remove debugging prints. * Style police * Spelling. * Fixup merge issue and compiler warning * Sulab and David do a review! * Fix compiler warning. Not ready for review. There is a performance regression after merging with develop. I'm on it. * Conversion to signed values in percent calculation to make sure that we always correctly compute percent difference, which can be negative, regardless of how the compiler performs sign extension. Change thresholds so we have less churn. * Fix more compiler warnings. Sorry about the churn, I don't see the same failures locally as on the autobuild even though I compile with -Werror. * Replace 1/0 with true/false * More compiler warning and style fixes. Configured with --enable-strict, so hopefully I have caught everything this time. * Minor nit/pick, init a variable * Rename free to free_thread, otherwise hides a global * Fix indentation * Fixes to the changes. Percent difference must be signed as it can be negative if the number of pages evicted per second decreased since the last period. * Added stats and log messages to reflect eviction worker tuning activity.
Fixed a bug in the code that checks the group bounds when stopping the thread. * Removed verbose message, because we already have a statistic tracking evictions per second, so this is probably redundant. * whitespace * KNF * More aggressive addition/removal of eviction workers. We used to add/remove them one at a time; it's difficult to see the effects of extra workers with such an incremental change, because eviction throughput is affected by other noise, such as what happens in the kernel and in the I/O system. Now we add and remove eviction workers in batches. * Style fixes. * Fix compiler warning. * Simplified the tuning logic. Addressed Sulabh's comments. * A tuning parameter change * Fixed a bug where we needed a random value, but were not getting it via the random number generator, so it was not random and the code did not have the right behaviour. Added stats. * Move the call to tune evict workers into __evict_pass, so we can begin the tuning earlier. * NOT READY FOR REVIEW. Changed defaults for the number of eviction workers, so I can experiment with larger values. * NOT READY FOR REVIEW. A parameter to put a cap on how many threads we are adding at a time. * Reverse the changes of the last commit. That change hurt performance. * Changed all wtperf runners that set eviction thread maximum to 30, so we could evaluate the effects of the dynamic branch. * Updated the number reserved for internal sessions to 40, since we can now create up to 30 eviction worker threads by default. * Fix spellchecker complaints * KNF * NOT READY FOR REVIEW. Revised the algorithm to settle on a good value of evict workers once we sufficiently explore the configuration space using the gradient descent approach with random adjustments. The algorithm successfully finds the best static number of workers, but performs worse. I suspect that there is an issue with how threads are removed. Suspect a bug in thread support code. Have not chased it yet. * Remove prints, add stats.
* Fix a copy-paste bug where a code line was inadvertently eliminated. * Reduce the maximum for eviction workers to 30. Prevent dereferencing a NULL pointer if we dynamically grow a thread group after we've shrunk it and freed the associated memory. * Cleaned up and simplified the code. * NOT READY FOR REVIEW. A new version of the tuning algorithm that fixes a memory issue when we try to pre-allocate a large eviction thread group. Still need to tune and clean up the code. * Clean up the code. * Get rid of s_label warnings. Remove unused code. * Fix various style errors. * Fixed the logic in figuring out the maximum value for eviction threads upon cache creation or reconfiguration, which had caused a crash in one of the tests. * Changed default max for the number of eviction threads to eight. * Fix ranges for the minimum number of eviction threads * Fix eviction thread ranges to make the csuite happy * Commit automatic changes by s_all * Review: KNF, whitespace and renamed a few things. * Fix lock usage * KNF --- bench/wtperf/runners/500m-btree-50r50u.wtperf | 2 +- bench/wtperf/runners/500m-btree-80r20u.wtperf | 2 +- bench/wtperf/runners/500m-btree-populate.wtperf | 2 +- bench/wtperf/runners/500m-btree-rdonly.wtperf | 2 +- bench/wtperf/runners/checkpoint-stress.wtperf | 2 +- bench/wtperf/runners/evict-btree-readonly.wtperf | 2 +- .../wtperf/runners/evict-btree-stress-multi.wtperf | 2 +- bench/wtperf/runners/evict-btree-stress.wtperf | 2 +- bench/wtperf/runners/evict-btree.wtperf | 2 +- bench/wtperf/runners/evict-lsm-readonly.wtperf | 2 +- bench/wtperf/runners/evict-lsm.wtperf | 2 +- bench/wtperf/runners/log.wtperf | 2 +- .../wtperf/runners/mongodb-secondary-apply.wtperf | 2 +- .../runners/multi-btree-read-heavy-stress.wtperf | 2 +- bench/wtperf/runners/multi-btree-stress.wtperf | 2 +- .../runners/multi-btree-zipfian-populate.wtperf | 2 +- .../runners/multi-btree-zipfian-workload.wtperf | 2 +- bench/wtperf/stress/btree-split-stress.wtperf | 2 +- dist/api_data.py | 2 
+- dist/stat_data.py | 4 + src/config/config_def.c | 10 +- src/conn/conn_cache.c | 4 +- src/evict/evict_lru.c | 204 ++++++++++- src/include/connection.h | 14 +- src/include/extern.h | 1 + src/include/stat.h | 4 + src/include/wiredtiger.in | 384 +++++++++++---------- src/support/stat.c | 16 + src/support/thread_group.c | 55 ++- tools/wtstats/stat_data.py | 2 + 30 files changed, 497 insertions(+), 239 deletions(-) diff --git a/bench/wtperf/runners/500m-btree-50r50u.wtperf b/bench/wtperf/runners/500m-btree-50r50u.wtperf index 536127f0dd8..4d2a70f1107 100644 --- a/bench/wtperf/runners/500m-btree-50r50u.wtperf +++ b/bench/wtperf/runners/500m-btree-50r50u.wtperf @@ -5,7 +5,7 @@ # # Set cache to half of memory of AWS perf instance. Enable logging and # checkpoints. Collect wiredtiger stats for ftdc. -conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=4)" +conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=8)" create=false compression="snappy" sess_config="isolation=snapshot" diff --git a/bench/wtperf/runners/500m-btree-80r20u.wtperf b/bench/wtperf/runners/500m-btree-80r20u.wtperf index d6218c44af0..6645df835df 100644 --- a/bench/wtperf/runners/500m-btree-80r20u.wtperf +++ b/bench/wtperf/runners/500m-btree-80r20u.wtperf @@ -5,7 +5,7 @@ # # Set cache to half of memory of AWS perf instance. Enable logging and # checkpoints. Collect wiredtiger stats for ftdc. 
-conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=4)" +conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=8)" create=false compression="snappy" # close_conn as false allows this test to close/finish faster, but if running diff --git a/bench/wtperf/runners/500m-btree-populate.wtperf b/bench/wtperf/runners/500m-btree-populate.wtperf index f9aed094aa1..ab7b17ca683 100644 --- a/bench/wtperf/runners/500m-btree-populate.wtperf +++ b/bench/wtperf/runners/500m-btree-populate.wtperf @@ -9,7 +9,7 @@ # # This generates about 80 Gb of uncompressed data. But it should compress # well and be small on disk. -conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=4)" +conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=8)" compact=true compression="snappy" sess_config="isolation=snapshot" diff --git a/bench/wtperf/runners/500m-btree-rdonly.wtperf b/bench/wtperf/runners/500m-btree-rdonly.wtperf index 2c9540ff589..e8958d20e2c 100644 --- a/bench/wtperf/runners/500m-btree-rdonly.wtperf +++ b/bench/wtperf/runners/500m-btree-rdonly.wtperf @@ -5,7 +5,7 @@ # # Set cache to half of memory of AWS perf instance. Enable logging and # checkpoints. Collect wiredtiger stats for ftdc. 
-conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=4)" +conn_config="cache_size=16G,checkpoint=(wait=60,log_size=2GB),session_max=20000,log=(enabled),statistics=(fast),statistics_log=(wait=30,json),eviction=(threads_max=8)" create=false compression="snappy" sess_config="isolation=snapshot" diff --git a/bench/wtperf/runners/checkpoint-stress.wtperf b/bench/wtperf/runners/checkpoint-stress.wtperf index bbd3a3ba5ed..5daa276e622 100644 --- a/bench/wtperf/runners/checkpoint-stress.wtperf +++ b/bench/wtperf/runners/checkpoint-stress.wtperf @@ -1,6 +1,6 @@ # A stress configuration to create long running checkpoints while doing a lot # of updates. -conn_config="cache_size=16GB,eviction=(threads_max=4),log=(enabled=false)" +conn_config="cache_size=16GB,eviction=(threads_max=8),log=(enabled=false)" table_config="leaf_page_max=32k,internal_page_max=16k,allocation_size=4k,split_pct=90,type=file" # Enough data to fill the cache. 
150 million 1k records results in two ~11GB # tables diff --git a/bench/wtperf/runners/evict-btree-readonly.wtperf b/bench/wtperf/runners/evict-btree-readonly.wtperf index 25599fadd8d..972bc371f2d 100644 --- a/bench/wtperf/runners/evict-btree-readonly.wtperf +++ b/bench/wtperf/runners/evict-btree-readonly.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict btree configuration -conn_config="cache_size=50M,eviction=(threads_max=4),mmap=false" +conn_config="cache_size=50M,eviction=(threads_max=8),mmap=false" table_config="type=file" icount=10000000 report_interval=5 diff --git a/bench/wtperf/runners/evict-btree-stress-multi.wtperf b/bench/wtperf/runners/evict-btree-stress-multi.wtperf index a5a29f66fa0..5a2cad6d78e 100644 --- a/bench/wtperf/runners/evict-btree-stress-multi.wtperf +++ b/bench/wtperf/runners/evict-btree-stress-multi.wtperf @@ -1,4 +1,4 @@ -conn_config="cache_size=1G,eviction=(threads_max=4),session_max=2000" +conn_config="cache_size=1G,eviction=(threads_max=8),session_max=2000" table_config="type=file" table_count=100 close_conn=false diff --git a/bench/wtperf/runners/evict-btree-stress.wtperf b/bench/wtperf/runners/evict-btree-stress.wtperf index 740fb88c050..96e3f01b325 100644 --- a/bench/wtperf/runners/evict-btree-stress.wtperf +++ b/bench/wtperf/runners/evict-btree-stress.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict btree configuration -conn_config="cache_size=50M,eviction=(threads_max=4)" +conn_config="cache_size=50M,eviction=(threads_max=8)" table_config="type=file" icount=10000000 report_interval=5 diff --git a/bench/wtperf/runners/evict-btree.wtperf b/bench/wtperf/runners/evict-btree.wtperf index e7d967e5c63..3810e6a8294 100644 --- a/bench/wtperf/runners/evict-btree.wtperf +++ b/bench/wtperf/runners/evict-btree.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict btree configuration -conn_config="cache_size=50M,eviction=(threads_max=4)" +conn_config="cache_size=50M,eviction=(threads_max=8)" table_config="type=file" icount=10000000 
report_interval=5 diff --git a/bench/wtperf/runners/evict-lsm-readonly.wtperf b/bench/wtperf/runners/evict-lsm-readonly.wtperf index 661b8e21924..470dca695dd 100644 --- a/bench/wtperf/runners/evict-lsm-readonly.wtperf +++ b/bench/wtperf/runners/evict-lsm-readonly.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict lsm configuration -conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6),eviction=(threads_max=4)" +conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6),eviction=(threads_max=8)" table_config="type=lsm,lsm=(chunk_size=2M),os_cache_dirty_max=16MB" compact=true icount=10000000 diff --git a/bench/wtperf/runners/evict-lsm.wtperf b/bench/wtperf/runners/evict-lsm.wtperf index b872d429046..a0f2a78d013 100644 --- a/bench/wtperf/runners/evict-lsm.wtperf +++ b/bench/wtperf/runners/evict-lsm.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict lsm configuration -conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6),eviction=(threads_max=4)" +conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6),eviction=(threads_max=8)" table_config="type=lsm,lsm=(chunk_size=2M),os_cache_dirty_max=16MB" compact=true icount=10000000 diff --git a/bench/wtperf/runners/log.wtperf b/bench/wtperf/runners/log.wtperf index 6cf50dfb5a5..4379ba22373 100644 --- a/bench/wtperf/runners/log.wtperf +++ b/bench/wtperf/runners/log.wtperf @@ -16,7 +16,7 @@ # - Config + "-C "checkpoint=(wait=0)": no checkpoints # - Config + "-C "log=(enabled,prealloc=false,file_max=1M)": no pre-allocation # -conn_config="cache_size=5G,log=(enabled=true),checkpoint=(log_size=500M),eviction=(threads_max=4)" +conn_config="cache_size=5G,log=(enabled=true),checkpoint=(log_size=500M),eviction=(threads_max=8)" table_config="type=file" icount=1000000 report_interval=5 diff --git a/bench/wtperf/runners/mongodb-secondary-apply.wtperf b/bench/wtperf/runners/mongodb-secondary-apply.wtperf index f9e41184f95..58bd1a76b97 100644 --- a/bench/wtperf/runners/mongodb-secondary-apply.wtperf +++ 
b/bench/wtperf/runners/mongodb-secondary-apply.wtperf @@ -1,5 +1,5 @@ # Simulate the MongoDB oplog apply threads on a secondary. -conn_config="cache_size=10GB,session_max=1000,eviction=(threads_min=4,threads_max=4),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=true,checkpoint=(wait=60),statistics=(fast),statistics_log=(json,wait=1)" +conn_config="cache_size=10GB,session_max=1000,eviction=(threads_min=4,threads_max=8),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=true,checkpoint=(wait=60),statistics=(fast),statistics_log=(json,wait=1)" table_config="allocation_size=4k,memory_page_max=5MB,prefix_compression=false,split_pct=75,leaf_page_max=32k,internal_page_max=16k,type=file" # Spread the workload out over several tables. table_count=4 diff --git a/bench/wtperf/runners/multi-btree-read-heavy-stress.wtperf b/bench/wtperf/runners/multi-btree-read-heavy-stress.wtperf index d7b27f8fda4..f07e6c80b39 100644 --- a/bench/wtperf/runners/multi-btree-read-heavy-stress.wtperf +++ b/bench/wtperf/runners/multi-btree-read-heavy-stress.wtperf @@ -2,7 +2,7 @@ # up by dividing the workload across a lot of threads. This needs to be # tuned to the particular machine so the workload is close to capacity in the # steady state, but not overwhelming. -conn_config="cache_size=20GB,session_max=1000,eviction=(threads_min=4,threads_max=4),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=true,checkpoint=(wait=60),statistics=(fast),statistics_log=(json,wait=1)" +conn_config="cache_size=20GB,session_max=1000,eviction=(threads_min=4,threads_max=8),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=true,checkpoint=(wait=60),statistics=(fast),statistics_log=(json,wait=1)" table_config="allocation_size=4k,memory_page_max=10MB,prefix_compression=false,split_pct=90,leaf_page_max=32k,internal_page_max=16k,type=file" # Divide original icount by database_count. 
table_count=8 diff --git a/bench/wtperf/runners/multi-btree-stress.wtperf b/bench/wtperf/runners/multi-btree-stress.wtperf index b10b08f6035..bee1f431043 100644 --- a/bench/wtperf/runners/multi-btree-stress.wtperf +++ b/bench/wtperf/runners/multi-btree-stress.wtperf @@ -1,7 +1,7 @@ # wtperf options file: multi-database configuration attempting to # trigger slow operations by overloading CPU and disk. # References Jira WT-2131 -conn_config="cache_size=2GB,eviction=(threads_min=2,threads_max=2),log=(enabled=false),direct_io=(data,checkpoint),buffer_alignment=4096,checkpoint_sync=true,checkpoint=(wait=60)" +conn_config="cache_size=2GB,eviction=(threads_min=2,threads_max=8),log=(enabled=false),direct_io=(data,checkpoint),buffer_alignment=4096,checkpoint_sync=true,checkpoint=(wait=60)" table_config="allocation_size=4k,prefix_compression=false,split_pct=75,leaf_page_max=4k,internal_page_max=16k,leaf_item_max=1433,internal_item_max=3100,type=file" # Divide original icount by database_count. database_count=5 diff --git a/bench/wtperf/runners/multi-btree-zipfian-populate.wtperf b/bench/wtperf/runners/multi-btree-zipfian-populate.wtperf index ddd9c055eac..1fdba049779 100644 --- a/bench/wtperf/runners/multi-btree-zipfian-populate.wtperf +++ b/bench/wtperf/runners/multi-btree-zipfian-populate.wtperf @@ -1,5 +1,5 @@ # Create a set of tables with uneven distribution of data -conn_config="cache_size=1G,eviction=(threads_max=4),file_manager=(close_idle_time=100000),checkpoint=(wait=60,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" +conn_config="cache_size=1G,eviction=(threads_max=8),file_manager=(close_idle_time=100000),checkpoint=(wait=60,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" table_config="type=file" table_count=100 icount=0 diff --git a/bench/wtperf/runners/multi-btree-zipfian-workload.wtperf b/bench/wtperf/runners/multi-btree-zipfian-workload.wtperf index 380350c88c8..dfb3306a7a5 100644 --- 
a/bench/wtperf/runners/multi-btree-zipfian-workload.wtperf +++ b/bench/wtperf/runners/multi-btree-zipfian-workload.wtperf @@ -1,5 +1,5 @@ # Read from a set of tables with uneven distribution of data -conn_config="cache_size=1G,eviction=(threads_max=4),file_manager=(close_idle_time=100000),checkpoint=(wait=60,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" +conn_config="cache_size=1G,eviction=(threads_max=8),file_manager=(close_idle_time=100000),checkpoint=(wait=60,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" table_config="type=file" table_count=100 icount=0 diff --git a/bench/wtperf/stress/btree-split-stress.wtperf b/bench/wtperf/stress/btree-split-stress.wtperf index deb8c70d12f..86bb288fc6d 100644 --- a/bench/wtperf/stress/btree-split-stress.wtperf +++ b/bench/wtperf/stress/btree-split-stress.wtperf @@ -1,4 +1,4 @@ -conn_config="cache_size=2GB,statistics=[fast,clear],statistics_log=(wait=10),eviction=(threads_max=4,threads_min=4)" +conn_config="cache_size=2GB,statistics=[fast,clear],statistics_log=(wait=10),eviction=(threads_max=8,threads_min=4)" table_config="type=file,leaf_page_max=8k,internal_page_max=8k,memory_page_max=2MB,split_deepen_min_child=250" icount=200000 report_interval=5 diff --git a/dist/api_data.py b/dist/api_data.py index 98f9b5a230a..04071a84332 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -406,7 +406,7 @@ connection_runtime_config = [ Config('eviction', '', r''' eviction configuration options''', type='category', subconfig=[ - Config('threads_max', '1', r''' + Config('threads_max', '8', r''' maximum number of threads WiredTiger will start to help evict pages from cache. The number of threads started will vary depending on the current eviction load. 
Each eviction worker diff --git a/dist/stat_data.py b/dist/stat_data.py index c481382dafc..0af5d6d017e 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -193,6 +193,7 @@ connection_stats = [ CacheStat('cache_bytes_other', 'bytes not belonging to page images in the cache', 'no_clear,no_scale,size'), CacheStat('cache_bytes_read', 'bytes read into cache', 'size'), CacheStat('cache_bytes_write', 'bytes written from cache', 'size'), + CacheStat('cache_eviction_active_workers', 'eviction worker thread active', 'no_clear'), CacheStat('cache_eviction_aggressive_set', 'eviction currently operating in aggressive mode', 'no_clear,no_scale'), CacheStat('cache_eviction_app', 'pages evicted by application threads'), CacheStat('cache_eviction_app_dirty', 'modified pages evicted by application threads'), @@ -222,12 +223,15 @@ connection_stats = [ CacheStat('cache_eviction_slow', 'eviction server unable to reach eviction goal'), CacheStat('cache_eviction_split_internal', 'internal pages split during eviction'), CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'), + CacheStat('cache_eviction_stable_state_workers', 'eviction worker thread stable number', 'no_clear'), CacheStat('cache_eviction_state', 'eviction state', 'no_clear,no_scale'), CacheStat('cache_eviction_walk', 'pages walked for eviction'), CacheStat('cache_eviction_walks_abandoned', 'eviction walks abandoned'), CacheStat('cache_eviction_walks_active', 'files with active eviction walks', 'no_clear,no_scale'), CacheStat('cache_eviction_walks_started', 'files with new eviction walks started'), CacheStat('cache_eviction_worker_evicting', 'eviction worker thread evicting pages'), + CacheStat('cache_eviction_worker_created', 'eviction worker thread created'), + CacheStat('cache_eviction_worker_removed', 'eviction worker thread removed'), CacheStat('cache_hazard_checks', 'hazard pointer check calls'), CacheStat('cache_hazard_max', 'hazard pointer maximum array length', 'max_aggregate,no_scale'), 
CacheStat('cache_hazard_walks', 'hazard pointer check entries walked'), diff --git a/src/config/config_def.c b/src/config/config_def.c index e4fd7937a40..83c1436eade 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -1050,7 +1050,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { { "WT_CONNECTION.reconfigure", "async=(enabled=false,ops_max=1024,threads=2),cache_overhead=8," "cache_size=100MB,checkpoint=(log_size=0,wait=0),error_prefix=," - "eviction=(threads_max=1,threads_min=1)," + "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",file_manager=(close_handle_minimum=250,close_idle_time=30," @@ -1261,7 +1261,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," "config_base=true,create=false,direct_io=,encryption=(keyid=," - "name=,secretkey=),error_prefix=,eviction=(threads_max=1," + "name=,secretkey=),error_prefix=,eviction=(threads_max=8," "threads_min=1),eviction_checkpoint_target=5," "eviction_dirty_target=5,eviction_dirty_trigger=20," "eviction_target=80,eviction_trigger=95,exclusive=false," @@ -1285,7 +1285,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," "config_base=true,create=false,direct_io=,encryption=(keyid=," - "name=,secretkey=),error_prefix=,eviction=(threads_max=1," + "name=,secretkey=),error_prefix=,eviction=(threads_max=8," "threads_min=1),eviction_checkpoint_target=5," "eviction_dirty_target=5,eviction_dirty_trigger=20," "eviction_target=80,eviction_trigger=95,exclusive=false," @@ -1309,7 +1309,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," 
"checkpoint=(log_size=0,wait=0),checkpoint_sync=true,direct_io=," "encryption=(keyid=,name=,secretkey=),error_prefix=," - "eviction=(threads_max=1,threads_min=1)," + "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" @@ -1330,7 +1330,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,direct_io=," "encryption=(keyid=,name=,secretkey=),error_prefix=," - "eviction=(threads_max=1,threads_min=1)," + "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index fe5f94ea03d..9b07b46abcd 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -143,7 +143,9 @@ __wt_cache_config(WT_SESSION_IMPL *session, bool reconfigure, const char *cfg[]) if (reconfigure) WT_RET(__wt_thread_group_resize( session, &conn->evict_threads, - conn->evict_threads_min, conn->evict_threads_max, + conn->evict_threads_min, + WT_MAX(conn->evict_threads_min, + WT_MIN(conn->evict_threads_max, EVICT_GROUP_INCR)), WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL)); return (0); diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index b4cb2cc229a..485fd0e6d40 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -15,6 +15,7 @@ static int __evict_lru_walk(WT_SESSION_IMPL *); static int __evict_page(WT_SESSION_IMPL *, bool); static int __evict_pass(WT_SESSION_IMPL *); static int __evict_server(WT_SESSION_IMPL *, bool *); +static int __evict_tune_workers(WT_SESSION_IMPL *session); static int __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *); static int 
__evict_walk_file( WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *); @@ -389,10 +390,19 @@ __wt_evict_create(WT_SESSION_IMPL *session) /* Set first, the thread might run before we finish up. */ F_SET(conn, WT_CONN_EVICTION_RUN); - /* Create the eviction thread group */ + /* + * Create the eviction thread group. + * We don't set the group size to the maximum allowed sessions, + * because this may have adverse memory effects. Instead, + * we set the group's maximum to a small value. The code + * that tunes the number of workers will increase the + * maximum if necessary. + */ WT_RET(__wt_thread_group_create(session, &conn->evict_threads, "eviction-server", conn->evict_threads_min, - conn->evict_threads_max, WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, + WT_MAX(conn->evict_threads_min, + WT_MIN(conn->evict_threads_max, EVICT_GROUP_INCR)), + WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, __wt_evict_thread_run)); /* @@ -548,6 +558,8 @@ __evict_pass(WT_SESSION_IMPL *session) if (loop == 0) prev = now; + if (conn->evict_threads.threads[0]->session == session) + __evict_tune_workers(session); /* * Increment the shared read generation. Do this occasionally * even if eviction is not currently required, so that pages @@ -573,14 +585,6 @@ __evict_pass(WT_SESSION_IMPL *session) if (!__evict_update_work(session)) break; - /* - * Try to start a new thread if we have capacity and haven't - * reached the eviction targets. 
- */ - if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) - WT_RET(__wt_thread_group_start_one( - session, &conn->evict_threads, false)); - __wt_verbose(session, WT_VERB_EVICTSERVER, "Eviction pass with: Max: %" PRIu64 " In use: %" PRIu64 " Dirty: %" PRIu64, @@ -844,6 +848,182 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) __wt_spin_unlock(session, &cache->evict_walk_lock); } +#define EVICT_TUNE_BATCH 1 /* Max workers to add each period */ +#define EVICT_TUNE_DATAPT_MIN 3 /* Data points needed before deciding + if we should keep adding workers or + settle on an earlier value. */ +#define EVICT_TUNE_PERIOD 2 /* Tune period in seconds */ + +/* + * __evict_tune_workers -- + * Find the right number of eviction workers. Gradually ramp up the number of + * workers increasing the number in batches indicated by the setting above. + * Store the number of workers that gave us the best throughput so far and + * the number of data points we have tried. + * + * Every once in a while when we have the minimum number of data points + * we check whether the eviction throughput achieved with the current number + * of workers is the best we have seen so far. If so, we will keep increasing + * the number of workers. If not, we are past the inflection point on the + * eviction throughput curve. In that case, we will set the number of workers + * to the best observed so far and settle into a stable state.
+ */ +static int +__evict_tune_workers(WT_SESSION_IMPL *session) +{ + struct timespec current_time; + WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + uint64_t cur_threads, delta_msec, delta_pages, i, target_threads; + uint64_t pgs_evicted_cur, pgs_evicted_persec_cur; + uint32_t new_max, thread_surplus; + + conn = S2C(session); + cache = conn->cache; + + WT_ASSERT(session, conn->evict_threads.threads[0]->session == session); + pgs_evicted_persec_cur = 0; + + if (conn->evict_tune_stable) + return (0); + + __wt_epoch(session, &current_time); + + /* + * Every EVICT_TUNE_PERIOD seconds record the number of + * pages evicted per second observed in the previous period. + */ + if (WT_TIMEDIFF_SEC( + current_time, conn->evict_tune_last_time) < EVICT_TUNE_PERIOD) + return (0); + + pgs_evicted_cur = cache->pages_evict; + + /* + * If we have recorded the number of pages evicted at the end of + * the previous measurement interval, we can compute the eviction + * rate in evicted pages per second achieved during the current + * measurement interval. + * Otherwise, we just record the number of evicted pages and return. + */ + if (conn->evict_tune_pgs_last == 0) + goto out; + + delta_msec = WT_TIMEDIFF_MS(current_time, conn->evict_tune_last_time); + delta_pages = pgs_evicted_cur - conn->evict_tune_pgs_last; + pgs_evicted_persec_cur = (delta_pages * WT_THOUSAND) / delta_msec; + conn->evict_tune_num_points++; + + /* Keep track of the maximum eviction throughput seen and the number + * of workers corresponding to that throughput. + */ + if (pgs_evicted_persec_cur > conn->evict_tune_pg_sec_max) { + conn->evict_tune_pg_sec_max = pgs_evicted_persec_cur; + conn->evict_tune_workers_best = + conn->evict_threads.current_threads; + } + + /* + * Compare the current number of data points with the number + * needed variable.
If they are equal, we will check whether + * we are still going up on the performance curve, in which + * case we will continue increasing the number of workers, or + * we are past the inflection point on the curve, in which case + * we will go back to the best observed number of workers and + * settle into a stable state. + */ + if (conn->evict_tune_num_points >= conn->evict_tune_datapts_needed) { + if ((conn->evict_tune_workers_best == + conn->evict_threads.current_threads) && + (conn->evict_threads.current_threads < + conn->evict_threads_max)) { + /* + * Keep adding workers. We will check again + * at the next check point. + */ + conn->evict_tune_datapts_needed += + WT_MIN(EVICT_TUNE_DATAPT_MIN, + (conn->evict_threads_max + - conn->evict_threads.current_threads)/ + EVICT_TUNE_BATCH); + } else { + /* + * We are past the inflection point. Choose the + * best number of eviction workers observed and + * settle into a stable state. + */ + thread_surplus = + conn->evict_threads.current_threads - + conn->evict_tune_workers_best; + + for (i = 0; i < thread_surplus; i++) { + WT_RET(__wt_thread_group_stop_one(session, + &conn->evict_threads, true)); + WT_STAT_CONN_INCR(session, + cache_eviction_worker_removed); + } + WT_STAT_CONN_SET(session, + cache_eviction_stable_state_workers, + conn->evict_tune_workers_best); + conn->evict_tune_stable = true; + WT_STAT_CONN_SET(session, cache_eviction_active_workers, + conn->evict_threads.current_threads); + return (0); + } + } + + /* + * If we have not added any worker threads in the past, we set the + * number needed equal to the number of data points that we must + * accumulate before deciding if we should keep adding workers or settle + * on a previously tried value of workers. 
+ */ + if (conn->evict_tune_last_action_time.tv_sec == 0) + conn->evict_tune_datapts_needed = WT_MIN(EVICT_TUNE_DATAPT_MIN, + (conn->evict_threads_max - + conn->evict_threads.current_threads) / EVICT_TUNE_BATCH); + + if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) { + cur_threads = conn->evict_threads.current_threads; + target_threads = WT_MIN(cur_threads + EVICT_TUNE_BATCH, + conn->evict_threads_max); + /* + * Resize the group to allow for an additional batch of threads. + * We resize the group in increments of a few sessions. + * Allocating the group to accommodate the maximum number of + * workers has adverse effects on performance due to memory + * effects, so we gradually ramp up the allocation. + */ + if (conn->evict_threads.max < target_threads) { + new_max = WT_MIN(conn->evict_threads.max + + EVICT_GROUP_INCR, conn->evict_threads_max); + + WT_RET(__wt_thread_group_resize( + session, &conn->evict_threads, + conn->evict_threads_min, new_max, + WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL)); + } + + /* Now actually start the new threads. */ + for (i = 0; i < (target_threads - cur_threads); ++i) { + WT_RET(__wt_thread_group_start_one(session, + &conn->evict_threads, false)); + WT_STAT_CONN_INCR(session, + cache_eviction_worker_created); + __wt_verbose(session, WT_VERB_EVICTSERVER, + "added worker thread"); + } + conn->evict_tune_last_action_time = current_time; + } + + WT_STAT_CONN_SET(session, cache_eviction_active_workers, + conn->evict_threads.current_threads); + +out: conn->evict_tune_last_time = current_time; + conn->evict_tune_pgs_last = pgs_evicted_cur; + return (0); +} + /* * __evict_lru_pages -- * Get pages from the LRU queue to evict. @@ -1282,8 +1462,8 @@ __evict_push_candidate(WT_SESSION_IMPL *session, * Get a few page eviction candidates from a single underlying file. 
*/ static int -__evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, - u_int max_entries, u_int *slotp) +__evict_walk_file(WT_SESSION_IMPL *session, + WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; diff --git a/src/include/connection.h b/src/include/connection.h index 6818633d816..665275440cf 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -301,6 +301,16 @@ struct __wt_connection_impl { uint32_t evict_threads_max;/* Max eviction threads */ uint32_t evict_threads_min;/* Min eviction threads */ +#define EVICT_GROUP_INCR 4 /* Evict group size increased in batches */ + uint32_t evict_tune_datapts_needed;/* Data needed to tune */ + struct timespec evict_tune_last_action_time;/* Time of last action */ + struct timespec evict_tune_last_time; /* Time of last check */ + uint32_t evict_tune_num_points; /* Number of values tried */ + uint64_t evict_tune_pgs_last; /* Number of pages evicted */ + uint64_t evict_tune_pg_sec_max; /* Max throughput encountered */ + bool evict_tune_stable; /* Are we stable? 
*/ + uint32_t evict_tune_workers_best;/* Best performing value */ + #define WT_STATLOG_FILENAME "WiredTigerStat.%d.%H" WT_SESSION_IMPL *stat_session; /* Statistics log session */ wt_thread_t stat_tid; /* Statistics log thread */ @@ -326,11 +336,11 @@ struct __wt_connection_impl { bool log_tid_set; /* Log server thread set */ WT_CONDVAR *log_file_cond; /* Log file thread wait mutex */ WT_SESSION_IMPL *log_file_session;/* Log file thread session */ - wt_thread_t log_file_tid; /* Log file thread thread */ + wt_thread_t log_file_tid; /* Log file thread */ bool log_file_tid_set;/* Log file thread set */ WT_CONDVAR *log_wrlsn_cond;/* Log write lsn thread wait mutex */ WT_SESSION_IMPL *log_wrlsn_session;/* Log write lsn thread session */ - wt_thread_t log_wrlsn_tid; /* Log write lsn thread thread */ + wt_thread_t log_wrlsn_tid; /* Log write lsn thread */ bool log_wrlsn_tid_set;/* Log write lsn thread set */ WT_LOG *log; /* Logging structure */ WT_COMPRESSOR *log_compressor;/* Logging compressor */ diff --git a/src/include/extern.h b/src/include/extern.h index bcad3580e25..566eb386c29 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -728,6 +728,7 @@ extern int __wt_thread_group_resize( WT_SESSION_IMPL *session, WT_THREAD_GROUP * extern int __wt_thread_group_create( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, const char *name, uint32_t min, uint32_t max, uint32_t flags, int (*run_func)(WT_SESSION_IMPL *session, WT_THREAD *context)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_group_start_one( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_thread_group_stop_one( 
WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/stat.h b/src/include/stat.h index 3dcdf68b8d5..fd3e3290d95 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -310,7 +310,11 @@ struct __wt_connection_stats { int64_t cache_eviction_slow; int64_t cache_eviction_state; int64_t cache_eviction_walks_abandoned; + int64_t cache_eviction_active_workers; + int64_t cache_eviction_worker_created; int64_t cache_eviction_worker_evicting; + int64_t cache_eviction_worker_removed; + int64_t cache_eviction_stable_state_workers; int64_t cache_eviction_force_fail; int64_t cache_eviction_walks_active; int64_t cache_eviction_walks_started; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 9ee28317bc4..7c27baa9395 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1855,7 +1855,7 @@ struct __wt_connection { * threads WiredTiger will start to help evict pages from cache. The * number of threads started will vary depending on the current eviction * load. Each eviction worker thread uses a session from the configured - * session_max., an integer between 1 and 20; default \c 1.} + * session_max., an integer between 1 and 20; default \c 8.} * @config{    threads_min, minimum number of * threads WiredTiger will start to help evict pages from cache. 
The * number of threads currently running will vary depending on the @@ -2331,7 +2331,7 @@ struct __wt_connection { * WiredTiger will start to help evict pages from cache. The number of threads * started will vary depending on the current eviction load. Each eviction * worker thread uses a session from the configured session_max., an integer - * between 1 and 20; default \c 1.} + * between 1 and 20; default \c 8.} * @config{    threads_min, * minimum number of threads WiredTiger will start to help evict pages from * cache. The number of threads currently running will vary depending on the @@ -4429,396 +4429,404 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_STATE 1051 /*! cache: eviction walks abandoned */ #define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1052 +/*! cache: eviction worker thread active */ +#define WT_STAT_CONN_CACHE_EVICTION_ACTIVE_WORKERS 1053 +/*! cache: eviction worker thread created */ +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_CREATED 1054 /*! cache: eviction worker thread evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1053 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1055 +/*! cache: eviction worker thread removed */ +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_REMOVED 1056 +/*! cache: eviction worker thread stable number */ +#define WT_STAT_CONN_CACHE_EVICTION_STABLE_STATE_WORKERS 1057 /*! cache: failed eviction of pages that exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1054 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1058 /*! cache: files with active eviction walks */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1055 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1059 /*! cache: files with new eviction walks started */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1056 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1060 /*! 
cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1057 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1061 /*! cache: hazard pointer check calls */ -#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1058 +#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1062 /*! cache: hazard pointer check entries walked */ -#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1059 +#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1063 /*! cache: hazard pointer maximum array length */ -#define WT_STAT_CONN_CACHE_HAZARD_MAX 1060 +#define WT_STAT_CONN_CACHE_HAZARD_MAX 1064 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1061 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1065 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1062 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1066 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1063 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1067 /*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1064 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1068 /*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1065 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1069 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1066 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1070 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1067 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1071 /*! cache: maximum bytes configured */ -#define WT_STAT_CONN_CACHE_BYTES_MAX 1068 +#define WT_STAT_CONN_CACHE_BYTES_MAX 1072 /*! cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1069 +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1073 /*! 
cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1070 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1074 /*! cache: modified pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1071 +#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1075 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1072 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1076 /*! cache: overflow values cached in memory */ -#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1073 +#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1077 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1074 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1078 /*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1075 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1079 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1076 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1080 /*! cache: pages evicted because they exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1077 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1081 /*! cache: pages evicted because they had chains of deleted items */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1078 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1082 /*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1079 +#define WT_STAT_CONN_CACHE_EVICTION_APP 1083 /*! cache: pages queued for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1080 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1084 /*! cache: pages queued for urgent eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1081 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1085 /*! 
cache: pages queued for urgent eviction during walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1082 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1086 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1083 +#define WT_STAT_CONN_CACHE_READ 1087 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1084 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1088 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1085 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1089 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1086 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1090 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1087 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1091 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1088 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1092 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1089 +#define WT_STAT_CONN_CACHE_WRITE 1093 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1090 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1094 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1091 +#define WT_STAT_CONN_CACHE_OVERHEAD 1095 /*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1092 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1096 /*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1093 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1097 /*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1094 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1098 /*! 
cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1095 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1099 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1096 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1100 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1097 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1101 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1098 +#define WT_STAT_CONN_COND_AUTO_WAIT 1102 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1099 +#define WT_STAT_CONN_FILE_OPEN 1103 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1100 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1104 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1101 +#define WT_STAT_CONN_MEMORY_FREE 1105 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1102 +#define WT_STAT_CONN_MEMORY_GROW 1106 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1103 +#define WT_STAT_CONN_COND_WAIT 1107 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1104 +#define WT_STAT_CONN_RWLOCK_READ 1108 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1105 +#define WT_STAT_CONN_RWLOCK_WRITE 1109 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1106 +#define WT_STAT_CONN_FSYNC_IO 1110 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1107 +#define WT_STAT_CONN_READ_IO 1111 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1108 +#define WT_STAT_CONN_WRITE_IO 1112 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1109 +#define WT_STAT_CONN_CURSOR_CREATE 1113 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1110 +#define WT_STAT_CONN_CURSOR_INSERT 1114 /*! 
cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1111 +#define WT_STAT_CONN_CURSOR_NEXT 1115 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1112 +#define WT_STAT_CONN_CURSOR_PREV 1116 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1113 +#define WT_STAT_CONN_CURSOR_REMOVE 1117 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1114 +#define WT_STAT_CONN_CURSOR_RESET 1118 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1115 +#define WT_STAT_CONN_CURSOR_RESTART 1119 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1116 +#define WT_STAT_CONN_CURSOR_SEARCH 1120 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1117 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1121 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1118 +#define WT_STAT_CONN_CURSOR_UPDATE 1122 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1119 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1123 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1120 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1124 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1121 +#define WT_STAT_CONN_DH_SWEEP_REF 1125 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1122 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1126 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1123 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1127 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1124 +#define WT_STAT_CONN_DH_SWEEP_TOD 1128 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1125 +#define WT_STAT_CONN_DH_SWEEPS 1129 /*! 
data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1126 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1130 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1127 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1131 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1128 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1132 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1129 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1133 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1130 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1134 /*! lock: handle-list lock acquisitions */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_COUNT 1131 +#define WT_STAT_CONN_LOCK_HANDLE_LIST_COUNT 1135 /*! lock: handle-list lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_APPLICATION 1132 +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_APPLICATION 1136 /*! lock: handle-list lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_INTERNAL 1133 +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_INTERNAL 1137 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1134 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1138 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1135 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1139 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1136 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1140 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1137 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1141 /*! 
lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1138 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1142 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1139 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1143 /*! lock: table lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_COUNT 1140 +#define WT_STAT_CONN_LOCK_TABLE_COUNT 1144 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1141 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1145 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1142 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1146 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1143 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1147 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1144 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1148 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1145 +#define WT_STAT_CONN_LOG_SLOT_RACES 1149 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1146 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1150 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1147 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1151 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1148 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1152 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1149 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1153 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1150 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1154 /*! 
log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1151 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1155 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1152 +#define WT_STAT_CONN_LOG_FLUSH 1156 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1153 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1157 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1154 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1158 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1155 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1159 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1156 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1160 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1157 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1161 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1158 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1162 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1159 +#define WT_STAT_CONN_LOG_SCANS 1163 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1160 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1164 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1161 +#define WT_STAT_CONN_LOG_WRITE_LSN 1165 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1162 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1166 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1163 +#define WT_STAT_CONN_LOG_SYNC 1167 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1164 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1168 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1165 +#define WT_STAT_CONN_LOG_SYNC_DIR 1169 /*! 
log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1166 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1170 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1167 +#define WT_STAT_CONN_LOG_WRITES 1171 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1168 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1172 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1169 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1173 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1170 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1174 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1171 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1175 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1172 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1176 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1173 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1177 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1174 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1178 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1175 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1179 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1176 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1180 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1177 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1181 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1178 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1182 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1179 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1183 /*! 
reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1180 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1184 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1181 +#define WT_STAT_CONN_REC_PAGES 1185 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1182 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1186 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1183 +#define WT_STAT_CONN_REC_PAGE_DELETE 1187 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1184 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1188 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1185 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1189 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1186 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1190 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1187 +#define WT_STAT_CONN_SESSION_OPEN 1191 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1188 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1192 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1189 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1193 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1190 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1194 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1191 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1195 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1192 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1196 /*! 
session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1193 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1197 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1194 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1198 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1195 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1199 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1196 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1200 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1197 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1201 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1198 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1202 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1199 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1203 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1200 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1204 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1201 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1205 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1202 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1206 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1203 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1207 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1204 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1208 /*! 
session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1205 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1209 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1206 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1210 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1207 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1211 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1208 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1212 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1209 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1213 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1210 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1214 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1211 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1215 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1212 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1216 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1213 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1217 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1214 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1218 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1215 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1219 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1216 +#define WT_STAT_CONN_PAGE_SLEEP 1220 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1217 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1221 /*! 
transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1218 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1222 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1219 +#define WT_STAT_CONN_TXN_BEGIN 1223 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1220 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1224 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1221 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1225 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1222 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1226 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1223 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1227 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1224 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1228 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1225 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1229 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1226 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1230 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1227 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1231 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1228 +#define WT_STAT_CONN_TXN_CHECKPOINT 1232 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1229 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1233 /*! 
transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1230 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1234 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1231 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1235 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1232 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1236 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1233 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1237 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1234 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1238 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1235 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1239 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1236 +#define WT_STAT_CONN_TXN_SYNC 1240 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1237 +#define WT_STAT_CONN_TXN_COMMIT 1241 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1238 +#define WT_STAT_CONN_TXN_ROLLBACK 1242 /*! 
* @} diff --git a/src/support/stat.c b/src/support/stat.c index 66710473ab9..167d17137ce 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -677,7 +677,11 @@ static const char * const __stats_connection_desc[] = { "cache: eviction server unable to reach eviction goal", "cache: eviction state", "cache: eviction walks abandoned", + "cache: eviction worker thread active", + "cache: eviction worker thread created", "cache: eviction worker thread evicting pages", + "cache: eviction worker thread removed", + "cache: eviction worker thread stable number", "cache: failed eviction of pages that exceeded the in-memory maximum", "cache: files with active eviction walks", "cache: files with new eviction walks started", @@ -958,7 +962,11 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_eviction_slow = 0; /* not clearing cache_eviction_state */ stats->cache_eviction_walks_abandoned = 0; + /* not clearing cache_eviction_active_workers */ + stats->cache_eviction_worker_created = 0; stats->cache_eviction_worker_evicting = 0; + stats->cache_eviction_worker_removed = 0; + /* not clearing cache_eviction_stable_state_workers */ stats->cache_eviction_force_fail = 0; /* not clearing cache_eviction_walks_active */ stats->cache_eviction_walks_started = 0; @@ -1232,8 +1240,16 @@ __wt_stat_connection_aggregate( to->cache_eviction_state += WT_STAT_READ(from, cache_eviction_state); to->cache_eviction_walks_abandoned += WT_STAT_READ(from, cache_eviction_walks_abandoned); + to->cache_eviction_active_workers += + WT_STAT_READ(from, cache_eviction_active_workers); + to->cache_eviction_worker_created += + WT_STAT_READ(from, cache_eviction_worker_created); to->cache_eviction_worker_evicting += WT_STAT_READ(from, cache_eviction_worker_evicting); + to->cache_eviction_worker_removed += + WT_STAT_READ(from, cache_eviction_worker_removed); + to->cache_eviction_stable_state_workers += + WT_STAT_READ(from, cache_eviction_stable_state_workers); 
to->cache_eviction_force_fail += WT_STAT_READ(from, cache_eviction_force_fail); to->cache_eviction_walks_active += diff --git a/src/support/thread_group.c b/src/support/thread_group.c index a89468c367a..d04f8977a9a 100644 --- a/src/support/thread_group.c +++ b/src/support/thread_group.c @@ -71,12 +71,12 @@ __thread_group_grow( /* * __thread_group_shrink -- - * Decrease the number of running threads in the group, and free any + * Decrease the number of running threads in the group. Optionally free any * memory associated with slots larger than the new count. */ static int __thread_group_shrink(WT_SESSION_IMPL *session, - WT_THREAD_GROUP *group, uint32_t new_count) + WT_THREAD_GROUP *group, uint32_t new_count, bool free_thread) { WT_DECL_RET; WT_SESSION *wt_session; @@ -105,14 +105,15 @@ __thread_group_shrink(WT_SESSION_IMPL *session, WT_TRET(__wt_thread_join(session, thread->tid)); thread->tid = 0; } - - if (thread->session != NULL) { - wt_session = (WT_SESSION *)thread->session; - WT_TRET(wt_session->close(wt_session, NULL)); - thread->session = NULL; + if (free_thread) { + if (thread->session != NULL) { + wt_session = (WT_SESSION *)thread->session; + WT_TRET(wt_session->close(wt_session, NULL)); + thread->session = NULL; + } + __wt_free(session, thread); + group->threads[current_slot] = NULL; } - __wt_free(session, thread); - group->threads[current_slot] = NULL; } /* Update the thread group state to match our changes */ @@ -145,11 +146,14 @@ __thread_group_resize( if (new_min == group->min && new_max == group->max) return (0); + if (new_min > new_max) + return (EINVAL); + /* - * Coll shrink to reduce the number of thread structures and running + * Call shrink to reduce the number of thread structures and running * threads if required by the change in group size. 
*/ - WT_RET(__thread_group_shrink(session, group, new_max)); + WT_RET(__thread_group_shrink(session, group, new_max, true)); /* * Only reallocate the thread array if it is the largest ever, since @@ -289,7 +293,7 @@ __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); /* Shut down all threads and free associated resources. */ - WT_TRET(__thread_group_shrink(session, group, 0)); + WT_TRET(__thread_group_shrink(session, group, 0, true)); __wt_free(session, group->threads); @@ -332,3 +336,30 @@ __wt_thread_group_start_one( return (ret); } + +/* + * __wt_thread_group_stop_one -- + * Stop one thread if possible. + */ +int +__wt_thread_group_stop_one( + WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) +{ + WT_DECL_RET; + + if (group->current_threads <= group->min) + return (0); + + if (wait) + __wt_writelock(session, &group->lock); + else if (__wt_try_writelock(session, &group->lock) != 0) + return (0); + + /* Recheck the bounds now that we hold the lock */ + if (group->current_threads > group->min) + WT_TRET(__thread_group_shrink( + session, group, group->current_threads - 1, false)); + __wt_writeunlock(session, &group->lock); + + return (ret); +} diff --git a/tools/wtstats/stat_data.py b/tools/wtstats/stat_data.py index 5d385cda705..a94ce524ae3 100644 --- a/tools/wtstats/stat_data.py +++ b/tools/wtstats/stat_data.py @@ -128,6 +128,8 @@ no_clear_list = [ 'cache: eviction currently operating in aggressive mode', 'cache: eviction empty score', 'cache: eviction state', + 'cache: eviction worker thread active', + 'cache: eviction worker thread stable number', 'cache: files with active eviction walks', 'cache: maximum bytes configured', 'cache: maximum page size at eviction', -- cgit v1.2.1 From 4c461ebc2009d73a9b6e2ef0ee57bdfeba270064 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Thu, 29 Dec 2016 14:56:21 -0500 Subject: WT-3104 Fix single threaded eviction configurations. 
(#3221) --- bench/wtperf/runners/evict-btree-1.wtperf | 2 +- bench/wtperf/runners/evict-lsm-1.wtperf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bench/wtperf/runners/evict-btree-1.wtperf b/bench/wtperf/runners/evict-btree-1.wtperf index 24da4dd7902..741101d083f 100644 --- a/bench/wtperf/runners/evict-btree-1.wtperf +++ b/bench/wtperf/runners/evict-btree-1.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict btree configuration -conn_config="cache_size=50M" +conn_config="cache_size=50M,eviction=(threads_max=1)" table_config="type=file" icount=10000000 report_interval=5 diff --git a/bench/wtperf/runners/evict-lsm-1.wtperf b/bench/wtperf/runners/evict-lsm-1.wtperf index ad885d98eb7..641a85dc889 100644 --- a/bench/wtperf/runners/evict-lsm-1.wtperf +++ b/bench/wtperf/runners/evict-lsm-1.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict lsm configuration -conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6)" +conn_config="cache_size=50M,eviction=(threads_max=1),lsm_manager=(worker_thread_max=6)" table_config="type=lsm,lsm=(chunk_size=2M),os_cache_dirty_max=16MB" compact=true icount=10000000 -- cgit v1.2.1 From 5af64580f5be08d2f8900b96a83d29a3ae2cf04a Mon Sep 17 00:00:00 2001 From: sueloverso Date: Wed, 4 Jan 2017 00:55:11 -0500 Subject: SERVER-16796 Recovery progress via verbose messages. 
(#3225) --- dist/api_data.py | 1 + dist/flags.py | 1 + src/config/config_def.c | 35 ++++++++++++++++++++--------------- src/conn/conn_api.c | 1 + src/include/flags.h | 19 ++++++++++--------- src/include/wiredtiger.in | 12 ++++++------ src/log/log.c | 9 +++++++++ src/txn/txn_recover.c | 2 +- 8 files changed, 49 insertions(+), 31 deletions(-) diff --git a/dist/api_data.py b/dist/api_data.py index 04071a84332..324d1e4f281 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -537,6 +537,7 @@ connection_runtime_config = [ 'rebalance', 'reconcile', 'recovery', + 'recovery_progress', 'salvage', 'shared_cache', 'split', diff --git a/dist/flags.py b/dist/flags.py index 320bd8f6fb9..70e18712839 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -76,6 +76,7 @@ flags = { 'VERB_REBALANCE', 'VERB_RECONCILE', 'VERB_RECOVERY', + 'VERB_RECOVERY_PROGRESS', 'VERB_SALVAGE', 'VERB_SHARED_CACHE', 'VERB_SPLIT', diff --git a/src/config/config_def.c b/src/config/config_def.c index 83c1436eade..6a93c1d05e2 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -149,9 +149,10 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," + "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," + "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -751,9 +752,10 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," 
"\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," + "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," + "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -837,9 +839,10 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," + "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," + "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -918,9 +921,10 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," + "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," + "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "version", "string", 
NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -999,9 +1003,10 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," + "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," + "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 474b8bbad8a..50617240d38 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -1811,6 +1811,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "rebalance", WT_VERB_REBALANCE }, { "reconcile", WT_VERB_RECONCILE }, { "recovery", WT_VERB_RECOVERY }, + { "recovery_progress", WT_VERB_RECOVERY_PROGRESS }, { "salvage", WT_VERB_SALVAGE }, { "shared_cache", WT_VERB_SHARED_CACHE }, { "split", WT_VERB_SPLIT }, diff --git a/src/include/flags.h b/src/include/flags.h index e7a5ba066df..2f0c207078a 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -102,15 +102,16 @@ #define WT_VERB_REBALANCE 0x00008000 #define WT_VERB_RECONCILE 0x00010000 #define WT_VERB_RECOVERY 0x00020000 -#define WT_VERB_SALVAGE 0x00040000 -#define WT_VERB_SHARED_CACHE 0x00080000 -#define WT_VERB_SPLIT 0x00100000 -#define WT_VERB_TEMPORARY 0x00200000 -#define WT_VERB_THREAD_GROUP 0x00400000 -#define WT_VERB_TRANSACTION 0x00800000 -#define WT_VERB_VERIFY 0x01000000 -#define WT_VERB_VERSION 0x02000000 -#define WT_VERB_WRITE 0x04000000 +#define WT_VERB_RECOVERY_PROGRESS 0x00040000 +#define WT_VERB_SALVAGE 0x00080000 +#define 
WT_VERB_SHARED_CACHE 0x00100000 +#define WT_VERB_SPLIT 0x00200000 +#define WT_VERB_TEMPORARY 0x00400000 +#define WT_VERB_THREAD_GROUP 0x00800000 +#define WT_VERB_TRANSACTION 0x01000000 +#define WT_VERB_VERIFY 0x02000000 +#define WT_VERB_VERSION 0x04000000 +#define WT_VERB_WRITE 0x08000000 #define WT_VISIBILITY_ERR 0x00000080 /* * flags section: END diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 7c27baa9395..90989cc679d 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1985,9 +1985,9 @@ struct __wt_connection { * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, * \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c - * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c - * "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c - * "write"; default empty.} + * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, + * \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c "verify"\, + * \c "version"\, \c "write"; default empty.} * @configend * @errors */ @@ -2516,9 +2516,9 @@ struct __wt_connection { * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, * \c "handleops"\, \c "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c - * "recovery"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, - * \c "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c - * "write"; default empty.} + * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c + * "split"\, \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c + * "verify"\, \c "version"\, \c "write"; default empty.} * @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to * files. Ignored on non-Windows systems. 
Options are given as a list\, such * as "write_through=[data]". Configuring \c write_through requires diff --git a/src/log/log.c b/src/log/log.c index fb3935abf81..74c5442d405 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -1674,6 +1674,10 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, &log_fh, WT_LOG_FILENAME, start_lsn.l.file, WT_LOG_OPEN_VERIFY)); WT_ERR(__wt_filesize(session, log_fh, &log_size)); rd_lsn = start_lsn; + if (LF_ISSET(WT_LOGSCAN_RECOVER)) + __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, + "Recovering log %" PRIu32 " through %" PRIu32, + rd_lsn.l.file, end_lsn.l.file); WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf)); WT_ERR(__wt_scr_alloc(session, 0, &decryptitem)); @@ -1722,6 +1726,11 @@ advance: WT_ERR(__log_openfile(session, &log_fh, WT_LOG_FILENAME, rd_lsn.l.file, WT_LOG_OPEN_VERIFY)); + if (LF_ISSET(WT_LOGSCAN_RECOVER)) + __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, + "Recovering log %" PRIu32 + " through %" PRIu32, + rd_lsn.l.file, end_lsn.l.file); WT_ERR(__wt_filesize(session, log_fh, &log_size)); eol = false; continue; diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index a6390dcbd06..2d8a77a69e6 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -501,7 +501,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. 
*/ r.metadata_only = false; - __wt_verbose(session, WT_VERB_RECOVERY, + __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, "Main recovery loop: starting at %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); -- cgit v1.2.1 From 49e48315235a189ed769c43e35e6a73b9a074fa2 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 4 Jan 2017 00:57:52 -0500 Subject: WT-3100 test bug: format is weighted to delete, insert, then write operations (#3219) test/format was weighted to delete, insert, then write operations, which meant that configuring insert=80 might have no effect, if the randomly assigned percentage of delete operations was 95. Rewrite the code that calculates operation percentages to assign operation percentages in a random order, add an explicit read percentage instead of making all non-allocated operations default to reads. --- test/format/config.c | 104 +++++++++++++++++++++++++++++++++++++++++---------- test/format/config.h | 18 +++++---- test/format/format.h | 1 + 3 files changed, 97 insertions(+), 26 deletions(-) diff --git a/test/format/config.c b/test/format/config.c index cf922b5db04..43447c9ba02 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -44,6 +44,7 @@ static void config_map_compression(const char *, u_int *); static void config_map_encryption(const char *, u_int *); static void config_map_file_type(const char *, u_int *); static void config_map_isolation(const char *, u_int *); +static void config_pct(void); static void config_reset(void); /* @@ -159,31 +160,19 @@ config_setup(void) config_encryption(); config_isolation(); config_lrt(); + config_pct(); /* - * Periodically, set the delete percentage to 0 so salvage gets run, - * as long as the delete percentage isn't nailed down. - * Don't do it on the first run, all our smoke tests would hit it. 
- */ - if (!g.replay && g.run_cnt % 10 == 9 && !config_is_perm("delete_pct")) - config_single("delete_pct=0", 0); - - /* - * If this is an LSM run, set the cache size and crank up the insert - * percentage. + * If this is an LSM run, ensure cache size sanity. + * Ensure there is at least 1MB of cache per thread. */ - if (DATASOURCE("lsm")) { - if (!config_is_perm("cache")) + if (!config_is_perm("cache")) { + if (DATASOURCE("lsm")) g.c_cache = 30 * g.c_chunk_size; - - if (!config_is_perm("insert_pct")) - g.c_insert_pct = mmrand(NULL, 50, 85); + if (g.c_cache < g.c_threads) + g.c_cache = g.c_threads; } - /* Ensure there is at least 1MB of cache per thread. */ - if (!config_is_perm("cache") && g.c_cache < g.c_threads) - g.c_cache = g.c_threads; - /* Give in-memory configuration a final review. */ config_in_memory_check(); @@ -481,6 +470,83 @@ config_lrt(void) } } +/* + * config_pct -- + * Configure operation percentages. + */ +static void +config_pct(void) +{ + static struct { + const char *name; /* Operation */ + uint32_t *vp; /* Value store */ + u_int order; /* Order of assignment */ + } list[] = { +#define CONFIG_DELETE_ENTRY 0 + { "delete_pct", &g.c_delete_pct, 0 }, + { "insert_pct", &g.c_insert_pct, 0 }, + { "read_pct", &g.c_read_pct, 0 }, + { "write_pct", &g.c_write_pct, 0 }, + }; + u_int i, max_order, max_slot, n, pct; + + /* + * Walk the list of operations, checking for an illegal configuration + * and creating a random order in the list. + */ + pct = 0; + for (i = 0; i < WT_ELEMENTS(list); ++i) + if (config_is_perm(list[i].name)) + pct += *list[i].vp; + else + list[i].order = mmrand(NULL, 0, 1000); + if (pct > 100) + testutil_die(EINVAL, + "operation percentages total to more than 100%%"); + + /* + * If the delete percentage isn't nailed down, periodically set it to + * 0 so salvage gets run. Don't do it on the first run, all our smoke + * tests would hit it. 
+ */ + if (!config_is_perm("delete_pct") && !g.replay && g.run_cnt % 10 == 9) { + list[CONFIG_DELETE_ENTRY].order = 0; + *list[CONFIG_DELETE_ENTRY].vp = 0; + } + + /* + * Walk the list, allocating random numbers of operations in a random + * order. + * + * If the "order" field is non-zero, we need to create a value for this + * operation. Find the largest order field in the array; if one non-zero + * order field is found, it's the last entry and gets the remainder of + * the operations. + */ + for (pct = 100 - pct;;) { + for (i = n = + max_order = max_slot = 0; i < WT_ELEMENTS(list); ++i) { + if (list[i].order != 0) + ++n; + if (list[i].order > max_order) { + max_order = list[i].order; + max_slot = i; + } + } + if (n == 0) + break; + if (n == 1) { + *list[max_slot].vp = pct; + break; + } + *list[max_slot].vp = mmrand(NULL, 0, pct); + list[max_slot].order = 0; + pct -= *list[max_slot].vp; + } + testutil_assert(g.c_delete_pct + + g.c_insert_pct + g.c_read_pct + g.c_write_pct == 100); +} + /* * config_error -- * Display configuration information on error. 
diff --git a/test/format/config.h b/test/format/config.h index e4f7af2e1b2..e3e1e73a786 100644 --- a/test/format/config.h +++ b/test/format/config.h @@ -131,7 +131,7 @@ static CONFIG c[] = { { "delete_pct", "percent operations that are deletes", - 0x0, 0, 45, 90, &g.c_delete_pct, NULL }, + C_IGNORE, 0, 0, 100, &g.c_delete_pct, NULL }, { "dictionary", "if values are dictionary compressed", /* 20% */ @@ -171,7 +171,7 @@ static CONFIG c[] = { { "insert_pct", "percent operations that are inserts", - 0x0, 0, 45, 90, &g.c_insert_pct, NULL }, + C_IGNORE, 0, 0, 100, &g.c_insert_pct, NULL }, { "internal_key_truncation", "if internal keys are truncated", /* 95% */ @@ -254,6 +254,14 @@ static CONFIG c[] = { "quiet run (same as -q)", C_IGNORE|C_BOOL, 0, 0, 0, &g.c_quiet, NULL }, + { "read_pct", + "percent operations that are reads", + C_IGNORE, 0, 0, 100, &g.c_read_pct, NULL }, + + { "rebalance", + "rebalance testing", /* 100% */ + C_BOOL, 100, 1, 0, &g.c_rebalance, NULL }, + { "repeat_data_pct", "percent duplicate values in row- or var-length column-stores", 0x0, 0, 90, 90, &g.c_repeat_data_pct, NULL }, @@ -270,10 +278,6 @@ static CONFIG c[] = { "the number of runs", C_IGNORE, 0, UINT_MAX, UINT_MAX, &g.c_runs, NULL }, - { "rebalance", - "rebalance testing", /* 100% */ - C_BOOL, 100, 1, 0, &g.c_rebalance, NULL }, - { "salvage", "salvage testing", /* 100% */ C_BOOL, 100, 1, 0, &g.c_salvage, NULL }, @@ -320,7 +324,7 @@ static CONFIG c[] = { { "write_pct", "percent operations that are writes", - 0x0, 0, 90, 90, &g.c_write_pct, NULL }, + C_IGNORE, 0, 0, 100, &g.c_write_pct, NULL }, { NULL, NULL, 0x0, 0, 0, 0, NULL, NULL } }; diff --git a/test/format/format.h b/test/format/format.h index c1f4875dbb2..6bb44410acc 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -192,6 +192,7 @@ typedef struct { uint32_t c_reverse; uint32_t c_rows; uint32_t c_runs; + uint32_t c_read_pct; uint32_t c_rebalance; uint32_t c_salvage; uint32_t c_split_pct; -- cgit v1.2.1 From 
b47f127c8d935e2a9815970eb1309d6e4b417549 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 4 Jan 2017 01:13:25 -0500 Subject: WT-3099 lint: static function declarations, non-text characters in documentation (#3218) * Remove characters outside the ISO/IEC 8859-1 character set in documentation. Add --encoding=iso-8859-1 to the aspell check line to avoid in the future. * __dump_txn_state and __dump_cache were prototyped static but not declared static. * Clang sanitizer complaint: ret set but never read because the error label sets ret explicitly. --- dist/s_docs | 3 ++- src/docs/spell.ok | 3 ++- src/docs/tune-page-size-and-comp.dox | 8 ++++---- src/evict/evict_lru.c | 10 ++++------ src/utilities/util_dump.c | 16 ++++++++-------- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/dist/s_docs b/dist/s_docs index f4332257193..6ebffb947ec 100755 --- a/dist/s_docs +++ b/dist/s_docs @@ -96,7 +96,8 @@ spellchk() type aspell > /dev/null 2>&1 || return (cd ../src/docs && - cat *.dox | aspell --lang=en --personal=./spell.ok list) | + cat *.dox | + aspell --encoding=iso-8859-1 --lang=en --personal=./spell.ok list) | sort -u > $t test -s $t && { echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" diff --git a/src/docs/spell.ok b/src/docs/spell.ok index f87f24cef5c..bc2e16b1122 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -50,8 +50,8 @@ LDFLAGS LIBS LLVM LOGREC -LRVv LRU +LRVv LSB LSM LZ @@ -168,6 +168,7 @@ dNLen dNOff dT dataN +database's dataitem dataset datasets diff --git a/src/docs/tune-page-size-and-comp.dox b/src/docs/tune-page-size-and-comp.dox index 70e9875bcc4..96b0fda2333 100644 --- a/src/docs/tune-page-size-and-comp.dox +++ b/src/docs/tune-page-size-and-comp.dox @@ -40,7 +40,7 @@ of these blocks is defined by a parameter called allocation_size, which is the underlying unit of allocation for the file the data gets stored in. 
An application might choose to have data compressed before it gets stored to disk by enabling block compression. - - A database’s tables are usually much larger than the main memory available. + - A database's tables are usually much larger than the main memory available. Not all of the data can be kept in memory at any given time. A process called eviction takes care of making space for new data by freeing the memory of data infrequently accessed. An eviction server regularly finds in-memory pages that @@ -52,7 +52,7 @@ associated key is used to refer to an in-memory page. In the case of this page not being in memory, appropriate on-disk page(s) are read and an in-memory page constructed (the opposite of reconciliation). A data structure is maintained on every in-memory page to store any insertions or modifications to the data done -on that page. As more and more data gets written to this page, the page’s memory +on that page. As more and more data gets written to this page, the page's memory footprint keeps growing. - An application can choose to set the maximum size a page is allowed to grow in-memory. A default size is set by WiredTiger if the application doesn't @@ -81,7 +81,7 @@ There are additional configuration settings that tune more esoteric and specialized data. Those are included for completeness but are rarely changed. @subsection memory_page_max memory_page_max -The maximum size a table’s page is allowed to grow to in memory before being +The maximum size a table's page is allowed to grow to in memory before being reconciled to disk. - An integer, with acceptable values between 512B and 10TB - Default size: 5 MB @@ -98,7 +98,7 @@ both require exclusive access to the page which makes an application's write operations wait. Having a large memory_page_max means that the pages will need to be split and reconciled less often. 
But when that happens, the duration that an exclusive access to the page is required is longer, increasing the latency of -an application’s insert or update operations. Conversely, having a smaller +an application's insert or update operations. Conversely, having a smaller memory_page_max reduces the time taken for splitting and reconciling the pages, but causes it to happen more frequently, forcing more frequent but shorter exclusive accesses to the pages. diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 485fd0e6d40..a03c1f16dec 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1619,7 +1619,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, if (page->read_gen == WT_READGEN_NOTSET) __wt_cache_read_gen_new(session, page); - /* Pages we no longer need (clean or dirty), are found money. */ + /* Pages being forcibly evicted go on the urgent queue. */ if (page->read_gen == WT_READGEN_OLDEST || page->memory_footprint >= btree->splitmempage) { WT_STAT_CONN_INCR( @@ -1629,7 +1629,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, continue; } - /* Pages that are empty or from dead trees are also good. */ + /* Pages that are empty or from dead trees are fast-tracked. */ if (__wt_page_is_empty(page) || F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) goto fast; @@ -2154,13 +2154,11 @@ __wt_evict_priority_clear(WT_SESSION_IMPL *session) } #ifdef HAVE_DIAGNOSTIC -static int __dump_txn_state(WT_SESSION_IMPL *, FILE *fp); -static int __dump_cache(WT_SESSION_IMPL *, FILE *fp); /* * __dump_txn_state -- * Output debugging information about the global transaction state. */ -int +static int __dump_txn_state(WT_SESSION_IMPL *session, FILE *fp) { WT_CONNECTION_IMPL *conn; @@ -2259,7 +2257,7 @@ __dump_txn_state(WT_SESSION_IMPL *session, FILE *fp) * __dump_cache -- * Output debugging information about the size of the files in cache. 
*/ -int +static int __dump_cache(WT_SESSION_IMPL *session, FILE *fp) { WT_CONNECTION_IMPL *conn; diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index 95cd39322c4..3f8b4a49dfe 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -81,13 +81,13 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) return (usage()); if (json && - ((ret = dump_json_begin(session)) != 0 || - (ret = dump_prefix(session, hex, json)) != 0)) + (dump_json_begin(session) != 0 || + dump_prefix(session, hex, json) != 0)) goto err; for (i = 0; i < argc; i++) { if (json && i > 0) - if ((ret = dump_json_separator(session)) != 0) + if (dump_json_separator(session) != 0) goto err; free(name); free(simplename); @@ -120,7 +120,7 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) } if ((simplename = strdup(name)) == NULL) { - ret = util_err(session, errno, NULL); + (void)util_err(session, errno, NULL); goto err; } if ((p = strchr(simplename, '(')) != NULL) @@ -128,19 +128,19 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) if (dump_config(session, simplename, cursor, hex, json) != 0) goto err; - if ((ret = dump_record(cursor, reverse, json)) != 0) + if (dump_record(cursor, reverse, json) != 0) goto err; - if (json && (ret = dump_json_table_end(session)) != 0) + if (json && dump_json_table_end(session) != 0) goto err; ret = cursor->close(cursor); cursor = NULL; if (ret != 0) { - ret = util_err(session, ret, NULL); + (void)util_err(session, ret, NULL); goto err; } } - if (json && ((ret = dump_json_end(session)) != 0)) + if (json && dump_json_end(session) != 0) goto err; if (0) { -- cgit v1.2.1 From 3a211a245a3b9198fdc0618bd0e2d3d97ff8171c Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 4 Jan 2017 09:53:58 -0500 Subject: WT-3100 test bug: format is weighted to delete, insert, then write operations. Bug fix. 
--- test/format/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/format/config.c b/test/format/config.c index 43447c9ba02..50430fe073e 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -499,7 +499,7 @@ config_pct(void) if (config_is_perm(list[i].name)) pct += *list[i].vp; else - list[i].order = mmrand(NULL, 0, 1000); + list[i].order = mmrand(NULL, 1, 1000); if (pct > 100) testutil_die(EINVAL, "operation percentages total to more than 100%%"); -- cgit v1.2.1 From 0f8cb7b38a85e1afe6c91d49222b8baab4525ad0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 4 Jan 2017 11:03:38 -0500 Subject: WT-2898 Improve performance of eviction-heavy workloads by dynamically controlling the number of eviction threads lint fix --- src/evict/evict_lru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index a03c1f16dec..08cafcf32ed 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -559,7 +559,7 @@ __evict_pass(WT_SESSION_IMPL *session) prev = now; if (conn->evict_threads.threads[0]->session == session) - __evict_tune_workers(session); + WT_RET(__evict_tune_workers(session)); /* * Increment the shared read generation. Do this occasionally * even if eviction is not currently required, so that pages -- cgit v1.2.1 From 4cde2ec263744da56807a84309d0f910c6ab2636 Mon Sep 17 00:00:00 2001 From: Eric Milkie Date: Wed, 4 Jan 2017 15:24:13 -0500 Subject: WT-3109 correct named snapshots documentation (#3231) --- src/docs/transactions.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/docs/transactions.dox b/src/docs/transactions.dox index bbbd2d52296..3b438eda366 100644 --- a/src/docs/transactions.dox +++ b/src/docs/transactions.dox @@ -141,7 +141,7 @@ as if the transaction started at the time of the WT_SESSION::snapshot call that created the snapshot. 
Named snapshots keep data pinned in cache as if a real transaction were -running for the time that the named transaction is active. The resources +running for the time that the named snapshot is active. The resources associated with named snapshots should be released by calling WT_SESSION::snapshot with a configuration that includes "drop=". See WT_SESSION::snapshot documentation for details of -- cgit v1.2.1 From 8255cfa17271e33cd1de1c240c49e9ec511aa4c6 Mon Sep 17 00:00:00 2001 From: Sulabh Mahajan Date: Sat, 7 Jan 2017 03:54:40 +1100 Subject: WT-3108 Add disk memory size to verify debug output (#3226) * WT-3108 Add disk memory size to verify debug output * Check for page->dsk to be not NULL before accessing disk mem size --- src/btree/bt_debug.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 957ccdbea1a..b62125e069d 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -679,8 +679,11 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) } WT_RET(ds->f(ds, ": %s\n", __wt_page_type_string(page->type))); - WT_RET(ds->f(ds, - "\t" "disk %p, entries %" PRIu32, (void *)page->dsk, entries)); + WT_RET(ds->f(ds, "\t" "disk %p", (void *)page->dsk)); + if (page->dsk != NULL) + WT_RET(ds->f( + ds, ", dsk_mem_size %" PRIu32, page->dsk->mem_size)); + WT_RET(ds->f(ds, ", entries %" PRIu32, entries)); WT_RET(ds->f(ds, ", %s", __wt_page_is_modified(page) ? "dirty" : "clean")); WT_RET(ds->f(ds, ", %s", __wt_rwlock_islocked( -- cgit v1.2.1 From 9dabbaf2da6ecdd337436f815bb6468802b9c07a Mon Sep 17 00:00:00 2001 From: sueloverso Date: Fri, 6 Jan 2017 15:40:54 -0500 Subject: WT-3112 Add lock statistics to try_lock path. 
(#3233) --- src/evict/evict_lru.c | 4 ++-- src/include/mutex.i | 30 ++++++++++++++++++++++++++++++ src/include/schema.h | 2 +- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 08cafcf32ed..2cedb1d49c3 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -326,7 +326,7 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) * otherwise we can block applications evicting large pages. */ if (!__wt_cache_stuck(session)) { - for (spins = 0; (ret = __wt_spin_trylock( + for (spins = 0; (ret = __wt_spin_trylock_track( session, &conn->dhandle_lock)) == EBUSY && cache->pass_intr == 0; spins++) { if (spins < WT_THOUSAND) @@ -1264,7 +1264,7 @@ retry: while (slot < max_entries) { * reference count to keep it alive while we sweep. */ if (!dhandle_locked) { - for (spins = 0; (ret = __wt_spin_trylock( + for (spins = 0; (ret = __wt_spin_trylock_track( session, &conn->dhandle_lock)) == EBUSY && cache->pass_intr == 0; spins++) { diff --git a/src/include/mutex.i b/src/include/mutex.i index a6309e0976b..a9abef5be70 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -300,3 +300,33 @@ __wt_spin_lock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) } else __wt_spin_lock(session, t); } + +/* + * __wt_spin_trylock_track -- + * Try to lock a spinlock or fail immediately if it is busy. + * Track if successful. 
+ */ +static inline int +__wt_spin_trylock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + struct timespec enter, leave; + WT_DECL_RET; + int64_t **stats; + + if (t->stat_count_off != -1 && WT_STAT_ENABLED(session)) { + __wt_epoch(session, &enter); + ret = __wt_spin_trylock(session, t); + __wt_epoch(session, &leave); + WT_RET(ret); + stats = (int64_t **)S2C(session)->stats; + stats[session->stat_bucket][t->stat_count_off]++; + if (F_ISSET(session, WT_SESSION_INTERNAL)) + stats[session->stat_bucket][t->stat_int_usecs_off] += + (int64_t)WT_TIMEDIFF_US(leave, enter); + else + stats[session->stat_bucket][t->stat_app_usecs_off] += + (int64_t)WT_TIMEDIFF_US(leave, enter); + } else + ret = __wt_spin_trylock(session, t); + return (ret); +} diff --git a/src/include/schema.h b/src/include/schema.h index a17affb7660..bb116e5cf2f 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -102,7 +102,7 @@ struct __wt_table { ret = 0; \ if (F_ISSET(session, (flag))) { \ op; \ - } else if ((ret = __wt_spin_trylock(session, lock)) == 0) { \ + } else if ((ret = __wt_spin_trylock_track(session, lock)) == 0) {\ F_SET(session, (flag)); \ op; \ F_CLR(session, (flag)); \ -- cgit v1.2.1 From 216903ac097f61ee787f08296b2f3be298f54087 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Mon, 9 Jan 2017 07:22:10 -0500 Subject: WT-3112 Time the eviction try-lock for the dhandle overall, not per-attempt (#3235) * WT-3112 Time the eviction try-lock for the dhandle overall, not per-attempt. * Fix comment * Some style preference and nits. * lint - my spell checker knows backoff * Minor refactoring. 
--- src/evict/evict_lru.c | 78 +++++++++++++++++++++++++++++++++++++-------------- src/include/mutex.i | 17 ++--------- 2 files changed, 60 insertions(+), 35 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 2cedb1d49c3..ba8851812cb 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -23,6 +23,59 @@ static int __evict_walk_file( #define WT_EVICT_HAS_WORKERS(s) \ (S2C(s)->evict_threads.current_threads > 1) +/* + * __evict_lock_dhandle -- + * Try to get the dhandle lock, with yield and sleep back off. + * Keep timing statistics overall. + */ +static int +__evict_lock_dhandle(WT_SESSION_IMPL *session) +{ + struct timespec enter, leave; + WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SPINLOCK *dh_lock; + int64_t **stats; + u_int spins; + bool dh_stats; + + conn = S2C(session); + cache = conn->cache; + dh_lock = &conn->dhandle_lock; + stats = (int64_t **)conn->stats; + dh_stats = WT_STAT_ENABLED(session) && dh_lock->stat_count_off != -1; + + /* + * Maintain lock acquisition timing statistics as if this were a + * regular lock acquisition. + */ + if (dh_stats) + __wt_epoch(session, &enter); + /* + * Use a custom lock acquisition back off loop so the eviction server + * notices any interrupt quickly. + */ + for (spins = 0; + (ret = __wt_spin_trylock_track(session, dh_lock)) == EBUSY && + cache->pass_intr == 0; spins++) { + if (spins < WT_THOUSAND) + __wt_yield(); + else + __wt_sleep(0, WT_THOUSAND); + } + /* + * Only record statistics on success. + */ + WT_RET(ret); + if (dh_stats) { + __wt_epoch(session, &leave); + stats[session->stat_bucket][dh_lock->stat_int_usecs_off] += + (int64_t)WT_TIMEDIFF_US(leave, enter); + } + return (0); +} + /* * __evict_entry_priority -- * Get the adjusted read generation for an eviction entry. 
@@ -307,7 +360,6 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) struct timespec now; #endif uint64_t orig_pages_evicted; - u_int spins; conn = S2C(session); cache = conn->cache; @@ -326,21 +378,14 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) * otherwise we can block applications evicting large pages. */ if (!__wt_cache_stuck(session)) { - for (spins = 0; (ret = __wt_spin_trylock_track( - session, &conn->dhandle_lock)) == EBUSY && - cache->pass_intr == 0; spins++) { - if (spins < WT_THOUSAND) - __wt_yield(); - else - __wt_sleep(0, WT_THOUSAND); - } + /* * If we gave up acquiring the lock, that indicates a * session is waiting for us to clear walks. Do that * as part of a normal pass (without the handle list * lock) to avoid deadlock. */ - if (ret == EBUSY) + if ((ret = __evict_lock_dhandle(session)) == EBUSY) return (0); WT_RET(ret); ret = __evict_clear_all_walks(session); @@ -1226,7 +1271,7 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - u_int max_entries, retries, slot, spins, start_slot, total_candidates; + u_int max_entries, retries, slot, start_slot, total_candidates; bool dhandle_locked, incr; conn = S2C(session); @@ -1264,16 +1309,7 @@ retry: while (slot < max_entries) { * reference count to keep it alive while we sweep. 
*/ if (!dhandle_locked) { - for (spins = 0; (ret = __wt_spin_trylock_track( - session, &conn->dhandle_lock)) == EBUSY && - cache->pass_intr == 0; - spins++) { - if (spins < WT_THOUSAND) - __wt_yield(); - else - __wt_sleep(0, WT_THOUSAND); - } - WT_ERR(ret); + WT_ERR(__evict_lock_dhandle(session)); dhandle_locked = true; } diff --git a/src/include/mutex.i b/src/include/mutex.i index a9abef5be70..6b83cb280d3 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -309,24 +309,13 @@ __wt_spin_lock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) static inline int __wt_spin_trylock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) { - struct timespec enter, leave; - WT_DECL_RET; int64_t **stats; if (t->stat_count_off != -1 && WT_STAT_ENABLED(session)) { - __wt_epoch(session, &enter); - ret = __wt_spin_trylock(session, t); - __wt_epoch(session, &leave); - WT_RET(ret); + WT_RET(__wt_spin_trylock(session, t)); stats = (int64_t **)S2C(session)->stats; stats[session->stat_bucket][t->stat_count_off]++; - if (F_ISSET(session, WT_SESSION_INTERNAL)) - stats[session->stat_bucket][t->stat_int_usecs_off] += - (int64_t)WT_TIMEDIFF_US(leave, enter); - else - stats[session->stat_bucket][t->stat_app_usecs_off] += - (int64_t)WT_TIMEDIFF_US(leave, enter); + return (0); } else - ret = __wt_spin_trylock(session, t); - return (ret); + return (__wt_spin_trylock(session, t)); } -- cgit v1.2.1 From 247b3a5f2c2b2d8ab53d151fa18a23143501c2b0 Mon Sep 17 00:00:00 2001 From: David Hows Date: Tue, 10 Jan 2017 11:13:46 +1100 Subject: WT-3106 Add truncate operation to wt command line utility (#3227) --- SConstruct | 1 + build_posix/Make.base | 1 + src/docs/command-line.dox | 13 ++++++ src/docs/upgrading.dox | 7 ++++ src/utilities/util.h | 1 + src/utilities/util_main.c | 5 +++ src/utilities/util_truncate.c | 51 ++++++++++++++++++++++++ test/suite/test_util14.py | 92 +++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 171 insertions(+) create mode 100644 src/utilities/util_truncate.c create 
mode 100644 test/suite/test_util14.py diff --git a/SConstruct b/SConstruct index df7a66238e8..e9e72630b11 100644 --- a/SConstruct +++ b/SConstruct @@ -313,6 +313,7 @@ wtbin = env.Program("wt", [ "src/utilities/util_rename.c", "src/utilities/util_salvage.c", "src/utilities/util_stat.c", + "src/utilities/util_truncate.c", "src/utilities/util_upgrade.c", "src/utilities/util_verbose.c", "src/utilities/util_verify.c", diff --git a/build_posix/Make.base b/build_posix/Make.base index 9354eb4b183..e5228fac885 100644 --- a/build_posix/Make.base +++ b/build_posix/Make.base @@ -36,6 +36,7 @@ wt_SOURCES =\ src/utilities/util_rename.c \ src/utilities/util_salvage.c \ src/utilities/util_stat.c \ + src/utilities/util_truncate.c \ src/utilities/util_upgrade.c \ src/utilities/util_verbose.c \ src/utilities/util_verify.c \ diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox index 5726a1d19a1..df52324f8f8 100644 --- a/src/docs/command-line.dox +++ b/src/docs/command-line.dox @@ -369,6 +369,19 @@ The following are command-specific options for the \c stat command: Include only "fast" statistics in the output (equivalent to passing statistics=(fast)) to WT_SESSION::open_cursor. +
+@section util_truncate wt truncate +Truncate a table, removing all data. + +The \c truncate command truncates the specified \c uri. It is equivalent to a +call to WT_SESSION::truncate with no start or stop specified. + +@subsection util_truncate_synopsis Synopsis +wt [-RVv] [-C config] [-E secretkey ] [-h directory] truncate uri + +@subsection util_truncate_options Options +The \c truncate command has no command-specific options. +
@section util_upgrade wt upgrade Upgrade a table. diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 59a299d48a1..4a356f7da61 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -1,6 +1,13 @@ /*! @page upgrading Upgrading WiredTiger applications @section version_292 Upgrading to Version 2.9.2 +
+
WiredTiger Utility now supports truncate
+
+The WiredTiger Utility can now \c truncate an object, removing all contents +from the specified object. +
+
@section version_291 Upgrading to Version 2.9.1 diff --git a/src/utilities/util.h b/src/utilities/util.h index 2658d877b63..cf12d7d4aa6 100644 --- a/src/utilities/util.h +++ b/src/utilities/util.h @@ -49,6 +49,7 @@ int util_rename(WT_SESSION *, int, char *[]); int util_salvage(WT_SESSION *, int, char *[]); int util_stat(WT_SESSION *, int, char *[]); int util_str2recno(WT_SESSION *, const char *p, uint64_t *recnop); +int util_truncate(WT_SESSION *, int, char *[]); int util_upgrade(WT_SESSION *, int, char *[]); int util_verify(WT_SESSION *, int, char *[]); int util_write(WT_SESSION *, int, char *[]); diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index 1da56adf137..001a66d6d9e 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -175,6 +175,10 @@ main(int argc, char *argv[]) config = "statistics=(all)"; } break; + case 't' : + if (strcmp(command, "truncate") == 0) + func = util_truncate; + break; case 'u': if (strcmp(command, "upgrade") == 0) func = util_upgrade; @@ -272,6 +276,7 @@ usage(void) "\t" "rename\t rename an object\n" "\t" "salvage\t salvage a file\n" "\t" "stat\t display statistics for an object\n" + "\t" "truncate truncate an object, removing all content\n" "\t" "upgrade\t upgrade an object\n" "\t" "verify\t verify an object\n" "\t" "write\t write values to an object\n"); diff --git a/src/utilities/util_truncate.c b/src/utilities/util_truncate.c new file mode 100644 index 00000000000..9325c0d7e84 --- /dev/null +++ b/src/utilities/util_truncate.c @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "util.h" + +static int usage(void); + +int +util_truncate(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + char *name; + + while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) + switch (ch) { + case '?': + default: + return (usage()); + } + + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the uri. */ + if (argc != 1) + return (usage()); + if ((name = util_name(session, *argv, "table")) == NULL) + return (1); + + if ((ret = session->truncate(session, name, NULL, NULL, NULL)) != 0) + return (util_err(session, ret, "%s: session.truncate", name)); + + free(name); + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "truncate uri\n", + progname, usage_prefix); + return (1); +} diff --git a/test/suite/test_util14.py b/test/suite/test_util14.py new file mode 100644 index 00000000000..e2a9f41f0d4 --- /dev/null +++ b/test/suite/test_util14.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +# test_util14.py +# Utilities: wt truncate +class test_util14(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_util14.a' + nentries = 1000 + + def test_truncate_process(self): + """ + Test truncate in a 'wt' process + """ + params = 'key_format=S,value_format=S' + self.session.create('table:' + self.tablename, params) + self.assertTrue(os.path.exists(self.tablename + ".wt")) + cursor = self.session.open_cursor('table:' + self.tablename, None, None) + for i in range(0, self.nentries): + cursor[str(i)] = str(i) + cursor.close() + + self.runWt(["truncate", "table:" + self.tablename]) + + """ + Test to confirm table exists and is empty + """ + outfile="outfile.txt" + errfile="errfile.txt" + self.assertTrue(os.path.exists(self.tablename + ".wt")) + self.runWt(["read", 'table:' + self.tablename, 'NoMatch'], + outfilename=outfile, errfilename=errfile, failure=True) + self.check_empty_file(outfile) + self.check_file_contains(errfile, 'NoMatch: not found\n') + + """ + Tests for error cases + 1. Missing URI + 2. Invalid URI + 3. Valid but incorrect URI + 4. 
Double URI + """ + self.runWt(["truncate"], + outfilename=outfile, errfilename=errfile, failure=True) + self.check_empty_file(outfile) + self.check_file_contains(errfile, 'usage:') + + self.runWt(["truncate", "foobar"], + outfilename=outfile, errfilename=errfile, failure=True) + self.check_empty_file(outfile) + self.check_file_contains(errfile, 'No such file or directory') + + self.runWt(["truncate", 'table:xx' + self.tablename], + outfilename=outfile, errfilename=errfile, failure=True) + self.check_empty_file(outfile) + self.check_file_contains(errfile, 'No such file or directory') + + self.runWt(["truncate", 'table:' + self.tablename, 'table:' + self.tablename], + outfilename=outfile, errfilename=errfile, failure=True) + self.check_empty_file(outfile) + self.check_file_contains(errfile, 'usage:') + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1 From aa1961b0056db9a3b38243d328b07f2d48d90f3d Mon Sep 17 00:00:00 2001 From: sueloverso Date: Mon, 9 Jan 2017 19:25:52 -0500 Subject: WT-3105 Create all eviction sessions initially to avoid deadlock. (#3237) --- src/evict/evict_lru.c | 32 +++++--------------------------- src/include/connection.h | 2 +- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index ba8851812cb..948c1e1139e 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -437,18 +437,11 @@ __wt_evict_create(WT_SESSION_IMPL *session) /* * Create the eviction thread group. - * We don't set the group size to the maximum allowed sessions, - * because this may have adverse memory effects. Instead, - * we set the group's maximum to a small value. The code - * that tunes the number of workers will increase the - * maximum if necessary. + * Set the group size to the maximum allowed sessions. 
*/ WT_RET(__wt_thread_group_create(session, &conn->evict_threads, - "eviction-server", conn->evict_threads_min, - WT_MAX(conn->evict_threads_min, - WT_MIN(conn->evict_threads_max, EVICT_GROUP_INCR)), - WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, - __wt_evict_thread_run)); + "eviction-server", conn->evict_threads_min, conn->evict_threads_max, + WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, __wt_evict_thread_run)); /* * Allow queues to be populated now that the eviction threads @@ -921,7 +914,7 @@ __evict_tune_workers(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; uint64_t cur_threads, delta_msec, delta_pages, i, target_threads; uint64_t pgs_evicted_cur, pgs_evicted_persec_cur; - uint32_t new_max, thread_surplus; + uint32_t thread_surplus; conn = S2C(session); cache = conn->cache; @@ -1033,23 +1026,8 @@ __evict_tune_workers(WT_SESSION_IMPL *session) target_threads = WT_MIN(cur_threads + EVICT_TUNE_BATCH, conn->evict_threads_max); /* - * Resize the group to allow for an additional batch of threads. - * We resize the group in increments of a few sessions. - * Allocating the group to accommodate the maximum number of - * workers has adverse effects on performance due to memory - * effects, so we gradually ramp up the allocation. + * Start the new threads. */ - if (conn->evict_threads.max < target_threads) { - new_max = WT_MIN(conn->evict_threads.max + - EVICT_GROUP_INCR, conn->evict_threads_max); - - WT_RET(__wt_thread_group_resize( - session, &conn->evict_threads, - conn->evict_threads_min, new_max, - WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL)); - } - - /* Now actually start the new threads. 
*/ for (i = 0; i < (target_threads - cur_threads); ++i) { WT_RET(__wt_thread_group_start_one(session, &conn->evict_threads, false)); diff --git a/src/include/connection.h b/src/include/connection.h index 665275440cf..7d2b78e9f66 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -107,7 +107,7 @@ struct __wt_named_extractor { * Allocate some additional slots for internal sessions so the user cannot * configure too few sessions for us to run. */ -#define WT_EXTRA_INTERNAL_SESSIONS 10 +#define WT_EXTRA_INTERNAL_SESSIONS 20 /* * WT_CONN_CHECK_PANIC -- -- cgit v1.2.1 From c6c95a82915a6f0b96a4c514cf0b29e74bcd2f0a Mon Sep 17 00:00:00 2001 From: David Hows Date: Thu, 12 Jan 2017 16:10:13 +1100 Subject: WT-3110 Add more test cases for the WT command line utility (#3232) --- test/suite/test_util15.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++ test/suite/test_util16.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++ test/suite/test_util17.py | 57 +++++++++++++++++++++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 test/suite/test_util15.py create mode 100644 test/suite/test_util16.py create mode 100644 test/suite/test_util17.py diff --git a/test/suite/test_util15.py b/test/suite/test_util15.py new file mode 100644 index 00000000000..33096e71bee --- /dev/null +++ b/test/suite/test_util15.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. 
We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +# test_util15.py +# Utilities: wt alter +class test_util15(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_util15.a' + + def test_alter_process(self): + """ + Test alter in a 'wt' process + """ + params = 'key_format=S,value_format=S' + self.session.create('table:' + self.tablename, params) + self.assertTrue(os.path.exists(self.tablename + ".wt")) + + """ + Alter access pattern and confirm + """ + acc_pat_seq="access_pattern_hint=sequential" + self.runWt(["alter", "table:" + self.tablename, acc_pat_seq]) + cursor = self.session.open_cursor("metadata:create", None, None) + cursor.set_key("table:" + self.tablename) + self.assertEqual(cursor.search(),0) + string = cursor.get_value() + cursor.close() + self.assertTrue(acc_pat_seq in string) + + """ + Alter access pattern again and confirm + """ + acc_pat_rand="access_pattern_hint=random" + self.runWt(["alter", "table:" + self.tablename, acc_pat_rand]) + cursor = self.session.open_cursor("metadata:create", None, None) + cursor.set_key("table:" + self.tablename) + self.assertEqual(cursor.search(),0) + string = cursor.get_value() + cursor.close() + self.assertTrue(acc_pat_rand in string) + +if 
__name__ == '__main__': + wttest.run() diff --git a/test/suite/test_util16.py b/test/suite/test_util16.py new file mode 100644 index 00000000000..00e68c1017a --- /dev/null +++ b/test/suite/test_util16.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +import os +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +# test_util16.py +# Utilities: wt rename +class test_util16(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_util16.a' + tablename2 = 'test_util16.b' + nentries = 1000 + + def test_rename_process(self): + """ + Test rename in a 'wt' process + """ + params = 'key_format=S,value_format=S' + self.session.create('table:' + self.tablename, params) + self.assertTrue(os.path.exists(self.tablename + ".wt")) + cursor = self.session.open_cursor('table:' + self.tablename, None, None) + for i in range(0, self.nentries): + cursor[str(i)] = str(i) + cursor.close() + + self.runWt(["rename", "table:" + self.tablename, "table:" + self.tablename2]) + self.assertTrue(os.path.exists(self.tablename2 + ".wt")) + cursor = self.session.open_cursor('table:' + self.tablename2, None, None) + count = 0 + while cursor.next() == 0: + count +=1 + cursor.close() + self.assertEquals(self.nentries, count) + + self.runWt(["rename", "table:" + self.tablename2, "table:" + self.tablename]) + self.assertTrue(os.path.exists(self.tablename + ".wt")) + cursor = self.session.open_cursor('table:' + self.tablename, None, None) + count = 0 + while cursor.next() == 0: + count +=1 + cursor.close() + self.assertEquals(self.nentries, count) + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_util17.py b/test/suite/test_util17.py new file mode 100644 index 00000000000..decc1fabf1d --- /dev/null +++ b/test/suite/test_util17.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means.
+# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +# test_util17.py +# Utilities: wt stat +class test_util17(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_util17.a' + + def test_stat_process(self): + """ + Test stat in a 'wt' process + This test is just here to confirm that stat produces a correct looking + output, it isn't here to do statistics validation. 
+ """ + params = 'key_format=S,value_format=S' + outfile = "wt-stat.out" + expected_string = "cursor: cursor create calls=" + self.session.create('table:' + self.tablename, params) + self.assertTrue(os.path.exists(self.tablename + ".wt")) + self.runWt(["stat"], outfilename=outfile) + self.check_file_contains(outfile, expected_string) + + expected_string = "cache_walk: Entries in the root page=1" + self.runWt(["stat", "table:" + self.tablename ], outfilename=outfile) + self.check_file_contains(outfile, expected_string) + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1 From 4d85f283c3607ada88922eb7579fb9aa6ee73ba3 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 12 Jan 2017 20:22:40 -0500 Subject: bug: dist/s_all script has misplaced quote causing bad error reporting (#3243) --- dist/s_all | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dist/s_all b/dist/s_all index 4c9d4eccebb..d7a69b08644 100755 --- a/dist/s_all +++ b/dist/s_all @@ -97,10 +97,10 @@ COMMANDS=" 2>&1 ./s_string > ${t_pfx}s_string 2>&1 ./s_tags > ${t_pfx}tags 2>&1 ./s_typedef -c > ${t_pfx}s_typedef_c -2>&1 ./s_void > ${t_pfx}s_void" +2>&1 ./s_void > ${t_pfx}s_void 2>&1 ./s_whitespace > ${t_pfx}s_whitespace 2>&1 ./s_win > ${t_pfx}s_win -2>&1 python style.py > ${t_pfx}py_style +2>&1 python style.py > ${t_pfx}py_style" # Parallelize if possible. xp="" -- cgit v1.2.1 From ec9b2bd417be1fad7484335390385c2a669fc407 Mon Sep 17 00:00:00 2001 From: David Hows Date: Fri, 13 Jan 2017 23:24:33 +1100 Subject: WT-3116 Change s_all to avoid missing error returns on older platforms (#3246) --- dist/s_all | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/dist/s_all b/dist/s_all index d7a69b08644..be33657e640 100755 --- a/dist/s_all +++ b/dist/s_all @@ -57,7 +57,7 @@ errchk() # Some tests shouldn't return an error, we exclude them here. 
case "$1" in *s_export|*s_tags) - break;; + ;; *) errfound=1;; esac @@ -111,14 +111,13 @@ fi echo "$COMMANDS" | xargs $xp -I{} /bin/sh -c {} for f in `find . -name ${t_pfx}\*`; do - if ! `test -s $f`; then - continue + if `test -s $f`; then + LOCAL_NAME=`basename $f` + # Find original command and trim redirect garbage + FAILED_CMD=`echo "$COMMANDS" | grep $LOCAL_NAME | \ + sed -e 's/ >.*//' -e 's/.* //'` + errchk "$FAILED_CMD" $f fi - LOCAL_NAME=`basename $f` - # Find original command and trim redirect garbage - FAILED_CMD=`echo "$COMMANDS" | grep $LOCAL_NAME | \ - sed -e 's/ >.*//' -e 's/.* //'` - errchk "$FAILED_CMD" $f done echo 'dist/s_all run finished' -- cgit v1.2.1 From 36c9a6513bee481a7ef27f0696a88f9b1921c356 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Sun, 15 Jan 2017 20:31:47 -0500 Subject: WT-3114 Avoid archiving log files immediately after recovery. (#3238) --- src/txn/txn_log.c | 16 +++++++++------- test/suite/test_reconfig02.py | 1 + test/suite/test_txn02.py | 4 +++- test/suite/test_txn05.py | 6 +++++- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c index 5f4704b40c4..7ad295f421b 100644 --- a/src/txn/txn_log.c +++ b/src/txn/txn_log.c @@ -368,14 +368,16 @@ __wt_txn_checkpoint_log( /* * If this full checkpoint completed successfully and there is - * no hot backup in progress, tell the logging subsystem the - * checkpoint LSN so that it can archive. Do not update the - * logging checkpoint LSN if this is during a clean connection - * close, only during a full checkpoint. A clean close may not - * update any metadata LSN and we do not want to archive in - * that case. + * no hot backup in progress and this is not recovery, tell + * the logging subsystem the checkpoint LSN so that it can + * archive. Do not update the logging checkpoint LSN if this + * is during a clean connection close, only during a full + * checkpoint. 
A clean close may not update any metadata LSN + * and we do not want to archive in that case. */ - if (!S2C(session)->hot_backup && txn->full_ckpt) + if (!S2C(session)->hot_backup && + !F_ISSET(S2C(session), WT_CONN_RECOVERING) && + txn->full_ckpt) __wt_log_ckpt(session, ckpt_lsn); /* FALLTHROUGH */ diff --git a/test/suite/test_reconfig02.py b/test/suite/test_reconfig02.py index 36a78a1805f..8054b2a6ab5 100644 --- a/test/suite/test_reconfig02.py +++ b/test/suite/test_reconfig02.py @@ -109,6 +109,7 @@ class test_reconfig02(wttest.WiredTigerTestCase): # Now turn on archive, sleep a bit to allow the archive thread # to run and then confirm that all original logs are gone. self.conn.reconfigure("log=(archive=true)") + self.session.checkpoint("force") time.sleep(2) cur_logs = fnmatch.filter(os.listdir('.'), "*Log*") for o in orig_logs: diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py index a0c2c12a47c..7c2a58516bc 100644 --- a/test/suite/test_txn02.py +++ b/test/suite/test_txn02.py @@ -176,8 +176,10 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): backup_conn = self.wiredtiger_open(self.backup_dir, backup_conn_params) try: - self.check(backup_conn.open_session(), None, committed) + session = backup_conn.open_session() finally: + session.checkpoint("force") + self.check(backup_conn.open_session(), None, committed) # Sleep long enough so that the archive thread is guaranteed # to run before we close the connection. 
time.sleep(1.0) diff --git a/test/suite/test_txn05.py b/test/suite/test_txn05.py index 9e84fe7d3fe..5913c4688a3 100644 --- a/test/suite/test_txn05.py +++ b/test/suite/test_txn05.py @@ -139,8 +139,12 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess): backup_conn = self.wiredtiger_open(self.backup_dir, backup_conn_params) try: - self.check(backup_conn.open_session(), None, committed) + session = backup_conn.open_session() finally: + self.check(session, None, committed) + # Force a checkpoint because we don't record the recovery + # checkpoint as available for archiving. + session.checkpoint("force") # Sleep long enough so that the archive thread is guaranteed # to run before we close the connection. time.sleep(1.0) -- cgit v1.2.1 From 67f96585500a67236e6df2d633acf64dfe16fe5f Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Sun, 15 Jan 2017 22:21:24 -0500 Subject: WT-3121 In the test suite, create a standard way to load extensions (#3241) * In the test suite, create a standard way to load extensions. Most examples of overriding setUpConnectionOpen() can now be handled by a combination of conn_config (as variable or method) and conn_extensions (as variable or method). * conn_config when defined as a method, now only takes the self argument, clean up the callers. * Refactored several more tests to use conn_config() in favor of overriding setUpConnectionOpen(). 
--- test/suite/run.py | 3 +- test/suite/test_async01.py | 2 +- test/suite/test_async02.py | 2 +- test/suite/test_backup04.py | 2 +- test/suite/test_bug011.py | 2 +- test/suite/test_collator.py | 29 +--------------- test/suite/test_compress01.py | 20 +++-------- test/suite/test_cursor07.py | 2 +- test/suite/test_cursor08.py | 20 +++-------- test/suite/test_encrypt01.py | 39 +++++---------------- test/suite/test_encrypt02.py | 46 ++++++------------------ test/suite/test_encrypt03.py | 36 ++++--------------- test/suite/test_encrypt04.py | 42 +++++++--------------- test/suite/test_encrypt05.py | 39 +++++---------------- test/suite/test_encrypt06.py | 38 ++++---------------- test/suite/test_encrypt07.py | 33 +++--------------- test/suite/test_join03.py | 28 +-------------- test/suite/test_join04.py | 28 +-------------- test/suite/test_join07.py | 28 +-------------- test/suite/test_readonly01.py | 3 +- test/suite/test_schema05.py | 28 +-------------- test/suite/test_schema07.py | 3 +- test/suite/test_stat02.py | 2 +- test/suite/test_txn02.py | 22 +++++------- test/suite/test_txn04.py | 22 +++++------- test/suite/test_txn05.py | 22 +++++------- test/suite/test_txn06.py | 4 +-- test/suite/test_txn07.py | 53 +++++++++------------------- test/suite/test_txn08.py | 2 +- test/suite/test_txn09.py | 17 +++------ test/suite/test_txn11.py | 2 +- test/suite/test_txn13.py | 2 +- test/suite/test_txn15.py | 2 +- test/suite/wttest.py | 81 +++++++++++++++++++++++++++++++++++++++---- 34 files changed, 207 insertions(+), 497 deletions(-) diff --git a/test/suite/run.py b/test/suite/run.py index ba6d9f78503..97c58bfdccf 100644 --- a/test/suite/run.py +++ b/test/suite/run.py @@ -324,7 +324,8 @@ if __name__ == '__main__': # All global variables should be set before any test classes are loaded. # That way, verbose printing can be done at the class definition level. 
wttest.WiredTigerTestCase.globalSetup(preserve, timestamp, gdbSub, - verbose, dirarg, longtest) + verbose, wt_builddir, dirarg, + longtest) # Without any tests listed as arguments, do discovery if len(testargs) == 0: diff --git a/test/suite/test_async01.py b/test/suite/test_async01.py index cbb3dad8de6..158c16a9381 100644 --- a/test/suite/test_async01.py +++ b/test/suite/test_async01.py @@ -132,7 +132,7 @@ class test_async01(wttest.WiredTigerTestCase, suite_subprocess): ]) # Enable async for this test. - def conn_config(self, dir): + def conn_config(self): return 'async=(enabled=true,ops_max=%s,' % self.async_ops + \ 'threads=%s)' % self.async_threads diff --git a/test/suite/test_async02.py b/test/suite/test_async02.py index 50652da6dfd..28435fe85b2 100644 --- a/test/suite/test_async02.py +++ b/test/suite/test_async02.py @@ -129,7 +129,7 @@ class test_async02(wttest.WiredTigerTestCase, suite_subprocess): ]) # Enable async for this test. - def conn_config(self, dir): + def conn_config(self): return 'async=(enabled=true,ops_max=%s,' % self.async_ops + \ 'threads=%s)' % self.async_threads diff --git a/test/suite/test_backup04.py b/test/suite/test_backup04.py index 919649fed57..be52a5e1e97 100644 --- a/test/suite/test_backup04.py +++ b/test/suite/test_backup04.py @@ -60,7 +60,7 @@ class test_backup_target(wttest.WiredTigerTestCase, suite_subprocess): ]) # Create a large cache, otherwise this test runs quite slowly. - def conn_config(self, dir): + def conn_config(self): return 'cache_size=1G,log=(archive=false,enabled,file_max=%s)' % \ self.logmax diff --git a/test/suite/test_bug011.py b/test/suite/test_bug011.py index 969aaeb5b39..5e0721b93f1 100644 --- a/test/suite/test_bug011.py +++ b/test/suite/test_bug011.py @@ -43,7 +43,7 @@ class test_bug011(wttest.WiredTigerTestCase): nrows = 10000 nops = 10000 # Add connection configuration for this test. 
- def conn_config(self, dir): + def conn_config(self): return 'cache_size=1GB' @wttest.longtest("Eviction copes with lots of files") diff --git a/test/suite/test_collator.py b/test/suite/test_collator.py index 3fae4ff47cb..e7be557335e 100644 --- a/test/suite/test_collator.py +++ b/test/suite/test_collator.py @@ -48,34 +48,7 @@ class test_collator(wttest.WiredTigerTestCase): nentries = 100 nindices = 4 - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, libname) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor'), - ('collators', 'revint', 'revint_collator')]) - connarg = 'create,error_prefix="{0}: ",{1}'.format( - self.shortid(), extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + conn_extensions = [ 'extractors/csv', 'collators/revint' ] def create_indices(self): # Create self.nindices index files, each with a column from the CSV diff --git a/test/suite/test_compress01.py b/test/suite/test_compress01.py index 606f7b63235..ef1064d294e 100644 --- a/test/suite/test_compress01.py +++ b/test/suite/test_compress01.py @@ -51,22 +51,10 @@ class test_compress01(wttest.WiredTigerTestCase): nrecords = 10000 bigvalue = "abcdefghij" * 1000 - # Load the compression extension, compression is enabled elsewhere. 
- def conn_config(self, dir): - return self.extensionArg(self.compress) - - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, name): - if name == None: - return '' - - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext/compressors') - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('compression extension "' + extfile + '" not built') - return ',extensions=["' + extfile + '"]' + # Load the compression extension, skip the test if missing + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('compressors', self.compress) # Create a table, add keys with both big and small values, then verify them. def test_compress(self): diff --git a/test/suite/test_cursor07.py b/test/suite/test_cursor07.py index d6078183fc1..19db718fd11 100644 --- a/test/suite/test_cursor07.py +++ b/test/suite/test_cursor07.py @@ -49,7 +49,7 @@ class test_cursor07(wttest.WiredTigerTestCase, suite_subprocess): ('reopen', dict(reopen=True)) ]) # Enable logging for this test. - def conn_config(self, dir): + def conn_config(self): return 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ 'transaction_sync="(method=dsync,enabled)"' diff --git a/test/suite/test_cursor08.py b/test/suite/test_cursor08.py index 3f8f50defa7..cc76f528aa9 100644 --- a/test/suite/test_cursor08.py +++ b/test/suite/test_cursor08.py @@ -54,24 +54,14 @@ class test_cursor08(wttest.WiredTigerTestCase, suite_subprocess): ] scenarios = make_scenarios(reopens, compress) # Load the compression extension, and enable it for logging. 
- def conn_config(self, dir): + def conn_config(self): return 'log=(archive=false,enabled,file_max=%s,' % self.logmax + \ 'compressor=%s),' % self.compress + \ - 'transaction_sync="(method=dsync,enabled)",' + \ - self.extensionArg(self.compress) + 'transaction_sync="(method=dsync,enabled)"' - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, name): - if name == None or name == 'none': - return '' - - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext/compressors') - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('compression extension "' + extfile + '" not built') - return ',extensions=["' + extfile + '"]' + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('compressors', self.compress) def test_log_cursor(self): # print "Creating %s with config '%s'" % (self.uri, self.create_params) diff --git a/test/suite/test_encrypt01.py b/test/suite/test_encrypt01.py index 746c9d13e96..317bed93246 100644 --- a/test/suite/test_encrypt01.py +++ b/test/suite/test_encrypt01.py @@ -66,41 +66,20 @@ class test_encrypt01(wttest.WiredTigerTestCase): nrecords = 5000 bigvalue = "abcdefghij" * 1001 # len(bigvalue) = 10010 - # Override WiredTigerTestCase, we have extensions. 
- def setUpConnectionOpen(self, dir): + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('encryptors', self.sys_encrypt) + extlist.extension('encryptors', self.file_encrypt) + extlist.extension('compressors', self.block_compress) + extlist.extension('compressors', self.log_compress) + + def conn_config(self): encarg = 'encryption=(name={0}{1}),'.format( self.sys_encrypt, self.sys_encrypt_args) comparg = '' if self.log_compress != None: comparg='log=(compressor={0}),'.format(self.log_compress) - extarg = self.extensionArg([('encryptors', self.sys_encrypt), - ('encryptors', self.file_encrypt), - ('compressors', self.block_compress), - ('compressors', self.log_compress)]) - conn = self.wiredtiger_open(dir, - 'create,error_prefix="{0}: ",{1}{2}{3}'.format( - self.shortid(), encarg, comparg, extarg)) - self.pr(`conn`) - return conn - - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' + return encarg + comparg # Create a table, add keys with both big and small values, then verify them. 
def test_encrypt(self): diff --git a/test/suite/test_encrypt02.py b/test/suite/test_encrypt02.py index 648686274c4..2d3b8a29b13 100644 --- a/test/suite/test_encrypt02.py +++ b/test/suite/test_encrypt02.py @@ -39,51 +39,25 @@ from wtscenario import make_scenarios class test_encrypt02(wttest.WiredTigerTestCase, suite_subprocess): uri = 'file:test_encrypt02' encrypt_type = [ - ('noarg', dict( encrypt='rotn', encrypt_args='name=rotn', - secret_arg=None)), - ('keyid', dict( encrypt='rotn', encrypt_args='name=rotn,keyid=11', - secret_arg=None)), - ('pass', dict( encrypt='rotn', encrypt_args='name=rotn', - secret_arg='ABC')), - ('keyid-pass', dict( encrypt='rotn', encrypt_args='name=rotn,keyid=11', - secret_arg='ABC')), + ('noarg', dict( conn_extensions=[ 'encryptors/rotn' ], + encrypt_args='name=rotn', secret_arg=None)), + ('keyid', dict( conn_extensions=[ 'encryptors/rotn' ], + encrypt_args='name=rotn,keyid=11', secret_arg=None)), + ('pass', dict( conn_extensions=[ 'encryptors/rotn' ], + encrypt_args='name=rotn', secret_arg='ABC')), + ('keyid-pass', dict( conn_extensions=[ 'encryptors/rotn' ], + encrypt_args='name=rotn,keyid=11', secret_arg='ABC')), ] scenarios = make_scenarios(encrypt_type) nrecords = 5000 bigvalue = "abcdefghij" * 1001 # len(bigvalue) = 10010 - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. 
- def setUpConnectionOpen(self, dir): + def conn_config(self): secretarg = '' if self.secret_arg != None: secretarg = ',secretkey=' + self.secret_arg - encarg = 'encryption=({0}{1})'.format(self.encrypt_args, secretarg) - extarg = self.extensionArg([('encryptors', self.encrypt)]) - connarg = 'create,error_prefix="{0}: ",{1},{2}'.format( - self.shortid(), encarg, extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + return 'encryption=({0}{1})'.format(self.encrypt_args, secretarg) # Create a table, add keys with both big and small values, then verify them. def test_pass(self): diff --git a/test/suite/test_encrypt03.py b/test/suite/test_encrypt03.py index cf459190637..0809c16c6d1 100644 --- a/test/suite/test_encrypt03.py +++ b/test/suite/test_encrypt03.py @@ -50,37 +50,13 @@ class test_encrypt03(wttest.WiredTigerTestCase): ] scenarios = make_scenarios(types, encrypt) - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - encarg = 'encryption=(name={0}{1}),'.format( - self.sys_encrypt, self.sys_encrypt_args) - extarg = self.extensionArg([('encryptors', self.sys_encrypt), - ('encryptors', self.file_encrypt)]) - self.pr('encarg = ' + encarg + ' extarg = ' + extarg) - conn = self.wiredtiger_open(dir, - 'create,error_prefix="{0}: ",{1}{2}'.format( - self.shortid(), encarg, extarg)) - self.pr(`conn`) - return conn + def conn_extensions(self, extlist): + extlist.extension('encryptors', self.sys_encrypt) + extlist.extension('encryptors', self.file_encrypt) - # Return the wiredtiger_open extension argument for a shared library. 
- def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' + def conn_config(self): + return 'encryption=(name={0}{1}),'.format( + self.sys_encrypt, self.sys_encrypt_args) # Create a table with encryption values that are in error. def test_encrypt(self): diff --git a/test/suite/test_encrypt04.py b/test/suite/test_encrypt04.py index a244cf97961..a10e6c28831 100644 --- a/test/suite/test_encrypt04.py +++ b/test/suite/test_encrypt04.py @@ -77,9 +77,15 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): wttest.WiredTigerTestCase.__init__(self, *args, **kwargs) self.part = 1 + def conn_extensions(self, extlist): + extarg = None + if self.expect_forceerror: + extarg='(config=\"rotn_force_error=true\")' + extlist.extension('encryptors', self.name, extarg) + # Override WiredTigerTestCase, we have extensions. 
def setUpConnectionOpen(self, dir): - forceerror = None + self.expect_forceerror = False if self.part == 1: self.name = self.name1 self.keyid = self.keyid1 @@ -93,16 +99,15 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): self.fileinclear = self.fileinclear2 if \ hasattr(self, 'fileinclear2') else False if hasattr(self, 'forceerror1') and hasattr(self, 'forceerror2'): - forceerror = "rotn_force_error=true" - self.expect_forceerror = forceerror != None + self.expect_forceerror = True self.got_forceerror = False encarg = 'encryption=(name={0},keyid={1},secretkey={2}),'.format( self.name, self.keyid, self.secretkey) - # If forceerror is set for this test, add a config arg to - # the extension string. That signals rotn to return a (-1000) - # error code, which we'll detect here. - extarg = self.extensionArg([('encryptors', self.name, forceerror)]) + # If forceerror is set for this test, conn_extensions adds a + # config arg to the extension string. That signals rotn to + # return a (-1000) error code, which we'll detect here. + extarg = self.extensionsConfig() self.pr('encarg = ' + encarg + ' extarg = ' + extarg) completed = False try: @@ -135,29 +140,6 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): self.assertEqual(cursor.search(), 0) self.assertEquals(cursor.get_value(), val) - # Return the wiredtiger_open extension argument for a shared library. 
- def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, extarg) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - extfile = '"' + extfile + '"' - if not extfile in extfiles: - s = extfile - if extarg != None: - s += "=(config=\"" + extarg + "\")" - extfiles.append(s) - if len(extfiles) == 0: - return '' - else: - return ',extensions=[' + ','.join(extfiles) + ']' - # Evaluate expression, which either must succeed (if expect_okay) # or must fail (if !expect_okay). def check_okay(self, expect_okay, expr): diff --git a/test/suite/test_encrypt05.py b/test/suite/test_encrypt05.py index 19a3522b3d5..d8862321821 100644 --- a/test/suite/test_encrypt05.py +++ b/test/suite/test_encrypt05.py @@ -49,41 +49,20 @@ class test_encrypt05(wttest.WiredTigerTestCase): nrecords = 500 bigvalue = 'a' * 500 # we use values that will definitely give compression - # Override WiredTigerTestCase, we have extensions. 
- def setUpConnectionOpen(self, dir): + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('encryptors', self.sys_encrypt) + extlist.extension('encryptors', self.file_encrypt) + extlist.extension('compressors', self.block_compress) + extlist.extension('compressors', self.log_compress) + + def conn_config(self): encarg = 'encryption=(name={0}{1}),'.format( self.sys_encrypt, self.sys_encrypt_args) comparg = '' if self.log_compress != None: comparg='log=(compressor={0}),'.format(self.log_compress) - extarg = self.extensionArg([('encryptors', self.sys_encrypt), - ('encryptors', self.file_encrypt), - ('compressors', self.block_compress), - ('compressors', self.log_compress)]) - conn = self.wiredtiger_open(dir, - 'create,error_prefix="{0}: ",{1}{2}{3}'.format( - self.shortid(), encarg, comparg, extarg)) - self.pr(`conn`) - return conn - - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' + return encarg + comparg def getvalue(self, r, n): if n < len(self.bigvalue): diff --git a/test/suite/test_encrypt06.py b/test/suite/test_encrypt06.py index 893c4ba3095..3dd7ac17eff 100644 --- a/test/suite/test_encrypt06.py +++ b/test/suite/test_encrypt06.py @@ -89,38 +89,14 @@ class test_encrypt06(wttest.WiredTigerTestCase): scenarios = make_scenarios(encrypt, storagetype) nrecords = 1000 - # Override WiredTigerTestCase, we have extensions. 
- def setUpConnectionOpen(self, dir): - encarg = 'encryption=(name={0}{1}),'.format( + def conn_extensions(self, extlist): + extlist.extension('encryptors', self.sys_encrypt) + extlist.extension('encryptors', self.file0_encrypt) + extlist.extension('encryptors', self.file1_encrypt) + + def conn_config(self): + return 'encryption=(name={0}{1}),'.format( self.sys_encrypt, self.sys_encrypt_args) - comparg = '' - extarg = self.extensionArg([('encryptors', self.sys_encrypt), - ('encryptors', self.file0_encrypt), - ('encryptors', self.file1_encrypt)]) - self.open_params = 'create,error_prefix="{0}: ",{1}{2}{3}'.format( - self.shortid(), encarg, comparg, extarg) - conn = self.wiredtiger_open(dir, self.open_params) - self.pr(`conn`) - return conn - - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' def encrypt_file_params(self, name, args): if name == None: diff --git a/test/suite/test_encrypt07.py b/test/suite/test_encrypt07.py index 97ab1987d4f..1c342783353 100644 --- a/test/suite/test_encrypt07.py +++ b/test/suite/test_encrypt07.py @@ -44,35 +44,12 @@ class test_encrypt07(test_salvage.test_salvage): nrecords = 5000 bigvalue = "abcdefghij" * 1007 # len(bigvalue) = 10070 - # Override WiredTigerTestCase, we have extensions. 
- def setUpConnectionOpen(self, dir): - encarg = 'encryption=(name={0}{1}),'.format( - self.sys_encrypt, self.sys_encrypt_args) - extarg = self.extensionArg([('encryptors', self.sys_encrypt)]) - conn = self.wiredtiger_open(dir, - 'create,error_prefix="{0}: ",{1}{2}'.format( - self.shortid(), encarg, extarg)) - self.pr(`conn`) - return conn + def conn_extensions(self, extlist): + extlist.extension('encryptors', self.sys_encrypt) - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' + def conn_config(self): + return 'encryption=(name={0}{1}),'.format( + self.sys_encrypt, self.sys_encrypt_args) def rot13(self, s): return codecs.encode(s, 'rot_13') diff --git a/test/suite/test_join03.py b/test/suite/test_join03.py index edab7146a6b..fe47b75f99b 100644 --- a/test/suite/test_join03.py +++ b/test/suite/test_join03.py @@ -36,33 +36,7 @@ class test_join03(wttest.WiredTigerTestCase): table_name1 = 'test_join03' nentries = 100 - # Return the wiredtiger_open extension argument for a shared library. 
- def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, libname) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. - def setUpConnectionOpen(self, dir): - extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor')]) - connarg = 'create,error_prefix="{0}: ",{1}'.format( - self.shortid(), extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + conn_extensions = [ 'extractors/csv' ] def gen_key(self, i): return [ i + 1 ] diff --git a/test/suite/test_join04.py b/test/suite/test_join04.py index a71418d9f05..4190f299676 100644 --- a/test/suite/test_join04.py +++ b/test/suite/test_join04.py @@ -36,33 +36,7 @@ class test_join04(wttest.WiredTigerTestCase): table_name1 = 'test_join04' nentries = 100 - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, libname) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. 
- def setUpConnectionOpen(self, dir): - extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor')]) - connarg = 'create,error_prefix="{0}: ",{1}'.format( - self.shortid(), extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + conn_extensions = [ 'extractors/csv' ] # JIRA WT-2308: # Test extractors with equality joins diff --git a/test/suite/test_join07.py b/test/suite/test_join07.py index 2a32e678d72..6a31970250f 100644 --- a/test/suite/test_join07.py +++ b/test/suite/test_join07.py @@ -200,33 +200,7 @@ class test_join07(wttest.WiredTigerTestCase): scenarios = make_scenarios(extractscen) - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, libname) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. 
- def setUpConnectionOpen(self, dir): - extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor')]) - connarg = 'create,error_prefix="{0}: ",{1}'.format( - self.shortid(), extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + conn_extensions = [ 'extractors/csv' ] def expect(self, token, expected): if token == None or token.kind not in expected: diff --git a/test/suite/test_readonly01.py b/test/suite/test_readonly01.py index e4b431ca1da..f41280a3283 100644 --- a/test/suite/test_readonly01.py +++ b/test/suite/test_readonly01.py @@ -75,8 +75,7 @@ class test_readonly01(wttest.WiredTigerTestCase, suite_subprocess): scenarios = make_scenarios(basecfg_list, dir_list, log_list, types) - def conn_config(self, dir): - self.home = dir + def conn_config(self): params = \ 'error_prefix="%s",' % self.shortid() + \ '%s' % self.logcfg + \ diff --git a/test/suite/test_schema05.py b/test/suite/test_schema05.py index 28ad51b3c92..bb3d4f49006 100644 --- a/test/suite/test_schema05.py +++ b/test/suite/test_schema05.py @@ -57,33 +57,7 @@ class test_schema05(wttest.WiredTigerTestCase): ('index-after', { 'create_index' : 2 }), ]) - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, exts): - extfiles = [] - for ext in exts: - (dirname, name, libname) = ext - if name != None and name != 'none': - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext', dirname) - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') - if not os.path.exists(extfile): - self.skipTest('extension "' + extfile + '" not built') - if not extfile in extfiles: - extfiles.append(extfile) - if len(extfiles) == 0: - return '' - else: - return ',extensions=["' + '","'.join(extfiles) + '"]' - - # Override WiredTigerTestCase, we have extensions. 
- def setUpConnectionOpen(self, dir): - extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor')]) - connarg = 'create,error_prefix="{0}: ",{1}'.format( - self.shortid(), extarg) - conn = self.wiredtiger_open(dir, connarg) - self.pr(`conn`) - return conn + conn_extensions = [ 'extractors/csv' ] def create_indices(self): # Create self.nindices index files, each with a column from the CSV diff --git a/test/suite/test_schema07.py b/test/suite/test_schema07.py index ac397c6e1a1..3e4b1d28a4d 100644 --- a/test/suite/test_schema07.py +++ b/test/suite/test_schema07.py @@ -33,8 +33,7 @@ import wiredtiger, wttest class test_schema07(wttest.WiredTigerTestCase): tablename = 'table:test_schema07' - def conn_config(self, dir): - return 'cache_size=10MB' + conn_config = 'cache_size=10MB' @wttest.longtest("Creating many tables shouldn't fill the cache") def test_many_tables(self): diff --git a/test/suite/test_stat02.py b/test/suite/test_stat02.py index cecda7f1ddc..45af283ed02 100644 --- a/test/suite/test_stat02.py +++ b/test/suite/test_stat02.py @@ -59,7 +59,7 @@ class test_stat_cursor_config(wttest.WiredTigerTestCase): scenarios = make_scenarios(uri, data_config, cursor_config) # Turn on statistics for this test. - def conn_config(self, dir): + def conn_config(self): return 'statistics=(%s)' % self.data_config # For each database/cursor configuration, confirm the right combinations diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py index 7c2a58516bc..01626057b9e 100644 --- a/test/suite/test_txn02.py +++ b/test/suite/test_txn02.py @@ -93,11 +93,10 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): checklog_calls = 100 if wttest.islongtest() else 2 checklog_mod = (len(scenarios) / checklog_calls + 1) - def setUpConnectionOpen(self, dir): - self.home = dir + def conn_config(self): # Cycle through the different transaction_sync values in a # deterministic manner. 
- self.txn_sync = self.sync_list[ + txn_sync = self.sync_list[ self.scenario_number % len(self.sync_list)] # # We don't want to run zero fill with only the same settings, such @@ -107,17 +106,9 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): zerofill = 'false' if self.scenario_number % freq == 0: zerofill = 'true' - self.backup_dir = os.path.join(self.home, "WT_BACKUP") - conn_params = \ - 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ - 'log=(zero_fill=%s),' % zerofill + \ - 'create,error_prefix="%s: ",' % self.shortid() + \ - 'transaction_sync="%s",' % self.txn_sync - # print "Creating conn at '%s' with config '%s'" % (dir, conn_params) - conn = self.wiredtiger_open(dir, conn_params) - self.pr(`conn`) - self.session2 = conn.open_session() - return conn + return 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ + 'log=(zero_fill=%s),' % zerofill + \ + 'transaction_sync="%s",' % txn_sync # Check that a cursor (optionally started in a new transaction), sees the # expected values. @@ -206,6 +197,8 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): self.assertEqual(cur_logs, pr_logs) def test_ops(self): + self.backup_dir = os.path.join(self.home, "WT_BACKUP") + self.session2 = self.conn.open_session() # print "Creating %s with config '%s'" % (self.uri, self.create_params) self.session.create(self.uri, self.create_params) # Set up the table with entries for 1, 2, 10 and 11. @@ -228,6 +221,7 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): # Close and reopen the connection and cursor. 
if reopen == 'reopen': self.reopen_conn() + self.session2 = self.conn.open_session() c = self.session.open_cursor(self.uri, None, 'overwrite') self.session.begin_transaction( diff --git a/test/suite/test_txn04.py b/test/suite/test_txn04.py index ade39272f84..d8f6774ded1 100644 --- a/test/suite/test_txn04.py +++ b/test/suite/test_txn04.py @@ -63,24 +63,15 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): txn1s = [('t1c', dict(txn1='commit')), ('t1r', dict(txn1='rollback'))] scenarios = make_scenarios(types, op1s, txn1s) - # Overrides WiredTigerTestCase - def setUpConnectionOpen(self, dir): - self.home = dir + + def conn_config(self): # Cycle through the different transaction_sync values in a # deterministic manner. - self.txn_sync = self.sync_list[ + txn_sync = self.sync_list[ self.scenario_number % len(self.sync_list)] - self.backup_dir = os.path.join(self.home, "WT_BACKUP") # Set archive false on the home directory. - conn_params = \ - 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ - 'create,error_prefix="%s: ",' % self.shortid() + \ - 'transaction_sync="%s",' % self.txn_sync - # print "Creating conn at '%s' with config '%s'" % (dir, conn_params) - conn = self.wiredtiger_open(dir, conn_params) - self.pr(`conn`) - self.session2 = conn.open_session() - return conn + return 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ + 'transaction_sync="%s",' % txn_sync # Check that a cursor (optionally started in a new transaction), sees the # expected values. @@ -146,6 +137,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): # The runWt command closes our connection and sessions so # we need to reopen them here. self.hot_backup(None, committed) + self.session2 = self.conn.open_session() c = self.session.open_cursor(self.uri, None, 'overwrite') c.set_value(1) # Then do the given modification. 
@@ -193,6 +185,8 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): self.hot_backup(self.uri, committed) def test_ops(self): + self.backup_dir = os.path.join(self.home, "WT_BACKUP") + self.session2 = self.conn.open_session() with self.expectedStdoutPattern('recreating metadata'): self.ops() diff --git a/test/suite/test_txn05.py b/test/suite/test_txn05.py index 5913c4688a3..7aaff221ba4 100644 --- a/test/suite/test_txn05.py +++ b/test/suite/test_txn05.py @@ -64,23 +64,15 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess): txn1s = [('t1c', dict(txn1='commit')), ('t1r', dict(txn1='rollback'))] scenarios = make_scenarios(types, op1s, txn1s) - # Overrides WiredTigerTestCase - def setUpConnectionOpen(self, dir): - self.home = dir + + def conn_config(self): # Cycle through the different transaction_sync values in a # deterministic manner. - self.txn_sync = self.sync_list[ + txn_sync = self.sync_list[ self.scenario_number % len(self.sync_list)] - self.backup_dir = os.path.join(self.home, "WT_BACKUP") - conn_params = \ - 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ - 'create,error_prefix="%s: ",' % self.shortid() + \ - 'transaction_sync="%s",' % self.txn_sync - # print "Creating conn at '%s' with config '%s'" % (dir, conn_params) - conn = self.wiredtiger_open(dir, conn_params) - self.pr(`conn`) - self.session2 = conn.open_session() - return conn + # Set archive false on the home directory. + return 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ + 'transaction_sync="%s",' % txn_sync # Check that a cursor (optionally started in a new transaction), sees the # expected values. 
@@ -167,6 +159,8 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess): self.runWt(['-h', self.backup_dir, 'printlog'], outfilename='printlog.out') def test_ops(self): + self.backup_dir = os.path.join(self.home, "WT_BACKUP") + self.session2 = self.conn.open_session() # print "Creating %s with config '%s'" % (self.uri, self.create_params) self.session.create(self.uri, self.create_params) # Set up the table with entries for 1-5. diff --git a/test/suite/test_txn06.py b/test/suite/test_txn06.py index 2bff97f6aac..c91dc6a623b 100644 --- a/test/suite/test_txn06.py +++ b/test/suite/test_txn06.py @@ -40,10 +40,10 @@ class test_txn06(wttest.WiredTigerTestCase, suite_subprocess): source_uri = 'table:' + tablename + "_src" nrows = 100000 - def setUpConnectionOpen(self, *args): + def conn_config(self): if not wiredtiger.verbose_build(): self.skipTest('requires a verbose build') - return super(test_txn06, self).setUpConnectionOpen(*args) + return '' def test_long_running(self): # Populate a table diff --git a/test/suite/test_txn07.py b/test/suite/test_txn07.py index a08d68f88aa..e2986fb999a 100644 --- a/test/suite/test_txn07.py +++ b/test/suite/test_txn07.py @@ -72,42 +72,18 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess): scenarios = make_scenarios(types, op1s, txn1s, compress, prune=30, prunelong=1000) - # Overrides WiredTigerTestCase - def setUpConnectionOpen(self, dir): - self.home = dir - # Cycle through the different transaction_sync values in a - # deterministic manner. 
- self.txn_sync = self.sync_list[ - self.scenario_number % len(self.sync_list)] - self.backup_dir = os.path.join(self.home, "WT_BACKUP") - conn_params = \ - 'log=(archive=false,enabled,file_max=%s,' % self.logmax + \ - 'compressor=%s)' % self.compress + \ - self.extensionArg(self.compress) + \ - ',create,error_prefix="%s: ",' % self.shortid() + \ - "statistics=(fast)," + \ - 'transaction_sync="%s",' % self.txn_sync - # print "Creating conn at '%s' with config '%s'" % (dir, conn_params) - try: - conn = self.wiredtiger_open(dir, conn_params) - except wiredtiger.WiredTigerError as e: - print "Failed conn at '%s' with config '%s'" % (dir, conn_params) - self.pr(`conn`) - self.session2 = conn.open_session() - return conn - - # Return the wiredtiger_open extension argument for a shared library. - def extensionArg(self, name): - if name == None or name == '': - return '' - - testdir = os.path.dirname(__file__) - extdir = os.path.join(run.wt_builddir, 'ext/compressors') - extfile = os.path.join( - extdir, name, '.libs', 'libwiredtiger_' + name + '.so') - if not os.path.exists(extfile): - self.skipTest('compression extension "' + extfile + '" not built') - return ',extensions=["' + extfile + '"]' + + def conn_config(self): + return 'log=(archive=false,enabled,file_max=%s,' % self.logmax + \ + 'compressor=%s)' % self.compress + \ + ',create,error_prefix="%s: ",' % self.shortid() + \ + "statistics=(fast)," + \ + 'transaction_sync="%s",' % \ + self.sync_list[self.scenario_number % len(self.sync_list)] + + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('compressors', self.compress) # Check that a cursor (optionally started in a new transaction), sees the # expected values. 
@@ -140,7 +116,7 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess): self.backup(self.backup_dir) backup_conn_params = 'log=(enabled,file_max=%s,' % self.logmax + \ 'compressor=%s)' % self.compress + \ - self.extensionArg(self.compress) + self.extensionsConfig() backup_conn = self.wiredtiger_open(self.backup_dir, backup_conn_params) try: self.check(backup_conn.open_session(), None, committed) @@ -148,6 +124,9 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess): backup_conn.close() def test_ops(self): + self.backup_dir = os.path.join(self.home, "WT_BACKUP") + self.session2 = self.conn.open_session() + # print "Creating %s with config '%s'" % (self.uri, self.create_params) self.session.create(self.uri, self.create_params) # Set up the table with entries for 1-5. diff --git a/test/suite/test_txn08.py b/test/suite/test_txn08.py index f0cdf08df07..04faed9d45a 100644 --- a/test/suite/test_txn08.py +++ b/test/suite/test_txn08.py @@ -41,7 +41,7 @@ class test_txn08(wttest.WiredTigerTestCase, suite_subprocess): uri = 'table:' + tablename # Turn on logging for this test. 
- def conn_config(self, dir): + def conn_config(self): return 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ 'transaction_sync="(method=dsync,enabled)"' diff --git a/test/suite/test_txn09.py b/test/suite/test_txn09.py index cfad8270ab1..768d714e248 100644 --- a/test/suite/test_txn09.py +++ b/test/suite/test_txn09.py @@ -80,19 +80,9 @@ class test_txn09(wttest.WiredTigerTestCase, suite_subprocess): op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s, prune=20, prunelong=5000) - # Overrides WiredTigerTestCase - def setUpConnectionOpen(self, dir): - self.home = dir - conn_params = \ - 'create,error_prefix="%s: ",' % self.shortid() + \ - 'log=(archive=false,enabled=%s),' % int(self.log_enabled) + \ - 'transaction_sync=(enabled=false),' - - # print "Opening conn at '%s' with config '%s'" % (dir, conn_params) - conn = self.wiredtiger_open(dir, conn_params) - self.pr(`conn`) - self.session2 = conn.open_session() - return conn + def conn_config(self): + return 'log=(archive=false,enabled=%s),' % int(self.log_enabled) + \ + 'transaction_sync=(enabled=false)' # Check that a cursor (optionally started in a new transaction), sees the # expected values. @@ -141,6 +131,7 @@ class test_txn09(wttest.WiredTigerTestCase, suite_subprocess): # Close and reopen the connection and cursor, toggling the log self.log_enabled = not self.log_enabled self.reopen_conn() + self.session2 = self.conn.open_session() c = self.session.open_cursor(self.uri, None, 'overwrite') self.session.begin_transaction( diff --git a/test/suite/test_txn11.py b/test/suite/test_txn11.py index 147bf3a76c0..3c02b1e86e3 100644 --- a/test/suite/test_txn11.py +++ b/test/suite/test_txn11.py @@ -44,7 +44,7 @@ class test_txn11(wttest.WiredTigerTestCase, suite_subprocess): uri = 'table:' + tablename # Turn on logging for this test. 
- def conn_config(self, dir): + def conn_config(self): return 'log=(archive=%s,' % self.archive + \ 'enabled,file_max=%s,prealloc=false),' % self.logmax + \ 'transaction_sync=(enabled=false),' diff --git a/test/suite/test_txn13.py b/test/suite/test_txn13.py index ae0250c06e8..2bf49486b3a 100644 --- a/test/suite/test_txn13.py +++ b/test/suite/test_txn13.py @@ -50,7 +50,7 @@ class test_txn13(wttest.WiredTigerTestCase, suite_subprocess): ]) # Turn on logging for this test. - def conn_config(self, dir): + def conn_config(self): return 'log=(archive=false,enabled,file_max=%s)' % self.logmax + \ ',cache_size=8G' diff --git a/test/suite/test_txn15.py b/test/suite/test_txn15.py index c061c093b02..a2bfb626338 100644 --- a/test/suite/test_txn15.py +++ b/test/suite/test_txn15.py @@ -41,7 +41,7 @@ class test_txn15(wttest.WiredTigerTestCase, suite_subprocess): create_params = 'key_format=i,value_format=i' entries = 100 # Turn on logging for this test. - def conn_config(self, dir): + def conn_config(self): return 'statistics=(fast),' + \ 'log=(archive=false,enabled,file_max=100K),' + \ 'use_environment=false,' + \ diff --git a/test/suite/wttest.py b/test/suite/wttest.py index bd6d2005cd9..0dce51f07d5 100644 --- a/test/suite/wttest.py +++ b/test/suite/wttest.py @@ -37,9 +37,8 @@ except ImportError: import unittest from contextlib import contextmanager -import os, re, shutil, sys, time, traceback -import wtscenario -import wiredtiger +import glob, os, re, shutil, sys, time, traceback +import wiredtiger, wtscenario def shortenWithEllipsis(s, maxlen): if len(s) > maxlen: @@ -152,6 +151,14 @@ class TestSuiteConnection(object): else: return getattr(self._conn, attr) +# Just like a list of strings, but with a convenience function +class ExtensionList(list): + skipIfMissing = False + def extension(self, dirname, name, extarg=None): + if name != None and name != 'none': + ext = '' if extarg == None else '=' + extarg + self.append(dirname + '/' + name + ext) + class 
WiredTigerTestCase(unittest.TestCase): _globalSetup = False _printOnceSeen = {} @@ -160,9 +167,16 @@ class WiredTigerTestCase(unittest.TestCase): # Can be a string or a callable function or lambda expression. conn_config = '' + # conn_extensions can be overridden to add a list of extensions to load. + # Each entry is a string (directory and extension name) and optional config. + # Example: + # conn_extensions = ('extractors/csv_extractor', + # 'test/fail_fs={allow_writes=100}') + conn_extensions = () + @staticmethod def globalSetup(preserveFiles = False, useTimestamp = False, - gdbSub = False, verbose = 1, dirarg = None, + gdbSub = False, verbose = 1, builddir = None, dirarg = None, longtest = False): WiredTigerTestCase._preserveFiles = preserveFiles d = 'WT_TEST' if dirarg == None else dirarg @@ -172,6 +186,7 @@ class WiredTigerTestCase(unittest.TestCase): os.makedirs(d) wtscenario.set_long_run(longtest) WiredTigerTestCase._parentTestdir = d + WiredTigerTestCase._builddir = builddir WiredTigerTestCase._origcwd = os.getcwd() WiredTigerTestCase._resultfile = open(os.path.join(d, 'results.txt'), "w", 0) # unbuffered WiredTigerTestCase._gdbSubprocess = gdbSub @@ -224,12 +239,66 @@ class WiredTigerTestCase(unittest.TestCase): return "%s.%s.%s" % (self.__module__, self.className(), self._testMethodName) - # Can be overridden, but first consider setting self.conn_config . + # Return the wiredtiger_open extension argument for + # any needed shared library. 
+ def extensionsConfig(self): + exts = self.conn_extensions + if hasattr(exts, '__call__'): + exts = ExtensionList() + self.conn_extensions(exts) + result = '' + extfiles = {} + skipIfMissing = False + if hasattr(exts, 'skip_if_missing'): + skipIfMissing = exts.skip_if_missing + for ext in exts: + extconf = '' + if '=' in ext: + splits = ext.split('=', 1) + ext = splits[0] + extconf = '=' + splits[1] + splits = ext.split('/') + if len(splits) != 2: + raise Exception(self.shortid() + + ": " + ext + + ": extension is not named <dir>/<name>") + libname = splits[1] + dirname = splits[0] + pat = os.path.join(WiredTigerTestCase._builddir, 'ext', + dirname, libname, '.libs', 'libwiredtiger_*.so') + filenames = glob.glob(pat) + if len(filenames) == 0: + if skipIfMissing: + self.skipTest('extension "' + ext + '" not built') + continue + else: + raise Exception(self.shortid() + + ": " + ext + + ": no extensions library found matching: " + pat) + elif len(filenames) > 1: + raise Exception(self.shortid() + + ": " + ext + + ": multiple extensions libraries found matching: " + pat) + complete = '"' + filenames[0] + '"' + extconf + if ext in extfiles: + if extfiles[ext] != complete: + raise Exception(self.shortid() + + ": non-matching extension arguments in " + + str(exts)) + else: + extfiles[ext] = complete + if len(extfiles) != 0: + result = ',extensions=[' + ','.join(extfiles.values()) + ']' + return result + + # Can be overridden, but first consider setting self.conn_config + # or self.conn_extensions def setUpConnectionOpen(self, home): self.home = home config = self.conn_config if hasattr(config, '__call__'): - config = config(home) + config = self.conn_config() + config += self.extensionsConfig() # In case the open starts additional threads, flush first to # avoid confusion. 
sys.stdout.flush() -- cgit v1.2.1 From 4da006a05bf3e01ebbfcfd7d55ee67e84413f44a Mon Sep 17 00:00:00 2001 From: sueloverso Date: Mon, 16 Jan 2017 20:20:43 -0500 Subject: WT-3105 Avoid thread group deadlock on close in new dynamic eviction code. (#3242) --- src/evict/evict_lru.c | 29 +++++++++++++++++++++++------ src/support/thread_group.c | 8 ++++---- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 948c1e1139e..9b969de9a9e 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -912,6 +912,7 @@ __evict_tune_workers(WT_SESSION_IMPL *session) struct timespec current_time; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; + WT_DECL_RET; uint64_t cur_threads, delta_msec, delta_pages, i, target_threads; uint64_t pgs_evicted_cur, pgs_evicted_persec_cur; uint32_t thread_surplus; @@ -945,7 +946,7 @@ __evict_tune_workers(WT_SESSION_IMPL *session) * Otherwise, we just record the number of evicted pages and return. */ if (conn->evict_tune_pgs_last == 0) - goto out; + goto err; delta_msec = WT_TIMEDIFF_MS(current_time, conn->evict_tune_last_time); delta_pages = pgs_evicted_cur - conn->evict_tune_pgs_last; @@ -995,8 +996,13 @@ __evict_tune_workers(WT_SESSION_IMPL *session) conn->evict_tune_workers_best; for (i = 0; i < thread_surplus; i++) { - WT_RET(__wt_thread_group_stop_one(session, - &conn->evict_threads, true)); + /* + * If we get an error, it should be because we + * were unable to acquire the thread group lock. + * Break out of trying. + */ + WT_ERR(__wt_thread_group_stop_one( + session, &conn->evict_threads, false)); WT_STAT_CONN_INCR(session, cache_eviction_worker_removed); } @@ -1029,7 +1035,12 @@ __evict_tune_workers(WT_SESSION_IMPL *session) * Start the new threads. */ for (i = 0; i < (target_threads - cur_threads); ++i) { - WT_RET(__wt_thread_group_start_one(session, + /* + * If we get an error, it should be because we were + * unable to acquire the thread group lock. Break out + * of trying. 
+ */ + WT_ERR(__wt_thread_group_start_one(session, &conn->evict_threads, false)); WT_STAT_CONN_INCR(session, cache_eviction_worker_created); @@ -1042,9 +1053,15 @@ __evict_tune_workers(WT_SESSION_IMPL *session) WT_STAT_CONN_SET(session, cache_eviction_active_workers, conn->evict_threads.current_threads); -out: conn->evict_tune_last_time = current_time; +err: conn->evict_tune_last_time = current_time; conn->evict_tune_pgs_last = pgs_evicted_cur; - return (0); + /* + * If we got an EBUSY trying to acquire the lock just return. + * We can try to tune the workers next time. + */ + if (ret == EBUSY) + ret = 0; + return (ret); } /* diff --git a/src/support/thread_group.c b/src/support/thread_group.c index d04f8977a9a..beb143e63e2 100644 --- a/src/support/thread_group.c +++ b/src/support/thread_group.c @@ -325,8 +325,8 @@ __wt_thread_group_start_one( if (wait) __wt_writelock(session, &group->lock); - else if (__wt_try_writelock(session, &group->lock) != 0) - return (0); + else + WT_RET(__wt_try_writelock(session, &group->lock)); /* Recheck the bounds now that we hold the lock */ if (group->current_threads < group->max) @@ -352,8 +352,8 @@ __wt_thread_group_stop_one( if (wait) __wt_writelock(session, &group->lock); - else if (__wt_try_writelock(session, &group->lock) != 0) - return (0); + else + WT_RET(__wt_try_writelock(session, &group->lock)); /* Recheck the bounds now that we hold the lock */ if (group->current_threads > group->min) -- cgit v1.2.1 From 04923774e5ede7a16c45ea31bd020e153a2a7666 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 16 Jan 2017 20:27:05 -0500 Subject: WT-3127 Fix a bug: CPU yield calls don't necessarily imply memory barriers (#3244) Add a full-barrier as part of the yield call. 
--- src/os_posix/os_yield.c | 8 ++++++++ src/os_win/os_yield.c | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/src/os_posix/os_yield.c b/src/os_posix/os_yield.c index 37d05bc1854..f7c43aae746 100644 --- a/src/os_posix/os_yield.c +++ b/src/os_posix/os_yield.c @@ -16,5 +16,13 @@ void __wt_yield(void) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { + /* + * Yielding the processor isn't documented as a memory barrier, and it's + * a reasonable expectation to have. There's no reason not to explicitly + * include a barrier since we're giving up the CPU, and ensures callers + * aren't ever surprised. + */ + WT_FULL_BARRIER(); + sched_yield(); } diff --git a/src/os_win/os_yield.c b/src/os_win/os_yield.c index aab1559e072..038f2efe162 100644 --- a/src/os_win/os_yield.c +++ b/src/os_win/os_yield.c @@ -15,5 +15,13 @@ void __wt_yield(void) { + /* + * Yielding the processor isn't documented as a memory barrier, and it's + * a reasonable expectation to have. There's no reason not to explicitly + * include a barrier since we're giving up the CPU, and ensures callers + * aren't ever surprised. + */ + WT_FULL_BARRIER(); + SwitchToThread(); } -- cgit v1.2.1 From f8c20c2b1c258126cc162721eccd51ea4282e1b7 Mon Sep 17 00:00:00 2001 From: David Hows Date: Wed, 18 Jan 2017 01:05:03 +1100 Subject: WT-3121 Make all ROTN encryption tests skipable (#3247) * WT-3121 test/suite: Make all tests skippable that use encryptors, collators, extractors in extensions. 
--- test/suite/test_collator.py | 5 ++++- test/suite/test_encrypt02.py | 16 +++++++++------- test/suite/test_encrypt03.py | 1 + test/suite/test_encrypt04.py | 1 + test/suite/test_encrypt06.py | 1 + test/suite/test_encrypt07.py | 2 ++ test/suite/test_join03.py | 4 +++- test/suite/test_join04.py | 4 +++- test/suite/test_join07.py | 4 +++- test/suite/test_schema05.py | 4 +++- 10 files changed, 30 insertions(+), 12 deletions(-) diff --git a/test/suite/test_collator.py b/test/suite/test_collator.py index e7be557335e..7ce135c8976 100644 --- a/test/suite/test_collator.py +++ b/test/suite/test_collator.py @@ -48,7 +48,10 @@ class test_collator(wttest.WiredTigerTestCase): nentries = 100 nindices = 4 - conn_extensions = [ 'extractors/csv', 'collators/revint' ] + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('extractors', 'csv') + extlist.extension('collators', 'revint') def create_indices(self): # Create self.nindices index files, each with a column from the CSV diff --git a/test/suite/test_encrypt02.py b/test/suite/test_encrypt02.py index 2d3b8a29b13..d950be067e2 100644 --- a/test/suite/test_encrypt02.py +++ b/test/suite/test_encrypt02.py @@ -39,17 +39,19 @@ from wtscenario import make_scenarios class test_encrypt02(wttest.WiredTigerTestCase, suite_subprocess): uri = 'file:test_encrypt02' encrypt_type = [ - ('noarg', dict( conn_extensions=[ 'encryptors/rotn' ], - encrypt_args='name=rotn', secret_arg=None)), - ('keyid', dict( conn_extensions=[ 'encryptors/rotn' ], - encrypt_args='name=rotn,keyid=11', secret_arg=None)), - ('pass', dict( conn_extensions=[ 'encryptors/rotn' ], - encrypt_args='name=rotn', secret_arg='ABC')), - ('keyid-pass', dict( conn_extensions=[ 'encryptors/rotn' ], + ('noarg', dict( encrypt_args='name=rotn', secret_arg=None)), + ('keyid', dict( encrypt_args='name=rotn,keyid=11', secret_arg=None)), + ('pass', dict( encrypt_args='name=rotn', secret_arg='ABC')), + ('keyid-pass', dict( encrypt_args='name=rotn,keyid=11', 
secret_arg='ABC')), ] scenarios = make_scenarios(encrypt_type) + def conn_extensions(self, extlist): + # Load the compression extension, skip the test if missing + extlist.skip_if_missing = True + extlist.extension('encryptors', 'rotn') + nrecords = 5000 bigvalue = "abcdefghij" * 1001 # len(bigvalue) = 10010 diff --git a/test/suite/test_encrypt03.py b/test/suite/test_encrypt03.py index 0809c16c6d1..302572bd044 100644 --- a/test/suite/test_encrypt03.py +++ b/test/suite/test_encrypt03.py @@ -51,6 +51,7 @@ class test_encrypt03(wttest.WiredTigerTestCase): scenarios = make_scenarios(types, encrypt) def conn_extensions(self, extlist): + extlist.skip_if_missing = True extlist.extension('encryptors', self.sys_encrypt) extlist.extension('encryptors', self.file_encrypt) diff --git a/test/suite/test_encrypt04.py b/test/suite/test_encrypt04.py index a10e6c28831..17777fc9564 100644 --- a/test/suite/test_encrypt04.py +++ b/test/suite/test_encrypt04.py @@ -81,6 +81,7 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): extarg = None if self.expect_forceerror: extarg='(config=\"rotn_force_error=true\")' + extlist.skip_if_missing = True extlist.extension('encryptors', self.name, extarg) # Override WiredTigerTestCase, we have extensions. 
diff --git a/test/suite/test_encrypt06.py b/test/suite/test_encrypt06.py index 3dd7ac17eff..72718e53b2b 100644 --- a/test/suite/test_encrypt06.py +++ b/test/suite/test_encrypt06.py @@ -90,6 +90,7 @@ class test_encrypt06(wttest.WiredTigerTestCase): nrecords = 1000 def conn_extensions(self, extlist): + extlist.skip_if_missing = True extlist.extension('encryptors', self.sys_encrypt) extlist.extension('encryptors', self.file0_encrypt) extlist.extension('encryptors', self.file1_encrypt) diff --git a/test/suite/test_encrypt07.py b/test/suite/test_encrypt07.py index 1c342783353..81c9f1a49ea 100644 --- a/test/suite/test_encrypt07.py +++ b/test/suite/test_encrypt07.py @@ -45,6 +45,8 @@ class test_encrypt07(test_salvage.test_salvage): bigvalue = "abcdefghij" * 1007 # len(bigvalue) = 10070 def conn_extensions(self, extlist): + # Load the compression extension, skip the test if missing + extlist.skip_if_missing = True extlist.extension('encryptors', self.sys_encrypt) def conn_config(self): diff --git a/test/suite/test_join03.py b/test/suite/test_join03.py index fe47b75f99b..dd8111f6ead 100644 --- a/test/suite/test_join03.py +++ b/test/suite/test_join03.py @@ -36,7 +36,9 @@ class test_join03(wttest.WiredTigerTestCase): table_name1 = 'test_join03' nentries = 100 - conn_extensions = [ 'extractors/csv' ] + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('extractors', 'csv') def gen_key(self, i): return [ i + 1 ] diff --git a/test/suite/test_join04.py b/test/suite/test_join04.py index 4190f299676..e65b8b53333 100644 --- a/test/suite/test_join04.py +++ b/test/suite/test_join04.py @@ -36,7 +36,9 @@ class test_join04(wttest.WiredTigerTestCase): table_name1 = 'test_join04' nentries = 100 - conn_extensions = [ 'extractors/csv' ] + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('extractors', 'csv') # JIRA WT-2308: # Test extractors with equality joins diff --git a/test/suite/test_join07.py 
b/test/suite/test_join07.py index 6a31970250f..8fae3539246 100644 --- a/test/suite/test_join07.py +++ b/test/suite/test_join07.py @@ -200,7 +200,9 @@ class test_join07(wttest.WiredTigerTestCase): scenarios = make_scenarios(extractscen) - conn_extensions = [ 'extractors/csv' ] + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('extractors', 'csv') def expect(self, token, expected): if token == None or token.kind not in expected: diff --git a/test/suite/test_schema05.py b/test/suite/test_schema05.py index bb3d4f49006..d536a629373 100644 --- a/test/suite/test_schema05.py +++ b/test/suite/test_schema05.py @@ -57,7 +57,9 @@ class test_schema05(wttest.WiredTigerTestCase): ('index-after', { 'create_index' : 2 }), ]) - conn_extensions = [ 'extractors/csv' ] + def conn_extensions(self, extlist): + extlist.skip_if_missing = True + extlist.extension('extractors', 'csv') def create_indices(self): # Create self.nindices index files, each with a column from the CSV -- cgit v1.2.1 From 2d2bb414675e449f46d6412db93bb7b32057af0a Mon Sep 17 00:00:00 2001 From: sueloverso Date: Tue, 17 Jan 2017 15:41:40 -0500 Subject: WT-3118 Protect test against unexpectedly slow child start. (#3248) --- test/recovery/random-abort.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/recovery/random-abort.c b/test/recovery/random-abort.c index c407361c7eb..a6e4d9801e5 100644 --- a/test/recovery/random-abort.c +++ b/test/recovery/random-abort.c @@ -33,7 +33,11 @@ static char home[512]; /* Program working dir */ static const char *progname; /* Program name */ +/* + * These two names for the URI and file system must be maintained in tandem. 
+ */ static const char * const uri = "table:main"; +static const char * const fs_main = "main.wt"; static bool inmem; #define MAX_TH 12 @@ -211,6 +215,7 @@ extern char *__wt_optarg; int main(int argc, char *argv[]) { + struct stat sb; FILE *fp; WT_CONNECTION *conn; WT_CURSOR *cursor; @@ -305,8 +310,15 @@ main(int argc, char *argv[]) /* parent */ /* * Sleep for the configured amount of time before killing - * the child. + * the child. Start the timeout from the time we notice that + * the table has been created. That allows the test to run + * correctly on really slow machines. Verify the process ID + * still exists in case the child aborts for some reason we + * don't stay in this loop forever. */ + snprintf(fname, sizeof(fname), "%s/%s", home, fs_main); + while (stat(fname, &sb) != 0 && kill(pid, 0) == 0) + sleep(1); sleep(timeout); /* -- cgit v1.2.1 From 25a7c8aae547b7a0c50081656935c663c640a9f0 Mon Sep 17 00:00:00 2001 From: Sulabh Mahajan Date: Wed, 18 Jan 2017 13:18:41 +1100 Subject: WT-3083 Fix a bug in wtperf config dump (#3224) Also add a test case to ensure the functionality doesn't break in the future. --- bench/wtperf/config.c | 87 +++++++++++-- bench/wtperf/wtperf.c | 122 ++++++++++++----- test/wtperf/test_conf_dump.py | 296 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 461 insertions(+), 44 deletions(-) create mode 100644 test/wtperf/test_conf_dump.py diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c index 5b14a4cdf68..a15a3485dde 100644 --- a/bench/wtperf/config.c +++ b/bench/wtperf/config.c @@ -622,17 +622,9 @@ config_opt_str(WTPERF *wtperf, const char *optstr) return (ret); } - /* - * Append the current line to our copy of the config. The config is - * stored in the order it is processed, so added options will be after - * any parsed from the original config. We allocate len + 1 to allow for - * a null byte to be added. 
- */ - config_line = dcalloc(sizeof(CONFIG_QUEUE_ENTRY), 1); - config_line->string = dstrdup(optstr); - TAILQ_INSERT_TAIL(&opts->config_head, config_line, q); - while (ret == 0) { + size_t pos; + if ((ret = scan->next(scan, &k, &v)) != 0) { /* Any parse error has already been reported. */ if (ret == WT_NOTFOUND) @@ -640,6 +632,46 @@ config_opt_str(WTPERF *wtperf, const char *optstr) break; } ret = config_opt(wtperf, &k, &v); + + /* + * Append the key-value pair to our copy of the config. + * The config is stored in the order it is processed, so added + * options will be after any parsed from the original config. + */ + config_line = dcalloc(sizeof(CONFIG_QUEUE_ENTRY), 1); + /* + * If key or value is a string, consider extra space for the + * quotes. Add 2 to the required space for '=' and the ending + * null character in "key=value". + */ + config_line->string = dcalloc( + k.len + (k.type == WT_CONFIG_ITEM_STRING ? 2 : 0) + + v.len + (v.type == WT_CONFIG_ITEM_STRING ? 2 : 0) + 2, 1); + pos = 0; + if (k.type == WT_CONFIG_ITEM_STRING) { + config_line->string[pos] = '"'; + pos++; + } + strncpy(config_line->string + pos, k.str, k.len); + pos += k.len; + if (k.type == WT_CONFIG_ITEM_STRING) { + config_line->string[pos] = '"'; + pos++; + } + config_line->string[pos] = '='; + pos++; + if (v.type == WT_CONFIG_ITEM_STRING) { + config_line->string[pos] = '"'; + pos++; + } + strncpy(config_line->string + pos, v.str, v.len); + pos += v.len; + if (v.type == WT_CONFIG_ITEM_STRING) { + config_line->string[pos] = '"'; + pos++; + } + config_line->string[pos] = '\0'; + TAILQ_INSERT_TAIL(&opts->config_head, config_line, q); } if ((t_ret = scan->close(scan)) != 0) { lprintf(wtperf, ret, 0, "Error in config_scan_end"); @@ -754,8 +786,11 @@ config_consolidate(CONFIG_OPTS *opts) /* * This loop iterates over the config queue and for each entry checks if - * a later queue entry has the same key. If there's a match, the current - * queue entry is removed and we continue. 
+ * a later queue entry has the same key. If there's a match, and key is + * "conn_config" or "table_config", the later queue entry is replaced + * with a concatenated entry of the two queue entries, the current queue + * entry is removed. For any other key, if there is a match, the current + * queue entry is removed. */ conf_line = TAILQ_FIRST(&opts->config_head); while (conf_line != NULL) { @@ -771,6 +806,34 @@ config_consolidate(CONFIG_OPTS *opts) if (strncmp(conf_line->string, test_line->string, (size_t)((string_key - conf_line->string) + 1)) == 0) { + if ((strncmp("conn_config=", conf_line->string, + (size_t)((string_key - conf_line->string) + + 1)) == 0) || + (strncmp("table_config=", conf_line->string, + (size_t)((string_key - conf_line->string) + + 1)) == 0)) { + char *concat_str, *val_pointer; + + /* + * To concatenate the two config + * strings, copy the first string to a + * new one, replace the ending '"' with + * a ',' and then concatenate the second + * string's value after its starting '"' + */ + val_pointer = + strchr(test_line->string, '=') + 2; + concat_str = + dmalloc(strlen(conf_line->string) + + strlen(val_pointer) + 1); + strcpy(concat_str, conf_line->string); + concat_str[strlen(concat_str) - 1] = + ','; + strcat(concat_str, val_pointer); + free(test_line->string); + test_line->string = concat_str; + } + TAILQ_REMOVE(&opts->config_head, conf_line, q); free(conf_line->string); free(conf_line); diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 8c7f0053388..2f747fa3fc7 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -2502,52 +2502,110 @@ main(int argc, char *argv[]) __wt_stream_set_line_buffer(stdout); /* Concatenate non-default configuration strings. 
*/ - if (opts->verbose > 1 || user_cconfig != NULL || - opts->session_count_idle > 0 || wtperf->compress_ext != NULL || - wtperf->async_config != NULL) { - req_len = strlen(debug_cconfig) + 20; - if (user_cconfig != NULL) - req_len += strlen(user_cconfig); - if (wtperf->async_config != NULL) - req_len += strlen(wtperf->async_config); - if (wtperf->compress_ext != NULL) - req_len += strlen(wtperf->compress_ext); + if ((opts->verbose > 1 && strlen(debug_cconfig)) || + user_cconfig != NULL || opts->session_count_idle > 0 || + wtperf->compress_ext != NULL || wtperf->async_config != NULL) { + bool append_comma; + uint32_t pos; + + append_comma = false; + pos = 0; + req_len = 20; + req_len += (wtperf->async_config != NULL ? + strlen(wtperf->async_config) : 0); + req_len += (wtperf->compress_ext != NULL ? + strlen(wtperf->compress_ext) : 0); if (opts->session_count_idle > 0) { - sreq_len = strlen(",session_max=") + 6; + sreq_len = strlen("session_max=") + 6; req_len += sreq_len; sess_cfg = dmalloc(sreq_len); snprintf(sess_cfg, sreq_len, - ",session_max=%" PRIu32, + "session_max=%" PRIu32, opts->session_count_idle + wtperf->workers_cnt + opts->populate_threads + 10); } + req_len += (user_cconfig != NULL ? strlen(user_cconfig) : 0); + req_len += (debug_cconfig != NULL ? strlen(debug_cconfig) : 0); cc_buf = dmalloc(req_len); - snprintf(cc_buf, req_len, "%s,%s,%s,%s,%s", - wtperf->async_config ? wtperf->async_config : "", - wtperf->compress_ext ? wtperf->compress_ext : "", - opts->verbose > 1 ? debug_cconfig : "", - sess_cfg != NULL ? sess_cfg : "", - user_cconfig != NULL ? user_cconfig : ""); + + if (wtperf->async_config != NULL && + strlen(wtperf->async_config)) { + pos += (uint32_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma ? "," : "", wtperf->async_config); + append_comma = true; + } + if (wtperf->compress_ext != NULL && + strlen(wtperf->compress_ext)) { + pos += (uint32_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma ? 
"," : "", wtperf->compress_ext); + append_comma = true; + } + if (sess_cfg != NULL && strlen(sess_cfg)) { + pos += (uint32_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma ? "," : "", sess_cfg); + append_comma = true; + } + if (user_cconfig != NULL && strlen(user_cconfig)) { + pos += (uint32_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma ? "," : "", user_cconfig); + append_comma = true; + } + if (opts->verbose > 1 && strlen(debug_cconfig)) { + pos += (uint32_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma ? "," : "", debug_cconfig); + append_comma = true; + } + if (strlen(cc_buf) && (ret = config_opt_name_value(wtperf, "conn_config", cc_buf)) != 0) goto err; } - if (opts->verbose > 1 || opts->index || + if ((opts->verbose > 1 && strlen(debug_tconfig)) || opts->index || user_tconfig != NULL || wtperf->compress_table != NULL) { - req_len = strlen(debug_tconfig) + 20; - if (user_tconfig != NULL) - req_len += strlen(user_tconfig); - if (wtperf->compress_table != NULL) - req_len += strlen(wtperf->compress_table); - if (opts->index) - req_len += strlen(INDEX_COL_NAMES); + bool append_comma; + uint32_t pos; + + append_comma = false; + pos = 0; + req_len = 20; + req_len += (wtperf->compress_table != NULL ? + strlen(wtperf->compress_table) : 0); + req_len += (opts->index ? strlen(INDEX_COL_NAMES) : 0); + req_len += (user_tconfig != NULL ? strlen(user_tconfig) : 0); + req_len += (debug_tconfig != NULL ? strlen(debug_tconfig) : 0); tc_buf = dmalloc(req_len); - snprintf(tc_buf, req_len, "%s,%s,%s,%s", - opts->index ? INDEX_COL_NAMES : "", - wtperf->compress_table != NULL ? - wtperf->compress_table : "", - opts->verbose > 1 ? debug_tconfig : "", - user_tconfig ? user_tconfig : ""); + + if (wtperf->compress_table != NULL && + strlen(wtperf->compress_table)) { + pos += (uint32_t)snprintf( + tc_buf + pos, req_len - pos, "%s%s", + append_comma ? 
"," : "", wtperf->compress_table); + append_comma = true; + } + if (opts->index) { + pos += (uint32_t)snprintf( + tc_buf + pos, req_len - pos, "%s%s", + append_comma ? "," : "", INDEX_COL_NAMES); + append_comma = true; + } + if (user_tconfig != NULL && strlen(user_tconfig)) { + pos += (uint32_t)snprintf( + tc_buf + pos, req_len - pos, "%s%s", + append_comma ? "," : "", user_tconfig); + append_comma = true; + } + if (opts->verbose > 1 && strlen(debug_tconfig)) { + pos += (uint32_t)snprintf( + tc_buf + pos, req_len - pos, "%s%s", + append_comma ? "," : "", debug_tconfig); + append_comma = true; + } + if (strlen(tc_buf) && (ret = config_opt_name_value(wtperf, "table_config", tc_buf)) != 0) goto err; diff --git a/test/wtperf/test_conf_dump.py b/test/wtperf/test_conf_dump.py new file mode 100644 index 00000000000..ef7f276a1d0 --- /dev/null +++ b/test/wtperf/test_conf_dump.py @@ -0,0 +1,296 @@ +# Usage: python test_conf_dump.py +# +# This script tests if the config file dumped in the test directory corresponds +# correctly to the wtperf config file used. Command line options to wtperf are +# also taken into account. +# +# Following expectations are checked for: +# 1. If provided through multiple sources, "conn_config" and "table_config" +# configuration options are appended to each other. All other options get +# replaced by a higher precedent source. +# 2. The precedence order for the options in an increasing order is as follows: +# default option, +# provided through config file, +# provided through option -o +# provided through option -C (for conn_config) or -T (for table_config) +# +# Test fails if any config option is missing or has a wrong value. Test also +# fails if the value for the option is not replaced/appended in the correct +# order of precedence as stated above. 
+ +import os, re, subprocess, sys + +OP_FILE = "WT_TEST/CONFIG.wtperf" +TMP_CONF = "__tmp.wtperf" +WTPERF_BIN = "./wtperf" +WTPERF_DIR = "../../build_posix/bench/wtperf/" + +CONF_NOT_PROVIDED = -2 + +# Generate a wtperf conf file to use +def generate_conf_file(file_name): + f = open(file_name, 'w') + f.write( +'''conn_config="cache_size=16GB,eviction=(threads_max=4),log=(enabled=false),session_max=33" +table_config="leaf_page_max=32k,internal_page_max=16k,allocation_size=4k,split_pct=90,type=file" +close_conn=false +icount=1500 +create=true +compression="snappy" +checkpoint_interval=5 +checkpoint_threads=1 +populate_threads=1 +report_interval=5 +session_count_idle=50 +session_count_idle=60 +session_count_idle=70 +session_count_idle=80 +run_time=5 +sample_interval=5 +sample_rate=1 +table_count=2 +threads=((count=6,updates=1)) +value_sz=1000 +warmup=2 +''') + f.close() + +# Build a command from the given options and execute wtperf +def execute_wtperf(conf_file, option_C = "", option_T = "", option_o = ""): + # Generate the command to run, execute wtperf + cmd = WTPERF_BIN + " -O " + conf_file + if option_C: + cmd += " -C " + option_C + if option_T: + cmd += " -T " + option_T + if option_o: + # Any quotes in option_o need to be escaped before providing it as part + # of the command + option_o_cmd_str = option_o.replace('"', '\\"') + cmd += " -o " + option_o_cmd_str + + print "Running: ", cmd + subprocess.check_call(cmd, shell=True) + print "=========================\n" + +# Build a dictionary of config key and its value from the given config file.
+# Optionally take -C, -T and -o and overwrite/append values as per correct +# precedence +def build_dict_from_conf( + conf_file, option_C = "", option_T = "", option_o = ""): + # Open given conf file and make a dictionary of passed arguments and values + with open(conf_file) as f: + lines = f.read().splitlines() + + # Maintain precedence order of config file, -o, -C/-T + # Build a dict of config options, appending values for table_config and + # conn_config, if specified multiple times. Replace with the latest in + # case of all other configuration keys. + key_val_dict = {} + for line in lines: + if re.match('^\s*#', line) is None: + key_val_pair = line.split('=', 1) + if ((key_val_pair[0] == 'table_config' or + key_val_pair[0] == 'conn_config') and + key_val_pair[0] in key_val_dict): + tmp_val = key_val_dict[key_val_pair[0]][:-1] + tmp_val += "," + tmp_val += key_val_pair[1][1:] + key_val_dict[key_val_pair[0]] = tmp_val + else: + key_val_dict[key_val_pair[0]] = key_val_pair[1] + + # If provided, put option o in the dict + if option_o: + opt_o_key_val_list = option_o.split(',') + for op_o_key_val in opt_o_key_val_list: + key_val_pair = op_o_key_val.split('=', 1) + if ((key_val_pair[0] == 'table_config' or + key_val_pair[0] == 'conn_config') and + key_val_pair[0] in key_val_dict): + tmp_val = key_val_dict[key_val_pair[0]][:-1] + tmp_val += "," + tmp_val += key_val_pair[1][1:] + key_val_dict[key_val_pair[0]] = tmp_val + else: + key_val_dict[key_val_pair[0]] = key_val_pair[1] + + # If provided, put option C in the dict + if option_C: + tmp_val = key_val_dict["conn_config"][:-1] + tmp_val += "," + tmp_val += option_C[1:] + key_val_dict["conn_config"] = tmp_val + + # If provided, put option T in the dict + if option_T: + tmp_val = key_val_dict["table_config"][:-1] + tmp_val += "," + tmp_val += option_T[1:] + key_val_dict["table_config"] = tmp_val + + return key_val_dict + +# Extract configuration value for the given key from the given config file +def 
extract_config_from_file(conf_file, key): + ret_val = "" + with open(conf_file) as f: + lines = f.read().splitlines() + for line in lines: + if re.match('^\s*#', line) is None: + key_val_pair = line.split('=', 1) + if key_val_pair[0] == key: + ret_val = key_val_pair[1] + return ret_val + +# Extract configuration value for the given key from the given "-o" string +def extract_config_from_opt_o(option_o, key): + ret_val = "" + opt_o_key_val_list = option_o.split(',') + for op_o_key_val in opt_o_key_val_list: + key_val_pair = op_o_key_val.split('=', 1) + if key_val_pair[0] == key: + ret_val = key_val_pair[1] + return ret_val + +# Execute test: +# Run wtperf with given config and check if the dumped config file matches the +# given inputs +def run_test(conf_file, option_C = "", option_T = "", option_o = ""): + # Run wtperf + execute_wtperf(conf_file, option_C, option_T, option_o) + + key_val_dict_ip = build_dict_from_conf( + conf_file, option_C, option_T, option_o) + key_val_dict_op = build_dict_from_conf(OP_FILE) + + conn_config_from_file = extract_config_from_file(conf_file, "conn_config") + table_config_from_file = extract_config_from_file(conf_file, "table_config") + conn_config_from_opt_o = "" + table_config_from_opt_o = "" + if option_o: + conn_config_from_opt_o = extract_config_from_opt_o( + option_o, "conn_config") + table_config_from_opt_o = extract_config_from_opt_o( + option_o, "table_config") + + # Check if dumped output conf matches with input file and options + match = True + for key in key_val_dict_ip: + match_itr = True + + # Check if we see this config key in the dumped file + if not key in key_val_dict_op: + print "Key '", key, "' not found in dumped file ", OP_FILE + match = match_itr = False + continue + + # Check if values from all sources of conn_config are present in the + # conn_config in dumped file. Also check their relative ordering as + # per precedence rules defined.
+ if (key == 'conn_config' and + (conn_config_from_file or conn_config_from_opt_o or option_C)): + # Should find these config in order: file < option o < option C + file_loc = CONF_NOT_PROVIDED + option_o_loc = CONF_NOT_PROVIDED + option_C_loc = CONF_NOT_PROVIDED + op_conn_config = key_val_dict_op['conn_config'] + + if conn_config_from_file: + file_loc = op_conn_config.find(conn_config_from_file[1:-1]) + if conn_config_from_opt_o: + option_o_loc = op_conn_config.find(conn_config_from_opt_o[1:-1]) + if option_C: + option_C_loc = op_conn_config.find(option_C[1:-1]) + + # Check if value from any of the sources is missing + if ((conn_config_from_file and file_loc == -1) or + (conn_config_from_opt_o and option_o_loc == -1) or + (option_C and option_C_loc == -1)): + print "Part of conn_config missing in dumped file ", OP_FILE + match_itr = False + + # Check if the values got appended in the correct order + if match_itr: + if ((option_o_loc != CONF_NOT_PROVIDED and + option_o_loc < file_loc) or + (option_C_loc != CONF_NOT_PROVIDED and + (option_C_loc < file_loc or option_C_loc < option_o_loc))): + print "Detected incorrect config append order:" + match_itr = False + + # Check if values from all sources of table_config are present in the + # table_config in dumped file. Also check their relative ordering as + # per precedence rules defined.
+ if (key == 'table_config' and + (table_config_from_file or table_config_from_opt_o or option_T)): + # Should find these config in order: file < option o < option T + file_loc = CONF_NOT_PROVIDED + option_o_loc = CONF_NOT_PROVIDED + option_T_loc = CONF_NOT_PROVIDED + op_table_config = key_val_dict_op['table_config'] + + if table_config_from_file: + file_loc = op_table_config.find(table_config_from_file[1:-1]) + if table_config_from_opt_o: + option_o_loc = op_table_config.find( + table_config_from_opt_o[1:-1]) + if option_T: + option_T_loc = op_table_config.find(option_T[1:-1]) + + # Check if value from any of the sources is missing + if ((table_config_from_file and file_loc == -1) or + (table_config_from_opt_o and option_o_loc == -1) or + (option_T and option_T_loc == -1)): + print "Part of table_config missing in dumped file ", OP_FILE + match_itr = False + + # Check if the values got appended in the correct order + if match_itr: + if ((option_o_loc != CONF_NOT_PROVIDED and + option_o_loc < file_loc) or + (option_T_loc != CONF_NOT_PROVIDED and + (option_T_loc < file_loc or option_T_loc < option_o_loc))): + print "Detected incorrect config append order:" + match_itr = False + + if (key != 'table_config' and key != 'conn_config' and + key_val_dict_ip[key] != key_val_dict_op[key]): + print "Config mismatch between:" + match_itr = False + + if match_itr is False: + print "Input Config:", key, '=', key_val_dict_ip[key] + print "Dumped Config:", key, '=', key_val_dict_op[key] + print "\n" + + match = match and match_itr + + return match + +# ----------------- Execute Test -------------- +# If a wtperf conf file is provided use it, else generate a temp conf file +os.chdir(WTPERF_DIR) +if len(sys.argv) == 2: + conf_file = sys.argv[1] +else: + conf_file = TMP_CONF + generate_conf_file(conf_file) + +# Run a test with no options +if not run_test(conf_file): + exit(-1) + +# Run a test with -C, -T, -o provided +option_o = 
"verbose=2,conn_config=\"session_max=135\",table_config=\"type=lsm\",sample_interval=2,run_time=0,sample_rate=2,readonly=false" +option_C = "\"cache_size=10GB,session_max=115\"" +option_T = "\"allocation_size=8k,split_pct=92\"" +if not run_test(conf_file, option_C, option_T, option_o): + exit(-1) + +# Cleanup generated temp files +subprocess.check_call("rm -rf WT_TEST/", shell=True) +if len(sys.argv) == 1 and conf_file == TMP_CONF: + subprocess.check_call("rm " + TMP_CONF, shell=True) + +print "All tests succeeded" -- cgit v1.2.1 From 91dd1fa489cab34a40e3f0115fe6771326e9c410 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 18 Jan 2017 21:05:06 -0500 Subject: WT-3134 Coverity scan reports 1368529 and 1368528 (#3251) * Coverity complains: CID 1368529: Security best practices violations (TOCTOU) Calling function "fopen" that uses "fname" after a check function. This can cause a time-of-check, time-of-use race condition. We're doing: snprintf(buffer); stat(buffer); snprintf(buffer); fopen(buffer); and I think Coverity is ignoring the second snprintf(), and is complaining about a stat followed by an fopen some number of lines of code later. It's simple enough to give the two calls their own buffers, hopefully that will keep Coverity quiet. Use 1024 as the size of a path instead of 512, (that's the traditional MAXPATHLEN value). Use sizeof(home) in calls to testutil_work_dir_from_path() so we don't accidentally diverge from the declared size. Clean up an error call, there's no need for two error messages. * Coverity complains: CID 1368528: (DEADCODE) Execution cannot reach the expression "","" inside this statement: "pos += (uint32_t)snprintf(c...". Replace boolean variable with a "const char *" that's set to either an empty string or a comma, removing the need for the test. Use size_t as the size of an object in memory, not a uint32_t. Don't declare variables in block scope. 
Assignment operators are the lowest priority operator (well, except for comma), don't bother declaring the order of evaluation for an assignment operator. strlen() returns a size_t length, don't evaluate it as a boolean. --- bench/wtperf/wtperf.c | 118 ++++++++++++++++++++---------------------- test/recovery/random-abort.c | 17 +++--- test/recovery/truncated-log.c | 4 +- 3 files changed, 65 insertions(+), 74 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 2f747fa3fc7..91cedee8328 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -2361,11 +2361,11 @@ main(int argc, char *argv[]) { CONFIG_OPTS *opts; WTPERF *wtperf, _wtperf; - size_t req_len, sreq_len; + size_t pos, req_len, sreq_len; bool monitor_set; int ch, ret; const char *cmdflags = "C:h:m:O:o:T:"; - const char *config_opts; + const char *append_comma, *config_opts; char *cc_buf, *path, *sess_cfg, *tc_buf, *user_cconfig, *user_tconfig; /* The first WTPERF structure (from which all others are derived). */ @@ -2502,19 +2502,14 @@ main(int argc, char *argv[]) __wt_stream_set_line_buffer(stdout); /* Concatenate non-default configuration strings. */ - if ((opts->verbose > 1 && strlen(debug_cconfig)) || + if ((opts->verbose > 1 && strlen(debug_cconfig) != 0) || user_cconfig != NULL || opts->session_count_idle > 0 || wtperf->compress_ext != NULL || wtperf->async_config != NULL) { - bool append_comma; - uint32_t pos; - - append_comma = false; - pos = 0; req_len = 20; - req_len += (wtperf->async_config != NULL ? - strlen(wtperf->async_config) : 0); - req_len += (wtperf->compress_ext != NULL ?
+ strlen(wtperf->compress_ext) : 0; if (opts->session_count_idle > 0) { sreq_len = strlen("session_max=") + 6; req_len += sreq_len; @@ -2524,89 +2519,88 @@ main(int argc, char *argv[]) opts->session_count_idle + wtperf->workers_cnt + opts->populate_threads + 10); } - req_len += (user_cconfig != NULL ? strlen(user_cconfig) : 0); - req_len += (debug_cconfig != NULL ? strlen(debug_cconfig) : 0); + req_len += user_cconfig != NULL ? strlen(user_cconfig) : 0; + req_len += debug_cconfig != NULL ? strlen(debug_cconfig) : 0; cc_buf = dmalloc(req_len); + pos = 0; + append_comma = ""; if (wtperf->async_config != NULL && - strlen(wtperf->async_config)) { - pos += (uint32_t)snprintf( + strlen(wtperf->async_config) != 0) { + pos += (size_t)snprintf( cc_buf + pos, req_len - pos, "%s%s", - append_comma ? "," : "", wtperf->async_config); - append_comma = true; + append_comma, wtperf->async_config); + append_comma = ","; } if (wtperf->compress_ext != NULL && - strlen(wtperf->compress_ext)) { - pos += (uint32_t)snprintf( + strlen(wtperf->compress_ext) != 0) { + pos += (size_t)snprintf( cc_buf + pos, req_len - pos, "%s%s", - append_comma ? "," : "", wtperf->compress_ext); - append_comma = true; + append_comma, wtperf->compress_ext); + append_comma = ","; } - if (sess_cfg != NULL && strlen(sess_cfg)) { - pos += (uint32_t)snprintf( + if (sess_cfg != NULL && strlen(sess_cfg) != 0) { + pos += (size_t)snprintf( cc_buf + pos, req_len - pos, "%s%s", - append_comma ? "," : "", sess_cfg); - append_comma = true; + append_comma, sess_cfg); + append_comma = ","; } - if (user_cconfig != NULL && strlen(user_cconfig)) { - pos += (uint32_t)snprintf( + if (user_cconfig != NULL && strlen(user_cconfig) != 0) { + pos += (size_t)snprintf( cc_buf + pos, req_len - pos, "%s%s", - append_comma ? 
"," : "", user_cconfig); - append_comma = true; + append_comma, user_cconfig); + append_comma = ","; } - if (opts->verbose > 1 && strlen(debug_cconfig)) { - pos += (uint32_t)snprintf( + if (opts->verbose > 1 && strlen(debug_cconfig) != 0) { + pos += (size_t)snprintf( cc_buf + pos, req_len - pos, "%s%s", - append_comma ? "," : "", debug_cconfig); - append_comma = true; + append_comma, debug_cconfig); + append_comma = ","; } - if (strlen(cc_buf) && (ret = + if (strlen(cc_buf) != 0 && (ret = config_opt_name_value(wtperf, "conn_config", cc_buf)) != 0) goto err; } - if ((opts->verbose > 1 && strlen(debug_tconfig)) || opts->index || + if ((opts->verbose > 1 && strlen(debug_tconfig) != 0) || opts->index || user_tconfig != NULL || wtperf->compress_table != NULL) { - bool append_comma; - uint32_t pos; - - append_comma = false; - pos = 0; req_len = 20; - req_len += (wtperf->compress_table != NULL ? - strlen(wtperf->compress_table) : 0); - req_len += (opts->index ? strlen(INDEX_COL_NAMES) : 0); - req_len += (user_tconfig != NULL ? strlen(user_tconfig) : 0); - req_len += (debug_tconfig != NULL ? strlen(debug_tconfig) : 0); + req_len += wtperf->compress_table != NULL ? + strlen(wtperf->compress_table) : 0; + req_len += opts->index ? strlen(INDEX_COL_NAMES) : 0; + req_len += user_tconfig != NULL ? strlen(user_tconfig) : 0; + req_len += debug_tconfig != NULL ? strlen(debug_tconfig) : 0; tc_buf = dmalloc(req_len); + pos = 0; + append_comma = ""; if (wtperf->compress_table != NULL && - strlen(wtperf->compress_table)) { - pos += (uint32_t)snprintf( + strlen(wtperf->compress_table) != 0) { + pos += (size_t)snprintf( tc_buf + pos, req_len - pos, "%s%s", - append_comma ? "," : "", wtperf->compress_table); - append_comma = true; + append_comma, wtperf->compress_table); + append_comma = ","; } if (opts->index) { - pos += (uint32_t)snprintf( + pos += (size_t)snprintf( tc_buf + pos, req_len - pos, "%s%s", - append_comma ? 
"," : "", INDEX_COL_NAMES); - append_comma = true; + append_comma, INDEX_COL_NAMES); + append_comma = ","; } - if (user_tconfig != NULL && strlen(user_tconfig)) { - pos += (uint32_t)snprintf( + if (user_tconfig != NULL && strlen(user_tconfig) != 0) { + pos += (size_t)snprintf( tc_buf + pos, req_len - pos, "%s%s", - append_comma ? "," : "", user_tconfig); - append_comma = true; + append_comma, user_tconfig); + append_comma = ","; } - if (opts->verbose > 1 && strlen(debug_tconfig)) { - pos += (uint32_t)snprintf( + if (opts->verbose > 1 && strlen(debug_tconfig) != 0) { + pos += (size_t)snprintf( tc_buf + pos, req_len - pos, "%s%s", - append_comma ? "," : "", debug_tconfig); - append_comma = true; + append_comma, debug_tconfig); + append_comma = ","; } - if (strlen(tc_buf) && (ret = + if (strlen(tc_buf) != 0 && (ret = config_opt_name_value(wtperf, "table_config", tc_buf)) != 0) goto err; } diff --git a/test/recovery/random-abort.c b/test/recovery/random-abort.c index a6e4d9801e5..660ef0cca67 100644 --- a/test/recovery/random-abort.c +++ b/test/recovery/random-abort.c @@ -31,7 +31,7 @@ #include #include -static char home[512]; /* Program working dir */ +static char home[1024]; /* Program working dir */ static const char *progname; /* Program name */ /* * These two names for the URI and file system must be maintained in tandem. @@ -227,7 +227,7 @@ main(int argc, char *argv[]) pid_t pid; bool fatal, rand_th, rand_time, verify_only; const char *working_dir; - char fname[64], kname[64]; + char fname[64], kname[64], statname[1024]; if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) progname = argv[0]; @@ -268,7 +268,7 @@ main(int argc, char *argv[]) if (argc != 0) usage(); - testutil_work_dir_from_path(home, 512, working_dir); + testutil_work_dir_from_path(home, sizeof(home), working_dir); /* * If the user wants to verify they need to tell us how many threads * there were so we can find the old record files. 
@@ -316,8 +316,8 @@ main(int argc, char *argv[]) * still exists in case the child aborts for some reason we * don't stay in this loop forever. */ - snprintf(fname, sizeof(fname), "%s/%s", home, fs_main); - while (stat(fname, &sb) != 0 && kill(pid, 0) == 0) + snprintf(statname, sizeof(statname), "%s/%s", home, fs_main); + while (stat(statname, &sb) != 0 && kill(pid, 0) == 0) sleep(1); sleep(timeout); @@ -352,11 +352,8 @@ main(int argc, char *argv[]) for (i = 0; i < nth; ++i) { middle = 0; snprintf(fname, sizeof(fname), RECORDS_FILE, i); - if ((fp = fopen(fname, "r")) == NULL) { - fprintf(stderr, - "Failed to open %s. i %" PRIu32 "\n", fname, i); - testutil_die(errno, "fopen"); - } + if ((fp = fopen(fname, "r")) == NULL) + testutil_die(errno, "fopen: %s", fname); /* * For every key in the saved file, verify that the key exists diff --git a/test/recovery/truncated-log.c b/test/recovery/truncated-log.c index c265263d44c..6a142b8e710 100644 --- a/test/recovery/truncated-log.c +++ b/test/recovery/truncated-log.c @@ -35,7 +35,7 @@ #define snprintf _snprintf #endif -static char home[512]; /* Program working dir */ +static char home[1024]; /* Program working dir */ static const char *progname; /* Program name */ static const char * const uri = "table:main"; @@ -290,7 +290,7 @@ main(int argc, char *argv[]) if (argc != 0) usage(); - testutil_work_dir_from_path(home, 512, working_dir); + testutil_work_dir_from_path(home, sizeof(home), working_dir); testutil_make_work_dir(home); /* -- cgit v1.2.1 From 45777eb7682e4bbed46be555ad667101775a160c Mon Sep 17 00:00:00 2001 From: sueloverso Date: Thu, 19 Jan 2017 18:59:53 -0500 Subject: WT-3105 Fix the thread group usage on eviction reconfigure and add test. 
(#3252) --- src/conn/conn_cache.c | 3 +-- src/include/connection.h | 1 - test/suite/test_reconfig01.py | 12 ++++++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 9b07b46abcd..2b0e5081f04 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -144,8 +144,7 @@ __wt_cache_config(WT_SESSION_IMPL *session, bool reconfigure, const char *cfg[]) WT_RET(__wt_thread_group_resize( session, &conn->evict_threads, conn->evict_threads_min, - WT_MAX(conn->evict_threads_min, - WT_MIN(conn->evict_threads_max, EVICT_GROUP_INCR)), + conn->evict_threads_max, WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL)); return (0); diff --git a/src/include/connection.h b/src/include/connection.h index 7d2b78e9f66..64ac4271db1 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -301,7 +301,6 @@ struct __wt_connection_impl { uint32_t evict_threads_max;/* Max eviction threads */ uint32_t evict_threads_min;/* Min eviction threads */ -#define EVICT_GROUP_INCR 4 /* Evict group size increased in batches */ uint32_t evict_tune_datapts_needed;/* Data needed to tune */ struct timespec evict_tune_last_action_time;/* Time of last action */ struct timespec evict_tune_last_time; /* Time of last check */ diff --git a/test/suite/test_reconfig01.py b/test/suite/test_reconfig01.py index e76becac76a..cbc8bca5740 100644 --- a/test/suite/test_reconfig01.py +++ b/test/suite/test_reconfig01.py @@ -64,6 +64,18 @@ class test_reconfig01(wttest.WiredTigerTestCase): # same ops_max of 512 and thread of 8. self.conn.reconfigure("async=(enabled=true)") + def test_reconfig_eviction(self): + # Increase the max number of running threads (default 8). + self.conn.reconfigure("eviction=(threads_max=10)") + # Increase the min number of running threads (default 1). + self.conn.reconfigure("eviction=(threads_min=5)") + # Decrease the max number of running threads. 
+ self.conn.reconfigure("eviction=(threads_max=7)") + # Decrease the min number of running threads. + self.conn.reconfigure("eviction=(threads_min=2)") + # Set min and max the same. + self.conn.reconfigure("eviction=(threads_min=6,threads_max=6)") + def test_reconfig_lsm_manager(self): # We create and populate a tiny LSM so that we can start off with # the LSM threads running and change the numbers of threads. -- cgit v1.2.1 From 3ac01b8a147ed5652199c0e577f3300bf4d78a57 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 20 Jan 2017 15:48:10 -0500 Subject: Add a verbose message if we don't find any log files, (#3245) user pointed us at the wrong directory. --- src/log/log.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/log/log.c b/src/log/log.c index 74c5442d405..da500a74e87 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -1655,10 +1655,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); if (logcount == 0) - /* - * Return it is not supported if none don't exist. - */ - return (ENOTSUP); + WT_RET_MSG(session, ENOTSUP, "no log files found"); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); -- cgit v1.2.1 From 573bc1a8027e21176c1f3e27483b0abc719131a0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Sun, 22 Jan 2017 17:38:06 -0500 Subject: Set the database home and configure error handling before (#3256) loading extensions, custom filesystems (for example) needs to know the database home. 
--- src/conn/conn_api.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 50617240d38..f691a76b1f2 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -2175,6 +2175,15 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, if (cval.val) F_SET(conn, WT_CONN_READONLY); + /* Configure error messages so we get them right early. */ + WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval)); + if (cval.len != 0) + WT_ERR(__wt_strndup( + session, cval.str, cval.len, &conn->error_prefix)); + + /* Set the database home so extensions have access to it. */ + WT_ERR(__conn_home(session, home, cfg)); + /* * Load early extensions before doing further initialization (one early * extension is to configure a file system). @@ -2198,6 +2207,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR( __conn_chk_file_system(session, F_ISSET(conn, WT_CONN_READONLY))); + /* Make sure no other thread of control already owns this database. */ + WT_ERR(__conn_single(session, cfg)); + /* * Capture the config_base setting file for later use. Again, if the * application doesn't want us to read the base configuration file, @@ -2207,18 +2219,6 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval)); config_base_set = cval.val != 0; - /* Configure error messages so we get them right early. */ - WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval)); - if (cval.len != 0) - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &conn->error_prefix)); - - /* Get the database home. */ - WT_ERR(__conn_home(session, home, cfg)); - - /* Make sure no other thread of control already owns this database. 
*/ - WT_ERR(__conn_single(session, cfg)); - /* * Build the real configuration stack, in the following order (where * later entries override earlier entries): -- cgit v1.2.1 From bf8de9767982da9ae0f1542f3744c8aa8544fb82 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Sun, 22 Jan 2017 17:56:29 -0500 Subject: Coverity 1369053: assigning values that are never subsequently used. (#3257) --- bench/wtperf/wtperf.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 91cedee8328..baa259f8817 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -2551,12 +2551,10 @@ main(int argc, char *argv[]) append_comma, user_cconfig); append_comma = ","; } - if (opts->verbose > 1 && strlen(debug_cconfig) != 0) { + if (opts->verbose > 1 && strlen(debug_cconfig) != 0) pos += (size_t)snprintf( cc_buf + pos, req_len - pos, "%s%s", append_comma, debug_cconfig); - append_comma = ","; - } if (strlen(cc_buf) != 0 && (ret = config_opt_name_value(wtperf, "conn_config", cc_buf)) != 0) @@ -2593,12 +2591,10 @@ main(int argc, char *argv[]) append_comma, user_tconfig); append_comma = ","; } - if (opts->verbose > 1 && strlen(debug_tconfig) != 0) { + if (opts->verbose > 1 && strlen(debug_tconfig) != 0) pos += (size_t)snprintf( tc_buf + pos, req_len - pos, "%s%s", append_comma, debug_tconfig); - append_comma = ","; - } if (strlen(tc_buf) != 0 && (ret = config_opt_name_value(wtperf, "table_config", tc_buf)) != 0) -- cgit v1.2.1 From 2764dd76aebbf6b71b61bf574b01a8028526731d Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Mon, 23 Jan 2017 00:19:30 -0500 Subject: WT-3120 Fix ordering problem in connection_close for custom filesystem loaded via shared lib (#3239) Also add fail_fs extension, as well as a simple test for it. 
--- build_posix/Make.subdirs | 1 + dist/s_void | 5 + ext/test/fail_fs/Makefile.am | 9 + ext/test/fail_fs/fail_fs.c | 703 ++++++++++++++++++++++++++++++++++++++ src/conn/conn_handle.c | 11 +- src/conn/conn_open.c | 25 +- src/include/extern.h | 2 +- test/csuite/Makefile.am | 3 + test/csuite/wt3120_filesys/main.c | 98 ++++++ 9 files changed, 837 insertions(+), 20 deletions(-) create mode 100644 ext/test/fail_fs/Makefile.am create mode 100644 ext/test/fail_fs/fail_fs.c create mode 100644 test/csuite/wt3120_filesys/main.c diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs index 01f23dcbbc1..4ecec37ca6c 100644 --- a/build_posix/Make.subdirs +++ b/build_posix/Make.subdirs @@ -17,6 +17,7 @@ ext/encryptors/nop ext/encryptors/rotn ext/extractors/csv ext/test/kvs_bdb HAVE_BERKELEY_DB +ext/test/fail_fs . api/leveldb LEVELDB examples/c diff --git a/dist/s_void b/dist/s_void index 025f6d4c7eb..4a6b4ad91a2 100755 --- a/dist/s_void +++ b/dist/s_void @@ -78,6 +78,11 @@ func_ok() -e '/int demo_file_sync$/d' \ -e '/int demo_fs_directory_list_free$/d' \ -e '/int demo_fs_exist$/d' \ + -e '/int fail_file_lock$/d' \ + -e '/int fail_file_sync$/d' \ + -e '/int fail_fs_directory_list_free$/d' \ + -e '/int fail_fs_exist$/d' \ + -e '/int fail_fs_terminate$/d' \ -e '/int handle_message$/d' \ -e '/int handle_progress$/d' \ -e '/int helium_cursor_reset$/d' \ diff --git a/ext/test/fail_fs/Makefile.am b/ext/test/fail_fs/Makefile.am new file mode 100644 index 00000000000..f31f5395cd1 --- /dev/null +++ b/ext/test/fail_fs/Makefile.am @@ -0,0 +1,9 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include + +noinst_LTLIBRARIES = libwiredtiger_fail_fs.la +libwiredtiger_fail_fs_la_SOURCES = fail_fs.c + +# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well +# as installation, it will only build static libraries. As far as I can tell, +# the "approved" libtool way to turn them back on is by adding -rpath. 
+libwiredtiger_fail_fs_la_LDFLAGS = -avoid-version -module -rpath /nowhere diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c new file mode 100644 index 00000000000..e2538023a2c --- /dev/null +++ b/ext/test/fail_fs/fail_fs.c @@ -0,0 +1,703 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "queue.h" + +#define FAIL_FS_GIGABYTE (1024 * 1024 * 1024) + +/* + * A "fail file system", that is, a file system extension that fails when we + * want it to. This is only used in test frameworks, this fact allows us + * to simplify some error paths. 
+ */ +typedef struct { + WT_FILE_SYSTEM iface; + /* + * WiredTiger performs schema and I/O operations in parallel, all file + * system and file handle access must be thread-safe. This extension + * uses a single, global file system lock. + */ + pthread_rwlock_t lock; /* Lock */ + int64_t read_ops; + int64_t write_ops; + int64_t allow_reads; + int64_t allow_writes; + /* Queue of file handles */ + TAILQ_HEAD(fail_file_handle_qh, fail_file_handle) fileq; + WT_EXTENSION_API *wtext; /* Extension functions */ +} FAIL_FILE_SYSTEM; + +typedef struct fail_file_handle { + WT_FILE_HANDLE iface; + + /* + * Track the system file descriptor for each file. + */ + FAIL_FILE_SYSTEM *fail_fs; /* Enclosing file system */ + TAILQ_ENTRY(fail_file_handle) q; /* Queue of handles */ + int fd; /* System file descriptor */ +} FAIL_FILE_HANDLE; + +static int fail_file_close(WT_FILE_HANDLE *, WT_SESSION *); +static void fail_file_handle_remove(WT_SESSION *, FAIL_FILE_HANDLE *); +static int fail_file_lock(WT_FILE_HANDLE *, WT_SESSION *, bool); +static int fail_file_read( + WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, void *); +static int fail_file_size( + WT_FILE_HANDLE *, WT_SESSION *, wt_off_t *); +static int fail_file_sync(WT_FILE_HANDLE *, WT_SESSION *); +static int fail_file_truncate(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t); +static int fail_file_write( + WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, const void *); +static bool fail_fs_arg( + const char *match, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, + int64_t *argp); +static int fail_fs_directory_list(WT_FILE_SYSTEM *, WT_SESSION *, + const char *, const char *, char ***, uint32_t *); +static int fail_fs_directory_list_free( + WT_FILE_SYSTEM *, WT_SESSION *, char **, uint32_t); +static int fail_fs_exist(WT_FILE_SYSTEM *, WT_SESSION *, const char *, bool *); +static int fail_fs_open(WT_FILE_SYSTEM *, WT_SESSION *, + const char *, WT_FS_OPEN_FILE_TYPE, uint32_t, WT_FILE_HANDLE **); +static int fail_fs_remove( + WT_FILE_SYSTEM 
*, WT_SESSION *, const char *, uint32_t); +static int fail_fs_rename( + WT_FILE_SYSTEM *, WT_SESSION *, const char *, const char *, uint32_t); +static int fail_fs_size( + WT_FILE_SYSTEM *, WT_SESSION *, const char *, wt_off_t *); +static int fail_fs_terminate(WT_FILE_SYSTEM *, WT_SESSION *); + +/* + * We use pthread functions for portable locking. + * Assert on errors for simplicity. + */ +static void +fail_fs_allocate_lock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_init(lockp, NULL) == 0); +} + +static void +fail_fs_destroy_lock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_destroy(lockp) == 0); +} + +static void +fail_fs_lock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_wrlock(lockp) == 0); +} + +static void +fail_fs_unlock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_unlock(lockp) == 0); +} + +/* + * fail_file_close -- + * ANSI C close. + */ +static int +fail_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) +{ + FAIL_FILE_HANDLE *fail_fh; + int ret; + + (void)session; /* Unused */ + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + + if (fail_fh->fd < 0) + return (EINVAL); + ret = close(fail_fh->fd); + fail_fh->fd = -1; + fail_file_handle_remove(session, fail_fh); + return (ret); +} + +/* + * fail_file_handle_remove -- + * Destroy an in-memory file handle. Should only happen on remove or + * shutdown. + */ +static void +fail_file_handle_remove(WT_SESSION *session, FAIL_FILE_HANDLE *fail_fh) +{ + FAIL_FILE_SYSTEM *fail_fs; + + (void)session; /* Unused */ + fail_fs = fail_fh->fail_fs; + + TAILQ_REMOVE(&fail_fs->fileq, fail_fh, q); + + free(fail_fh->iface.name); + free(fail_fh); +} + +/* + * fail_file_lock -- + * Lock/unlock a file. + */ +static int +fail_file_lock(WT_FILE_HANDLE *file_handle, WT_SESSION *session, bool lock) +{ + /* Locks are always granted. */ + (void)file_handle; /* Unused */ + (void)session; /* Unused */ + (void)lock; /* Unused */ + + return (0); +} + +/* + * fail_file_read -- + * POSIX pread. 
+ */ +static int +fail_file_read(WT_FILE_HANDLE *file_handle, + WT_SESSION *session, wt_off_t offset, size_t len, void *buf) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; + int64_t read_ops; + int ret; + size_t chunk; + ssize_t nr; + uint8_t *addr; + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + fail_fs = fail_fh->fail_fs; + wtext = fail_fs->wtext; + ret = 0; + + fail_fs_lock(&fail_fs->lock); + read_ops = ++fail_fs->read_ops; + fail_fs_unlock(&fail_fs->lock); + + if (fail_fs->allow_reads != 0 && read_ops % fail_fs->allow_reads == 0) { + (void)wtext->msg_printf(wtext, session, + "fail_fs: %s: simulated failure after %" PRId64 + " reads\n", fail_fh->iface.name, read_ops); + return (EIO); + } + + for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { + chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; + if ((nr = pread(fail_fh->fd, addr, chunk, offset)) <= 0) { + (void)wtext->err_printf(wtext, session, + "%s: handle-read: failed to read %" PRIu64 + " bytes at offset %" PRIu64 ": %s", + fail_fh->iface.name, (uint64_t)len, + (uint64_t)offset, wtext->strerror(wtext, NULL, nr)); + ret = (nr == 0 ? WT_ERROR : errno); + break; + } + } + return (ret); +} + +/* + * fail_file_size -- + * Get the size of a file in bytes, by file handle. + */ +static int +fail_file_size( + WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t *sizep) +{ + FAIL_FILE_HANDLE *fail_fh; + struct stat statbuf; + int ret; + + (void)session; /* Unused */ + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + ret = 0; + + if ((ret = fstat(fail_fh->fd, &statbuf)) != 0) + return (ret); + *sizep = statbuf.st_size; + return (0); +} + +/* + * fail_file_sync -- + * Ensure the content of the file is stable. This is a no-op in our + * memory backed file system. 
+ */ +static int +fail_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *session) +{ + (void)file_handle; /* Unused */ + (void)session; /* Unused */ + + return (0); +} + +/* + * fail_file_truncate -- + * POSIX ftruncate. + */ +static int +fail_file_truncate( + WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t offset) +{ + FAIL_FILE_HANDLE *fail_fh; + + (void)session; /* Unused */ + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + return (ftruncate(fail_fh->fd, offset)); +} + +/* + * fail_file_write -- + * POSIX pwrite. + */ +static int +fail_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session, + wt_off_t offset, size_t len, const void *buf) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; + int64_t write_ops; + int ret; + size_t chunk; + ssize_t nr; + const uint8_t *addr; + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + fail_fs = fail_fh->fail_fs; + wtext = fail_fs->wtext; + ret = 0; + + fail_fs_lock(&fail_fs->lock); + write_ops = ++fail_fs->write_ops; + fail_fs_unlock(&fail_fs->lock); + + if (fail_fs->allow_writes != 0 && + write_ops % fail_fs->allow_writes == 0) { + (void)wtext->msg_printf(wtext, session, + "fail_fs: %s: simulated failure after %" PRId64 + " writes\n", fail_fh->iface.name, write_ops); + return (EIO); + } + + /* Break writes larger than 1GB into 1GB chunks. */ + for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { + chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; + if ((nr = pwrite(fail_fh->fd, addr, chunk, offset)) <= 0) { + (void)wtext->err_printf(wtext, session, + "%s: handle-write: failed to write %" PRIu64 + " bytes at offset %" PRIu64 ": %s", + fail_fh->iface.name, (uint64_t)len, + (uint64_t)offset, wtext->strerror(wtext, NULL, nr)); + ret = (nr == 0 ? WT_ERROR : errno); + break; + } + } + return (ret); +} + +/* + * fail_fs_arg -- + * If the key matches, return the value interpreted as an integer. 
+ */ +static bool +fail_fs_arg(const char *match, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, + int64_t *argp) +{ + char *s; + int64_t result; + + if (strncmp(match, key->str, key->len) == 0 && + match[key->len] == '\0') { + s = (char *)value->str; + result = strtoll(s, &s, 10); + if ((size_t)(s - (char *)value->str) == value->len) { + *argp = result; + return (true); + } + } + return (false); +} + +/* + * fail_fs_directory_list -- + * Return a list of files in a given sub-directory. + */ +static int +fail_fs_directory_list(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *directory, + const char *prefix, char ***dirlistp, uint32_t *countp) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + size_t len, prefix_len; + uint32_t allocated, count; + int ret; + char *name, **entries; + + (void)session; /* Unused */ + + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + ret = 0; + *dirlistp = NULL; + *countp = 0; + + entries = NULL; + allocated = count = 0; + len = strlen(directory); + prefix_len = prefix == NULL ? 0 : strlen(prefix); + + fail_fs_lock(&fail_fs->lock); + TAILQ_FOREACH(fail_fh, &fail_fs->fileq, q) { + name = fail_fh->iface.name; + if (strncmp(name, directory, len) != 0 || + (prefix != NULL && strncmp(name, prefix, prefix_len) != 0)) + continue; + + /* + * Increase the list size in groups of 10, it doesn't + * matter if the list is a bit longer than necessary. 
+ */ + if (count >= allocated) { + entries = realloc( + entries, (allocated + 10) * sizeof(char *)); + if (entries == NULL) { + ret = ENOMEM; + goto err; + } + memset(entries + allocated * sizeof(char *), + 0, 10 * sizeof(char *)); + allocated += 10; + } + entries[count++] = strdup(name); + } + + *dirlistp = entries; + *countp = count; + +err: fail_fs_unlock(&fail_fs->lock); + if (ret == 0) + return (0); + + if (entries != NULL) { + while (count > 0) + free(entries[--count]); + free(entries); + } + + return (ret); +} + +/* + * fail_fs_directory_list_free -- + * Free memory allocated by fail_fs_directory_list. + */ +static int +fail_fs_directory_list_free(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, char **dirlist, uint32_t count) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + + if (dirlist != NULL) { + while (count > 0) + free(dirlist[--count]); + free(dirlist); + } + return (0); +} + +/* + * fail_fs_exist -- + * Return if the file exists. + */ +static int +fail_fs_exist(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *name, bool *existp) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + + *existp = (access(name, 0) == 0); + return (0); +} + +/* + * fail_fs_open -- + * fopen for the fail file system. 
+ */ +static int +fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, + const char *name, WT_FS_OPEN_FILE_TYPE file_type, uint32_t flags, + WT_FILE_HANDLE **file_handlep) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + WT_FILE_HANDLE *file_handle; + int open_flags; + int ret; + + (void)file_type; /* Unused */ + (void)session; /* Unused */ + + *file_handlep = NULL; + ret = 0; + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + fail_fh = NULL; + + fail_fs_lock(&fail_fs->lock); + + open_flags = 0; + if ((flags & WT_FS_OPEN_CREATE) != 0) + open_flags |= O_CREAT; + if ((flags & WT_FS_OPEN_EXCLUSIVE) != 0) + open_flags |= O_EXCL; + if ((flags & WT_FS_OPEN_READONLY) != 0) + open_flags |= O_RDONLY; + else + open_flags |= O_RDWR; + + if ((ret = open(name, open_flags, 0666)) < 0) + goto err; + + /* We create a handle structure for each open. */ + if ((fail_fh = calloc(1, sizeof(FAIL_FILE_HANDLE))) == NULL) { + ret = ENOMEM; + goto err; + } + + /* Initialize private information. */ + fail_fh->fail_fs = fail_fs; + fail_fh->fd = ret; + ret = 0; + + /* Initialize public information. */ + file_handle = (WT_FILE_HANDLE *)fail_fh; + if ((file_handle->name = strdup(name)) == NULL) { + ret = ENOMEM; + goto err; + } + + /* Setup the function call table. 
*/ + file_handle->close = fail_file_close; + file_handle->fh_advise = NULL; + file_handle->fh_extend = NULL; + file_handle->fh_extend_nolock = NULL; + file_handle->fh_lock = fail_file_lock; + file_handle->fh_map = NULL; + file_handle->fh_map_discard = NULL; + file_handle->fh_map_preload = NULL; + file_handle->fh_unmap = NULL; + file_handle->fh_read = fail_file_read; + file_handle->fh_size = fail_file_size; + file_handle->fh_sync = fail_file_sync; + file_handle->fh_sync_nowait = NULL; + file_handle->fh_truncate = fail_file_truncate; + file_handle->fh_write = fail_file_write; + + TAILQ_INSERT_HEAD(&fail_fs->fileq, fail_fh, q); + + *file_handlep = file_handle; + + if (0) { +err: free(fail_fh); + } + + fail_fs_unlock(&fail_fs->lock); + return (ret); +} + +/* + * fail_fs_remove -- + * POSIX remove. + */ +static int +fail_fs_remove(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *name, uint32_t flags) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + (void)flags; /* Unused */ + + return (unlink(name)); +} + +/* + * fail_fs_rename -- + * POSIX rename. + */ +static int +fail_fs_rename(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *from, const char *to, uint32_t flags) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + (void)flags; /* Unused */ + + return (rename(from, to)); +} + +/* + * fail_fs_size -- + * Get the size of a file in bytes, by file name. 
+ */ +static int +fail_fs_size(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *name, wt_off_t *sizep) +{ + struct stat statbuf; + int ret; + + (void)file_system; /* Unused */ + (void)session; /* Unused */ + + ret = 0; + if ((ret = stat(name, &statbuf)) != 0) + return (ret); + *sizep = statbuf.st_size; + return (0); +} + +/* + * fail_fs_terminate -- + * Discard any resources on termination + */ +static int +fail_fs_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *session) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + + while ((fail_fh = TAILQ_FIRST(&fail_fs->fileq)) != NULL) + fail_file_handle_remove(session, fail_fh); + + fail_fs_destroy_lock(&fail_fs->lock); + free(fail_fs); + + return (0); +} + +/* + * wiredtiger_extension_init -- + * WiredTiger fail filesystem extension. + */ +int +wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config) +{ + FAIL_FILE_SYSTEM *fail_fs; + WT_CONFIG_ITEM k, v; + WT_CONFIG_PARSER *config_parser; + WT_EXTENSION_API *wtext; + WT_FILE_SYSTEM *file_system; + int ret; + + ret = 0; + wtext = conn->get_extension_api(conn); + if ((fail_fs = calloc(1, sizeof(FAIL_FILE_SYSTEM))) == NULL) { + (void)wtext->err_printf(wtext, NULL, + "fail_file_system extension_init: %s", + wtext->strerror(wtext, NULL, ENOMEM)); + return (ENOMEM); + } + fail_fs->wtext = wtext; + file_system = (WT_FILE_SYSTEM *)fail_fs; + + /* Get any configuration values. 
*/ + if ((ret = wtext->config_parser_open_arg( + wtext, NULL, config, &config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_parser_open: config: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + while ((ret = config_parser->next(config_parser, &k, &v)) == 0) { + if (fail_fs_arg("allow_writes", &k, &v, &fail_fs->allow_writes)) + continue; + if (fail_fs_arg("allow_reads", &k, &v, &fail_fs->allow_reads)) + continue; + + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: unexpected configuration " + "information: %.*s=%.*s: %s", + (int)k.len, k.str, (int)v.len, v.str, + wtext->strerror(wtext, NULL, ret)); + goto err; + } + if (ret != WT_NOTFOUND) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: config: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + if ((ret = config_parser->close(config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.close: config: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + + fail_fs_allocate_lock(&fail_fs->lock); + /* Initialize the in-memory jump table. 
*/ + file_system->fs_directory_list = fail_fs_directory_list; + file_system->fs_directory_list_free = fail_fs_directory_list_free; + file_system->fs_exist = fail_fs_exist; + file_system->fs_open_file = fail_fs_open; + file_system->fs_remove = fail_fs_remove; + file_system->fs_rename = fail_fs_rename; + file_system->fs_size = fail_fs_size; + file_system->terminate = fail_fs_terminate; + if ((ret = conn->set_file_system(conn, file_system, NULL)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONNECTION.set_file_system: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + return (0); + +err: free(fail_fs); + return (ret); +} diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 3f7fc9bb2a7..7203b75e4ae 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -109,16 +109,15 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) * __wt_connection_destroy -- * Destroy the connection's underlying WT_CONNECTION_IMPL structure. */ -int +void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) { - WT_DECL_RET; WT_SESSION_IMPL *session; u_int i; /* Check there's something to destroy. */ if (conn == NULL) - return (0); + return; session = conn->default_session; @@ -149,11 +148,6 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->page_lock[i]); __wt_free(session, conn->page_lock); - /* Destroy the file-system configuration. */ - if (conn->file_system != NULL && conn->file_system->terminate != NULL) - WT_TRET(conn->file_system->terminate( - conn->file_system, (WT_SESSION *)session)); - /* Free allocated memory. 
*/ __wt_free(session, conn->cfg); __wt_free(session, conn->home); @@ -162,5 +156,4 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_stat_connection_discard(session, conn); __wt_free(NULL, conn); - return (ret); } diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index d4ace127bb2..f8029f2c728 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -159,15 +159,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* Discard transaction state. */ __wt_txn_global_destroy(session); - /* Close extensions, first calling any unload entry point. */ - while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { - TAILQ_REMOVE(&conn->dlhqh, dlh, q); - - if (dlh->terminate != NULL) - WT_TRET(dlh->terminate(wt_conn)); - WT_TRET(__wt_dlclose(session, dlh)); - } - /* Close the lock file, opening up the database to other connections. */ if (conn->lock_fh != NULL) WT_TRET(__wt_close(session, &conn->lock_fh)); @@ -199,8 +190,22 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) __wt_free(session, s->hazard); } + /* Destroy the file-system configuration. */ + if (conn->file_system != NULL && conn->file_system->terminate != NULL) + WT_TRET(conn->file_system->terminate( + conn->file_system, (WT_SESSION *)session)); + + /* Close extensions, first calling any unload entry point. */ + while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { + TAILQ_REMOVE(&conn->dlhqh, dlh, q); + + if (dlh->terminate != NULL) + WT_TRET(dlh->terminate(wt_conn)); + WT_TRET(__wt_dlclose(session, dlh)); + } + /* Destroy the handle. 
*/ - WT_TRET(__wt_connection_destroy(conn)); + __wt_connection_destroy(conn); return (ret); } diff --git a/src/include/extern.h b/src/include/extern.h index 566eb386c29..16b3c916b24 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -262,7 +262,7 @@ extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *ur extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index a96492c1e71..bcdbf120d67 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -40,6 +40,9 @@ noinst_PROGRAMS += test_wt2853_perf test_wt2999_join_extractor_SOURCES = wt2999_join_extractor/main.c noinst_PROGRAMS += test_wt2999_join_extractor +test_wt3120_filesys_SOURCES = 
wt3120_filesys/main.c +noinst_PROGRAMS += test_wt3120_filesys + # Run this during a "make check" smoke test. TESTS = $(noinst_PROGRAMS) LOG_COMPILER = $(TEST_WRAPPER) diff --git a/test/csuite/wt3120_filesys/main.c b/test/csuite/wt3120_filesys/main.c new file mode 100644 index 00000000000..abf660db046 --- /dev/null +++ b/test/csuite/wt3120_filesys/main.c @@ -0,0 +1,98 @@ +/*- + * Public Domain 2014-2017 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +/* + * JIRA ticket reference: WT-3120 + * Test case description: A simple file system extension built into + * a shared library. + * Failure mode: Loading the file system and closing the connection + * is enough to evoke the failure. 
This test does slightly more + * than that. + */ + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_CURSOR *cursor; + WT_SESSION *session; + char *kstr, *vstr; + char buf[100]; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + snprintf(buf, sizeof(buf), + "create,extensions=" + "[\"../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so\"]"); + testutil_check(wiredtiger_open(opts->home, NULL, buf, &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + testutil_check(session->create(session, opts->uri, + "key_format=S,value_format=S")); + + testutil_check(session->open_cursor(session, opts->uri, NULL, NULL, + &cursor)); + cursor->set_key(cursor, "a"); + cursor->set_value(cursor, "0"); + testutil_check(cursor->insert(cursor)); + cursor->set_key(cursor, "b"); + cursor->set_value(cursor, "1"); + testutil_check(cursor->insert(cursor)); + testutil_check(cursor->close(cursor)); + testutil_check(session->close(session, NULL)); + + /* Force to disk and re-open. 
*/ + testutil_check(opts->conn->close(opts->conn, NULL)); + testutil_check(wiredtiger_open(opts->home, NULL, NULL, &opts->conn)); + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + testutil_check(session->open_cursor(session, opts->uri, NULL, NULL, + &cursor)); + testutil_check(cursor->next(cursor)); + cursor->get_key(cursor, &kstr); + cursor->get_value(cursor, &vstr); + testutil_assert(strcmp(kstr, "a") == 0); + testutil_assert(strcmp(vstr, "0") == 0); + testutil_check(cursor->next(cursor)); + cursor->get_key(cursor, &kstr); + cursor->get_value(cursor, &vstr); + testutil_assert(strcmp(kstr, "b") == 0); + testutil_assert(strcmp(vstr, "1") == 0); + testutil_assert(cursor->next(cursor) == WT_NOTFOUND); + testutil_check(cursor->close(cursor)); + testutil_check(session->close(session, NULL)); + printf("Success\n"); + + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} -- cgit v1.2.1 From 52171b4c668528c80d1e2084183899f294d4c797 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 23 Jan 2017 00:51:14 -0500 Subject: WT-3144 Print WT_REF instead of WT_REF.page in verbose/debugging output. 
(#3258) --- src/btree/bt_debug.c | 2 +- src/btree/bt_split.c | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index b62125e069d..a89eca230fd 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -652,7 +652,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) page = ref->page; mod = page->modify; - WT_RET(ds->f(ds, "%p", (void *)page)); + WT_RET(ds->f(ds, "%p", (void *)ref)); switch (page->type) { case WT_PAGE_COL_INT: diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 6b0b8a08c02..7cfcd08f931 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -2086,8 +2086,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref); WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard)); if ((ret = __split_insert(session, ref)) != 0) { @@ -2178,8 +2177,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref); WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); if ((ret = __split_multi(session, ref, closing)) != 0 || closing) { @@ -2207,8 +2205,7 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref); WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); ret = __split_parent(session, ref, NULL, 0, 0, false, true); @@ -2229,8 +2226,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) page = ref->page; - __wt_verbose( - session, 
WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref); /* * This isn't a split: a reconciliation failed because we couldn't write -- cgit v1.2.1 From 5e6ffcc7ef98a609e4bbc0ecfef58dade45de1d7 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 23 Jan 2017 00:53:47 -0500 Subject: WT-3144 Make it less likely for random lookups to return WT_NOTFOUND (#3259) There may be empty pages in the tree, and they're useless to us when trying to find random samples. If we don't find a non-empty page in "entries" random guesses, take the first non-empty page in the tree. If the search page contains nothing other than empty pages, restart from the root some number of times before giving up. --- src/btree/row_srch.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index aa299a161da..5b3f1195784 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -792,9 +792,11 @@ __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *current, *descent; + uint32_t i, entries, retry; btree = S2BT(session); current = NULL; + retry = 100; if (0) { restart: /* @@ -812,8 +814,32 @@ restart: /* break; WT_INTL_INDEX_GET(session, page, pindex); - descent = pindex->index[ - __wt_random(&session->rnd) % pindex->entries]; + entries = pindex->entries; + + /* + * There may be empty pages in the tree, and they're useless to + * us. If we don't find a non-empty page in "entries" random + * guesses, take the first non-empty page in the tree. If the + * search page contains nothing other than empty pages, restart + * from the root some number of times before giving up. 
+ */ + for (i = 0; i < entries; ++i) { + descent = + pindex->index[__wt_random(&session->rnd) % entries]; + if (descent->state != WT_REF_DELETED) + break; + } + if (i == entries) + for (i = 0; i < entries; ++i) { + descent = pindex->index[i]; + if (descent->state != WT_REF_DELETED) + break; + } + if (i == entries) { + if (--retry > 0) + goto restart; + return (WT_NOTFOUND); + } /* * Swap the current page for the child page. If the page splits -- cgit v1.2.1 From f214daa45a860021f107c498ddfd1328b6b3f517 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 23 Jan 2017 07:49:41 -0500 Subject: WT-3144 bug fix: random cursor returns not-found when descending to an empty page. clang 3.8 complains descent might be left uninitialized in some case. I don't think that's possible, but it's a simple change. --- src/btree/row_srch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 5b3f1195784..1c3d5ad5daa 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -823,6 +823,7 @@ restart: /* * search page contains nothing other than empty pages, restart * from the root some number of times before giving up. */ + descent = NULL; for (i = 0; i < entries; ++i) { descent = pindex->index[__wt_random(&session->rnd) % entries]; @@ -835,7 +836,7 @@ restart: /* if (descent->state != WT_REF_DELETED) break; } - if (i == entries) { + if (i == entries || descent == NULL) { if (--retry > 0) goto restart; return (WT_NOTFOUND); -- cgit v1.2.1 From b2ab33d476c657120c56ed31aa05f54557f010e0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 23 Jan 2017 11:34:06 -0500 Subject: WT-3120 Fix ordering problem in connection_close for filesystem loaded in an extension (#3261) This commit represents fixes for Coverity errors, LeakSanitizer errors, and additional cleanup: * pread/pwrite return value is -1 on error, but the error is in errno. * Convert size_t and off_t to uintmax_t/PRIuMAX, not uint64_t/PRIu64. 
* Coverity ID 1369085 (#1 of 1): Extra sizeof expression (SIZEOF_MISMATCH) suspicious_pointer_arithmetic: Adding allocated * 8UL /* sizeof (char *) */ to pointer entries of type char ** is suspicious because adding an integral value to this pointer automatically scales that value by the size, 8 bytes, of the pointed-to type, char *. Most likely, the multiplication by sizeof (char *) in this expression is extraneous and should be eliminated. * CID 1369084 (#1 of 1): Resource leak (RESOURCE_LEAK) 9. overwrite_var: Overwriting handle ret in ret = 12 leaks the handle. * CID 1369083 (#1 of 1): Logically dead code (DEADCODE) dead_error_line: Execution cannot reach this statement: while (count > 0U) null: At condition entries != NULL, the value of entries must be NULL. dead_error_condition: The condition entries != NULL cannot be true. * Custom filesystems have to configure early-load, otherwise we'll have already configured a default filesystem by the time the extension is loaded. * Add early-load configuration to the wt3120_filesys test. * Add code to WiredTiger that fails if a custom filesystem is configured after we've already configured a default filesystem. --- examples/c/ex_file_system.c | 13 ++++++---- ext/test/fail_fs/fail_fs.c | 50 +++++++++++++++++++++++---------------- src/conn/conn_api.c | 10 ++++++++ test/csuite/wt3120_filesys/main.c | 7 +++--- 4 files changed, 51 insertions(+), 29 deletions(-) diff --git a/examples/c/ex_file_system.c b/examples/c/ex_file_system.c index 56869171558..e807ac54d3b 100644 --- a/examples/c/ex_file_system.c +++ b/examples/c/ex_file_system.c @@ -399,6 +399,7 @@ demo_fs_directory_list(WT_FILE_SYSTEM *file_system, uint32_t allocated, count; int ret = 0; char *name, **entries; + void *p; (void)session; /* Unused */ @@ -424,14 +425,16 @@ demo_fs_directory_list(WT_FILE_SYSTEM *file_system, * matter if the list is a bit longer than necessary. 
*/ if (count >= allocated) { - entries = realloc( - entries, (allocated + 10) * sizeof(char *)); - if (entries == NULL) { + p = realloc( + entries, (allocated + 10) * sizeof(*entries)); + if (p == NULL) { ret = ENOMEM; goto err; } - memset(entries + allocated * sizeof(char *), - 0, 10 * sizeof(char *)); + + entries = p; + memset(entries + allocated * sizeof(*entries), + 0, 10 * sizeof(*entries)); allocated += 10; } entries[count++] = strdup(name); diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index e2538023a2c..29d469768c5 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -224,10 +224,11 @@ fail_file_read(WT_FILE_HANDLE *file_handle, chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; if ((nr = pread(fail_fh->fd, addr, chunk, offset)) <= 0) { (void)wtext->err_printf(wtext, session, - "%s: handle-read: failed to read %" PRIu64 - " bytes at offset %" PRIu64 ": %s", - fail_fh->iface.name, (uint64_t)len, - (uint64_t)offset, wtext->strerror(wtext, NULL, nr)); + "%s: handle-read: failed to read %" PRIuMAX + " bytes at offset %" PRIuMAX ": %s", + fail_fh->iface.name, + (uintmax_t)len, (uintmax_t)offset, + wtext->strerror(wtext, NULL, errno)); ret = (nr == 0 ? WT_ERROR : errno); break; } @@ -327,10 +328,11 @@ fail_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session, chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; if ((nr = pwrite(fail_fh->fd, addr, chunk, offset)) <= 0) { (void)wtext->err_printf(wtext, session, - "%s: handle-write: failed to write %" PRIu64 - " bytes at offset %" PRIu64 ": %s", - fail_fh->iface.name, (uint64_t)len, - (uint64_t)offset, wtext->strerror(wtext, NULL, nr)); + "%s: handle-write: failed to write %" PRIuMAX + " bytes at offset %" PRIuMAX ": %s", + fail_fh->iface.name, + (uintmax_t)len, (uintmax_t)offset, + wtext->strerror(wtext, NULL, errno)); ret = (nr == 0 ? 
WT_ERROR : errno); break; } @@ -376,6 +378,7 @@ fail_fs_directory_list(WT_FILE_SYSTEM *file_system, uint32_t allocated, count; int ret; char *name, **entries; + void *p; (void)session; /* Unused */ @@ -401,14 +404,15 @@ fail_fs_directory_list(WT_FILE_SYSTEM *file_system, * matter if the list is a bit longer than necessary. */ if (count >= allocated) { - entries = realloc( - entries, (allocated + 10) * sizeof(char *)); - if (entries == NULL) { + p = realloc( + entries, (allocated + 10) * sizeof(*entries)); + if (p == NULL) { ret = ENOMEM; goto err; } - memset(entries + allocated * sizeof(char *), - 0, 10 * sizeof(char *)); + entries = p; + memset(entries + allocated * sizeof(*entries), + 0, 10 * sizeof(*entries)); allocated += 10; } entries[count++] = strdup(name); @@ -476,16 +480,17 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, FAIL_FILE_HANDLE *fail_fh; FAIL_FILE_SYSTEM *fail_fs; WT_FILE_HANDLE *file_handle; - int open_flags; - int ret; + int fd, open_flags, ret; (void)file_type; /* Unused */ (void)session; /* Unused */ *file_handlep = NULL; - ret = 0; - fail_fs = (FAIL_FILE_SYSTEM *)file_system; + fail_fh = NULL; + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + fd = -1; + ret = 0; fail_fs_lock(&fail_fs->lock); @@ -499,8 +504,10 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, else open_flags |= O_RDWR; - if ((ret = open(name, open_flags, 0666)) < 0) + if ((fd = open(name, open_flags, 0666)) < 0) { + ret = errno; goto err; + } /* We create a handle structure for each open. */ if ((fail_fh = calloc(1, sizeof(FAIL_FILE_HANDLE))) == NULL) { @@ -510,8 +517,7 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, /* Initialize private information. */ fail_fh->fail_fs = fail_fs; - fail_fh->fd = ret; - ret = 0; + fail_fh->fd = fd; /* Initialize public information. 
*/ file_handle = (WT_FILE_HANDLE *)fail_fh; @@ -542,7 +548,9 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, *file_handlep = file_handle; if (0) { -err: free(fail_fh); +err: if (fd != -1) + (void)close(fd); + free(fail_fh); } fail_fs_unlock(&fail_fs->lock); diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index f691a76b1f2..d76e08067b5 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -1987,6 +1987,16 @@ __conn_set_file_system( CONNECTION_API_CALL(conn, session, set_file_system, config, cfg); WT_UNUSED(cfg); + /* + * You can only configure a file system once, and attempting to do it + * again probably means the extension argument didn't have early-load + * set and we've already configured the default file system. + */ + if (conn->file_system != NULL) + WT_ERR_MSG(session, EPERM, + "filesystem already configured; custom filesystems should " + "enable \"early_load\" configuration"); + conn->file_system = file_system; err: API_END_RET(session, ret); diff --git a/test/csuite/wt3120_filesys/main.c b/test/csuite/wt3120_filesys/main.c index abf660db046..a4b830d6a70 100644 --- a/test/csuite/wt3120_filesys/main.c +++ b/test/csuite/wt3120_filesys/main.c @@ -36,6 +36,8 @@ * than that. 
*/ +#define WT_FAIL_FS_LIB "../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so" + int main(int argc, char *argv[]) { @@ -43,7 +45,7 @@ main(int argc, char *argv[]) WT_CURSOR *cursor; WT_SESSION *session; char *kstr, *vstr; - char buf[100]; + char buf[1024]; opts = &_opts; memset(opts, 0, sizeof(*opts)); @@ -51,8 +53,7 @@ main(int argc, char *argv[]) testutil_make_work_dir(opts->home); snprintf(buf, sizeof(buf), - "create,extensions=" - "[\"../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so\"]"); + "create,extensions=(" WT_FAIL_FS_LIB "=(early_load=true))"); testutil_check(wiredtiger_open(opts->home, NULL, buf, &opts->conn)); testutil_check( opts->conn->open_session(opts->conn, NULL, NULL, &session)); -- cgit v1.2.1 From d7dc59045b87a37f029c0046082489af557c7018 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Mon, 23 Jan 2017 17:49:50 -0500 Subject: WT-2790 Fix a text case false positive in test_sweep01. (#3263) --- test/suite/test_sweep01.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py index 71f8fcb180e..5559190caca 100644 --- a/test/suite/test_sweep01.py +++ b/test/suite/test_sweep01.py @@ -116,10 +116,15 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): # Give slow machines time to process files. stat_cursor = self.session.open_cursor('statistics:', None, None) this_nfile = stat_cursor[stat.conn.file_open][2] + removed = stat_cursor[stat.conn.dh_sweep_remove][2] stat_cursor.close() self.pr("==== loop " + str(sleep)) self.pr("this_nfile " + str(this_nfile)) - if this_nfile == final_nfile: + self.pr("removed " + str(removed)) + # On slow machines there can be a lag where files get closed but + # the sweep server cannot yet remove the handles. So wait for the + # removed statistic to indicate forward progress too. 
+ if this_nfile == final_nfile and removed != remove1: break c.close() self.pr("Sweep loop took " + str(sleep)) -- cgit v1.2.1 From 75345eabdf5e54aa56fa51134fc53d5ae75aa7d8 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 23 Jan 2017 18:05:36 -0500 Subject: WT-3120 Add error handling to get_key/get_value in a test (#3262) --- test/csuite/wt3120_filesys/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/csuite/wt3120_filesys/main.c b/test/csuite/wt3120_filesys/main.c index a4b830d6a70..09dce624066 100644 --- a/test/csuite/wt3120_filesys/main.c +++ b/test/csuite/wt3120_filesys/main.c @@ -80,13 +80,13 @@ main(int argc, char *argv[]) testutil_check(session->open_cursor(session, opts->uri, NULL, NULL, &cursor)); testutil_check(cursor->next(cursor)); - cursor->get_key(cursor, &kstr); - cursor->get_value(cursor, &vstr); + testutil_check(cursor->get_key(cursor, &kstr)); + testutil_check(cursor->get_value(cursor, &vstr)); testutil_assert(strcmp(kstr, "a") == 0); testutil_assert(strcmp(vstr, "0") == 0); testutil_check(cursor->next(cursor)); - cursor->get_key(cursor, &kstr); - cursor->get_value(cursor, &vstr); + testutil_check(cursor->get_key(cursor, &kstr)); + testutil_check(cursor->get_value(cursor, &vstr)); testutil_assert(strcmp(kstr, "b") == 0); testutil_assert(strcmp(vstr, "1") == 0); testutil_assert(cursor->next(cursor) == WT_NOTFOUND); -- cgit v1.2.1 From 314675c75a777f18995cbac6303b3065c88f5e06 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Tue, 24 Jan 2017 01:30:09 -0500 Subject: WT-3137 Fix a hang in logging due to a race condition (#3223) --- src/include/log.h | 1 + src/log/log_slot.c | 199 +++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 156 insertions(+), 44 deletions(-) diff --git a/src/include/log.h b/src/include/log.h index d9fea892c68..82fcbf1be58 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -254,6 +254,7 @@ struct __wt_log { #define WT_SLOT_POOL 128 WT_LOGSLOT *active_slot; /* 
Active slot */ WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ + int32_t pool_index; /* Index into slot pool */ size_t slot_buf_size; /* Buffer size for slots */ #ifdef HAVE_DIAGNOSTIC uint64_t write_calls; /* Calls to log_write */ diff --git a/src/log/log_slot.c b/src/log/log_slot.c index a29a34e5652..cb44cadcb70 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -8,6 +8,49 @@ #include "wt_internal.h" +#ifdef HAVE_DIAGNOSTIC +/* + * __log_slot_dump -- + * Dump the entire slot state. + */ +static void +__log_slot_dump(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LOGSLOT *slot; + int32_t earliest, i; + + conn = S2C(session); + log = conn->log; + earliest = 0; + for (i = 0; i < WT_SLOT_POOL; i++) { + slot = &log->slot_pool[i]; + if (__wt_log_cmp(&slot->slot_release_lsn, + &log->slot_pool[earliest].slot_release_lsn) < 0) + earliest = i; + __wt_errx(session, "Slot %d:", i); + __wt_errx(session, " State: %" PRIx64 " Flags: %" PRIx32, + slot->slot_state, slot->flags); + __wt_errx(session, " Start LSN: %" PRIu32 "/%" PRIu32, + slot->slot_start_lsn.l.file, slot->slot_start_lsn.l.offset); + __wt_errx(session, " End LSN: %" PRIu32 "/%" PRIu32, + slot->slot_end_lsn.l.file, slot->slot_end_lsn.l.offset); + __wt_errx(session, " Release LSN: %" PRIu32 "/%" PRIu32, + slot->slot_release_lsn.l.file, + slot->slot_release_lsn.l.offset); + __wt_errx(session, " Offset: start: %" PRIu32 + " last:%" PRIu32, (uint32_t)slot->slot_start_offset, + (uint32_t)slot->slot_last_offset); + __wt_errx(session, " Unbuffered: %" PRId64 + " error: %" PRId32, slot->slot_unbuffered, + slot->slot_error); + } + __wt_errx(session, "Earliest slot: %d", earliest); + +} +#endif + /* * __wt_log_slot_activate -- * Initialize a slot to become active. @@ -21,7 +64,6 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) conn = S2C(session); log = conn->log; - slot->slot_state = 0; /* * !!! 
slot_release_lsn must be set outside this function because * this function may be called after a log file switch and the @@ -30,12 +72,19 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) * set for closing the file handle on a log file switch. The flags * are reset when the slot is freed. See log_slot_free. */ + slot->slot_unbuffered = 0; slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn; slot->slot_start_offset = log->alloc_lsn.l.offset; slot->slot_last_offset = log->alloc_lsn.l.offset; slot->slot_fh = log->log_fh; slot->slot_error = 0; - slot->slot_unbuffered = 0; + WT_DIAGNOSTIC_YIELD; + /* + * Set the slot state last. Other threads may have a stale pointer + * to this slot and could try to alter the state and other fields once + * they see the state cleared. + */ + WT_PUBLISH(slot->slot_state, 0); } /* @@ -50,6 +99,10 @@ __log_slot_close( WT_CONNECTION_IMPL *conn; WT_LOG *log; int64_t end_offset, new_state, old_state; +#ifdef HAVE_DIAGNOSTIC + struct timespec begin, now; + int count; +#endif WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); WT_ASSERT(session, releasep != NULL); @@ -101,9 +154,32 @@ retry: * that value. If the state is unbuffered, wait for the unbuffered * size to be set. 
*/ - while (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state) && - slot->slot_unbuffered == 0) - __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + count = 0; + __wt_epoch(session, &begin); +#endif + if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) { + while (slot->slot_unbuffered == 0) { + __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + ++count; + if (count > WT_MILLION) { + __wt_epoch(session, &now); + if (WT_TIMEDIFF_SEC(now, begin) > 10) { + __wt_errx(session, "SLOT_CLOSE: Slot %" + PRIu32 " Timeout unbuffered, state 0x%" + PRIx64 " unbuffered %" PRIu64, + (uint32_t)(slot - &log->slot_pool[0]), + slot->slot_state, + slot->slot_unbuffered); + __log_slot_dump(session); + __wt_abort(session); + } + count = 0; + } +#endif + } + } end_offset = WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered; @@ -218,7 +294,11 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LOGSLOT *slot; - int32_t i; + int32_t i, pool_i; +#ifdef HAVE_DIAGNOSTIC + struct timespec begin, now; + int count; +#endif WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); conn = S2C(session); @@ -232,16 +312,22 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) WT_LOG_SLOT_OPEN(slot->slot_state)) return (0); +#ifdef HAVE_DIAGNOSTIC + count = 0; + __wt_epoch(session, &begin); +#endif /* * Keep trying until we can find a free slot. */ for (;;) { /* - * For now just restart at 0. We could use log->pool_index - * if that is inefficient. + * Rotate among the slots to lessen collisions. 
*/ - for (i = 0; i < WT_SLOT_POOL; i++) { - slot = &log->slot_pool[i]; + for (i = 0, pool_i = log->pool_index; i < WT_SLOT_POOL; + i++, pool_i++) { + if (pool_i >= WT_SLOT_POOL) + pool_i = 0; + slot = &log->slot_pool[pool_i]; if (slot->slot_state == WT_LOG_SLOT_FREE) { /* * Acquire our starting position in the @@ -256,6 +342,7 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) WT_STAT_CONN_INCR(session, log_slot_transitions); log->active_slot = slot; + log->pool_index = pool_i; return (0); } } @@ -264,6 +351,19 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) */ __wt_cond_auto_signal(session, conn->log_wrlsn_cond); __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + ++count; + if (count > WT_MILLION) { + __wt_epoch(session, &now); + if (WT_TIMEDIFF_SEC(now, begin) > 10) { + __wt_errx(session, + "SLOT_NEW: Timeout free slot"); + __log_slot_dump(session); + __wt_abort(session); + } + count = 0; + } +#endif } /* NOTREACHED */ } @@ -311,10 +411,13 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) /* * We cannot initialize the release LSN in the activate function * because that function can be called after a log file switch. + * The release LSN is usually the same as the slot_start_lsn except + * around a log file switch. */ slot->slot_release_lsn = log->alloc_lsn; __wt_log_slot_activate(session, slot); log->active_slot = slot; + log->pool_index = 0; if (0) { err: while (--i >= 0) @@ -370,53 +473,62 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, WT_LOGSLOT *slot; int64_t flag_state, new_state, old_state, released; int32_t join_offset, new_join; -#ifdef HAVE_DIAGNOSTIC - bool unbuf_force; -#endif + bool unbuffered, yld; conn = S2C(session); log = conn->log; WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + WT_ASSERT(session, mysize != 0); /* * There should almost always be a slot open. 
*/ + unbuffered = false; #ifdef HAVE_DIAGNOSTIC - unbuf_force = (++log->write_calls % WT_THOUSAND) == 0; + yld = (++log->write_calls % 7) == 0; + if ((log->write_calls % WT_THOUSAND) == 0 || + mysize > WT_LOG_SLOT_BUF_MAX) { +#else + yld = false; + if (mysize > WT_LOG_SLOT_BUF_MAX) { #endif + unbuffered = true; + F_SET(myslot, WT_MYSLOT_UNBUFFERED); + } for (;;) { WT_BARRIER(); slot = log->active_slot; old_state = slot->slot_state; - /* - * Try to join our size into the existing size and - * atomically write it back into the state. - */ - flag_state = WT_LOG_SLOT_FLAGS(old_state); - released = WT_LOG_SLOT_RELEASED(old_state); - join_offset = WT_LOG_SLOT_JOINED(old_state); -#ifdef HAVE_DIAGNOSTIC - if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) { -#else - if (mysize > WT_LOG_SLOT_BUF_MAX) { -#endif - new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; - F_SET(myslot, WT_MYSLOT_UNBUFFERED); - myslot->slot = slot; - } else - new_join = join_offset + (int32_t)mysize; - new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( - (int64_t)new_join, (int64_t)released, (int64_t)flag_state); - - /* - * Check if the slot is open for joining and we are able to - * swap in our size into the state. - */ - if (WT_LOG_SLOT_OPEN(old_state) && - __wt_atomic_casiv64( - &slot->slot_state, old_state, new_state)) - break; + if (WT_LOG_SLOT_OPEN(old_state)) { + /* + * Try to join our size into the existing size and + * atomically write it back into the state. + */ + flag_state = WT_LOG_SLOT_FLAGS(old_state); + released = WT_LOG_SLOT_RELEASED(old_state); + join_offset = WT_LOG_SLOT_JOINED(old_state); + if (unbuffered) + new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; + else + new_join = join_offset + (int32_t)mysize; + new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( + (int64_t)new_join, (int64_t)released, + (int64_t)flag_state); + + /* + * Braces used due to potential empty body warning. + */ + if (yld) { + WT_DIAGNOSTIC_YIELD; + } + /* + * Attempt to swap our size into the state. 
+ */ + if (__wt_atomic_casiv64( + &slot->slot_state, old_state, new_state)) + break; + } /* * The slot is no longer open or we lost the race to * update it. Yield and try again. @@ -428,8 +540,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, * We joined this slot. Fill in our information to return to * the caller. */ - if (mysize != 0) - WT_STAT_CONN_INCR(session, log_slot_joins); + WT_STAT_CONN_INCR(session, log_slot_joins); if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) F_SET(slot, WT_SLOT_SYNC_DIR); if (LF_ISSET(WT_LOG_FLUSH)) -- cgit v1.2.1 From 3695a0dd4dbb1612518ed3f68a2e3c6e7550e0ed Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 24 Jan 2017 09:09:24 -0500 Subject: WT-3137 Fix a hang in logging due to a race condition (#3266) Lint: Don't print int32_t's with %d. WT_LOGSLOT.slot_error is an int, not an int32_t. Don't print off_t's as 32-bits, use the maximum size unsigned object. --- src/include/log.h | 2 +- src/log/log_slot.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/include/log.h b/src/include/log.h index 82fcbf1be58..a6be3582b4d 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -163,7 +163,7 @@ struct __wt_logslot { WT_CACHE_LINE_PAD_BEGIN volatile int64_t slot_state; /* Slot state */ int64_t slot_unbuffered; /* Unbuffered data in this slot */ - int32_t slot_error; /* Error value */ + int slot_error; /* Error value */ wt_off_t slot_start_offset; /* Starting file offset */ wt_off_t slot_last_offset; /* Last record offset */ WT_LSN slot_release_lsn; /* Slot release LSN */ diff --git a/src/log/log_slot.c b/src/log/log_slot.c index cb44cadcb70..d70c0d689be 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -19,7 +19,7 @@ __log_slot_dump(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LOGSLOT *slot; - int32_t earliest, i; + int earliest, i; conn = S2C(session); log = conn->log; @@ -39,9 +39,9 @@ __log_slot_dump(WT_SESSION_IMPL *session) __wt_errx(session, " Release 
LSN: %" PRIu32 "/%" PRIu32, slot->slot_release_lsn.l.file, slot->slot_release_lsn.l.offset); - __wt_errx(session, " Offset: start: %" PRIu32 - " last:%" PRIu32, (uint32_t)slot->slot_start_offset, - (uint32_t)slot->slot_last_offset); + __wt_errx(session, " Offset: start: %" PRIuMAX + " last:%" PRIuMAX, (uintmax_t)slot->slot_start_offset, + (uintmax_t)slot->slot_last_offset); __wt_errx(session, " Unbuffered: %" PRId64 " error: %" PRId32, slot->slot_unbuffered, slot->slot_error); -- cgit v1.2.1 From d5ae763f990af5ba5522b07c18b9b37fdaae0e88 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 24 Jan 2017 20:28:32 -0500 Subject: WT-3113 Add a verbose mode to dump the cache when eviction is stuck. (#3234) --- dist/api_data.py | 1 + dist/flags.py | 1 + src/config/config_def.c | 60 +++---- src/conn/conn_api.c | 1 + src/evict/evict_lru.c | 370 ++++++++++++++++-------------------------- src/include/cache.h | 2 +- src/include/extern.h | 3 +- src/include/flags.h | 45 ++--- src/include/wiredtiger.in | 26 +-- src/txn/txn.c | 95 +++++++++++ test/suite/test_reconfig04.py | 2 - 11 files changed, 305 insertions(+), 301 deletions(-) diff --git a/dist/api_data.py b/dist/api_data.py index 324d1e4f281..b1332320a7c 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -524,6 +524,7 @@ connection_runtime_config = [ 'checkpoint', 'compact', 'evict', + 'evict_stuck', 'evictserver', 'fileops', 'handleops', diff --git a/dist/flags.py b/dist/flags.py index 70e18712839..55ce233e60d 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -64,6 +64,7 @@ flags = { 'VERB_COMPACT', 'VERB_EVICT', 'VERB_EVICTSERVER', + 'VERB_EVICT_STUCK', 'VERB_FILEOPS', 'VERB_HANDLEOPS', 'VERB_LOG', diff --git a/src/config/config_def.c b/src/config/config_def.c index 6a93c1d05e2..b11a8d63fdb 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -147,12 +147,12 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { confchk_WT_CONNECTION_reconfigure_statistics_log_subconfigs, 5 }, { 
"verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," - "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," - "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," - "\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -750,12 +750,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "use_environment_priv", "boolean", NULL, NULL, NULL, 0 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," - "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," - "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," - "\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -837,12 +837,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "use_environment_priv", "boolean", NULL, NULL, NULL, 0 }, { "verbose", "list", 
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," - "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," - "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," - "\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -919,12 +919,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," - "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," - "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," - "\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -1001,12 +1001,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { 
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," - "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," - "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," - "\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index d76e08067b5..124250a7a7d 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -1798,6 +1798,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "checkpoint", WT_VERB_CHECKPOINT }, { "compact", WT_VERB_COMPACT }, { "evict", WT_VERB_EVICT }, + { "evict_stuck", WT_VERB_EVICT_STUCK }, { "evictserver", WT_VERB_EVICTSERVER }, { "fileops", WT_VERB_FILEOPS }, { "handleops", WT_VERB_HANDLEOPS }, diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 9b969de9a9e..0cf746f84eb 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -285,7 +285,7 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) conn = S2C(session); cache = conn->cache; -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) /* * Ensure the cache stuck timer is initialized when starting eviction. 
*/ @@ -353,12 +353,12 @@ err: WT_PANIC_MSG(session, ret, "cache eviction thread error"); static int __evict_server(WT_SESSION_IMPL *session, bool *did_work) { +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) + struct timespec now; +#endif WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; -#ifdef HAVE_DIAGNOSTIC - struct timespec now; -#endif uint64_t orig_pages_evicted; conn = S2C(session); @@ -395,11 +395,15 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) cache->pages_evicted = 0; } else if (cache->pages_evicted != cache->pages_evict) { cache->pages_evicted = cache->pages_evict; -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) __wt_epoch(session, &cache->stuck_ts); } else if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) { /* - * After being stuck for 5 minutes, give up. + * If we're stuck for 5 minutes in diagnostic mode, or the + * verbose evict_stuck flag is configured, log the cache + * and transaction state. + * + * If we're stuck for 5 minutes in diagnostic mode, give up. * * We don't do this check for in-memory workloads because * application threads are not blocked by the cache being full. @@ -408,11 +412,22 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) */ __wt_epoch(session, &now); if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) { - ret = ETIMEDOUT; - __wt_err(session, ret, +#if defined(HAVE_DIAGNOSTIC) + __wt_err(session, ETIMEDOUT, "Cache stuck for too long, giving up"); - WT_TRET(__wt_dump_stuck_info(session, NULL)); + ret = ETIMEDOUT; + WT_TRET(__wt_verbose_dump_txn(session)); + WT_TRET(__wt_verbose_dump_cache(session)); return (ret); +#elif defined(HAVE_VERBOSE) + if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) { + WT_RET(__wt_verbose_dump_txn(session)); + WT_RET(__wt_verbose_dump_cache(session)); + + /* Reset the timer. 
*/ + __wt_epoch(session, &cache->stuck_ts); + } +#endif } #endif } @@ -2184,226 +2199,138 @@ __wt_evict_priority_clear(WT_SESSION_IMPL *session) S2BT(session)->evict_priority = 0; } -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) /* - * __dump_txn_state -- - * Output debugging information about the global transaction state. + * __verbose_dump_cache_single -- + * Output diagnostic information about a single file in the cache. */ static int -__dump_txn_state(WT_SESSION_IMPL *session, FILE *fp) +__verbose_dump_cache_single(WT_SESSION_IMPL *session, + uint64_t *total_bytesp, uint64_t *total_dirty_bytesp) { - WT_CONNECTION_IMPL *conn; - WT_TXN_GLOBAL *txn_global; - WT_TXN *txn; - WT_TXN_STATE *s; - const char *iso_tag; - uint64_t id; - uint32_t i, session_cnt; - - conn = S2C(session); - txn_global = &conn->txn_global; - WT_ORDERED_READ(session_cnt, conn->session_cnt); - - /* Note: odd string concatenation avoids spelling errors. */ - if (fprintf(fp, "==========\n" "transaction state dump\n") < 0) - return (EIO); - - if (fprintf(fp, - "current ID: %" PRIu64 "\n" - "last running ID: %" PRIu64 "\n" - "oldest ID: %" PRIu64 "\n" - "oldest named snapshot ID: %" PRIu64 "\n", - txn_global->current, txn_global->last_running, - txn_global->oldest_id, txn_global->nsnap_oldest_id) < 0) - return (EIO); - - if (fprintf(fp, - "checkpoint running? %s\n" - "checkpoint generation: %" PRIu64 "\n" - "checkpoint pinned ID: %" PRIu64 "\n" - "checkpoint txn ID: %" PRIu64 "\n" - "session count: %" PRIu32 "\n", - txn_global->checkpoint_running ? "yes" : "no", - txn_global->checkpoint_gen, - txn_global->checkpoint_pinned, - txn_global->checkpoint_txnid, - session_cnt) < 0) - return (EIO); - - if (fprintf(fp, "Dumping transaction state of active sessions\n") < 0) - return (EIO); - - /* - * Walk each session transaction state and dump information. 
Accessing - * the content of session handles is not thread safe, so some - * information may change while traversing if other threads are active - * at the same time, which is OK since this is diagnostic code. - */ - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* Skip sessions with no active transaction */ - if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE) - continue; + WT_DATA_HANDLE *dhandle; + WT_PAGE *page; + WT_REF *next_walk; + size_t size; + uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes; + uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages; + uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes; + uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages; - txn = &conn->sessions[i].txn; - iso_tag = "INVALID"; - switch (txn->isolation) { - case WT_ISO_READ_COMMITTED: - iso_tag = "WT_ISO_READ_COMMITTED"; - break; - case WT_ISO_READ_UNCOMMITTED: - iso_tag = "WT_ISO_READ_UNCOMMITTED"; - break; - case WT_ISO_SNAPSHOT: - iso_tag = "WT_ISO_SNAPSHOT"; - break; + intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; + intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; + leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; + leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; + + next_walk = NULL; + while (__wt_tree_walk(session, &next_walk, + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && + next_walk != NULL) { + page = next_walk->page; + size = page->memory_footprint; + + if (WT_PAGE_IS_INTERNAL(page)) { + ++intl_pages; + intl_bytes += size; + intl_bytes_max = WT_MAX(intl_bytes_max, size); + if (__wt_page_is_modified(page)) { + ++intl_dirty_pages; + intl_dirty_bytes += size; + intl_dirty_bytes_max = + WT_MAX(intl_dirty_bytes_max, size); + } + } else { + ++leaf_pages; + leaf_bytes += size; + leaf_bytes_max = WT_MAX(leaf_bytes_max, size); + if (__wt_page_is_modified(page)) { + ++leaf_dirty_pages; + leaf_dirty_bytes += size; + leaf_dirty_bytes_max = + WT_MAX(leaf_dirty_bytes_max, size); + 
} } - - if (fprintf(fp, - "ID: %6" PRIu64 - ", mod count: %u" - ", pinned ID: %" PRIu64 - ", snap min: %" PRIu64 - ", snap max: %" PRIu64 - ", metadata pinned ID: %" PRIu64 - ", flags: 0x%08" PRIx32 - ", name: %s" - ", isolation: %s" "\n", - id, - txn->mod_count, - s->pinned_id, - txn->snap_min, - txn->snap_max, - s->metadata_pinned, - txn->flags, - conn->sessions[i].name == NULL ? - "EMPTY" : conn->sessions[i].name, - iso_tag) < 0) - return (EIO); } + dhandle = session->dhandle; + if (dhandle->checkpoint == NULL) + WT_RET(__wt_msg(session, "%s():", dhandle->name)); + else + WT_RET(__wt_msg(session, "%s(checkpoint=%s):", + dhandle->name, dhandle->checkpoint)); + if (intl_pages != 0) + WT_RET(__wt_msg(session, + "internal: " + "%" PRIu64 " pages, " + "%" PRIu64 "MB, " + "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " + "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " + "%" PRIu64 "MB max page, " + "%" PRIu64 "MB max dirty page", + intl_pages, + intl_bytes / WT_MEGABYTE, + intl_pages - intl_dirty_pages, + intl_dirty_pages, + (intl_bytes - intl_dirty_bytes) / WT_MEGABYTE, + intl_dirty_bytes / WT_MEGABYTE, + intl_bytes_max / WT_MEGABYTE, + intl_dirty_bytes_max / WT_MEGABYTE)); + if (leaf_pages != 0) + WT_RET(__wt_msg(session, + "leaf: " + "%" PRIu64 " pages, " + "%" PRIu64 "MB, " + "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " + "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " + "%" PRIu64 "MB max page, " + "%" PRIu64 "MB max dirty page", + leaf_pages, + leaf_bytes / WT_MEGABYTE, + leaf_pages - leaf_dirty_pages, + leaf_dirty_pages, + (leaf_bytes - leaf_dirty_bytes) / WT_MEGABYTE, + leaf_dirty_bytes / WT_MEGABYTE, + leaf_bytes_max / WT_MEGABYTE, + leaf_dirty_bytes_max / WT_MEGABYTE)); + + *total_bytesp += intl_bytes + leaf_bytes; + *total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes; + return (0); } /* - * __dump_cache -- - * Output debugging information about the size of the files in cache. + * __wt_verbose_dump_cache -- + * Output diagnostic information about the cache. 
*/ -static int -__dump_cache(WT_SESSION_IMPL *session, FILE *fp) +int +__wt_verbose_dump_cache(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle, *saved_dhandle; - WT_PAGE *page; - WT_REF *next_walk; - uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes; - uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages; - uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes; - uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; uint64_t total_bytes, total_dirty_bytes; - size_t size; conn = S2C(session); total_bytes = total_dirty_bytes = 0; - /* Note: odd string concatenation avoids spelling errors. */ - if (fprintf(fp, "==========\n" "cache dump\n") < 0) - return (EIO); + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + WT_RET(__wt_msg(session, "cache dump")); - saved_dhandle = session->dhandle; + __wt_spin_lock(session, &conn->dhandle_lock); TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; - intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; - intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; - leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; - leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; - - next_walk = NULL; - session->dhandle = dhandle; - while (__wt_tree_walk(session, &next_walk, - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && - next_walk != NULL) { - page = next_walk->page; - size = page->memory_footprint; - - if (WT_PAGE_IS_INTERNAL(page)) { - ++intl_pages; - intl_bytes += size; - intl_bytes_max = WT_MAX(intl_bytes_max, size); - if (__wt_page_is_modified(page)) { - ++intl_dirty_pages; - intl_dirty_bytes += size; - intl_dirty_bytes_max = - WT_MAX(intl_dirty_bytes_max, size); - } - } else { - ++leaf_pages; - leaf_bytes += size; - leaf_bytes_max = WT_MAX(leaf_bytes_max, size); - if (__wt_page_is_modified(page)) { - ++leaf_dirty_pages; - leaf_dirty_bytes += size; 
- leaf_dirty_bytes_max = - WT_MAX(leaf_dirty_bytes_max, size); - } - } - } - session->dhandle = NULL; - - if (dhandle->checkpoint == NULL) { - if (fprintf(fp, - "%s(): \n", dhandle->name) < 0) - return (EIO); - } else { - if (fprintf(fp, "%s(checkpoint=%s): \n", - dhandle->name, dhandle->checkpoint) < 0) - return (EIO); - } - if (intl_pages != 0) { - if (fprintf(fp, - "\t" "internal: " - "%" PRIu64 " pages, " - "%" PRIu64 "MB, " - "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " - "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " - "%" PRIu64 "MB max page, " - "%" PRIu64 "MB max dirty page\n", - intl_pages, - intl_bytes >> 20, - intl_pages - intl_dirty_pages, - intl_dirty_pages, - (intl_bytes - intl_dirty_bytes) >> 20, - intl_dirty_bytes >> 20, - intl_bytes_max >> 20, - intl_dirty_bytes_max >> 20) < 0) - return (EIO); - } - if (leaf_pages != 0) { - if (fprintf(fp, - "\t" "leaf: " - "%" PRIu64 " pages, " - "%" PRIu64 "MB, " - "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " - "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " - "%" PRIu64 "MB max page, " - "%" PRIu64 "MB max dirty page\n", - leaf_pages, - leaf_bytes >> 20, - leaf_pages - leaf_dirty_pages, - leaf_dirty_pages, - (leaf_bytes - leaf_dirty_bytes) >> 20, - leaf_dirty_bytes >> 20, - leaf_bytes_max >> 20, - leaf_dirty_bytes_max >> 20) < 0) - return (EIO); - } - - total_bytes += intl_bytes + leaf_bytes; - total_dirty_bytes += intl_dirty_bytes + leaf_dirty_bytes; + WT_WITH_DHANDLE(session, dhandle, + ret = __verbose_dump_cache_single( + session, &total_bytes, &total_dirty_bytes)); + if (ret != 0) + break; } - session->dhandle = saved_dhandle; + __wt_spin_unlock(session, &conn->dhandle_lock); + WT_RET(ret); /* * Apply the overhead percentage so our total bytes are comparable with @@ -2411,39 +2338,16 @@ __dump_cache(WT_SESSION_IMPL *session, FILE *fp) */ total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes); - if (fprintf(fp, + WT_RET(__wt_msg(session, "cache dump: " - "total found: %" PRIu64 "MB vs tracked inuse 
%" PRIu64 "MB\n" - "total dirty bytes: %" PRIu64 "MB\n", - total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20, - total_dirty_bytes >> 20) < 0) - return (EIO); - if (fprintf(fp, "==========\n") < 0) - return (EIO); + "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB", + total_bytes / WT_MEGABYTE, + __wt_cache_bytes_inuse(conn->cache) / WT_MEGABYTE)); + WT_RET(__wt_msg(session, + "total dirty bytes: %" PRIu64 "MB", + total_dirty_bytes / WT_MEGABYTE)); + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); return (0); } - -/* - * __wt_dump_stuck_info -- - * Dump debugging information to a file (default stderr) about the state - * of WiredTiger when we have determined that the cache is stuck full. - */ -int -__wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile) -{ - FILE *fp; - WT_DECL_RET; - - if (ofile == NULL) - fp = stderr; - else if ((fp = fopen(ofile, "w")) == NULL) - return (EIO); - - WT_ERR(__dump_txn_state(session, fp)); - WT_ERR(__dump_cache(session, fp)); -err: if (ofile != NULL && fclose(fp) != 0) - return (EIO); - return (ret); -} #endif diff --git a/src/include/cache.h b/src/include/cache.h index 70f6169200d..abd5a1901f7 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -83,7 +83,7 @@ struct __wt_cache { uint64_t worker_evicts; /* Pages evicted by worker threads */ uint64_t evict_max_page_size; /* Largest page seen at eviction */ -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) struct timespec stuck_ts; /* Stuck timestamp */ #endif diff --git a/src/include/extern.h b/src/include/extern.h index 16b3c916b24..88fb8823930 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -352,7 +352,7 @@ extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -741,6 +741,7 @@ extern void __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATT extern void __wt_txn_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/flags.h b/src/include/flags.h index 2f0c207078a..0b92a12c686 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -90,28 +90,29 @@ #define WT_VERB_COMPACT 0x00000008 #define WT_VERB_EVICT 0x00000010 #define WT_VERB_EVICTSERVER 0x00000020 -#define WT_VERB_FILEOPS 0x00000040 -#define WT_VERB_HANDLEOPS 0x00000080 -#define WT_VERB_LOG 0x00000100 -#define WT_VERB_LSM 0x00000200 -#define WT_VERB_LSM_MANAGER 0x00000400 -#define WT_VERB_METADATA 0x00000800 -#define WT_VERB_MUTEX 0x00001000 -#define WT_VERB_OVERFLOW 0x00002000 -#define WT_VERB_READ 0x00004000 -#define WT_VERB_REBALANCE 0x00008000 -#define WT_VERB_RECONCILE 0x00010000 -#define WT_VERB_RECOVERY 0x00020000 -#define WT_VERB_RECOVERY_PROGRESS 0x00040000 -#define WT_VERB_SALVAGE 0x00080000 -#define WT_VERB_SHARED_CACHE 0x00100000 -#define WT_VERB_SPLIT 0x00200000 -#define WT_VERB_TEMPORARY 0x00400000 -#define WT_VERB_THREAD_GROUP 0x00800000 -#define WT_VERB_TRANSACTION 0x01000000 -#define WT_VERB_VERIFY 0x02000000 -#define WT_VERB_VERSION 0x04000000 -#define WT_VERB_WRITE 0x08000000 +#define WT_VERB_EVICT_STUCK 0x00000040 +#define WT_VERB_FILEOPS 0x00000080 +#define WT_VERB_HANDLEOPS 0x00000100 +#define WT_VERB_LOG 0x00000200 +#define WT_VERB_LSM 0x00000400 +#define WT_VERB_LSM_MANAGER 0x00000800 +#define WT_VERB_METADATA 0x00001000 +#define WT_VERB_MUTEX 0x00002000 +#define WT_VERB_OVERFLOW 0x00004000 +#define WT_VERB_READ 0x00008000 +#define WT_VERB_REBALANCE 0x00010000 +#define WT_VERB_RECONCILE 0x00020000 +#define WT_VERB_RECOVERY 0x00040000 +#define WT_VERB_RECOVERY_PROGRESS 0x00080000 +#define WT_VERB_SALVAGE 0x00100000 +#define WT_VERB_SHARED_CACHE 0x00200000 +#define WT_VERB_SPLIT 0x00400000 +#define WT_VERB_TEMPORARY 0x00800000 +#define 
WT_VERB_THREAD_GROUP 0x01000000 +#define WT_VERB_TRANSACTION 0x02000000 +#define WT_VERB_VERIFY 0x04000000 +#define WT_VERB_VERSION 0x08000000 +#define WT_VERB_WRITE 0x10000000 #define WT_VISIBILITY_ERR 0x00000080 /* * flags section: END diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 90989cc679d..03bff7cd04f 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1982,12 +1982,13 @@ struct __wt_connection { * as a list\, such as "verbose=[evictserver\,read]"., a * list\, with values chosen from the following options: \c "api"\, \c * "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c - * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, - * \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c - * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c - * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, - * \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c "verify"\, - * \c "version"\, \c "write"; default empty.} + * "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c "handleops"\, \c + * "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c + * "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c + * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c + * "shared_cache"\, \c "split"\, \c "temporary"\, \c "thread_group"\, \c + * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default + * empty.} * @configend * @errors */ @@ -2513,12 +2514,13 @@ struct __wt_connection { * WiredTiger is configured with --enable-verbose. 
Options are given as a * list\, such as "verbose=[evictserver\,read]"., a list\, with * values chosen from the following options: \c "api"\, \c "block"\, \c - * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, - * \c "handleops"\, \c "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c - * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c - * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c - * "split"\, \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c - * "verify"\, \c "version"\, \c "write"; default empty.} + * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evict_stuck"\, \c + * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, \c + * "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c + * "rebalance"\, \c "reconcile"\, \c "recovery"\, \c "recovery_progress"\, \c + * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c + * "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c "write"; + * default empty.} * @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to * files. Ignored on non-Windows systems. Options are given as a list\, such * as "write_through=[data]". Configuring \c write_through requires diff --git a/src/txn/txn.c b/src/txn/txn.c index 660d37b17d5..e5e59c2b901 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -803,3 +803,98 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock); __wt_free(session, txn_global->states); } + +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) +/* + * __wt_verbose_dump_txn -- + * Output diagnostic information about the global transaction state. 
+ */ +int +__wt_verbose_dump_txn(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + WT_TXN *txn; + WT_TXN_STATE *s; + const char *iso_tag; + uint64_t id; + uint32_t i, session_cnt; + + conn = S2C(session); + txn_global = &conn->txn_global; + + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + WT_RET(__wt_msg(session, "transaction state dump")); + + WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current)); + WT_RET(__wt_msg(session, + "last running ID: %" PRIu64, txn_global->last_running)); + WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id)); + WT_RET(__wt_msg(session, + "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id)); + + WT_RET(__wt_msg(session, "checkpoint running? %s", + txn_global->checkpoint_running ? "yes" : "no")); + WT_RET(__wt_msg(session, + "checkpoint generation: %" PRIu64, txn_global->checkpoint_gen)); + WT_RET(__wt_msg(session, + "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_pinned)); + WT_RET(__wt_msg(session, + "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_txnid)); + + WT_ORDERED_READ(session_cnt, conn->session_cnt); + WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt)); + + WT_RET(__wt_msg(session, "Transaction state of active sessions:")); + + /* + * Walk each session transaction state and dump information. Accessing + * the content of session handles is not thread safe, so some + * information may change while traversing if other threads are active + * at the same time, which is OK since this is diagnostic code. 
+ */ + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + /* Skip sessions with no active transaction */ + if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE) + continue; + + txn = &conn->sessions[i].txn; + iso_tag = "INVALID"; + switch (txn->isolation) { + case WT_ISO_READ_COMMITTED: + iso_tag = "WT_ISO_READ_COMMITTED"; + break; + case WT_ISO_READ_UNCOMMITTED: + iso_tag = "WT_ISO_READ_UNCOMMITTED"; + break; + case WT_ISO_SNAPSHOT: + iso_tag = "WT_ISO_SNAPSHOT"; + break; + } + + WT_RET(__wt_msg(session, + "ID: %6" PRIu64 + ", mod count: %u" + ", pinned ID: %" PRIu64 + ", snap min: %" PRIu64 + ", snap max: %" PRIu64 + ", metadata pinned ID: %" PRIu64 + ", flags: 0x%08" PRIx32 + ", name: %s" + ", isolation: %s", + id, + txn->mod_count, + s->pinned_id, + txn->snap_min, + txn->snap_max, + s->metadata_pinned, + txn->flags, + conn->sessions[i].name == NULL ? + "EMPTY" : conn->sessions[i].name, + iso_tag)); + } + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + + return (0); +} +#endif diff --git a/test/suite/test_reconfig04.py b/test/suite/test_reconfig04.py index be5e6d3729e..51d9b91c1f4 100644 --- a/test/suite/test_reconfig04.py +++ b/test/suite/test_reconfig04.py @@ -26,9 +26,7 @@ # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. 
-import fnmatch, os, time import wiredtiger, wttest -from wtdataset import SimpleDataSet # test_reconfig04.py # Test WT_SESSION::reconfigure -- cgit v1.2.1 From 8aa3922883e7f3d4a9003211faf595250c3bbfdd Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 24 Jan 2017 22:07:16 -0500 Subject: WT-3097 Avoid waiting for threads to timeout during close (#3253) * Add run-time flags checking to __wt_cond_wait_signal(), and its wrappers (__wt_cond_wait(), __wt_cond_auto_wait_signal() and __wt_cond_auto_wait()) so callers of those functions can configure a check that ensures that if the waiting thread races with a waking thread that's turned off flags so the waiting thread quits, the waiting thread returns immediately. * Rework the WT_SESSION.transaction_sync code to wait for the entire time it's configured to wait, it will be awoken if the log reaches stability before that. * Assert we're not waiting longer than a second if not checking the run status. * Set/Clear WT_CONN_LOG_SERVER_RUN in __wt_logmgr_open/__wt_logmgr_destroy rather than in the connection open code. (It's the only server-run flag that gets set in the connection-open code, and I can't see any reason for that exception.) 
--- dist/api_data.py | 2 +- dist/s_string.ok | 4 +++ src/async/async_api.c | 5 ++- src/async/async_worker.c | 2 +- src/conn/conn_cache.c | 6 ++-- src/conn/conn_cache_pool.c | 8 ++--- src/conn/conn_ckpt.c | 26 +++++++++++---- src/conn/conn_handle.c | 2 +- src/conn/conn_log.c | 50 +++++++++++++---------------- src/conn/conn_open.c | 17 ++++++---- src/conn/conn_stat.c | 25 +++++++++++---- src/conn/conn_sweep.c | 24 +++++++++++--- src/evict/evict_lru.c | 16 ++++++---- src/include/extern.h | 8 ++--- src/include/extern_posix.h | 4 +-- src/include/extern_win.h | 4 +-- src/include/misc.i | 5 +-- src/include/mutex.h | 4 +-- src/include/wiredtiger.in | 2 +- src/log/log.c | 21 ++++++------ src/log/log_slot.c | 2 +- src/lsm/lsm_worker.c | 2 +- src/os_posix/os_mtx_cond.c | 28 ++++++++++++---- src/os_win/os_mtx_cond.c | 43 +++++++++++++++++-------- src/session/session_api.c | 38 ++++++++++++++-------- src/support/cond_auto.c | 80 ++++++++++------------------------------------ src/support/thread_group.c | 2 +- 27 files changed, 237 insertions(+), 193 deletions(-) diff --git a/dist/api_data.py b/dist/api_data.py index b1332320a7c..1d669fa7fe0 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -718,7 +718,7 @@ wiredtiger_open_common =\ ]), Config('extensions', '', r''' list of shared library extensions to load (using dlopen). 
- Any values specified to an library extension are passed to + Any values specified to a library extension are passed to WT_CONNECTION::load_extension as the \c config parameter (for example, extensions=(/path/ext.so={entry=my_entry}))''', diff --git a/dist/s_string.ok b/dist/s_string.ok index 2b998c27813..bb0cacd9d5d 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -1217,6 +1217,7 @@ upg uri uri's uris +usec usecs usedp userbad @@ -1247,6 +1248,9 @@ vunpack vw vxr waitpid +waker +wakeup +wakeups walk's warmup wb diff --git a/src/async/async_api.c b/src/async/async_api.c index 54bcb7cd26c..026a008188c 100644 --- a/src/async/async_api.c +++ b/src/async/async_api.c @@ -240,8 +240,7 @@ __async_start(WT_SESSION_IMPL *session) async = conn->async; TAILQ_INIT(&async->formatqh); WT_RET(__wt_spin_init(session, &async->ops_lock, "ops")); - WT_RET(__wt_cond_alloc( - session, "async flush", false, &async->flush_cond)); + WT_RET(__wt_cond_alloc(session, "async flush", &async->flush_cond)); WT_RET(__wt_async_op_init(session)); /* @@ -541,7 +540,7 @@ retry: async->flush_op.state = WT_ASYNCOP_READY; WT_RET(__wt_async_op_enqueue(session, &async->flush_op)); while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE) - __wt_cond_wait(session, async->flush_cond, 100000); + __wt_cond_wait(session, async->flush_cond, 100000, NULL); /* * Flush is done. Clear the flags. 
*/ diff --git a/src/async/async_worker.c b/src/async/async_worker.c index b1bc3902f7c..11f59ed14f1 100644 --- a/src/async/async_worker.c +++ b/src/async/async_worker.c @@ -107,7 +107,7 @@ __async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen) { while (async->flush_state == WT_ASYNC_FLUSHING && async->flush_gen == my_gen) - __wt_cond_wait(session, async->flush_cond, 10000); + __wt_cond_wait(session, async->flush_cond, 10000, NULL); } /* diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 2b0e5081f04..28dd06332e0 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -187,8 +187,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET_MSG(session, EINVAL, "eviction target must be lower than the eviction trigger"); - WT_RET(__wt_cond_auto_alloc(session, "cache eviction server", - false, 10000, WT_MILLION, &cache->evict_cond)); + WT_RET(__wt_cond_auto_alloc(session, + "cache eviction server", 10000, WT_MILLION, &cache->evict_cond)); WT_RET(__wt_spin_init(session, &cache->evict_pass_lock, "evict pass")); WT_RET(__wt_spin_init(session, &cache->evict_queue_lock, "cache eviction queue")); @@ -312,7 +312,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) cache->bytes_dirty_intl + cache->bytes_dirty_leaf, cache->pages_dirty_intl + cache->pages_dirty_leaf); - WT_TRET(__wt_cond_auto_destroy(session, &cache->evict_cond)); + WT_TRET(__wt_cond_destroy(session, &cache->evict_cond)); __wt_spin_destroy(session, &cache->evict_pass_lock); __wt_spin_destroy(session, &cache->evict_queue_lock); __wt_spin_destroy(session, &cache->evict_walk_lock); diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index 79c2fc23da5..49b766f4602 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -32,7 +32,7 @@ */ #define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3 #define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6 -#define WT_CACHE_POOL_READ_MULTIPLIER 1 +#define WT_CACHE_POOL_READ_MULTIPLIER 1 static void 
__cache_pool_adjust( WT_SESSION_IMPL *, uint64_t, uint64_t, bool, bool *); @@ -104,8 +104,8 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) TAILQ_INIT(&cp->cache_pool_qh); WT_ERR(__wt_spin_init( session, &cp->cache_pool_lock, "cache shared pool")); - WT_ERR(__wt_cond_alloc(session, - "cache pool server", false, &cp->cache_pool_cond)); + WT_ERR(__wt_cond_alloc( + session, "cache pool server", &cp->cache_pool_cond)); __wt_process.cache_pool = cp; __wt_verbose(session, @@ -733,7 +733,7 @@ __wt_cache_pool_server(void *arg) F_ISSET(cache, WT_CACHE_POOL_RUN)) { if (cp->currently_used <= cp->size) __wt_cond_wait( - session, cp->cache_pool_cond, WT_MILLION); + session, cp->cache_pool_cond, WT_MILLION, NULL); /* * Re-check pool run flag - since we want to avoid getting the diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index faeef4e71a2..7797ed4421c 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -62,6 +62,16 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp) return (0); } +/* + * __ckpt_server_run_chk -- + * Check to decide if the checkpoint server should continue running. + */ +static bool +__ckpt_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_CHECKPOINT)); +} + /* * __ckpt_server -- * The checkpoint server thread. @@ -78,14 +88,18 @@ __ckpt_server(void *arg) conn = S2C(session); wt_session = (WT_SESSION *)session; - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) { + for (;;) { /* * Wait... * NOTE: If the user only configured logsize, then usecs * will be 0 and this wait won't return until signalled. */ - __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs); + __wt_cond_wait(session, + conn->ckpt_cond, conn->ckpt_usecs, __ckpt_server_run_chk); + + /* Check if we're quitting or being reconfigured. 
*/ + if (!__ckpt_server_run_chk(session)) + break; /* * Checkpoint the database if the connection is marked dirty. @@ -113,7 +127,8 @@ __ckpt_server(void *arg) * it so we don't do another checkpoint * immediately. */ - __wt_cond_wait(session, conn->ckpt_cond, 1); + __wt_cond_wait( + session, conn->ckpt_cond, 1, NULL); } } else WT_STAT_CONN_INCR(session, txn_checkpoint_skipped); @@ -152,8 +167,7 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn) "checkpoint-server", true, session_flags, &conn->ckpt_session)); session = conn->ckpt_session; - WT_RET(__wt_cond_alloc( - session, "checkpoint server", false, &conn->ckpt_cond)); + WT_RET(__wt_cond_alloc(session, "checkpoint server", &conn->ckpt_cond)); /* * Start the thread. diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 7203b75e4ae..54bcfd98aba 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -79,7 +79,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init( session, &conn->lsm_manager.switch_lock, "LSM switch queue lock")); WT_RET(__wt_cond_alloc( - session, "LSM worker cond", false, &conn->lsm_manager.work_cond)); + session, "LSM worker cond", &conn->lsm_manager.work_cond)); /* * Generation numbers. 
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 8f8f8614ba8..c6dd795389d 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -174,7 +174,7 @@ __logmgr_config( WT_RET(__logmgr_sync_cfg(session, cfg)); if (conn->log_cond != NULL) - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); return (0); } @@ -341,7 +341,7 @@ __wt_log_truncate_files( conn = S2C(session); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); - if (F_ISSET(conn, WT_CONN_SERVER_RUN) && + if (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN) && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) WT_RET_MSG(session, EINVAL, "Attempt to archive manually while a server is running"); @@ -505,8 +505,7 @@ __log_file_server(void *arg) locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } else { - __wt_cond_auto_signal( - session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); /* * We do not want to wait potentially a second * to process this. Yield to give the wrlsn @@ -517,8 +516,9 @@ __log_file_server(void *arg) continue; } } + /* Wait until the next event. */ - __wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10); + __wt_cond_wait(session, conn->log_file_cond, 100000, NULL); } if (0) { @@ -730,12 +730,8 @@ __log_wrlsn_server(void *arg) if (yield++ < WT_THOUSAND) __wt_yield(); else - /* - * Send in false because if we did any work we would - * not be on this path. - */ __wt_cond_auto_wait( - session, conn->log_wrlsn_cond, did_work); + session, conn->log_wrlsn_cond, did_work, NULL); } /* * On close we need to do this one more time because there could @@ -840,10 +836,9 @@ __log_server(void *arg) } /* Wait until the next event. 
*/ - __wt_epoch(session, &start); - __wt_cond_auto_wait_signal(session, - conn->log_cond, did_work, &signalled); + __wt_cond_auto_wait_signal( + session, conn->log_cond, did_work, NULL, &signalled); __wt_epoch(session, &now); timediff = WT_TIMEDIFF_MS(now, start); } @@ -904,10 +899,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_INIT_LSN(&log->write_lsn); WT_INIT_LSN(&log->write_start_lsn); log->fileid = 0; - WT_RET(__wt_cond_alloc( - session, "log sync", false, &log->log_sync_cond)); - WT_RET(__wt_cond_alloc( - session, "log write", false, &log->log_write_cond)); + WT_RET(__wt_cond_alloc(session, "log sync", &log->log_sync_cond)); + WT_RET(__wt_cond_alloc(session, "log write", &log->log_write_cond)); WT_RET(__wt_log_open(session)); WT_RET(__wt_log_slot_init(session)); @@ -930,6 +923,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); + F_SET(conn, WT_CONN_LOG_SERVER_RUN); + /* * Start the log close thread. It is not configurable. * If logging is enabled, this thread runs. @@ -937,8 +932,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) session_flags = WT_SESSION_NO_DATA_HANDLES; WT_RET(__wt_open_internal_session(conn, "log-close-server", false, session_flags, &conn->log_file_session)); - WT_RET(__wt_cond_alloc(conn->log_file_session, - "log close server", false, &conn->log_file_cond)); + WT_RET(__wt_cond_alloc( + conn->log_file_session, "log close server", &conn->log_file_cond)); /* * Start the log file close thread. 
@@ -954,8 +949,7 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server", false, session_flags, &conn->log_wrlsn_session)); WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session, - "log write lsn server", false, 10000, WT_MILLION, - &conn->log_wrlsn_cond)); + "log write lsn server", 10000, WT_MILLION, &conn->log_wrlsn_cond)); WT_RET(__wt_thread_create(conn->log_wrlsn_session, &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session)); conn->log_wrlsn_tid_set = true; @@ -969,13 +963,13 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (conn->log_session != NULL) { WT_ASSERT(session, conn->log_cond != NULL); WT_ASSERT(session, conn->log_tid_set == true); - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); } else { /* The log server gets its own session. */ WT_RET(__wt_open_internal_session(conn, "log-server", false, session_flags, &conn->log_session)); WT_RET(__wt_cond_auto_alloc(conn->log_session, - "log server", false, 50000, WT_MILLION, &conn->log_cond)); + "log server", 50000, WT_MILLION, &conn->log_cond)); /* * Start the thread. 
@@ -1001,6 +995,8 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn = S2C(session); + F_CLR(conn, WT_CONN_LOG_SERVER_RUN); + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) { /* * We always set up the log_path so printlog can work without @@ -1011,7 +1007,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) return (0); } if (conn->log_tid_set) { - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); WT_TRET(__wt_thread_join(session, conn->log_tid)); conn->log_tid_set = false; } @@ -1026,7 +1022,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_file_session = NULL; } if (conn->log_wrlsn_tid_set) { - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); conn->log_wrlsn_tid_set = false; } @@ -1047,9 +1043,9 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) } /* Destroy the condition variables now that all threads are stopped */ - WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); - WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond)); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index f8029f2c728..5b20377d437 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -25,7 +25,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) * Tell internal server threads to run: this must be set before opening * any sessions. */ - F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN); + F_SET(conn, WT_CONN_SERVER_RUN); /* WT_SESSION_IMPL array. 
*/ WT_RET(__wt_calloc(session, @@ -100,8 +100,12 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) __wt_yield(); } - /* Clear any pending async ops. */ + /* + * Clear any pending async operations and shut down the async worker + * threads and system before closing LSM. + */ WT_TRET(__wt_async_flush(session)); + WT_TRET(__wt_async_destroy(session)); /* * Shut down server threads other than the eviction server, which is @@ -110,14 +114,14 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * exit before files are closed. */ F_CLR(conn, WT_CONN_SERVER_RUN); - WT_TRET(__wt_async_destroy(session)); WT_TRET(__wt_lsm_manager_destroy(session)); - WT_TRET(__wt_sweep_destroy(session)); F_SET(conn, WT_CONN_CLOSING); - WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, true)); + WT_TRET(__wt_sweep_destroy(session)); + + /* The eviction server is shut down last. */ WT_TRET(__wt_evict_destroy(session)); /* Shut down the lookaside table, after all eviction is complete. */ @@ -126,7 +130,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(session)); - /* Shut down metadata tracking, required before creating tables. */ + /* Shut down metadata tracking. */ WT_TRET(__wt_meta_track_destroy(session)); /* @@ -140,7 +144,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE)) WT_TRET(__wt_txn_checkpoint_log( session, true, WT_TXN_LOG_CKPT_STOP, NULL)); - F_CLR(conn, WT_CONN_LOG_SERVER_RUN); WT_TRET(__wt_logmgr_destroy(session)); /* Free memory for collators, compressors, data sources. 
*/ diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 3bcdfd7ecb1..31dc9c45992 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -485,8 +485,7 @@ __statlog_on_close(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE)) return (0); - if (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) + if (F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) WT_RET_MSG(session, EINVAL, "Attempt to log statistics while a server is running"); @@ -497,6 +496,16 @@ err: __wt_scr_free(session, &tmp); return (ret); } +/* + * __statlog_server_run_chk -- + * Check to decide if the statistics log server should continue running. + */ +static bool +__statlog_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_STATISTICS)); +} + /* * __statlog_server -- * The statistics server thread. @@ -525,10 +534,14 @@ __statlog_server(void *arg) WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128)); WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128)); - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) { + for (;;) { /* Wait until the next event. */ - __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs); + __wt_cond_wait(session, conn->stat_cond, + conn->stat_usecs, __statlog_server_run_chk); + + /* Check if we're quitting or being reconfigured. */ + if (!__statlog_server_run_chk(session)) + break; if (WT_STAT_ENABLED(session)) WT_ERR(__statlog_log_one(session, &path, &tmp)); @@ -563,7 +576,7 @@ __statlog_start(WT_CONNECTION_IMPL *conn) session = conn->stat_session; WT_RET(__wt_cond_alloc( - session, "statistics log server", false, &conn->stat_cond)); + session, "statistics log server", &conn->stat_cond)); /* * Start the thread. 
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 7d5cb7d7c72..f9b7305c7d8 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -245,6 +245,16 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) return (ret == EBUSY ? 0 : ret); } +/* + * __sweep_server_run_chk -- + * Check to decide if the sweep server should continue running. + */ +static bool +__sweep_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_SWEEP)); +} + /* * __sweep_server -- * The handle sweep server thread. @@ -266,11 +276,15 @@ __sweep_server(void *arg) /* * Sweep for dead and excess handles. */ - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_SWEEP)) { + for (;;) { /* Wait until the next event. */ - __wt_cond_wait(session, - conn->sweep_cond, conn->sweep_interval * WT_MILLION); + __wt_cond_wait(session, conn->sweep_cond, + conn->sweep_interval * WT_MILLION, __sweep_server_run_chk); + + /* Check if we're quitting or being reconfigured. 
*/ + if (!__sweep_server_run_chk(session)) + break; + __wt_seconds(session, &now); WT_STAT_CONN_INCR(session, dh_sweeps); @@ -390,7 +404,7 @@ __wt_sweep_create(WT_SESSION_IMPL *session) session = conn->sweep_session; WT_RET(__wt_cond_alloc( - session, "handle sweep server", false, &conn->sweep_cond)); + session, "handle sweep server", &conn->sweep_cond)); WT_RET(__wt_thread_create( session, &conn->sweep_tid, __sweep_server, session)); diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 0cf746f84eb..48ea1ccb02b 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -267,7 +267,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session) } #endif - __wt_cond_auto_signal(session, cache->evict_cond); + __wt_cond_signal(session, cache->evict_cond); } /* @@ -311,9 +311,10 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) __wt_spin_unlock(session, &cache->evict_pass_lock); WT_ERR(ret); __wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"); + /* Don't rely on signals: check periodically. */ __wt_cond_auto_wait( - session, cache->evict_cond, did_work); + session, cache->evict_cond, did_work, NULL); __wt_verbose(session, WT_VERB_EVICTSERVER, "waking"); } else WT_ERR(__evict_lru_pages(session, false)); @@ -712,8 +713,8 @@ __evict_pass(WT_SESSION_IMPL *session) */ WT_STAT_CONN_INCR(session, cache_eviction_server_slept); - __wt_cond_wait( - session, cache->evict_cond, WT_THOUSAND); + __wt_cond_wait(session, + cache->evict_cond, WT_THOUSAND, NULL); continue; } @@ -1102,7 +1103,8 @@ __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server) /* If a worker thread found the queue empty, pause. */ if (ret == WT_NOTFOUND && !is_server && F_ISSET(S2C(session), WT_CONN_EVICTION_RUN)) - __wt_cond_wait(session, conn->evict_threads.wait_cond, 10000); + __wt_cond_wait( + session, conn->evict_threads.wait_cond, 10000, NULL); return (ret == WT_NOTFOUND ? 
0 : ret); } @@ -2102,8 +2104,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) break; case WT_NOTFOUND: /* Allow the queue to re-populate before retrying. */ - __wt_cond_wait( - session, conn->evict_threads.wait_cond, 10000); + __wt_cond_wait(session, + conn->evict_threads.wait_cond, 10000, NULL); cache->app_waits++; break; default: diff --git a/src/include/extern.h b/src/include/extern.h index 88fb8823930..eb2f9a0e784 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -613,11 +613,9 @@ extern void __wt_session_close_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_auto_alloc( WT_SESSION_IMPL *session, const char *name, bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_auto_alloc(WT_SESSION_IMPL *session, const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_auto_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_encrypt(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_encrypt_size(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h index 5acb7b0ed27..fed7835ada1 100644 --- a/src/include/extern_posix.h +++ b/src/include/extern_posix.h @@ -12,8 +12,8 @@ extern int __wt_posix_map(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapp extern int __wt_posix_map_preload(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, const void *map, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_posix_map_discard(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *map, size_t length, void *mapped_cookie) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapped_region, size_t len, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/extern_win.h b/src/include/extern_win.h index 11b45f11304..0bfc821c7a6 100644 --- a/src/include/extern_win.h +++ b/src/include/extern_win.h @@ -10,8 +10,8 @@ extern int __wt_os_win(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((war extern int __wt_getenv(WT_SESSION_IMPL *session, const char *variable, const char **envp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_win_map(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_regionp, size_t *lenp, void *mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_region, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/misc.i b/src/include/misc.i index f36be32d6a2..d5692a3f9cf 100644 --- a/src/include/misc.i +++ b/src/include/misc.i @@ -11,11 +11,12 @@ * Wait on a mutex, optionally timing out. 
*/ static inline void -__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) +__wt_cond_wait(WT_SESSION_IMPL *session, + WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *)) { bool notused; - __wt_cond_wait_signal(session, cond, usecs, &notused); + __wt_cond_wait_signal(session, cond, usecs, run_func, &notused); } /* diff --git a/src/include/mutex.h b/src/include/mutex.h index 727a690bb1c..06b8c4a3304 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -21,8 +21,8 @@ struct __wt_condvar { int waiters; /* Numbers of waiters, or -1 if signalled with no waiters. */ /* - * The following fields are only used for automatically adjusting - * condition variables. They could be in a separate structure. + * The following fields are used for automatically adjusting condition + * variable wait times. */ uint64_t min_wait; /* Minimum wait duration */ uint64_t max_wait; /* Maximum wait duration */ diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 03bff7cd04f..f05d3d4ab55 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -2362,7 +2362,7 @@ struct __wt_connection { * @config{exclusive, fail if the database already exists\, generally used with * the \c create option., a boolean flag; default \c false.} * @config{extensions, list of shared library extensions to load (using dlopen). 
- * Any values specified to an library extension are passed to + * Any values specified to a library extension are passed to * WT_CONNECTION::load_extension as the \c config parameter (for example\, * extensions=(/path/ext.so={entry=my_entry}))., a list of strings; * default empty.} diff --git a/src/log/log.c b/src/log/log.c index da500a74e87..614ae1a9b6d 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -43,11 +43,11 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) */ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); if (++yield_count < WT_THOUSAND) __wt_yield(); else - __wt_cond_wait(session, log->log_write_cond, 200); + __wt_cond_wait(session, log->log_write_cond, 200, NULL); if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_lock(session, &log->log_slot_lock); } @@ -89,7 +89,7 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) log = conn->log; log->ckpt_lsn = *ckp_lsn; if (conn->log_cond != NULL) - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); } /* @@ -170,7 +170,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) */ while (log->sync_lsn.l.file < min_lsn->l.file) { __wt_cond_signal(session, S2C(session)->log_file_cond); - __wt_cond_wait(session, log->log_sync_cond, 10000); + __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); } __wt_spin_lock(session, &log->log_sync_lock); WT_ASSERT(session, log->log_dir_fh != NULL); @@ -915,7 +915,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) else { WT_STAT_CONN_INCR(session, log_prealloc_missed); if (conn->log_cond != NULL) - __wt_cond_auto_signal( + __wt_cond_signal( session, conn->log_cond); } } @@ -1490,7 +1490,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) */ if (log->sync_lsn.l.file < 
slot->slot_end_lsn.l.file || __wt_spin_trylock(session, &log->log_sync_lock) != 0) { - __wt_cond_wait(session, log->log_sync_cond, 10000); + __wt_cond_wait( + session, log->log_sync_cond, 10000, NULL); continue; } locked = true; @@ -2160,7 +2161,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, * XXX I've seen times when conditions are NULL. */ if (conn->log_cond != NULL) { - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); __wt_yield(); } else WT_ERR(__wt_log_force_write(session, 1, NULL)); @@ -2169,12 +2170,14 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, /* Wait for our writes to reach the OS */ while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) - __wt_cond_wait(session, log->log_write_cond, 10000); + __wt_cond_wait( + session, log->log_write_cond, 10000, NULL); } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) - __wt_cond_wait(session, log->log_sync_cond, 10000); + __wt_cond_wait( + session, log->log_sync_cond, 10000, NULL); } /* diff --git a/src/log/log_slot.c b/src/log/log_slot.c index d70c0d689be..d6e692f8c51 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -349,7 +349,7 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) /* * If we didn't find any free slots signal the worker thread. */ - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); __wt_yield(); #ifdef HAVE_DIAGNOSTIC ++count; diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index b0d0758775d..ffa00c0a5e7 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -154,7 +154,7 @@ __lsm_worker(void *arg) /* Don't busy wait if there was any work to do. 
*/ if (!progress) { - __wt_cond_wait(session, cookie->work_cond, 10000); + __wt_cond_wait(session, cookie->work_cond, 10000, NULL); continue; } } diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c index be8b1abda31..a5ee78f9e3e 100644 --- a/src/os_posix/os_mtx_cond.c +++ b/src/os_posix/os_mtx_cond.c @@ -13,8 +13,7 @@ * Allocate and initialize a condition variable. */ int -__wt_cond_alloc(WT_SESSION_IMPL *session, - const char *name, bool is_signalled, WT_CONDVAR **condp) +__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) { WT_CONDVAR *cond; WT_DECL_RET; @@ -27,7 +26,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, WT_ERR(pthread_cond_init(&cond->cond, NULL)); cond->name = name; - cond->waiters = is_signalled ? -1 : 0; + cond->waiters = 0; *condp = cond; return (0); @@ -42,8 +41,8 @@ err: __wt_free(session, cond); * out period expires, let the caller know. */ void -__wt_cond_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) +__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { struct timespec ts; WT_DECL_RET; @@ -62,6 +61,23 @@ __wt_cond_wait_signal( WT_ERR(pthread_mutex_lock(&cond->mtx)); locked = true; + /* + * It's possible to race with threads waking us up. That's not a problem + * if there are multiple wakeups because the next wakeup will get us, or + * if we're only pausing for a short period. It's a problem if there's + * only a single wakeup, our waker is likely waiting for us to exit. + * After acquiring the mutex (so we're guaranteed to be awakened by any + * future wakeup call), optionally check if we're OK to keep running. + * This won't ensure our caller won't just loop and call us again, but + * at least it's not our fault. + * + * Assert we're not waiting longer than a second if not checking the + * run status. 
+ */ + WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION); + if (run_func != NULL && !run_func(session)) + goto skipping; + if (usecs > 0) { __wt_epoch(session, &ts); ts.tv_sec += (time_t) @@ -81,7 +97,7 @@ __wt_cond_wait_signal( ret == ETIME || #endif ret == ETIMEDOUT) { - *signalled = false; +skipping: *signalled = false; ret = 0; } diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c index 79c62ccd7f2..0001c6c2322 100644 --- a/src/os_win/os_mtx_cond.c +++ b/src/os_win/os_mtx_cond.c @@ -13,8 +13,7 @@ * Allocate and initialize a condition variable. */ int -__wt_cond_alloc(WT_SESSION_IMPL *session, - const char *name, bool is_signalled, WT_CONDVAR **condp) +__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) { WT_CONDVAR *cond; @@ -26,7 +25,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, InitializeConditionVariable(&cond->cond); cond->name = name; - cond->waiters = is_signalled ? -1 : 0; + cond->waiters = 0; *condp = cond; return (0); @@ -38,8 +37,8 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, * out period expires, let the caller know. */ void -__wt_cond_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) +__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { BOOL sleepret; DWORD milliseconds, windows_error; @@ -59,8 +58,26 @@ __wt_cond_wait_signal( EnterCriticalSection(&cond->mtx); locked = true; + /* + * It's possible to race with threads waking us up. That's not a problem + * if there are multiple wakeups because the next wakeup will get us, or + * if we're only pausing for a short period. It's a problem if there's + * only a single wakeup, our waker is likely waiting for us to exit. + * After acquiring the mutex (so we're guaranteed to be awakened by any + * future wakeup call), optionally check if we're OK to keep running. 
+ * This won't ensure our caller won't just loop and call us again, but + * at least it's not our fault. + * + * Assert we're not waiting longer than a second if not checking the + * run status. + */ + WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION); + + if (run_func != NULL && !run_func(session)) + goto skipping; + if (usecs > 0) { - milliseconds64 = usecs / 1000; + milliseconds64 = usecs / WT_THOUSAND; /* * Check for 32-bit unsigned integer overflow @@ -90,7 +107,7 @@ __wt_cond_wait_signal( if (sleepret == 0) { windows_error = __wt_getlasterror(); if (windows_error == ERROR_TIMEOUT) { - *signalled = false; +skipping: *signalled = false; sleepret = 1; } } @@ -117,17 +134,17 @@ void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) { WT_DECL_RET; - bool locked; - - locked = false; __wt_verbose(session, WT_VERB_MUTEX, "signal %s", cond->name); /* - * Our callers are often setting flags to cause a thread to exit. Add - * a barrier to ensure the flags are seen by the threads. + * Our callers often set flags to cause a thread to exit. Add a barrier + * to ensure exit flags are seen by the sleeping threads, otherwise we + * can wake up a thread, it immediately goes back to sleep, and we'll + * hang. Use a full barrier (we may not write before waiting on thread + * join). */ - WT_WRITE_BARRIER(); + WT_FULL_BARRIER(); /* * Fast path if we are in (or can enter), a state where the next waiter diff --git a/src/session/session_api.c b/src/session/session_api.c index fcbfa8809b3..71626e098cb 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -1488,6 +1488,20 @@ __session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange) err: API_END_RET(session, ret); } +/* + * __transaction_sync_run_chk -- + * Check to decide if the transaction sync call should continue running. 
+ */ +static bool +__transaction_sync_run_chk(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + return (FLD_ISSET(conn->flags, WT_CONN_LOG_SERVER_RUN)); +} + /* * __session_transaction_sync -- * WT_SESSION->transaction_sync method. @@ -1502,7 +1516,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) WT_SESSION_IMPL *session; WT_TXN *txn; struct timespec now, start; - uint64_t timeout_ms, waited_ms; + uint64_t remaining_usec, timeout_ms, waited_ms; bool forever; session = (WT_SESSION_IMPL *)wt_session; @@ -1555,22 +1569,20 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) __wt_epoch(session, &start); /* * Keep checking the LSNs until we find it is stable or we reach - * our timeout. + * our timeout, or there's some other reason to quit. */ while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { + if (!__transaction_sync_run_chk(session)) + WT_ERR(ETIMEDOUT); + __wt_cond_signal(session, conn->log_file_cond); __wt_epoch(session, &now); waited_ms = WT_TIMEDIFF_MS(now, start); - if (forever || waited_ms < timeout_ms) - /* - * Note, we will wait an increasing amount of time - * each iteration, likely doubling. Also note that - * the function timeout value is in usecs (we are - * computing the wait time in msecs and passing that - * in, unchanged, as the usecs to wait). 
- */ - __wt_cond_wait(session, log->log_sync_cond, waited_ms); - else + if (forever || waited_ms < timeout_ms) { + remaining_usec = (timeout_ms - waited_ms) * WT_THOUSAND; + __wt_cond_wait(session, log->log_sync_cond, + remaining_usec, __transaction_sync_run_chk); + } else WT_ERR(ETIMEDOUT); } @@ -1825,7 +1837,7 @@ __open_session(WT_CONNECTION_IMPL *conn, session_ret->name = NULL; session_ret->id = i; - WT_ERR(__wt_cond_alloc(session, "session", false, &session_ret->cond)); + WT_ERR(__wt_cond_alloc(session, "session", &session_ret->cond)); if (WT_SESSION_FIRST_USE(session_ret)) __wt_random_init(&session_ret->rnd); diff --git a/src/support/cond_auto.c b/src/support/cond_auto.c index a3ae67f5baa..600e5eab0ff 100644 --- a/src/support/cond_auto.c +++ b/src/support/cond_auto.c @@ -1,29 +1,9 @@ /*- - * Public Domain 2014-2016 MongoDB, Inc. - * Public Domain 2008-2014 WiredTiger, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. * - * This is free and unencumbered software released into the public domain. - * - * Anyone is free to copy, modify, publish, use, compile, sell, or - * distribute this software, either in source code form or as a compiled - * binary, for any purpose, commercial or non-commercial, and by any - * means. - * - * In jurisdictions that recognize copyright laws, the author or authors - * of this software dedicate any and all copyright interest in the - * software to the public domain. We make this dedication for the benefit - * of the public at large and to the detriment of our heirs and - * successors. We intend this dedication to be an overt act of - * relinquishment in perpetuity of all present and future rights to this - * software under copyright law. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. + * See the file LICENSE for redistribution information. */ #include "wt_internal.h" @@ -38,13 +18,12 @@ * Allocate and initialize an automatically adjusting condition variable. */ int -__wt_cond_auto_alloc( - WT_SESSION_IMPL *session, const char *name, - bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) +__wt_cond_auto_alloc(WT_SESSION_IMPL *session, + const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp) { WT_CONDVAR *cond; - WT_RET(__wt_cond_alloc(session, name, is_signalled, condp)); + WT_RET(__wt_cond_alloc(session, name, condp)); cond = *condp; cond->min_wait = min; @@ -54,34 +33,20 @@ __wt_cond_auto_alloc( return (0); } -/* - * __wt_cond_auto_signal -- - * Signal a condition variable. - */ -void -__wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) -{ - - WT_ASSERT(session, cond->min_wait != 0); - __wt_cond_signal(session, cond); -} - /* * __wt_cond_auto_wait_signal -- * Wait on a mutex, optionally timing out. If we get it before the time * out period expires, let the caller know. - * TODO: Can this version of the API be removed, now that we have the - * auto adjusting condition variables? */ void -__wt_cond_auto_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) +__wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { uint64_t delta; /* * Catch cases where this function is called with a condition variable - * that was initialized non-auto. + * that wasn't initialized to do automatic adjustments. 
*/ WT_ASSERT(session, cond->min_wait != 0); @@ -94,7 +59,8 @@ __wt_cond_auto_wait_signal( cond->max_wait, cond->prev_wait + delta); } - __wt_cond_wait_signal(session, cond, cond->prev_wait, signalled); + __wt_cond_wait_signal( + session, cond, cond->prev_wait, run_func, signalled); if (progress || *signalled) WT_STAT_CONN_INCR(session, cond_auto_wait_reset); @@ -108,24 +74,10 @@ __wt_cond_auto_wait_signal( * out period expires, let the caller know. */ void -__wt_cond_auto_wait( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) +__wt_cond_auto_wait(WT_SESSION_IMPL *session, + WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *)) { - bool signalled; - - /* - * Call the signal version so the wait period is reset if the - * condition is woken explicitly. - */ - __wt_cond_auto_wait_signal(session, cond, progress, &signalled); -} + bool notused; -/* - * __wt_cond_auto_destroy -- - * Destroy a condition variable. - */ -int -__wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) -{ - return (__wt_cond_destroy(session, condp)); + __wt_cond_auto_wait_signal(session, cond, progress, run_func, &notused); } diff --git a/src/support/thread_group.c b/src/support/thread_group.c index beb143e63e2..2b4b7ad4e61 100644 --- a/src/support/thread_group.c +++ b/src/support/thread_group.c @@ -259,7 +259,7 @@ __wt_thread_group_create( __wt_rwlock_init(session, &group->lock); WT_ERR(__wt_cond_alloc( - session, "Thread group cond", false, &group->wait_cond)); + session, "thread group cond", &group->wait_cond)); cond_alloced = true; __wt_writelock(session, &group->lock); -- cgit v1.2.1 From 0a70661a0d33c9705509955baafded2855054a29 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Thu, 26 Jan 2017 16:54:46 -0500 Subject: WT-3156 Add check in assertions for errors.
(#3271) --- src/log/log.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/log/log.c b/src/log/log.c index 614ae1a9b6d..1482cc0aca1 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -2202,12 +2202,12 @@ err: /* * If one of the sync flags is set, assert the proper LSN has moved to - * match. + * match on success. */ - WT_ASSERT(session, !LF_ISSET(WT_LOG_FLUSH) || + WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) || __wt_log_cmp(&log->write_lsn, &lsn) >= 0); - WT_ASSERT(session, - !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); + WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) || + __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); return (ret); } -- cgit v1.2.1 From 1e24579efee68f6fdb6a4c582275a50d95d7eb81 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Wed, 1 Feb 2017 12:11:48 +1100 Subject: WT-3115 Convert the dhandle list lock into a read/write lock. (#3236) It was a spinlock, but most acquirers only need shared access and it can be a contention point in many-table workloads. Split uses of the handle list lock into small operations. In particular, only hold the handle list lock to get the "next" handle, not for loops over all the handles in the system. Update statistics around handle list lock and corresponding doc. 
--- dist/flags.py | 3 +- dist/s_stat | 3 - dist/stat_data.py | 4 +- src/conn/conn_dhandle.c | 55 ++++++----- src/conn/conn_handle.c | 4 +- src/conn/conn_stat.c | 8 +- src/conn/conn_sweep.c | 2 +- src/cursor/cur_backup.c | 8 +- src/docs/upgrading.dox | 6 ++ src/evict/evict_lru.c | 50 +++++----- src/evict/evict_stat.c | 2 +- src/include/cache.i | 2 +- src/include/connection.h | 6 +- src/include/dhandle.h | 18 ++++ src/include/extern.h | 1 + src/include/flags.h | 33 +++---- src/include/schema.h | 72 +++++++++++--- src/include/stat.h | 4 +- src/include/wiredtiger.in | 218 +++++++++++++++++++++--------------------- src/lsm/lsm_cursor.c | 4 +- src/lsm/lsm_manager.c | 12 +-- src/lsm/lsm_stat.c | 4 +- src/lsm/lsm_tree.c | 63 ++++++------ src/lsm/lsm_work_unit.c | 4 +- src/schema/schema_drop.c | 2 +- src/schema/schema_rename.c | 2 +- src/schema/schema_worker.c | 2 +- src/session/session_dhandle.c | 43 +++++---- src/support/stat.c | 16 +--- src/txn/txn_ckpt.c | 5 +- 30 files changed, 359 insertions(+), 297 deletions(-) diff --git a/dist/flags.py b/dist/flags.py index 55ce233e60d..216f7c29e0a 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -117,7 +117,8 @@ flags = { 'SESSION_CAN_WAIT', 'SESSION_INTERNAL', 'SESSION_LOCKED_CHECKPOINT', - 'SESSION_LOCKED_HANDLE_LIST', + 'SESSION_LOCKED_HANDLE_LIST_READ', + 'SESSION_LOCKED_HANDLE_LIST_WRITE', 'SESSION_LOCKED_METADATA', 'SESSION_LOCKED_PASS', 'SESSION_LOCKED_SCHEMA', diff --git a/dist/s_stat b/dist/s_stat index 5d5937e1833..6aeeca6faa6 100755 --- a/dist/s_stat +++ b/dist/s_stat @@ -25,9 +25,6 @@ cat << UNUSED_STAT_FIELDS lock_checkpoint_count lock_checkpoint_wait_application lock_checkpoint_wait_internal -lock_handle_list_count -lock_handle_list_wait_application -lock_handle_list_wait_internal lock_metadata_count lock_metadata_wait_application lock_metadata_wait_internal diff --git a/dist/stat_data.py b/dist/stat_data.py index 0af5d6d017e..a4d92345f88 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -288,9 +288,7 @@ 
connection_stats = [ LockStat('lock_checkpoint_count', 'checkpoint lock acquisitions'), LockStat('lock_checkpoint_wait_application', 'checkpoint lock application thread wait time (usecs)'), LockStat('lock_checkpoint_wait_internal', 'checkpoint lock internal thread wait time (usecs)'), - LockStat('lock_handle_list_count', 'handle-list lock acquisitions'), - LockStat('lock_handle_list_wait_application', 'handle-list lock application thread wait time (usecs)'), - LockStat('lock_handle_list_wait_internal', 'handle-list lock internal thread wait time (usecs)'), + LockStat('lock_handle_list_wait_eviction', 'handle-list lock eviction thread wait time (usecs)'), LockStat('lock_metadata_count', 'metadata lock acquisitions'), LockStat('lock_metadata_wait_application', 'metadata lock application thread wait time (usecs)'), LockStat('lock_metadata_wait_internal', 'metadata lock internal thread wait time (usecs)'), diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index b2f4bb04ce4..866b8633f71 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -25,21 +25,19 @@ __conn_dhandle_destroy(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) } /* - * __conn_dhandle_alloc -- + * __wt_conn_dhandle_alloc -- * Allocate a new data handle and return it linked into the connection's * list. 
*/ -static int -__conn_dhandle_alloc(WT_SESSION_IMPL *session, - const char *uri, const char *checkpoint, WT_DATA_HANDLE **dhandlep) +int +__wt_conn_dhandle_alloc( + WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; uint64_t bucket; - *dhandlep = NULL; - WT_RET(__wt_calloc_one(session, &dhandle)); __wt_rwlock_init(session, &dhandle->rwlock); @@ -75,7 +73,7 @@ __conn_dhandle_alloc(WT_SESSION_IMPL *session, bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; WT_CONN_DHANDLE_INSERT(S2C(session), dhandle, bucket); - *dhandlep = dhandle; + session->dhandle = dhandle; return (0); err: __conn_dhandle_destroy(session, dhandle); @@ -122,10 +120,7 @@ __wt_conn_dhandle_find( } } - WT_RET(__conn_dhandle_alloc(session, uri, checkpoint, &dhandle)); - - session->dhandle = dhandle; - return (0); + return (WT_NOTFOUND); } /* @@ -419,12 +414,11 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; + WT_DECL_RET; uint64_t bucket; conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - /* * If we're given a URI, then we walk only the hash list for that * name. If we don't have a URI we walk the entire dhandle list. 
@@ -432,29 +426,42 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, if (uri != NULL) { bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; - TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) { + + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, + &conn->dhhash[bucket], hashq)); + if (dhandle == NULL) + return (0); + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || F_ISSET(dhandle, WT_DHANDLE_DEAD) || dhandle->checkpoint != NULL || strcmp(uri, dhandle->name) != 0) continue; - WT_RET(__conn_btree_apply_internal( - session, dhandle, file_func, name_func, cfg)); + WT_ERR(__conn_btree_apply_internal(session, + dhandle, file_func, name_func, cfg)); } } else { - TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); + if (dhandle == NULL) + return (0); + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || F_ISSET(dhandle, WT_DHANDLE_DEAD) || dhandle->checkpoint != NULL || !WT_PREFIX_MATCH(dhandle->name, "file:") || WT_IS_METADATA(dhandle)) continue; - WT_RET(__conn_btree_apply_internal( - session, dhandle, file_func, name_func, cfg)); + WT_ERR(__conn_btree_apply_internal(session, + dhandle, file_func, name_func, cfg)); } } - return (0); +err: WT_DHANDLE_RELEASE(dhandle); + return (ret); } /* @@ -473,7 +480,8 @@ __wt_conn_dhandle_close_all( conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); WT_ASSERT(session, session->dhandle == NULL); bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; @@ -534,7 +542,8 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, bool final) dhandle = session->dhandle; bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, 
WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); WT_ASSERT(session, dhandle != conn->cache->evict_file_next); /* Check if the handle was reacquired by a session while we waited. */ @@ -583,7 +592,7 @@ __wt_conn_dhandle_discard_single( } /* Try to remove the handle, protected by the data handle lock. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __conn_dhandle_remove(session, final)); if (set_pass_intr) (void)__wt_atomic_subv32(&S2C(session)->cache->pass_intr, 1); diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 54bcfd98aba..4f8d89fa9d2 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -53,7 +53,6 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) /* Spinlocks. */ WT_RET(__wt_spin_init(session, &conn->api_lock, "api")); WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint); - WT_SPIN_INIT_TRACKED(session, &conn->dhandle_lock, handle_list); WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor")); WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); @@ -64,6 +63,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file")); /* Read-write locks */ + __wt_rwlock_init(session, &conn->dhandle_lock); __wt_rwlock_init(session, &conn->hot_backup_lock); WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); @@ -134,7 +134,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->api_lock); __wt_spin_destroy(session, &conn->block_lock); __wt_spin_destroy(session, &conn->checkpoint_lock); - __wt_spin_destroy(session, &conn->dhandle_lock); + __wt_rwlock_destroy(session, &conn->dhandle_lock); __wt_spin_destroy(session, &conn->encryptor_lock); __wt_spin_destroy(session, &conn->fh_lock); __wt_rwlock_destroy(session, &conn->hot_backup_lock); diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 
31dc9c45992..d89392b66c6 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -409,7 +409,6 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) struct timespec ts; struct tm *tm, _tm; WT_CONNECTION_IMPL *conn; - WT_DECL_RET; WT_FSTREAM *log_stream; conn = S2C(session); @@ -446,12 +445,9 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) * Lock the schema and walk the list of open handles, dumping * any that match the list of object sources. */ - if (conn->stat_sources != NULL) { - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_btree_apply( + if (conn->stat_sources != NULL) + WT_RET(__wt_conn_btree_apply( session, NULL, __statlog_apply, NULL, NULL)); - WT_RET(ret); - } /* * Walk the list of open LSM trees, dumping any that match the diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index f9b7305c7d8..8c186c63939 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -233,7 +233,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) if (!WT_DHANDLE_CAN_DISCARD(dhandle)) continue; - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __sweep_remove_one(session, dhandle)); if (ret == 0) WT_STAT_CONN_INCR(session, dh_sweep_remove); diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index 08b15e6ca5e..61ced8d11e7 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -346,13 +346,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) static int __backup_all(WT_SESSION_IMPL *session) { - WT_DECL_RET; - /* Build a list of the file objects that need to be copied. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, ret = - __wt_meta_apply_all(session, NULL, __backup_list_uri_append, NULL)); - - return (ret); + return (__wt_meta_apply_all( + session, NULL, __backup_list_uri_append, NULL)); } /* diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 4a356f7da61..f463e6bc615 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -7,6 +7,12 @@ The WiredTiger Utility can now \c truncate an object. Removing all contents from the specified object. +
Handle list lock statistics
+
+In the 2.9.1 release we added statistics tracking handle list lock timing. We +have switched that lock from a spin lock to a read-write lock, and consequently +changed the statistics tracking lock-related wait time. +
@section version_291 Upgrading to Version 2.9.1 diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 48ea1ccb02b..de1cff85816 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -24,40 +24,40 @@ static int __evict_walk_file( (S2C(s)->evict_threads.current_threads > 1) /* - * __evict_lock_dhandle -- - * Try to get the dhandle lock, with yield and sleep back off. + * __evict_lock_handle_list -- + * Try to get the handle list lock, with yield and sleep back off. * Keep timing statistics overall. */ static int -__evict_lock_dhandle(WT_SESSION_IMPL *session) +__evict_lock_handle_list(WT_SESSION_IMPL *session) { struct timespec enter, leave; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - WT_SPINLOCK *dh_lock; - int64_t **stats; + WT_RWLOCK *dh_lock; u_int spins; bool dh_stats; conn = S2C(session); cache = conn->cache; dh_lock = &conn->dhandle_lock; - stats = (int64_t **)conn->stats; - dh_stats = WT_STAT_ENABLED(session) && dh_lock->stat_count_off != -1; /* - * Maintain lock acquisition timing statistics as if this were a - * regular lock acquisition. + * Setup tracking of handle lock acquisition wait time if statistics + * are enabled. */ + dh_stats = WT_STAT_ENABLED(session); + if (dh_stats) __wt_epoch(session, &enter); + /* * Use a custom lock acquisition back off loop so the eviction server * notices any interrupt quickly. 
*/ for (spins = 0; - (ret = __wt_spin_trylock_track(session, dh_lock)) == EBUSY && + (ret = __wt_try_readlock(session, dh_lock)) == EBUSY && cache->pass_intr == 0; spins++) { if (spins < WT_THOUSAND) __wt_yield(); @@ -70,8 +70,9 @@ __evict_lock_dhandle(WT_SESSION_IMPL *session) WT_RET(ret); if (dh_stats) { __wt_epoch(session, &leave); - stats[session->stat_bucket][dh_lock->stat_int_usecs_off] += - (int64_t)WT_TIMEDIFF_US(leave, enter); + WT_STAT_CONN_INCRV( + session, lock_handle_list_wait_eviction, + (int64_t)WT_TIMEDIFF_US(leave, enter)); } return (0); } @@ -379,18 +380,17 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) * otherwise we can block applications evicting large pages. */ if (!__wt_cache_stuck(session)) { - /* - * If we gave up acquiring the lock, that indicates a - * session is waiting for us to clear walks. Do that - * as part of a normal pass (without the handle list + * Try to get the handle list lock: if we give up, that + * indicates a session is waiting for us to clear walks. Do + * that as part of a normal pass (without the handle list * lock) to avoid deadlock. */ - if ((ret = __evict_lock_dhandle(session)) == EBUSY) + if ((ret = __evict_lock_handle_list(session)) == EBUSY) return (0); WT_RET(ret); ret = __evict_clear_all_walks(session); - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); WT_RET(ret); cache->pages_evicted = 0; @@ -1321,7 +1321,7 @@ retry: while (slot < max_entries) { * reference count to keep it alive while we sweep. 
*/ if (!dhandle_locked) { - WT_ERR(__evict_lock_dhandle(session)); + WT_ERR(__evict_lock_handle_list(session)); dhandle_locked = true; } @@ -1400,7 +1400,7 @@ retry: while (slot < max_entries) { (void)__wt_atomic_addi32(&dhandle->session_inuse, 1); incr = true; - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); dhandle_locked = false; /* @@ -1447,7 +1447,7 @@ retry: while (slot < max_entries) { } err: if (dhandle_locked) { - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); dhandle_locked = false; } @@ -2319,8 +2319,11 @@ __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); WT_RET(__wt_msg(session, "cache dump")); - __wt_spin_lock(session, &conn->dhandle_lock); - TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); + if (dhandle == NULL) + break; if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; @@ -2331,7 +2334,6 @@ __wt_verbose_dump_cache(WT_SESSION_IMPL *session) if (ret != 0) break; } - __wt_spin_unlock(session, &conn->dhandle_lock); WT_RET(ret); /* diff --git a/src/evict/evict_stat.c b/src/evict/evict_stat.c index 2dd3b1e83a0..7c2d5722a63 100644 --- a/src/evict/evict_stat.c +++ b/src/evict/evict_stat.c @@ -134,5 +134,5 @@ __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_STAT_DATA_SET(session, cache_state_root_size, btree->root.page->memory_footprint); - WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session)); + __evict_stat_walk(session); } diff --git a/src/include/cache.i b/src/include/cache.i index 17ab39e97d2..d71978ccf35 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -364,7 +364,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) * block eviction), we don't want to highjack the thread for eviction. 
*/ if (F_ISSET(session, WT_SESSION_NO_EVICTION | - WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA)) + WT_SESSION_LOCKED_HANDLE_LIST_WRITE | WT_SESSION_LOCKED_SCHEMA)) return (0); /* In memory configurations don't block when the cache is full. */ diff --git a/src/include/connection.h b/src/include/connection.h index 64ac4271db1..3a719e59608 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -123,12 +123,16 @@ struct __wt_named_extractor { * main queue and the hashed queue. */ #define WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket) do { \ + WT_ASSERT(session, \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \ TAILQ_INSERT_HEAD(&(conn)->dhqh, dhandle, q); \ TAILQ_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashq); \ ++conn->dhandle_count; \ } while (0) #define WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket) do { \ + WT_ASSERT(session, \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \ TAILQ_REMOVE(&(conn)->dhqh, dhandle, q); \ TAILQ_REMOVE(&(conn)->dhhash[bucket], dhandle, hashq); \ --conn->dhandle_count; \ @@ -163,13 +167,13 @@ struct __wt_connection_impl { WT_SPINLOCK api_lock; /* Connection API spinlock */ WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */ - WT_SPINLOCK dhandle_lock; /* Data handle list spinlock */ WT_SPINLOCK fh_lock; /* File handle queue spinlock */ WT_SPINLOCK metadata_lock; /* Metadata update spinlock */ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ WT_SPINLOCK table_lock; /* Table creation spinlock */ WT_SPINLOCK turtle_lock; /* Turtle file spinlock */ + WT_RWLOCK dhandle_lock; /* Data handle list lock */ /* * We distribute the btree page locks across a set of spin locks. 
Don't diff --git a/src/include/dhandle.h b/src/include/dhandle.h index dcc788f0839..4f318e7bccf 100644 --- a/src/include/dhandle.h +++ b/src/include/dhandle.h @@ -37,6 +37,24 @@ #define WT_SESSION_META_DHANDLE(s) \ (((WT_CURSOR_BTREE *)((s)->meta_cursor))->btree->dhandle) +#define WT_DHANDLE_ACQUIRE(dhandle) \ + (void)__wt_atomic_add32(&dhandle->session_ref, 1) + +#define WT_DHANDLE_RELEASE(dhandle) \ + (void)__wt_atomic_sub32(&dhandle->session_ref, 1) + +#define WT_DHANDLE_NEXT(session, dhandle, head, field) do { \ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));\ + if (dhandle == NULL) \ + dhandle = TAILQ_FIRST(head); \ + else { \ + WT_DHANDLE_RELEASE(dhandle); \ + dhandle = TAILQ_NEXT(dhandle, field); \ + } \ + if (dhandle != NULL) \ + WT_DHANDLE_ACQUIRE(dhandle); \ +} while (0) + /* * WT_DATA_HANDLE -- * A handle for a generic named data source. diff --git a/src/include/extern.h b/src/include/extern.h index eb2f9a0e784..d7d58c58048 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -254,6 +254,7 @@ extern WT_THREAD_RET __wt_cache_pool_server(void *arg) WT_GCC_FUNC_DECL_ATTRIBUT extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_conn_dhandle_alloc( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/flags.h b/src/include/flags.h index 0b92a12c686..5219bf33ed6 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -53,22 +53,23 @@ #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_INTERNAL 0x00000002 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000004 -#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000008 -#define WT_SESSION_LOCKED_METADATA 0x00000010 -#define WT_SESSION_LOCKED_PASS 0x00000020 -#define WT_SESSION_LOCKED_SCHEMA 0x00000040 -#define WT_SESSION_LOCKED_SLOT 0x00000080 -#define WT_SESSION_LOCKED_TABLE 0x00000100 -#define WT_SESSION_LOCKED_TURTLE 0x00000200 -#define WT_SESSION_LOGGING_INMEM 0x00000400 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800 -#define WT_SESSION_NO_CACHE 0x00001000 -#define WT_SESSION_NO_DATA_HANDLES 0x00002000 -#define WT_SESSION_NO_EVICTION 0x00004000 -#define WT_SESSION_NO_LOGGING 0x00008000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000 -#define WT_SESSION_SERVER_ASYNC 0x00040000 +#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000008 +#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000010 +#define WT_SESSION_LOCKED_METADATA 0x00000020 +#define WT_SESSION_LOCKED_PASS 0x00000040 +#define WT_SESSION_LOCKED_SCHEMA 0x00000080 +#define WT_SESSION_LOCKED_SLOT 0x00000100 +#define WT_SESSION_LOCKED_TABLE 0x00000200 +#define WT_SESSION_LOCKED_TURTLE 0x00000400 +#define WT_SESSION_LOGGING_INMEM 0x00000800 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00001000 +#define WT_SESSION_NO_CACHE 0x00002000 +#define WT_SESSION_NO_DATA_HANDLES 
0x00004000 +#define WT_SESSION_NO_EVICTION 0x00008000 +#define WT_SESSION_NO_LOGGING 0x00010000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00020000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00040000 +#define WT_SESSION_SERVER_ASYNC 0x00080000 #define WT_STAT_CLEAR 0x00000001 #define WT_STAT_JSON 0x00000002 #define WT_STAT_ON_CLOSE 0x00000004 diff --git a/src/include/schema.h b/src/include/schema.h index bb116e5cf2f..fff57951c0e 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -78,6 +78,11 @@ struct __wt_table { */ #define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1) +/* Make it simple to check a generic locked state on the handle list lock */ +#define WT_SESSION_LOCKED_HANDLE_LIST \ + (WT_SESSION_LOCKED_HANDLE_LIST_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST_WRITE) + /* * WT_WITH_LOCK_WAIT -- * Wait for a lock, perform an operation, drop the lock. @@ -122,16 +127,47 @@ struct __wt_table { &S2C(session)->checkpoint_lock, WT_SESSION_LOCKED_CHECKPOINT, op) /* - * WT_WITH_HANDLE_LIST_LOCK -- - * Acquire the data handle list lock, perform an operation, drop the lock. + * WT_WITH_HANDLE_LIST_READ_LOCK -- + * Acquire the data handle list lock in shared mode, perform an operation, + * drop the lock. The handle list lock is a read-write lock so the + * implementation is different to the other lock macros. * * Note: always waits because some operations need the handle list lock to * discard handles, and we only expect it to be held across short * operations. 
*/ -#define WT_WITH_HANDLE_LIST_LOCK(session, op) \ - WT_WITH_LOCK_WAIT(session, \ - &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op) +#define WT_WITH_HANDLE_LIST_READ_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { \ + op; \ + } else { \ + __wt_readlock(session, &S2C(session)->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + __wt_readunlock(session, &S2C(session)->dhandle_lock); \ + } \ +} while (0) + +/* + * WT_WITH_HANDLE_LIST_WRITE_LOCK -- + * Acquire the data handle list lock in exclusive mode, perform an operation, + * drop the lock. The handle list lock is a read-write lock so the + * implementation is different to the other lock macros. + * The read lock must not be held: there is no upgrade from read to write. + */ +#define WT_WITH_HANDLE_LIST_WRITE_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ));\ + __wt_writelock(session, &S2C(session)->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + __wt_writeunlock(session, &S2C(session)->dhandle_lock); \ + } \ +} while (0) /* * WT_WITH_METADATA_LOCK -- @@ -192,15 +228,21 @@ struct __wt_table { WT_CONNECTION_IMPL *__conn = S2C(session); \ bool __checkpoint_locked = \ F_ISSET(session, WT_SESSION_LOCKED_CHECKPOINT); \ - bool __handle_locked = \ - F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ + bool __handle_read_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + bool __handle_write_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ bool __table_locked = \ F_ISSET(session, WT_SESSION_LOCKED_TABLE); \ bool __schema_locked = \ F_ISSET(session, WT_SESSION_LOCKED_SCHEMA); \ - if (__handle_locked) { \ - F_CLR(session,
WT_SESSION_LOCKED_HANDLE_LIST); \ - __wt_spin_unlock(session, &__conn->dhandle_lock); \ + if (__handle_read_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + __wt_readunlock(session, &__conn->dhandle_lock); \ + } \ + if (__handle_write_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + __wt_writeunlock(session, &__conn->dhandle_lock); \ } \ if (__table_locked) { \ F_CLR(session, WT_SESSION_LOCKED_TABLE); \ @@ -227,8 +269,12 @@ struct __wt_table { __wt_spin_lock(session, &__conn->table_lock); \ F_SET(session, WT_SESSION_LOCKED_TABLE); \ } \ - if (__handle_locked) { \ - __wt_spin_lock(session, &__conn->dhandle_lock); \ - F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ + if (__handle_read_locked) { \ + __wt_readlock(session, &__conn->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + } \ + if (__handle_write_locked) { \ + __wt_writelock(session, &__conn->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ } \ } while (0) diff --git a/src/include/stat.h b/src/include/stat.h index fd3e3290d95..8b2e78a4ed5 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -392,9 +392,7 @@ struct __wt_connection_stats { int64_t lock_checkpoint_count; int64_t lock_checkpoint_wait_application; int64_t lock_checkpoint_wait_internal; - int64_t lock_handle_list_count; - int64_t lock_handle_list_wait_application; - int64_t lock_handle_list_wait_internal; + int64_t lock_handle_list_wait_eviction; int64_t lock_metadata_count; int64_t lock_metadata_wait_application; int64_t lock_metadata_wait_internal; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index f05d3d4ab55..d1e3d383396 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -4595,240 +4595,236 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1133 /*! 
lock: checkpoint lock internal thread wait time (usecs) */ #define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1134 -/*! lock: handle-list lock acquisitions */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_COUNT 1135 -/*! lock: handle-list lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_APPLICATION 1136 -/*! lock: handle-list lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_INTERNAL 1137 +/*! lock: handle-list lock eviction thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_EVICTION 1135 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1138 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1136 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1139 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1137 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1140 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1138 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1141 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1139 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1142 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1140 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1143 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1141 /*! lock: table lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_COUNT 1144 +#define WT_STAT_CONN_LOCK_TABLE_COUNT 1142 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1145 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1143 /*! 
* lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1146 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1144 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1147 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1145 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1148 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1146 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1149 +#define WT_STAT_CONN_LOG_SLOT_RACES 1147 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1150 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1148 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1151 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1149 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1152 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1150 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1153 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1151 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1154 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1152 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1155 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1153 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1156 +#define WT_STAT_CONN_LOG_FLUSH 1154 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1157 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1155 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1158 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1156 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1159 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1157 /*! 
log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1160 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1158 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1161 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1159 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1162 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1160 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1163 +#define WT_STAT_CONN_LOG_SCANS 1161 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1164 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1162 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1165 +#define WT_STAT_CONN_LOG_WRITE_LSN 1163 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1166 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1164 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1167 +#define WT_STAT_CONN_LOG_SYNC 1165 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1168 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1166 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1169 +#define WT_STAT_CONN_LOG_SYNC_DIR 1167 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1170 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1168 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1171 +#define WT_STAT_CONN_LOG_WRITES 1169 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1172 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1170 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1173 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1171 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1174 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1172 /*! 
log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1175 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1173 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1176 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1174 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1177 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1175 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1178 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1176 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1179 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1177 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1180 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1178 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1181 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1179 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1182 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1180 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1183 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1181 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1184 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1182 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1185 +#define WT_STAT_CONN_REC_PAGES 1183 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1186 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1184 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1187 +#define WT_STAT_CONN_REC_PAGE_DELETE 1185 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1188 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1186 /*! 
reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1189 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1187 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1190 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1188 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1191 +#define WT_STAT_CONN_SESSION_OPEN 1189 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1192 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1190 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1193 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1191 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1194 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1192 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1195 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1193 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1196 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1194 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1197 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1195 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1198 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1196 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1199 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1197 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1200 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1198 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1201 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1199 /*! 
session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1202 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1200 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1203 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1201 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1204 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1202 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1205 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1203 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1206 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1204 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1207 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1205 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1208 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1206 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1209 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1207 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1210 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1208 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1211 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1209 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1212 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1210 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1213 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1211 /*! 
thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1214 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1212 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1215 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1213 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1216 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1214 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1217 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1215 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1218 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1216 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1219 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1217 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1220 +#define WT_STAT_CONN_PAGE_SLEEP 1218 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1221 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1219 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1222 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1220 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1223 +#define WT_STAT_CONN_TXN_BEGIN 1221 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1224 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1222 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1225 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1223 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1226 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1224 /*! 
transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1227 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1225 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1228 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1226 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1229 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1227 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1230 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1228 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1231 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1229 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1232 +#define WT_STAT_CONN_TXN_CHECKPOINT 1230 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1233 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1231 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1234 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1232 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1235 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1233 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1236 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1234 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1237 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1235 /*! 
transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1238 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1236 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1239 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1237 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1240 +#define WT_STAT_CONN_TXN_SYNC 1238 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1241 +#define WT_STAT_CONN_TXN_COMMIT 1239 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1242 +#define WT_STAT_CONN_TXN_ROLLBACK 1240 /*! * @} diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index a2511f48e2b..60afbc99ade 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -1692,8 +1692,8 @@ __wt_clsm_open(WT_SESSION_IMPL *session, bulk = cval.val != 0; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree)); + ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree); + /* * Check whether the exclusive open for a bulk load succeeded, and * if it did ensure that it's safe to bulk load into the tree. 
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index cbd83a5cd30..6dc06146179 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -387,8 +387,8 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) __wt_sleep(0, 10000); if (TAILQ_EMPTY(&conn->lsmqh)) continue; - __wt_spin_lock(session, &conn->dhandle_lock); - F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readlock(session, &conn->dhandle_lock); + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); dhandle_locked = true; TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { if (!lsm_tree->active) @@ -448,14 +448,14 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) session, WT_LSM_WORK_MERGE, 0, lsm_tree)); } } - __wt_spin_unlock(session, &conn->dhandle_lock); - F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readunlock(session, &conn->dhandle_lock); + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); dhandle_locked = false; } err: if (dhandle_locked) { - __wt_spin_unlock(session, &conn->dhandle_lock); - F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readunlock(session, &conn->dhandle_lock); + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); } return (ret); } diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index 150de968722..21e8991be94 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -33,9 +33,7 @@ __curstat_lsm_init( "checkpoint=" WT_CHECKPOINT, NULL, NULL }; locked = false; - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree)); WT_ERR(__wt_scr_alloc(session, 0, &uribuf)); /* Propagate all, fast and/or clear to the cursors we open. */ diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 71a981a6284..a9275976023 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -38,7 +38,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) /* We may be destroying an lsm_tree before it was added. 
*/ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) { WT_ASSERT(session, final || - F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q); } @@ -321,9 +321,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, metadata = NULL; /* If the tree can be opened, it already exists. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - if (ret == 0) { + if ((ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)) == 0) { __wt_lsm_tree_release(session, lsm_tree); return (exclusive ? EEXIST : 0); } @@ -339,7 +337,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_tree_open(session, uri, true, &lsm_tree)); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); @@ -404,6 +402,9 @@ __lsm_tree_find(WT_SESSION_IMPL *session, } *treep = lsm_tree; + + WT_ASSERT(session, lsm_tree->excl_session == + (exclusive ? session : NULL)); return (0); } @@ -456,7 +457,8 @@ __lsm_tree_open(WT_SESSION_IMPL *session, conn = S2C(session); lsm_tree = NULL; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); /* Start the LSM manager thread if it isn't running. */ if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1)) @@ -520,14 +522,21 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session, { WT_DECL_RET; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - - ret = __lsm_tree_find(session, uri, exclusive, treep); + /* + * Dropping and re-acquiring the lock is safe here, since the tree open + * call checks to see if another thread beat it to opening the tree + * before proceeding. 
+ */ + if (exclusive) + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_tree_find(session, uri, exclusive, treep)); + else + WT_WITH_HANDLE_LIST_READ_LOCK(session, + ret = __lsm_tree_find(session, uri, exclusive, treep)); if (ret == WT_NOTFOUND) - ret = __lsm_tree_open(session, uri, exclusive, treep); + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_tree_open(session, uri, exclusive, treep)); - WT_ASSERT(session, ret != 0 || - (*treep)->excl_session == (exclusive ? session : NULL)); return (ret); } @@ -857,9 +866,7 @@ __wt_lsm_tree_alter( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -899,9 +906,7 @@ __wt_lsm_tree_drop( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree)); WT_ASSERT(session, !lsm_tree->active); /* Prevent any new opens. */ @@ -934,7 +939,7 @@ __wt_lsm_tree_drop( WT_ASSERT(session, !lsm_tree->active); err: if (locked) __wt_lsm_tree_writeunlock(session, lsm_tree); - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); return (ret); @@ -960,9 +965,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -1007,7 +1010,7 @@ err: if (locked) * Discard this LSM tree structure. The first operation on the renamed * tree will create a new one. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); return (ret); @@ -1032,9 +1035,7 @@ __wt_lsm_tree_truncate( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -1068,7 +1069,7 @@ err: if (locked) * the last good version of the metadata will be used, resulting * in a valid (not truncated) tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); } @@ -1157,9 +1158,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) /* Tell __wt_schema_worker not to look inside the LSM tree. */ *skipp = true; - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, false, &lsm_tree)); if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) WT_ERR_MSG(session, EINVAL, @@ -1356,9 +1355,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session, locked = false; exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE); - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); /* * We mark that we're busy using the tree to coordinate diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index d9c185a3f58..4349acf7b55 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -276,7 +276,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { - WT_WITH_HANDLE_LIST_LOCK(session, + 
WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_discard_handle(session, chunk->uri, NULL)); if (ret == 0) chunk->evicted = 1; @@ -517,7 +517,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) * * This will fail with EBUSY if the file is still in use. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT)); WT_RET(ret); diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c index c1a4f257648..49801e4e5f9 100644 --- a/src/schema/schema_drop.c +++ b/src/schema/schema_drop.c @@ -30,7 +30,7 @@ __drop_file( WT_RET(__wt_schema_backup_check(session, filename)); /* Close all btree handles associated with this file. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all(session, uri, force)); WT_RET(ret); diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c index f512482c162..a374f4c2831 100644 --- a/src/schema/schema_rename.c +++ b/src/schema/schema_rename.c @@ -33,7 +33,7 @@ __rename_file( WT_RET(__wt_schema_backup_check(session, filename)); WT_RET(__wt_schema_backup_check(session, newfile)); /* Close any btree handles in the file. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all(session, uri, false)); WT_ERR(ret); diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index fb7f8cec074..e5f71b5d56f 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, * any open file handles, including checkpoints. 
*/ if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) { - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all( session, uri, false)); WT_ERR(ret); diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index f1251794b89..ee9bddbfc19 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -44,8 +44,7 @@ __session_discard_dhandle( TAILQ_REMOVE(&session->dhandles, dhandle_cache, q); TAILQ_REMOVE(&session->dhhash[bucket], dhandle_cache, hashq); - (void)__wt_atomic_sub32(&dhandle_cache->dhandle->session_ref, 1); - + WT_DHANDLE_RELEASE(dhandle_cache->dhandle); __wt_overwrite_and_free(session, dhandle_cache); } @@ -412,17 +411,27 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) /* * __session_find_shared_dhandle -- * Search for a data handle in the connection and add it to a session's - * cache. Since the data handle isn't locked, this must be called holding - * the handle list lock, and we must increment the handle's reference - * count before releasing it. + * cache. We must increment the handle's reference count while holding + * the handle list lock. */ static int __session_find_shared_dhandle( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { - WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint)); - (void)__wt_atomic_add32(&session->dhandle->session_ref, 1); - return (0); + WT_DECL_RET; + + WT_WITH_HANDLE_LIST_READ_LOCK(session, + if ((ret = __wt_conn_dhandle_find(session, uri, checkpoint)) == 0) + WT_DHANDLE_ACQUIRE(session->dhandle)); + + if (ret != WT_NOTFOUND) + return (ret); + + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + if ((ret = __wt_conn_dhandle_alloc(session, uri, checkpoint)) == 0) + WT_DHANDLE_ACQUIRE(session->dhandle)); + + return (ret); } /* @@ -450,16 +459,16 @@ __session_get_dhandle( * We didn't find a match in the session cache, search the shared * handle list and cache the handle we find. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __session_find_shared_dhandle(session, uri, checkpoint)); - WT_RET(ret); + WT_RET(__session_find_shared_dhandle(session, uri, checkpoint)); /* * Fixup the reference count on failure (we incremented the reference * count while holding the handle-list lock). */ - if ((ret = __session_add_dhandle(session)) != 0) - (void)__wt_atomic_sub32(&session->dhandle->session_ref, 1); + if ((ret = __session_add_dhandle(session)) != 0) { + WT_DHANDLE_RELEASE(session->dhandle); + session->dhandle = NULL; + } return (ret); } @@ -505,17 +514,15 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, * reopen handles in the meantime. A combination of the schema * and handle list locks are used to enforce this. */ - if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) || - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { + if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { dhandle->excl_session = NULL; dhandle->excl_ref = 0; F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); __wt_writeunlock(session, &dhandle->rwlock); WT_WITH_SCHEMA_LOCK(session, - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_session_get_btree( - session, uri, checkpoint, cfg, flags))); + ret = __wt_session_get_btree( + session, uri, checkpoint, cfg, flags)); return (ret); } diff --git a/src/support/stat.c b/src/support/stat.c index 167d17137ce..fd38e1b79ee 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -759,9 +759,7 @@ static const char * const __stats_connection_desc[] = { "lock: checkpoint lock acquisitions", "lock: checkpoint lock application thread wait time (usecs)", "lock: checkpoint lock internal thread wait time (usecs)", - "lock: handle-list lock acquisitions", - "lock: handle-list lock application thread wait time (usecs)", - "lock: handle-list lock internal thread wait time (usecs)", + "lock: handle-list lock eviction thread wait time (usecs)", "lock: metadata lock acquisitions", "lock: metadata lock application thread wait time (usecs)", "lock: metadata lock 
internal thread wait time (usecs)", @@ -1044,9 +1042,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->lock_checkpoint_count = 0; stats->lock_checkpoint_wait_application = 0; stats->lock_checkpoint_wait_internal = 0; - stats->lock_handle_list_count = 0; - stats->lock_handle_list_wait_application = 0; - stats->lock_handle_list_wait_internal = 0; + stats->lock_handle_list_wait_eviction = 0; stats->lock_metadata_count = 0; stats->lock_metadata_wait_application = 0; stats->lock_metadata_wait_internal = 0; @@ -1351,12 +1347,8 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, lock_checkpoint_wait_application); to->lock_checkpoint_wait_internal += WT_STAT_READ(from, lock_checkpoint_wait_internal); - to->lock_handle_list_count += - WT_STAT_READ(from, lock_handle_list_count); - to->lock_handle_list_wait_application += - WT_STAT_READ(from, lock_handle_list_wait_application); - to->lock_handle_list_wait_internal += - WT_STAT_READ(from, lock_handle_list_wait_internal); + to->lock_handle_list_wait_eviction += + WT_STAT_READ(from, lock_handle_list_wait_eviction); to->lock_metadata_count += WT_STAT_READ(from, lock_metadata_count); to->lock_metadata_wait_application += WT_STAT_READ(from, lock_metadata_wait_application); diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 3b19162fd3d..7b33b0c7788 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -640,9 +640,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, session->ckpt_handle_next == 0); WT_WITH_SCHEMA_LOCK(session, WT_WITH_TABLE_LOCK(session, - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __checkpoint_apply_all( - session, cfg, __wt_checkpoint_get_handles, NULL)))); + ret = __checkpoint_apply_all( + session, cfg, __wt_checkpoint_get_handles, NULL))); WT_ERR(ret); /* -- cgit v1.2.1 From 0562f92104f0b2d8ef218d9fe465ef718bc2d9cd Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Thu, 2 Feb 2017 16:40:30 +1100 Subject: WT-3150 Reduce impact of checkpoints 
on eviction. (#3265) In particular, don't have the eviction server give up all walks each time it is interrupted, and only wait for requesting threads to make progress: don't go to sleep. --- src/evict/evict_lru.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index de1cff85816..3cb513fd87b 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -281,7 +281,7 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - bool did_work; + bool did_work, was_intr; conn = S2C(session); cache = conn->cache; @@ -309,8 +309,21 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) ret = __evict_server(session, &did_work); F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS); F_CLR(session, WT_SESSION_LOCKED_PASS); + was_intr = cache->pass_intr != 0; __wt_spin_unlock(session, &cache->evict_pass_lock); WT_ERR(ret); + + /* + * If the eviction server was interrupted, wait until + * requests have been processed: the system may + * otherwise be busy so don't go to sleep. + */ + if (was_intr) { + while (cache->pass_intr != 0) + __wt_yield(); + continue; + } + __wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"); /* Don't rely on signals: check periodically. */ @@ -372,7 +385,8 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) /* Evict pages from the cache as needed. */ WT_RET(__evict_pass(session)); - if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) + if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) || + cache->pass_intr != 0) return (0); /* -- cgit v1.2.1 From 3e68fb2d7da35eeb122308971f02203c58caa538 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Fri, 3 Feb 2017 03:28:50 +1100 Subject: WT-3139 Enhance wtperf to support periodic table scans (#3268) * Enhance wtperf to support periodic table scans * Implement scans as read_range. * Use a random cursor to set key in table properly. 
* Don't allow insert workload with table specifier. * Reset the rand cursor so it isn't positioned. * Make wtperf pre_load_data an option. --- bench/wtperf/config.c | 42 ++++++- bench/wtperf/idle_table_cycle.c | 2 + bench/wtperf/stress/btree-split-stress.wtperf | 3 +- bench/wtperf/wtperf.c | 163 ++++++++++++++++++++------ bench/wtperf/wtperf.h | 5 + bench/wtperf/wtperf_opt.i | 10 +- src/docs/wtperf.dox | 6 +- 7 files changed, 183 insertions(+), 48 deletions(-) diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c index a15a3485dde..9eea99eeec4 100644 --- a/bench/wtperf/config.c +++ b/bench/wtperf/config.c @@ -215,6 +215,7 @@ config_threads(WTPERF *wtperf, const char *config, size_t len) return (EINVAL); } workp = &wtperf->workload[wtperf->workload_cnt++]; + workp->table_index = INT32_MAX; while ((ret = scan->next(scan, &k, &v)) == 0) { if (STRING_MATCH("count", k.str, k.len)) { @@ -233,12 +234,28 @@ config_threads(WTPERF *wtperf, const char *config, size_t len) goto err; continue; } + if (STRING_MATCH("pause", k.str, k.len)) { + if ((workp->pause = v.val) < 0) + goto err; + continue; + } if (STRING_MATCH("read", k.str, k.len) || STRING_MATCH("reads", k.str, k.len)) { if ((workp->read = v.val) < 0) goto err; continue; } + if (STRING_MATCH("read_range", k.str, k.len)) { + if ((workp->read_range = v.val) < 0) + goto err; + continue; + } + if (STRING_MATCH("table", k.str, k.len)) { + if (v.val <= 0) + goto err; + workp->table_index = (int32_t)v.val - 1; + continue; + } if (STRING_MATCH("throttle", k.str, k.len)) { workp->throttle = (uint64_t)v.val; continue; @@ -760,16 +777,33 @@ config_sanity(WTPERF *wtperf) opts->value_sz_min = opts->value_sz; } - if (opts->readonly && wtperf->workload != NULL) + if (wtperf->workload != NULL) for (i = 0, workp = wtperf->workload; - i < wtperf->workload_cnt; ++i, ++workp) - if (workp->insert != 0 || workp->update != 0 || - workp->truncate != 0) { + i < wtperf->workload_cnt; ++i, ++workp) { + if (opts->readonly && + (workp->insert 
!= 0 || workp->update != 0 || + workp->truncate != 0)) { fprintf(stderr, "Invalid workload: insert, update or " "truncate specified with readonly\n"); return (EINVAL); } + if (workp->insert != 0 && + workp->table_index != INT32_MAX) { + fprintf(stderr, + "Invalid workload: Cannot insert into " + "specific table only\n"); + return (EINVAL); + } + if (workp->table_index != INT32_MAX && + workp->table_index >= (int32_t)opts->table_count) { + fprintf(stderr, + "Workload table index %" PRId32 + " is larger than table count %" PRId32, + workp->table_index, opts->table_count); + return (EINVAL); + } + } return (0); } diff --git a/bench/wtperf/idle_table_cycle.c b/bench/wtperf/idle_table_cycle.c index 13fa55e86f5..bb44cfbde59 100644 --- a/bench/wtperf/idle_table_cycle.c +++ b/bench/wtperf/idle_table_cycle.c @@ -120,6 +120,7 @@ cycle_idle_tables(void *arg) return (NULL); start = stop; +#if 1 /* * Drop the table. Keep retrying on EBUSY failure - it is an * expected return when checkpoints are happening. @@ -136,6 +137,7 @@ cycle_idle_tables(void *arg) } if (check_timing(wtperf, "drop", start, &stop) != 0) return (NULL); +#endif } return (NULL); diff --git a/bench/wtperf/stress/btree-split-stress.wtperf b/bench/wtperf/stress/btree-split-stress.wtperf index 86bb288fc6d..eb6ca1cfddc 100644 --- a/bench/wtperf/stress/btree-split-stress.wtperf +++ b/bench/wtperf/stress/btree-split-stress.wtperf @@ -6,5 +6,4 @@ run_time=300 reopen_connection=false populate_threads=2 value_sz=256 -read_range=100 -threads=((count=4,inserts=1,throttle=100000),(count=8,reads=1)) +threads=((count=4,inserts=1,throttle=100000),(count=8,reads=1,read_range=100)) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index baa259f8817..044fd38dc06 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -432,19 +432,17 @@ err: wtperf->error = wtperf->stop = true; * search do them. Ensuring the keys we see are always in order. 
*/ static int -do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor) +do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor, int64_t read_range) { - CONFIG_OPTS *opts; - size_t range; uint64_t next_val, prev_val; + int64_t range; char *range_key_buf; char buf[512]; int ret; - opts = wtperf->opts; ret = 0; - if (opts->read_range == 0) + if (read_range == 0) return (0); memset(&buf[0], 0, 512 * sizeof(char)); @@ -454,7 +452,7 @@ do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor) testutil_check(cursor->get_key(cursor, &range_key_buf)); extract_key(range_key_buf, &next_val); - for (range = 0; range < opts->read_range; ++range) { + for (range = 0; range < read_range; ++range) { prev_val = next_val; ret = cursor->next(cursor); /* We are done if we reach the end. */ @@ -475,12 +473,56 @@ do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor) return (0); } +/* pre_load_data -- + * Pull everything into cache before starting the workload phase. + */ +static int +pre_load_data(WTPERF *wtperf) +{ + CONFIG_OPTS *opts; + WT_CONNECTION *conn; + WT_CURSOR *cursor; + WT_SESSION *session; + char *key; + int ret; + size_t i; + + opts = wtperf->opts; + conn = wtperf->conn; + + if ((ret = conn->open_session( + conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, "worker: WT_CONNECTION.open_session"); + goto err; + } + for (i = 0; i < opts->table_count; i++) { + if ((ret = session->open_cursor(session, + wtperf->uris[i], NULL, NULL, &cursor)) != 0) { + lprintf(wtperf, ret, 0, + "worker: WT_SESSION.open_cursor: %s", + wtperf->uris[i]); + goto err; + } + while (cursor->next(cursor) == 0) + if ((ret = cursor->get_key(cursor, &key)) != 0) + goto err; + if ((ret = cursor->close(cursor)) != 0) + goto err; + } + if ((ret = session->close(session, NULL)) != 0) + goto err; + if (ret != 0) +err: lprintf(wtperf, ret, 0, "Pre-workload traverse error"); + return (ret); +} + static void * worker(void *arg) { struct timespec start, stop; CONFIG_OPTS *opts; TRACK *trk; + WORKLOAD *workload; 
WTPERF *wtperf; WTPERF_THREAD *thread; WT_CONNECTION *conn; @@ -495,13 +537,14 @@ worker(void *arg) char buf[512]; thread = (WTPERF_THREAD *)arg; + workload = thread->workload; wtperf = thread->wtperf; opts = wtperf->opts; conn = wtperf->conn; cursors = NULL; - log_table_cursor = NULL; /* -Wconditional-initialized */ + cursor = log_table_cursor = NULL; /* -Wconditional-initialized */ ops = 0; - ops_per_txn = thread->workload->ops_per_txn; + ops_per_txn = workload->ops_per_txn; session = NULL; trk = NULL; @@ -510,7 +553,6 @@ worker(void *arg) lprintf(wtperf, ret, 0, "worker: WT_CONNECTION.open_session"); goto err; } - cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *)); for (i = 0; i < opts->table_count_idle; i++) { snprintf(buf, 512, "%s_idle%05d", wtperf->uris[0], (int)i); if ((ret = session->open_cursor( @@ -525,14 +567,34 @@ worker(void *arg) goto err; } } - for (i = 0; i < opts->table_count; i++) { + if (workload->table_index != INT32_MAX) { if ((ret = session->open_cursor(session, - wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) { + wtperf->uris[workload->table_index], + NULL, NULL, &cursor)) != 0) { lprintf(wtperf, ret, 0, "worker: WT_SESSION.open_cursor: %s", - wtperf->uris[i]); + wtperf->uris[workload->table_index]); + goto err; + } + if ((ret = session->open_cursor(session, + wtperf->uris[workload->table_index], + NULL, "next_random=true", &thread->rand_cursor)) != 0) { + lprintf(wtperf, ret, 0, + "worker: WT_SESSION.open_cursor: random %s", + wtperf->uris[workload->table_index]); goto err; } + } else { + cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *)); + for (i = 0; i < opts->table_count; i++) { + if ((ret = session->open_cursor(session, + wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) { + lprintf(wtperf, ret, 0, + "worker: WT_SESSION.open_cursor: %s", + wtperf->uris[i]); + goto err; + } + } } if (opts->log_like_table && (ret = session->open_cursor(session, wtperf->log_table_uri, NULL, NULL, &log_table_cursor)) != 0) { @@ -543,19 
+605,19 @@ worker(void *arg) } /* Setup the timer for throttling. */ - if (thread->workload->throttle != 0) + if (workload->throttle != 0) setup_throttle(thread); /* Setup for truncate */ - if (thread->workload->truncate != 0) + if (workload->truncate != 0) if ((ret = setup_truncate(wtperf, thread, session)) != 0) goto err; key_buf = thread->key_buf; value_buf = thread->value_buf; - op = thread->workload->ops; - op_end = op + sizeof(thread->workload->ops); + op = workload->ops; + op_end = op + sizeof(workload->ops); if ((ops_per_txn != 0 || opts->log_like_table) && (ret = session->begin_transaction(session, NULL)) != 0) { @@ -564,6 +626,8 @@ worker(void *arg) } while (!wtperf->stop) { + if (workload->pause != 0) + (void)sleep((unsigned int)workload->pause); /* * Generate the next key and setup operation specific * statistics tracking objects. @@ -603,10 +667,12 @@ worker(void *arg) generate_key(opts, key_buf, next_val); - /* - * Spread the data out around the multiple databases. - */ - cursor = cursors[map_key_to_table(wtperf->opts, next_val)]; + if (workload->table_index == INT32_MAX) + /* + * Spread the data out around the multiple databases. + */ + cursor = cursors[ + map_key_to_table(wtperf->opts, next_val)]; /* * Skip the first time we do an operation, when trk->ops @@ -642,7 +708,8 @@ worker(void *arg) * for several operations, confirming that the * next key is in the correct order. 
*/ - ret = do_range_reads(wtperf, cursor); + ret = do_range_reads(wtperf, + cursor, workload->read_range); } if (ret == 0 || ret == WT_NOTFOUND) @@ -689,7 +756,7 @@ worker(void *arg) */ strncpy(value_buf, value, opts->value_sz_max - 1); - if (thread->workload->update_delta != 0) + if (workload->update_delta != 0) update_value_delta(thread); if (value_buf[0] == 'a') value_buf[0] = 'b'; @@ -806,7 +873,7 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { /* Schedule the next operation */ if (++op == op_end) - op = thread->workload->ops; + op = workload->ops; /* * Decrement throttle ops and check if we should sleep @@ -843,7 +910,7 @@ run_mix_schedule_op(WORKLOAD *workp, int op, int64_t op_cnt) uint8_t *p, *end; /* Jump around the array to roughly spread out the operations. */ - jump = 100 / op_cnt; + jump = (int)(100 / op_cnt); /* * Find a read operation and replace it with another operation. This @@ -884,17 +951,6 @@ run_mix_schedule(WTPERF *wtperf, WORKLOAD *workp) opts = wtperf->opts; - /* Confirm reads, inserts, truncates and updates cannot all be zero. */ - if (workp->insert == 0 && workp->read == 0 && - workp->truncate == 0 && workp->update == 0) { - lprintf(wtperf, EINVAL, 0, "no operations scheduled"); - return (EINVAL); - } - - /* - * Handle truncate first - it's a special case that can't be used in - * a mixed workload. - */ if (workp->truncate != 0) { if (workp->insert != 0 || workp->read != 0 || workp->update != 0) { @@ -906,6 +962,12 @@ run_mix_schedule(WTPERF *wtperf, WORKLOAD *workp) return (0); } + /* Confirm reads, inserts and updates cannot all be zero. 
*/ + if (workp->insert == 0 && workp->read == 0 && workp->update == 0) { + lprintf(wtperf, EINVAL, 0, "no operations scheduled"); + return (EINVAL); + } + /* * Check for a simple case where the thread is only doing insert or * update operations (because the default operation for a @@ -2244,6 +2306,8 @@ start_run(WTPERF *wtperf) opts->checkpoint_threads, checkpoint_worker) != 0) goto err; } + if (opts->pre_load_data && (ret = pre_load_data(wtperf)) != 0) + goto err; /* Execute the workload. */ if ((ret = execute_workload(wtperf)) != 0) goto err; @@ -2827,13 +2891,42 @@ static uint64_t wtperf_rand(WTPERF_THREAD *thread) { CONFIG_OPTS *opts; + WT_CURSOR *rnd_cursor; WTPERF *wtperf; double S1, S2, U; uint64_t rval; + int ret; + char *key_buf; wtperf = thread->wtperf; opts = wtperf->opts; + /* + * If we have a random cursor set up then use it. + */ + if ((rnd_cursor = thread->rand_cursor) != NULL) { + if ((ret = rnd_cursor->next(rnd_cursor))) { + lprintf(wtperf, ret, 0, "worker: rand next failed"); + /* 0 is outside the expected range. */ + return (0); + } + if ((ret = rnd_cursor->get_key(rnd_cursor, &key_buf)) != 0) { + lprintf(wtperf, ret, 0, + "worker: rand next key retrieval"); + return (0); + } + /* + * Resetting the cursor is not fatal. We still return the + * value we retrieved above. We do it so that we don't + * leave a cursor positioned. + */ + if ((ret = rnd_cursor->reset(rnd_cursor)) != 0) + lprintf(wtperf, ret, 0, + "worker: rand cursor reset failed"); + extract_key(key_buf, &rval); + return (rval); + } + /* * Use WiredTiger's random number routine: it's lock-free and fairly * good. diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h index 81d74e134f6..db88d0b0271 100644 --- a/bench/wtperf/wtperf.h +++ b/bench/wtperf/wtperf.h @@ -66,6 +66,9 @@ typedef struct { uint64_t throttle; /* Maximum operations/second */ /* Number of operations per transaction. 
Zero for autocommit */ int64_t ops_per_txn; + int64_t pause; /* Time between scans */ + int64_t read_range; /* Range of reads */ + int32_t table_index; /* Table to focus ops on */ int64_t truncate; /* Truncate ratio */ uint64_t truncate_pct; /* Truncate Percent */ uint64_t truncate_count; /* Truncate Count */ @@ -225,6 +228,7 @@ typedef struct { struct __wtperf_thread { /* Per-thread structure */ WTPERF *wtperf; /* Enclosing configuration */ + WT_CURSOR *rand_cursor; /* Random key cursor */ WT_RAND_STATE rnd; /* Random number generation state */ @@ -241,6 +245,7 @@ struct __wtperf_thread { /* Per-thread structure */ TRACK ckpt; /* Checkpoint operations */ TRACK insert; /* Insert operations */ TRACK read; /* Read operations */ + TRACK scan; /* Scan operations */ TRACK update; /* Update operations */ TRACK truncate; /* Truncate operations */ TRACK truncate_sleep; /* Truncate sleep operations */ diff --git a/bench/wtperf/wtperf_opt.i b/bench/wtperf/wtperf_opt.i index 680eb53a90e..63cef4c28fb 100644 --- a/bench/wtperf/wtperf_opt.i +++ b/bench/wtperf/wtperf_opt.i @@ -145,12 +145,13 @@ DEF_OPT_AS_UINT32(populate_ops_per_txn, 0, "phase, zero for auto-commit") DEF_OPT_AS_UINT32(populate_threads, 1, "number of populate threads, 1 for bulk load") +DEF_OPT_AS_BOOL(pre_load_data, 0, + "Scan all data prior to starting the workload phase to warm the cache") DEF_OPT_AS_UINT32(random_range, 0, "if non zero choose a value from within this range as the key for " "insert operations") DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value") DEF_OPT_AS_BOOL(range_partition, 0, "partition data by range (vs hash)") -DEF_OPT_AS_UINT32(read_range, 0, "scan a range of keys after each search") DEF_OPT_AS_BOOL(readonly, 0, "reopen the connection between populate and workload phases in readonly " "mode. Requires reopen_connection turned on (default). 
Requires that " @@ -192,9 +193,10 @@ DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' " "'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' " "which would create 2 threads doing nothing but reads and 8 threads " "each doing 50% inserts and 25% reads and updates. Allowed configuration " - "values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', " - "'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are " - "also behavior modifiers, supported modifiers are 'ops_per_txn'") + "values are 'count', 'throttle', 'update_delta', 'reads', 'read_range', " + "'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. " + "There are also behavior modifiers, supported modifiers are " + "'ops_per_txn'") DEF_OPT_AS_CONFIG_STRING(transaction_config, "", "WT_SESSION.begin_transaction configuration string, applied during the " "populate phase when populate_ops_per_txn is nonzero") diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox index 83aadf8a776..2eac0fef3f4 100644 --- a/src/docs/wtperf.dox +++ b/src/docs/wtperf.dox @@ -195,14 +195,14 @@ use pareto distribution for random numbers. Zero to disable, otherwise a percen number of operations to group into each transaction in the populate phase, zero for auto-commit @par populate_threads (unsigned int, default=1) number of populate threads, 1 for bulk load +@par pre_load_data (boolean, default=false) +Scan all data prior to starting the workload phase to warm the cache @par random_range (unsigned int, default=0) if non zero choose a value from within this range as the key for insert operations @par random_value (boolean, default=false) generate random content for the value @par range_partition (boolean, default=false) partition data by range (vs hash) -@par read_range (unsigned int, default=0) -scan a range of keys after each search @par readonly (boolean, default=false) reopen the connection between populate and workload phases in readonly mode. 
Requires reopen_connection turned on (default). Requires that read be the only workload specified @par reopen_connection (boolean, default=true) @@ -228,7 +228,7 @@ number of tables to run operations over. Keys are divided evenly over the table @par table_count_idle (unsigned int, default=0) number of tables to create, that won't be populated. Default 0. @par threads (string, default="") -workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn' +workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. 
Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'read_range', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn' @par transaction_config (string, default="") WT_SESSION.begin_transaction configuration string, applied during the populate phase when populate_ops_per_txn is nonzero @par table_name (string, default="test") -- cgit v1.2.1 From 17ec908453f8dae29d18cd8ba172360ef0473c8f Mon Sep 17 00:00:00 2001 From: sueloverso Date: Thu, 2 Feb 2017 14:01:31 -0500 Subject: WT-3157 Fix checkpoint error path (#3274) --- src/txn/txn_ckpt.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 7b33b0c7788..90804db3240 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -181,7 +181,7 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[])) { WT_DECL_RET; - u_int i; + u_int i, j; /* If we have already locked the handles, apply the operation. */ for (i = 0; i < session->ckpt_handle_next; ++i) { @@ -189,10 +189,22 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], continue; WT_WITH_DHANDLE(session, session->ckpt_handle[i], ret = (*op)(session, cfg)); - WT_RET(ret); + WT_ERR(ret); } - return (0); +err: + /* + * If we have an error somewhere in processing the handles, then + * we need to mark earlier trees dirty. + */ + if (ret != 0) + for (j = 0; j < i; ++j) { + if (session->ckpt_handle[j] == NULL) + continue; + WT_WITH_DHANDLE(session, session->ckpt_handle[j], + S2BT(session)->modified = true); + } + return (ret); } /* @@ -824,7 +836,7 @@ err: /* * overwritten the checkpoint, so what ends up on disk is not * consistent. 
*/ - if (ret != 0 && !conn->modified) + if (ret != 0) conn->modified = true; session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; @@ -1340,7 +1352,6 @@ __checkpoint_tree( WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_LSN ckptlsn; - int was_modified; bool fake_ckpt; WT_UNUSED(cfg); @@ -1351,7 +1362,6 @@ __checkpoint_tree( conn = S2C(session); dhandle = session->dhandle; fake_ckpt = false; - was_modified = btree->modified; /* * Set the checkpoint LSN to the maximum LSN so that if logging is @@ -1482,10 +1492,9 @@ err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. */ - if (ret != 0 && !btree->modified && was_modified) { + if (ret != 0) { btree->modified = true; - if (!S2C(session)->modified) - S2C(session)->modified = true; + S2C(session)->modified = true; } __wt_meta_ckptlist_free(session, ckptbase); -- cgit v1.2.1 From 009959863f181a07d6c5bb73bcd0e4f1fded7b78 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Fri, 3 Feb 2017 12:57:31 +1100 Subject: WT-3150 Fix: don't spin forever during eviction interrupts. (#3276) --- src/evict/evict_lru.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 3cb513fd87b..a071730d4bd 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -319,7 +319,9 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) * otherwise be busy so don't go to sleep. */ if (was_intr) { - while (cache->pass_intr != 0) + while (cache->pass_intr != 0 && + F_ISSET(conn, WT_CONN_EVICTION_RUN) && + F_ISSET(thread, WT_THREAD_RUN)) __wt_yield(); continue; } -- cgit v1.2.1 From 6df1a46875156202f560d6d173ba0be7afe8ca98 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Fri, 3 Feb 2017 15:45:32 +1100 Subject: WT-3148 Improve efficiency of eviction with many small trees. 
(#3264) --- src/evict/evict_lru.c | 95 ++++++++++++++++++++++++++++++++++++++------------- src/include/btree.i | 22 ++++++++++++ src/include/extern.h | 1 + src/support/rand.c | 12 +++++++ 4 files changed, 106 insertions(+), 24 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index a071730d4bd..2b7b46e19fa 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1559,6 +1559,19 @@ __evict_walk_file(WT_SESSION_IMPL *session, start = queue->evict_queue + *slotp; remaining_slots = max_entries - *slotp; total_slots = max_entries - queue->evict_entries; + btree_inuse = cache_inuse = 0; + target_pages_clean = target_pages_dirty = 0; + + /* + * The number of times we should fill the queue by the end of + * considering all trees. + */ +#define QUEUE_FILLS_PER_PASS 10 + + /* + * The minimum number of pages we should consider per tree. + */ +#define MIN_PAGES_PER_TREE 10 /* * The target number of pages for this tree is proportional to the @@ -1567,13 +1580,12 @@ __evict_walk_file(WT_SESSION_IMPL *session, * cache (and only have to walk it once). */ if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { - btree_inuse = __wt_btree_bytes_inuse(session); + btree_inuse = __wt_btree_bytes_evictable(session); cache_inuse = __wt_cache_bytes_inuse(cache); bytes_per_slot = 1 + cache_inuse / total_slots; target_pages_clean = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); - } else - target_pages_clean = 0; + } if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) { btree_inuse = __wt_btree_dirty_leaf_inuse(session); @@ -1581,35 +1593,58 @@ __evict_walk_file(WT_SESSION_IMPL *session, bytes_per_slot = 1 + cache_inuse / total_slots; target_pages_dirty = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); - } else - target_pages_dirty = 0; + } - target_pages = WT_MAX(target_pages_clean, target_pages_dirty); + /* + * Weight the number of target pages by the number of times we want to + * fill the cache per pass through all the trees. 
Note that we don't + * build this into the calculation above because we don't want to favor + * small trees, so round to a whole number of slots (zero for small + * trees) before multiplying. + */ + target_pages = WT_MAX(target_pages_clean, target_pages_dirty) * + QUEUE_FILLS_PER_PASS; + /* + * Randomly walk trees with a small fraction of the cache in case there + * are so many trees that none of them use enough of the cache to be + * allocated slots. + * + * The chance of walking a tree is equal to the chance that a random + * byte in cache belongs to the tree, weighted by how many times we + * want to fill queues during a pass through all the trees in cache. + */ if (target_pages == 0) { - /* - * Randomly walk trees with a tiny fraction of the cache in - * case there are so many trees that none of them use enough of - * the cache to be allocated slots. Walk small trees 1% of the - * time. - */ - if (__wt_random(&session->rnd) > UINT32_MAX / 100) + if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { + btree_inuse = __wt_btree_bytes_evictable(session); + cache_inuse = __wt_cache_bytes_inuse(cache); + } else { + btree_inuse = __wt_btree_dirty_leaf_inuse(session); + cache_inuse = __wt_cache_dirty_leaf_inuse(cache); + } + if (btree_inuse == 0 || cache_inuse == 0) + return (0); + if (__wt_random64(&session->rnd) % cache_inuse > + btree_inuse * QUEUE_FILLS_PER_PASS) return (0); - target_pages = 10; } + /* + * There is some cost associated with walking a tree. If we're going + * to visit this tree, always look for a minimum number of pages. + */ + if (target_pages < MIN_PAGES_PER_TREE) + target_pages = MIN_PAGES_PER_TREE; + + /* + * If the tree is dead or we're near the end of the queue, fill the + * remaining slots. 
+ */ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || target_pages > remaining_slots) target_pages = remaining_slots; end = start + target_pages; - walk_flags = - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; - - /* Randomize the walk direction. */ - if (btree->evict_walk_reverse) - FLD_SET(walk_flags, WT_READ_PREV); - /* * Examine at least a reasonable number of pages before deciding * whether to give up. When we are only looking for dirty pages, @@ -1620,6 +1655,13 @@ __evict_walk_file(WT_SESSION_IMPL *session, !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) min_pages *= 10; + walk_flags = + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; + + /* Randomize the walk direction. */ + if (btree->evict_walk_reverse) + FLD_SET(walk_flags, WT_READ_PREV); + /* * Get some more eviction candidate pages. * @@ -1752,12 +1794,17 @@ fast: /* If the page can't be evicted, give up. */ session, cache_eviction_pages_queued, (u_int)(evict - start)); /* - * If we didn't find any candidates in the file, reverse the direction - * of the walk and skip it next time. + * If we gave up the walk, reverse the direction of the walk and skip it + * next time. */ if (give_up) btree->evict_walk_reverse = !btree->evict_walk_reverse; - if (pages_queued == 0 && !urgent_queued) + + /* + * If we couldn't find the number of pages we were looking for, skip + * the tree next time. + */ + if (pages_queued < target_pages / 2 && !urgent_queued) btree->evict_walk_period = WT_MIN( WT_MAX(1, 2 * btree->evict_walk_period), 100); else if (pages_queued == target_pages) diff --git a/src/include/btree.i b/src/include/btree.i index 09fa8df8c56..1e971fa81c9 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -70,6 +70,28 @@ __wt_btree_bytes_inuse(WT_SESSION_IMPL *session) return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_inmem)); } +/* + * __wt_btree_bytes_evictable -- + * Return the number of bytes that can be evicted (i.e.
bytes apart from + * the pinned root page). + */ +static inline uint64_t +__wt_btree_bytes_evictable(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + uint64_t bytes_inmem, bytes_root; + + btree = S2BT(session); + cache = S2C(session)->cache; + + bytes_inmem = btree->bytes_inmem; + bytes_root = btree->root.page->memory_footprint; + + return (bytes_inmem <= bytes_root ? 0 : + __wt_cache_bytes_plus_overhead(cache, bytes_inmem - bytes_root)); +} + /* * __wt_btree_dirty_inuse -- * Return the number of dirty bytes in use. diff --git a/src/include/extern.h b/src/include/extern.h index d7d58c58048..863d2a02861 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -688,6 +688,7 @@ extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2) WT_GCC_FUNC_DECL_ATTRIBUT extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern uint64_t __wt_random64(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/support/rand.c b/src/support/rand.c index a5b229b9abc..4fae43edc8e 100644 --- a/src/support/rand.c +++ b/src/support/rand.c @@ -120,3 +120,15 @@ __wt_random(WT_RAND_STATE volatile * rnd_state) return ((z << 16) + (w & 65535)); } + +/* + * __wt_random64 -- + * Return a 64-bit pseudo-random number. + */ +uint64_t +__wt_random64(WT_RAND_STATE volatile * rnd_state) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + return (((uint64_t)__wt_random(rnd_state) << 32) + + __wt_random(rnd_state)); +} -- cgit v1.2.1 From de3424c0bca2d7660acaff17383e05849d164a16 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Fri, 3 Feb 2017 17:00:41 +1100 Subject: WT-3148 Check that we have a root page when calculating evictable size. --- src/include/btree.i | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/include/btree.i b/src/include/btree.i index 1e971fa81c9..378d93dd2ee 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -80,13 +80,15 @@ __wt_btree_bytes_evictable(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; + WT_PAGE *root_page; uint64_t bytes_inmem, bytes_root; btree = S2BT(session); cache = S2C(session)->cache; + root_page = btree->root.page; bytes_inmem = btree->bytes_inmem; - bytes_root = btree->root.page->memory_footprint; + bytes_root = root_page == NULL ? 0 : root_page->memory_footprint; return (bytes_inmem <= bytes_root ? 0 : __wt_cache_bytes_plus_overhead(cache, bytes_inmem - bytes_root)); -- cgit v1.2.1 From b2173f8f063b1528dcd086f00ca8cf072f0445d0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Sun, 5 Feb 2017 19:55:36 -0500 Subject: WT-3111 util_create() doesn't free memory assigned to "uri" (#3279) Always print an error message if a WT_SESSION method fails, we don't know if the WiredTiger library printed out a message or not.
Free memory allocated by util_uri() in some cases where we either didn't have the necessary free call, or simply returned without freeing memory. Try and be more consistent with error messages, use the leading call as the first string, and any arguments to that call as the second. Replace some of the places we're writing to stderr explicitly with the utility error handlers. Initialize the return variable from util_uri() in all cases. Change error messages that referenced WT_SESSION.open to reference WT_SESSION.open_cursor. --- bench/wtperf/wtperf.c | 2 +- bench/wtperf/wtperf.h | 1 - src/evict/evict_lru.c | 5 ++--- src/utilities/util.h | 2 +- src/utilities/util_alter.c | 9 ++++++--- src/utilities/util_compact.c | 14 +++----------- src/utilities/util_create.c | 12 +++++++----- src/utilities/util_drop.c | 10 ++++++---- src/utilities/util_dump.c | 26 +++++++++++++------------- src/utilities/util_list.c | 21 ++++++++++----------- src/utilities/util_load.c | 2 +- src/utilities/util_load_json.c | 2 +- src/utilities/util_loadtext.c | 13 +++++++++---- src/utilities/util_main.c | 4 ++-- src/utilities/util_printlog.c | 14 +++----------- src/utilities/util_read.c | 19 +++++++++++++------ src/utilities/util_rebalance.c | 30 +++++++++++++----------------- src/utilities/util_rename.c | 15 ++++----------- src/utilities/util_salvage.c | 30 +++++++++++++----------------- src/utilities/util_stat.c | 6 +++--- src/utilities/util_truncate.c | 11 ++++++----- src/utilities/util_upgrade.c | 30 +++++++++++++----------------- src/utilities/util_verify.c | 34 +++++++++++++++------------------- src/utilities/util_write.c | 20 +++++++++++++------- 24 files changed, 158 insertions(+), 174 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 044fd38dc06..7f5e5ad3373 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -2905,7 +2905,7 @@ wtperf_rand(WTPERF_THREAD *thread) * If we have a random cursor set up then use it. 
*/ if ((rnd_cursor = thread->rand_cursor) != NULL) { - if ((ret = rnd_cursor->next(rnd_cursor))) { + if ((ret = rnd_cursor->next(rnd_cursor)) != 0) { lprintf(wtperf, ret, 0, "worker: rand next failed"); /* 0 is outside the expected range. */ return (0); diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h index db88d0b0271..3efb8ab700e 100644 --- a/bench/wtperf/wtperf.h +++ b/bench/wtperf/wtperf.h @@ -245,7 +245,6 @@ struct __wtperf_thread { /* Per-thread structure */ TRACK ckpt; /* Checkpoint operations */ TRACK insert; /* Insert operations */ TRACK read; /* Read operations */ - TRACK scan; /* Scan operations */ TRACK update; /* Update operations */ TRACK truncate; /* Truncate operations */ TRACK truncate_sleep; /* Truncate sleep operations */ diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 2b7b46e19fa..db39a5acdee 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -198,8 +198,7 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref) } __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock); } - WT_ASSERT(session, - !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU)); + WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU)); __wt_spin_unlock(session, &cache->evict_queue_lock); } @@ -1781,7 +1780,7 @@ fast: /* If the page can't be evicted, give up. 
*/ ++pages_queued; if (WT_PAGE_IS_INTERNAL(page)) - ++internal_pages; + ++internal_pages; __wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" WT_SIZET_FMT, diff --git a/src/utilities/util.h b/src/utilities/util.h index cf12d7d4aa6..93a96d44219 100644 --- a/src/utilities/util.h +++ b/src/utilities/util.h @@ -40,7 +40,6 @@ int util_flush(WT_SESSION *, const char *); int util_list(WT_SESSION *, int, char *[]); int util_load(WT_SESSION *, int, char *[]); int util_loadtext(WT_SESSION *, int, char *[]); -char *util_name(WT_SESSION *, const char *, const char *); int util_printlog(WT_SESSION *, int, char *[]); int util_read(WT_SESSION *, int, char *[]); int util_read_line(WT_SESSION *, ULINE *, bool, bool *); @@ -51,5 +50,6 @@ int util_stat(WT_SESSION *, int, char *[]); int util_str2recno(WT_SESSION *, const char *p, uint64_t *recnop); int util_truncate(WT_SESSION *, int, char *[]); int util_upgrade(WT_SESSION *, int, char *[]); +char *util_uri(WT_SESSION *, const char *, const char *); int util_verify(WT_SESSION *, int, char *[]); int util_write(WT_SESSION *, int, char *[]); diff --git a/src/utilities/util_alter.c b/src/utilities/util_alter.c index d228c15cd48..ef01a1ed826 100644 --- a/src/utilities/util_alter.c +++ b/src/utilities/util_alter.c @@ -34,9 +34,12 @@ util_alter(WT_SESSION *session, int argc, char *argv[]) for (configp = argv; configp != NULL && *configp != NULL; configp += 2) if ((ret = session->alter( - session, configp[0], configp[1])) != 0) - break; - return (ret); + session, configp[0], configp[1])) != 0) { + (void)util_err(session, ret, + "session.alter: %s, %s", configp[0], configp[1]); + return (1); + } + return (0); } static int diff --git a/src/utilities/util_compact.c b/src/utilities/util_compact.c index c114eb207fa..e469b4dce6e 100644 --- a/src/utilities/util_compact.c +++ b/src/utilities/util_compact.c @@ -30,21 +30,13 @@ util_compact(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. 
*/ if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->compact(session, uri, NULL)) != 0) { - fprintf(stderr, "%s: compact(%s): %s\n", - progname, uri, session->strerror(session, ret)); - goto err; - } - - if (0) { -err: ret = 1; - } + if ((ret = session->compact(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.compact: %s", uri); free(uri); - return (ret); } diff --git a/src/utilities/util_create.c b/src/utilities/util_create.c index 4e609736f2d..7c22a67792b 100644 --- a/src/utilities/util_create.c +++ b/src/utilities/util_create.c @@ -15,9 +15,9 @@ util_create(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - const char *config, *uri; + char *config, *uri; - config = NULL; + config = uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "c:")) != EOF) switch (ch) { case 'c': /* command-line configuration */ @@ -35,12 +35,14 @@ util_create(WT_SESSION *session, int argc, char *argv[]) if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); if ((ret = session->create(session, uri, config)) != 0) - return (util_err(session, ret, "%s: session.create", uri)); - return (0); + (void)util_err(session, ret, "session.create: %s", uri); + + free(uri); + return (ret); } static int diff --git a/src/utilities/util_drop.c b/src/utilities/util_drop.c index ba41445dfb6..456005d445d 100644 --- a/src/utilities/util_drop.c +++ b/src/utilities/util_drop.c @@ -15,8 +15,9 @@ util_drop(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,12 +31,13 @@ util_drop(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. 
*/ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - ret = session->drop(session, name, "force"); + if ((ret = session->drop(session, uri, "force")) != 0) + (void)util_err(session, ret, "session.drop: %s", uri); - free(name); + free(uri); return (ret); } diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index 3f8b4a49dfe..cded40a8b45 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -37,10 +37,10 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) size_t len; int ch, i; bool hex, json, reverse; - char *checkpoint, *config, *name, *p, *simplename; + char *checkpoint, *config, *p, *simpleuri, *uri; hex = json = reverse = false; - checkpoint = config = name = simplename = NULL; + checkpoint = config = simpleuri = uri = NULL; cursor = NULL; while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF) switch (ch) { @@ -89,11 +89,11 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) if (json && i > 0) if (dump_json_separator(session) != 0) goto err; - free(name); - free(simplename); - name = simplename = NULL; + free(uri); + free(simpleuri); + uri = simpleuri = NULL; - if ((name = util_name(session, argv[i], "table")) == NULL) + if ((uri = util_uri(session, argv[i], "table")) == NULL) goto err; len = @@ -113,19 +113,19 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) (void)strcat(config, json ? "dump=json" : (hex ? 
"dump=hex" : "dump=print")); if ((ret = session->open_cursor( - session, name, NULL, config, &cursor)) != 0) { + session, uri, NULL, config, &cursor)) != 0) { fprintf(stderr, "%s: cursor open(%s) failed: %s\n", - progname, name, session->strerror(session, ret)); + progname, uri, session->strerror(session, ret)); goto err; } - if ((simplename = strdup(name)) == NULL) { + if ((simpleuri = strdup(uri)) == NULL) { (void)util_err(session, errno, NULL); goto err; } - if ((p = strchr(simplename, '(')) != NULL) + if ((p = strchr(simpleuri, '(')) != NULL) *p = '\0'; - if (dump_config(session, simplename, cursor, hex, json) != 0) + if (dump_config(session, simpleuri, cursor, hex, json) != 0) goto err; if (dump_record(cursor, reverse, json) != 0) @@ -148,8 +148,8 @@ err: ret = 1; } free(config); - free(name); - free(simplename); + free(uri); + free(simpleuri); if (cursor != NULL && (ret = cursor->close(cursor)) != 0) { (void)util_err(session, ret, NULL); ret = 1; diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c index e91dbfce05b..f19ba4d1f97 100644 --- a/src/utilities/util_list.c +++ b/src/utilities/util_list.c @@ -19,10 +19,10 @@ util_list(WT_SESSION *session, int argc, char *argv[]) WT_DECL_RET; int ch; bool cflag, vflag; - char *name; + char *uri; cflag = vflag = false; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "cv")) != EOF) switch (ch) { case 'c': @@ -42,17 +42,16 @@ util_list(WT_SESSION *session, int argc, char *argv[]) case 0: break; case 1: - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); break; default: return (usage()); } - ret = list_print(session, name, cflag, vflag); - - free(name); + ret = list_print(session, uri, cflag, vflag); + free(uri); return (ret); } @@ -99,7 +98,7 @@ list_get_allocsize(WT_SESSION *session, const char *key, size_t *allocsize) * List the high-level objects in the database. 
*/ static int -list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) +list_print(WT_SESSION *session, const char *uri, bool cflag, bool vflag) { WT_CURSOR *cursor; WT_DECL_RET; @@ -120,7 +119,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) ret, "%s: WT_SESSION.open_cursor", WT_METADATA_URI)); } - found = name == NULL; + found = uri == NULL; while ((ret = cursor->next(cursor)) == 0) { /* Get the key. */ if ((ret = cursor->get_key(cursor, &key)) != 0) @@ -129,8 +128,8 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) /* * If a name is specified, only show objects that match. */ - if (name != NULL) { - if (!WT_PREFIX_MATCH(key, name)) + if (uri != NULL) { + if (!WT_PREFIX_MATCH(key, uri)) continue; found = true; } @@ -161,7 +160,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) if (ret != WT_NOTFOUND) return (util_cerr(cursor, "next", ret)); if (!found) { - fprintf(stderr, "%s: %s: not found\n", progname, name); + fprintf(stderr, "%s: %s: not found\n", progname, uri); return (1); } diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c index ac18df80851..ca77643eb49 100644 --- a/src/utilities/util_load.c +++ b/src/utilities/util_load.c @@ -126,7 +126,7 @@ load_dump(WT_SESSION *session) append ? ",append" : "", no_overwrite ? ",overwrite=false" : ""); if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { - ret = util_err(session, ret, "%s: session.open", uri); + ret = util_err(session, ret, "%s: session.open_cursor", uri); goto err; } diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c index 020a4ed9ba9..1189d49a483 100644 --- a/src/utilities/util_load_json.c +++ b/src/utilities/util_load_json.c @@ -242,7 +242,7 @@ json_data(WT_SESSION *session, LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? 
",overwrite=false" : ""); if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { - ret = util_err(session, ret, "%s: session.open", uri); + ret = util_err(session, ret, "%s: session.open_cursor", uri); goto err; } keyformat = cursor->key_format; diff --git a/src/utilities/util_loadtext.c b/src/utilities/util_loadtext.c index f9c5b6e9a1f..7602d43f8c9 100644 --- a/src/utilities/util_loadtext.c +++ b/src/utilities/util_loadtext.c @@ -15,9 +15,11 @@ static int usage(void); int util_loadtext(WT_SESSION *session, int argc, char *argv[]) { + WT_DECL_RET; int ch; - const char *uri; + char *uri; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "f:")) != EOF) switch (ch) { case 'f': /* input file */ @@ -35,10 +37,13 @@ util_loadtext(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. */ if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - return (text(session, uri)); + ret = text(session, uri); + + free(uri); + return (ret); } /* @@ -61,7 +66,7 @@ text(WT_SESSION *session, const char *uri) */ if ((ret = session->open_cursor( session, uri, NULL, "append,overwrite", &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + return (util_err(session, ret, "%s: session.open_cursor", uri)); /* * We're about to load strings, make sure the formats match. diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index 001a66d6d9e..7157f0d90fe 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -285,11 +285,11 @@ usage(void) } /* - * util_name -- + * util_uri -- * Build a name. 
*/ char * -util_name(WT_SESSION *session, const char *s, const char *type) +util_uri(WT_SESSION *session, const char *s, const char *type) { size_t len; char *name; diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c index e7fa2134934..5f3ed43905b 100644 --- a/src/utilities/util_printlog.c +++ b/src/utilities/util_printlog.c @@ -14,8 +14,8 @@ int util_printlog(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; - int ch; uint32_t flags; + int ch; flags = 0; while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF) @@ -41,17 +41,9 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) if (argc != 0) return (usage()); - ret = __wt_txn_printlog(session, flags); - - if (ret != 0) { - fprintf(stderr, "%s: printlog failed: %s\n", - progname, session->strerror(session, ret)); - goto err; - } + if ((ret = __wt_txn_printlog(session, flags)) != 0) + (void)util_err(session, ret, "printlog"); - if (0) { -err: ret = 1; - } return (ret); } diff --git a/src/utilities/util_read.c b/src/utilities/util_read.c index 2e766377aa9..393949b6a1c 100644 --- a/src/utilities/util_read.c +++ b/src/utilities/util_read.c @@ -18,8 +18,9 @@ util_read(WT_SESSION *session, int argc, char *argv[]) uint64_t recno; int ch; bool rkey, rval; - const char *uri, *value; + char *uri, *value; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -32,13 +33,19 @@ util_read(WT_SESSION *session, int argc, char *argv[]) /* The remaining arguments are a uri followed by a list of keys. */ if (argc < 2) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - /* Open the object. */ - if ((ret = session->open_cursor( - session, uri, NULL, NULL, &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + /* + * Open the object; free allocated memory immediately to simplify + * future error handling. 
+ */ + if ((ret = + session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0) + (void)util_err(session, ret, "%s: session.open_cursor", uri); + free(uri); + if (ret != 0) + return (ret); /* * A simple search only makes sense if the key format is a string or a diff --git a/src/utilities/util_rebalance.c b/src/utilities/util_rebalance.c index 45f161487e5..c188ea17d22 100644 --- a/src/utilities/util_rebalance.c +++ b/src/utilities/util_rebalance.c @@ -15,9 +15,9 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,25 +30,21 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->rebalance(session, name, NULL)) != 0) { - fprintf(stderr, "%s: rebalance(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->rebalance(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.rebalance: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. */ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c index aee299c6e63..bb2d40cd103 100644 --- a/src/utilities/util_rename.c +++ b/src/utilities/util_rename.c @@ -30,22 +30,15 @@ util_rename(WT_SESSION *session, int argc, char *argv[]) /* The remaining arguments are the object uri and new name. 
*/ if (argc != 2) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); newuri = argv[1]; - if ((ret = session->rename(session, uri, newuri, NULL)) != 0) { - fprintf(stderr, "%s: rename %s to %s: %s\n", - progname, uri, newuri, session->strerror(session, ret)); - goto err; - } - - if (0) { -err: ret = 1; - } + if ((ret = session->rename(session, uri, newuri, NULL)) != 0) + (void)util_err( + session, ret, "session.rename: %s, %s", uri, newuri); free(uri); - return (ret); } diff --git a/src/utilities/util_salvage.c b/src/utilities/util_salvage.c index 679d1074457..6cc2278b846 100644 --- a/src/utilities/util_salvage.c +++ b/src/utilities/util_salvage.c @@ -16,10 +16,10 @@ util_salvage(WT_SESSION *session, int argc, char *argv[]) WT_DECL_RET; int ch; const char *force; - char *name; + char *uri; force = NULL; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "F")) != EOF) switch (ch) { case 'F': @@ -35,25 +35,21 @@ util_salvage(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the file name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "file")) == NULL) + if ((uri = util_uri(session, *argv, "file")) == NULL) return (1); - if ((ret = session->salvage(session, name, force)) != 0) { - fprintf(stderr, "%s: salvage(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->salvage(session, uri, force)) != 0) + (void)util_err(session, ret, "session.salvage: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. 
*/ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/utilities/util_stat.c b/src/utilities/util_stat.c index 4376f559ceb..1b75d9ea8bf 100644 --- a/src/utilities/util_stat.c +++ b/src/utilities/util_stat.c @@ -55,7 +55,7 @@ util_stat(WT_SESSION *session, int argc, char *argv[]) objname = (char *)""; break; case 1: - if ((objname = util_name(session, *argv, "table")) == NULL) + if ((objname = util_uri(session, *argv, "table")) == NULL) return (1); objname_free = true; break; @@ -82,8 +82,8 @@ util_stat(WT_SESSION *session, int argc, char *argv[]) (ret = cursor->next(cursor)) == 0 && (ret = cursor->get_value(cursor, &desc, &pval, NULL)) == 0) if (printf("%s=%s\n", desc, pval) < 0) { - ret = errno; - break; + (void)util_err(session, errno, "printf"); + goto err; } if (ret == WT_NOTFOUND) ret = 0; diff --git a/src/utilities/util_truncate.c b/src/utilities/util_truncate.c index 9325c0d7e84..35de02345c8 100644 --- a/src/utilities/util_truncate.c +++ b/src/utilities/util_truncate.c @@ -15,8 +15,9 @@ util_truncate(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,13 +31,13 @@ util_truncate(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. 
*/ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->truncate(session, name, NULL, NULL, NULL)) != 0) - return (util_err(session, ret, "%s: session.truncate", name)); + if ((ret = session->truncate(session, uri, NULL, NULL, NULL)) != 0) + (void)util_err(session, ret, "session.truncate: %s", uri); - free(name); + free(uri); return (ret); } diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c index 63b23f28c16..f89bd46e133 100644 --- a/src/utilities/util_upgrade.c +++ b/src/utilities/util_upgrade.c @@ -15,9 +15,9 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,25 +30,21 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->upgrade(session, name, NULL)) != 0) { - fprintf(stderr, "%s: upgrade(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->upgrade(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.upgrade: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. 
*/ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c index 82bdd780cd3..d0587fcfc8c 100644 --- a/src/utilities/util_verify.c +++ b/src/utilities/util_verify.c @@ -17,10 +17,10 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) size_t size; int ch; bool dump_address, dump_blocks, dump_layout, dump_pages; - char *config, *dump_offsets, *name; + char *config, *dump_offsets, *uri; dump_address = dump_blocks = dump_layout = dump_pages = false; - config = dump_offsets = name = NULL; + config = dump_offsets = uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "d:")) != EOF) switch (ch) { case 'd': @@ -55,7 +55,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); /* Build the configuration string as necessary. */ @@ -69,7 +69,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) strlen("dump_offsets[],") + (dump_offsets == NULL ? 0 : strlen(dump_offsets)) + 20; if ((config = malloc(size)) == NULL) { - (void)util_err(session, errno, NULL); + ret = util_err(session, errno, NULL); goto err; } snprintf(config, size, @@ -82,23 +82,19 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) dump_offsets != NULL ? "]," : "", dump_pages ? "dump_pages," : ""); } - if ((ret = session->verify(session, name, config)) != 0) { - fprintf(stderr, "%s: verify(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->verify(session, uri, config)) != 0) + (void)util_err(session, ret, "session.verify: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. 
+ */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. */ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(config); - free(name); - +err: free(config); + free(uri); return (ret); } diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c index 7d9bce02b36..b931fad064d 100644 --- a/src/utilities/util_write.c +++ b/src/utilities/util_write.c @@ -18,10 +18,10 @@ util_write(WT_SESSION *session, int argc, char *argv[]) uint64_t recno; int ch; bool append, overwrite, rkey; - const char *uri; - char config[100]; + char *uri, config[100]; append = overwrite = false; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "ao")) != EOF) switch (ch) { case 'a': @@ -47,15 +47,21 @@ util_write(WT_SESSION *session, int argc, char *argv[]) } else if (argc < 3 || ((argc - 1) % 2 != 0)) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - /* Open the object. */ + /* + * Open the object; free allocated memory immediately to simplify + * future error handling. + */ (void)snprintf(config, sizeof(config), "%s,%s", append ? "append=true" : "", overwrite ? "overwrite=true" : ""); - if ((ret = session->open_cursor( - session, uri, NULL, config, &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + if ((ret = + session->open_cursor(session, uri, NULL, config, &cursor)) != 0) + (void)util_err(session, ret, "%s: session.open_cursor", uri); + free(uri); + if (ret != 0) + return (ret); /* * A simple search only makes sense if the key format is a string or a -- cgit v1.2.1 From 2185e4206c238389665fa024c3f891160942c04d Mon Sep 17 00:00:00 2001 From: sueloverso Date: Mon, 6 Feb 2017 11:29:25 -0500 Subject: WT-3157 More aggressive error handling. (#3275) * More aggressive error handling. * Alternative checkpoint cleanup. 
(#3281) --- src/txn/txn_ckpt.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 90804db3240..59dcc23acc5 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -181,7 +181,7 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[])) { WT_DECL_RET; - u_int i, j; + u_int i; /* If we have already locked the handles, apply the operation. */ for (i = 0; i < session->ckpt_handle_next; ++i) { @@ -189,22 +189,10 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], continue; WT_WITH_DHANDLE(session, session->ckpt_handle[i], ret = (*op)(session, cfg)); - WT_ERR(ret); + WT_RET(ret); } -err: - /* - * If we have an error somewhere in processing the handles, then - * we need to mark earlier trees dirty. - */ - if (ret != 0) - for (j = 0; j < i; ++j) { - if (session->ckpt_handle[j] == NULL) - continue; - WT_WITH_DHANDLE(session, session->ckpt_handle[j], - S2BT(session)->modified = true); - } - return (ret); + return (0); } /* @@ -555,7 +543,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) void *saved_meta_next; u_int i; uint64_t fsync_duration_usecs; - bool full, idle, logging, tracking; + bool failed, full, idle, logging, tracking; const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; @@ -836,12 +824,13 @@ err: /* * overwritten the checkpoint, so what ends up on disk is not * consistent. 
*/ - if (ret != 0) + failed = ret != 0; + if (failed) conn->modified = true; session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; if (tracking) - WT_TRET(__wt_meta_track_off(session, false, ret != 0)); + WT_TRET(__wt_meta_track_off(session, false, failed)); cache->eviction_scrub_limit = 0.0; WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); @@ -874,6 +863,13 @@ err: /* for (i = 0; i < session->ckpt_handle_next; ++i) { if (session->ckpt_handle[i] == NULL) continue; + /* + * If the operation failed, mark all trees dirty so they are + * included if a future checkpoint can succeed. + */ + if (failed) + WT_WITH_DHANDLE(session, session->ckpt_handle[i], + S2BT(session)->modified = true); WT_WITH_DHANDLE(session, session->ckpt_handle[i], WT_TRET(__wt_session_release_btree(session))); } -- cgit v1.2.1 From 2a59c1fd79ff98b89046404ccb756114d74fa5f4 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Wed, 8 Feb 2017 01:53:18 -0500 Subject: WT-3161 Panic on a write error in logging. (#3278) It is not possible to continue without risking data loss. 
--- src/log/log.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/log/log.c b/src/log/log.c index 1482cc0aca1..b07ef8c1bd5 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -62,6 +62,8 @@ static int __log_fs_write(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf) { + WT_DECL_RET; + /* * If we're writing into a new log file, we have to wait for all * writes to the previous log file to complete otherwise there could @@ -71,7 +73,10 @@ __log_fs_write(WT_SESSION_IMPL *session, __log_wait_for_earlier_slot(session, slot); WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn)); } - return (__wt_write(session, slot->slot_fh, offset, len, buf)); + if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0) + WT_PANIC_MSG(session, ret, + "%s: fatal log failure", slot->slot_fh->name); + return (ret); } /* -- cgit v1.2.1 From 15b7658a380e374e627b86e7629c8fad3ef349dc Mon Sep 17 00:00:00 2001 From: sueloverso Date: Wed, 8 Feb 2017 23:25:22 -0500 Subject: WT-3164 Ensure all relevant btree fields are reset on checkpoint error. (#3283) --- src/txn/txn_ckpt.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 59dcc23acc5..5932e058552 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -524,6 +524,17 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, #endif } +/* + * __checkpoint_fail_reset -- + * Reset fields when a failure occurs. + */ +static void +__checkpoint_fail_reset(WT_SESSION_IMPL *session) +{ + S2BT(session)->modified = true; + S2BT(session)->ckpt = NULL; +} + /* * __txn_checkpoint -- * Checkpoint a database or a list of objects in the database. 
@@ -869,7 +880,7 @@ err: /* */ if (failed) WT_WITH_DHANDLE(session, session->ckpt_handle[i], - S2BT(session)->modified = true); + __checkpoint_fail_reset(session)); WT_WITH_DHANDLE(session, session->ckpt_handle[i], WT_TRET(__wt_session_release_btree(session))); } -- cgit v1.2.1 From 0b9e4534b2e01a7bf3dec00c91d6f38dfbcc0dd0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 9 Feb 2017 09:15:15 -0500 Subject: WT-3088 bug: WiredTiger can evict the tree's current eviction walk point (#3280) WT-3088 bug: WiredTiger can evict the tree's current eviction walk point --- src/btree/bt_debug.c | 2 -- src/btree/bt_split.c | 74 +++++++++++++++++++++++++++------------------------- src/include/btmem.h | 8 +++--- src/include/btree.i | 4 +-- src/include/extern.h | 1 + 5 files changed, 47 insertions(+), 42 deletions(-) diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index a89eca230fd..d664da2ebd3 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -699,8 +699,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) WT_RET(ds->f(ds, ", evict-lru")); if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) WT_RET(ds->f(ds, ", overflow-keys")); - if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) - WT_RET(ds->f(ds, ", split-block")); if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) WT_RET(ds->f(ds, ", split-insert")); if (F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE)) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 7cfcd08f931..8122d242666 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -53,6 +53,16 @@ __split_oldest_gen(WT_SESSION_IMPL *session) return (oldest); } +/* + * __wt_split_obsolete -- + * Check if it is safe to free / evict based on split generation. + */ +bool +__wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) +{ + return (split_gen < __split_oldest_gen(session)); +} + /* * __split_stash_add -- * Add a new entry into the session's split stash list. 
@@ -394,8 +404,8 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, * Prepare a set of WT_REFs for a move. */ static void -__split_ref_step1( - WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) +__split_ref_step1(WT_SESSION_IMPL *session, + WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first) { WT_PAGE *child; WT_REF *child_ref, *ref; @@ -418,30 +428,25 @@ __split_ref_step1( child = ref->page; /* - * Block eviction and splits in newly created pages. + * Block eviction in newly created pages. * * Once the split is live, newly created internal pages might be * evicted and their WT_REF structures freed. If that happened * before all threads exit the index of the page that previously * "owned" the WT_REF, a thread might see a freed WT_REF. To - * ensure that doesn't happen, the newly created page's modify - * structure has a field with a transaction ID that's checked - * before any internal page is evicted. Unfortunately, we don't - * know the correct value until we update the original page's - * index (we need a transaction ID from after that update), but - * the act of updating the original page's index is what allows - * the eviction to happen. + * ensure that doesn't happen, the newly created page contains + * the current split generation and can't be evicted until + * all readers have left the old generation. * - * Split blocking was because historic versions of the split - * code didn't update the WT_REF.home field until after the - * split was live, so the WT_REF.home fields being updated could - * split again before the update, there's a race between splits - * as to which would update them first. The current code updates - * the WT_REF.home fields before going live (in this function), - * this shouldn't be an issue, but for now splits remain turned - * off. 
+ * Historic, we also blocked splits in newly created pages + * because we didn't update the WT_REF.home field until after + * the split was live, so the WT_REF.home fields being updated + * could split again before the update, there's a race between + * splits as to which would update them first. The current code + * updates the WT_REF.home fields before going live (in this + * function), this isn't an issue. */ - F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + child->pg_intl_split_gen = split_gen; /* * We use a page flag to prevent the child from splitting from @@ -473,7 +478,6 @@ __split_ref_step2( WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { WT_DECL_RET; - WT_PAGE *child; WT_REF *ref; uint32_t i; @@ -503,14 +507,9 @@ __split_ref_step2( continue; WT_ERR(ret); - child = ref->page; - - /* The child can now be evicted or split. */ - F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); - #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, child)); + __split_verify_intl_key_order(session, ref->page)); #endif WT_ERR(__wt_hazard_clear(session, ref)); @@ -653,8 +652,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; + /* Get a generation for this split, mark the root page. */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + root->pg_intl_split_gen = split_gen; + /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, false); + __split_ref_step1(session, alloc_index, split_gen, false); /* * Confirm the root page's index hasn't moved, then update it, which @@ -686,7 +689,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * fails, we don't roll back that change, because threads may already * be using the new index. 
*/ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); root_decr += size; @@ -838,6 +840,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; + /* Get a generation for this split, mark the parent page. */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + parent->pg_intl_split_gen = split_gen; + /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -908,7 +914,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * * Acquire a new split generation. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) { next_ref = pindex->index[deleted_refs[i]]; WT_ASSERT(session, next_ref->state == WT_REF_SPLIT); @@ -1160,8 +1165,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; + /* Get a generation for this split, mark the page. */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + page->pg_intl_split_gen = split_gen; + /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, true); + __split_ref_step1(session, alloc_index, split_gen, true); /* Split into the parent. */ WT_ERR(__split_parent(session, page_ref, alloc_index->index, @@ -1207,7 +1216,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) * back that change, because threads may already be using the new parent * page. 
*/ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); page_decr += size; @@ -1284,10 +1292,6 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, for (;;) { parent = ref->home; - /* Skip pages that aren't ready to split. */ - if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK)) - return (EBUSY); - if (trylock) WT_RET(__wt_try_writelock(session, &parent->page_lock)); else diff --git a/src/include/btmem.h b/src/include/btmem.h index 43c1a309d52..39ca223aebf 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -483,6 +483,7 @@ struct __wt_page { */ struct { WT_REF *parent_ref; /* Parent reference */ + uint64_t split_gen; /* Generation of last split */ struct __wt_page_index { uint32_t entries; @@ -492,6 +493,8 @@ struct __wt_page { } intl; #undef pg_intl_parent_ref #define pg_intl_parent_ref u.intl.parent_ref +#undef pg_intl_split_gen +#define pg_intl_split_gen u.intl.split_gen /* * Macros to copy/set the index because the name is obscured to ensure @@ -593,9 +596,8 @@ struct __wt_page { #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ #define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */ -#define WT_PAGE_SPLIT_BLOCK 0x20 /* Split blocking eviction and splits */ -#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ -#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */ +#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ +#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ uint8_t unused[2]; /* Unused padding */ diff --git a/src/include/btree.i b/src/include/btree.i index 378d93dd2ee..315efa86fa6 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1348,8 
+1348,8 @@ __wt_page_can_evict( * discards its WT_REF array, and a thread traversing the original * parent page index might see a freed WT_REF. */ - if (WT_PAGE_IS_INTERNAL(page) && - F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) + if (WT_PAGE_IS_INTERNAL(page) && !__wt_split_obsolete( + session, page->pg_intl_split_gen)) return (false); /* diff --git a/src/include/extern.h b/src/include/extern.h index 863d2a02861..836a7cb1ae6 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -160,6 +160,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_split_stash_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -- cgit v1.2.1 From 722b9d1b3da5dbfc4703f41855ae219df3fc6f57 Mon Sep 17 00:00:00 2001 From: Mark Benvenuto Date: Sat, 11 Feb 2017 08:14:52 -0500 Subject: WT-3173 Add runtime detection for s390x CRC32 hardware support (#3290) --- 
src/checksum/power8/crc32_wrapper.c | 4 ++-- src/checksum/zseries/crc32-s390x.c | 26 ++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/checksum/power8/crc32_wrapper.c b/src/checksum/power8/crc32_wrapper.c index ddfa2bdaeb8..a9be9ced1c6 100644 --- a/src/checksum/power8/crc32_wrapper.c +++ b/src/checksum/power8/crc32_wrapper.c @@ -1,4 +1,6 @@ #if defined(__powerpc64__) +#include "wt_internal.h" + #define CRC_TABLE #include "crc32_constants.h" @@ -68,8 +70,6 @@ out: } #endif -#include "wt_internal.h" - /* * __wt_checksum_hw -- * WiredTiger: return a checksum for a chunk of memory. diff --git a/src/checksum/zseries/crc32-s390x.c b/src/checksum/zseries/crc32-s390x.c index f77d6768d42..28b46594220 100644 --- a/src/checksum/zseries/crc32-s390x.c +++ b/src/checksum/zseries/crc32-s390x.c @@ -6,8 +6,20 @@ * Author(s): Hendrik Brueckner * */ +#include "wt_internal.h" + #include #include + +#if defined(HAVE_CRC32_HARDWARE) + +#include + +/* RHEL 7 has kernel support, but does not define this constant in the lib c headers. */ +#ifndef HWCAP_S390_VX +#define HWCAP_S390_VX 2048 +#endif + #include "crc32-s390x.h" #include "slicing-consts.h" @@ -69,8 +81,6 @@ unsigned int __wt_crc32c_le(unsigned int crc, const unsigned char *buf, size_t l /* Main CRC-32 functions */ DEFINE_CRC32_VX(__wt_crc32c_le_vx, __wt_crc32c_le_vgfm_16, __wt_crc32c_le) -#include "wt_internal.h" - /* * __wt_checksum_hw -- * WiredTiger: return a checksum for a chunk of memory. @@ -81,6 +91,8 @@ __wt_checksum_hw(const void *chunk, size_t len) return (~__wt_crc32c_le_vx(0xffffffff, chunk, len)); } +#endif + /* * __wt_checksum_init -- * WiredTiger: detect CRC hardware and set the checksum function. 
@@ -89,8 +101,14 @@ void __wt_checksum_init(void) { #if defined(HAVE_CRC32_HARDWARE) - __wt_process.checksum = __wt_checksum_hw; -#else + unsigned long caps = getauxval(AT_HWCAP); + + if (caps & HWCAP_S390_VX) + __wt_process.checksum = __wt_checksum_hw; + else + __wt_process.checksum = __wt_checksum_sw; + +#else /* !HAVE_CRC32_HARDWARE */ __wt_process.checksum = __wt_checksum_sw; #endif } -- cgit v1.2.1 From 7f5d0f9981214c723f2ed90cf4533887ed406176 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 13 Feb 2017 10:49:24 +1100 Subject: WT-3170 Change when eviction walk point is saved, cleanup splits. (#3284) * Change how eviction walk point is saved during walk. * After 0b9e453, we no longer need to do any non-DIAGNOSTIC work after completing the split (previously, we had changes to make the newly created split pages evictable, but now they are initially given a generation number which will prevent their eviction until it's OK). Rename __split_ref_step2() to be __split_verify_intl(), and change it to verify all of the internal pages involved in the split. Previously, we only verified the pages we had to read and update anyway. Now we don't have to update any pages and we're only reading pages in DIAGNOSTIC mode, verify all of them. Don't release the hazard pointer explicitly, use the more standard __wt_page_release() call (it should make no difference, it's just a bit more consistent). Rename __split_ref_step1() to be __split_ref_prepare(), there's no longer a step #2. * We don't need to publish WT_BTREE.evict_ref, or use a barrier: in one we're guaranteed that only the writing thread will check the assertion in the discard code (that we're not discarding the eviction's reference), and in the other case we're doing hazard-pointer coupling, which implies there is a barrier in the code path before the page can possibly be discarded by any thread. * Review barriers use in splits. (#3288). 
In all cases, use the pattern "Update the page index, which includes a barrier to make the split live, switch to benign error mode, then verify the pages involved in the split are correct." --- src/btree/bt_split.c | 164 ++++++++++++++++++++-------------------------- src/btree/bt_walk.c | 4 +- src/evict/evict_lru.c | 49 +++++++++----- src/include/session.h | 2 - src/session/session_api.c | 5 -- 5 files changed, 103 insertions(+), 121 deletions(-) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 8122d242666..fcb14be7c76 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -197,7 +197,7 @@ __split_safe_free(WT_SESSION_IMPL *session, #ifdef HAVE_DIAGNOSTIC /* * __split_verify_intl_key_order -- - * Verify the key order on an internal page after a split, diagnostic only. + * Verify the key order on an internal page after a split. */ static void __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) @@ -249,6 +249,42 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) break; } } + +/* + * __split_verify_intl -- + * Verify a set of internal pages involved in a split. + */ +static int +__split_verify_intl(WT_SESSION_IMPL *session, + WT_PAGE *page1, WT_PAGE *page2, WT_PAGE *pindex_page, bool skip_first) +{ + WT_DECL_RET; + WT_REF *ref; + + /* The split is complete and live, verify all of the pages involved. */ + if (page1 != NULL) + __split_verify_intl_key_order(session, page1); + if (page2 != NULL) + __split_verify_intl_key_order(session, page2); + + /* Skip the first slot on non-root internal pages, it's not set. */ + WT_INTL_FOREACH_BEGIN(session, pindex_page, ref) { + if (skip_first) { + skip_first = false; + continue; + } + WT_ERR(__wt_page_in(session, ref, WT_READ_NO_EVICT)); + + __split_verify_intl_key_order(session, ref->page); + + WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT)); + } WT_INTL_FOREACH_END; + + return (0); + +err: /* Something really bad just happened. 
*/ + WT_PANIC_RET(session, ret, "fatal error during page split"); +} #endif /* @@ -400,11 +436,11 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, } /* - * __split_ref_step1 -- + * __split_ref_prepare -- * Prepare a set of WT_REFs for a move. */ static void -__split_ref_step1(WT_SESSION_IMPL *session, +__split_ref_prepare(WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first) { WT_PAGE *child; @@ -469,58 +505,6 @@ __split_ref_step1(WT_SESSION_IMPL *session, } } -/* - * __split_ref_step2 -- - * Allow the newly created children to be evicted or split. - */ -static int -__split_ref_step2( - WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) -{ - WT_DECL_RET; - WT_REF *ref; - uint32_t i; - - /* - * The split has gone live, enable eviction and splits on the newly - * created internal pages. - */ - WT_WRITE_BARRIER(); - - for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { - ref = pindex->index[i]; - - /* - * We don't hold hazard pointers on created pages, they cannot - * be evicted because the page-modify transaction value set as - * they were created prevents eviction. (See above, we reset - * that value as part of fixing up the page.) But, an eviction - * thread might be attempting to evict the page (the WT_REF may - * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF - * may be WT_REF_READING), or it may be in some other state. - * Acquire a hazard pointer for any in-memory pages so we know - * the state of the page. Ignore pages not in-memory (deleted, - * on-disk, being read), there's no in-memory structure to fix. - */ - if ((ret = __wt_page_in(session, - ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) - continue; - WT_ERR(ret); - -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, ref->page)); -#endif - - WT_ERR(__wt_hazard_clear(session, ref)); - } - - return (0); - -err: /* Something really bad just happened. 
*/ - WT_PANIC_RET(session, ret, "fatal error resolving a split"); -} - /* * __split_root -- * Split the root page in-memory, deepening the tree. @@ -657,7 +641,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) root->pg_intl_split_gen = split_gen; /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, split_gen, false); + __split_ref_prepare(session, alloc_index, split_gen, false); /* * Confirm the root page's index hasn't moved, then update it, which @@ -665,19 +649,16 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); WT_INTL_INDEX_SET(root, alloc_index); - -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, root)); -#endif - /* Finalize the WT_REFs we moved. */ - WT_ERR(__split_ref_step2(session, alloc_index, false)); + alloc_index = NULL; /* The split is complete and correct, ignore benign errors. */ complete = WT_ERR_IGNORE; - /* We've installed the allocated page-index, ensure error handling. */ - alloc_index = NULL; +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + ret = __split_verify_intl(session, root, NULL, root, false)); + WT_ERR(ret); +#endif /* * We can't free the previous root's index, there may be threads using @@ -852,11 +833,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_INTL_INDEX_SET(parent, alloc_index); alloc_index = NULL; -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, parent)); -#endif - /* * If discarding the page's original WT_REF field, reset it to split. * Threads cursoring through the tree were blocked because that WT_REF @@ -875,18 +851,27 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, __wt_free(session, ref->page_del); } + /* + * Set the discarded WT_REF state to split, ensuring we don't + * race with any discard of the WT_REF deleted fields. 
+ */ WT_PUBLISH(ref->state, WT_REF_SPLIT); - } - /* - * Push out the changes: not required for correctness, but don't let - * threads spin on incorrect page references longer than necessary. - */ - WT_FULL_BARRIER(); + /* + * Push out the change: not required for correctness, but stops + * threads spinning on incorrect page references. + */ + WT_FULL_BARRIER(); + } /* The split is complete and correct, ignore benign errors. */ complete = WT_ERR_IGNORE; +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, parent)); +#endif + /* * !!! * Swapping in the new page index released the page for eviction, we can @@ -1170,34 +1155,27 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) page->pg_intl_split_gen = split_gen; /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, split_gen, true); + __split_ref_prepare(session, alloc_index, split_gen, true); /* Split into the parent. */ WT_ERR(__split_parent(session, page_ref, alloc_index->index, alloc_index->entries, parent_incr, false, false)); - /* Confirm the page's index hasn't moved, then update it. */ + /* + * Confirm the page's index hasn't moved, then update it, which makes + * the split visible to threads descending the tree. + */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); WT_INTL_INDEX_SET(page, replace_index); -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, parent)); - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, page)); -#endif - - /* Finalize the WT_REFs we moved. */ - WT_ERR(__split_ref_step2(session, alloc_index, true)); - /* The split is complete and correct, ignore benign errors. */ complete = WT_ERR_IGNORE; - /* - * Push out the changes: not required for correctness, but no reason - * to wait. 
- */ - WT_FULL_BARRIER(); +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + ret = __split_verify_intl(session, parent, page, page, true)); + WT_ERR(ret); +#endif /* * We don't care about the page-index we allocated, all we needed was diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 049700952ee..ddaa2e5f70b 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -340,9 +340,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session, * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * - * We may be passed a pointer to btree->evict_page that we are clearing - * here. We check when discarding pages that we're not discarding that - * page, so this clear must be done before the page is released. + * Clear the returned value, it makes future error handling easier. */ couple = couple_orig = ref = *refp; *refp = NULL; diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index db39a5acdee..efe056aee02 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -756,7 +756,7 @@ __evict_pass(WT_SESSION_IMPL *session) * Clear a single walk point. */ static int -__evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat) +__evict_clear_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -773,14 +773,14 @@ __evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat) if ((ref = btree->evict_ref) == NULL) return (0); - if (count_stat) - WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); + WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); /* - * Clear evict_ref first, in case releasing it forces eviction (we - * assert we never try to evict the current eviction walk point). + * Clear evict_ref before releasing it in case that forces eviction (we + * assert that we never try to evict the current eviction walk point). 
*/ btree->evict_ref = NULL; + WT_WITH_DHANDLE(cache->walk_session, session->dhandle, (ret = __wt_page_release(cache->walk_session, ref, WT_READ_NO_EVICT))); @@ -803,7 +803,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session) TAILQ_FOREACH(dhandle, &conn->dhqh, q) if (WT_PREFIX_MATCH(dhandle->name, "file:")) WT_WITH_DHANDLE(session, dhandle, - WT_TRET(__evict_clear_walk(session, true))); + WT_TRET(__evict_clear_walk(session))); return (ret); } @@ -848,7 +848,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) /* Clear any existing LRU eviction walk for the file. */ WT_WITH_PASS_LOCK(session, - ret = __evict_clear_walk(session, true)); + ret = __evict_clear_walk(session)); (void)__wt_atomic_subv32(&cache->pass_intr, 1); WT_ERR(ret); @@ -1662,8 +1662,15 @@ __evict_walk_file(WT_SESSION_IMPL *session, FLD_SET(walk_flags, WT_READ_PREV); /* - * Get some more eviction candidate pages. - * + * Get some more eviction candidate pages, starting at the last saved + * point. Clear the saved point immediately, we assert when discarding + * pages we're not discarding an eviction point, so this clear must be + * complete before the page is released. + */ + ref = btree->evict_ref; + btree->evict_ref = NULL; + + /* * !!! Take care terminating this loop. * * Don't make an extra call to __wt_tree_walk after we hit the end of a @@ -1676,7 +1683,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, for (evict = start, pages_queued = pages_seen = refs_walked = 0; evict < end && (ret == 0 || ret == WT_NOTFOUND); ret = __wt_tree_walk_count( - session, &btree->evict_ref, &refs_walked, walk_flags)) { + session, &ref, &refs_walked, walk_flags)) { /* * Check whether we're finding a good ratio of candidates vs * pages seen. 
Some workloads create "deserts" in trees where @@ -1690,7 +1697,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, if (give_up) break; - if ((ref = btree->evict_ref) == NULL) { + if (ref == NULL) { if (++restarts == 2) break; WT_STAT_CONN_INCR( @@ -1812,6 +1819,8 @@ fast: /* If the page can't be evicted, give up. */ btree->evict_walk_period /= 2; /* + * Give up the walk occasionally. + * * If we happen to end up on the root page or a page requiring urgent * eviction, clear it. We have to track hazard pointers, and the root * page complicates that calculation. @@ -1823,16 +1832,20 @@ fast: /* If the page can't be evicted, give up. */ * If we land on a page requiring forced eviction, move on to the next * page: we want this page evicted as quickly as possible. */ - if ((ref = btree->evict_ref) != NULL) { - /* Give up the walk occasionally. */ + if (ref != NULL) { if (__wt_ref_is_root(ref) || evict == start || give_up || ref->page->read_gen == WT_READGEN_OLDEST || - ref->page->memory_footprint >= btree->splitmempage) - WT_RET(__evict_clear_walk(session, restarts == 0)); - else if (ref->page->read_gen == WT_READGEN_OLDEST) + ref->page->memory_footprint >= btree->splitmempage) { + if (restarts == 0) + WT_STAT_CONN_INCR( + session, cache_eviction_walks_abandoned); + WT_RET(__wt_page_release(cache->walk_session, + ref, WT_READ_NO_EVICT)); + ref = NULL; + } else if (ref->page->read_gen == WT_READGEN_OLDEST) WT_RET_NOTFOUND_OK(__wt_tree_walk_count( - session, &btree->evict_ref, - &refs_walked, walk_flags)); + session, &ref, &refs_walked, walk_flags)); + btree->evict_ref = ref; } WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked); diff --git a/src/include/session.h b/src/include/session.h index 7dd523aea26..085f871a34f 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -52,8 +52,6 @@ struct __wt_session_impl { const char *lastop; /* Last operation */ uint32_t id; /* UID, offset in session array */ - WT_CONDVAR *cond; /* Condition variable */ - 
WT_EVENT_HANDLER *event_handler;/* Application's event handlers */ WT_DATA_HANDLE *dhandle; /* Current data handle */ diff --git a/src/session/session_api.c b/src/session/session_api.c index 71626e098cb..3a5d06f1b61 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -234,9 +234,6 @@ __session_close(WT_SESSION *wt_session, const char *config) /* Release common session resources. */ WT_TRET(__wt_session_release_resources(session)); - /* Destroy the thread's mutex. */ - WT_TRET(__wt_cond_destroy(session, &session->cond)); - /* The API lock protects opening and closing of sessions. */ __wt_spin_lock(session, &conn->api_lock); @@ -1837,8 +1834,6 @@ __open_session(WT_CONNECTION_IMPL *conn, session_ret->name = NULL; session_ret->id = i; - WT_ERR(__wt_cond_alloc(session, "session", &session_ret->cond)); - if (WT_SESSION_FIRST_USE(session_ret)) __wt_random_init(&session_ret->rnd); -- cgit v1.2.1 From a8fe04026ef55b8f59df24ff75ae151c7c370e2a Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Sun, 12 Feb 2017 20:13:24 -0500 Subject: WT-3135 WT-3159 Fix search_near() with custom collators for index keys of variable length. (#3254) * For checkpoint logging, use a format that ends in 'u' to be compatible with previously created log files. In previous WT versions, these formats end in 'U', and a final 'U' does have a prefixed size. Now, a 'U' in any position has a prefixed size. 
--- dist/s_string.ok | 1 + dist/s_void | 4 + src/cursor/cur_index.c | 25 +- src/include/packing.i | 7 +- src/txn/txn_log.c | 4 +- test/csuite/Makefile.am | 3 + test/csuite/wt3135_search_near_collator/main.c | 360 +++++++++++++++++++++++++ 7 files changed, 398 insertions(+), 6 deletions(-) create mode 100644 test/csuite/wt3135_search_near_collator/main.c diff --git a/dist/s_string.ok b/dist/s_string.ok index bb0cacd9d5d..d2e9dffaa48 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -1182,6 +1182,7 @@ txt typedef uB uS +ui uint uintmax unbare diff --git a/dist/s_void b/dist/s_void index 4a6b4ad91a2..947153e730b 100755 --- a/dist/s_void +++ b/dist/s_void @@ -87,6 +87,10 @@ func_ok() -e '/int handle_progress$/d' \ -e '/int helium_cursor_reset$/d' \ -e '/int helium_session_verify$/d' \ + -e '/int index_compare_primary$/d' \ + -e '/int index_compare_S$/d' \ + -e '/int index_compare_u$/d' \ + -e '/int index_extractor_u$/d' \ -e '/int log_print_err$/d' \ -e '/int lz4_error$/d' \ -e '/int lz4_pre_size$/d' \ diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 4786b0524bc..13180efdea4 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -240,7 +240,16 @@ __curindex_search(WT_CURSOR *cursor) found_key = child->key; if (found_key.size < cursor->key.size) WT_ERR(WT_NOTFOUND); - found_key.size = cursor->key.size; + + /* + * Custom collators expect to see complete keys, pass an item containing + * all the visible fields so it unpacks correctly. + */ + if (cindex->index->collator != NULL) + WT_ERR(__wt_struct_repack(session, child->key_format, + cindex->iface.key_format, &child->key, &found_key)); + else + found_key.size = cursor->key.size; WT_ERR(__wt_compare( session, cindex->index->collator, &cursor->key, &found_key, &cmp)); @@ -307,8 +316,18 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact) * so we flip the sign of the result to match what callers expect. 
*/ found_key = child->key; - if (found_key.size > cursor->key.size) - found_key.size = cursor->key.size; + if (found_key.size > cursor->key.size) { + /* + * Custom collators expect to see complete keys, pass an item + * containing all the visible fields so it unpacks correctly. + */ + if (cindex->index->collator != NULL) + WT_ERR(__wt_struct_repack(session, + cindex->child->key_format, cindex->iface.key_format, + &child->key, &found_key)); + else + found_key.size = cursor->key.size; + } WT_ERR(__wt_compare( session, cindex->index->collator, &cursor->key, &found_key, exact)); diff --git a/src/include/packing.i b/src/include/packing.i index 17ca261bcfc..8ba3dd536ac 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -168,10 +168,15 @@ next: if (pack->cur == pack->end) (int)(pack->end - pack->orig), pack->orig); return (0); case 'u': - case 'U': /* Special case for items with a size prefix. */ pv->type = (!pv->havesize && *pack->cur != '\0') ? 'U' : 'u'; return (0); + case 'U': + /* + * Don't change the type. 'U' is used internally, so this type + * was already changed to explicitly include the size. 
+ */ + return (0); case 'b': case 'h': case 'i': diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c index 7ad295f421b..2931dc1ce82 100644 --- a/src/txn/txn_log.c +++ b/src/txn/txn_log.c @@ -269,7 +269,7 @@ __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session, WT_ITEM ckpt_snapshot_unused; uint32_t ckpt_file, ckpt_offset; u_int ckpt_nsnapshot_unused; - const char *fmt = WT_UNCHECKED_STRING(IIIU); + const char *fmt = WT_UNCHECKED_STRING(IIIu); if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, &ckpt_file, &ckpt_offset, @@ -297,7 +297,7 @@ __wt_txn_checkpoint_log( uint8_t *end, *p; size_t recsize; uint32_t i, rectype = WT_LOGREC_CHECKPOINT; - const char *fmt = WT_UNCHECKED_STRING(IIIIU); + const char *fmt = WT_UNCHECKED_STRING(IIIIu); txn = &session->txn; ckpt_lsn = &txn->ckpt_lsn; diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index bcdbf120d67..5167b42b433 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -43,6 +43,9 @@ noinst_PROGRAMS += test_wt2999_join_extractor test_wt3120_filesys_SOURCES = wt3120_filesys/main.c noinst_PROGRAMS += test_wt3120_filesys +test_wt3135_search_near_collator_SOURCES = wt3135_search_near_collator/main.c +noinst_PROGRAMS += test_wt3135_search_near_collator + # Run this during a "make check" smoke test. TESTS = $(noinst_PROGRAMS) LOG_COMPILER = $(TEST_WRAPPER) diff --git a/test/csuite/wt3135_search_near_collator/main.c b/test/csuite/wt3135_search_near_collator/main.c new file mode 100644 index 00000000000..3113d29dfa9 --- /dev/null +++ b/test/csuite/wt3135_search_near_collator/main.c @@ -0,0 +1,360 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. 
+ * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +/* + * JIRA ticket reference: WT-3135 + * Test case description: Each set of data is ordered and contains + * five elements (0-4). We insert elements 1 and 3, and then do + * search_near and search for each element. For each set of data, we perform + * these tests first using a custom collator, and second using a custom collator + * and extractor. In each case there are index keys having variable length. + * Failure mode: In the reported test case, the custom compare routine is + * given a truncated key to compare, and the unpack functions return errors + * because the truncation appeared in the middle of a key. 
+ */ + +#define TEST_ENTRY_COUNT 5 +typedef const char *TEST_SET[TEST_ENTRY_COUNT]; +static TEST_SET test_sets[] = { + { "0", "01", "012", "0123", "01234" }, + { "A", "B", "C", "D", "E" }, + { "5", "54", "543", "5432", "54321" }, + { "54321", "5433", "544", "55", "6" } +}; +#define TEST_SET_COUNT (sizeof(test_sets) / sizeof(test_sets[0])) + +static bool +item_str_equal(WT_ITEM *item, const char *str) +{ + return (item->size == strlen(str) + 1 && strncmp((char *)item->data, + str, item->size) == 0); +} + +static int +compare_int(int a, int b) +{ + return (a < b ? -1 : (a > b ? 1 : 0)); +} + +static int +index_compare_primary(WT_PACK_STREAM *s1, WT_PACK_STREAM *s2, int *cmp) +{ + int64_t pkey1, pkey2; + int rc1, rc2; + + rc1 = wiredtiger_unpack_int(s1, &pkey1); + rc2 = wiredtiger_unpack_int(s2, &pkey2); + + if (rc1 == 0 && rc2 == 0) + *cmp = compare_int(pkey1, pkey2); + else if (rc1 != 0 && rc2 != 0) + *cmp = 0; + else if (rc1 != 0) + *cmp = -1; + else + *cmp = 1; + return (0); +} + +static int +index_compare_S(WT_COLLATOR *collator, WT_SESSION *session, + const WT_ITEM *key1, const WT_ITEM *key2, int *cmp) +{ + WT_PACK_STREAM *s1, *s2; + const char *skey1, *skey2; + + (void)collator; + + testutil_check(wiredtiger_unpack_start(session, "Si", key1->data, + key1->size, &s1)); + testutil_check(wiredtiger_unpack_start(session, "Si", key2->data, + key2->size, &s2)); + + testutil_check(wiredtiger_unpack_str(s1, &skey1)); + testutil_check(wiredtiger_unpack_str(s2, &skey2)); + + if ((*cmp = strcmp(skey1, skey2)) == 0) + testutil_check(index_compare_primary(s1, s2, cmp)); + + testutil_check(wiredtiger_pack_close(s1, NULL)); + testutil_check(wiredtiger_pack_close(s2, NULL)); + + return (0); +} + +static int +index_compare_u(WT_COLLATOR *collator, WT_SESSION *session, + const WT_ITEM *key1, const WT_ITEM *key2, int *cmp) +{ + WT_ITEM skey1, skey2; + WT_PACK_STREAM *s1, *s2; + + (void)collator; + + testutil_check(wiredtiger_unpack_start(session, "ui", key1->data, + key1->size, 
&s1)); + testutil_check(wiredtiger_unpack_start(session, "ui", key2->data, + key2->size, &s2)); + + testutil_check(wiredtiger_unpack_item(s1, &skey1)); + testutil_check(wiredtiger_unpack_item(s2, &skey2)); + + if ((*cmp = strcmp(skey1.data, skey2.data)) == 0) + testutil_check(index_compare_primary(s1, s2, cmp)); + + testutil_check(wiredtiger_pack_close(s1, NULL)); + testutil_check(wiredtiger_pack_close(s2, NULL)); + + return (0); +} + +static int +index_extractor_u(WT_EXTRACTOR *extractor, WT_SESSION *session, + const WT_ITEM *key, const WT_ITEM *value, WT_CURSOR *result_cursor) +{ + (void)extractor; + (void)session; + (void)key; + + result_cursor->set_key(result_cursor, value); + return result_cursor->insert(result_cursor); +} + +static WT_COLLATOR collator_S = { index_compare_S, NULL, NULL }; +static WT_COLLATOR collator_u = { index_compare_u, NULL, NULL }; +static WT_EXTRACTOR extractor_u = { index_extractor_u, NULL, NULL }; + +/* + * Check search() and search_near() using the test string indicated + * by test_index. 
+ */ +static void +search_using_str(WT_CURSOR *cursor, TEST_SET test_set, int test_index) +{ + int exact, ret; + const char *result; + const char *str_01, *str_0123, *test_str; + + testutil_assert(test_index >= 0 && test_index <= 4); + str_01 = test_set[1]; + str_0123 = test_set[3]; + test_str = test_set[test_index]; + + cursor->set_key(cursor, test_str); + testutil_check(cursor->search_near(cursor, &exact)); + testutil_check(cursor->get_key(cursor, &result)); + + if (test_index == 0) + testutil_assert(strcmp(result, str_01) == 0 && exact > 0); + else if (test_index == 1) + testutil_assert(strcmp(result, str_01) == 0 && exact == 0); + else if (test_index == 2) + testutil_assert((strcmp(result, str_0123) == 0 && exact > 0) || + (strcmp(result, str_01) == 0 && exact < 0)); + else if (test_index == 3) + testutil_assert(strcmp(result, str_0123) == 0 && exact == 0); + else if (test_index == 4) + testutil_assert(strcmp(result, str_0123) == 0 && exact < 0); + + cursor->set_key(cursor, test_str); + ret = cursor->search(cursor); + + if (test_index == 0 || test_index == 2 || test_index == 4) + testutil_assert(ret == WT_NOTFOUND); + else if (test_index == 1 || test_index == 3) + testutil_assert(ret == 0); +} + +/* + * Check search() and search_near() using the test string indicated + * by test_index against a table containing a variable sized item. 
+ */ +static void +search_using_item(WT_CURSOR *cursor, TEST_SET test_set, int test_index) +{ + WT_ITEM item; + size_t testlen; + int exact, ret; + const char *str_01, *str_0123, *test_str; + + testutil_assert(test_index >= 0 && test_index <= 4); + str_01 = test_set[1]; + str_0123 = test_set[3]; + test_str = test_set[test_index]; + + testlen = strlen(test_str) + 1; + item.data = test_str; + item.size = testlen; + cursor->set_key(cursor, &item); + testutil_check(cursor->search_near(cursor, &exact)); + testutil_check(cursor->get_key(cursor, &item)); + + if (test_index == 0) + testutil_assert(item_str_equal(&item, str_01) && exact > 0); + else if (test_index == 1) + testutil_assert(item_str_equal(&item, str_01) && exact == 0); + else if (test_index == 2) + testutil_assert((item_str_equal(&item, str_0123) && exact > 0) + || (item_str_equal(&item, str_01) && exact < 0)); + else if (test_index == 3) + testutil_assert(item_str_equal(&item, str_0123) && exact == 0); + else if (test_index == 4) + testutil_assert(item_str_equal(&item, str_0123) && exact < 0); + + item.data = test_str; + item.size = testlen; + cursor->set_key(cursor, &item); + ret = cursor->search(cursor); + + if (test_index == 0 || test_index == 2 || test_index == 4) + testutil_assert(ret == WT_NOTFOUND); + else if (test_index == 1 || test_index == 3) + testutil_assert(ret == 0); +} + +/* + * For each set of data, perform tests. + */ +static void +test_one_set(WT_SESSION *session, TEST_SET set) +{ + WT_CURSOR *cursor; + WT_ITEM item; + int32_t i; + + /* + * Part 1: Using a custom collator, insert some elements + * and verify results from search_near. + */ + + testutil_check(session->create(session, + "table:main", "key_format=i,value_format=S,columns=(k,v)")); + testutil_check(session->create(session, + "index:main:def_collator", "columns=(v)")); + testutil_check(session->create(session, + "index:main:custom_collator", + "columns=(v),collator=collator_S")); + + /* Insert only elements #1 and #3. 
*/ + testutil_check(session->open_cursor(session, + "table:main", NULL, NULL, &cursor)); + cursor->set_key(cursor, 0); + cursor->set_value(cursor, set[1]); + testutil_check(cursor->insert(cursor)); + cursor->set_key(cursor, 1); + cursor->set_value(cursor, set[3]); + testutil_check(cursor->insert(cursor)); + testutil_check(cursor->close(cursor)); + + /* Check all elements in def_collator index. */ + testutil_check(session->open_cursor(session, + "index:main:def_collator", NULL, NULL, &cursor)); + for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++) + search_using_str(cursor, set, i); + testutil_check(cursor->close(cursor)); + + /* Check all elements in custom_collator index */ + testutil_check(session->open_cursor(session, + "index:main:custom_collator", NULL, NULL, &cursor)); + for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++) + search_using_str(cursor, set, i); + testutil_check(cursor->close(cursor)); + + /* + * Part 2: perform the same checks using a custom collator and + * extractor. + */ + testutil_check(session->create(session, + "table:main2", "key_format=i,value_format=u,columns=(k,v)")); + + testutil_check(session->create(session, "index:main2:idx_w_coll", + "key_format=u,collator=collator_u,extractor=extractor_u")); + + testutil_check(session->open_cursor(session, + "table:main2", NULL, NULL, &cursor)); + + memset(&item, 0, sizeof(item)); + item.size = strlen(set[1]) + 1; + item.data = set[1]; + cursor->set_key(cursor, 1); + cursor->set_value(cursor, &item); + testutil_check(cursor->insert(cursor)); + + item.size = strlen(set[3]) + 1; + item.data = set[3]; + cursor->set_key(cursor, 3); + cursor->set_value(cursor, &item); + testutil_check(cursor->insert(cursor)); + + testutil_check(cursor->close(cursor)); + + testutil_check(session->open_cursor(session, + "index:main2:idx_w_coll", NULL, NULL, &cursor)); + for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++) + search_using_item(cursor, set, i); + testutil_check(cursor->close(cursor)); + + 
testutil_check(session->drop(session, "table:main", NULL)); + testutil_check(session->drop(session, "table:main2", NULL)); +} + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_SESSION *session; + int32_t i; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + testutil_check(wiredtiger_open(opts->home, NULL, "create", + &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + /* Add any collators and extractors used by tests */ + testutil_check(opts->conn->add_collator(opts->conn, "collator_S", + &collator_S, NULL)); + testutil_check(opts->conn->add_collator(opts->conn, "collator_u", + &collator_u, NULL)); + testutil_check(opts->conn->add_extractor(opts->conn, "extractor_u", + &extractor_u, NULL)); + + for (i = 0; i < (int32_t)TEST_SET_COUNT; i++) { + printf("test set %d\n", i); + test_one_set(session, test_sets[i]); + } + + testutil_check(session->close(session, NULL)); + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} -- cgit v1.2.1 From 2258dac42020b486b78947d434fde72c236d1e48 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 13 Feb 2017 10:02:37 -0500 Subject: WT-3174 Coverity/lint cleanup (#3293) * WT-3174 Coverity/lint cleanup clang38 complaints: wt3135_search_near_collator/main.c:75:22: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] *cmp = compare_int(pkey1, pkey2); ~~~~~~~~~~~ ^~~~~ wt3135_search_near_collator/main.c:75:29: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] *cmp = compare_int(pkey1, pkey2); ~~~~~~~~~~~ ^~~~~ * Coverity complains in __split_root(): dead_error_condition: The switch value complete cannot be WT_ERR_PANIC. CID 1371132 (#1 of 1): Logically dead code (DEADCODE) dead_error_begin: Execution cannot reach this statement: case WT_ERR_PANIC:. 
Revert a minor part of 7f5d0f9, don't switch to benign error mode (setting WT_ERR_IGNORE) until after the split has been verified in DIAGNOSTIC mode. That makes sense and should make Coverity happy. * Fix type-casting, sizeof()/sizeof() is a size_t. --- src/btree/bt_split.c | 18 +++++++++--------- test/csuite/wt3135_search_near_collator/main.c | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index fcb14be7c76..3142e52be0d 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -651,15 +651,15 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_INTL_INDEX_SET(root, alloc_index); alloc_index = NULL; - /* The split is complete and correct, ignore benign errors. */ - complete = WT_ERR_IGNORE; - #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, ret = __split_verify_intl(session, root, NULL, root, false)); WT_ERR(ret); #endif + /* The split is complete and verified, ignore benign errors. */ + complete = WT_ERR_IGNORE; + /* * We can't free the previous root's index, there may be threads using * it. Add to the session's discard list, to be freed once we know no @@ -864,14 +864,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_FULL_BARRIER(); } - /* The split is complete and correct, ignore benign errors. */ - complete = WT_ERR_IGNORE; - #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent)); #endif + /* The split is complete and verified, ignore benign errors. */ + complete = WT_ERR_IGNORE; + /* * !!! * Swapping in the new page index released the page for eviction, we can @@ -1168,15 +1168,15 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); WT_INTL_INDEX_SET(page, replace_index); - /* The split is complete and correct, ignore benign errors. 
*/ - complete = WT_ERR_IGNORE; - #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, ret = __split_verify_intl(session, parent, page, page, true)); WT_ERR(ret); #endif + /* The split is complete and verified, ignore benign errors. */ + complete = WT_ERR_IGNORE; + /* * We don't care about the page-index we allocated, all we needed was * the array of WT_REF structures, which has now been split into the diff --git a/test/csuite/wt3135_search_near_collator/main.c b/test/csuite/wt3135_search_near_collator/main.c index 3113d29dfa9..8783034a7d8 100644 --- a/test/csuite/wt3135_search_near_collator/main.c +++ b/test/csuite/wt3135_search_near_collator/main.c @@ -57,7 +57,7 @@ item_str_equal(WT_ITEM *item, const char *str) } static int -compare_int(int a, int b) +compare_int(int64_t a, int64_t b) { return (a < b ? -1 : (a > b ? 1 : 0)); } @@ -329,7 +329,7 @@ main(int argc, char *argv[]) { TEST_OPTS *opts, _opts; WT_SESSION *session; - int32_t i; + size_t i; opts = &_opts; memset(opts, 0, sizeof(*opts)); @@ -349,8 +349,8 @@ main(int argc, char *argv[]) testutil_check(opts->conn->add_extractor(opts->conn, "extractor_u", &extractor_u, NULL)); - for (i = 0; i < (int32_t)TEST_SET_COUNT; i++) { - printf("test set %d\n", i); + for (i = 0; i < TEST_SET_COUNT; i++) { + printf("test set %" WT_SIZET_FMT "\n", i); test_one_set(session, test_sets[i]); } -- cgit v1.2.1 From dc33b134ea0e231fd87924c6a50e6f8230a7c6bf Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Tue, 14 Feb 2017 14:22:10 +1100 Subject: WT-3175 Don't verify children during splits up the tree. (#3294) Reverts part of 7f5d0f9981214c723f2ed90cf4533887ed406176. Fixes a deadlock in diagnostic mode. Also revert a change that could cause diagnostic code to read pages into cache: we don't want diagnostic adding cache pressure and we already verify pages as they are evicted. 
--- src/btree/bt_split.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 3142e52be0d..45550ff627f 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -251,29 +251,33 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) } /* - * __split_verify_intl -- - * Verify a set of internal pages involved in a split. + * __split_verify_root -- + * Verify a root page involved in a split. */ static int -__split_verify_intl(WT_SESSION_IMPL *session, - WT_PAGE *page1, WT_PAGE *page2, WT_PAGE *pindex_page, bool skip_first) +__split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_DECL_RET; WT_REF *ref; /* The split is complete and live, verify all of the pages involved. */ - if (page1 != NULL) - __split_verify_intl_key_order(session, page1); - if (page2 != NULL) - __split_verify_intl_key_order(session, page2); - - /* Skip the first slot on non-root internal pages, it's not set. */ - WT_INTL_FOREACH_BEGIN(session, pindex_page, ref) { - if (skip_first) { - skip_first = false; + __split_verify_intl_key_order(session, page); + + WT_INTL_FOREACH_BEGIN(session, page, ref) { + /* + * An eviction thread might be attempting to evict the page + * (the WT_REF may be WT_REF_LOCKED), or it may be a disk based + * page (the WT_REF may be WT_REF_READING), or it may be in + * some other state. Acquire a hazard pointer for any + * in-memory pages so we know the state of the page. + * + * Ignore pages not in-memory (deleted, on-disk, being read), + * there's no in-memory structure to check. 
+ */ + if ((ret = __wt_page_in(session, + ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) continue; - } - WT_ERR(__wt_page_in(session, ref, WT_READ_NO_EVICT)); + WT_ERR(ret); __split_verify_intl_key_order(session, ref->page); @@ -653,7 +657,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, - ret = __split_verify_intl(session, root, NULL, root, false)); + ret = __split_verify_root(session, root)); WT_ERR(ret); #endif @@ -1170,8 +1174,9 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, - ret = __split_verify_intl(session, parent, page, page, true)); - WT_ERR(ret); + __split_verify_intl_key_order(session, parent)); + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, page)); #endif /* The split is complete and verified, ignore benign errors. */ -- cgit v1.2.1 From 5b16ddd3815fb043061ac35151e277b919d7e463 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Tue, 14 Feb 2017 15:32:43 +1100 Subject: WT-3152 Switch the table lock to a rwlock. 
(#3291) --- dist/flags.py | 3 +- dist/s_define.list | 2 + src/conn/conn_handle.c | 4 +- src/cursor/cur_table.c | 2 +- src/include/connection.h | 2 +- src/include/flags.h | 23 +++++----- src/include/schema.h | 104 +++++++++++++++++++++++++++++++++------------- src/schema/schema_list.c | 2 +- src/session/session_api.c | 15 +++---- src/txn/txn_ckpt.c | 2 +- 10 files changed, 106 insertions(+), 53 deletions(-) diff --git a/dist/flags.py b/dist/flags.py index 216f7c29e0a..b20a7181532 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -123,7 +123,8 @@ flags = { 'SESSION_LOCKED_PASS', 'SESSION_LOCKED_SCHEMA', 'SESSION_LOCKED_SLOT', - 'SESSION_LOCKED_TABLE', + 'SESSION_LOCKED_TABLE_READ', + 'SESSION_LOCKED_TABLE_WRITE', 'SESSION_LOCKED_TURTLE', 'SESSION_LOGGING_INMEM', 'SESSION_LOOKASIDE_CURSOR', diff --git a/dist/s_define.list b/dist/s_define.list index 53a3df87615..8911d888077 100644 --- a/dist/s_define.list +++ b/dist/s_define.list @@ -39,6 +39,8 @@ WT_PADDING_CHECK WT_READ_BARRIER WT_REF_SIZE WT_SESSION_LOCKED_CHECKPOINT +WT_SESSION_LOCKED_TABLE_READ +WT_SESSION_LOCKED_TABLE_WRITE WT_SESSION_LOCKED_TURTLE WT_SIZE_CHECK WT_STATS_FIELD_TO_OFFSET diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 4f8d89fa9d2..287e9ca7b99 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -59,12 +59,12 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema); - WT_SPIN_INIT_TRACKED(session, &conn->table_lock, table); WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file")); /* Read-write locks */ __wt_rwlock_init(session, &conn->dhandle_lock); __wt_rwlock_init(session, &conn->hot_backup_lock); + __wt_rwlock_init(session, &conn->table_lock); WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); for (i = 0; i < WT_PAGE_LOCKS; ++i) @@ -142,7 +142,7 @@ 
__wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->metadata_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); - __wt_spin_destroy(session, &conn->table_lock); + __wt_rwlock_destroy(session, &conn->table_lock); __wt_spin_destroy(session, &conn->turtle_lock); for (i = 0; i < WT_PAGE_LOCKS; ++i) __wt_spin_destroy(session, &conn->page_lock[i]); diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 76f7fc5865f..7e8cd153d2d 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -769,7 +769,7 @@ __curtable_complete(WT_SESSION_IMPL *session, WT_TABLE *table) return (0); /* If the table is incomplete, wait on the table lock and recheck. */ - WT_WITH_TABLE_LOCK(session, complete = table->cg_complete); + WT_WITH_TABLE_READ_LOCK(session, complete = table->cg_complete); if (!complete) WT_RET_MSG(session, EINVAL, "'%s' not available until all column groups are created", diff --git a/src/include/connection.h b/src/include/connection.h index 3a719e59608..ce483d3291a 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -171,7 +171,7 @@ struct __wt_connection_impl { WT_SPINLOCK metadata_lock; /* Metadata update spinlock */ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ - WT_SPINLOCK table_lock; /* Table creation spinlock */ + WT_RWLOCK table_lock; /* Table list lock */ WT_SPINLOCK turtle_lock; /* Turtle file spinlock */ WT_RWLOCK dhandle_lock; /* Data handle list lock */ diff --git a/src/include/flags.h b/src/include/flags.h index 5219bf33ed6..c1fff920e3b 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -59,17 +59,18 @@ #define WT_SESSION_LOCKED_PASS 0x00000040 #define WT_SESSION_LOCKED_SCHEMA 0x00000080 #define WT_SESSION_LOCKED_SLOT 0x00000100 -#define WT_SESSION_LOCKED_TABLE 0x00000200 -#define WT_SESSION_LOCKED_TURTLE 0x00000400 -#define WT_SESSION_LOGGING_INMEM 
0x00000800 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00001000 -#define WT_SESSION_NO_CACHE 0x00002000 -#define WT_SESSION_NO_DATA_HANDLES 0x00004000 -#define WT_SESSION_NO_EVICTION 0x00008000 -#define WT_SESSION_NO_LOGGING 0x00010000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00020000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00040000 -#define WT_SESSION_SERVER_ASYNC 0x00080000 +#define WT_SESSION_LOCKED_TABLE_READ 0x00000200 +#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000400 +#define WT_SESSION_LOCKED_TURTLE 0x00000800 +#define WT_SESSION_LOGGING_INMEM 0x00001000 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00002000 +#define WT_SESSION_NO_CACHE 0x00004000 +#define WT_SESSION_NO_DATA_HANDLES 0x00008000 +#define WT_SESSION_NO_EVICTION 0x00010000 +#define WT_SESSION_NO_LOGGING 0x00020000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00040000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00080000 +#define WT_SESSION_SERVER_ASYNC 0x00100000 #define WT_STAT_CLEAR 0x00000001 #define WT_STAT_JSON 0x00000002 #define WT_STAT_ON_CLOSE 0x00000004 diff --git a/src/include/schema.h b/src/include/schema.h index fff57951c0e..9a6e1e54e80 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -78,10 +78,13 @@ struct __wt_table { */ #define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1) -/* Make it simple to check a generic locked state on the handle list lock */ +/* Helpers for the locked state of the handle list and table locks. 
*/ #define WT_SESSION_LOCKED_HANDLE_LIST \ (WT_SESSION_LOCKED_HANDLE_LIST_READ | \ WT_SESSION_LOCKED_HANDLE_LIST_WRITE) +#define WT_SESSION_LOCKED_TABLE \ + (WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_TABLE_WRITE) /* * WT_WITH_LOCK_WAIT -- @@ -90,7 +93,7 @@ struct __wt_table { #define WT_WITH_LOCK_WAIT(session, lock, flag, op) do { \ if (F_ISSET(session, (flag))) { \ op; \ - } else { \ + } else { \ __wt_spin_lock_track(session, lock); \ F_SET(session, (flag)); \ op; \ @@ -139,7 +142,7 @@ struct __wt_table { #define WT_WITH_HANDLE_LIST_READ_LOCK(session, op) do { \ if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { \ op; \ - } else { \ + } else { \ __wt_readlock(session, &S2C(session)->dhandle_lock); \ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ op; \ @@ -150,15 +153,14 @@ struct __wt_table { /* * WT_WITH_HANDLE_LIST_WRITE_LOCK -- - * Acquire the data handle list lock in shared mode, perform an operation, - * drop the lock. The handle list lock is a read-write lock so the - * implementation is different to the other lock macros. - * Automatically upgrade from a read lock if held. + * Acquire the data handle list lock in exclusive mode, perform an + * operation, drop the lock. The handle list lock is a read-write lock so + * the implementation is different to the other lock macros. */ #define WT_WITH_HANDLE_LIST_WRITE_LOCK(session, op) do { \ if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)) { \ op; \ - } else { \ + } else { \ WT_ASSERT(session, \ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ));\ __wt_writelock(session, &S2C(session)->dhandle_lock); \ @@ -201,22 +203,58 @@ struct __wt_table { } while (0) /* - * WT_WITH_TABLE_LOCK, WT_WITH_TABLE_LOCK_NOWAIT -- + * WT_WITH_TABLE_READ_LOCK, WT_WITH_TABLE_WRITE_LOCK, + * WT_WITH_TABLE_WRITE_LOCK_NOWAIT -- * Acquire the table lock, perform an operation, drop the lock. + * The table lock is a read-write lock so the implementation is different + * to most other lock macros. 
+ * + * Note: readlock always waits because some operations need the table lock + * to discard handles, and we only expect it to be held across short + * operations. */ -#define WT_WITH_TABLE_LOCK(session, op) do { \ - WT_ASSERT(session, \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \ - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ - WT_WITH_LOCK_WAIT(session, \ - &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \ +#define WT_WITH_TABLE_READ_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ + __wt_readlock(session, &S2C(session)->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \ + __wt_readunlock(session, &S2C(session)->table_lock); \ + } \ } while (0) -#define WT_WITH_TABLE_LOCK_NOWAIT(session, ret, op) do { \ + +#define WT_WITH_TABLE_WRITE_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST)); \ + __wt_writelock(session, &S2C(session)->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &S2C(session)->table_lock); \ + } \ +} while (0) +#define WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, op) do { \ WT_ASSERT(session, \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \ - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ - WT_WITH_LOCK_NOWAIT(session, ret, \ - &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE) || \ + !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST)); \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \ + op; \ + } else if ((ret = __wt_try_writelock(session, \ + 
&S2C(session)->table_lock)) == 0) { \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &S2C(session)->table_lock); \ + } \ } while (0) /* @@ -232,8 +270,10 @@ struct __wt_table { F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ bool __handle_write_locked = \ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ - bool __table_locked = \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE); \ + bool __table_read_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ); \ + bool __table_write_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ bool __schema_locked = \ F_ISSET(session, WT_SESSION_LOCKED_SCHEMA); \ if (__handle_read_locked) { \ @@ -244,9 +284,13 @@ struct __wt_table { F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ __wt_writeunlock(session, &__conn->dhandle_lock); \ } \ - if (__table_locked) { \ - F_CLR(session, WT_SESSION_LOCKED_TABLE); \ - __wt_spin_unlock(session, &__conn->table_lock); \ + if (__table_read_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \ + __wt_readunlock(session, &__conn->table_lock); \ + } \ + if (__table_write_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &__conn->table_lock); \ } \ if (__schema_locked) { \ F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \ @@ -265,9 +309,13 @@ struct __wt_table { __wt_spin_lock(session, &__conn->schema_lock); \ F_SET(session, WT_SESSION_LOCKED_SCHEMA); \ } \ - if (__table_locked) { \ - __wt_spin_lock(session, &__conn->table_lock); \ - F_SET(session, WT_SESSION_LOCKED_TABLE); \ + if (__table_read_locked) { \ + __wt_readlock(session, &__conn->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \ + } \ + if (__table_write_locked) { \ + __wt_writelock(session, &__conn->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ } \ if (__handle_read_locked) { \ __wt_readlock(session, &__conn->dhandle_lock); \ diff --git 
a/src/schema/schema_list.c b/src/schema/schema_list.c index ea7374b7554..74ef5135a4a 100644 --- a/src/schema/schema_list.c +++ b/src/schema/schema_list.c @@ -25,7 +25,7 @@ __schema_add_table(WT_SESSION_IMPL *session, /* Make sure the metadata is open before getting other locks. */ WT_RET(__wt_metadata_cursor(session, NULL)); - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_READ_LOCK(session, ret = __wt_schema_open_table( session, name, namelen, ok_incomplete, &table)); WT_RET(ret); diff --git a/src/session/session_api.c b/src/session/session_api.c index 3a5d06f1b61..d282c5d0c32 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -162,7 +162,7 @@ __session_alter(WT_SESSION *wt_session, const char *uri, const char *config) cfg[1] = NULL; WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_alter(session, uri, cfg)))); err: if (ret != 0) @@ -518,7 +518,7 @@ __wt_session_create( WT_DECL_RET; WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_create(session, uri, config))); return (ret); } @@ -766,7 +766,7 @@ __session_rename(WT_SESSION *wt_session, WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_rename(session, uri, newuri, cfg)))); err: if (ret != 0) @@ -855,21 +855,22 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config) if (lock_wait) WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, ret = + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_drop(session, uri, cfg)))); else WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret, WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret, - WT_WITH_TABLE_LOCK_NOWAIT(session, ret, ret = + WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, + ret = __wt_schema_drop(session, uri, cfg)))); } else { if (lock_wait) 
WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_drop(session, uri, cfg))); else WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret, - WT_WITH_TABLE_LOCK_NOWAIT(session, ret, + WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, ret = __wt_schema_drop(session, uri, cfg))); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 5932e058552..3261c8089f4 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -650,7 +650,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_ASSERT(session, session->ckpt_handle_next == 0); WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_READ_LOCK(session, ret = __checkpoint_apply_all( session, cfg, __wt_checkpoint_get_handles, NULL))); WT_ERR(ret); -- cgit v1.2.1 From 988c297f22bbce3a40f7eb9ed22cdb7d9bf0a9c8 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 13 Feb 2017 23:44:11 -0500 Subject: WT-3144 bug fix: random cursor returns not-found when descending to an empty page (#3289) * If random descent through the tree fails, fallback to skipping through the tree's pages; if skipping through the tree's pages fails, fallback to a random entry from the first page in the tree that contains anything at all. * Add tests that create a tree with enough data for multiple pages, reopens the connection so we have a real tree, then truncates most / all of the tree and makes sure random lookups find data / fail (respectively). That way we're testing WT_REF_DELETED, not just empty pages. * Fix a documentation error, we never implemented a next_random_sample_percent configuration. 
--- src/btree/bt_cursor.c | 134 ++++++++++++++++++++++++++------------- src/btree/row_srch.c | 14 ++-- src/docs/cursor-random.dox | 5 -- test/suite/test_cursor_random.py | 49 ++++++++++++++ 4 files changed, 144 insertions(+), 58 deletions(-) diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index d18b9b76992..c0b028725c7 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -846,7 +846,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_SESSION_IMPL *session; WT_UPDATE *upd; wt_off_t size; - uint64_t skip; + uint64_t n, skip; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = cbt->btree; @@ -862,60 +862,104 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_STAT_CONN_INCR(session, cursor_next); WT_STAT_DATA_INCR(session, cursor_next); +#ifdef HAVE_DIAGNOSTIC /* - * If retrieving random values without sampling, or we don't have a - * page reference, pick a roughly random leaf page in the tree. + * Under some conditions we end up using the underlying cursor.next to + * walk through the object. Since there are multiple calls, we can hit + * the cursor-order checks, turn them off. */ - if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { - /* - * Skip past the sample size of the leaf pages in the tree - * between each random key return to compensate for unbalanced - * trees. - * - * Use the underlying file size divided by its block allocation - * size as our guess of leaf pages in the file (this can be - * entirely wrong, as it depends on how many pages are in this - * particular checkpoint, how large the leaf and internal pages - * really are, and other factors). Then, divide that value by - * the configured sample size and increment the final result to - * make sure tiny files don't leave us with a skip value of 0. - * - * !!! - * Ideally, the number would be prime to avoid restart issues. 
- */ - if (cbt->next_random_sample_size != 0) { - WT_ERR(btree->bm->size(btree->bm, session, &size)); - cbt->next_random_leaf_skip = (uint64_t) - ((size / btree->allocsize) / - cbt->next_random_sample_size) + 1; - } + __wt_cursor_key_order_reset(cbt); +#endif - /* - * Choose a leaf page from the tree. - */ + /* + * If we don't have a current position in the tree, or if retrieving + * random values without sampling, pick a roughly random leaf page in + * the tree and return an entry from it. + */ + if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { WT_ERR(__cursor_func_init(cbt, true)); WT_WITH_PAGE_INDEX( session, ret = __wt_row_random_descent(session, cbt)); - WT_ERR(ret); - } else { + if (ret == 0) + goto random_page_entry; + /* - * Read through the tree, skipping leaf pages. Be cautious about - * the skip count: if the last leaf page skipped was also the - * last leaf page in the tree, it may be set to zero on return - * with the end-of-walk condition. - * - * Pages read for data sampling aren't "useful"; don't update - * the read generation of pages already in memory, and if a page - * is read, set its generation to a low value so it is evicted - * quickly. + * Random descent may return not-found: the tree might be empty + * or have so many deleted items we didn't find any valid pages. + * We can't return WT_NOTFOUND to the application unless a tree + * is really empty, fallback to skipping through tree pages. */ - for (skip = - cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) - WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, - WT_READ_NO_GEN | - WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + WT_ERR_NOTFOUND_OK(ret); + } + + /* + * Cursor through the tree, skipping past the sample size of the leaf + * pages in the tree between each random key return to compensate for + * unbalanced trees. + * + * If the random descent attempt failed, we don't have a configured + * sample size, use 100 for no particular reason. 
+ */ + if (cbt->next_random_sample_size == 0) + cbt->next_random_sample_size = 100; + + /* + * If the random descent attempt failed, or it's our first skip attempt, + * we haven't yet set the pages to skip, do it now. + * + * Use the underlying file size divided by its block allocation size as + * our guess of leaf pages in the file (this can be entirely wrong, as + * it depends on how many pages are in this particular checkpoint, how + * large the leaf and internal pages really are, and other factors). + * Then, divide that value by the configured sample size and increment + * the final result to make sure tiny files don't leave us with a skip + * value of 0. + * + * !!! + * Ideally, the number would be prime to avoid restart issues. + */ + if (cbt->next_random_leaf_skip == 0) { + WT_ERR(btree->bm->size(btree->bm, session, &size)); + cbt->next_random_leaf_skip = (uint64_t) + ((size / btree->allocsize) / + cbt->next_random_sample_size) + 1; + } + + /* + * Be paranoid about loop termination: first, if the last leaf page + * skipped was also the last leaf page in the tree, skip may be set to + * zero on return along with the NULL WT_REF end-of-walk condition. + * Second, if a tree has no valid pages at all (the condition after + * initial creation), we might make no progress at all, or finally, if + * a tree has only deleted pages, we'll make progress, but never get a + * useful WT_REF. And, of course, the tree can switch from one of these + * states to another without warning. Decrement skip regardless of what + * is happening in the search, guarantee we eventually quit. + * + * Pages read for data sampling aren't "useful"; don't update the read + * generation of pages already in memory, and if a page is read, set + * its generation to a low value so it is evicted quickly. 
+ */ + for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { + n = skip; + WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, + WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + if (n == skip) { + if (skip == 0) + break; + --skip; + } } + /* + * We can't return WT_NOTFOUND to the application unless a tree is + * really empty, fallback to a random entry from the first page in the + * tree that has anything at all. + */ + if (cbt->ref == NULL) + WT_ERR(__wt_btcur_next(cbt, false)); + +random_page_entry: /* * Select a random entry from the leaf page. If it's not valid, move to * the next entry, if that doesn't work, move to the previous entry. diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 1c3d5ad5daa..0858e42356b 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -798,14 +798,7 @@ __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) current = NULL; retry = 100; - if (0) { -restart: /* - * Discard the currently held page and restart the search from - * the root. - */ - WT_RET(__wt_page_release(session, current, 0)); - } - +restart: /* Search the internal pages of the tree. */ current = &btree->root; for (;;) { @@ -837,6 +830,11 @@ restart: /* break; } if (i == entries || descent == NULL) { + /* + * Discard the currently held page and restart from the + * root. + */ + WT_RET(__wt_page_release(session, current, 0)); if (--retry > 0) goto restart; return (WT_NOTFOUND); diff --git a/src/docs/cursor-random.dox b/src/docs/cursor-random.dox index a0a3212be6d..b6434e3d161 100644 --- a/src/docs/cursor-random.dox +++ b/src/docs/cursor-random.dox @@ -20,9 +20,4 @@ cursor configured using \c next_random_sample_size divides the object into \c next_random_sample_size pieces, and each subsequent retrieval returns a record from the next one of those pieces. 
-For example, setting \c next_random_sample_percent to \c 10 would cause -the cursor to sequentially return records from each tenth part of the -object. Setting \c next_random_sample_percent to \c 1000 would cause the -cursor to sequentially return records from each .1% of the object. - */ diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py index 3bda6dc9946..ee0f85a29ee 100644 --- a/test/suite/test_cursor_random.py +++ b/test/suite/test_cursor_random.py @@ -71,6 +71,15 @@ class test_cursor_random(wttest.WiredTigerTestCase): self.assertEquals(cursor.reset(), 0) cursor.close() + # Check that next_random fails with an empty tree, repeatedly. + def test_cursor_random_empty(self): + uri = self.type + self.session.create(uri, 'key_format=S,value_format=S') + cursor = self.session.open_cursor(uri, None, self.config) + for i in range(1,5): + self.assertTrue(cursor.next(), wiredtiger.WT_NOTFOUND) + cursor.close + # Check that next_random works with a single value, repeatedly. def test_cursor_random_single_record(self): uri = self.type @@ -127,6 +136,46 @@ class test_cursor_random(wttest.WiredTigerTestCase): def test_cursor_random_multiple_page_records(self): self.cursor_random_multiple_page_records(0) + # Check that next_random fails in the presence of a set of values, some of + # which are deleted. + def test_cursor_random_deleted_partial(self): + uri = self.type + ds = self.dataset(self, uri, 10000, + config='allocation_size=512,leaf_page_max=512') + ds.populate() + + # Close the connection so everything is forced to disk. 
+ self.reopen_conn() + + start = self.session.open_cursor(uri, None) + start.set_key(ds.key(10)) + end = self.session.open_cursor(uri, None) + end.set_key(ds.key(10000-10)) + self.session.truncate(None, start, end, None) + self.assertEqual(start.close(), 0) + self.assertEqual(end.close(), 0) + + cursor = self.session.open_cursor(uri, None, self.config) + for i in range(1,10): + self.assertEqual(cursor.next(), 0) + + # Check that next_random fails in the presence of a set of values, all of + # which are deleted. + def test_cursor_random_deleted_all(self): + uri = self.type + ds = self.dataset(self, uri, 10000, + config='allocation_size=512,leaf_page_max=512') + ds.populate() + + # Close the connection so everything is forced to disk. + self.reopen_conn() + + self.session.truncate(uri, None, None, None) + + cursor = self.session.open_cursor(uri, None, self.config) + for i in range(1,10): + self.assertTrue(cursor.next(), wiredtiger.WT_NOTFOUND) + # Check that opening a random cursor on column-store returns not-supported. class test_cursor_random_column(wttest.WiredTigerTestCase): scenarios = make_scenarios([ -- cgit v1.2.1 From df64d277ae99adf98824fbf2118626c77fd2f199 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Tue, 14 Feb 2017 16:39:24 +1100 Subject: WT-3149 Have eviction choose a random point when walking a tree. (#3285) Only choose a random point when there is no saved walk point. Fixes to random search as well - noticed search termination conditions when sampling the search page vs. walking it sequentially weren't the same. Changed that, which caused the test_compact02 test to fail. There's an underlying bug in this code, if we return WT_NOTFOUND, we can lose a hazard pointer on the page of the tree we unsucessfully searched. Add a page-release in the case of returning not-found. 
--- dist/filelist | 1 + src/btree/bt_cursor.c | 180 ++-------------------- src/btree/bt_random.c | 413 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/btree/row_srch.c | 237 ----------------------------- src/evict/evict_lru.c | 28 +++- src/include/extern.h | 7 +- 6 files changed, 454 insertions(+), 412 deletions(-) create mode 100644 src/btree/bt_random.c diff --git a/dist/filelist b/dist/filelist index 13d67ef961b..3886035eaa9 100644 --- a/dist/filelist +++ b/dist/filelist @@ -30,6 +30,7 @@ src/btree/bt_io.c src/btree/bt_misc.c src/btree/bt_ovfl.c src/btree/bt_page.c +src/btree/bt_random.c src/btree/bt_read.c src/btree/bt_rebalance.c src/btree/bt_ret.c diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index c0b028725c7..5fde2237538 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -76,11 +76,11 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) } /* - * __cursor_valid -- + * __wt_cursor_valid -- * Return if the cursor references an valid key/value pair. */ -static inline bool -__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) +bool +__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) { WT_BTREE *btree; WT_CELL *cell; @@ -330,7 +330,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, cbt->ref, false) : __cursor_col_search(session, cbt, cbt->ref)); - valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); + valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); @@ -338,7 +338,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? 
__cursor_row_search(session, cbt, NULL, false) : __cursor_col_search(session, cbt, NULL)); - valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); + valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); } if (valid) @@ -419,14 +419,14 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * Ignore those cases, it makes things too complicated. */ if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1) - valid = __cursor_valid(cbt, &upd); + valid = __wt_cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - valid = __cursor_valid(cbt, &upd); + valid = __wt_cursor_valid(cbt, &upd); } /* @@ -462,7 +462,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - if (__cursor_valid(cbt, &upd)) { + if (__wt_cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) @@ -537,7 +537,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); * Fail in that case, the record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) || + ((cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) || (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) WT_ERR(WT_DUPLICATE_KEY); @@ -552,7 +552,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); * key/value pair. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - cbt->compare == 0 && __cursor_valid(cbt, NULL)) + cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) WT_ERR(WT_DUPLICATE_KEY); ret = __cursor_row_modify(session, cbt, false); @@ -682,12 +682,12 @@ retry: WT_RET(__cursor_func_init(cbt, true)); /* * If we find a matching record, check whether an update would * conflict. 
Do this before checking if the update is visible - * in __cursor_valid, or we can miss conflict. + * in __wt_cursor_valid, or we can miss conflict. */ WT_ERR(__curfile_update_check(cbt)); /* Remove the record if it exists. */ - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) { + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) { if (!__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); /* @@ -711,7 +711,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); /* Check whether an update would conflict. */ WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); ret = __cursor_row_modify(session, cbt, true); @@ -786,7 +786,8 @@ retry: WT_RET(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) && + if ((cbt->compare != 0 || + !__wt_cursor_valid(cbt, NULL)) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } @@ -800,7 +801,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); } ret = __cursor_row_modify(session, cbt, false); @@ -829,155 +830,6 @@ err: if (ret == WT_RESTART) { return (ret); } -/* - * __wt_btcur_next_random -- - * Move to a random record in the tree. There are two algorithms, one - * where we select a record at random from the whole tree on each - * retrieval and one where we first select a record at random from the - * whole tree, and then subsequently sample forward from that location. - * The sampling approach allows us to select reasonably uniform random - * points from unbalanced trees. 
- */ -int -__wt_btcur_next_random(WT_CURSOR_BTREE *cbt) -{ - WT_BTREE *btree; - WT_DECL_RET; - WT_SESSION_IMPL *session; - WT_UPDATE *upd; - wt_off_t size; - uint64_t n, skip; - - session = (WT_SESSION_IMPL *)cbt->iface.session; - btree = cbt->btree; - - /* - * Only supports row-store: applications can trivially select a random - * value from a column-store, if there were any reason to do so. - */ - if (btree->type != BTREE_ROW) - WT_RET_MSG(session, ENOTSUP, - "WT_CURSOR.next_random only supported by row-store tables"); - - WT_STAT_CONN_INCR(session, cursor_next); - WT_STAT_DATA_INCR(session, cursor_next); - -#ifdef HAVE_DIAGNOSTIC - /* - * Under some conditions we end up using the underlying cursor.next to - * walk through the object. Since there are multiple calls, we can hit - * the cursor-order checks, turn them off. - */ - __wt_cursor_key_order_reset(cbt); -#endif - - /* - * If we don't have a current position in the tree, or if retrieving - * random values without sampling, pick a roughly random leaf page in - * the tree and return an entry from it. - */ - if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { - WT_ERR(__cursor_func_init(cbt, true)); - WT_WITH_PAGE_INDEX( - session, ret = __wt_row_random_descent(session, cbt)); - if (ret == 0) - goto random_page_entry; - - /* - * Random descent may return not-found: the tree might be empty - * or have so many deleted items we didn't find any valid pages. - * We can't return WT_NOTFOUND to the application unless a tree - * is really empty, fallback to skipping through tree pages. - */ - WT_ERR_NOTFOUND_OK(ret); - } - - /* - * Cursor through the tree, skipping past the sample size of the leaf - * pages in the tree between each random key return to compensate for - * unbalanced trees. - * - * If the random descent attempt failed, we don't have a configured - * sample size, use 100 for no particular reason. 
- */ - if (cbt->next_random_sample_size == 0) - cbt->next_random_sample_size = 100; - - /* - * If the random descent attempt failed, or it's our first skip attempt, - * we haven't yet set the pages to skip, do it now. - * - * Use the underlying file size divided by its block allocation size as - * our guess of leaf pages in the file (this can be entirely wrong, as - * it depends on how many pages are in this particular checkpoint, how - * large the leaf and internal pages really are, and other factors). - * Then, divide that value by the configured sample size and increment - * the final result to make sure tiny files don't leave us with a skip - * value of 0. - * - * !!! - * Ideally, the number would be prime to avoid restart issues. - */ - if (cbt->next_random_leaf_skip == 0) { - WT_ERR(btree->bm->size(btree->bm, session, &size)); - cbt->next_random_leaf_skip = (uint64_t) - ((size / btree->allocsize) / - cbt->next_random_sample_size) + 1; - } - - /* - * Be paranoid about loop termination: first, if the last leaf page - * skipped was also the last leaf page in the tree, skip may be set to - * zero on return along with the NULL WT_REF end-of-walk condition. - * Second, if a tree has no valid pages at all (the condition after - * initial creation), we might make no progress at all, or finally, if - * a tree has only deleted pages, we'll make progress, but never get a - * useful WT_REF. And, of course, the tree can switch from one of these - * states to another without warning. Decrement skip regardless of what - * is happening in the search, guarantee we eventually quit. - * - * Pages read for data sampling aren't "useful"; don't update the read - * generation of pages already in memory, and if a page is read, set - * its generation to a low value so it is evicted quickly. 
- */ - for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { - n = skip; - WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, - WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); - if (n == skip) { - if (skip == 0) - break; - --skip; - } - } - - /* - * We can't return WT_NOTFOUND to the application unless a tree is - * really empty, fallback to a random entry from the first page in the - * tree that has anything at all. - */ - if (cbt->ref == NULL) - WT_ERR(__wt_btcur_next(cbt, false)); - -random_page_entry: - /* - * Select a random entry from the leaf page. If it's not valid, move to - * the next entry, if that doesn't work, move to the previous entry. - */ - WT_ERR(__wt_row_random_leaf(session, cbt)); - if (__cursor_valid(cbt, &upd)) - WT_ERR(__wt_kv_return(session, cbt, upd)); - else { - if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) - ret = __wt_btcur_prev(cbt, false); - WT_ERR(ret); - } - return (0); - -err: WT_TRET(__cursor_reset(cbt)); - return (ret); -} - /* * __wt_btcur_compare -- * Return a comparison between two cursors. diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c new file mode 100644 index 00000000000..3cc6838c4c8 --- /dev/null +++ b/src/btree/bt_random.c @@ -0,0 +1,413 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_row_random_leaf -- + * Return a random key from a row-store leaf page. 
+ */ +int +__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_INSERT *ins, **start, **stop; + WT_INSERT_HEAD *ins_head; + WT_PAGE *page; + uint64_t samples; + uint32_t choice, entries, i; + int level; + + page = cbt->ref->page; + start = stop = NULL; /* [-Wconditional-uninitialized] */ + entries = 0; /* [-Wconditional-uninitialized] */ + + __cursor_pos_clear(cbt); + + /* If the page has disk-based entries, select from them. */ + if (page->entries != 0) { + cbt->compare = 0; + cbt->slot = __wt_random(&session->rnd) % page->entries; + + /* + * The real row-store search function builds the key, so we + * have to as well. + */ + return (__wt_row_leaf_key(session, + page, page->pg_row + cbt->slot, cbt->tmp, false)); + } + + /* + * If the tree is new (and not empty), it might have a large insert + * list. + * + * Walk down the list until we find a level with at least 50 entries, + * that's where we'll start rolling random numbers. The value 50 is + * used to ignore levels with only a few entries, that is, levels which + * are potentially badly skewed. + */ + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) + return (WT_NOTFOUND); + for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + start = &ins_head->head[level]; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + + if (entries > 50) + break; + } + + /* + * If it's a tiny list and we went all the way to level 0, correct the + * level; entries is correctly set. + */ + if (level < 0) + level = 0; + + /* + * Step down the skip list levels, selecting a random chunk of the name + * space at each level. + */ + for (samples = entries; level > 0; samples += entries) { + /* + * There are (entries) or (entries + 1) chunks of the name space + * considered at each level. 
They are: between start and the 1st + * element, between the 1st and 2nd elements, and so on to the + * last chunk which is the name space after the stop element on + * the current level. This last chunk of name space may or may + * not be there: as we descend the levels of the skip list, this + * chunk may appear, depending if the next level down has + * entries logically after the stop point in the current level. + * We can't ignore those entries: because of the algorithm used + * to determine the depth of a skiplist, there may be a large + * number of entries "revealed" by descending a level. + * + * If the next level down has more items after the current stop + * point, there are (entries + 1) chunks to consider, else there + * are (entries) chunks. + */ + if (*(stop - 1) == NULL) + choice = __wt_random(&session->rnd) % entries; + else + choice = __wt_random(&session->rnd) % (entries + 1); + + if (choice == entries) { + /* + * We selected the name space after the stop element on + * this level. Set the start point to the current stop + * point, descend a level and move the stop element to + * the end of the list, that is, the end of the newly + * discovered name space, counting entries as we go. + */ + start = stop; + --start; + --level; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + } else { + /* + * We selected another name space on the level. Move the + * start pointer the selected number of entries forward + * to the start of the selected chunk (if the selected + * number is 0, start won't move). Set the stop pointer + * to the next element in the list and drop both start + * and stop down a level. + */ + for (i = 0; i < choice; ++i) + start = &(*start)->next[level]; + stop = &(*start)->next[level]; + + --start; + --stop; + --level; + + /* Count the entries in the selected name space. 
*/ + for (entries = 0, + ins = *start; ins != *stop; ins = ins->next[level]) + ++entries; + } + } + + /* + * When we reach the bottom level, entries will already be set. Select + * a random entry from the name space and return it. + * + * It should be impossible for the entries count to be 0 at this point, + * but check for it out of paranoia and to quiet static testing tools. + */ + if (entries > 0) + entries = __wt_random(&session->rnd) % entries; + for (ins = *start; entries > 0; --entries) + ins = ins->next[0]; + + cbt->ins = ins; + cbt->ins_head = ins_head; + cbt->compare = 0; + + /* + * Random lookups in newly created collections can be slow if a page + * consists of a large skiplist. Schedule the page for eviction if we + * encounter a large skiplist. This worthwhile because applications + * that take a sample often take many samples, so the overhead of + * traversing the skip list each time accumulates to real time. + */ + if (samples > 5000) + __wt_page_evict_soon(session, cbt->ref); + + return (0); +} + +/* + * __wt_random_descent -- + * Find a random leaf page in a tree. + */ +int +__wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF *current, *descent; + uint32_t flags, i, entries, retry; + + btree = S2BT(session); + current = NULL; + retry = 100; + + /* Eviction should not be tapped to do eviction. */ + flags = WT_READ_RESTART_OK; + if (eviction) + LF_SET(WT_READ_NO_EVICT); + + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, flags)); + } + + /* Search the internal pages of the tree. */ + current = &btree->root; + for (;;) { + page = current->page; + if (!WT_PAGE_IS_INTERNAL(page)) + break; + + WT_INTL_INDEX_GET(session, page, pindex); + entries = pindex->entries; + + /* + * There may be empty pages in the tree, and they're useless to + * us. 
If we don't find a non-empty page in "entries" random + * guesses, take the first non-empty page in the tree. If the + * search page contains nothing other than empty pages, restart + * from the root some number of times before giving up. + * + * Eviction is only looking for a place in the cache and so only + * wants in-memory pages (but a deleted page is fine); currently + * our other caller is looking for a key/value pair on a random + * leave page, and so will accept any page that contains a valid + * key/value pair, so on-disk is fine, but deleted is not. + */ + descent = NULL; + for (i = 0; i < entries; ++i) { + descent = + pindex->index[__wt_random(&session->rnd) % entries]; + if (descent->state == WT_REF_MEM || + (!eviction && descent->state == WT_REF_DISK)) + break; + } + if (i == entries) + for (i = 0; i < entries; ++i) { + descent = pindex->index[i]; + if (descent->state == WT_REF_MEM || + (!eviction && + descent->state == WT_REF_DISK)) + break; + } + if (i == entries || descent == NULL) { + if (--retry > 0) + goto restart; + + WT_RET(__wt_page_release(session, current, flags)); + return (WT_NOTFOUND); + } + + /* + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search at the root. + * + * On other error, simply return, the swap call ensures we're + * holding nothing on failure. + */ + if ((ret = + __wt_page_swap(session, current, descent, flags)) == 0) { + current = descent; + continue; + } + if (ret == WT_RESTART) + goto restart; + return (ret); + } + + *refp = current; + return (0); +} + +/* + * __wt_btcur_next_random -- + * Move to a random record in the tree. There are two algorithms, one + * where we select a record at random from the whole tree on each + * retrieval and one where we first select a record at random from the + * whole tree, and then subsequently sample forward from that location. 
+ * The sampling approach allows us to select reasonably uniform random + * points from unbalanced trees. + */ +int +__wt_btcur_next_random(WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + wt_off_t size; + uint64_t n, skip; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + btree = cbt->btree; + + /* + * Only supports row-store: applications can trivially select a random + * value from a column-store, if there were any reason to do so. + */ + if (btree->type != BTREE_ROW) + WT_RET_MSG(session, ENOTSUP, + "WT_CURSOR.next_random only supported by row-store tables"); + + WT_STAT_CONN_INCR(session, cursor_next); + WT_STAT_DATA_INCR(session, cursor_next); + +#ifdef HAVE_DIAGNOSTIC + /* + * Under some conditions we end up using the underlying cursor.next to + * walk through the object. Since there are multiple calls, we can hit + * the cursor-order checks, turn them off. + */ + __wt_cursor_key_order_reset(cbt); +#endif + + /* + * If we don't have a current position in the tree, or if retrieving + * random values without sampling, pick a roughly random leaf page in + * the tree and return an entry from it. + */ + if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { + WT_ERR(__cursor_func_init(cbt, true)); + WT_WITH_PAGE_INDEX(session, + ret = __wt_random_descent(session, &cbt->ref, false)); + if (ret == 0) + goto random_page_entry; + + /* + * Random descent may return not-found: the tree might be empty + * or have so many deleted items we didn't find any valid pages. + * We can't return WT_NOTFOUND to the application unless a tree + * is really empty, fallback to skipping through tree pages. + */ + WT_ERR_NOTFOUND_OK(ret); + } + + /* + * Cursor through the tree, skipping past the sample size of the leaf + * pages in the tree between each random key return to compensate for + * unbalanced trees. 
+ * + * If the random descent attempt failed, we don't have a configured + * sample size, use 100 for no particular reason. + */ + if (cbt->next_random_sample_size == 0) + cbt->next_random_sample_size = 100; + + /* + * If the random descent attempt failed, or it's our first skip attempt, + * we haven't yet set the pages to skip, do it now. + * + * Use the underlying file size divided by its block allocation size as + * our guess of leaf pages in the file (this can be entirely wrong, as + * it depends on how many pages are in this particular checkpoint, how + * large the leaf and internal pages really are, and other factors). + * Then, divide that value by the configured sample size and increment + * the final result to make sure tiny files don't leave us with a skip + * value of 0. + * + * !!! + * Ideally, the number would be prime to avoid restart issues. + */ + if (cbt->next_random_leaf_skip == 0) { + WT_ERR(btree->bm->size(btree->bm, session, &size)); + cbt->next_random_leaf_skip = (uint64_t) + ((size / btree->allocsize) / + cbt->next_random_sample_size) + 1; + } + + /* + * Be paranoid about loop termination: first, if the last leaf page + * skipped was also the last leaf page in the tree, skip may be set to + * zero on return along with the NULL WT_REF end-of-walk condition. + * Second, if a tree has no valid pages at all (the condition after + * initial creation), we might make no progress at all, or finally, if + * a tree has only deleted pages, we'll make progress, but never get a + * useful WT_REF. And, of course, the tree can switch from one of these + * states to another without warning. Decrement skip regardless of what + * is happening in the search, guarantee we eventually quit. + * + * Pages read for data sampling aren't "useful"; don't update the read + * generation of pages already in memory, and if a page is read, set + * its generation to a low value so it is evicted quickly. 
+ */ + for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { + n = skip; + WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, + WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + if (n == skip) { + if (skip == 0) + break; + --skip; + } + } + + /* + * We can't return WT_NOTFOUND to the application unless a tree is + * really empty, fallback to a random entry from the first page in the + * tree that has anything at all. + */ + if (cbt->ref == NULL) + WT_ERR(__wt_btcur_next(cbt, false)); + +random_page_entry: + /* + * Select a random entry from the leaf page. If it's not valid, move to + * the next entry, if that doesn't work, move to the previous entry. + */ + WT_ERR(__wt_row_random_leaf(session, cbt)); + if (__wt_cursor_valid(cbt, &upd)) + WT_ERR(__wt_kv_return(session, cbt, upd)); + else { + if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) + ret = __wt_btcur_prev(cbt, false); + WT_ERR(ret); + } + return (0); + +err: WT_TRET(__cursor_reset(cbt)); + return (ret); +} diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 0858e42356b..9c3d467340e 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -623,240 +623,3 @@ leaf_match: cbt->compare = 0; err: WT_TRET(__wt_page_release(session, current, 0)); return (ret); } - -/* - * __wt_row_random_leaf -- - * Return a random key from a row-store leaf page. - */ -int -__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) -{ - WT_INSERT *ins, **start, **stop; - WT_INSERT_HEAD *ins_head; - WT_PAGE *page; - uint64_t samples; - uint32_t choice, entries, i; - int level; - - page = cbt->ref->page; - start = stop = NULL; /* [-Wconditional-uninitialized] */ - entries = 0; /* [-Wconditional-uninitialized] */ - - __cursor_pos_clear(cbt); - - /* If the page has disk-based entries, select from them. 
*/ - if (page->entries != 0) { - cbt->compare = 0; - cbt->slot = __wt_random(&session->rnd) % page->entries; - - /* - * The real row-store search function builds the key, so we - * have to as well. - */ - return (__wt_row_leaf_key(session, - page, page->pg_row + cbt->slot, cbt->tmp, false)); - } - - /* - * If the tree is new (and not empty), it might have a large insert - * list. - * - * Walk down the list until we find a level with at least 50 entries, - * that's where we'll start rolling random numbers. The value 50 is - * used to ignore levels with only a few entries, that is, levels which - * are potentially badly skewed. - */ - F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) - return (WT_NOTFOUND); - for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { - start = &ins_head->head[level]; - for (entries = 0, stop = start; - *stop != NULL; stop = &(*stop)->next[level]) - ++entries; - - if (entries > 50) - break; - } - - /* - * If it's a tiny list and we went all the way to level 0, correct the - * level; entries is correctly set. - */ - if (level < 0) - level = 0; - - /* - * Step down the skip list levels, selecting a random chunk of the name - * space at each level. - */ - for (samples = entries; level > 0; samples += entries) { - /* - * There are (entries) or (entries + 1) chunks of the name space - * considered at each level. They are: between start and the 1st - * element, between the 1st and 2nd elements, and so on to the - * last chunk which is the name space after the stop element on - * the current level. This last chunk of name space may or may - * not be there: as we descend the levels of the skip list, this - * chunk may appear, depending if the next level down has - * entries logically after the stop point in the current level. 
- * We can't ignore those entries: because of the algorithm used - * to determine the depth of a skiplist, there may be a large - * number of entries "revealed" by descending a level. - * - * If the next level down has more items after the current stop - * point, there are (entries + 1) chunks to consider, else there - * are (entries) chunks. - */ - if (*(stop - 1) == NULL) - choice = __wt_random(&session->rnd) % entries; - else - choice = __wt_random(&session->rnd) % (entries + 1); - - if (choice == entries) { - /* - * We selected the name space after the stop element on - * this level. Set the start point to the current stop - * point, descend a level and move the stop element to - * the end of the list, that is, the end of the newly - * discovered name space, counting entries as we go. - */ - start = stop; - --start; - --level; - for (entries = 0, stop = start; - *stop != NULL; stop = &(*stop)->next[level]) - ++entries; - } else { - /* - * We selected another name space on the level. Move the - * start pointer the selected number of entries forward - * to the start of the selected chunk (if the selected - * number is 0, start won't move). Set the stop pointer - * to the next element in the list and drop both start - * and stop down a level. - */ - for (i = 0; i < choice; ++i) - start = &(*start)->next[level]; - stop = &(*start)->next[level]; - - --start; - --stop; - --level; - - /* Count the entries in the selected name space. */ - for (entries = 0, - ins = *start; ins != *stop; ins = ins->next[level]) - ++entries; - } - } - - /* - * When we reach the bottom level, entries will already be set. Select - * a random entry from the name space and return it. - * - * It should be impossible for the entries count to be 0 at this point, - * but check for it out of paranoia and to quiet static testing tools. 
- */ - if (entries > 0) - entries = __wt_random(&session->rnd) % entries; - for (ins = *start; entries > 0; --entries) - ins = ins->next[0]; - - cbt->ins = ins; - cbt->ins_head = ins_head; - cbt->compare = 0; - - /* - * Random lookups in newly created collections can be slow if a page - * consists of a large skiplist. Schedule the page for eviction if we - * encounter a large skiplist. This worthwhile because applications - * that take a sample often take many samples, so the overhead of - * traversing the skip list each time accumulates to real time. - */ - if (samples > 5000) - __wt_page_evict_soon(session, cbt->ref); - - return (0); -} - -/* - * __wt_row_random_descent -- - * Find a random leaf page in a row-store tree. - */ -int -__wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) -{ - WT_BTREE *btree; - WT_DECL_RET; - WT_PAGE *page; - WT_PAGE_INDEX *pindex; - WT_REF *current, *descent; - uint32_t i, entries, retry; - - btree = S2BT(session); - current = NULL; - retry = 100; - -restart: - /* Search the internal pages of the tree. */ - current = &btree->root; - for (;;) { - page = current->page; - if (page->type != WT_PAGE_ROW_INT) - break; - - WT_INTL_INDEX_GET(session, page, pindex); - entries = pindex->entries; - - /* - * There may be empty pages in the tree, and they're useless to - * us. If we don't find a non-empty page in "entries" random - * guesses, take the first non-empty page in the tree. If the - * search page contains nothing other than empty pages, restart - * from the root some number of times before giving up. 
- */ - descent = NULL; - for (i = 0; i < entries; ++i) { - descent = - pindex->index[__wt_random(&session->rnd) % entries]; - if (descent->state != WT_REF_DELETED) - break; - } - if (i == entries) - for (i = 0; i < entries; ++i) { - descent = pindex->index[i]; - if (descent->state != WT_REF_DELETED) - break; - } - if (i == entries || descent == NULL) { - /* - * Discard the currently held page and restart from the - * root. - */ - WT_RET(__wt_page_release(session, current, 0)); - if (--retry > 0) - goto restart; - return (WT_NOTFOUND); - } - - /* - * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search at the root. - * - * On other error, simply return, the swap call ensures we're - * holding nothing on failure. - */ - if ((ret = __wt_page_swap( - session, current, descent, WT_READ_RESTART_OK)) == 0) { - current = descent; - continue; - } - if (ret == WT_RESTART) - goto restart; - return (ret); - } - - cbt->ref = current; - return (0); -} diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index efe056aee02..42fe4d4608e 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1654,10 +1654,29 @@ __evict_walk_file(WT_SESSION_IMPL *session, !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) min_pages *= 10; + /* + * Choose a random point in the tree if looking for candidates in a + * tree with no starting point set. This is mostly aimed at ensuring + * eviction fairly visits all pages in trees with a lot of in-cache + * content. + */ + if (btree->evict_ref == NULL) { + /* Ensure internal pages indexes remain valid for our walk */ + WT_WITH_PAGE_INDEX(session, ret = + __wt_random_descent(session, &btree->evict_ref, true)); + WT_RET_NOTFOUND_OK(ret); + + /* + * Reverse the direction of the walk each time we start at a + * random point so both ends of the tree are equally likely to + * be visited. 
+ */ + btree->evict_walk_reverse = !btree->evict_walk_reverse; + } + walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; - /* Randomize the walk direction. */ if (btree->evict_walk_reverse) FLD_SET(walk_flags, WT_READ_PREV); @@ -1799,13 +1818,6 @@ fast: /* If the page can't be evicted, give up. */ WT_STAT_CONN_INCRV( session, cache_eviction_pages_queued, (u_int)(evict - start)); - /* - * If gave up the walk, reverse the direction of the walk and skip it - * next time. - */ - if (give_up) - btree->evict_walk_reverse = !btree->evict_walk_reverse; - /* * If we couldn't find the number of pages we were looking for, skip * the tree next time. diff --git a/src/include/extern.h b/src/include/extern.h index 836a7cb1ae6..8e55077c2a9 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -98,6 +98,7 @@ extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_A extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -105,7 +106,6 @@ extern int 
__wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((w extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -150,6 +150,9 @@ extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern 
int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags @@ -193,8 +196,6 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_las_stats_update(WT_SESSION_IMPL *session) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -- cgit v1.2.1 From 92c48cfcd9c66ba66386fd48ca326ec750057d86 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Tue, 14 Feb 2017 00:56:29 -0500 Subject: WT-2909 Add a custom file system fault injection test to verify checkpoint integrity (#3272) Implement a custom file system, and use it via a test case to add validate checkpoint integrity in the face of file-system level errors. --- dist/s_string.ok | 2 + dist/s_void | 1 + ext/test/fail_fs/fail_fs.c | 197 ++++++-- test/csuite/Makefile.am | 3 + test/csuite/wt2909_checkpoint_integrity/main.c | 660 +++++++++++++++++++++++++ test/utility/misc.c | 2 +- test/utility/test_util.h | 2 +- 7 files changed, 827 insertions(+), 40 deletions(-) create mode 100644 test/csuite/wt2909_checkpoint_integrity/main.c diff --git a/dist/s_string.ok b/dist/s_string.ok index d2e9dffaa48..e033f77327f 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -770,6 +770,7 @@ idx ifdef ifdef's iiSii +iiiS iiii iiu ikey @@ -1138,6 +1139,7 @@ subgetraw subgets subinit sublicense +subtest subtree sunique superset diff --git a/dist/s_void b/dist/s_void index 947153e730b..90425d5a718 100755 --- a/dist/s_void +++ b/dist/s_void @@ -82,6 +82,7 @@ func_ok() -e '/int fail_file_sync$/d' \ -e '/int fail_fs_directory_list_free$/d' \ -e '/int fail_fs_exist$/d' \ + -e '/int fail_fs_simulate_fail$/d' \ -e '/int fail_fs_terminate$/d' \ -e '/int handle_message$/d' \ -e '/int handle_progress$/d' \ diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index 29d469768c5..a6376ce203b 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -35,16 +35,29 @@ #include #include 
#include +#include #include #include "queue.h" -#define FAIL_FS_GIGABYTE (1024 * 1024 * 1024) +#define FAIL_FS_GIGABYTE (1024 * 1024 * 1024) + +#define FAIL_FS_ENV_ENABLE "WT_FAIL_FS_ENABLE" +#define FAIL_FS_ENV_WRITE_ALLOW "WT_FAIL_FS_WRITE_ALLOW" +#define FAIL_FS_ENV_READ_ALLOW "WT_FAIL_FS_READ_ALLOW" /* * A "fail file system", that is, a file system extension that fails when we - * want it to. This is only used in test frameworks, this fact allows us - * to simplify some error paths. + * want it to. This is only used in test frameworks, this fact allows us to + * simplify some error paths. This code is not portable to Windows, as it has + * direct knowledge of file descriptors, environment variables and stack + * traces. + * + * When the filesystem extension is configured, parameters can set how many + * reads or writes can be allowed before failure. If this is not fine-grained + * enough, an 'environment' configuration parameter can be specified. If that + * is used, then on every file system read or write, environment variables are + * checked that control when reading or writing should fail. */ typedef struct { WT_FILE_SYSTEM iface; @@ -54,6 +67,9 @@ typedef struct { * uses a single, global file system lock. 
*/ pthread_rwlock_t lock; /* Lock */ + bool fail_enabled; + bool use_environment; + bool verbose; int64_t read_ops; int64_t write_ops; int64_t allow_reads; @@ -86,12 +102,12 @@ static int fail_file_truncate(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t); static int fail_file_write( WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, const void *); static bool fail_fs_arg( - const char *match, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, - int64_t *argp); + const char *, WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, int64_t *); static int fail_fs_directory_list(WT_FILE_SYSTEM *, WT_SESSION *, const char *, const char *, char ***, uint32_t *); static int fail_fs_directory_list_free( WT_FILE_SYSTEM *, WT_SESSION *, char **, uint32_t); +static void fail_fs_env(const char *, int64_t *); static int fail_fs_exist(WT_FILE_SYSTEM *, WT_SESSION *, const char *, bool *); static int fail_fs_open(WT_FILE_SYSTEM *, WT_SESSION *, const char *, WT_FS_OPEN_FILE_TYPE, uint32_t, WT_FILE_HANDLE **); @@ -99,6 +115,8 @@ static int fail_fs_remove( WT_FILE_SYSTEM *, WT_SESSION *, const char *, uint32_t); static int fail_fs_rename( WT_FILE_SYSTEM *, WT_SESSION *, const char *, const char *, uint32_t); +static int fail_fs_simulate_fail( + FAIL_FILE_HANDLE *, WT_SESSION *, int64_t, const char *); static int fail_fs_size( WT_FILE_SYSTEM *, WT_SESSION *, const char *, wt_off_t *); static int fail_fs_terminate(WT_FILE_SYSTEM *, WT_SESSION *); @@ -145,8 +163,12 @@ fail_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) fail_fh = (FAIL_FILE_HANDLE *)file_handle; + /* + * We don't actually open an fd when opening directories for flushing, + * so ignore that case here. 
+ */ if (fail_fh->fd < 0) - return (EINVAL); + return (0); ret = close(fail_fh->fd); fail_fh->fd = -1; fail_file_handle_remove(session, fail_fh); @@ -198,7 +220,7 @@ fail_file_read(WT_FILE_HANDLE *file_handle, FAIL_FILE_HANDLE *fail_fh; FAIL_FILE_SYSTEM *fail_fs; WT_EXTENSION_API *wtext; - int64_t read_ops; + int64_t envint, read_ops; int ret; size_t chunk; ssize_t nr; @@ -207,19 +229,34 @@ fail_file_read(WT_FILE_HANDLE *file_handle, fail_fh = (FAIL_FILE_HANDLE *)file_handle; fail_fs = fail_fh->fail_fs; wtext = fail_fs->wtext; + read_ops = 0; ret = 0; fail_fs_lock(&fail_fs->lock); - read_ops = ++fail_fs->read_ops; + + if (fail_fs->use_environment) { + fail_fs_env(FAIL_FS_ENV_ENABLE, &envint); + if (envint != 0) { + if (!fail_fs->fail_enabled) { + fail_fs->fail_enabled = true; + fail_fs_env(FAIL_FS_ENV_READ_ALLOW, + &fail_fs->allow_reads); + fail_fs->read_ops = 0; + } + read_ops = ++fail_fs->read_ops; + } else + fail_fs->fail_enabled = false; + } else + read_ops = ++fail_fs->read_ops; + fail_fs_unlock(&fail_fs->lock); - if (fail_fs->allow_reads != 0 && read_ops % fail_fs->allow_reads == 0) { - (void)wtext->msg_printf(wtext, session, - "fail_fs: %s: simulated failure after %" PRId64 - " reads\n", fail_fh->iface.name, read_ops); - return (EIO); - } + if (fail_fs->fail_enabled && fail_fs->allow_reads != 0 && + read_ops % fail_fs->allow_reads == 0) + return (fail_fs_simulate_fail( + fail_fh, session, read_ops, "read")); + /* Break reads larger than 1GB into 1GB chunks. */ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; if ((nr = pread(fail_fh->fd, addr, chunk, offset)) <= 0) { @@ -262,7 +299,7 @@ fail_file_size( /* * fail_file_sync -- * Ensure the content of the file is stable. This is a no-op in our - * memory backed file system. + * file system. 
*/ static int fail_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *session) @@ -300,7 +337,7 @@ fail_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session, FAIL_FILE_HANDLE *fail_fh; FAIL_FILE_SYSTEM *fail_fs; WT_EXTENSION_API *wtext; - int64_t write_ops; + int64_t envint, write_ops; int ret; size_t chunk; ssize_t nr; @@ -309,19 +346,32 @@ fail_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session, fail_fh = (FAIL_FILE_HANDLE *)file_handle; fail_fs = fail_fh->fail_fs; wtext = fail_fs->wtext; + write_ops = 0; ret = 0; fail_fs_lock(&fail_fs->lock); - write_ops = ++fail_fs->write_ops; + + if (fail_fs->use_environment) { + fail_fs_env(FAIL_FS_ENV_ENABLE, &envint); + if (envint != 0) { + if (!fail_fs->fail_enabled) { + fail_fs->fail_enabled = true; + fail_fs_env(FAIL_FS_ENV_WRITE_ALLOW, + &fail_fs->allow_writes); + fail_fs->write_ops = 0; + } + write_ops = ++fail_fs->write_ops; + } else + fail_fs->fail_enabled = false; + } else + write_ops = ++fail_fs->write_ops; + fail_fs_unlock(&fail_fs->lock); - if (fail_fs->allow_writes != 0 && - write_ops % fail_fs->allow_writes == 0) { - (void)wtext->msg_printf(wtext, session, - "fail_fs: %s: simulated failure after %" PRId64 - " writes\n", fail_fh->iface.name, write_ops); - return (EIO); - } + if (fail_fs->fail_enabled && fail_fs->allow_writes != 0 && + write_ops % fail_fs->allow_writes == 0) + return (fail_fs_simulate_fail( + fail_fh, session, write_ops, "write")); /* Break writes larger than 1GB into 1GB chunks. 
*/ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { @@ -348,17 +398,12 @@ static bool fail_fs_arg(const char *match, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, int64_t *argp) { - char *s; - int64_t result; - if (strncmp(match, key->str, key->len) == 0 && - match[key->len] == '\0') { - s = (char *)value->str; - result = strtoll(s, &s, 10); - if ((size_t)(s - (char *)value->str) == value->len) { - *argp = result; - return (true); - } + match[key->len] == '\0' && + (value->type == WT_CONFIG_ITEM_BOOL || + value->type == WT_CONFIG_ITEM_NUM)) { + *argp = value->val; + return (true); } return (false); } @@ -453,6 +498,30 @@ fail_fs_directory_list_free(WT_FILE_SYSTEM *file_system, return (0); } +/* + * fail_fs_env -- + * If the name is in the environment, return its integral value. + */ +static void +fail_fs_env(const char *name, int64_t *valp) +{ + int64_t result; + char *s, *value; + + result = 0; + if ((value = getenv(name)) != NULL) { + s = value; + if (strcmp(value, "true") == 0) + result = 1; + else if (strcmp(value, "false") != 0) { + result = strtoll(value, &s, 10); + if (*s != '\0') + result = 0; + } + } + *valp = result; +} + /* * fail_fs_exist -- * Return if the file exists. @@ -482,7 +551,6 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, WT_FILE_HANDLE *file_handle; int fd, open_flags, ret; - (void)file_type; /* Unused */ (void)session; /* Unused */ *file_handlep = NULL; @@ -492,6 +560,9 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, fd = -1; ret = 0; + if (fail_fs->verbose) + fprintf(stderr, "fail_fs: open: %s\n", name); + fail_fs_lock(&fail_fs->lock); open_flags = 0; @@ -504,7 +575,14 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, else open_flags |= O_RDWR; - if ((fd = open(name, open_flags, 0666)) < 0) { + /* + * Opening a file handle on a directory is only to support filesystems + * that require a directory sync for durability. This is a no-op + * for this file system. 
+ */ + if (file_type == WT_FS_OPEN_FILE_TYPE_DIRECTORY) + fd = -1; + else if ((fd = open(name, open_flags, 0666)) < 0) { ret = errno; goto err; } @@ -587,6 +665,38 @@ fail_fs_rename(WT_FILE_SYSTEM *file_system, return (rename(from, to)); } +/* + * fail_fs_simulate_fail -- + * Simulate a failure from this file system by reporting it + * and returning a non-zero return code. + */ +static int +fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, + int64_t nops, const char *opkind) +{ + FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; + int btret, i; + void *bt[100]; + char **btstr; + + fail_fs = fail_fh->fail_fs; + if (fail_fs->verbose) { + wtext = fail_fs->wtext; + (void)wtext->msg_printf(wtext, session, + "fail_fs: %s: simulated failure after %" PRId64 + " %s operations\n", fail_fh->iface.name, nops, opkind); + btret = backtrace(bt, sizeof(bt)/sizeof(bt[0])); + if ((btstr = backtrace_symbols(bt, btret)) != NULL) { + for (i = 0; i < btret; i++) + (void)wtext->msg_printf(wtext, session, " %s", + btstr[i]); + free(btstr); + } + } + return (EIO); +} + /* * fail_fs_size -- * Get the size of a file in bytes, by file name. 
@@ -641,6 +751,7 @@ wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config) WT_CONFIG_PARSER *config_parser; WT_EXTENSION_API *wtext; WT_FILE_SYSTEM *file_system; + int64_t argval; int ret; ret = 0; @@ -663,9 +774,17 @@ wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config) goto err; } while ((ret = config_parser->next(config_parser, &k, &v)) == 0) { - if (fail_fs_arg("allow_writes", &k, &v, &fail_fs->allow_writes)) + if (fail_fs_arg("environment", &k, &v, &argval)) { + fail_fs->use_environment = (argval != 0); + continue; + } else if (fail_fs_arg("verbose", &k, &v, &argval)) { + fail_fs->verbose = (argval != 0); + continue; + } else if (fail_fs_arg("allow_writes", &k, &v, + &fail_fs->allow_writes)) continue; - if (fail_fs_arg("allow_reads", &k, &v, &fail_fs->allow_reads)) + else if (fail_fs_arg("allow_reads", &k, &v, + &fail_fs->allow_reads)) continue; (void)wtext->err_printf(wtext, NULL, @@ -687,6 +806,8 @@ wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config) wtext->strerror(wtext, NULL, ret)); goto err; } + if (fail_fs->allow_writes != 0 || fail_fs->allow_reads != 0) + fail_fs->fail_enabled = true; fail_fs_allocate_lock(&fail_fs->lock); /* Initialize the in-memory jump table. 
*/ diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index 5167b42b433..0158d0c96d1 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -37,6 +37,9 @@ noinst_PROGRAMS += test_wt2834_join_bloom_fix test_wt2853_perf_SOURCES = wt2853_perf/main.c noinst_PROGRAMS += test_wt2853_perf +test_wt2909_checkpoint_integrity_SOURCES = wt2909_checkpoint_integrity/main.c +noinst_PROGRAMS += test_wt2909_checkpoint_integrity + test_wt2999_join_extractor_SOURCES = wt2999_join_extractor/main.c noinst_PROGRAMS += test_wt2999_join_extractor diff --git a/test/csuite/wt2909_checkpoint_integrity/main.c b/test/csuite/wt2909_checkpoint_integrity/main.c new file mode 100644 index 00000000000..efc459ff271 --- /dev/null +++ b/test/csuite/wt2909_checkpoint_integrity/main.c @@ -0,0 +1,660 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +#include +#include +#include + +/* + * JIRA ticket reference: WT-2909 + * Test case description: + * + * This test attempts to check the integrity of checkpoints by injecting + * failures (by means of a custom file system) and then trying to recover. To + * insulate the top level program from various crashes that may occur when + * injecting failures, the "populate" code runs in another process, and is + * expected to sometimes fail. Then the top level program runs recovery (with + * the normal file system) and checks the results. Any failure at the top level + * indicates a checkpoint integrity problem. + * + * Each subtest uses the same kind of schema and data, the only variance is + * when the faults are injected. At the moment, this test only injects during + * checkpoints, and only injects write failures. It varies in the number of + * successful writes that occur before an injected failure (during a checkpoint + * operation), this can be indicated with "-o N". When N is not specified, the + * test attempts to find the optimal range of N for testing. Clearly when N is + * large, then the checkpoint may be successfully written, and the data + * represented by the checkpoint will be fully present. When N is small, + * nothing of interest is written and no data is present. To find the sweet + * spot where interesting failures occur, the test does a binary search to find + * the approximate N that divides the "small" and "large" cases. This is not + * strictly deterministic, a given N may give different results on different + * runs. But approximate optimal N can be determined, allowing a series of + * additional tests clustered around this N. 
+ * + * The data is stored in two tables, one having indices. Both tables have + * the same keys and are updated with the same key in a single transaction. + * + * Failure mode: + * If one table is out of step with the other, that is detected as a failure at + * the top level. If an index is missing values (or has extra values), that is + * likewise a failure at the top level. If the tables or the home directory + * cannot be opened, that is a top level error. The tables must be present + * as an initial checkpoint is done without any injected fault. + */ + +/* + * This program does not run on Windows. The non-portable aspects at minimum + * are fork/exec the use of environment variables (used by fail_fs), and file + * name and build locations of dynamically loaded libraries. + */ +#define BIG_SIZE (1024 * 10) +#define BIG_CONTENTS "" +#define MAX_ARGS 20 +#define MAX_OP_RANGE 1000 +#define STDERR_FILE "stderr.txt" +#define STDOUT_FILE "stdout.txt" +#define TESTS_PER_OP_VALUE 3 +#define VERBOSE_PRINT 10000 + +static int check_results(TEST_OPTS *, uint64_t *); +static void check_values(WT_CURSOR *, int, int, int, char *); +static int create_big_string(char **); +static void cursor_count_items(WT_CURSOR *, uint64_t *); +static void disable_failures(void); +static void enable_failures(uint64_t, uint64_t); +static void generate_key(uint32_t, int *); +static void generate_value(uint32_t, uint32_t, char *, int *, int *, int *, + char **); +static void run_check_subtest(TEST_OPTS *, const char *, uint64_t, bool, + uint64_t *); +static void run_check_subtest_range(TEST_OPTS *, const char *, bool); +static int run_process(TEST_OPTS *, const char *, char *[], int *); +static int subtest_main(int, char *[], bool); +static void subtest_populate(TEST_OPTS *, bool); +int main(int, char *[]); + +extern int __wt_optind; + +#define WT_FAIL_FS_LIB "../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so" + +/* + * check_results -- + * Check all the tables and verify the results. 
+ */ +static int +check_results(TEST_OPTS *opts, uint64_t *foundp) +{ + WT_CURSOR *maincur, *maincur2, *v0cur, *v1cur, *v2cur; + WT_SESSION *session; + uint64_t count, idxcount, nrecords; + uint32_t rndint; + int key, key_got, ret, v0, v1, v2; + char *bigref, *big; + + testutil_check(create_big_string(&bigref)); + nrecords = opts->nrecords; + testutil_check(wiredtiger_open(opts->home, NULL, + "create,log=(enabled)", &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + testutil_check(session->open_cursor(session, "table:subtest", NULL, + NULL, &maincur)); + testutil_check(session->open_cursor(session, "table:subtest2", NULL, + NULL, &maincur2)); + testutil_check(session->open_cursor(session, "index:subtest:v0", NULL, + NULL, &v0cur)); + testutil_check(session->open_cursor(session, "index:subtest:v1", NULL, + NULL, &v1cur)); + testutil_check(session->open_cursor(session, "index:subtest:v2", NULL, + NULL, &v2cur)); + + count = 0; + while ((ret = maincur->next(maincur)) == 0) { + testutil_check(maincur2->next(maincur2)); + testutil_check(maincur2->get_key(maincur2, &key_got)); + testutil_check(maincur2->get_value(maincur2, &rndint)); + + generate_key((uint32_t)count, &key); + generate_value(rndint, (uint32_t)count, + bigref, &v0, &v1, &v2, &big); + testutil_assert(key == key_got); + + /* Check the key/values in main table. */ + testutil_check(maincur->get_key(maincur, &key_got)); + testutil_assert(key == key_got); + check_values(maincur, v0, v1, v2, big); + + /* Check the values in the indices. 
*/ + v0cur->set_key(v0cur, v0); + testutil_check(v0cur->search(v0cur)); + check_values(v0cur, v0, v1, v2, big); + v1cur->set_key(v1cur, v1); + testutil_check(v1cur->search(v1cur)); + check_values(v1cur, v0, v1, v2, big); + v2cur->set_key(v2cur, v2); + testutil_check(v2cur->search(v2cur)); + check_values(v2cur, v0, v1, v2, big); + + count++; + if (count % VERBOSE_PRINT == 0 && opts->verbose) + printf("checked %" PRIu64 "/%" PRIu64 "\n", count, + nrecords); + } + if (count % VERBOSE_PRINT != 0 && opts->verbose) + printf("checked %" PRIu64 "/%" PRIu64 "\n", count, nrecords); + + /* + * Always expect at least one entry, as populate does a + * checkpoint after the first insert. + */ + testutil_assert(count > 0); + testutil_assert(ret == WT_NOTFOUND); + testutil_assert(maincur2->next(maincur2) == WT_NOTFOUND); + cursor_count_items(v0cur, &idxcount); + testutil_assert(count == idxcount); + cursor_count_items(v1cur, &idxcount); + testutil_assert(count == idxcount); + cursor_count_items(v2cur, &idxcount); + testutil_assert(count == idxcount); + + testutil_check(opts->conn->close(opts->conn, NULL)); + opts->conn = NULL; + + free(bigref); + *foundp = count; + return (0); +} + +/* + * check_values -- + * Check that the values in the cursor match the given values. + */ +static void +check_values(WT_CURSOR *cursor, int v0, int v1, int v2, char *big) +{ + int v0_got, v1_got, v2_got; + char *big_got; + + testutil_check(cursor->get_value(cursor, &v0_got, &v1_got, &v2_got, + &big_got)); + testutil_assert(v0 == v0_got); + testutil_assert(v1 == v1_got); + testutil_assert(v2 == v2_got); + testutil_assert(strcmp(big, big_got) == 0); +} + +/* + * create_big_string -- + * Create and fill the "reference" big array. 
+ */ +static int create_big_string(char **bigp) +{ + size_t i, mod; + char *big; + + if ((big = malloc(BIG_SIZE + 1)) == NULL) + return (ENOMEM); + mod = strlen(BIG_CONTENTS); + for (i = 0; i < BIG_SIZE; i++) { + big[i] = BIG_CONTENTS[i % mod]; + } + big[BIG_SIZE] = '\0'; + *bigp = big; + return (0); +} + +/* + * cursor_count_items -- + * Count the number of items in the table by traversing + * through the cursor. + */ +static void +cursor_count_items(WT_CURSOR *cursor, uint64_t *countp) +{ + int ret; + + *countp = 0; + + cursor->reset(cursor); + while ((ret = cursor->next(cursor)) == 0) + (*countp)++; + testutil_assert(ret == WT_NOTFOUND); +} + +/* + * disable_failures -- + * Disable failures in the fail file system. + */ +static void +disable_failures(void) +{ + setenv("WT_FAIL_FS_ENABLE", "0", 1); +} + +/* + * enable_failures -- + * Enable failures in the fail file system. + */ +static void +enable_failures(uint64_t allow_writes, uint64_t allow_reads) +{ + char value[100]; + + setenv("WT_FAIL_FS_ENABLE", "1", 1); + snprintf(value, sizeof(value), "%" PRIu64, allow_writes); + setenv("WT_FAIL_FS_WRITE_ALLOW", value, 1); + snprintf(value, sizeof(value), "%" PRIu64, allow_reads); + setenv("WT_FAIL_FS_READ_ALLOW", value, 1); +} + +/* + * generate_key -- + * Generate a key used by the "subtest" and "subtest2" tables. + */ +static void +generate_key(uint32_t i, int *keyp) +{ + *keyp = (int)i; +} + +/* + * generate_value -- + * Generate values for the "subtest" table. + */ +static void +generate_value(uint32_t rndint, uint32_t i, char *bigref, + int *v0p, int *v1p, int *v2p, char **bigp) +{ + *v0p = (int)(i * 7); + *v1p = (int)(i * 10007); + *v2p = (int)(i * 100000007); + *bigp = &bigref[rndint % BIG_SIZE]; +} + +/* + * run_check_subtest -- + * Run the subtest with the given parameters and check the results. 
+ */ +static void +run_check_subtest(TEST_OPTS *opts, const char *debugger, uint64_t nops, + bool close_test, uint64_t *nresultsp) +{ + int narg; + int estatus; + char rarg[20], sarg[20]; + char *subtest_args[MAX_ARGS]; + + narg = 0; + if (debugger != NULL) { + subtest_args[narg++] = (char *)debugger; + subtest_args[narg++] = (char *)"--"; + } + + subtest_args[narg++] = (char *)opts->progname; + /* "subtest" must appear before arguments */ + if (close_test) + subtest_args[narg++] = (char *)"subtest_close"; + else + subtest_args[narg++] = (char *)"subtest"; + subtest_args[narg++] = (char *)"-h"; + subtest_args[narg++] = opts->home; + subtest_args[narg++] = (char *)"-v"; /* subtest is always verbose */ + subtest_args[narg++] = (char *)"-p"; + subtest_args[narg++] = (char *)"-o"; + snprintf(sarg, sizeof(sarg), "%" PRIu64, nops); + subtest_args[narg++] = sarg; /* number of operations */ + subtest_args[narg++] = (char *)"-n"; + snprintf(rarg, sizeof(rarg), "%" PRIu64, opts->nrecords); + subtest_args[narg++] = rarg; /* number of records */ + subtest_args[narg++] = NULL; + testutil_assert(narg <= MAX_ARGS); + if (opts->verbose) + printf("running a separate process with %" PRIu64 + " operations until fail...\n", nops); + testutil_clean_work_dir(opts->home); + testutil_check(run_process( + opts, debugger != NULL ? debugger : opts->progname, + subtest_args, &estatus)); + if (opts->verbose) + printf("process exited %d\n", estatus); + + /* + * Verify results in parent process. + */ + testutil_check(check_results(opts, nresultsp)); +} + +/* + * run_check_subtest_range -- + * + * Run successive tests via binary search that determines the approximate + * crossover point between when data is recoverable or not. Once that is + * determined, run the subtest in a range near that crossover point. + * + * The theory is that running at the crossover point will tend to trigger + * "interesting" failures at the borderline when the checkpoint is about to, + * or has, succeeded. 
If any of those failures creates a WT home directory + * that cannot be recovered, the top level test will fail. + */ +static void +run_check_subtest_range(TEST_OPTS *opts, const char *debugger, bool close_test) +{ + uint64_t cutoff, high, low, mid, nops, nresults; + int i; + bool got_failure, got_success; + + if (opts->verbose) + printf("Determining best range of operations until failure, " + "with close_test %s.\n", + (close_test ? "enabled" : "disabled")); + + run_check_subtest(opts, debugger, 1, close_test, &cutoff); + low = 0; + high = MAX_OP_RANGE; + mid = (low + high) / 2; + while (mid != low) { + run_check_subtest(opts, debugger, mid, close_test, + &nresults); + if (nresults > cutoff) + high = mid; + else + low = mid; + mid = (low + high) / 2; + } + /* + * mid is the number of ops that is the crossover point. + * Run some tests near that point to try to trigger weird + * failures. If mid is too low or too high, it indicates + * there is a fundamental problem with the test. + */ + testutil_assert(mid > 1 && mid < MAX_OP_RANGE - 1); + if (opts->verbose) + printf("Retesting around %" PRIu64 " operations.\n", + mid); + + got_failure = false; + got_success = false; + for (nops = mid - 10; nops < mid + 10; nops++) { + for (i = 0; i < TESTS_PER_OP_VALUE; i++) { + run_check_subtest(opts, debugger, nops, + close_test, &nresults); + if (nresults > cutoff) + got_failure = true; + else + got_success = true; + } + } + /* + * Check that it really ran with a crossover point. + */ + testutil_assert(got_failure); + testutil_assert(got_success); +} + +/* + * run_process -- + * Run a program with arguments, wait until it completes. 
+ */ +static int +run_process(TEST_OPTS *opts, const char *prog, char *argv[], int *status) +{ + int pid; + + if (opts->verbose) { + printf("running: "); + for (char **arg = argv; *arg != NULL; arg++) + printf("%s ", *arg); + printf("\n"); + } + if ((pid = fork()) == 0) { + execv(prog, argv); + } else if (pid < 0) + return (errno); + + waitpid(pid, status, 0); + return (0); +} + +/* + * subtest_main -- + * The main program for the subtest + */ +static int +subtest_main(int argc, char *argv[], bool close_test) +{ + TEST_OPTS *opts, _opts; + WT_SESSION *session; + char config[1024], filename[1024]; + + opts = &_opts; + if (testutil_disable_long_tests()) + return (0); + memset(opts, 0, sizeof(*opts)); + + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + /* Redirect stderr, stdout. */ + sprintf(filename, "%s/%s", opts->home, STDERR_FILE); + freopen(filename, "a", stderr); + sprintf(filename, "%s/%s", opts->home, STDOUT_FILE); + freopen(filename, "a", stdout); + snprintf(config, sizeof(config), + "create,cache_size=250M,log=(enabled)," + "transaction_sync=(enabled,method=none),extensions=(" + WT_FAIL_FS_LIB + "=(early_load,config={environment=true,verbose=true})]"); + + testutil_check(wiredtiger_open(opts->home, NULL, config, &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + testutil_check(session->create(session, "table:subtest", + "key_format=i,value_format=iiiS," + "columns=(id,v0,v1,v2,big)")); + + testutil_check(session->create(session, "table:subtest2", + "key_format=i,value_format=i")); + + testutil_check(session->create(session, "index:subtest:v0", + "columns=(v0)")); + testutil_check(session->create(session, "index:subtest:v1", + "columns=(v1)")); + testutil_check(session->create(session, "index:subtest:v2", + "columns=(v2)")); + + testutil_check(session->close(session, NULL)); + + subtest_populate(opts, close_test); + + testutil_cleanup(opts); + + return (0); +} + 
+/* + * This macro is used as a substitute for testutil_check, except that it is + * aware of when a failure may be expected due to the effects of the fail_fs. + * This macro is used only in subtest_populate(), it uses local variables. + */ +#define CHECK(expr) { \ + int _ret; \ + _ret = expr; \ + if (_ret != 0) { \ + if (!failmode || \ + (_ret != WT_RUN_RECOVERY && _ret != EIO)) { \ + fprintf(stderr, " BAD RETURN %d for \"%s\"\n", \ + _ret, #expr); \ + testutil_check(_ret); \ + } else \ + failed = true; \ + } \ +} + +/* + * subtest_populate -- + * Populate the tables. + */ +static void +subtest_populate(TEST_OPTS *opts, bool close_test) +{ + WT_CURSOR *maincur, *maincur2; + WT_RAND_STATE rnd; + WT_SESSION *session; + uint64_t nrecords; + uint32_t i, rndint; + int key, v0, v1, v2; + char *big, *bigref; + bool failed, failmode; + + failmode = failed = false; + __wt_random_init_seed(NULL, &rnd); + CHECK(create_big_string(&bigref)); + nrecords = opts->nrecords; + + CHECK(opts->conn->open_session( + opts->conn, NULL, NULL, &session)); + + CHECK(session->open_cursor(session, "table:subtest", NULL, + NULL, &maincur)); + + CHECK(session->open_cursor(session, "table:subtest2", NULL, + NULL, &maincur2)); + + for (i = 0; i < nrecords && !failed; i++) { + rndint = __wt_random(&rnd); + generate_key(i, &key); + generate_value(rndint, i, bigref, &v0, &v1, &v2, &big); + CHECK(session->begin_transaction(session, NULL)); + maincur->set_key(maincur, key); + maincur->set_value(maincur, v0, v1, v2, big); + CHECK(maincur->insert(maincur)); + + maincur2->set_key(maincur2, key); + maincur2->set_value(maincur2, rndint); + CHECK(maincur2->insert(maincur2)); + CHECK(session->commit_transaction(session, NULL)); + + if (i == 0) + /* + * Force an initial checkpoint, that helps to + * distinguish a clear failure from just not running + * long enough. 
+ */ + CHECK(session->checkpoint(session, NULL)); + + if ((i + 1) % VERBOSE_PRINT == 0 && opts->verbose) + printf(" %d/%" PRIu64 "\n", (i + 1), nrecords); + /* Attempt to isolate the failures to checkpointing. */ + if (i == (nrecords/100)) { + enable_failures(opts->nops, 1000000); + failmode = true; /* CHECK should expect failures. */ + CHECK(session->checkpoint(session, NULL)); + failmode = false; + disable_failures(); + if (failed && opts->verbose) + printf("checkpoint failed (expected).\n"); + } + } + + /* + * Closing handles after an extreme fail is likely to cause + * cascading failures (or crashes), so recommended practice is + * to immediately exit. We're interested in testing both with + * and without the recommended practice. + */ + if (failed) { + if (!close_test) { + fprintf(stderr, "exit early.\n"); + exit(0); + } else + fprintf(stderr, "closing after failure.\n"); + } + + free(bigref); + CHECK(maincur->close(maincur)); + CHECK(maincur2->close(maincur2)); + CHECK(session->close(session, NULL)); +} + +/* + * main -- + * The main program for the test. When invoked with "subtest" + * argument, run the subtest. Otherwise, run a separate process + * for each needed subtest, and check the results. 
+ */ +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + uint64_t nresults; + const char *debugger; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + debugger = NULL; + + testutil_check(testutil_parse_opts(argc, argv, opts)); + argc -= __wt_optind; + argv += __wt_optind; + if (opts->nrecords == 0) + opts->nrecords = 50000; + + while (argc > 0) { + if (strcmp(argv[0], "subtest") == 0) + return (subtest_main(argc, argv, false)); + else if (strcmp(argv[0], "subtest_close") == 0) + return (subtest_main(argc, argv, true)); + else if (strcmp(argv[0], "gdb") == 0) + debugger = "/usr/bin/gdb"; + else + testutil_assert(false); + argc--; + argv++; + } + if (opts->verbose) { + printf("Number of operations until failure: %" PRIu64 + " (change with -o N)\n", opts->nops); + printf("Number of records: %" PRIu64 + " (change with -n N)\n", opts->nrecords); + } + if (opts->nops == 0) { + run_check_subtest_range(opts, debugger, false); + run_check_subtest_range(opts, debugger, true); + } else + run_check_subtest(opts, debugger, opts->nops, + opts->nrecords, &nresults); + + testutil_clean_work_dir(opts->home); + testutil_cleanup(opts); + + return (0); +} diff --git a/test/utility/misc.c b/test/utility/misc.c index 1491c9a6938..1ba08ddd77f 100644 --- a/test/utility/misc.c +++ b/test/utility/misc.c @@ -78,7 +78,7 @@ testutil_work_dir_from_path(char *buffer, size_t len, const char *dir) * Remove the work directory. 
*/ void -testutil_clean_work_dir(char *dir) +testutil_clean_work_dir(const char *dir) { size_t len; int ret; diff --git a/test/utility/test_util.h b/test/utility/test_util.h index f6a9cd68e02..489bbe18d87 100644 --- a/test/utility/test_util.h +++ b/test/utility/test_util.h @@ -183,7 +183,7 @@ void *dmalloc(size_t); void *drealloc(void *, size_t); void *dstrdup(const void *); void *dstrndup(const char *, size_t); -void testutil_clean_work_dir(char *); +void testutil_clean_work_dir(const char *); void testutil_cleanup(TEST_OPTS *); bool testutil_disable_long_tests(void); void testutil_make_work_dir(char *); -- cgit v1.2.1 From e66634960eeaf60d1b13c26308053e0baf51030b Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 14 Feb 2017 08:36:14 -0500 Subject: WT-2909 Create automatable test verifying checkpoint integrity after errors (#3295) * WT-2909 Create automatable test verifying checkpoint integrity after errors Make gcc 4.7 work again. * Linux (Red Hat 5.3.1-6) declares backtrace(3) to return an int, FreeBSD (10.3-RELEASE-p11) declares it to return a size_t. * Remove repeated #include files, check for error returns from a few functions. * The Linux/FreeBSD backtrace() calls are fundamentally incompatible, add an #ifdef. 
--- ext/test/fail_fs/fail_fs.c | 13 +++++++--- test/csuite/wt2909_checkpoint_integrity/main.c | 33 +++++++++++++------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index a6376ce203b..0ea4a7d5e00 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -95,8 +95,7 @@ static void fail_file_handle_remove(WT_SESSION *, FAIL_FILE_HANDLE *); static int fail_file_lock(WT_FILE_HANDLE *, WT_SESSION *, bool); static int fail_file_read( WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, void *); -static int fail_file_size( - WT_FILE_HANDLE *, WT_SESSION *, wt_off_t *); +static int fail_file_size(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t *); static int fail_file_sync(WT_FILE_HANDLE *, WT_SESSION *); static int fail_file_truncate(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t); static int fail_file_write( @@ -676,7 +675,11 @@ fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, { FAIL_FILE_SYSTEM *fail_fs; WT_EXTENSION_API *wtext; +#ifdef __linux__ int btret, i; +#else + size_t btret, i; +#endif void *bt[100]; char **btstr; @@ -686,7 +689,11 @@ fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, (void)wtext->msg_printf(wtext, session, "fail_fs: %s: simulated failure after %" PRId64 " %s operations\n", fail_fh->iface.name, nops, opkind); - btret = backtrace(bt, sizeof(bt)/sizeof(bt[0])); +#ifdef __linux__ + btret = backtrace(bt, (int)(sizeof(bt) / sizeof(bt[0]))); +#else + btret = backtrace(bt, sizeof(bt) / sizeof(bt[0])); +#endif if ((btstr = backtrace_symbols(bt, btret)) != NULL) { for (i = 0; i < btret; i++) (void)wtext->msg_printf(wtext, session, " %s", diff --git a/test/csuite/wt2909_checkpoint_integrity/main.c b/test/csuite/wt2909_checkpoint_integrity/main.c index efc459ff271..bf7f86cfd07 100644 --- a/test/csuite/wt2909_checkpoint_integrity/main.c +++ b/test/csuite/wt2909_checkpoint_integrity/main.c @@ -27,8 +27,6 @@ */ #include "test_util.h" 
-#include -#include #include /* @@ -243,7 +241,7 @@ cursor_count_items(WT_CURSOR *cursor, uint64_t *countp) *countp = 0; - cursor->reset(cursor); + testutil_check(cursor->reset(cursor)); while ((ret = cursor->next(cursor)) == 0) (*countp)++; testutil_assert(ret == WT_NOTFOUND); @@ -256,7 +254,7 @@ cursor_count_items(WT_CURSOR *cursor, uint64_t *countp) static void disable_failures(void) { - setenv("WT_FAIL_FS_ENABLE", "0", 1); + testutil_check(setenv("WT_FAIL_FS_ENABLE", "0", 1)); } /* @@ -268,11 +266,11 @@ enable_failures(uint64_t allow_writes, uint64_t allow_reads) { char value[100]; - setenv("WT_FAIL_FS_ENABLE", "1", 1); + testutil_check(setenv("WT_FAIL_FS_ENABLE", "1", 1)); snprintf(value, sizeof(value), "%" PRIu64, allow_writes); - setenv("WT_FAIL_FS_WRITE_ALLOW", value, 1); + testutil_check(setenv("WT_FAIL_FS_WRITE_ALLOW", value, 1)); snprintf(value, sizeof(value), "%" PRIu64, allow_reads); - setenv("WT_FAIL_FS_READ_ALLOW", value, 1); + testutil_check(setenv("WT_FAIL_FS_READ_ALLOW", value, 1)); } /* @@ -307,10 +305,8 @@ static void run_check_subtest(TEST_OPTS *opts, const char *debugger, uint64_t nops, bool close_test, uint64_t *nresultsp) { - int narg; - int estatus; - char rarg[20], sarg[20]; - char *subtest_args[MAX_ARGS]; + int estatus, narg; + char rarg[20], sarg[20], *subtest_args[MAX_ARGS]; narg = 0; if (debugger != NULL) { @@ -427,19 +423,21 @@ static int run_process(TEST_OPTS *opts, const char *prog, char *argv[], int *status) { int pid; + char **arg; if (opts->verbose) { printf("running: "); - for (char **arg = argv; *arg != NULL; arg++) + for (arg = argv; *arg != NULL; arg++) printf("%s ", *arg); printf("\n"); } if ((pid = fork()) == 0) { - execv(prog, argv); + (void)execv(prog, argv); + testutil_die(errno, "%s", prog); } else if (pid < 0) return (errno); - waitpid(pid, status, 0); + (void)waitpid(pid, status, 0); return (0); } @@ -464,9 +462,9 @@ subtest_main(int argc, char *argv[], bool close_test) /* Redirect stderr, stdout. 
*/ sprintf(filename, "%s/%s", opts->home, STDERR_FILE); - freopen(filename, "a", stderr); + testutil_assert(freopen(filename, "a", stderr) != NULL); sprintf(filename, "%s/%s", opts->home, STDOUT_FILE); - freopen(filename, "a", stdout); + testutil_assert(freopen(filename, "a", stdout) != NULL); snprintf(config, sizeof(config), "create,cache_size=250M,log=(enabled)," "transaction_sync=(enabled,method=none),extensions=(" @@ -572,7 +570,8 @@ subtest_populate(TEST_OPTS *opts, bool close_test) CHECK(session->checkpoint(session, NULL)); if ((i + 1) % VERBOSE_PRINT == 0 && opts->verbose) - printf(" %d/%" PRIu64 "\n", (i + 1), nrecords); + printf(" %" PRIu32 "/%" PRIu64 "\n", + (i + 1), nrecords); /* Attempt to isolate the failures to checkpointing. */ if (i == (nrecords/100)) { enable_failures(opts->nops, 1000000); -- cgit v1.2.1 From 152d4778f58fe8d9448c530c7cda07801499e8d7 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 14 Feb 2017 10:57:35 -0500 Subject: WT-2909 Create automatable test verifying checkpoint integrity after errors (#3296) FreeBSD's backtrace is the outlier, everybody else (OS X, Solaris, Linux) is using int types, not size_t. 
--- ext/test/fail_fs/fail_fs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index 0ea4a7d5e00..9445dbf9aca 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -675,10 +675,10 @@ fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, { FAIL_FILE_SYSTEM *fail_fs; WT_EXTENSION_API *wtext; -#ifdef __linux__ - int btret, i; -#else +#ifdef __FreeBSD__ size_t btret, i; +#else + int btret, i; #endif void *bt[100]; char **btstr; @@ -689,10 +689,10 @@ fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, (void)wtext->msg_printf(wtext, session, "fail_fs: %s: simulated failure after %" PRId64 " %s operations\n", fail_fh->iface.name, nops, opkind); -#ifdef __linux__ - btret = backtrace(bt, (int)(sizeof(bt) / sizeof(bt[0]))); -#else +#ifdef __FreeBSD__ btret = backtrace(bt, sizeof(bt) / sizeof(bt[0])); +#else + btret = backtrace(bt, (int)(sizeof(bt) / sizeof(bt[0]))); #endif if ((btstr = backtrace_symbols(bt, btret)) != NULL) { for (i = 0; i < btret; i++) -- cgit v1.2.1 From a6a0483f2b4f1617bc1aa1179685b74bad990290 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Tue, 14 Feb 2017 14:30:51 -0500 Subject: WT-3180 bug fix: disable long tests in the top-level main program, (#3298) rather than the subtest. Disable core files for the subtest, as they are rarely interesting. Fix some uint64 values/parameters that were declared as uint32. 
--- test/csuite/wt2909_checkpoint_integrity/main.c | 29 ++++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/test/csuite/wt2909_checkpoint_integrity/main.c b/test/csuite/wt2909_checkpoint_integrity/main.c index bf7f86cfd07..ddf249fb406 100644 --- a/test/csuite/wt2909_checkpoint_integrity/main.c +++ b/test/csuite/wt2909_checkpoint_integrity/main.c @@ -27,6 +27,8 @@ */ #include "test_util.h" +#include +#include #include /* @@ -87,8 +89,8 @@ static int create_big_string(char **); static void cursor_count_items(WT_CURSOR *, uint64_t *); static void disable_failures(void); static void enable_failures(uint64_t, uint64_t); -static void generate_key(uint32_t, int *); -static void generate_value(uint32_t, uint32_t, char *, int *, int *, int *, +static void generate_key(uint64_t, int *); +static void generate_value(uint32_t, uint64_t, char *, int *, int *, int *, char **); static void run_check_subtest(TEST_OPTS *, const char *, uint64_t, bool, uint64_t *); @@ -140,9 +142,8 @@ check_results(TEST_OPTS *opts, uint64_t *foundp) testutil_check(maincur2->get_key(maincur2, &key_got)); testutil_check(maincur2->get_value(maincur2, &rndint)); - generate_key((uint32_t)count, &key); - generate_value(rndint, (uint32_t)count, - bigref, &v0, &v1, &v2, &big); + generate_key(count, &key); + generate_value(rndint, count, bigref, &v0, &v1, &v2, &big); testutil_assert(key == key_got); /* Check the key/values in main table. */ @@ -278,7 +279,7 @@ enable_failures(uint64_t allow_writes, uint64_t allow_reads) * Generate a key used by the "subtest" and "subtest2" tables. */ static void -generate_key(uint32_t i, int *keyp) +generate_key(uint64_t i, int *keyp) { *keyp = (int)i; } @@ -288,7 +289,7 @@ generate_key(uint32_t i, int *keyp) * Generate values for the "subtest" table. 
*/ static void -generate_value(uint32_t rndint, uint32_t i, char *bigref, +generate_value(uint32_t rndint, uint64_t i, char *bigref, int *v0p, int *v1p, int *v2p, char **bigp) { *v0p = (int)(i * 7); @@ -451,12 +452,16 @@ subtest_main(int argc, char *argv[], bool close_test) TEST_OPTS *opts, _opts; WT_SESSION *session; char config[1024], filename[1024]; + struct rlimit rlim; - opts = &_opts; if (testutil_disable_long_tests()) return (0); + opts = &_opts; memset(opts, 0, sizeof(*opts)); + memset(&rlim, 0, sizeof(rlim)); + /* No core files during fault injection tests. */ + testutil_check(setrlimit(RLIMIT_CORE, &rlim)); testutil_check(testutil_parse_opts(argc, argv, opts)); testutil_make_work_dir(opts->home); @@ -527,8 +532,8 @@ subtest_populate(TEST_OPTS *opts, bool close_test) WT_CURSOR *maincur, *maincur2; WT_RAND_STATE rnd; WT_SESSION *session; - uint64_t nrecords; - uint32_t i, rndint; + uint64_t i, nrecords; + uint32_t rndint; int key, v0, v1, v2; char *big, *bigref; bool failed, failmode; @@ -570,7 +575,7 @@ subtest_populate(TEST_OPTS *opts, bool close_test) CHECK(session->checkpoint(session, NULL)); if ((i + 1) % VERBOSE_PRINT == 0 && opts->verbose) - printf(" %" PRIu32 "/%" PRIu64 "\n", + printf(" %" PRIu64 "/%" PRIu64 "\n", (i + 1), nrecords); /* Attempt to isolate the failures to checkpointing. */ if (i == (nrecords/100)) { @@ -617,6 +622,8 @@ main(int argc, char *argv[]) uint64_t nresults; const char *debugger; + if (testutil_disable_long_tests()) + return (0); opts = &_opts; memset(opts, 0, sizeof(*opts)); debugger = NULL; -- cgit v1.2.1 From a53bb9683b7f8e4fda3c6272ec8224857e756ba8 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Tue, 14 Feb 2017 16:30:53 -0500 Subject: WT-3179 test bug: clang sanitizer failure in fail_fs #3300 hold the fs lock while manipulating the list of file handles. 
--- ext/test/fail_fs/fail_fs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index 9445dbf9aca..cb87b43bfd9 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -156,11 +156,13 @@ static int fail_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) { FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; int ret; (void)session; /* Unused */ fail_fh = (FAIL_FILE_HANDLE *)file_handle; + fail_fs = fail_fh->fail_fs; /* * We don't actually open an fd when opening directories for flushing, @@ -170,14 +172,16 @@ fail_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) return (0); ret = close(fail_fh->fd); fail_fh->fd = -1; + fail_fs_lock(&fail_fs->lock); fail_file_handle_remove(session, fail_fh); + fail_fs_unlock(&fail_fs->lock); return (ret); } /* * fail_file_handle_remove -- * Destroy an in-memory file handle. Should only happen on remove or - * shutdown. + * shutdown. The file system lock must be held during this call. */ static void fail_file_handle_remove(WT_SESSION *session, FAIL_FILE_HANDLE *fail_fh) -- cgit v1.2.1 From 7a725a97d281095280515b0609f0e61747fd1b58 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Tue, 14 Feb 2017 17:21:07 -0500 Subject: WT-3179 test bug: clang sanitizer failure in fail_fs Replaced a fprintf call, and cleaned up a call to access system call. 
--- ext/test/fail_fs/fail_fs.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index cb87b43bfd9..d0d8a14c8c2 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -536,7 +536,7 @@ fail_fs_exist(WT_FILE_SYSTEM *file_system, (void)file_system; /* Unused */ (void)session; /* Unused */ - *existp = (access(name, 0) == 0); + *existp = (access(name, F_OK) == 0); return (0); } @@ -551,6 +551,7 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, { FAIL_FILE_HANDLE *fail_fh; FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; WT_FILE_HANDLE *file_handle; int fd, open_flags, ret; @@ -563,8 +564,11 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, fd = -1; ret = 0; - if (fail_fs->verbose) - fprintf(stderr, "fail_fs: open: %s\n", name); + if (fail_fs->verbose) { + wtext = fail_fs->wtext; + (void)wtext->msg_printf(wtext, session, "fail_fs: open: %s", + name); + } fail_fs_lock(&fail_fs->lock); @@ -692,7 +696,7 @@ fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, wtext = fail_fs->wtext; (void)wtext->msg_printf(wtext, session, "fail_fs: %s: simulated failure after %" PRId64 - " %s operations\n", fail_fh->iface.name, nops, opkind); + " %s operations", fail_fh->iface.name, nops, opkind); #ifdef __FreeBSD__ btret = backtrace(bt, sizeof(bt) / sizeof(bt[0])); #else -- cgit v1.2.1 From 70b5ab64d84cb8a22553def853ddb1a11393ff73 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Wed, 15 Feb 2017 18:08:10 +1100 Subject: WT-3149 Make random lookups for eviction more lightweight. (#3302) Eviction walks don't need to start on leaf pages: just try to descend through the tree and as soon as we can't swap to a child page, start the walk from the parent. 
--- src/btree/bt_random.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c index 3cc6838c4c8..44de511f787 100644 --- a/src/btree/bt_random.c +++ b/src/btree/bt_random.c @@ -166,7 +166,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) /* * __wt_random_descent -- - * Find a random leaf page in a tree. + * Find a random page in a tree for either sampling or eviction. */ int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) @@ -183,9 +183,11 @@ __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) retry = 100; /* Eviction should not be tapped to do eviction. */ - flags = WT_READ_RESTART_OK; if (eviction) - LF_SET(WT_READ_NO_EVICT); + flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | + WT_READ_NO_WAIT | WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK; + else + flags = WT_READ_RESTART_OK; if (0) { restart: /* @@ -205,6 +207,13 @@ restart: /* WT_INTL_INDEX_GET(session, page, pindex); entries = pindex->entries; + /* Eviction just wants any random child. */ + if (eviction) { + descent = pindex->index[ + __wt_random(&session->rnd) % entries]; + goto descend; + } + /* * There may be empty pages in the tree, and they're useless to * us. If we don't find a non-empty page in "entries" random @@ -212,10 +221,8 @@ restart: /* * search page contains nothing other than empty pages, restart * from the root some number of times before giving up. * - * Eviction is only looking for a place in the cache and so only - * wants in-memory pages (but a deleted page is fine); currently - * our other caller is looking for a key/value pair on a random - * leave page, and so will accept any page that contains a valid + * Random sampling is looking for a key/value pair on a random + * leaf page, and so will accept any page that contains a valid * key/value pair, so on-disk is fine, but deleted is not. 
*/ descent = NULL; @@ -223,15 +230,14 @@ restart: /* descent = pindex->index[__wt_random(&session->rnd) % entries]; if (descent->state == WT_REF_MEM || - (!eviction && descent->state == WT_REF_DISK)) + descent->state == WT_REF_DISK) break; } if (i == entries) for (i = 0; i < entries; ++i) { descent = pindex->index[i]; if (descent->state == WT_REF_MEM || - (!eviction && - descent->state == WT_REF_DISK)) + descent->state == WT_REF_DISK) break; } if (i == entries || descent == NULL) { @@ -249,17 +255,25 @@ restart: /* * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ - if ((ret = +descend: if ((ret = __wt_page_swap(session, current, descent, flags)) == 0) { current = descent; continue; } + if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART)) + break; if (ret == WT_RESTART) goto restart; return (ret); } - *refp = current; + /* + * There is no point starting with the root page: the walk will exit + * immediately. In that case we aren't holding a hazard pointer so + * there is nothing to release. + */ + if (!eviction || !__wt_ref_is_root(current)) + *refp = current; return (0); } -- cgit v1.2.1 From 83ce29217f0bebad1c0a86e4eb827a70216b4641 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Wed, 15 Feb 2017 16:38:07 -0500 Subject: WT-3186 Fix error path and panic detection in logging loops. 
(#3304) --- src/include/extern.h | 2 +- src/log/log.c | 6 +++++- src/log/log_slot.c | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/include/extern.h b/src/include/extern.h index 8e55077c2a9..19ad9a880df 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -406,7 +406,7 @@ extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bo extern int __wt_log_slot_new(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/log/log.c b/src/log/log.c index b07ef8c1bd5..d6caa55f8c7 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -2132,7 +2132,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_STAT_CONN_INCR(session, log_writes); - __wt_log_slot_join(session, rdup_len, flags, &myslot); + /* + * 
The only time joining a slot should ever return an error is if it + * detects a panic. + */ + WT_ERR(__wt_log_slot_join(session, rdup_len, flags, &myslot)); /* * If the addition of this record crosses the buffer boundary, * switch in a new slot. diff --git a/src/log/log_slot.c b/src/log/log_slot.c index d6e692f8c51..542f010ea53 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -160,6 +160,7 @@ retry: #endif if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) { while (slot->slot_unbuffered == 0) { + WT_RET(WT_SESSION_CHECK_PANIC(session)); __wt_yield(); #ifdef HAVE_DIAGNOSTIC ++count; @@ -464,7 +465,7 @@ __wt_log_slot_destroy(WT_SESSION_IMPL *session) * __wt_log_slot_join -- * Join a consolidated logging slot. */ -void +int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) { @@ -498,6 +499,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, } for (;;) { WT_BARRIER(); + WT_RET(WT_SESSION_CHECK_PANIC(session)); slot = log->active_slot; old_state = slot->slot_state; if (WT_LOG_SLOT_OPEN(old_state)) { @@ -555,6 +557,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, myslot->slot = slot; myslot->offset = join_offset; myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize); + return (0); } /* -- cgit v1.2.1 From 8a1adcc4a1c4c25e1270290a8eb21173f41e83a9 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Thu, 16 Feb 2017 00:21:26 -0500 Subject: WT-3184 bug fix: special case searching an index that has a custom collator. (#3303) In this case, we must use the entire (raw) key to duplicate the position, instead of truncating to the visible part. 
--- src/cursor/cur_index.c | 3 +- src/cursor/cur_std.c | 7 +- src/include/wiredtiger.in | 5 +- test/csuite/Makefile.am | 3 + test/csuite/wt3184_dup_index_collator/main.c | 168 +++++++++++++++++++++++++++ 5 files changed, 181 insertions(+), 5 deletions(-) create mode 100644 test/csuite/wt3184_dup_index_collator/main.c diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 13180efdea4..6fc01c0421f 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -245,7 +245,8 @@ __curindex_search(WT_CURSOR *cursor) * Custom collators expect to see complete keys, pass an item containing * all the visible fields so it unpacks correctly. */ - if (cindex->index->collator != NULL) + if (cindex->index->collator != NULL && + !F_ISSET(cursor, WT_CURSTD_RAW_SEARCH)) WT_ERR(__wt_struct_repack(session, child->key_format, cindex->iface.key_format, &child->key, &found_key)); else diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index 7ace6d49cf0..99a9e373354 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -633,6 +633,7 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) int __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) { + WT_DECL_RET; WT_ITEM key; /* @@ -662,9 +663,11 @@ __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) * cursors cannot reference application memory after cursor operations * and that requirement will save the day. */ - WT_RET(cursor->search(cursor)); + F_SET(cursor, WT_CURSTD_RAW_SEARCH); + ret = cursor->search(cursor); + F_CLR(cursor, WT_CURSTD_RAW_SEARCH); - return (0); + return (ret); } /* diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index d1e3d383396..c148e759299 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -576,8 +576,9 @@ struct __wt_cursor { #define WT_CURSTD_OPEN 0x00200 #define WT_CURSTD_OVERWRITE 0x00400 #define WT_CURSTD_RAW 0x00800 -#define WT_CURSTD_VALUE_EXT 0x01000 /* Value points out of the tree. 
*/ -#define WT_CURSTD_VALUE_INT 0x02000 /* Value points into the tree. */ +#define WT_CURSTD_RAW_SEARCH 0x01000 +#define WT_CURSTD_VALUE_EXT 0x02000 /* Value points out of the tree. */ +#define WT_CURSTD_VALUE_INT 0x04000 /* Value points into the tree. */ #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) uint32_t flags; #endif diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index 0158d0c96d1..e2b72532703 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -49,6 +49,9 @@ noinst_PROGRAMS += test_wt3120_filesys test_wt3135_search_near_collator_SOURCES = wt3135_search_near_collator/main.c noinst_PROGRAMS += test_wt3135_search_near_collator +test_wt3184_dup_index_collator_SOURCES = wt3184_dup_index_collator/main.c +noinst_PROGRAMS += test_wt3184_dup_index_collator + # Run this during a "make check" smoke test. TESTS = $(noinst_PROGRAMS) LOG_COMPILER = $(TEST_WRAPPER) diff --git a/test/csuite/wt3184_dup_index_collator/main.c b/test/csuite/wt3184_dup_index_collator/main.c new file mode 100644 index 00000000000..bcefd2f1a3b --- /dev/null +++ b/test/csuite/wt3184_dup_index_collator/main.c @@ -0,0 +1,168 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. 
We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +/* + * JIRA ticket reference: WT-3184 + * Test case description: Each set of data is ordered and contains + * five elements (0-4). We insert elements 1 and 3, and then do + * search_near and search for each element. For each set of data, we perform + * these tests first using a custom collator, and second using a custom collator + * and extractor. In each case there are index keys having variable length. + * Failure mode: In the reported test case, the custom compare routine is + * given a truncated key to compare, and the unpack functions return errors + * because the truncation appeared in the middle of a key. + */ + +static int +compare_int(int32_t a, int32_t b) +{ + return (a < b ? -1 : (a > b ? 
1 : 0)); +} + +static int32_t +item_to_int(WT_ITEM *item) +{ + testutil_assert(item->size == sizeof(int32_t)); + return (*(int32_t *)item->data); +} + +static int +compare_int_items(WT_ITEM *itema, WT_ITEM *itemb) +{ + testutil_assert(itema->size == sizeof(int32_t)); + testutil_assert(itemb->size == sizeof(int32_t)); + return (compare_int(item_to_int(itema), item_to_int(itemb))); +} + +static void +print_int_item(const char *str, const WT_ITEM *item) +{ + if (item->size > 0) { + testutil_assert(item->size == sizeof(int32_t)); + printf("%s%" PRId32, str, *(int32_t *)item->data); + } else + printf("%s", str); +} + +static int +index_compare(WT_COLLATOR *collator, WT_SESSION *session, + const WT_ITEM *key1, const WT_ITEM *key2, int *cmp) +{ + WT_ITEM ikey1, pkey1, ikey2, pkey2; + + (void)collator; + testutil_check(wiredtiger_struct_unpack(session, + key1->data, key1->size, "uu", &ikey1, &pkey1)); + testutil_check(wiredtiger_struct_unpack(session, + key2->data, key2->size, "uu", &ikey2, &pkey2)); + + print_int_item("index_compare: index key1 = ", &ikey1); + print_int_item(", primary key1 = ", &pkey1); + print_int_item(", index key2 = ", &ikey2); + print_int_item(", primary key2 = ", &pkey2); + printf("\n"); + + if ((*cmp = compare_int_items(&ikey1, &ikey2)) != 0) + return (0); + + if (pkey1.size != 0 && pkey2.size != 0) + *cmp = compare_int_items(&pkey1, &pkey2); + else if (pkey1.size != 0) + *cmp = 1; + else if (pkey2.size != 0) + *cmp = -1; + else + *cmp = 0; + + return (0); +} + +static WT_COLLATOR index_coll = { index_compare, NULL, NULL }; + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_CURSOR *cursor, *cursor1; + WT_ITEM got, k, v; + WT_SESSION *session; + int32_t ki, vi; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + testutil_check(wiredtiger_open(opts->home, NULL, "create", + &opts->conn)); + testutil_check( + 
opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + testutil_check(opts->conn->add_collator(opts->conn, "index_coll", + &index_coll, NULL)); + + testutil_check(session->create(session, + "table:main", "key_format=u,value_format=u,columns=(k,v)")); + testutil_check(session->create(session, + "index:main:index", "columns=(v),collator=index_coll")); + + printf("adding new record\n"); + testutil_check(session->open_cursor(session, "table:main", NULL, NULL, + &cursor)); + + ki = 13; + vi = 17; + + k.data = &ki; k.size = sizeof(ki); + v.data = &vi; v.size = sizeof(vi); + + cursor->set_key(cursor, &k); + cursor->set_value(cursor, &v); + testutil_check(cursor->insert(cursor)); + testutil_check(cursor->close(cursor)); + + printf("positioning index cursor\n"); + + testutil_check(session->open_cursor(session, "index:main:index", NULL, + NULL, &cursor)); + cursor->set_key(cursor, &v); + testutil_check(cursor->search(cursor)); + + printf("duplicating cursor\n"); + testutil_check(session->open_cursor(session, NULL, cursor, NULL, + &cursor1)); + cursor->get_value(cursor, &got); + testutil_assert(item_to_int(&got) == 17); + cursor1->get_value(cursor1, &got); + testutil_assert(item_to_int(&got) == 17); + + testutil_check(session->close(session, NULL)); + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} -- cgit v1.2.1 From 30036d415f83b4b376750bcc122ff8f43b829205 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Thu, 16 Feb 2017 21:38:32 -0500 Subject: WT-3188 More log loops needing to check panic. (#3307) --- src/log/log.c | 19 ++++++++++++++----- src/log/log_slot.c | 2 ++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/log/log.c b/src/log/log.c index d6caa55f8c7..3477ca52502 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -24,7 +24,7 @@ static int __log_write_internal( * __log_wait_for_earlier_slot -- * Wait for write_lsn to catch up to this slot. 
*/ -static void +static int __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { WT_CONNECTION_IMPL *conn; @@ -41,6 +41,7 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) * unlock in case an earlier thread is trying to switch its * slot and complete its operation. */ + WT_RET(WT_SESSION_CHECK_PANIC(session)); if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); __wt_cond_signal(session, conn->log_wrlsn_cond); @@ -51,6 +52,7 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_lock(session, &log->log_slot_lock); } + return (0); } /* @@ -70,7 +72,7 @@ __log_fs_write(WT_SESSION_IMPL *session, * be a hole at the end of the previous log file that we cannot detect. */ if (slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) { - __log_wait_for_earlier_slot(session, slot); + WT_RET(__log_wait_for_earlier_slot(session, slot)); WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn)); } if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0) @@ -110,6 +112,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) conn = S2C(session); log = conn->log; + WT_RET(WT_SESSION_CHECK_PANIC(session)); WT_RET(__wt_log_force_write(session, 1, NULL)); __wt_log_wrlsn(session, NULL); if (start) @@ -174,6 +177,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) * log file ready to close. */ while (log->sync_lsn.l.file < min_lsn->l.file) { + WT_RET(WT_SESSION_CHECK_PANIC(session)); __wt_cond_signal(session, S2C(session)->log_file_cond); __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); } @@ -1467,7 +1471,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) * be holes in the log file. 
*/ WT_STAT_CONN_INCR(session, log_release_write_lsn); - __log_wait_for_earlier_slot(session, slot); + WT_ERR(__log_wait_for_earlier_slot(session, slot)); log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; @@ -1488,6 +1492,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) * current fsync completes and advance log->sync_lsn. */ while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { + WT_ERR(WT_SESSION_CHECK_PANIC(session)); /* * We have to wait until earlier log files have finished their * sync operations. The most recent one will set the LSN to the @@ -2178,15 +2183,19 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, if (LF_ISSET(WT_LOG_FLUSH)) { /* Wait for our writes to reach the OS */ while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && - myslot.slot->slot_error == 0) + myslot.slot->slot_error == 0) { + WT_ERR(WT_SESSION_CHECK_PANIC(session)); __wt_cond_wait( session, log->log_write_cond, 10000, NULL); + } } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && - myslot.slot->slot_error == 0) + myslot.slot->slot_error == 0) { + WT_ERR(WT_SESSION_CHECK_PANIC(session)); __wt_cond_wait( session, log->log_sync_cond, 10000, NULL); + } } /* diff --git a/src/log/log_slot.c b/src/log/log_slot.c index 542f010ea53..b4655ff6c1a 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -220,6 +220,7 @@ __log_slot_switch_internal( if (slot != log->active_slot) return (0); + WT_RET(WT_SESSION_CHECK_PANIC(session)); /* * We may come through here multiple times if we were able to close * a slot but could not set up a new one. If we closed it already, @@ -582,6 +583,7 @@ __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) * was written rather than the beginning record of the slot. 
*/ while ((cur_offset = slot->slot_last_offset) < my_start) { + WT_RET(WT_SESSION_CHECK_PANIC(session)); /* * Set our offset if we are larger. */ -- cgit v1.2.1 From db4cfede16a49dfca37303e713ddb171c041a6b9 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Sat, 18 Feb 2017 08:53:32 +1100 Subject: WT-3187 Ramp up aggressiveness in reducing cache pool usage (#3306) * WT-3187 Ramp up aggressiveness in reducing cache pool usage We could get into situations where no participants looked like good candidates. Also put a failsafe into the balance loop, to ensure future failures to reduce usage won't lead to hang on shutdown. * KNF and wordsmithing. --- src/conn/conn_cache_pool.c | 56 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index 49b766f4602..ed078991581 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -418,8 +418,9 @@ static void __cache_pool_balance(WT_SESSION_IMPL *session, bool forward) { WT_CACHE_POOL *cp; - bool adjusted; uint64_t bump_threshold, highest; + int i; + bool adjusted; cp = __wt_process.cache_pool; adjusted = false; @@ -438,11 +439,17 @@ __cache_pool_balance(WT_SESSION_IMPL *session, bool forward) /* * Actively attempt to: - * - Reduce the amount allocated, if we are over the budget + * - Reduce the amount allocated, if we are over the budget. * - Increase the amount used if there is capacity and any pressure. + * Don't keep trying indefinitely, if we aren't succeeding in reducing + * the cache in use re-assessing the participants' states is necessary. + * We are also holding a lock across this process, which can slow + * participant shutdown if we spend a long time balancing. 
*/ - while (F_ISSET(cp, WT_CACHE_POOL_ACTIVE) && - F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) { + for (i = 0; + i < 2 * WT_CACHE_POOL_BUMP_THRESHOLD && + F_ISSET(cp, WT_CACHE_POOL_ACTIVE) && + F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN); i++) { __cache_pool_adjust( session, highest, bump_threshold, forward, &adjusted); /* @@ -565,7 +572,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, WT_CONNECTION_IMPL *entry; uint64_t adjustment, highest_percentile, pressure, reserved, smallest; u_int pct_full; - bool busy, pool_full, grow; + bool busy, decrease_ok, grow, pool_full; *adjustedp = false; cp = __wt_process.cache_pool; @@ -611,6 +618,34 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0) continue; + /* + * The bump threshold decreases as we try longer to balance + * the pool. Adjust how aggressively we free space from + * participants depending on how long we have been trying. + */ + decrease_ok = false; + /* + * Any participant is a candidate if we have been trying + * for long enough. + */ + if (bump_threshold == 0) + decrease_ok = true; + /* + * Participants that aren't doing application eviction and + * are showing a reasonable amount of usage are excluded + * even if we have been trying for a while. + */ + else if (bump_threshold < WT_CACHE_POOL_BUMP_THRESHOLD / 3 && + (!busy && highest > 1)) + decrease_ok = true; + /* + * Any participant that is proportionally less busy is a + * candidate from the first attempt. + */ + else if (highest > 1 && + pressure < WT_CACHE_POOL_REDUCE_THRESHOLD) + decrease_ok = true; + /* * If the entry is currently allocated less than the reserved * size, increase its allocation. 
This should only happen if: @@ -624,17 +659,12 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, * Conditions for reducing the amount of resources for an * entry: * - the pool is full, - * - application threads are not busy doing eviction already, * - this entry has more than the minimum amount of space in * use, - * - the read pressure in this entry is below the threshold, - * other entries need more cache, the entry has more than - * the minimum space and there is no available space in the - * pool. + * - it was determined that this slot is a good candidate */ - } else if (pool_full && !busy && - entry->cache_size > reserved && - pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) { + } else if (pool_full && + entry->cache_size > reserved && decrease_ok) { grow = false; /* * Don't drop the size down too much - or it can -- cgit v1.2.1 From c23fa74a5fcefd751532ed0357ee0b237d487ab2 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 20 Feb 2017 11:02:13 +1100 Subject: WT-3189 Fix a segfault in eviction random page search. (#3308) A NULL page could be encountered when traversing a tree that is being used by exclusive access. --- src/btree/bt_random.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c index 44de511f787..4c7ff861d26 100644 --- a/src/btree/bt_random.c +++ b/src/btree/bt_random.c @@ -201,6 +201,16 @@ restart: /* current = &btree->root; for (;;) { page = current->page; + /* + * When walking a tree for eviction, an exclusive operation may + * be in progress leaving the root page is not valid. Just give + * up in that case. + */ + if (page == NULL) { + WT_ASSERT(session, eviction); + break; + } + if (!WT_PAGE_IS_INTERNAL(page)) break; -- cgit v1.2.1 From acceacbab536b64d52a1f9ef2e6cbdd54a1996ef Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 20 Feb 2017 11:04:56 +1100 Subject: WT-3149 Use a range of eviction walk start points. 
(#3305) Choosing a random point isn't very efficient in append only workloads. --- src/evict/evict_lru.c | 51 ++++++++++++++++++++++++++++++++------------------- src/include/btree.h | 6 +++++- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 42fe4d4608e..07cf8542c53 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1654,31 +1654,36 @@ __evict_walk_file(WT_SESSION_IMPL *session, !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) min_pages *= 10; + walk_flags = + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; + /* * Choose a random point in the tree if looking for candidates in a * tree with no starting point set. This is mostly aimed at ensuring * eviction fairly visits all pages in trees with a lot of in-cache * content. */ - if (btree->evict_ref == NULL) { - /* Ensure internal pages indexes remain valid for our walk */ - WT_WITH_PAGE_INDEX(session, ret = - __wt_random_descent(session, &btree->evict_ref, true)); - WT_RET_NOTFOUND_OK(ret); - - /* - * Reverse the direction of the walk each time we start at a - * random point so both ends of the tree are equally likely to - * be visited. 
- */ - btree->evict_walk_reverse = !btree->evict_walk_reverse; - } - - walk_flags = - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; - - if (btree->evict_walk_reverse) + switch (btree->evict_walk_state) { + case WT_EVICT_WALK_NEXT: + break; + case WT_EVICT_WALK_PREV: FLD_SET(walk_flags, WT_READ_PREV); + break; + case WT_EVICT_WALK_RAND_PREV: + FLD_SET(walk_flags, WT_READ_PREV); + /* FALLTHROUGH */ + case WT_EVICT_WALK_RAND_NEXT: + if (btree->evict_ref == NULL) { + /* Ensure internal pages indexes remain valid */ + WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent( + session, &btree->evict_ref, true)); + WT_RET_NOTFOUND_OK(ret); + } + break; + default: + WT_RET_MSG(session, EINVAL, + "Invalid btree walk state encountered"); + } /* * Get some more eviction candidate pages, starting at the last saved @@ -1713,8 +1718,16 @@ __evict_walk_file(WT_SESSION_IMPL *session, pages_seen > min_pages && (pages_queued == 0 || (pages_seen / pages_queued) > (min_pages / target_pages)); - if (give_up) + if (give_up) { + /* + * Try a different walk start point next time if a + * walk gave up. 
+ */ + btree->evict_walk_state = + (btree->evict_walk_state + 1) % + WT_EVICT_WALK_MAX_LEGAL_VALUE; break; + } if (ref == NULL) { if (++restarts == 2) diff --git a/src/include/btree.h b/src/include/btree.h index d742310bf8f..976c1d2110c 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -141,7 +141,11 @@ struct __wt_btree { u_int evict_walk_skips; /* Number of walks skipped */ u_int evict_disabled; /* Eviction disabled count */ volatile uint32_t evict_busy; /* Count of threads in eviction */ - bool evict_walk_reverse; /* Walk direction */ + enum { + WT_EVICT_WALK_NEXT, WT_EVICT_WALK_PREV, + WT_EVICT_WALK_RAND_NEXT, WT_EVICT_WALK_RAND_PREV + } evict_walk_state; /* Eviction walk state */ +#define WT_EVICT_WALK_MAX_LEGAL_VALUE WT_EVICT_WALK_RAND_PREV + 1 enum { WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING -- cgit v1.2.1 From e7b2a53c33271598c9041eec8363c95ff37daa58 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 20 Feb 2017 15:17:24 +1100 Subject: WT-3149 Fix a compiler warning on OS X. I guess I shouldn't try to future proof. (#3309) src/evict/evict_lru.c:1683:2: error: default label in switch which covers all enumeration values [-Werror,-Wcovered-switch-default] --- src/evict/evict_lru.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 07cf8542c53..f1949a7c320 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1680,9 +1680,6 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_RET_NOTFOUND_OK(ret); } break; - default: - WT_RET_MSG(session, EINVAL, - "Invalid btree walk state encountered"); } /* -- cgit v1.2.1 From 1aaf7b2d54886e4d323f05dfa6e08d86d614ee1c Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 21 Feb 2017 07:08:11 -0500 Subject: WT-3191 lint (#3310) * WT-3191 lint Remove WT_UNUSED(session), session is used in the function. * Check returns from WT_CURSOR.get_value(). 
* Lots of the csuite test programs have "normal" output now, change the testutil_die() function to include a "FAILED" message so it's possible to figure it out. Make the program name a global so we can print it out on error, add a standard testutil_set_progname function to set the program name and call it from everywhere. * Lint is deeply saddened by mixing-and-matching enums and ints, use a switch statement instead of arithmetic operations. * Avoid enum arithmetic with minimal casting. This change only uses the enum for one switch. --- src/evict/evict_lru.c | 8 ++++---- src/include/btree.h | 27 ++++++++++++--------------- src/include/cache.h | 9 +++++++++ src/log/log_slot.c | 1 - test/bloom/test_bloom.c | 13 ++++--------- test/checkpoint/test_checkpoint.c | 17 +++++++---------- test/checkpoint/test_checkpoint.h | 1 - test/csuite/wt3184_dup_index_collator/main.c | 4 ++-- test/cursor_order/cursor_order.c | 6 +----- test/fops/t.c | 12 ++---------- test/format/config.c | 12 ++++++------ test/format/format.h | 2 -- test/format/ops.c | 4 ++-- test/format/t.c | 15 ++++----------- test/format/wts.c | 2 +- test/huge/huge.c | 7 +------ test/manydbs/manydbs.c | 7 ++----- test/readonly/readonly.c | 7 ++----- test/recovery/random-abort.c | 7 ++----- test/recovery/truncated-log.c | 6 +----- test/salvage/salvage.c | 7 +------ test/thread/t.c | 6 +----- test/utility/misc.c | 17 +++++++++++++++++ test/utility/parse_opts.c | 5 +---- test/utility/test_util.h | 5 ++++- 25 files changed, 86 insertions(+), 121 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index f1949a7c320..f07a823ff57 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1663,7 +1663,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, * eviction fairly visits all pages in trees with a lot of in-cache * content. 
*/ - switch (btree->evict_walk_state) { + switch ((WT_EVICT_WALK_START)btree->evict_start_type) { case WT_EVICT_WALK_NEXT: break; case WT_EVICT_WALK_PREV: @@ -1720,9 +1720,9 @@ __evict_walk_file(WT_SESSION_IMPL *session, * Try a different walk start point next time if a * walk gave up. */ - btree->evict_walk_state = - (btree->evict_walk_state + 1) % - WT_EVICT_WALK_MAX_LEGAL_VALUE; + btree->evict_start_type = + (btree->evict_start_type + 1) % + WT_EVICT_WALK_START_NUM; break; } diff --git a/src/include/btree.h b/src/include/btree.h index 976c1d2110c..2aa0e470f59 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -130,22 +130,19 @@ struct __wt_btree { uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ uint64_t write_gen; /* Write generation */ - uint64_t bytes_inmem; /* Cache bytes in memory. */ - uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ - uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */ - - WT_REF *evict_ref; /* Eviction thread's location */ - uint64_t evict_priority; /* Relative priority of cached pages */ - u_int evict_walk_period; /* Skip this many LRU walks */ - u_int evict_walk_saved; /* Saved walk skips for checkpoints */ - u_int evict_walk_skips; /* Number of walks skipped */ - u_int evict_disabled; /* Eviction disabled count */ + uint64_t bytes_inmem; /* Cache bytes in memory. */ + uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ + uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. 
*/ + + WT_REF *evict_ref; /* Eviction thread's location */ + uint64_t evict_priority; /* Relative priority of cached pages */ + u_int evict_walk_period; /* Skip this many LRU walks */ + u_int evict_walk_saved; /* Saved walk skips for checkpoints */ + u_int evict_walk_skips; /* Number of walks skipped */ + u_int evict_disabled; /* Eviction disabled count */ volatile uint32_t evict_busy; /* Count of threads in eviction */ - enum { - WT_EVICT_WALK_NEXT, WT_EVICT_WALK_PREV, - WT_EVICT_WALK_RAND_NEXT, WT_EVICT_WALK_RAND_PREV - } evict_walk_state; /* Eviction walk state */ -#define WT_EVICT_WALK_MAX_LEGAL_VALUE WT_EVICT_WALK_RAND_PREV + 1 + int evict_start_type; /* Start position for eviction walk + (see WT_EVICT_WALK_START). */ enum { WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING diff --git a/src/include/cache.h b/src/include/cache.h index abd5a1901f7..04920c3585a 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -18,6 +18,15 @@ #define WT_EVICT_MAX_TREES 1000 /* Maximum walk points */ +/* Ways to position when starting an eviction walk. */ +typedef enum { + WT_EVICT_WALK_NEXT, + WT_EVICT_WALK_PREV, + WT_EVICT_WALK_RAND_NEXT, + WT_EVICT_WALK_RAND_PREV +} WT_EVICT_WALK_START; +#define WT_EVICT_WALK_START_NUM (WT_EVICT_WALK_RAND_PREV + 1) + /* * WT_EVICT_ENTRY -- * Encapsulation of an eviction candidate. 
diff --git a/src/log/log_slot.c b/src/log/log_slot.c index b4655ff6c1a..c685aec3ffc 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -574,7 +574,6 @@ __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) wt_off_t cur_offset, my_start; int64_t my_size, rel_size; - WT_UNUSED(session); slot = myslot->slot; my_start = slot->slot_start_offset + myslot->offset; /* diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c index 67249ff887e..bef509e01d8 100644 --- a/test/bloom/test_bloom.c +++ b/test/bloom/test_bloom.c @@ -29,8 +29,6 @@ #include "test_util.h" static struct { - char *progname; /* Program name */ - WT_CONNECTION *wt_conn; /* WT_CONNECTION handle */ WT_SESSION *wt_session; /* WT_SESSION handle */ @@ -61,10 +59,7 @@ main(int argc, char *argv[]) { int ch; - if ((g.progname = strrchr(argv[0], DIR_DELIM)) == NULL) - g.progname = argv[0]; - else - ++g.progname; + (void)testutil_set_progname(argv); /* Set default configuration values. */ g.c_cache = 10; @@ -75,7 +70,7 @@ main(int argc, char *argv[]) g.c_srand = 3233456; /* Set values from the command line. */ - while ((ch = __wt_getopt(g.progname, argc, argv, "c:f:k:o:s:")) != EOF) + while ((ch = __wt_getopt(progname, argc, argv, "c:f:k:o:s:")) != EOF) switch (ch) { case 'c': /* Cache size */ g.c_cache = (u_int)atoi(__wt_optarg); @@ -128,7 +123,7 @@ setup(void) */ snprintf(config, sizeof(config), "create,error_prefix=\"%s\",cache_size=%" PRIu32 "MB,%s", - g.progname, g.c_cache, g.config_open == NULL ? "" : g.config_open); + progname, g.c_cache, g.config_open == NULL ? 
"" : g.config_open); testutil_check(wiredtiger_open(NULL, NULL, config, &conn)); @@ -246,7 +241,7 @@ populate_entries(void) void usage(void) { - fprintf(stderr, "usage: %s [-cfkos]\n", g.progname); + fprintf(stderr, "usage: %s [-cfkos]\n", progname); fprintf(stderr, "%s", "\t-c cache size\n" "\t-f number of bits per item\n" diff --git a/test/checkpoint/test_checkpoint.c b/test/checkpoint/test_checkpoint.c index 4998019ad8e..c7132b433d2 100644 --- a/test/checkpoint/test_checkpoint.c +++ b/test/checkpoint/test_checkpoint.c @@ -50,10 +50,7 @@ main(int argc, char *argv[]) char *working_dir; const char *config_open; - if ((g.progname = strrchr(argv[0], DIR_DELIM)) == NULL) - g.progname = argv[0]; - else - ++g.progname; + (void)testutil_set_progname(argv); config_open = NULL; ret = 0; @@ -68,7 +65,7 @@ main(int argc, char *argv[]) runs = 1; while ((ch = __wt_getopt( - g.progname, argc, argv, "c:C:h:k:l:n:r:t:T:W:")) != EOF) + progname, argc, argv, "c:C:h:k:l:n:r:t:T:W:")) != EOF) switch (ch) { case 'c': g.checkpoint_name = __wt_optarg; @@ -132,7 +129,7 @@ main(int argc, char *argv[]) testutil_work_dir_from_path(g.home, 512, working_dir); - printf("%s: process %" PRIu64 "\n", g.progname, (uint64_t)getpid()); + printf("%s: process %" PRIu64 "\n", progname, (uint64_t)getpid()); for (cnt = 1; (runs == 0 || cnt <= runs) && g.status == 0; ++cnt) { printf(" %d: %d workers, %d tables\n", cnt, g.nworkers, g.ntables); @@ -204,7 +201,7 @@ wt_connect(const char *config_open) snprintf(config, sizeof(config), "create,statistics=(fast),error_prefix=\"%s\",cache_size=1GB%s%s", - g.progname, + progname, config_open == NULL ? "" : ",", config_open == NULL ? 
"" : config_open); @@ -297,10 +294,10 @@ log_print_err(const char *m, int e, int fatal) g.running = 0; g.status = e; } - fprintf(stderr, "%s: %s: %s\n", g.progname, m, wiredtiger_strerror(e)); + fprintf(stderr, "%s: %s: %s\n", progname, m, wiredtiger_strerror(e)); if (g.logfp != NULL) fprintf(g.logfp, "%s: %s: %s\n", - g.progname, m, wiredtiger_strerror(e)); + progname, m, wiredtiger_strerror(e)); return (e); } @@ -333,7 +330,7 @@ usage(void) "usage: %s " "[-S] [-C wiredtiger-config] [-k keys] [-l log]\n\t" "[-n ops] [-c checkpoint] [-r runs] [-t f|r|v] [-W workers]\n", - g.progname); + progname); fprintf(stderr, "%s", "\t-C specify wiredtiger_open configuration arguments\n" "\t-c checkpoint name to used named checkpoints\n" diff --git a/test/checkpoint/test_checkpoint.h b/test/checkpoint/test_checkpoint.h index 0d0d02447d5..347bd2c6e89 100644 --- a/test/checkpoint/test_checkpoint.h +++ b/test/checkpoint/test_checkpoint.h @@ -58,7 +58,6 @@ typedef struct { u_int nkeys; /* Keys to load */ u_int nops; /* Operations per thread */ FILE *logfp; /* Message log file. 
*/ - char *progname; /* Program name */ int nworkers; /* Number workers configured */ int ntables; /* Number tables configured */ int ntables_created; /* Number tables opened */ diff --git a/test/csuite/wt3184_dup_index_collator/main.c b/test/csuite/wt3184_dup_index_collator/main.c index bcefd2f1a3b..c969e7a1d7e 100644 --- a/test/csuite/wt3184_dup_index_collator/main.c +++ b/test/csuite/wt3184_dup_index_collator/main.c @@ -157,9 +157,9 @@ main(int argc, char *argv[]) printf("duplicating cursor\n"); testutil_check(session->open_cursor(session, NULL, cursor, NULL, &cursor1)); - cursor->get_value(cursor, &got); + testutil_check(cursor->get_value(cursor, &got)); testutil_assert(item_to_int(&got) == 17); - cursor1->get_value(cursor1, &got); + testutil_check(cursor1->get_value(cursor1, &got)); testutil_assert(item_to_int(&got) == 17); testutil_check(session->close(session, NULL)); diff --git a/test/cursor_order/cursor_order.c b/test/cursor_order/cursor_order.c index 85b8c68e545..62777f552bf 100644 --- a/test/cursor_order/cursor_order.c +++ b/test/cursor_order/cursor_order.c @@ -29,7 +29,6 @@ #include "cursor_order.h" static char home[512]; /* Program working dir */ -static char *progname; /* Program name */ static FILE *logfp; /* Log file */ static int handle_error(WT_EVENT_HANDLER *, WT_SESSION *, int, const char *); @@ -51,10 +50,7 @@ main(int argc, char *argv[]) int ch, cnt, runs; char *config_open, *working_dir; - if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) - progname = argv[0]; - else - ++progname; + (void)testutil_set_progname(argv); cfg = &_cfg; config_open = NULL; diff --git a/test/fops/t.c b/test/fops/t.c index 7b4a7cf8fca..651d22c8deb 100644 --- a/test/fops/t.c +++ b/test/fops/t.c @@ -34,7 +34,6 @@ u_int nops; /* Operations */ const char *uri; /* Object */ const char *config; /* Object config */ -static char *progname; /* Program name */ static FILE *logfp; /* Log file */ static char home[512]; @@ -71,22 +70,15 @@ main(int argc, char *argv[]) int ch, 
cnt, ret, runs; char *config_open, *working_dir; - working_dir = NULL; - - /* Remove directories */ - if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) - progname = argv[0]; - else - ++progname; + (void)testutil_set_progname(argv); if ((ret = pthread_rwlock_init(&single, NULL)) != 0) testutil_die(ret, "pthread_rwlock_init: single"); - config_open = NULL; nops = 1000; nthreads = 10; runs = 1; - + config_open = working_dir = NULL; while ((ch = __wt_getopt(progname, argc, argv, "C:h:l:n:r:t:")) != EOF) switch (ch) { case 'C': /* wiredtiger_open config */ diff --git a/test/format/config.c b/test/format/config.c index 50430fe073e..958ad6b7a99 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -104,7 +104,7 @@ config_setup(void) if (DATASOURCE("lsm") && g.type != ROW) { fprintf(stderr, "%s: lsm data_source is only compatible with row file_type\n", - g.progname); + progname); exit(EXIT_FAILURE); } @@ -681,7 +681,7 @@ config_single(const char *s, int perm) if ((ep = strchr(s, '=')) == NULL) { fprintf(stderr, - "%s: %s: illegal configuration value\n", g.progname, s); + "%s: %s: illegal configuration value\n", progname, s); exit(EXIT_FAILURE); } @@ -751,20 +751,20 @@ config_single(const char *s, int perm) v = strtol(ep, &p, 10); if (*p != '\0') { fprintf(stderr, "%s: %s: illegal numeric value\n", - g.progname, s); + progname, s); exit(EXIT_FAILURE); } } if (F_ISSET(cp, C_BOOL)) { if (v != 0 && v != 1) { fprintf(stderr, "%s: %s: value of boolean not 0 or 1\n", - g.progname, s); + progname, s); exit(EXIT_FAILURE); } } else if (v < cp->min || v > cp->maxset) { fprintf(stderr, "%s: %s: value outside min/max values of %" PRIu32 "-%" PRIu32 "\n", - g.progname, s, cp->min, cp->maxset); + progname, s, cp->min, cp->maxset); exit(EXIT_FAILURE); } *cp->v = (uint32_t)v; @@ -883,7 +883,7 @@ config_find(const char *s, size_t len) return (cp); fprintf(stderr, - "%s: %s: unknown configuration keyword\n", g.progname, s); + "%s: %s: unknown configuration keyword\n", progname, 
s); config_error(); exit(EXIT_FAILURE); } diff --git a/test/format/format.h b/test/format/format.h index 6bb44410acc..41cc48c4278 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -79,8 +79,6 @@ #define FORMAT_OPERATION_REPS 3 /* 3 thread operations sets */ typedef struct { - char *progname; /* Program name */ - char *home; /* Home directory */ char *home_backup; /* Hot-backup directory */ char *home_backup_init; /* Initialize backup command */ diff --git a/test/format/ops.c b/test/format/ops.c index 940318c87a9..1013d1da30b 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -1448,7 +1448,7 @@ notfound_chk(const char *f, int wt_ret, int bdb_notfound, uint64_t keyno) return (1); if (bdb_notfound) { - fprintf(stderr, "%s: %s:", g.progname, f); + fprintf(stderr, "%s: %s:", progname, f); if (keyno != 0) fprintf(stderr, " row %" PRIu64 ":", keyno); fprintf(stderr, @@ -1456,7 +1456,7 @@ notfound_chk(const char *f, int wt_ret, int bdb_notfound, uint64_t keyno) testutil_die(0, NULL); } if (wt_ret == WT_NOTFOUND) { - fprintf(stderr, "%s: %s:", g.progname, f); + fprintf(stderr, "%s: %s:", progname, f); if (keyno != 0) fprintf(stderr, " row %" PRIu64 ":", keyno); fprintf(stderr, diff --git a/test/format/t.c b/test/format/t.c index 7701595776c..c6686ae8b91 100644 --- a/test/format/t.c +++ b/test/format/t.c @@ -49,14 +49,7 @@ main(int argc, char *argv[]) config = NULL; -#ifdef _WIN32 - g.progname = "t_format.exe"; -#else - if ((g.progname = strrchr(argv[0], DIR_DELIM)) == NULL) - g.progname = argv[0]; - else - ++g.progname; -#endif + (void)testutil_set_progname(argv); #if 0 /* Configure the GNU malloc for debugging. 
*/ @@ -74,7 +67,7 @@ main(int argc, char *argv[]) home = NULL; onerun = 0; while ((ch = __wt_getopt( - g.progname, argc, argv, "1C:c:H:h:Llqrt:")) != EOF) + progname, argc, argv, "1C:c:H:h:Llqrt:")) != EOF) switch (ch) { case '1': /* One run */ onerun = 1; @@ -179,7 +172,7 @@ main(int argc, char *argv[]) testutil_check(pthread_rwlock_init(&g.checkpoint_lock, NULL)); testutil_check(pthread_rwlock_init(&g.death_lock, NULL)); - printf("%s: process %" PRIdMAX "\n", g.progname, (intmax_t)getpid()); + printf("%s: process %" PRIdMAX "\n", progname, (intmax_t)getpid()); while (++g.run_cnt <= g.c_runs || g.c_runs == 0 ) { startup(); /* Start a run */ @@ -344,7 +337,7 @@ usage(void) "usage: %s [-1Llqr] [-C wiredtiger-config]\n " "[-c config-file] [-H mount] [-h home] " "[name=value ...]\n", - g.progname); + progname); fprintf(stderr, "%s", "\t-1 run once\n" "\t-C specify wiredtiger_open configuration arguments\n" diff --git a/test/format/wts.c b/test/format/wts.c index da234ce53c7..a87aa5b9f88 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -144,7 +144,7 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) "cache_size=%" PRIu32 "MB," "checkpoint_sync=false," "error_prefix=\"%s\"", - g.c_cache, g.progname); + g.c_cache, progname); /* In-memory configuration. 
*/ if (g.c_in_memory != 0) diff --git a/test/huge/huge.c b/test/huge/huge.c index 17e2db353d5..2b0d5f498e3 100644 --- a/test/huge/huge.c +++ b/test/huge/huge.c @@ -29,7 +29,6 @@ #include "test_util.h" static char home[512]; /* Program working dir */ -static const char *progname; /* Program name */ static uint8_t *big; /* Big key/value buffer */ #define GIGABYTE (1073741824) @@ -167,14 +166,10 @@ main(int argc, char *argv[]) int ch, small; char *working_dir; - if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) - progname = argv[0]; - else - ++progname; + (void)testutil_set_progname(argv); small = 0; working_dir = NULL; - while ((ch = __wt_getopt(progname, argc, argv, "h:s")) != EOF) switch (ch) { case 'h': diff --git a/test/manydbs/manydbs.c b/test/manydbs/manydbs.c index 7e986d47af3..345c470ba90 100644 --- a/test/manydbs/manydbs.c +++ b/test/manydbs/manydbs.c @@ -32,7 +32,6 @@ #define HOME_BASE "WT_TEST" static char home[HOME_SIZE]; /* Base home directory */ static char hometmp[HOME_SIZE]; /* Each conn home directory */ -static const char *progname; /* Program name */ static const char * const uri = "table:main"; #define WTOPEN_CFG_COMMON \ @@ -129,10 +128,8 @@ main(int argc, char *argv[]) const char *working_dir, *wt_cfg; char cmd[128]; - if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) - progname = argv[0]; - else - ++progname; + (void)testutil_set_progname(argv); + dbs = MAX_DBS; working_dir = HOME_BASE; idle = false; diff --git a/test/readonly/readonly.c b/test/readonly/readonly.c index a4b79f5859f..746aecbf6c5 100644 --- a/test/readonly/readonly.c +++ b/test/readonly/readonly.c @@ -39,7 +39,6 @@ static char home_rd[HOME_SIZE + sizeof(HOME_RD_SUFFIX)]; #define HOME_RD2_SUFFIX ".RDNOLOCK" /* Read-only dir no lock file */ static char home_rd2[HOME_SIZE + sizeof(HOME_RD2_SUFFIX)]; -static const char *progname; /* Program name */ static const char *saved_argv0; /* Program command */ static const char * const uri = "table:main"; @@ -172,10 +171,8 @@ main(int 
argc, char *argv[]) char cmd[512]; uint8_t buf[MAX_VAL]; - if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) - progname = argv[0]; - else - ++progname; + (void)testutil_set_progname(argv); + /* * Needed unaltered for system command later. */ diff --git a/test/recovery/random-abort.c b/test/recovery/random-abort.c index 660ef0cca67..1d6599ce1b3 100644 --- a/test/recovery/random-abort.c +++ b/test/recovery/random-abort.c @@ -32,7 +32,7 @@ #include static char home[1024]; /* Program working dir */ -static const char *progname; /* Program name */ + /* * These two names for the URI and file system must be maintained in tandem. */ @@ -229,10 +229,7 @@ main(int argc, char *argv[]) const char *working_dir; char fname[64], kname[64], statname[1024]; - if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) - progname = argv[0]; - else - ++progname; + (void)testutil_set_progname(argv); inmem = false; nth = MIN_TH; diff --git a/test/recovery/truncated-log.c b/test/recovery/truncated-log.c index 6a142b8e710..1f0a0f7a7bd 100644 --- a/test/recovery/truncated-log.c +++ b/test/recovery/truncated-log.c @@ -36,7 +36,6 @@ #endif static char home[1024]; /* Program working dir */ -static const char *progname; /* Program name */ static const char * const uri = "table:main"; #define RECORDS_FILE "records" @@ -271,10 +270,7 @@ main(int argc, char *argv[]) pid_t pid; const char *working_dir; - if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) - progname = argv[0]; - else - ++progname; + (void)testutil_set_progname(argv); working_dir = "WT_TEST.truncated-log"; while ((ch = __wt_getopt(progname, argc, argv, "h:")) != EOF) diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c index b8553bbd72d..942f7faba03 100644 --- a/test/salvage/salvage.c +++ b/test/salvage/salvage.c @@ -54,8 +54,6 @@ void run(int); void t(int, u_int, int); int usage(void); -static const char *progname; /* Program name */ - static FILE *res_fp; /* Results file */ static u_int page_type; /* File types */ static 
int value_unique; /* Values are unique */ @@ -70,10 +68,7 @@ main(int argc, char *argv[]) u_int ptype; int ch, r; - if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) - progname = argv[0]; - else - ++progname; + (void)testutil_set_progname(argv); r = 0; ptype = 0; diff --git a/test/thread/t.c b/test/thread/t.c index baadbf2adb9..9dfd02bdad2 100644 --- a/test/thread/t.c +++ b/test/thread/t.c @@ -37,7 +37,6 @@ int multiple_files; /* File per thread */ int session_per_op; /* New session per operation */ static char home[512]; /* Program working dir */ -static char *progname; /* Program name */ static FILE *logfp; /* Log file */ static int handle_error(WT_EVENT_HANDLER *, WT_SESSION *, int, const char *); @@ -59,10 +58,7 @@ main(int argc, char *argv[]) int ch, cnt, runs; char *config_open, *working_dir; - if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) - progname = argv[0]; - else - ++progname; + (void)testutil_set_progname(argv); config_open = NULL; working_dir = NULL; diff --git a/test/utility/misc.c b/test/utility/misc.c index 1ba08ddd77f..8aee9d16f66 100644 --- a/test/utility/misc.c +++ b/test/utility/misc.c @@ -28,6 +28,7 @@ #include "test_util.h" void (*custom_die)(void) = NULL; +const char *progname = "program name not set"; /* * die -- @@ -42,7 +43,9 @@ testutil_die(int e, const char *fmt, ...) if (custom_die != NULL) (*custom_die)(); + fprintf(stderr, "%s: FAILED", progname); if (fmt != NULL) { + fprintf(stderr, ": "); va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); @@ -54,6 +57,20 @@ testutil_die(int e, const char *fmt, ...) exit(EXIT_FAILURE); } +/* + * testutil_set_progname -- + * Set the global program name for error handling. + */ +const char * +testutil_set_progname(char * const *argv) +{ + if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) + progname = argv[0]; + else + ++progname; + return (progname); +} + /* * testutil_work_dir_from_path -- * Takes a buffer, its size and the intended work directory. 
diff --git a/test/utility/parse_opts.c b/test/utility/parse_opts.c index 74a1c021d5d..af9256b199a 100644 --- a/test/utility/parse_opts.c +++ b/test/utility/parse_opts.c @@ -43,10 +43,7 @@ testutil_parse_opts(int argc, char * const *argv, TEST_OPTS *opts) opts->running = true; opts->verbose = false; - if ((opts->progname = strrchr(argv[0], DIR_DELIM)) == NULL) - opts->progname = argv[0]; - else - ++opts->progname; + opts->progname = testutil_set_progname(argv); while ((ch = __wt_getopt(opts->progname, argc, argv, "A:h:n:o:pR:T:t:vW:")) != EOF) diff --git a/test/utility/test_util.h b/test/utility/test_util.h index 489bbe18d87..34829d06f6b 100644 --- a/test/utility/test_util.h +++ b/test/utility/test_util.h @@ -48,7 +48,7 @@ /* Generic option parsing structure shared by all test cases. */ typedef struct { char *home; - char *progname; + const char *progname; enum { TABLE_COL=1, /* Fixed-length column store */ TABLE_FIX=2, /* Variable-length column store */ TABLE_ROW=3 /* Row-store */ @@ -192,3 +192,6 @@ void testutil_work_dir_from_path(char *, size_t, const char *); void *thread_append(void *); void *thread_insert_append(void *); void *thread_prev(void *); + +extern const char *progname; +const char *testutil_set_progname(char * const *); -- cgit v1.2.1 From 774c4c208850622f1d908ff0b08bd812b459f59e Mon Sep 17 00:00:00 2001 From: Sasha Fedorova Date: Wed, 22 Feb 2017 22:14:45 +0000 Subject: Reduced the time that the eviction server has to wait for every data point measuring the eviction rate when it is making the tuning decisions. I observed performance improvements across the board from this change. 
--- src/evict/evict_lru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index f07a823ff57..d16594b0816 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -921,7 +921,7 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) #define EVICT_TUNE_DATAPT_MIN 3 /* Data points needed before deciding if we should keep adding workers or settle on an earlier value. */ -#define EVICT_TUNE_PERIOD 2 /* Tune period in seconds */ +#define EVICT_TUNE_PERIOD 1 /* Tune period in seconds */ /* * __evict_tune_workers -- -- cgit v1.2.1 From e4146723a89a5f318515f8ee2b662a4bb7d0b919 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 27 Feb 2017 19:16:41 -0500 Subject: WT-3182 Switch make-check to run the short test suite by default (#3313) --- test/checkpoint/smoke.sh | 4 ++-- test/csuite/wt2246_col_append/main.c | 5 +++-- test/csuite/wt2323_join_visibility/main.c | 5 +++-- test/csuite/wt2535_insert_race/main.c | 5 +++-- test/csuite/wt2834_join_bloom_fix/main.c | 6 +++--- test/csuite/wt2853_perf/main.c | 6 +++--- test/csuite/wt2909_checkpoint_integrity/main.c | 26 ++++++++++++-------------- test/utility/misc.c | 15 ++++++++++----- test/utility/test_util.h | 2 +- 9 files changed, 40 insertions(+), 34 deletions(-) diff --git a/test/checkpoint/smoke.sh b/test/checkpoint/smoke.sh index 123d4e00df5..39b1f428c2c 100755 --- a/test/checkpoint/smoke.sh +++ b/test/checkpoint/smoke.sh @@ -6,8 +6,8 @@ set -e echo "checkpoint: 3 mixed tables" $TEST_WRAPPER ./t -T 3 -t m -# We are done if short tests are requested -test -z "$TESTUTIL_DISABLE_LONG_TESTS" || exit 0 +# We are done unless long tests are enabled. 
+test "$TESTUTIL_ENABLE_LONG_TESTS" = "1" || exit 0 echo "checkpoint: 6 column-store tables" $TEST_WRAPPER ./t -T 6 -t c diff --git a/test/csuite/wt2246_col_append/main.c b/test/csuite/wt2246_col_append/main.c index 4b352b26051..976e2269da6 100644 --- a/test/csuite/wt2246_col_append/main.c +++ b/test/csuite/wt2246_col_append/main.c @@ -101,9 +101,10 @@ main(int argc, char *argv[]) uint64_t i, id; char buf[100]; + if (!testutil_enable_long_tests()) /* Ignore unless requested */ + return (EXIT_SUCCESS); + opts = &_opts; - if (testutil_disable_long_tests()) - return (0); memset(opts, 0, sizeof(*opts)); opts->table_type = TABLE_ROW; opts->n_append_threads = N_APPEND_THREADS; diff --git a/test/csuite/wt2323_join_visibility/main.c b/test/csuite/wt2323_join_visibility/main.c index 239a3f300d0..a61f707e008 100644 --- a/test/csuite/wt2323_join_visibility/main.c +++ b/test/csuite/wt2323_join_visibility/main.c @@ -92,10 +92,11 @@ main(int argc, char *argv[]) TEST_OPTS *opts, _opts; const char *tablename; + if (!testutil_enable_long_tests()) /* Ignore unless requested */ + return (EXIT_SUCCESS); + opts = &_opts; sharedopts = &_sharedopts; - if (testutil_disable_long_tests()) - return (0); memset(opts, 0, sizeof(*opts)); memset(sharedopts, 0, sizeof(*sharedopts)); diff --git a/test/csuite/wt2535_insert_race/main.c b/test/csuite/wt2535_insert_race/main.c index ae18760a829..ba17d485e07 100644 --- a/test/csuite/wt2535_insert_race/main.c +++ b/test/csuite/wt2535_insert_race/main.c @@ -49,9 +49,10 @@ main(int argc, char *argv[]) uint64_t current_value; int i; + if (!testutil_enable_long_tests()) /* Ignore unless requested */ + return (EXIT_SUCCESS); + opts = &_opts; - if (testutil_disable_long_tests()) - return (0); memset(opts, 0, sizeof(*opts)); opts->nthreads = 10; opts->nrecords = 1000; diff --git a/test/csuite/wt2834_join_bloom_fix/main.c b/test/csuite/wt2834_join_bloom_fix/main.c index 7c80496f1b6..f2c54b942be 100644 --- a/test/csuite/wt2834_join_bloom_fix/main.c +++ 
b/test/csuite/wt2834_join_bloom_fix/main.c @@ -59,11 +59,11 @@ main(int argc, char *argv[]) char flaguri[256]; char joinuri[256]; + if (!testutil_enable_long_tests()) /* Ignore unless requested */ + return (EXIT_SUCCESS); + opts = &_opts; - if (testutil_disable_long_tests()) - return (0); memset(opts, 0, sizeof(*opts)); - testutil_check(testutil_parse_opts(argc, argv, opts)); testutil_make_work_dir(opts->home); diff --git a/test/csuite/wt2853_perf/main.c b/test/csuite/wt2853_perf/main.c index 6cec9634cd1..b365b03493a 100644 --- a/test/csuite/wt2853_perf/main.c +++ b/test/csuite/wt2853_perf/main.c @@ -82,11 +82,11 @@ main(int argc, char *argv[]) int i, nfail; const char *tablename; + if (!testutil_enable_long_tests()) /* Ignore unless requested */ + return (EXIT_SUCCESS); + opts = &_opts; sharedopts = &_sharedopts; - - if (testutil_disable_long_tests()) - return (0); memset(opts, 0, sizeof(*opts)); memset(sharedopts, 0, sizeof(*sharedopts)); memset(insert_args, 0, sizeof(insert_args)); diff --git a/test/csuite/wt2909_checkpoint_integrity/main.c b/test/csuite/wt2909_checkpoint_integrity/main.c index ddf249fb406..0ae81543050 100644 --- a/test/csuite/wt2909_checkpoint_integrity/main.c +++ b/test/csuite/wt2909_checkpoint_integrity/main.c @@ -96,9 +96,8 @@ static void run_check_subtest(TEST_OPTS *, const char *, uint64_t, bool, uint64_t *); static void run_check_subtest_range(TEST_OPTS *, const char *, bool); static int run_process(TEST_OPTS *, const char *, char *[], int *); -static int subtest_main(int, char *[], bool); +static void subtest_main(int, char *[], bool); static void subtest_populate(TEST_OPTS *, bool); -int main(int, char *[]); extern int __wt_optind; @@ -446,7 +445,7 @@ run_process(TEST_OPTS *opts, const char *prog, char *argv[], int *status) * subtest_main -- * The main program for the subtest */ -static int +static void subtest_main(int argc, char *argv[], bool close_test) { TEST_OPTS *opts, _opts; @@ -454,8 +453,6 @@ subtest_main(int argc, char 
*argv[], bool close_test) char config[1024], filename[1024]; struct rlimit rlim; - if (testutil_disable_long_tests()) - return (0); opts = &_opts; memset(opts, 0, sizeof(*opts)); memset(&rlim, 0, sizeof(rlim)); @@ -499,8 +496,6 @@ subtest_main(int argc, char *argv[], bool close_test) subtest_populate(opts, close_test); testutil_cleanup(opts); - - return (0); } /* @@ -622,8 +617,9 @@ main(int argc, char *argv[]) uint64_t nresults; const char *debugger; - if (testutil_disable_long_tests()) - return (0); + if (!testutil_enable_long_tests()) /* Ignore unless requested */ + return (EXIT_SUCCESS); + opts = &_opts; memset(opts, 0, sizeof(*opts)); debugger = NULL; @@ -635,11 +631,13 @@ main(int argc, char *argv[]) opts->nrecords = 50000; while (argc > 0) { - if (strcmp(argv[0], "subtest") == 0) - return (subtest_main(argc, argv, false)); - else if (strcmp(argv[0], "subtest_close") == 0) - return (subtest_main(argc, argv, true)); - else if (strcmp(argv[0], "gdb") == 0) + if (strcmp(argv[0], "subtest") == 0) { + subtest_main(argc, argv, false); + return (0); + } else if (strcmp(argv[0], "subtest_close") == 0) { + subtest_main(argc, argv, true); + return (0); + } else if (strcmp(argv[0], "gdb") == 0) debugger = "/usr/bin/gdb"; else testutil_assert(false); diff --git a/test/utility/misc.c b/test/utility/misc.c index 8aee9d16f66..61dad3d76c2 100644 --- a/test/utility/misc.c +++ b/test/utility/misc.c @@ -166,20 +166,25 @@ testutil_cleanup(TEST_OPTS *opts) } /* - * testutil_disable_long_tests -- - * Return if TESTUTIL_DISABLE_LONG_TESTS is set. + * testutil_enable_long_tests -- + * Return if TESTUTIL_ENABLE_LONG_TESTS is set. */ bool -testutil_disable_long_tests(void) +testutil_enable_long_tests(void) { const char *res; + bool enable_long_tests; if (__wt_getenv(NULL, - "TESTUTIL_DISABLE_LONG_TESTS", &res) == WT_NOTFOUND) + "TESTUTIL_ENABLE_LONG_TESTS", &res) == WT_NOTFOUND) return (false); + /* Accept anything other than "TESTUTIL_ENABLE_LONG_TESTS=0". 
*/ + enable_long_tests = res[0] != '0'; + free((void *)res); - return (true); + + return (enable_long_tests); } /* diff --git a/test/utility/test_util.h b/test/utility/test_util.h index 34829d06f6b..406ed2c4961 100644 --- a/test/utility/test_util.h +++ b/test/utility/test_util.h @@ -185,7 +185,7 @@ void *dstrdup(const void *); void *dstrndup(const char *, size_t); void testutil_clean_work_dir(const char *); void testutil_cleanup(TEST_OPTS *); -bool testutil_disable_long_tests(void); +bool testutil_enable_long_tests(void); void testutil_make_work_dir(char *); int testutil_parse_opts(int, char * const *, TEST_OPTS *); void testutil_work_dir_from_path(char *, size_t, const char *); -- cgit v1.2.1 From 09e26f73985f3ba023602de7dad9ad036700cf25 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Tue, 28 Feb 2017 15:27:28 +1100 Subject: WT-3182 Update Evergreen configuration to include long make check (#3314) --- test/mciproject.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/mciproject.yml b/test/mciproject.yml index eb74914eb46..6456475aa00 100644 --- a/test/mciproject.yml +++ b/test/mciproject.yml @@ -65,7 +65,7 @@ tasks: ./build_posix/reconf ${configure_env_vars|} ./configure --enable-diagnostic --enable-python --enable-zlib --enable-strict --enable-verbose ${make_command|make} ${smp_command|} 2>&1 - ${make_command|make} VERBOSE=1 check 2>&1 + TESTUTIL_ENABLE_LONG_TESTS=1 ${make_command|make} VERBOSE=1 check 2>&1 fi - command: archive.targz_pack params: -- cgit v1.2.1 From f3747a2625a531e0405fd8f5f256184ca5479197 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 28 Feb 2017 09:09:04 -0500 Subject: WT-3193 Close a race between verify and the eviction server (#3311) * Don't jump out of the loop if we find an invalid root page, return WT_NOTFOUND instead. Jumping out of the loop leads to potentially setting a returned WT_REF and returning 0, which aren't right for __wt_btcur_next_random(). 
For clarity, this change is not part of the bug fix: we should never be in this code unless we're in eviction, but I don't want to return success, either. We weren't always overwriting the returned WT_REF; do so with a NULL so no caller depends on it not being overwritten in some path. Fix a comment. * WT-3189 Fix a segfault in the eviction server random positioning Back out a26e81d: there's an underlying bug here that verify swaps different checkpoint root pages in-and-out of the WT_BTREE.root structure, without locking out eviction, so checking for a NULL WT_BTREE.root.page pointer isn't sufficient. * Add a comment to explain why we're checking for a NULL root page: there are paths that get here without ever reading in a page from the backing file (for example, when discarding lock-only handles). * Verify has to lock out eviction when swapping checkpoint root pages to/from the WT_BTREE.root structure. Fix a bug in error handling where we could leave a checkpoint loaded in the underlying block manager. * If we have to unload the checkpoint, we also may have to empty the cache. * Rather than turning off eviction in the "special" commands (rebalance, salvage, upgrade and verify), after setting the handle open flag and potentially letting eviction access the tree, turn off eviction before returning into the handle code and setting the handle-open flag. Change the WT_BTREE.close code to clear everything in the WT_BTREE structure the btree layer owns. On close, leave just cache and eviction information, and the LSM-primary flag. On open, leave just the operation flags set by the handle-manager. * Remove eviction lockout code from __wt_evict_file(), assert callers have already done so. Change __wt_session_lock_checkpoint() to lock out eviction before calling __wt_evict_file(). * Revert unnecessary/accidental change. 
* The only place we should see empty trees is when evicting handles that never loaded any backing pages, for whatever reason, check for that in __wt_evict_file(). Change the eviction code and the tree walk code to assert they never see handles without a valid root page. * Replace explicit WT_PTRDIFF() calls on structure members with offsetof(). * Rework a comment to clarify the reasons for special WT_BTREE initialization. --- src/btree/bt_cursor.c | 4 +- src/btree/bt_handle.c | 102 +++++++++++++++++++++++++----------------- src/btree/bt_random.c | 12 +---- src/btree/bt_rebalance.c | 15 +------ src/btree/bt_slvg.c | 12 ----- src/btree/bt_vrfy.c | 36 ++++++++++++--- src/btree/bt_walk.c | 11 +++-- src/evict/evict_file.c | 20 ++++++--- src/evict/evict_lru.c | 16 +++++-- src/include/btree.h | 37 ++++++++------- src/include/extern.h | 1 - src/include/session.h | 24 +++++----- src/session/session_api.c | 2 +- src/session/session_dhandle.c | 11 ++--- 14 files changed, 171 insertions(+), 132 deletions(-) diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 5fde2237538..6a48c5f752b 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -512,7 +512,7 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) */ if (btree->bulk_load_ok) { btree->bulk_load_ok = false; - __wt_btree_evictable(session, true); + __wt_evict_file_exclusive_off(session); } retry: WT_RET(__cursor_func_init(cbt, true)); @@ -766,7 +766,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) */ if (btree->bulk_load_ok) { btree->bulk_load_ok = false; - __wt_btree_evictable(session, true); + __wt_evict_file_exclusive_off(session); } retry: WT_RET(__cursor_func_init(cbt, true)); diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 6ed70788759..d714dab6000 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -14,6 +14,40 @@ static int __btree_page_sizes(WT_SESSION_IMPL *); static int __btree_preload(WT_SESSION_IMPL *); static int __btree_tree_open_empty(WT_SESSION_IMPL *, bool); +/* + * 
__btree_initialize -- + * Initialize the WT_BTREE structure. + */ +static void +__btree_initialize(WT_BTREE *btree, bool closing) +{ + uint32_t mask; + + /* + * This function exists as a place to discuss how the WT_BTREE structure + * is initialized (or re-initialized, when the object is re-opened). The + * upper-level handle code sets/clears flags in the WT_BTREE structure, + * plus the eviction/cache code reads/writes cache information. The + * latter happens in-between a forced drop and sweep discarding the + * tree (where the tree is still "open" and has pages being evicted from + * the cache), but it's no longer part of the namespace. For all those + * reasons, parts of the WT_BTREE object must persist after it's closed. + */ + if (closing) { + /* + * Closing: clear everything except cache/eviction information + * and one LSM flag. + */ + memset(btree, 0, WT_BTREE_CLEAR_SIZE); + F_CLR(btree, ~(WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION)); + } else { + /* Opening: clear everything except the special flags. */ + mask = F_MASK(btree, WT_BTREE_SPECIAL_FLAGS); + memset(btree, 0, sizeof(*btree)); + btree->flags = mask; + } +} + /* * __wt_btree_open -- * Open a Btree. @@ -33,7 +67,10 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) bool creation, forced_salvage, readonly; dhandle = session->dhandle; + btree = S2BT(session); + __btree_initialize(btree, false); + btree->dhandle = session->dhandle; /* Checkpoint files are readonly. */ readonly = dhandle->checkpoint != NULL || @@ -126,6 +163,20 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) } } + /* + * Eviction ignores trees until the handle's open flag is set, configure + * eviction before that happens. + * + * Files that can still be bulk-loaded cannot be evicted. + * Permanently cache-resident files can never be evicted. + * Special operations don't enable eviction. (The underlying commands + * may turn on eviction, but it's their decision.) 
+ */ + if (btree->bulk_load_ok || + F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE | + WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) + WT_ERR(__wt_evict_file_exclusive_on(session)); + if (0) { err: WT_TRET(__wt_btree_close(session)); } @@ -155,13 +206,15 @@ __wt_btree_close(WT_SESSION_IMPL *session) /* Close the underlying block manager reference. */ WT_TRET(bm->close(bm, session)); - - btree->bm = NULL; } /* Close the Huffman tree. */ __wt_btree_huffman_close(session); + if (btree->collator_owned && btree->collator->terminate != NULL) + WT_TRET(btree->collator->terminate( + btree->collator, &session->iface)); + /* Destroy locks. */ __wt_rwlock_destroy(session, &btree->ovfl_lock); __wt_spin_destroy(session, &btree->flush_lock); @@ -170,18 +223,7 @@ __wt_btree_close(WT_SESSION_IMPL *session) __wt_free(session, btree->key_format); __wt_free(session, btree->value_format); - if (btree->collator_owned) { - if (btree->collator->terminate != NULL) - WT_TRET(btree->collator->terminate( - btree->collator, &session->iface)); - btree->collator_owned = 0; - } - btree->collator = NULL; - btree->kencryptor = NULL; - - btree->bulk_load_ok = false; - - F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); + __btree_initialize(btree, true); return (ret); } @@ -267,9 +309,9 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval)); if (cval.val) - F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + F_SET(btree, WT_BTREE_IN_MEMORY); else - F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + F_CLR(btree, WT_BTREE_IN_MEMORY); WT_RET(__wt_config_gets(session, cfg, "ignore_in_memory_cache_size", &cval)); @@ -482,13 +524,10 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation) /* * Newly created objects can be used for cursor inserts or for bulk * loads; set a flag that's cleared when a row is inserted into the - * tree. 
Objects being bulk-loaded cannot be evicted, we set it - * globally, there's no point in searching empty trees for eviction. + * tree. */ - if (creation) { + if (creation) btree->bulk_load_ok = true; - __wt_btree_evictable(session, false); - } /* * A note about empty trees: the initial tree is a single root page. @@ -580,27 +619,6 @@ __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) return (0); } -/* - * __wt_btree_evictable -- - * Setup or release a cache-resident tree. - */ -void -__wt_btree_evictable(WT_SESSION_IMPL *session, bool on) -{ - WT_BTREE *btree; - - btree = S2BT(session); - - /* Permanently cache-resident files can never be evicted. */ - if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) - return; - - if (on) - F_CLR(btree, WT_BTREE_NO_EVICTION); - else - F_SET(btree, WT_BTREE_NO_EVICTION); -} - /* * __btree_preload -- * Pre-load internal pages. diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c index 4c7ff861d26..25ede0a09ac 100644 --- a/src/btree/bt_random.c +++ b/src/btree/bt_random.c @@ -178,6 +178,8 @@ __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) WT_REF *current, *descent; uint32_t flags, i, entries, retry; + *refp = NULL; + btree = S2BT(session); current = NULL; retry = 100; @@ -201,16 +203,6 @@ restart: /* current = &btree->root; for (;;) { page = current->page; - /* - * When walking a tree for eviction, an exclusive operation may - * be in progress leaving the root page is not valid. Just give - * up in that case. 
- */ - if (page == NULL) { - WT_ASSERT(session, eviction); - break; - } - if (!WT_PAGE_IS_INTERNAL(page)) break; diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c index 24b4f7bb33d..68848c7c8f5 100644 --- a/src/btree/bt_rebalance.c +++ b/src/btree/bt_rebalance.c @@ -406,12 +406,10 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_BTREE *btree; WT_DECL_RET; WT_REBALANCE_STUFF *rs, _rstuff; - bool evict_reset; WT_UNUSED(cfg); btree = S2BT(session); - evict_reset = false; /* * If the tree has never been written to disk, we're done, rebalance @@ -433,14 +431,6 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) /* Set the internal page tree type. */ rs->type = btree->root.page->type; - /* - * Get exclusive access to the file. (Not required, the only page in the - * cache is the root page, and that cannot be evicted; however, this way - * eviction ignores the tree entirely.) - */ - WT_ERR(__wt_evict_file_exclusive_on(session)); - evict_reset = true; - /* Recursively walk the tree. */ switch (rs->type) { case WT_PAGE_ROW_INT: @@ -471,10 +461,7 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) btree->root.page = rs->root; rs->root = NULL; -err: if (evict_reset) - __wt_evict_file_exclusive_off(session); - - /* Discard any leftover root page we created. */ +err: /* Discard any leftover root page we created. 
*/ if (rs->root != NULL) { __wt_page_modify_clear(session, rs->root); __wt_page_out(session, &rs->root); diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index fea979cac6e..165f932afb2 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -166,13 +166,11 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_DECL_RET; WT_STUFF *ss, stuff; uint32_t i, leaf_cnt; - bool evict_reset; WT_UNUSED(cfg); btree = S2BT(session); bm = btree->bm; - evict_reset = false; WT_CLEAR(stuff); ss = &stuff; @@ -183,13 +181,6 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp1)); WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2)); - /* - * Salvage handles its own page eviction; get exclusive access to the - * file, have eviction ignore the tree entirely. - */ - WT_ERR(__wt_evict_file_exclusive_on(session)); - evict_reset = true; - /* * Step 1: * Inform the underlying block manager that we're salvaging the file. @@ -350,9 +341,6 @@ err: WT_TRET(bm->salvage_end(bm, session)); if (ss->root_ref.page != NULL) __wt_ref_out(session, &ss->root_ref); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - /* Discard the leaf and overflow page memory. */ WT_TRET(__slvg_cleanup(session, ss)); diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index 05990918215..3c90e580696 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -216,13 +216,11 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) ckpt->raw.data, ckpt->raw.size, root_addr, &root_addr_size, true)); - /* - * Ignore trees with no root page. - * Verify, then discard the checkpoint from the cache. - */ - if (root_addr_size != 0 && - (ret = __wt_btree_tree_open( - session, root_addr, root_addr_size)) == 0) { + /* Skip trees with no root page. 
*/ + if (root_addr_size != 0) { + WT_ERR(__wt_btree_tree_open( + session, root_addr, root_addr_size)); + if (WT_VRFY_DUMP(vs)) WT_ERR(__wt_msg(session, "Root: %s %s", __wt_addr_string(session, @@ -230,14 +228,38 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) __wt_page_type_string( btree->root.page->type))); + __wt_evict_file_exclusive_off(session); + + /* Verify the tree. */ WT_WITH_PAGE_INDEX(session, ret = __verify_tree(session, &btree->root, vs)); + /* + * We have an exclusive lock on the handle, but we're + * swapping root pages in-and-out of that handle, and + * there's a race with eviction entering the tree and + * seeing an invalid root page. Eviction must work on + * trees being verified (else we'd have to do our own + * eviction), lock eviction out whenever we're loading + * a new root page. This loops works because we are + * called with eviction locked out, so we release the + * lock at the top of the loop and re-acquire it here. + */ + WT_TRET(__wt_evict_file_exclusive_on(session)); WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD)); } /* Unload the checkpoint. */ WT_TRET(bm->checkpoint_unload(bm, session)); + + /* + * We've finished one checkpoint's verification (verification, + * then cache eviction and checkpoint unload): if any errors + * occurred, quit. Done this way because otherwise we'd need + * at least two more state variables on error, one to know if + * we need to discard the tree from the cache and one to know + * if we need to unload the checkpoint. + */ WT_ERR(ret); /* Display the tree shape. */ diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index ddaa2e5f70b..86484feb7c9 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -348,16 +348,19 @@ __tree_walk_internal(WT_SESSION_IMPL *session, /* If no page is active, begin a walk from the start/end of the tree. 
*/ if (ref == NULL) { restart: /* - * We can reach here with a NULL or root reference; the release + * We can be here with a NULL or root WT_REF; the page release * function handles them internally, don't complicate this code * by calling them out. */ WT_ERR(__wt_page_release(session, couple, flags)); - couple = couple_orig = ref = &btree->root; - if (ref->page == NULL) - goto done; + /* + * We're not supposed to walk trees without root pages. As this + * has not always been the case, assert to debug that change. + */ + WT_ASSERT(session, btree->root.page != NULL); + couple = couple_orig = ref = &btree->root; initial_descent = true; goto descend; } diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 17b038fb003..3bc8fe36e5e 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -15,15 +15,27 @@ int __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { + WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_REF *next_ref, *ref; + btree = S2BT(session); + /* - * We need exclusive access to the file -- disable ordinary eviction - * and drain any blocks already queued. + * We need exclusive access to the file, we're about to discard the root + * page. Assert eviction has been locked out. */ - WT_RET(__wt_evict_file_exclusive_on(session)); + WT_ASSERT(session, + F_ISSET(btree, WT_BTREE_NO_EVICTION) || + !F_ISSET(session->dhandle, WT_DHANDLE_OPEN)); + + /* + * We do discard objects without pages in memory. If that's the case, + * we're done. + */ + if (btree->root.page == NULL) + return (0); /* Make sure the oldest transaction ID is up-to-date. */ WT_RET(__wt_txn_update_oldest( @@ -102,7 +114,5 @@ err: /* On error, clear any left-over tree walk. 
*/ session, next_ref, WT_READ_NO_EVICT)); } - __wt_evict_file_exclusive_off(session); - return (ret); } diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index d16594b0816..e59a6c2f2d9 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1431,10 +1431,20 @@ retry: while (slot < max_entries) { if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) && !__wt_spin_trylock(session, &cache->evict_walk_lock)) { if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + /* + * Assert the handle has a root page: eviction + * should have been locked out if the tree is + * being discarded or the root page is changing. + * As this has not always been the case, assert + * to debug that change. + */ + WT_ASSERT(session, btree->root.page != NULL); + cache->evict_file_next = dhandle; - WT_WITH_DHANDLE(session, dhandle, ret = - __evict_walk_file(session, queue, - max_entries, &slot)); + WT_WITH_DHANDLE(session, dhandle, + ret = __evict_walk_file( + session, queue, max_entries, &slot)); + WT_ASSERT(session, session->split_gen == 0); } __wt_spin_unlock(session, &cache->evict_walk_lock); diff --git a/src/include/btree.h b/src/include/btree.h index 2aa0e470f59..39971cd2987 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -118,10 +118,6 @@ struct __wt_btree { uint64_t last_recno; /* Column-store last record number */ - WT_REF root; /* Root page reference */ - bool modified; /* If the tree ever modified */ - bool bulk_load_ok; /* Bulk-load is a possibility */ - WT_BM *bm; /* Block manager reference */ u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */ @@ -130,6 +126,28 @@ struct __wt_btree { uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ uint64_t write_gen; /* Write generation */ + enum { + WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING + } checkpointing; /* Checkpoint in progress */ + + /* + * We flush pages from the tree (in order to make checkpoint faster), + * without a high-level lock. To avoid multiple threads flushing at + * the same time, lock the tree. 
+ */ + WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ + + bool modified; /* If the tree ever modified */ + bool bulk_load_ok; /* Bulk-load is a possibility */ + + /* + * The tree's cache and eviction information persist after the handle + * is closed (clean cache pages may remain after the tree is closed). + * Be careful clearing the WT_BTREE structure. + */ +#define WT_BTREE_CLEAR_SIZE (offsetof(WT_BTREE, root)) + WT_REF root; /* Root page reference */ + uint64_t bytes_inmem; /* Cache bytes in memory. */ uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */ @@ -144,17 +162,6 @@ struct __wt_btree { int evict_start_type; /* Start position for eviction walk (see WT_EVICT_WALK_START). */ - enum { - WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING - } checkpointing; /* Checkpoint in progress */ - - /* - * We flush pages from the tree (in order to make checkpoint faster), - * without a high-level lock. To avoid multiple threads flushing at - * the same time, lock the tree. 
- */ - WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ - /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ #define WT_BTREE_BULK 0x000100 /* Bulk-load handle */ #define WT_BTREE_IGNORE_CACHE 0x000200 /* Cache-resident object */ diff --git a/src/include/extern.h b/src/include/extern.h index 19ad9a880df..8e86eedf051 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -136,7 +136,6 @@ extern int __wt_btree_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_btree_evictable(WT_SESSION_IMPL *session, bool on) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/session.h b/src/include/session.h index 085f871a34f..f3092dc3c6c 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -151,20 +151,16 @@ struct __wt_session_impl { uint32_t flags; /* - * The split stash memory and hazard information persist past session - * close because they are accessed by threads of control other than the - * thread owning the 
session. - * + * All of the following fields live at the end of the structure so it's + * easier to clear everything but the fields that persist. + */ +#define WT_SESSION_CLEAR_SIZE (offsetof(WT_SESSION_IMPL, rnd)) + + /* * The random number state persists past session close because we don't - * want to repeatedly allocate repeated values for skiplist depth if the + * want to repeatedly use the same values for skiplist depth when the * application isn't caching sessions. - * - * All of these fields live at the end of the structure so it's easier - * to clear everything but the fields that persist. */ -#define WT_SESSION_CLEAR_SIZE(s) \ - (WT_PTRDIFF(&(s)->rnd, s)) - WT_RAND_STATE rnd; /* Random number generation state */ /* Hashed handle reference list array */ @@ -173,6 +169,9 @@ struct __wt_session_impl { TAILQ_HEAD(__tables_hash, __wt_table) *tablehash; /* + * Split stash memory persists past session close because it's accessed + * by threads of control other than the thread owning the session. + * * Splits can "free" memory that may still be in use, and we use a * split generation number to track it, that is, the session stores a * reference to the memory and allocates a split generation; when no @@ -192,6 +191,9 @@ struct __wt_session_impl { /* * Hazard pointers. * + * Hazard information persists past session close because it's accessed + * by threads of control other than the thread owning the session. + * * Use the non-NULL state of the hazard field to know if the session has * previously been initialized. */ diff --git a/src/session/session_api.c b/src/session/session_api.c index d282c5d0c32..3d13287fbe6 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -128,7 +128,7 @@ __session_clear(WT_SESSION_IMPL *session) * * For these reasons, be careful when clearing the session structure. 
*/ - memset(session, 0, WT_SESSION_CLEAR_SIZE(session)); + memset(session, 0, WT_SESSION_CLEAR_SIZE); WT_INIT_LSN(&session->bg_sync_lsn); diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index ee9bddbfc19..469da21a448 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -574,12 +574,13 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) checkpoint, NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY)); /* - * Flush any pages in this checkpoint from the cache (we are about to - * re-write the checkpoint which will mean cached pages no longer have - * valid contents). This is especially noticeable with memory mapped - * files, since changes to the underlying file are visible to the in - * memory pages. + * Get exclusive access to the handle and then flush any pages in this + * checkpoint from the cache (we are about to re-write the checkpoint + * which will mean cached pages no longer have valid contents). This + * is especially noticeable with memory mapped files, since changes to + * the underlying file are visible to the in-memory pages. */ + WT_ERR(__wt_evict_file_exclusive_on(session)); WT_ERR(__wt_cache_op(session, WT_SYNC_DISCARD)); /* -- cgit v1.2.1 From 930369ce8dd3fcf43a77dc6c911b0038748228d3 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 28 Feb 2017 17:26:14 -0500 Subject: WT-3193 Close a race between verify opening a handle and eviction visiting it (#3315) * Eviction uses the WT_BTREE.dhandle reference because a WT_BTREE is what's stored in the WT_EVICT_ENTRY structure. 
--- src/btree/bt_handle.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index d714dab6000..3b64581fe1e 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -21,6 +21,7 @@ static int __btree_tree_open_empty(WT_SESSION_IMPL *, bool); static void __btree_initialize(WT_BTREE *btree, bool closing) { + WT_DATA_HANDLE *dhandle; uint32_t mask; /* @@ -35,11 +36,16 @@ __btree_initialize(WT_BTREE *btree, bool closing) */ if (closing) { /* - * Closing: clear everything except cache/eviction information - * and one LSM flag. + * Closing: clear everything except cache/eviction information. + * (The LSM flag is used during cache eviction as an accounting + * modifier, eviction also uses the WT_DATA_HANDLE reference.) */ + dhandle = btree->dhandle; + memset(btree, 0, WT_BTREE_CLEAR_SIZE); F_CLR(btree, ~(WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION)); + + btree->dhandle = dhandle; } else { /* Opening: clear everything except the special flags. */ mask = F_MASK(btree, WT_BTREE_SPECIAL_FLAGS); -- cgit v1.2.1 From af29e5e78eed5c80427179024543bd88c5f2549d Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 28 Feb 2017 22:55:30 -0500 Subject: WT-3193 Close a race between verify opening a handle and eviction visiting it (#3316) Leave the WT_BTREE structure mostly untouched until discarding the upper-level data handle, hopefully resolving the problems we've been having with accessing WT_BTREE fields after "closing" the handle. 
--- src/btree/bt_handle.c | 94 +++++++++++++++++++++++++------------------------ src/conn/conn_dhandle.c | 3 +- src/include/btree.h | 69 ++++++++++++++++-------------------- src/include/extern.h | 1 + 4 files changed, 82 insertions(+), 85 deletions(-) diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 3b64581fe1e..d30eee1e282 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -14,46 +14,6 @@ static int __btree_page_sizes(WT_SESSION_IMPL *); static int __btree_preload(WT_SESSION_IMPL *); static int __btree_tree_open_empty(WT_SESSION_IMPL *, bool); -/* - * __btree_initialize -- - * Initialize the WT_BTREE structure. - */ -static void -__btree_initialize(WT_BTREE *btree, bool closing) -{ - WT_DATA_HANDLE *dhandle; - uint32_t mask; - - /* - * This function exists as a place to discuss how the WT_BTREE structure - * is initialized (or re-initialized, when the object is re-opened). The - * upper-level handle code sets/clears flags in the WT_BTREE structure, - * plus the eviction/cache code reads/writes cache information. The - * latter happens in-between a forced drop and sweep discarding the - * tree (where the tree is still "open" and has pages being evicted from - * the cache), but it's no longer part of the namespace. For all those - * reasons, parts of the WT_BTREE object must persist after it's closed. - */ - if (closing) { - /* - * Closing: clear everything except cache/eviction information. - * (The LSM flag is used during cache eviction as an accounting - * modifier, eviction also uses the WT_DATA_HANDLE reference.) - */ - dhandle = btree->dhandle; - - memset(btree, 0, WT_BTREE_CLEAR_SIZE); - F_CLR(btree, ~(WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION)); - - btree->dhandle = dhandle; - } else { - /* Opening: clear everything except the special flags. */ - mask = F_MASK(btree, WT_BTREE_SPECIAL_FLAGS); - memset(btree, 0, sizeof(*btree)); - btree->flags = mask; - } -} - /* * __wt_btree_open -- * Open a Btree. 
@@ -68,15 +28,27 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_DATA_HANDLE *dhandle; WT_DECL_RET; size_t root_addr_size; + uint32_t mask; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; const char *filename; bool creation, forced_salvage, readonly; - dhandle = session->dhandle; - + /* + * This may be a re-open of an underlying object and we want to clear + * everything. We can't clear the operation flags, however, they're + * set by the connection handle software. + */ btree = S2BT(session); - __btree_initialize(btree, false); - btree->dhandle = session->dhandle; + mask = F_MASK(btree, WT_BTREE_SPECIAL_FLAGS); + memset(btree, 0, sizeof(*btree)); + btree->flags = mask; + + /* + * Set the data handle immediately, our called functions reasonably + * use it. + */ + dhandle = session->dhandle; + btree->dhandle = dhandle; /* Checkpoint files are readonly. */ readonly = dhandle->checkpoint != NULL || @@ -204,7 +176,24 @@ __wt_btree_close(WT_SESSION_IMPL *session) btree = S2BT(session); + /* + * The close process isn't the same as discarding the handle: we might + * re-open the handle, which isn't a big deal, but the backing blocks + * for the handle may not yet have been discarded from the cache, and + * eviction uses WT_BTREE structure elements. Free backing resources + * but leave the rest alone, and we'll discard the structure when we + * discard the data handle. + * + * Handles can be closed multiple times, ignore all but the first. + */ + if (F_ISSET(btree, WT_BTREE_CLOSED)) + return (0); + F_SET(btree, WT_BTREE_CLOSED); + + /* Discard any underlying block manager resources. */ if ((bm = btree->bm) != NULL) { + btree->bm = NULL; + /* Unload the checkpoint, unless it's a special command. */ if (!F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) @@ -217,6 +206,7 @@ __wt_btree_close(WT_SESSION_IMPL *session) /* Close the Huffman tree. */ __wt_btree_huffman_close(session); + /* Terminate any associated collator. 
*/ if (btree->collator_owned && btree->collator->terminate != NULL) WT_TRET(btree->collator->terminate( btree->collator, &session->iface)); @@ -229,11 +219,23 @@ __wt_btree_close(WT_SESSION_IMPL *session) __wt_free(session, btree->key_format); __wt_free(session, btree->value_format); - __btree_initialize(btree, true); - return (ret); } +/* + * __wt_btree_discard -- + * Discard a Btree. + */ +void +__wt_btree_discard(WT_SESSION_IMPL *session, void **handlep) +{ + WT_BTREE *btree; + + btree = *handlep; + *handlep = NULL; + __wt_overwrite_and_free(session, btree); +} + /* * __btree_conf -- * Configure a WT_BTREE structure. diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 866b8633f71..7b265c372db 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -18,7 +18,7 @@ __conn_dhandle_destroy(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) __wt_rwlock_destroy(session, &dhandle->rwlock); __wt_free(session, dhandle->name); __wt_free(session, dhandle->checkpoint); - __wt_free(session, dhandle->handle); + __wt_btree_discard(session, &dhandle->handle); __wt_spin_destroy(session, &dhandle->close_lock); __wt_stat_dsrc_discard(session, dhandle); __wt_overwrite_and_free(session, dhandle); @@ -192,6 +192,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) } WT_TRET(__wt_btree_close(session)); + F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); /* * If we marked a handle dead it will be closed by sweep, via diff --git a/src/include/btree.h b/src/include/btree.h index 39971cd2987..69ab2070eb9 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -118,6 +118,10 @@ struct __wt_btree { uint64_t last_recno; /* Column-store last record number */ + WT_REF root; /* Root page reference */ + bool modified; /* If the tree ever modified */ + bool bulk_load_ok; /* Bulk-load is a possibility */ + WT_BM *bm; /* Block manager reference */ u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */ @@ -126,6 +130,19 @@ struct __wt_btree { 
uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ uint64_t write_gen; /* Write generation */ + uint64_t bytes_inmem; /* Cache bytes in memory. */ + uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ + uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */ + + WT_REF *evict_ref; /* Eviction thread's location */ + uint64_t evict_priority; /* Relative priority of cached pages */ + u_int evict_walk_period; /* Skip this many LRU walks */ + u_int evict_walk_saved; /* Saved walk skips for checkpoints */ + u_int evict_walk_skips; /* Number of walks skipped */ + u_int evict_disabled; /* Eviction disabled count */ + volatile uint32_t evict_busy; /* Count of threads in eviction */ + int evict_start_type; /* Start position for eviction walk + (see WT_EVICT_WALK_START). */ enum { WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING } checkpointing; /* Checkpoint in progress */ @@ -137,46 +154,22 @@ struct __wt_btree { */ WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ - bool modified; /* If the tree ever modified */ - bool bulk_load_ok; /* Bulk-load is a possibility */ - - /* - * The tree's cache and eviction information persist after the handle - * is closed (clean cache pages may remain after the tree is closed). - * Be careful clearing the WT_BTREE structure. - */ -#define WT_BTREE_CLEAR_SIZE (offsetof(WT_BTREE, root)) - WT_REF root; /* Root page reference */ - - uint64_t bytes_inmem; /* Cache bytes in memory. */ - uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ - uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. 
*/ - - WT_REF *evict_ref; /* Eviction thread's location */ - uint64_t evict_priority; /* Relative priority of cached pages */ - u_int evict_walk_period; /* Skip this many LRU walks */ - u_int evict_walk_saved; /* Saved walk skips for checkpoints */ - u_int evict_walk_skips; /* Number of walks skipped */ - u_int evict_disabled; /* Eviction disabled count */ - volatile uint32_t evict_busy; /* Count of threads in eviction */ - int evict_start_type; /* Start position for eviction walk - (see WT_EVICT_WALK_START). */ - /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ #define WT_BTREE_BULK 0x000100 /* Bulk-load handle */ -#define WT_BTREE_IGNORE_CACHE 0x000200 /* Cache-resident object */ -#define WT_BTREE_IN_MEMORY 0x000400 /* Cache-resident object */ -#define WT_BTREE_LOOKASIDE 0x000800 /* Look-aside table */ -#define WT_BTREE_LSM_PRIMARY 0x001000 /* Handle is current LSM primary */ -#define WT_BTREE_NO_CHECKPOINT 0x002000 /* Disable checkpoints */ -#define WT_BTREE_NO_EVICTION 0x004000 /* Disable eviction */ -#define WT_BTREE_NO_LOGGING 0x008000 /* Disable logging */ -#define WT_BTREE_NO_RECONCILE 0x010000 /* Allow splits, even with no evict */ -#define WT_BTREE_REBALANCE 0x020000 /* Handle is for rebalance */ -#define WT_BTREE_SALVAGE 0x040000 /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x080000 /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x100000 /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x200000 /* Handle is for verify */ +#define WT_BTREE_CLOSED 0x000200 /* Handle closed */ +#define WT_BTREE_IGNORE_CACHE 0x000400 /* Cache-resident object */ +#define WT_BTREE_IN_MEMORY 0x000800 /* Cache-resident object */ +#define WT_BTREE_LOOKASIDE 0x001000 /* Look-aside table */ +#define WT_BTREE_LSM_PRIMARY 0x002000 /* Handle is current LSM primary */ +#define WT_BTREE_NO_CHECKPOINT 0x004000 /* Disable checkpoints */ +#define WT_BTREE_NO_EVICTION 0x008000 /* Disable eviction */ +#define WT_BTREE_NO_LOGGING 0x010000 /* Disable 
logging */ +#define WT_BTREE_NO_RECONCILE 0x020000 /* Allow splits, even with no evict */ +#define WT_BTREE_REBALANCE 0x040000 /* Handle is for rebalance */ +#define WT_BTREE_SALVAGE 0x080000 /* Handle is for salvage */ +#define WT_BTREE_SKIP_CKPT 0x100000 /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x200000 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x400000 /* Handle is for verify */ uint32_t flags; }; diff --git a/src/include/extern.h b/src/include/extern.h index 8e86eedf051..07cc3ce9921 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -133,6 +133,7 @@ extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_btree_discard(WT_SESSION_IMPL *session, void **handlep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -- cgit v1.2.1 From f70d3773671a5a9319900b4aef57bdc1a67afdc0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 2 Mar 2017 00:36:42 -0500 Subject: WT-3199 bug: eviction assertion failure (#3317) * Don't set 
WT_BTREE_NO_EVICTION in LSM without going through the locking dance. * Change so nothing in the WT_BTREE structure is cleaned up or discarded until that structure is being discarded or re-opened. This doesn't fix any bugs, but I think it's less fragile going forward. --- src/btree/bt_handle.c | 85 ++++++++++++++++++++++++++++++++----------------- src/conn/conn_dhandle.c | 12 ++++--- src/include/btree.i | 14 +++++--- src/include/extern.h | 2 +- src/lsm/lsm_cursor.c | 3 +- src/lsm/lsm_work_unit.c | 5 +-- 6 files changed, 79 insertions(+), 42 deletions(-) diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index d30eee1e282..e8eb37bfb8e 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -14,6 +14,44 @@ static int __btree_page_sizes(WT_SESSION_IMPL *); static int __btree_preload(WT_SESSION_IMPL *); static int __btree_tree_open_empty(WT_SESSION_IMPL *, bool); +/* + * __btree_clear -- + * Clear a Btree, either on handle discard or re-open. + */ +static int +__btree_clear(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_DECL_RET; + + btree = S2BT(session); + + /* + * If the tree hasn't gone through an open/close cycle, there's no + * cleanup to be done. + */ + if (!F_ISSET(btree, WT_BTREE_CLOSED)) + return (0); + + /* Close the Huffman tree. */ + __wt_btree_huffman_close(session); + + /* Terminate any associated collator. */ + if (btree->collator_owned && btree->collator->terminate != NULL) + WT_TRET(btree->collator->terminate( + btree->collator, &session->iface)); + + /* Destroy locks. */ + __wt_rwlock_destroy(session, &btree->ovfl_lock); + __wt_spin_destroy(session, &btree->flush_lock); + + /* Free allocated memory. */ + __wt_free(session, btree->key_format); + __wt_free(session, btree->value_format); + + return (ret); +} + /* * __wt_btree_open -- * Open a Btree. 
@@ -33,21 +71,21 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) const char *filename; bool creation, forced_salvage, readonly; + btree = S2BT(session); + dhandle = session->dhandle; + /* - * This may be a re-open of an underlying object and we want to clear - * everything. We can't clear the operation flags, however, they're - * set by the connection handle software. + * This may be a re-open of an underlying object and we have to clean + * up. We can't clear the operation flags, however, they're set by the + * connection handle software that called us. */ - btree = S2BT(session); + WT_RET(__btree_clear(session)); + mask = F_MASK(btree, WT_BTREE_SPECIAL_FLAGS); memset(btree, 0, sizeof(*btree)); btree->flags = mask; - /* - * Set the data handle immediately, our called functions reasonably - * use it. - */ - dhandle = session->dhandle; + /* Set the data handle first, our called functions reasonably use it. */ btree->dhandle = dhandle; /* Checkpoint files are readonly. */ @@ -203,22 +241,6 @@ __wt_btree_close(WT_SESSION_IMPL *session) WT_TRET(bm->close(bm, session)); } - /* Close the Huffman tree. */ - __wt_btree_huffman_close(session); - - /* Terminate any associated collator. */ - if (btree->collator_owned && btree->collator->terminate != NULL) - WT_TRET(btree->collator->terminate( - btree->collator, &session->iface)); - - /* Destroy locks. */ - __wt_rwlock_destroy(session, &btree->ovfl_lock); - __wt_spin_destroy(session, &btree->flush_lock); - - /* Free allocated memory. */ - __wt_free(session, btree->key_format); - __wt_free(session, btree->value_format); - return (ret); } @@ -226,14 +248,19 @@ __wt_btree_close(WT_SESSION_IMPL *session) * __wt_btree_discard -- * Discard a Btree. 
*/ -void -__wt_btree_discard(WT_SESSION_IMPL *session, void **handlep) +int +__wt_btree_discard(WT_SESSION_IMPL *session) { WT_BTREE *btree; + WT_DECL_RET; + + ret = __btree_clear(session); - btree = *handlep; - *handlep = NULL; + btree = S2BT(session); __wt_overwrite_and_free(session, btree); + session->dhandle->handle = NULL; + + return (ret); } /* diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 7b265c372db..6958b79f10f 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -12,16 +12,20 @@ * __conn_dhandle_destroy -- * Destroy a data handle. */ -static void +static int __conn_dhandle_destroy(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) { + WT_DECL_RET; + + WT_WITH_DHANDLE(session, dhandle, ret = __wt_btree_discard(session)); + __wt_rwlock_destroy(session, &dhandle->rwlock); __wt_free(session, dhandle->name); __wt_free(session, dhandle->checkpoint); - __wt_btree_discard(session, &dhandle->handle); __wt_spin_destroy(session, &dhandle->close_lock); __wt_stat_dsrc_discard(session, dhandle); __wt_overwrite_and_free(session, dhandle); + return (ret); } /* @@ -76,7 +80,7 @@ __wt_conn_dhandle_alloc( session->dhandle = dhandle; return (0); -err: __conn_dhandle_destroy(session, dhandle); +err: WT_TRET(__conn_dhandle_destroy(session, dhandle)); return (ret); } @@ -604,7 +608,7 @@ __wt_conn_dhandle_discard_single( */ if (ret == 0 || final) { __conn_btree_config_clear(session); - __conn_dhandle_destroy(session, dhandle); + WT_TRET(__conn_dhandle_destroy(session, dhandle)); session->dhandle = NULL; } diff --git a/src/include/btree.i b/src/include/btree.i index 315efa86fa6..6dda2428122 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1549,7 +1549,7 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) * __wt_btree_lsm_switch_primary -- * Switch a btree handle to/from the current primary chunk of an LSM tree. 
*/ -static inline void +static inline int __wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on) { WT_BTREE *btree; @@ -1563,13 +1563,15 @@ __wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on) cache = S2C(session)->cache; root = btree->root.page; - if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) - F_SET(btree, WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION); + if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { + F_SET(btree, WT_BTREE_LSM_PRIMARY); + WT_RET(__wt_evict_file_exclusive_on(session)); + } if (!on && F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { pindex = WT_INTL_INDEX_GET_SAFE(root); if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) || pindex->entries != 1) - return; + return (0); first = pindex->index[0]; /* @@ -1590,8 +1592,10 @@ __wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on) (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } - F_CLR(btree, WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION); + F_CLR(btree, WT_BTREE_LSM_PRIMARY); + __wt_evict_file_exclusive_off(session); } + return (0); } /* diff --git a/src/include/extern.h b/src/include/extern.h index 07cc3ce9921..d0c9655fafb 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -133,7 +133,7 @@ extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_btree_discard(WT_SESSION_IMPL *session, void **handlep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_btree_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 60afbc99ade..116740f8f0c 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -699,7 +699,8 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { if (btree->bulk_load_ok) { btree->bulk_load_ok = false; WT_WITH_BTREE(session, btree, - __wt_btree_lsm_switch_primary(session, true)); + ret = __wt_btree_lsm_switch_primary(session, true)); + WT_ERR(ret); } } diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index 4349acf7b55..c9c350c5ac9 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -383,8 +383,9 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, * forced eviction. */ WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); - __wt_btree_lsm_switch_primary(session, false); - WT_ERR(__wt_session_release_btree(session)); + WT_TRET(__wt_btree_lsm_switch_primary(session, false)); + WT_TRET(__wt_session_release_btree(session)); + WT_ERR(ret); /* Make sure we aren't pinning a transaction ID. 
*/ __wt_txn_release_snapshot(session); -- cgit v1.2.1 From 62f0543765deaf2f11b3c2e78d82940e500f004b Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 2 Mar 2017 21:17:25 -0500 Subject: WT-3203 bulk-load state changes can race (#3318) * The bulk-load state change (where multiple threads of control turn off the possibility of bulk-load in a tree) has always been able to race, but it's potentially dangerous now that turning off bulk-load involves calling `__wt_evict_file_exclusive_off()`. In the current tree, there's a diagnostic-only test that might fail because of this race. * The WT_BTREE_NO_EVICTION flag is no longer set other than through the __wt_evict_file_exclusive_on/off functions; remove that flag and use the WT_BTREE.evict_disabled counter by itself. --- src/btree/bt_cursor.c | 40 ++++++++++++++++++++++++---------------- src/btree/bt_handle.c | 2 +- src/btree/bt_read.c | 2 +- src/evict/evict_file.c | 2 +- src/evict/evict_lru.c | 44 +++++++++++++------------------------------- src/include/btree.h | 20 ++++++++++---------- src/include/btree.i | 7 +++---- 7 files changed, 53 insertions(+), 64 deletions(-) diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 6a48c5f752b..4634059589b 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -54,6 +54,26 @@ __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv) return (0); } +/* + * __cursor_disable_bulk -- + * Disable bulk loads into a tree. + */ +static inline void +__cursor_disable_bulk(WT_SESSION_IMPL *session, WT_BTREE *btree) +{ + /* + * Once a tree is no longer empty, eviction should pay attention to it, + * and it's no longer possible to bulk-load into it. + * + * We use a compare-and-swap here to avoid races among the first + * inserts into a tree. Eviction is disabled when an empty tree is + * opened, it must only be enabled once.
+ */ + if (btree->bulk_load_ok && + __wt_atomic_cas8(&btree->bulk_load_ok, 1, 0)) + __wt_evict_file_exclusive_off(session); +} + /* * __cursor_fix_implicit -- * Return if search went past the end of the tree. @@ -506,14 +526,8 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); - /* - * The tree is no longer empty: eviction should pay attention to it, - * and it's no longer possible to bulk-load into it. - */ - if (btree->bulk_load_ok) { - btree->bulk_load_ok = false; - __wt_evict_file_exclusive_off(session); - } + /* It's no longer possible to bulk-load into the tree. */ + __cursor_disable_bulk(session, btree); retry: WT_RET(__cursor_func_init(cbt, true)); @@ -760,14 +774,8 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); - /* - * The tree is no longer empty: eviction should pay attention to it, - * and it's no longer possible to bulk-load into it. - */ - if (btree->bulk_load_ok) { - btree->bulk_load_ok = false; - __wt_evict_file_exclusive_off(session); - } + /* It's no longer possible to bulk-load into the tree. */ + __cursor_disable_bulk(session, btree); retry: WT_RET(__cursor_func_init(cbt, true)); diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index e8eb37bfb8e..ff199eb1e0e 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -562,7 +562,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation) * tree. */ if (creation) - btree->bulk_load_ok = true; + btree->bulk_load_ok = 1; /* * A note about empty trees: the initial tree is a single root page. 
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 39f9e1159cb..e87ddc082f2 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -590,7 +590,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || - (F_ISSET(btree, WT_BTREE_NO_EVICTION) && + (btree->evict_disabled > 0 && !F_ISSET(btree, WT_BTREE_NO_RECONCILE))) goto skip_evict; diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 3bc8fe36e5e..3d8f4a61ca7 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -27,7 +27,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * page. Assert eviction has been locked out. */ WT_ASSERT(session, - F_ISSET(btree, WT_BTREE_NO_EVICTION) || + btree->evict_disabled > 0 || !F_ISSET(session->dhandle, WT_DHANDLE_OPEN)); /* diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index e59a6c2f2d9..6863533acfb 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -824,31 +824,19 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) btree = S2BT(session); cache = S2C(session)->cache; - /* - * Hold the walk lock to set the no-eviction flag. - * - * The no-eviction flag can be set permanently, in which case we never - * increment the no-eviction count. - */ + /* Hold the walk lock to turn off eviction. */ __wt_spin_lock(session, &cache->evict_walk_lock); - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) { - if (btree->evict_disabled != 0) - ++btree->evict_disabled; + if (++btree->evict_disabled > 1) { __wt_spin_unlock(session, &cache->evict_walk_lock); return (0); } - ++btree->evict_disabled; /* * Ensure no new pages from the file will be queued for eviction after - * this point. + * this point, then clear any existing LRU eviction walk for the file. */ - F_SET(btree, WT_BTREE_NO_EVICTION); (void)__wt_atomic_addv32(&cache->pass_intr, 1); - - /* Clear any existing LRU eviction walk for the file. 
*/ - WT_WITH_PASS_LOCK(session, - ret = __evict_clear_walk(session)); + WT_WITH_PASS_LOCK(session, ret = __evict_clear_walk(session)); (void)__wt_atomic_subv32(&cache->pass_intr, 1); WT_ERR(ret); @@ -879,7 +867,6 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) if (0) { err: --btree->evict_disabled; - F_CLR(btree, WT_BTREE_NO_EVICTION); } __wt_spin_unlock(session, &cache->evict_walk_lock); return (ret); @@ -904,16 +891,11 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) */ WT_DIAGNOSTIC_YIELD; - WT_ASSERT(session, - btree->evict_ref == NULL && F_ISSET(btree, WT_BTREE_NO_EVICTION)); - - /* - * The no-eviction flag can be set permanently, in which case we never - * increment the no-eviction count. - */ + /* Hold the walk lock to turn on eviction. */ __wt_spin_lock(session, &cache->evict_walk_lock); - if (btree->evict_disabled > 0 && --btree->evict_disabled == 0) - F_CLR(btree, WT_BTREE_NO_EVICTION); + WT_ASSERT(session, + btree->evict_ref == NULL && btree->evict_disabled > 0); + --btree->evict_disabled; __wt_spin_unlock(session, &cache->evict_walk_lock); } @@ -1372,7 +1354,7 @@ retry: while (slot < max_entries) { /* Skip files that don't allow eviction. */ btree = dhandle->handle; - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) + if (btree->evict_disabled > 0) continue; /* @@ -1428,9 +1410,9 @@ retry: while (slot < max_entries) { * the tree's current eviction point, and part of the process is * waiting on this thread to acknowledge that action. 
*/ - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) && + if (btree->evict_disabled == 0 && !__wt_spin_trylock(session, &cache->evict_walk_lock)) { - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + if (btree->evict_disabled == 0) { /* * Assert the handle has a root page: eviction * should have been locked out if the tree is @@ -2249,7 +2231,7 @@ __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) page = ref->page; if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) || - F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION)) + S2BT(session)->evict_disabled > 0) return (false); /* Append to the urgent queue if we can. */ @@ -2259,7 +2241,7 @@ __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) __wt_spin_lock(session, &cache->evict_queue_lock); if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) || - F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION)) + S2BT(session)->evict_disabled > 0) goto done; __wt_spin_lock(session, &urgent_queue->evict_lock); diff --git a/src/include/btree.h b/src/include/btree.h index 69ab2070eb9..fc7cd352883 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -120,7 +120,8 @@ struct __wt_btree { WT_REF root; /* Root page reference */ bool modified; /* If the tree ever modified */ - bool bulk_load_ok; /* Bulk-load is a possibility */ + uint8_t bulk_load_ok; /* Bulk-load is a possibility + (want a bool but needs atomic cas) */ WT_BM *bm; /* Block manager reference */ u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */ @@ -139,7 +140,7 @@ struct __wt_btree { u_int evict_walk_period; /* Skip this many LRU walks */ u_int evict_walk_saved; /* Saved walk skips for checkpoints */ u_int evict_walk_skips; /* Number of walks skipped */ - u_int evict_disabled; /* Eviction disabled count */ + int evict_disabled; /* Eviction disabled count */ volatile uint32_t evict_busy; /* Count of threads in eviction */ int evict_start_type; /* Start position for eviction walk (see WT_EVICT_WALK_START). 
*/ @@ -162,14 +163,13 @@ struct __wt_btree { #define WT_BTREE_LOOKASIDE 0x001000 /* Look-aside table */ #define WT_BTREE_LSM_PRIMARY 0x002000 /* Handle is current LSM primary */ #define WT_BTREE_NO_CHECKPOINT 0x004000 /* Disable checkpoints */ -#define WT_BTREE_NO_EVICTION 0x008000 /* Disable eviction */ -#define WT_BTREE_NO_LOGGING 0x010000 /* Disable logging */ -#define WT_BTREE_NO_RECONCILE 0x020000 /* Allow splits, even with no evict */ -#define WT_BTREE_REBALANCE 0x040000 /* Handle is for rebalance */ -#define WT_BTREE_SALVAGE 0x080000 /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x100000 /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x200000 /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x400000 /* Handle is for verify */ +#define WT_BTREE_NO_LOGGING 0x008000 /* Disable logging */ +#define WT_BTREE_NO_RECONCILE 0x010000 /* Allow splits, even with no evict */ +#define WT_BTREE_REBALANCE 0x020000 /* Handle is for rebalance */ +#define WT_BTREE_SALVAGE 0x040000 /* Handle is for salvage */ +#define WT_BTREE_SKIP_CKPT 0x080000 /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x100000 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x200000 /* Handle is for verify */ uint32_t flags; }; diff --git a/src/include/btree.i b/src/include/btree.i index 6dda2428122..cec6f67e9bd 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1401,7 +1401,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) if (page->read_gen != WT_READGEN_OLDEST || LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || - F_ISSET(btree, WT_BTREE_NO_EVICTION) || + btree->evict_disabled > 0 || !__wt_page_can_evict(session, ref, NULL)) return (__wt_hazard_clear(session, ref)); @@ -1521,7 +1521,7 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) return (false); /* A tree that can be evicted always requires a switch. 
*/ - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) + if (btree->evict_disabled == 0) return (true); /* Check for a tree with a single leaf page. */ @@ -1569,8 +1569,7 @@ __wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on) } if (!on && F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { pindex = WT_INTL_INDEX_GET_SAFE(root); - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) || - pindex->entries != 1) + if (btree->evict_disabled == 0 || pindex->entries != 1) return (0); first = pindex->index[0]; -- cgit v1.2.1 From 9e3f71ef55b906b25c63e1000cf39949a587550d Mon Sep 17 00:00:00 2001 From: sueloverso Date: Wed, 8 Mar 2017 14:39:13 -0500 Subject: WT-3213 Only error if fixed-length and long_running_txn is set. --- test/format/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/format/config.c b/test/format/config.c index 958ad6b7a99..cd9856d641e 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -462,7 +462,7 @@ config_lrt(void) * stores. */ if (g.type == FIX) { - if (config_is_perm("long_running_txn")) + if (config_is_perm("long_running_txn") && g.c_long_running_txn) testutil_die(EINVAL, "long_running_txn not supported with fixed-length " "column store"); -- cgit v1.2.1 From a67019791436f1dfaca9cffda17e2fa9935296db Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Fri, 10 Mar 2017 14:00:13 +1100 Subject: WT-3207 Don't hold clean handles during checkpoints. (#3319) Previously, we gathered handles, then started a transaction, then figured out which handles were clean and released them. However, * checkpoints were keeping every handle in both its handle list and in the meta_tracking list because the *_apply_all functions were saving all handles when meta_tracking was active; and * we had acquired exclusive locks on checkpoints to be dropped before determining that we could skip a checkpoint in a clean tree. These locks blocked drops (among other things) until the checkpoint completed. 
The solution here is to first start the transaction, then check for clean handles as checkpoint visits them. However, this has to cope with races where a handle changes state in between the transaction starting and getting the handle (e.g., table creates, bulk loads completing). --- src/conn/conn_dhandle.c | 12 +- src/include/btree.h | 1 - src/include/txn.i | 3 +- src/meta/meta_apply.c | 6 +- src/meta/meta_table.c | 3 - src/session/session_dhandle.c | 17 +- src/txn/txn_ckpt.c | 413 +++++++++++++++++++++++------------------- 7 files changed, 247 insertions(+), 208 deletions(-) diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 6958b79f10f..25795a8d309 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -400,10 +400,7 @@ __conn_btree_apply_internal(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, return (ret == EBUSY ? 0 : ret); WT_SAVE_DHANDLE(session, ret = file_func(session, cfg)); - if (WT_META_TRACKING(session)) - WT_TRET(__wt_meta_track_handle_lock(session, false)); - else - WT_TRET(__wt_session_release_btree(session)); + WT_TRET(__wt_session_release_btree(session)); return (ret); } @@ -497,7 +494,12 @@ __wt_conn_dhandle_close_all( session->dhandle = dhandle; - /* Lock the handle exclusively. */ + /* + * Lock the handle exclusively. If this is part of + * schema-changing operation (indicated by metadata tracking + * being enabled), hold the lock for the duration of the + * operation. 
+ */ WT_ERR(__wt_session_get_btree(session, dhandle->name, dhandle->checkpoint, NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY)); diff --git a/src/include/btree.h b/src/include/btree.h index fc7cd352883..857dc6694c5 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -127,7 +127,6 @@ struct __wt_btree { u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */ uint64_t checkpoint_gen; /* Checkpoint generation */ - bool include_checkpoint_txn;/* ID checks include checkpoint */ uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ uint64_t write_gen; /* Write generation */ diff --git a/src/include/txn.i b/src/include/txn.i index 0cc4a6f8439..314c948e4d1 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -125,7 +125,8 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) * minimum of it with the oldest ID, which is what we want. */ oldest_id = txn_global->oldest_id; - include_checkpoint_txn = btree == NULL || btree->include_checkpoint_txn; + include_checkpoint_txn = btree == NULL || + btree->checkpoint_gen != txn_global->checkpoint_gen; WT_READ_BARRIER(); checkpoint_pinned = txn_global->checkpoint_pinned; diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c index fb483c21dd9..dc93180a5e5 100644 --- a/src/meta/meta_apply.c +++ b/src/meta/meta_apply.c @@ -45,11 +45,7 @@ __meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor, session, uri, NULL, NULL, 0)) != 0) return (ret == EBUSY ? 
0 : ret); WT_SAVE_DHANDLE(session, ret = file_func(session, cfg)); - if (WT_META_TRACKING(session)) - WT_TRET(__wt_meta_track_handle_lock( - session, false)); - else - WT_TRET(__wt_session_release_btree(session)); + WT_TRET(__wt_session_release_btree(session)); WT_RET(ret); } WT_RET_NOTFOUND_OK(ret); diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c index 4f60728b2d2..aca69d0e6a2 100644 --- a/src/meta/meta_table.c +++ b/src/meta/meta_table.c @@ -68,9 +68,6 @@ __wt_metadata_cursor_open( if (F_ISSET(btree, WT_BTREE_NO_LOGGING)) F_CLR(btree, WT_BTREE_NO_LOGGING); - /* The metadata file always uses checkpoint IDs in visibility checks. */ - btree->include_checkpoint_txn = true; - return (0); } diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index 469da21a448..7c96dd8b8a8 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -560,7 +560,7 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) { - WT_DATA_HANDLE *dhandle, *saved_dhandle; + WT_DATA_HANDLE *saved_dhandle; WT_DECL_RET; WT_ASSERT(session, WT_META_TRACKING(session)); @@ -568,10 +568,15 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) /* * Get the checkpoint handle exclusive, so no one else can access it - * while we are creating the new checkpoint. + * while we are creating the new checkpoint. Hold the lock until the + * checkpoint completes. 
*/ WT_ERR(__wt_session_get_btree(session, saved_dhandle->name, checkpoint, NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY)); + if ((ret = __wt_meta_track_handle_lock(session, false)) != 0) { + WT_TRET(__wt_session_release_btree(session)); + goto err; + } /* * Get exclusive access to the handle and then flush any pages in this @@ -587,13 +592,9 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) * We lock checkpoint handles that we are overwriting, so the handle * must be closed when we release it. */ - dhandle = session->dhandle; - F_SET(dhandle, WT_DHANDLE_DISCARD); + F_SET(session->dhandle, WT_DHANDLE_DISCARD); - WT_ERR(__wt_meta_track_handle_lock(session, false)); - - /* Restore the original btree in the session. */ + /* Restore the original data handle in the session. */ err: session->dhandle = saved_dhandle; - return (ret); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 3261c8089f4..d6f0e45c042 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -8,9 +8,9 @@ #include "wt_internal.h" -static int __checkpoint_lock_tree( - WT_SESSION_IMPL *, bool, bool, const char *[]); -static int __checkpoint_mark_deletes(WT_SESSION_IMPL *, const char *[]); +static int __checkpoint_lock_dirty_tree( + WT_SESSION_IMPL *, bool, bool, bool, const char *[]); +static int __checkpoint_mark_skip(WT_SESSION_IMPL *, WT_CKPT *, bool); static int __checkpoint_presync(WT_SESSION_IMPL *, const char *[]); static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]); @@ -89,6 +89,33 @@ err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } +/* + * __checkpoint_update_generation -- + * Update the checkpoint generation of the current tree. + * + * This indicates that the tree will not be visited again by the current + * checkpoint. 
+ */ +static void +__checkpoint_update_generation(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + /* + * Updates to the metadata are made by the checkpoint transaction, so + * the metadata tree's checkpoint generation should never be updated. + */ + if (WT_IS_METADATA(session->dhandle)) + return; + + WT_PUBLISH(btree->checkpoint_gen, + S2C(session)->txn_global.checkpoint_gen); + WT_STAT_DATA_SET(session, + btree_checkpoint_generation, btree->checkpoint_gen); +} + /* * __checkpoint_apply_all -- * Apply an operation to all files involved in a checkpoint. @@ -239,22 +266,95 @@ int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BTREE *btree; + WT_CONFIG_ITEM cval; + WT_CURSOR *meta_cursor; WT_DECL_RET; const char *name; + bool force, metadata_race; + + btree = S2BT(session); + + /* Find out if we have to force a checkpoint. */ + force = false; + WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); + force = cval.val != 0; + if (!force) { + WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval)); + force = cval.len != 0; + } /* Should not be called with anything other than a file object. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:")); /* Skip files that are never involved in a checkpoint. */ - if (F_ISSET(S2BT(session), WT_BTREE_NO_CHECKPOINT)) + if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + return (0); + + /* + * We may have raced between starting the checkpoint transaction and + * some operation completing on the handle that updated the metadata + * (e.g., closing a bulk load cursor). All such operations either have + * exclusive access to the handle or hold the schema lock. We are now + * holding the schema lock and have an open btree handle, so if we + * can't update the metadata, then there has been some state change + * invisible to the checkpoint transaction. 
Skip checkpointing such + * files: they must have a recent durable point. + */ + if (!WT_IS_METADATA(session->dhandle)) { + WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR)); + WT_RET(__wt_metadata_cursor(session, &meta_cursor)); + meta_cursor->set_key(meta_cursor, session->dhandle->name); + ret = __wt_curfile_update_check(meta_cursor); + if (ret == WT_ROLLBACK) { + metadata_race = true; + ret = 0; + } else + metadata_race = false; + WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor)); + WT_RET(ret); + if (metadata_race) { + /* + * The conflict registers as a rollback error: that can + * safely be skipped here. + */ + F_CLR(&session->txn, WT_TXN_ERROR); + if (force) + WT_RET_MSG(session, EBUSY, + "forced or named checkpoint raced with " + "a metadata update"); + __wt_verbose(session, WT_VERB_CHECKPOINT, + "skipped checkpoint of %s with metadata conflict", + session->dhandle->name); + F_SET(btree, WT_BTREE_SKIP_CKPT); + __checkpoint_update_generation(session); + return (0); + } + } + + /* + * Decide whether the tree needs to be included in the checkpoint and + * if so, acquire the necessary locks. + */ + WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree( + session, true, force, true, cfg)); + WT_RET(ret); + if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) { + __checkpoint_update_generation(session); return (0); + } - /* Make sure there is space for the next entry. */ + /* + * Make sure there is space for the new entry: do this before getting + * the handle to avoid cleanup if we can't allocate the memory. + */ WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated, session->ckpt_handle_next + 1, &session->ckpt_handle)); - /* Not strictly necessary, but cleaner to clear the current handle. */ + /* + * The current tree will be included: get it again because the handle + * we have is only valid for the duration of this function. 
+ */ name = session->dhandle->name; session->dhandle = NULL; @@ -266,48 +366,12 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) * with eviction and we don't want to unfairly penalize (or promote) * eviction in trees due to checkpoints. */ - btree = S2BT(session); btree->evict_walk_saved = btree->evict_walk_period; - WT_SAVE_DHANDLE(session, - ret = __checkpoint_lock_tree(session, true, true, cfg)); - if (ret != 0) { - WT_TRET(__wt_session_release_btree(session)); - return (ret); - } - - /* - * Flag that the handle is part of a checkpoint for the purposes - * of transaction visibility checks. - */ - WT_PUBLISH(btree->include_checkpoint_txn, true); - session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle; return (0); } -/* - * __checkpoint_update_generation -- - * Update the checkpoint generation of the current tree. - * - * This indicates that the tree will not be visited again by the current - * checkpoint. - */ -static void -__checkpoint_update_generation(WT_SESSION_IMPL *session) -{ - WT_BTREE *btree; - - btree = S2BT(session); - if (!WT_IS_METADATA(session->dhandle)) - WT_PUBLISH(btree->include_checkpoint_txn, false); - - WT_PUBLISH(btree->checkpoint_gen, - S2C(session)->txn_global.checkpoint_gen); - WT_STAT_DATA_SET(session, - btree_checkpoint_generation, btree->checkpoint_gen); -} - /* * __checkpoint_reduce_dirty_cache -- * Release clean trees from the list cached for checkpoints. @@ -433,36 +497,6 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) WT_STAT_CONN_SET(session, txn_checkpoint_scrub_time, total_ms); } -/* - * __checkpoint_release_clean_trees -- - * Release clean trees from the list cached for checkpoints. 
- */ -static int -__checkpoint_release_clean_trees(WT_SESSION_IMPL *session) -{ - WT_BTREE *btree; - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - u_int i; - - for (i = 0; i < session->ckpt_handle_next; i++) { - dhandle = session->ckpt_handle[i]; - btree = dhandle->handle; - if (!F_ISSET(btree, WT_BTREE_SKIP_CKPT)) - continue; - __wt_meta_ckptlist_free(session, btree->ckpt); - btree->ckpt = NULL; - WT_WITH_DHANDLE(session, dhandle, - __checkpoint_update_generation(session)); - session->ckpt_handle[i] = NULL; - WT_WITH_DHANDLE(session, dhandle, - ret = __wt_session_release_btree(session)); - WT_RET(ret); - } - - return (0); -} - /* * __checkpoint_stats -- * Update checkpoint timer stats. @@ -535,6 +569,96 @@ __checkpoint_fail_reset(WT_SESSION_IMPL *session) S2BT(session)->ckpt = NULL; } +/* + * __checkpoint_prepare -- + * Start the transaction for a checkpoint and gather handles. + */ +static int +__checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; + const char *txn_cfg[] = { WT_CONFIG_BASE(session, + WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; + + conn = S2C(session); + txn = &session->txn; + txn_global = &conn->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); + + /* + * Start a snapshot transaction for the checkpoint. + * + * Note: we don't go through the public API calls because they have + * side effects on cursors, which applications can hold open across + * calls to checkpoint. + */ + WT_RET(__wt_txn_begin(session, txn_cfg)); + + /* Ensure a transaction ID is allocated prior to sharing it globally */ + WT_RET(__wt_txn_id_check(session)); + + /* + * Mark the connection as clean. If some data gets modified after + * generating checkpoint transaction id, connection will be reset to + * dirty when reconciliation marks the btree dirty on encountering the + * dirty page. 
+ */ + conn->modified = false; + + /* + * Save the checkpoint session ID. + * + * We never do checkpoints in the default session (with id zero). + */ + WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); + txn_global->checkpoint_id = session->id; + + /* + * Remove the checkpoint transaction from the global table. + * + * This allows ordinary visibility checks to move forward because + * checkpoints often take a long time and only write to the metadata. + */ + __wt_writelock(session, &txn_global->scan_rwlock); + txn_global->checkpoint_txnid = txn->id; + txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min); + + /* + * Sanity check that the oldest ID hasn't moved on before we have + * cleared our entry. + */ + WT_ASSERT(session, + WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && + WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); + + /* + * Clear our entry from the global transaction session table. Any + * operation that needs to know about the ID for this checkpoint will + * consider the checkpoint ID in the global structure. Most operations + * can safely ignore the checkpoint ID (see the visible all check for + * details). + */ + txn_state->id = txn_state->pinned_id = + txn_state->metadata_pinned = WT_TXN_NONE; + __wt_writeunlock(session, &txn_global->scan_rwlock); + + /* + * Get a list of handles we want to flush; for named checkpoints this + * may pull closed objects into the session cache. + * + * First, gather all handles, then start the checkpoint transaction, + * then release any clean handles. + */ + WT_ASSERT(session, session->ckpt_handle_next == 0); + WT_WITH_TABLE_READ_LOCK(session, ret = __checkpoint_apply_all( + session, cfg, __wt_checkpoint_get_handles, NULL)); + return (ret); +} + /* * __txn_checkpoint -- * Checkpoint a database or a list of objects in the database. 
@@ -550,19 +674,15 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_ISOLATION saved_isolation; - WT_TXN_STATE *txn_state; void *saved_meta_next; u_int i; uint64_t fsync_duration_usecs; bool failed, full, idle, logging, tracking; - const char *txn_cfg[] = { WT_CONFIG_BASE(session, - WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; conn = S2C(session); cache = conn->cache; txn = &session->txn; txn_global = &conn->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); saved_isolation = session->isolation; full = idle = logging = tracking = false; @@ -631,86 +751,24 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) tracking = true; /* - * Get a list of handles we want to flush; for named checkpoints this - * may pull closed objects into the session cache. - * * We want to skip checkpointing clean handles whenever possible. That * is, when the checkpoint is not named or forced. However, we need to * take care about ordering with respect to the checkpoint transaction. * - * If we skip clean handles before starting the transaction, the + * We can't skip clean handles before starting the transaction or the * checkpoint can miss updates in trees that become dirty as the * checkpoint is starting. If we wait until the transaction has * started before locking a handle, there could be a metadata-changing * operation in between (e.g., salvage) that will cause a write * conflict when the checkpoint goes to write the metadata. * - * First, gather all handles, then start the checkpoint transaction, - * then release any clean handles. + * Hold the schema lock while starting the transaction and gathering + * handles so the set we get is complete and correct. 
*/ - WT_ASSERT(session, session->ckpt_handle_next == 0); - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_READ_LOCK(session, - ret = __checkpoint_apply_all( - session, cfg, __wt_checkpoint_get_handles, NULL))); + WT_WITH_SCHEMA_LOCK(session, ret = __checkpoint_prepare(session, cfg)); WT_ERR(ret); - /* - * Start a snapshot transaction for the checkpoint. - * - * Note: we don't go through the public API calls because they have - * side effects on cursors, which applications can hold open across - * calls to checkpoint. - */ - WT_ERR(__wt_txn_begin(session, txn_cfg)); - - /* Ensure a transaction ID is allocated prior to sharing it globally */ - WT_ERR(__wt_txn_id_check(session)); - - /* - * Mark the connection as clean. If some data gets modified after - * generating checkpoint transaction id, connection will be reset to - * dirty when reconciliation marks the btree dirty on encountering the - * dirty page. - */ - conn->modified = false; - - /* - * Save the checkpoint session ID. - * - * We never do checkpoints in the default session (with id zero). - */ - WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); - txn_global->checkpoint_id = session->id; - - /* - * Remove the checkpoint transaction from the global table. - * - * This allows ordinary visibility checks to move forward because - * checkpoints often take a long time and only write to the metadata. - */ - __wt_writelock(session, &txn_global->scan_rwlock); - txn_global->checkpoint_txnid = txn->id; - txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min); - - /* - * Sanity check that the oldest ID hasn't moved on before we have - * cleared our entry. - */ - WT_ASSERT(session, - WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && - WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); - - /* - * Clear our entry from the global transaction session table. Any - * operation that needs to know about the ID for this checkpoint will - * consider the checkpoint ID in the global structure. 
Most operations - * can safely ignore the checkpoint ID (see the visible all check for - * details). - */ - txn_state->id = txn_state->pinned_id = - txn_state->metadata_pinned = WT_TXN_NONE; - __wt_writeunlock(session, &txn_global->scan_rwlock); + WT_ASSERT(session, txn->isolation == WT_ISO_SNAPSHOT); /* * Unblock updates -- we can figure out that any updates to clean pages @@ -719,16 +777,6 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) cache->eviction_scrub_limit = 0.0; WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); - /* - * Mark old checkpoints that are being deleted and figure out which - * trees we can skip in this checkpoint. - * - * Release clean trees. Any updates made after this point will not - * visible to the checkpoint transaction. - */ - WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_mark_deletes)); - WT_ERR(__checkpoint_release_clean_trees(session)); - /* Tell logging that we have started a database checkpoint. */ if (full && logging) WT_ERR(__wt_txn_checkpoint_log( @@ -1065,12 +1113,13 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len) } /* - * __checkpoint_lock_tree -- - * Acquire the locks required to checkpoint a tree. + * __checkpoint_lock_dirty_tree -- + * Decide whether the tree needs to be included in the checkpoint and if + * so, acquire the necessary locks. */ static int -__checkpoint_lock_tree(WT_SESSION_IMPL *session, - bool is_checkpoint, bool need_tracking, const char *cfg[]) +__checkpoint_lock_dirty_tree(WT_SESSION_IMPL *session, + bool is_checkpoint, bool force, bool need_tracking, const char *cfg[]) { WT_BTREE *btree; WT_CKPT *ckpt, *ckptbase; @@ -1194,6 +1243,14 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, ckpt->name); } + /* + * Mark old checkpoints that are being deleted and figure out which + * trees we can skip in this checkpoint. 
+ */ + WT_ERR(__checkpoint_mark_skip(session, ckptbase, force)); + if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) + goto err; + /* * Lock the checkpoints that will be deleted. * @@ -1268,23 +1325,19 @@ err: if (hot_backup_locked) } /* - * __checkpoint_mark_deletes -- - * Figure out what old checkpoints will be deleted, and whether the - * checkpoint can be skipped entirely. + * __checkpoint_mark_skip -- + * Figure out whether the checkpoint can be skipped for a tree. */ static int -__checkpoint_mark_deletes( - WT_SESSION_IMPL *session, const char *cfg[]) +__checkpoint_mark_skip( + WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force) { WT_BTREE *btree; - WT_CKPT *ckpt, *ckptbase; - WT_CONFIG_ITEM cval; + WT_CKPT *ckpt; const char *name; int deleted; - bool force; btree = S2BT(session); - ckptbase = btree->ckpt; /* * Check for clean objects not requiring a checkpoint. @@ -1310,12 +1363,7 @@ __checkpoint_mark_deletes( * to open the checkpoint in a cursor after taking any checkpoint, which * means it must exist. 
*/ - force = false; F_CLR(btree, WT_BTREE_SKIP_CKPT); - if (!btree->modified && cfg != NULL) { - WT_RET(__wt_config_gets(session, cfg, "force", &cval)); - force = cval.val != 0; - } if (!btree->modified && !force) { deleted = 0; WT_CKPT_FOREACH(ckptbase, ckpt) @@ -1524,7 +1572,8 @@ __checkpoint_presync(WT_SESSION_IMPL *session, const char *cfg[]) WT_UNUSED(cfg); btree = S2BT(session); - WT_ASSERT(session, !btree->include_checkpoint_txn); + WT_ASSERT(session, btree->checkpoint_gen == + S2C(session)->txn_global.checkpoint_gen); btree->evict_walk_period = btree->evict_walk_saved; return (0); } @@ -1582,12 +1631,11 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || F_ISSET(session, WT_SESSION_LOCKED_METADATA)); - WT_SAVE_DHANDLE(session, - ret = __checkpoint_lock_tree(session, true, true, cfg)); - WT_RET(ret); - WT_SAVE_DHANDLE(session, - ret = __checkpoint_mark_deletes(session, cfg)); + WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree( + session, true, false, true, cfg)); WT_RET(ret); + if (F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT)) + return (0); return (__checkpoint_tree(session, true, cfg)); } @@ -1662,15 +1710,10 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) if (need_tracking) WT_RET(__wt_meta_track_on(session)); - WT_SAVE_DHANDLE(session, - ret = __checkpoint_lock_tree(session, false, need_tracking, NULL)); + WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree( + session, false, false, need_tracking, NULL)); WT_ASSERT(session, ret == 0); - if (ret == 0) { - WT_SAVE_DHANDLE(session, - ret = __checkpoint_mark_deletes(session, NULL)); - WT_ASSERT(session, ret == 0); - } - if (ret == 0) + if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT)) ret = __checkpoint_tree(session, false, NULL); if (need_tracking) -- cgit v1.2.1 From 1e05438f426c0c54a603f660fb7831eb2b9a523e Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Fri, 10 Mar 2017 21:01:21 +1100 Subject: WT-3207 
Report a message for conflicting forced checkpoints, rather than an error (#3326) Have test/fops handle EBUSY returns from forced checkpoints and EINVAL from bulk cursors. --- src/docs/upgrading.dox | 6 ++++++ src/txn/txn_ckpt.c | 8 +++++--- test/fops/file.c | 27 ++++++++++++++++++--------- test/fops/t.c | 5 +++++ 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index f463e6bc615..2894db0c126 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -13,6 +13,12 @@ In the 2.9.1 release we added statistics tracking handle list lock timing, we have switched that lock from a spin lock to a read-write lock, and consequently changed the statistics tracking lock related wait time. +
<dt>Forced and named checkpoint error conditions changed</dt>
+<dd>
+There are new cases where checkpoints created with an explicit name or the +"force" configuration option can return an EBUSY error. This can happen if +the checkpoint overlaps with other schema operations, for example table create. +</dd>
@section version_291 Upgrading to Version 2.9.1 diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index d6f0e45c042..3eb07089b87 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -319,10 +319,12 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) * safely be skipped here. */ F_CLR(&session->txn, WT_TXN_ERROR); - if (force) - WT_RET_MSG(session, EBUSY, + if (force) { + WT_RET(__wt_msg(session, "forced or named checkpoint raced with " - "a metadata update"); + "a metadata update")); + return (EBUSY); + } __wt_verbose(session, WT_VERB_CHECKPOINT, "skipped checkpoint of %s with metadata conflict", session->dhandle->name); diff --git a/test/fops/file.c b/test/fops/file.c index ea15f1ee80d..66c23dfed3c 100644 --- a/test/fops/file.c +++ b/test/fops/file.c @@ -51,7 +51,7 @@ obj_bulk(void) if ((ret = c->close(c)) != 0) testutil_die(ret, "cursor.close"); } else if (ret != ENOENT && ret != EBUSY && ret != EINVAL) - testutil_die(ret, "session.open_cursor"); + testutil_die(ret, "session.open_cursor bulk"); } if ((ret = session->close(session, NULL)) != 0) testutil_die(ret, "session.close"); @@ -79,12 +79,17 @@ obj_bulk_unique(int force) testutil_die(ret, "session.create: %s", new_uri); __wt_yield(); - if ((ret = - session->open_cursor(session, new_uri, NULL, "bulk", &c)) != 0) - testutil_die(ret, "session.open_cursor: %s", new_uri); - - if ((ret = c->close(c)) != 0) - testutil_die(ret, "cursor.close"); + /* + * Opening a bulk cursor may have raced with a forced checkpoint + * which created a checkpoint of the empty file, and triggers an EINVAL + */ + if ((ret = session->open_cursor( + session, new_uri, NULL, "bulk", &c)) == 0) { + if ((ret = c->close(c)) != 0) + testutil_die(ret, "cursor.close"); + } else if (ret != EINVAL) + testutil_die(ret, + "session.open_cursor bulk unique: %s, new_uri"); while ((ret = session->drop( session, new_uri, force ? 
"force" : NULL)) != 0) @@ -190,9 +195,13 @@ obj_checkpoint(void) if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) testutil_die(ret, "conn.session"); - /* Force the checkpoint so it has to be taken. */ + /* + * Force the checkpoint so it has to be taken. Forced checkpoints can + * race with other metadata operations and return EBUSY - we'd expect + * applications using forced checkpoints to retry on EBUSY. + */ if ((ret = session->checkpoint(session, "force")) != 0) - if (ret != ENOENT) + if (ret != EBUSY && ret != ENOENT) testutil_die(ret, "session.checkpoint"); if ((ret = session->close(session, NULL)) != 0) diff --git a/test/fops/t.c b/test/fops/t.c index 651d22c8deb..469d5acd33a 100644 --- a/test/fops/t.c +++ b/test/fops/t.c @@ -217,6 +217,11 @@ handle_message(WT_EVENT_HANDLER *handler, (void)(handler); (void)(session); + /* Ignore messages about failing to create forced checkpoints. */ + if (strstr( + message, "forced or named checkpoint") != NULL) + return (0); + if (logfp != NULL) return (fprintf(logfp, "%s\n", message) < 0 ? -1 : 0); -- cgit v1.2.1 From f72c78b74d42c9e89bc98ad56ba184536e8efcae Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Mon, 13 Mar 2017 17:13:31 +1100 Subject: WT-3207 Fix a leak if a checkpoint fails. (#3329) Also switch to holding the schema lock when completing a bulk load. This avoids a race with checkpoints starting, so avoids a failure mode that was added to checkpoint earlier in this ticket. Assert that we don't hit that case instead. 
--- src/btree/bt_vrfy.c | 2 +- src/conn/conn_dhandle.c | 10 +++--- src/include/extern.h | 2 +- src/meta/meta_ckpt.c | 10 +++--- src/meta/meta_ext.c | 2 +- src/session/session_dhandle.c | 10 ++++++ src/session/session_salvage.c | 2 +- src/txn/txn_ckpt.c | 75 +++++++++++++++---------------------------- 8 files changed, 50 insertions(+), 63 deletions(-) diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index 3c90e580696..7475811adc5 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -274,7 +274,7 @@ err: /* Inform the underlying block manager we're done. */ /* Discard the list of checkpoints. */ if (ckptbase != NULL) - __wt_meta_ckptlist_free(session, ckptbase); + __wt_meta_ckptlist_free(session, &ckptbase); /* Free allocated memory. */ __wt_scr_free(session, &vs->max_key); diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 25795a8d309..6c8d66d63f8 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -152,11 +152,11 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_RET(__wt_evict_file_exclusive_on(session)); /* - * If we don't already have the schema lock, make it an error to try - * to acquire it. The problem is that we are holding an exclusive - * lock on the handle, and if we attempt to acquire the schema lock - * we might deadlock with a thread that has the schema lock and wants - * a handle lock (specifically, checkpoint). + * If we don't already have the schema lock, make it an error to try to + * acquire it. The problem is that we are holding an exclusive lock on + * the handle, and if we attempt to acquire the schema lock we might + * deadlock with a thread that has the schema lock and wants a handle + * lock. 
*/ no_schema_lock = false; if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { diff --git a/src/include/extern.h b/src/include/extern.h index d0c9655fafb..db718966426 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -462,7 +462,7 @@ extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session, const char extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_ckptlist_get( WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT **ckptbasep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_ext_metadata_insert(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_ext_metadata_remove( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c index b985104c2eb..151bbe0e081 100644 --- a/src/meta/meta_ckpt.c +++ b/src/meta/meta_ckpt.c @@ -297,7 +297,7 @@ __wt_meta_ckptlist_get( *ckptbasep = ckptbase; if (0) { -err: 
__wt_meta_ckptlist_free(session, ckptbase); +err: __wt_meta_ckptlist_free(session, &ckptbase); } __wt_free(session, config); __wt_scr_free(session, &buf); @@ -463,16 +463,16 @@ err: __wt_scr_free(session, &buf); * Discard the checkpoint array. */ void -__wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) +__wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT **ckptbasep) { - WT_CKPT *ckpt; + WT_CKPT *ckpt, *ckptbase; - if (ckptbase == NULL) + if ((ckptbase = *ckptbasep) == NULL) return; WT_CKPT_FOREACH(ckptbase, ckpt) __wt_meta_checkpoint_free(session, ckpt); - __wt_free(session, ckptbase); + __wt_free(session, *ckptbasep); } /* diff --git a/src/meta/meta_ext.c b/src/meta/meta_ext.c index 50e7568fe77..aa1ea8b974d 100644 --- a/src/meta/meta_ext.c +++ b/src/meta/meta_ext.c @@ -102,5 +102,5 @@ void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { - __wt_meta_ckptlist_free((WT_SESSION_IMPL *)session, ckptbase); + __wt_meta_ckptlist_free((WT_SESSION_IMPL *)session, &ckptbase); } diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index 7c96dd8b8a8..95fb6a6f90e 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -270,6 +270,16 @@ __wt_session_release_btree(WT_SESSION_IMPL *session) if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_FORCE)) { ret = __wt_conn_btree_sync_and_close(session, false, true); F_CLR(dhandle, WT_DHANDLE_DISCARD_FORCE); + } else if (F_ISSET(btree, WT_BTREE_BULK)) { + WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && + !F_ISSET(dhandle, WT_DHANDLE_DISCARD)); + /* + * Acquire the schema lock while completing a bulk load. This + * avoids racing with a checkpoint while it gathers a set + * of handles. 
+ */ + WT_WITH_SCHEMA_LOCK(session, ret = + __wt_conn_btree_sync_and_close(session, false, false)); } else if (F_ISSET(dhandle, WT_DHANDLE_DISCARD) || F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) { WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE)); diff --git a/src/session/session_salvage.c b/src/session/session_salvage.c index 983b28dd8ea..12ce71cdbb0 100644 --- a/src/session/session_salvage.c +++ b/src/session/session_salvage.c @@ -54,6 +54,6 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_meta_ckptlist_set( session, dhandle->name, ckptbase, NULL)); -err: __wt_meta_ckptlist_free(session, ckptbase); +err: __wt_meta_ckptlist_free(session, &ckptbase); return (ret); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 3eb07089b87..748f4aa2473 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -267,10 +267,9 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BTREE *btree; WT_CONFIG_ITEM cval; - WT_CURSOR *meta_cursor; WT_DECL_RET; const char *name; - bool force, metadata_race; + bool force; btree = S2BT(session); @@ -291,6 +290,7 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) return (0); +#ifdef HAVE_DIAGNOSTIC /* * We may have raced between starting the checkpoint transaction and * some operation completing on the handle that updated the metadata @@ -298,10 +298,12 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) * exclusive access to the handle or hold the schema lock. We are now * holding the schema lock and have an open btree handle, so if we * can't update the metadata, then there has been some state change - * invisible to the checkpoint transaction. Skip checkpointing such - * files: they must have a recent durable point. + * invisible to the checkpoint transaction. 
*/ if (!WT_IS_METADATA(session->dhandle)) { + WT_CURSOR *meta_cursor; + bool metadata_race; + WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR)); WT_RET(__wt_metadata_cursor(session, &meta_cursor)); meta_cursor->set_key(meta_cursor, session->dhandle->name); @@ -313,26 +315,9 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) metadata_race = false; WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor)); WT_RET(ret); - if (metadata_race) { - /* - * The conflict registers as a rollback error: that can - * safely be skipped here. - */ - F_CLR(&session->txn, WT_TXN_ERROR); - if (force) { - WT_RET(__wt_msg(session, - "forced or named checkpoint raced with " - "a metadata update")); - return (EBUSY); - } - __wt_verbose(session, WT_VERB_CHECKPOINT, - "skipped checkpoint of %s with metadata conflict", - session->dhandle->name); - F_SET(btree, WT_BTREE_SKIP_CKPT); - __checkpoint_update_generation(session); - return (0); - } + WT_ASSERT(session, !metadata_race); } +#endif /* * Decide whether the tree needs to be included in the checkpoint and @@ -342,6 +327,7 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) session, true, force, true, cfg)); WT_RET(ret); if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) { + WT_ASSERT(session, btree->ckpt == NULL); __checkpoint_update_generation(session); return (0); } @@ -567,8 +553,11 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, static void __checkpoint_fail_reset(WT_SESSION_IMPL *session) { - S2BT(session)->modified = true; - S2BT(session)->ckpt = NULL; + WT_BTREE *btree; + + btree = S2BT(session); + btree->modified = true; + __wt_meta_ckptlist_free(session, &btree->ckpt); } /* @@ -600,6 +589,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_RET(__wt_txn_begin(session, txn_cfg)); + WT_DIAGNOSTIC_YIELD; + /* Ensure a transaction ID is allocated prior to sharing it globally */ WT_RET(__wt_txn_id_check(session)); @@ -1286,33 +1277,20 @@ 
__checkpoint_lock_dirty_tree(WT_SESSION_IMPL *session, } /* - * There are special files: those being bulk-loaded, salvaged, upgraded - * or verified during the checkpoint. We have to do something for those - * objects because a checkpoint is an external name the application can - * reference and the name must exist no matter what's happening during - * the checkpoint. For bulk-loaded files, we could block until the load - * completes, checkpoint the partial load, or magic up an empty-file - * checkpoint. The first is too slow, the second is insane, so do the - * third. - * Salvage, upgrade and verify don't currently require any work, all - * three hold the schema lock, blocking checkpoints. If we ever want to - * fix that (and I bet we eventually will, at least for verify), we can - * copy the last checkpoint the file has. That works if we guarantee - * salvage, upgrade and verify act on objects with previous checkpoints - * (true if handles are closed/re-opened between object creation and a - * subsequent salvage, upgrade or verify operation). Presumably, - * salvage and upgrade will discard all previous checkpoints when they - * complete, which is fine with us. This change will require reference - * counting checkpoints, and once that's done, we should use checkpoint - * copy instead of forcing checkpoints on clean objects to associate - * names with checkpoints. + * There are special trees: those being bulk-loaded, salvaged, upgraded + * or verified during the checkpoint. They should never be part of a + * checkpoint: we will fail to lock them because the operations have + * exclusive access to the handles. Named checkpoints will fail in that + * case, ordinary checkpoints will skip files that cannot be opened + * normally.
*/ WT_ASSERT(session, !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); __wt_readunlock(session, &conn->hot_backup_lock); - WT_ASSERT(session, btree->ckpt == NULL); + WT_ASSERT(session, btree->ckpt == NULL && + !F_ISSET(btree, WT_BTREE_SKIP_CKPT)); btree->ckpt = ckptbase; return (0); @@ -1320,7 +1298,7 @@ __checkpoint_lock_dirty_tree(WT_SESSION_IMPL *session, err: if (hot_backup_locked) __wt_readunlock(session, &conn->hot_backup_lock); - __wt_meta_ckptlist_free(session, ckptbase); + __wt_meta_ckptlist_free(session, &ckptbase); __wt_free(session, name_alloc); return (ret); @@ -1554,8 +1532,7 @@ err: /* S2C(session)->modified = true; } - __wt_meta_ckptlist_free(session, ckptbase); - btree->ckpt = NULL; + __wt_meta_ckptlist_free(session, &btree->ckpt); return (ret); } -- cgit v1.2.1 From 28a883f7b4acd020a8d92a733cb9df415a6be482 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 13 Mar 2017 20:53:11 -0400 Subject: WT-3211 WT_CURSOR.remove cannot always retain its position. (#3321) --- .gitignore | 30 +++-- SConstruct | 1 - dist/s_string.ok | 1 + dist/s_void | 4 +- examples/c/Makefile.am | 1 - examples/c/ex_scope.c | 217 ------------------------------ src/btree/bt_cursor.c | 312 +++++++++++++++++++++++++++++++------------- src/btree/bt_ret.c | 152 ++++++++++++--------- src/cursor/cur_file.c | 23 ++-- src/cursor/cur_table.c | 30 ++++- src/docs/cursor-ops.dox | 4 - src/docs/upgrading.dox | 30 ++++- src/include/api.h | 37 +++--- src/include/buf.i | 37 +++--- src/include/cursor.i | 35 ++--- src/include/error.h | 6 +- src/include/extern.h | 1 + src/include/misc.h | 1 + src/include/session.h | 1 - src/include/wiredtiger.in | 30 +++-- src/lsm/lsm_cursor.c | 42 +++++- src/txn/txn_ckpt.c | 1 - test/csuite/Makefile.am | 7 +- test/csuite/scope/main.c | 288 ++++++++++++++++++++++++++++++++++++++++ test/suite/test_cursor10.py | 4 +- test/suite/test_cursor11.py | 159 ++++++++++++++++++++++ 26 files changed, 960 insertions(+), 494 deletions(-) delete mode 100644 
examples/c/ex_scope.c create mode 100644 test/csuite/scope/main.c create mode 100644 test/suite/test_cursor11.py diff --git a/.gitignore b/.gitignore index c7b3ade9e87..4611f2aa98c 100644 --- a/.gitignore +++ b/.gitignore @@ -90,24 +90,28 @@ _wiredtiger.pyd **/examples/c/ex_pack **/examples/c/ex_process **/examples/c/ex_schema -**/examples/c/ex_scope **/examples/c/ex_stat **/examples/c/ex_sync **/examples/c/ex_thread **/test/bloom/t **/test/checkpoint/t -**/test/csuite/test_wt1965_col_efficiency -**/test/csuite/test_wt2246_col_append -**/test/csuite/test_wt2323_join_visibility -**/test/csuite/test_wt2403_lsm_workload -**/test/csuite/test_wt2447_join_main_table -**/test/csuite/test_wt2535_insert_race -**/test/csuite/test_wt2592_join_schema -**/test/csuite/test_wt2695_checksum -**/test/csuite/test_wt2719_reconfig -**/test/csuite/test_wt2834_join_bloom_fix -**/test/csuite/test_wt2853_perf -**/test/csuite/test_wt2999_join_extractor +**/test_scope +**/test_wt1965_col_efficiency +**/test_wt2246_col_append +**/test_wt2323_join_visibility +**/test_wt2403_lsm_workload +**/test_wt2447_join_main_table +**/test_wt2535_insert_race +**/test_wt2592_join_schema +**/test_wt2695_checksum +**/test_wt2719_reconfig +**/test_wt2834_join_bloom_fix +**/test_wt2853_perf +**/test_wt2909_checkpoint_integrity +**/test_wt2999_join_extractor +**/test_wt3120_filesys +**/test_wt3135_search_near_collator +**/test_wt3184_dup_index_collator **/test/cursor_order/cursor_order **/test/fops/t **/test/format/s_dumpcmp diff --git a/SConstruct b/SConstruct index e9e72630b11..b397f662be7 100644 --- a/SConstruct +++ b/SConstruct @@ -376,7 +376,6 @@ examples = [ "ex_pack", "ex_process", "ex_schema", - "ex_scope", "ex_stat", "ex_thread", ] diff --git a/dist/s_string.ok b/dist/s_string.ok index e033f77327f..cdfa4aec968 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -63,6 +63,7 @@ CPUs CRC CSV CStream +CURFILE CURSORs CURSTD CallsCustDate diff --git a/dist/s_void b/dist/s_void index 
90425d5a718..249f043d029 100755 --- a/dist/s_void +++ b/dist/s_void @@ -137,7 +137,7 @@ for f in `find bench ext src test -name '*.[ci]'`; do # form of return assignment or call. file_parse $f | sed -e 's/return ([^)]*); }$//' \ - -e '/[A-Z]*_API_CALL[A-Z_]*(/d' \ + -e '/[_A-Z]*_API_CALL[_A-Z]*(/d' \ -e '/WT_CURSOR_NEEDKEY(/d' \ -e '/WT_CURSOR_NEEDVALUE(/d' \ -e '/WT_ERR[A-Z_]*(/d' \ @@ -166,7 +166,7 @@ for f in `find bench ext src test -name '*.[ci]'`; do file_parse $f | grep 'WT_DECL_RET' | sed -e '/ret =/d' \ - -e '/API_END_RET/d' \ + -e '/[_A-Z]*_API_CALL[_A-Z]*(/d' \ -e '/WT_CURSOR_NEEDKEY/d' \ -e '/WT_CURSOR_NEEDVALUE/d' \ -e '/WT_ERR/d' \ diff --git a/examples/c/Makefile.am b/examples/c/Makefile.am index d5305eec5c8..20936661e06 100644 --- a/examples/c/Makefile.am +++ b/examples/c/Makefile.am @@ -20,7 +20,6 @@ noinst_PROGRAMS = \ ex_pack \ ex_process \ ex_schema \ - ex_scope \ ex_stat \ ex_sync \ ex_thread diff --git a/examples/c/ex_scope.c b/examples/c/ex_scope.c deleted file mode 100644 index 795ad85d57b..00000000000 --- a/examples/c/ex_scope.c +++ /dev/null @@ -1,217 +0,0 @@ -/*- - * Public Domain 2014-2016 MongoDB, Inc. - * Public Domain 2008-2014 WiredTiger, Inc. - * - * This is free and unencumbered software released into the public domain. - * - * Anyone is free to copy, modify, publish, use, compile, sell, or - * distribute this software, either in source code form or as a compiled - * binary, for any purpose, commercial or non-commercial, and by any - * means. - * - * In jurisdictions that recognize copyright laws, the author or authors - * of this software dedicate any and all copyright interest in the - * software to the public domain. We make this dedication for the benefit - * of the public at large and to the detriment of our heirs and - * successors. We intend this dedication to be an overt act of - * relinquishment in perpetuity of all present and future rights to this - * software under copyright law. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * ex_scope.c - * demonstrates the scope of buffers holding cursor keys and values. - */ -#include -#include -#include -#include - -#include - -#ifdef _WIN32 -/* snprintf is not supported on <= VS2013 */ -#define snprintf _snprintf -#endif - -static const char *home; - -static int -cursor_scope_ops(WT_CURSOR *cursor) -{ - struct { - const char *op; - const char *key; - const char *value; - int (*apply)(WT_CURSOR *); - } *op, ops[] = { - { "insert", "key1", "value1", cursor->insert, }, - { "update", "key1", "value2", cursor->update, }, - { "search", "key1", "value2", cursor->search, }, - { "remove", "key1", "value2", cursor->remove, }, - { NULL, NULL, NULL, NULL } - }; - WT_SESSION *session; - const char *key, *value; - char keybuf[10], valuebuf[10]; - int ret; - - session = cursor->session; - - for (op = ops; op->key != NULL; op++) { - key = value = NULL; - - /*! [cursor scope operation] */ - (void)snprintf(keybuf, sizeof(keybuf), "%s", op->key); - cursor->set_key(cursor, keybuf); - (void)snprintf(valuebuf, sizeof(valuebuf), "%s", op->value); - cursor->set_value(cursor, valuebuf); - - /* - * The application must keep key and value memory valid until - * the next operation that positions the cursor, modifies the - * data, or resets or closes the cursor. - * - * Modifying either the key or value buffers is not permitted. - */ - - /* Apply the operation (insert, update, search or remove). 
*/ - if ((ret = op->apply(cursor)) != 0) { - fprintf(stderr, - "%s: error performing the operation: %s\n", - op->op, session->strerror(session, ret)); - return (ret); - } - - /* - * The cursor no longer references application memory, so - * application buffers can be safely overwritten. - */ - strcpy(keybuf, "no key"); - strcpy(valuebuf, "no value"); - - /* - * Check that get_key/value behave as expected after the - * operation. - */ - if (op->apply == cursor->insert) { - /* - * WT_CURSOR::insert no longer references application - * memory, but as it does not position the cursor, it - * doesn't reference memory owned by the cursor, either. - */ - printf("ex_scope: " - "expect two WiredTiger error messages:\n"); - if ((ret = cursor->get_key(cursor, &key)) == 0 || - (ret = cursor->get_value(cursor, &value)) == 0) { - fprintf(stderr, - "%s: error in get_key/value: %s\n", - op->op, session->strerror(session, ret)); - return (ret); - } - continue; - } - if (op->apply == cursor->remove) { - /* - * WT_CURSOR::remove no longer references application - * memory; as it does not position the cursor, it will - * reference key memory owned by the cursor, but has no - * value. - */ - printf("ex_scope: " - "expect one WiredTiger error message:\n"); - if ((ret = cursor->get_key(cursor, &key)) != 0 || - (ret = cursor->get_value(cursor, &value)) == 0) { - fprintf(stderr, - "%s: error in get_key/value: %s\n", - op->op, session->strerror(session, ret)); - return (ret); - } - } else /* search, update */{ - /* - * WT_CURSOR::search and WT_CURSOR::update no longer - * reference application memory; as they position the - * cursor, they will reference key/value memory owned - * by the cursor. 
- */ - if ((ret = cursor->get_key(cursor, &key)) != 0 || - (ret = cursor->get_value(cursor, &value)) != 0) { - fprintf(stderr, - "%s: error in get_key/value: %s\n", - op->op, session->strerror(session, ret)); - return (ret); - } - } - - /* - * Modifying the memory referenced by either key or value is - * not permitted. - * - * Check that the cursor's key and value are what we expect. - */ - if (key == keybuf || - (op->apply != cursor->remove && value == valuebuf)) { - fprintf(stderr, - "%s: cursor points at application memory!\n", - op->op); - return (EINVAL); - } - - if (strcmp(key, op->key) != 0 || - (op->apply != cursor->remove && - strcmp(value, op->value) != 0)) { - fprintf(stderr, - "%s: unexpected key / value!\n", op->op); - return (EINVAL); - } - /*! [cursor scope operation] */ - } - - return (0); -} - -int -main(void) -{ - WT_CONNECTION *conn; - WT_CURSOR *cursor; - WT_SESSION *session; - int ret; - - /* - * Create a clean test directory for this run of the test program if the - * environment variable isn't already set (as is done by make check). - */ - if (getenv("WIREDTIGER_HOME") == NULL) { - home = "WT_HOME"; - ret = system("rm -rf WT_HOME && mkdir WT_HOME"); - } else - home = NULL; - - /* Open a connection, create a simple table, open a cursor. */ - if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0 || - (ret = conn->open_session(conn, NULL, NULL, &session)) != 0) { - fprintf(stderr, "Error connecting to %s: %s\n", - home == NULL ? "." : home, wiredtiger_strerror(ret)); - return (EXIT_FAILURE); - } - - ret = session->create(session, - "table:scope", "key_format=S,value_format=S,columns=(k,v)"); - - ret = session->open_cursor(session, - "table:scope", NULL, NULL, &cursor); - - ret = cursor_scope_ops(cursor); - - /* Close the connection and clean up. */ - ret = conn->close(conn, NULL); - - return (ret == 0 ? 
EXIT_SUCCESS : EXIT_FAILURE); -} diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 4634059589b..3ae6e022906 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -8,6 +8,70 @@ #include "wt_internal.h" +/* + * WT_CURFILE_OP_XXX + * If we're going to return an error, we need to restore the cursor to + * a valid state, the upper-level cursor code is likely to retry. The macros + * here are called to save and restore that state. + */ +#define WT_CURFILE_OP_DECL \ + WT_ITEM __key_copy; \ + WT_ITEM __value_copy; \ + uint64_t __recno; \ + uint32_t __flags +#define WT_CURFILE_OP_PUSH do { \ + WT_ITEM_SET(__key_copy, cursor->key); \ + WT_ITEM_SET(__value_copy, cursor->value); \ + __recno = cursor->recno; \ + __flags = cursor->flags; \ +} while (0) +#define WT_CURFILE_OP_POP do { \ + cursor->recno = __recno; \ + if (FLD_ISSET(__flags, WT_CURSTD_KEY_EXT)) \ + WT_ITEM_SET(cursor->key, __key_copy); \ + if (FLD_ISSET(__flags, WT_CURSTD_VALUE_EXT)) \ + WT_ITEM_SET(cursor->value, __value_copy); \ + F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \ + F_SET(cursor, \ + FLD_MASK(__flags, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT));\ +} while (0) + +/* + * __cursor_page_pinned -- + * Return if we have a page pinned and it's not been flagged for forced + * eviction (the forced eviction test is so we periodically release pages + * grown too large). + */ +static inline bool +__cursor_page_pinned(WT_CURSOR_BTREE *cbt) +{ + return (F_ISSET(cbt, WT_CBT_ACTIVE) && + cbt->ref->page->read_gen != WT_READGEN_OLDEST); +} + +/* + * __cursor_copy_int_key -- + * If we're pointing into the tree, save the key into local memory. + */ +static inline int +__cursor_copy_int_key(WT_CURSOR *cursor) +{ + /* + * We're about to discard the cursor's position and the cursor layer + * might retry the operation. We discard pinned pages on error, which + * will invalidate pinned keys. 
Clear WT_CURSTD_KEY_INT in all cases, + * the underlying page is gone whether we can allocate memory or not. + */ + if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) { + F_CLR(cursor, WT_CURSTD_KEY_INT); + if (!WT_DATA_IN_ITEM(&cursor->key)) + WT_RET(__wt_buf_set((WT_SESSION_IMPL *)cursor->session, + &cursor->key, cursor->key.data, cursor->key.size)); + F_SET(cursor, WT_CURSTD_KEY_EXT); + } + return (0); +} + /* * __cursor_size_chk -- * Return if an inserted item is too large. @@ -343,8 +407,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) * from the root. */ valid = false; - if (F_ISSET(cbt, WT_CBT_ACTIVE) && - cbt->ref->page->read_gen != WT_READGEN_OLDEST) { + if (__cursor_page_pinned(cbt)) { __wt_txn_cursor_op(session); WT_ERR(btree->type == BTREE_ROW ? @@ -422,9 +485,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * existing record. */ valid = false; - if (btree->type == BTREE_ROW && - F_ISSET(cbt, WT_CBT_ACTIVE) && - cbt->ref->page->read_gen != WT_READGEN_OLDEST) { + if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt)) { __wt_txn_cursor_op(session); WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true)); @@ -531,9 +592,18 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) retry: WT_RET(__cursor_func_init(cbt, true)); - switch (btree->type) { - case BTREE_COL_FIX: - case BTREE_COL_VAR: + if (btree->type == BTREE_ROW) { + WT_ERR(__cursor_row_search(session, cbt, NULL, true)); + /* + * If not overwriting, fail if the key exists, else insert the + * key/value pair. + */ + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && + cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) + WT_ERR(WT_DUPLICATE_KEY); + + ret = __cursor_row_modify(session, cbt, false); + } else { /* * If WT_CURSTD_APPEND is set, insert a new record (ignoring * the application's record number). 
The real record number @@ -558,19 +628,6 @@ retry: WT_RET(__cursor_func_init(cbt, true)); WT_ERR(__cursor_col_modify(session, cbt, false)); if (F_ISSET(cursor, WT_CURSTD_APPEND)) cbt->iface.recno = cbt->recno; - break; - case BTREE_ROW: - WT_ERR(__cursor_row_search(session, cbt, NULL, true)); - /* - * If not overwriting, fail if the key exists, else insert the - * key/value pair. - */ - if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) - WT_ERR(WT_DUPLICATE_KEY); - - ret = __cursor_row_modify(session, cbt, false); - break; } err: if (ret == WT_RESTART) { @@ -578,11 +635,9 @@ err: if (ret == WT_RESTART) { WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } + /* Insert doesn't maintain a position across calls, clear resources. */ - if (ret == 0) - WT_TRET(__curfile_leave(cbt)); - if (ret != 0) - WT_TRET(__cursor_reset(cbt)); + WT_TRET(__cursor_reset(cbt)); return (ret); } @@ -640,29 +695,24 @@ __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) retry: WT_RET(__cursor_func_init(cbt, true)); - switch (btree->type) { - case BTREE_ROW: + if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(session, cbt, NULL, true)); /* * Just check for conflicts. */ ret = __curfile_update_check(cbt); - break; - case BTREE_COL_FIX: - case BTREE_COL_VAR: + } else WT_ERR(__wt_illegal_value(session, NULL)); - break; - } err: if (ret == WT_RESTART) { WT_STAT_CONN_INCR(session, cursor_restart); WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } - WT_TRET(__curfile_leave(cbt)); - if (ret != 0) - WT_TRET(__cursor_reset(cbt)); + + /* Insert doesn't maintain a position across calls, clear resources. 
*/ + WT_TRET(__cursor_reset(cbt)); return (ret); } @@ -674,23 +724,83 @@ int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; + WT_CURFILE_OP_DECL; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; + bool positioned; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; + WT_CURFILE_OP_PUSH; + WT_STAT_CONN_INCR(session, cursor_remove); WT_STAT_DATA_INCR(session, cursor_remove); WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size); -retry: WT_RET(__cursor_func_init(cbt, true)); + /* + * WT_CURSOR.remove has a unique semantic, the cursor stays positioned + * if it starts positioned, otherwise clear the cursor on completion. + */ + positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT); - switch (btree->type) { - case BTREE_COL_FIX: - case BTREE_COL_VAR: +retry: + /* + * If removing with overwrite configured, and positioned to an on-page + * key, the update doesn't require another search. The cursor won't be + * positioned on a page with an external key set, but be sure. + */ + if (__cursor_page_pinned(cbt) && + F_ISSET_ALL(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_OVERWRITE)) { + WT_ERR(__wt_txn_autocommit_check(session)); + + /* + * The cursor position may not be exact (the cursor's comparison + * value not equal to zero). Correct to an exact match so we can + * remove whatever we're pointing at. + */ + cbt->compare = 0; + ret = btree->type == BTREE_ROW ? + __cursor_row_modify(session, cbt, true) : + __cursor_col_modify(session, cbt, true); + + /* + * The pinned page goes away if we fail for any reason, make + * sure there's a local copy of any key. (Restart could still + * use the pinned page, but that's an unlikely path.) Re-save + * the cursor state: we may retry but eventually fail. + */ + if (ret != 0) { + WT_TRET(__cursor_copy_int_key(cursor)); + WT_CURFILE_OP_PUSH; + goto err; + } + goto done; + } + + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key. 
Re-save the cursor state: we may retry but + * eventually fail. + */ + WT_ERR(__cursor_copy_int_key(cursor)); + WT_CURFILE_OP_PUSH; + + WT_ERR(__cursor_func_init(cbt, true)); + + if (btree->type == BTREE_ROW) { + WT_ERR(__cursor_row_search(session, cbt, NULL, false)); + + /* Check whether an update would conflict. */ + WT_ERR(__curfile_update_check(cbt)); + + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) + WT_ERR(WT_NOTFOUND); + + ret = __cursor_row_modify(session, cbt, true); + } else { WT_ERR(__cursor_col_search(session, cbt, NULL)); /* @@ -717,19 +827,6 @@ retry: WT_RET(__cursor_func_init(cbt, true)); cbt->recno = cursor->recno; } else ret = __cursor_col_modify(session, cbt, true); - break; - case BTREE_ROW: - /* Remove the record if it exists. */ - WT_ERR(__cursor_row_search(session, cbt, NULL, false)); - - /* Check whether an update would conflict. */ - WT_ERR(__curfile_update_check(cbt)); - - if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) - WT_ERR(WT_NOTFOUND); - - ret = __cursor_row_modify(session, cbt, true); - break; } err: if (ret == WT_RESTART) { @@ -737,15 +834,29 @@ err: if (ret == WT_RESTART) { WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } + /* - * If the cursor is configured to overwrite and the record is not - * found, that is exactly what we want. + * If the cursor is configured to overwrite and the record is not found, + * that is exactly what we want, return success. */ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND) ret = 0; - if (ret != 0) +done: /* + * If the cursor was positioned, it stays positioned, point the cursor + * at an internal copy of the key. Otherwise, there's no position or + * key/value. 
+ */ + if (ret == 0) + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if (ret == 0 && positioned) { + WT_TRET(__wt_key_return(session, cbt)); + if (ret == 0) + F_SET(cursor, WT_CURSTD_KEY_INT); + } else WT_TRET(__cursor_reset(cbt)); + if (ret != 0) + WT_CURFILE_OP_POP; return (ret); } @@ -779,9 +890,19 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) retry: WT_RET(__cursor_func_init(cbt, true)); - switch (btree->type) { - case BTREE_COL_FIX: - case BTREE_COL_VAR: + if (btree->type == BTREE_ROW) { + WT_ERR(__cursor_row_search(session, cbt, NULL, true)); + /* + * If not overwriting, check for conflicts and fail if the key + * does not exist. + */ + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { + WT_ERR(__curfile_update_check(cbt)); + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) + WT_ERR(WT_NOTFOUND); + } + ret = __cursor_row_modify(session, cbt, false); + } else { WT_ERR(__cursor_col_search(session, cbt, NULL)); /* @@ -800,20 +921,6 @@ retry: WT_RET(__cursor_func_init(cbt, true)); WT_ERR(WT_NOTFOUND); } ret = __cursor_col_modify(session, cbt, false); - break; - case BTREE_ROW: - WT_ERR(__cursor_row_search(session, cbt, NULL, true)); - /* - * If not overwriting, check for conflicts and fail if the key - * does not exist. - */ - if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { - WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) - WT_ERR(WT_NOTFOUND); - } - ret = __cursor_row_modify(session, cbt, false); - break; } err: if (ret == WT_RESTART) { @@ -963,9 +1070,12 @@ __cursor_truncate(WT_SESSION_IMPL *session, WT_DECL_RET; /* - * First, call the standard cursor remove method to do a full search and - * re-position the cursor because we don't have a saved copy of the - * page's write generation information, which we need to remove records. 
+ * First, call the cursor search method to re-position the cursor: we + * may not have a cursor position (if the higher-level truncate code + * switched the cursors to have an "external" cursor key), and because + * we don't save a copy of the page's write generation information, + * which we need to remove records. + * * Once that's done, we can delete records without a full search, unless * we encounter a restart error because the page was modified by some * other thread of control; in that case, repeat the full search to @@ -978,20 +1088,31 @@ __cursor_truncate(WT_SESSION_IMPL *session, * instantiated the end cursor, so we know that page is pinned in memory * and we can proceed without concern. */ -retry: WT_RET(__wt_btcur_remove(start)); +retry: WT_RET(__wt_btcur_search(start)); + + /* + * XXX KEITH + * When the btree cursor code sets/clears the cursor flags (rather than + * the cursor layer), the set/clear goes away, only the assert remains. + */ + F_CLR((WT_CURSOR *)start, WT_CURSTD_KEY_SET); + F_SET((WT_CURSOR *)start, WT_CURSTD_KEY_INT); + WT_ASSERT(session, + F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); /* * Reset ret each time through so that we don't loop forever in * the cursor equals case. */ for (ret = 0;;) { + if ((ret = rmfunc(session, start, 1)) != 0) + break; + if (stop != NULL && __cursor_equals(start, stop)) break; if ((ret = __wt_btcur_next(start, true)) != 0) break; - start->compare = 0; /* Exact match */ - if ((ret = rmfunc(session, start, 1)) != 0) - break; + start->compare = 0; /* Exact match */ } if (ret == WT_RESTART) { @@ -1024,29 +1145,44 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session, * record 37, records 1-36 magically appear. Those records can't be * deleted, which means we have to ignore already "deleted" records.
* - * First, call the standard cursor remove method to do a full search and - * re-position the cursor because we don't have a saved copy of the - * page's write generation information, which we need to remove records. + * First, call the cursor search method to re-position the cursor: we + * may not have a cursor position (if the higher-level truncate code + * switched the cursors to have an "external" cursor key), and because + * we don't save a copy of the page's write generation information, + * which we need to remove records. + * * Once that's done, we can delete records without a full search, unless * we encounter a restart error because the page was modified by some * other thread of control; in that case, repeat the full search to * refresh the page's modification information. */ -retry: WT_RET(__wt_btcur_remove(start)); +retry: WT_RET(__wt_btcur_search(start)); + + /* + * XXX KEITH + * When the btree cursor code sets/clears the cursor flags (rather than + * the cursor layer), the set/clear goes away, only the assert remains. + */ + F_CLR((WT_CURSOR *)start, WT_CURSTD_KEY_SET); + F_SET((WT_CURSOR *)start, WT_CURSTD_KEY_INT); + WT_ASSERT(session, + F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); + /* * Reset ret each time through so that we don't loop forever in * the cursor equals case. */ for (ret = 0;;) { + value = (const uint8_t *)start->iface.value.data; + if (*value != 0 && + (ret = rmfunc(session, start, 1)) != 0) + break; + if (stop != NULL && __cursor_equals(start, stop)) break; if ((ret = __wt_btcur_next(start, true)) != 0) break; start->compare = 0; /* Exact match */ - value = (const uint8_t *)start->iface.value.data; - if (*value != 0 && - (ret = rmfunc(session, start, 1)) != 0) - break; } if (ret == WT_RESTART) { @@ -1166,7 +1302,7 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel) * Skip the usual cursor tear-down in that case.
*/ if (!lowlevel) - ret = __curfile_leave(cbt); + ret = __cursor_reset(cbt); __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index 6409a1a180c..9fc457e2297 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -9,64 +9,21 @@ #include "wt_internal.h" /* - * __wt_kv_return -- - * Return a page referenced key/value pair to the application. + * __key_return -- + * Change the cursor to reference an internal return key. */ -int -__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +static inline int +__key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { - WT_BTREE *btree; - WT_CELL *cell; - WT_CELL_UNPACK unpack; WT_CURSOR *cursor; WT_ITEM *tmp; WT_PAGE *page; WT_ROW *rip; - uint8_t v; - - btree = S2BT(session); page = cbt->ref->page; cursor = &cbt->iface; - switch (page->type) { - case WT_PAGE_COL_FIX: - /* - * The interface cursor's record has usually been set, but that - * isn't universally true, specifically, cursor.search_near may - * call here without first setting the interface cursor. - */ - cursor->recno = cbt->recno; - - /* If the cursor references a WT_UPDATE item, return it. */ - if (upd != NULL) { - cursor->value.data = WT_UPDATE_DATA(upd); - cursor->value.size = upd->size; - return (0); - } - - /* Take the value from the original page. */ - v = __bit_getv_recno(cbt->ref, cursor->recno, btree->bitcnt); - return (__wt_buf_set(session, &cursor->value, &v, 1)); - case WT_PAGE_COL_VAR: - /* - * The interface cursor's record has usually been set, but that - * isn't universally true, specifically, cursor.search_near may - * call here without first setting the interface cursor. - */ - cursor->recno = cbt->recno; - - /* If the cursor references a WT_UPDATE item, return it. */ - if (upd != NULL) { - cursor->value.data = WT_UPDATE_DATA(upd); - cursor->value.size = upd->size; - return (0); - } - - /* Take the value from the original page cell. 
*/ - cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]); - break; - case WT_PAGE_ROW_LEAF: + if (page->type == WT_PAGE_ROW_LEAF) { rip = &page->pg_row[cbt->slot]; /* @@ -79,7 +36,10 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) if (cbt->ins != NULL) { cursor->key.data = WT_INSERT_KEY(cbt->ins); cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins); - } else if (cbt->compare == 0) { + return (0); + } + + if (cbt->compare == 0) { /* * If not in an insert list and there's an exact match, * the row-store search function built the key we want @@ -97,16 +57,51 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) cursor->key.data = cbt->row_key->data; cursor->key.size = cbt->row_key->size; - } else - WT_RET(__wt_row_leaf_key( - session, page, rip, &cursor->key, false)); - - /* If the cursor references a WT_UPDATE item, return it. */ - if (upd != NULL) { - cursor->value.data = WT_UPDATE_DATA(upd); - cursor->value.size = upd->size; return (0); } + return (__wt_row_leaf_key( + session, page, rip, &cursor->key, false)); + } + + /* + * WT_PAGE_COL_FIX, WT_PAGE_COL_VAR: + * The interface cursor's record has usually been set, but that + * isn't universally true, specifically, cursor.search_near may call + * here without first setting the interface cursor. + */ + cursor->recno = cbt->recno; + return (0); +} + +/* + * __value_return -- + * Change the cursor to reference an internal return value. + */ +static inline int +__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK unpack; + WT_CURSOR *cursor; + WT_PAGE *page; + WT_ROW *rip; + uint8_t v; + + btree = S2BT(session); + + page = cbt->ref->page; + cursor = &cbt->iface; + + /* If the cursor references a WT_UPDATE item, return it. 
*/ + if (upd != NULL) { + cursor->value.data = WT_UPDATE_DATA(upd); + cursor->value.size = upd->size; + return (0); + } + + if (page->type == WT_PAGE_ROW_LEAF) { + rip = &page->pg_row[cbt->slot]; /* Simple values have their location encoded in the WT_ROW. */ if (__wt_row_leaf_value(page, rip, &cursor->value)) @@ -121,13 +116,46 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) cursor->value.size = 0; return (0); } - break; - WT_ILLEGAL_VALUE(session); + __wt_cell_unpack(cell, &unpack); + return (__wt_page_cell_data_ref( + session, page, &unpack, &cursor->value)); + + } + + if (page->type == WT_PAGE_COL_VAR) { + /* Take the value from the original page cell. */ + cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]); + __wt_cell_unpack(cell, &unpack); + return (__wt_page_cell_data_ref( + session, page, &unpack, &cursor->value)); } - /* The value is an on-page cell, unpack and expand it as necessary. */ - __wt_cell_unpack(cell, &unpack); - WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value)); + /* WT_PAGE_COL_FIX: Take the value from the original page. */ + v = __bit_getv_recno(cbt->ref, cursor->recno, btree->bitcnt); + return (__wt_buf_set(session, &cursor->value, &v, 1)); +} + +/* + * __wt_key_return -- + * Change the cursor to reference an internal return key. + */ +int +__wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_RET(__key_return(session, cbt)); + + return (0); +} + +/* + * __wt_kv_return -- + * Return a page referenced key/value pair to the application. 
+ */ +int +__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + WT_RET(__wt_key_return(session, cbt)); + WT_RET(__value_return(session, cbt, upd)); return (0); } diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index 0ec917fbf95..274dc1e8f62 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -325,24 +325,21 @@ __curfile_remove(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_REMOVE_API_CALL(cursor, session, cbt->btree); - WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_NOVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_remove(cbt), ret); + WT_ERR(__wt_btcur_remove(cbt)); /* - * After a successful remove, copy the key: the value is not available. + * Remove with a search-key is fire-and-forget, no position and no key. + * Remove starting from a position maintains the position and a key. + * We don't know which it was at this layer, so can only assert the key + * is not set at all, or internal. There's never a value. 
*/ - if (ret == 0) { - if (F_ISSET(cursor, WT_CURSTD_KEY_INT) && - !WT_DATA_IN_ITEM(&(cursor)->key)) { - WT_ERR(__wt_buf_set(session, &cursor->key, - cursor->key.data, cursor->key.size)); - F_CLR(cursor, WT_CURSTD_KEY_INT); - F_SET(cursor, WT_CURSTD_KEY_EXT); - } - F_CLR(cursor, WT_CURSTD_VALUE_SET); - } + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == 0 || + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); + WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_VALUE_SET) == 0); err: CURSOR_UPDATE_API_END(session, ret); return (ret); diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 7e8cd153d2d..98dbbec8981 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -511,9 +511,16 @@ __curtable_insert(WT_CURSOR *cursor) */ F_SET(primary, flag_orig | WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); - if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) + if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curtable_update(cursor)); - else { + + /* + * The cursor is no longer positioned. This isn't just cosmetic, + * without a reset, iteration on this cursor won't start at the + * beginning/end of the table. + */ + APPLY_CG(ctable, reset); + } else { WT_ERR(ret); for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) { @@ -601,14 +608,20 @@ err: CURSOR_UPDATE_API_END(session, ret); static int __curtable_remove(WT_CURSOR *cursor) { + WT_CURSOR *primary; WT_CURSOR_TABLE *ctable; WT_DECL_RET; WT_SESSION_IMPL *session; + bool positioned; ctable = (WT_CURSOR_TABLE *)cursor; JOINABLE_CURSOR_REMOVE_API_CALL(cursor, session, NULL); WT_ERR(__curtable_open_indices(ctable)); + /* Check if the cursor was positioned. 
*/ + primary = *ctable->cg_cursors; + positioned = F_ISSET(primary, WT_CURSTD_KEY_INT); + /* Find the old record so it can be removed from indices */ if (ctable->table->nindices > 0) { APPLY_CG(ctable, search); @@ -617,6 +630,19 @@ __curtable_remove(WT_CURSOR *cursor) } APPLY_CG(ctable, remove); + WT_ERR(ret); + + /* + * If the cursor was positioned, it stays positioned with a key but no + * no value, otherwise, there's no position, key or value. This isn't + * just cosmetic, without a reset, iteration on this cursor won't start + * at the beginning/end of the table. + */ + F_CLR(primary, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if (positioned) + F_SET(primary, WT_CURSTD_KEY_INT); + else + APPLY_CG(ctable, reset); err: CURSOR_UPDATE_API_END(session, ret); return (ret); diff --git a/src/docs/cursor-ops.dox b/src/docs/cursor-ops.dox index b743d81db57..e479ff29191 100644 --- a/src/docs/cursor-ops.dox +++ b/src/docs/cursor-ops.dox @@ -145,9 +145,5 @@ that may not be modified or freed by the application. If a longer scope is required, the application must make a copy of the memory before the cursor is re-used, closed or reset. -The comments in this example code explain when the application can safely -modify memory passed to WT_CURSOR::set_key or WT_CURSOR::set_value: - -@snippet ex_scope.c cursor scope operation @m_endif */ diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 2894db0c126..e5fce3d0d5d 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -2,28 +2,45 @@ @section version_292 Upgrading to Version 2.9.2
-
WiredTiger Utility now supports truncate
+ +
WiredTiger utility now supports truncate
-The WiredTiger Utility can now \c truncate an object. Removing all contents -from the specified object. +The WiredTiger utility \c wt can now \c truncate objects, removing all +contents from the specified object.
+
Handle list lock statistics
In the 2.9.1 release we added statistics tracking handle list lock timing, we have switched that lock from a spin lock to a read-write lock, and consequently changed the statistics tracking lock related wait time.
+
Forced and named checkpoint error conditions changed
There are new cases where checkpoints created with an explicit name or the "force" configuration option can return an EBUSY error. This can happen if the checkpoint overlaps with other schema operations, for example table create.
-
-@section version_291 Upgrading to Version 2.9.1 +
WT_CURSOR::remove may not return a positioned cursor
+
+The WT_CURSOR::remove method was previously documented to always return a +positioned cursor on success, which is not possible when \c overwrite=true +and the record does not exist. + +The documentation has been updated, and the method has been changed to +never return a cursor position unless called with an existing cursor +position. In other words, if the cursor is positioned and the +WT_CURSOR::remove method is called, the cursor will remain positioned; if the +cursor is not positioned and the WT_CURSOR::remove method is called, the +cursor will not be positioned on return. +
+
+@section version_291 Upgrading to Version 2.9.1
+
Changes to hazard pointer configuration
The \c hazard_max parameter to ::wiredtiger_open is now ignored. Memory is @@ -39,10 +56,11 @@ have added a new \c access_pattern_hint configuration option available for WT_SESSION::create that can be used to restore the old default by setting the value to "random".
-
+
@section version_290 Upgrading to Version 2.9.0
+
Changes to cursor behavior after WT_CURSOR::insert
After a successful call to WT_CURSOR::insert, unless a cursor has record diff --git a/src/include/api.h b/src/include/api.h index 2783d17f825..1fa777ed5cc 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -7,22 +7,21 @@ */ /* Standard entry points to the API: declares/initializes local variables. */ -#define API_SESSION_INIT(s, h, n, cur, dh) \ +#define API_SESSION_INIT(s, h, n, dh) \ WT_DATA_HANDLE *__olddh = (s)->dhandle; \ const char *__oldname = (s)->name; \ - (s)->cursor = (cur); \ (s)->dhandle = (dh); \ (s)->name = (s)->lastop = #h "." #n; \ -#define API_CALL_NOCONF(s, h, n, cur, dh) do { \ - API_SESSION_INIT(s, h, n, cur, dh); \ +#define API_CALL_NOCONF(s, h, n, dh) do { \ + API_SESSION_INIT(s, h, n, dh); \ WT_ERR(WT_SESSION_CHECK_PANIC(s)); \ __wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n) -#define API_CALL(s, h, n, cur, dh, config, cfg) do { \ +#define API_CALL(s, h, n, dh, config, cfg) do { \ const char *cfg[] = \ { WT_CONFIG_BASE(s, h##_##n), config, NULL }; \ - API_SESSION_INIT(s, h, n, cur, dh); \ + API_SESSION_INIT(s, h, n, dh); \ WT_ERR(WT_SESSION_CHECK_PANIC(s)); \ if ((config) != NULL) \ WT_ERR(__wt_config_check((s), \ @@ -42,17 +41,17 @@ } while (0) /* An API call wrapped in a transaction if necessary. */ -#define TXN_API_CALL(s, h, n, cur, bt, config, cfg) do { \ +#define TXN_API_CALL(s, h, n, bt, config, cfg) do { \ bool __autotxn = false; \ - API_CALL(s, h, n, bt, cur, config, cfg); \ + API_CALL(s, h, n, bt, config, cfg); \ __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING);\ if (__autotxn) \ F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT) /* An API call wrapped in a transaction if necessary. 
*/ -#define TXN_API_CALL_NOCONF(s, h, n, cur, bt) do { \ +#define TXN_API_CALL_NOCONF(s, h, n, bt) do { \ bool __autotxn = false; \ - API_CALL_NOCONF(s, h, n, cur, bt); \ + API_CALL_NOCONF(s, h, n, bt); \ __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING);\ if (__autotxn) \ F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT) @@ -98,24 +97,24 @@ #define CONNECTION_API_CALL(conn, s, n, config, cfg) \ s = (conn)->default_session; \ - API_CALL(s, WT_CONNECTION, n, NULL, NULL, config, cfg) + API_CALL(s, WT_CONNECTION, n, NULL, config, cfg) #define CONNECTION_API_CALL_NOCONF(conn, s, n) \ s = (conn)->default_session; \ - API_CALL_NOCONF(s, WT_CONNECTION, n, NULL, NULL) + API_CALL_NOCONF(s, WT_CONNECTION, n, NULL) #define SESSION_API_CALL(s, n, config, cfg) \ - API_CALL(s, WT_SESSION, n, NULL, NULL, config, cfg) + API_CALL(s, WT_SESSION, n, NULL, config, cfg) #define SESSION_API_CALL_NOCONF(s, n) \ - API_CALL_NOCONF(s, WT_SESSION, n, NULL, NULL) + API_CALL_NOCONF(s, WT_SESSION, n, NULL) #define SESSION_TXN_API_CALL(s, n, config, cfg) \ - TXN_API_CALL(s, WT_SESSION, n, NULL, NULL, config, cfg) + TXN_API_CALL(s, WT_SESSION, n, NULL, config, cfg) #define CURSOR_API_CALL(cur, s, n, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ - API_CALL_NOCONF(s, WT_CURSOR, n, cur, \ + API_CALL_NOCONF(s, WT_CURSOR, n, \ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle) #define JOINABLE_CURSOR_CALL_CHECK(cur) \ @@ -128,7 +127,7 @@ #define CURSOR_REMOVE_API_CALL(cur, s, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ - TXN_API_CALL_NOCONF(s, WT_CURSOR, remove, cur, \ + TXN_API_CALL_NOCONF(s, WT_CURSOR, remove, \ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); #define JOINABLE_CURSOR_REMOVE_API_CALL(cur, s, bt) \ @@ -137,7 +136,7 @@ #define CURSOR_UPDATE_API_CALL(cur, s, n, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ - TXN_API_CALL_NOCONF(s, WT_CURSOR, n, cur, \ + TXN_API_CALL_NOCONF(s, WT_CURSOR, n, \ ((bt) == NULL) ? 
NULL : ((WT_BTREE *)(bt))->dhandle); \ if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && \ !F_ISSET((WT_BTREE *)(bt), WT_BTREE_IGNORE_CACHE) && \ @@ -153,4 +152,4 @@ #define ASYNCOP_API_CALL(conn, s, n) \ s = (conn)->default_session; \ - API_CALL_NOCONF(s, asyncop, n, NULL, NULL) + API_CALL_NOCONF(s, asyncop, n, NULL) diff --git a/src/include/buf.i b/src/include/buf.i index ebbee6b4633..d192e292dcf 100644 --- a/src/include/buf.i +++ b/src/include/buf.i @@ -37,28 +37,30 @@ __wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) /* * __wt_buf_init -- - * Initialize a buffer at a specific size. + * Create an empty buffer at a specific size. */ static inline int __wt_buf_init(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) { + /* + * The buffer grow function does what we need, but anticipates data + * referenced by the buffer. Avoid any data copy by setting data to + * reference the buffer's allocated memory, and clearing it. + */ buf->data = buf->mem; - buf->size = 0; /* Clear existing data length */ - WT_RET(__wt_buf_grow(session, buf, size)); - - return (0); + buf->size = 0; + return (__wt_buf_grow(session, buf, size)); } /* * __wt_buf_initsize -- - * Initialize a buffer at a specific size, and set the data length. + * Create an empty buffer at a specific size, and set the data length. */ static inline int __wt_buf_initsize(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) { - buf->data = buf->mem; - buf->size = 0; /* Clear existing data length */ - WT_RET(__wt_buf_grow(session, buf, size)); + WT_RET(__wt_buf_init(session, buf, size)); + buf->size = size; /* Set the data length. */ return (0); @@ -72,14 +74,15 @@ static inline int __wt_buf_set( WT_SESSION_IMPL *session, WT_ITEM *buf, const void *data, size_t size) { - /* Ensure the buffer is large enough. */ - WT_RET(__wt_buf_initsize(session, buf, size)); - - /* Copy the data, allowing for overlapping strings. 
*/ - if (size != 0) - memmove(buf->mem, data, size); - - return (0); + /* + * The buffer grow function does what we need, but expects the data to + * be referenced by the buffer. If we're copying data from outside the + * buffer, set it up so it makes sense to the buffer grow function. (No + * test needed, this works if WT_ITEM.data is already set to "data".) + */ + buf->data = data; + buf->size = size; + return (__wt_buf_grow(session, buf, size)); } /* diff --git a/src/include/cursor.i b/src/include/cursor.i index c3fcef9a13d..9cb9f5e7189 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -93,17 +93,19 @@ __curfile_enter(WT_CURSOR_BTREE *cbt) } /* - * __curfile_leave -- - * Clear a file cursor's position. + * __cursor_reset -- + * Reset the cursor, it no longer holds any position. */ static inline int -__curfile_leave(WT_CURSOR_BTREE *cbt) +__cursor_reset(WT_CURSOR_BTREE *cbt) { WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)cbt->iface.session; + __cursor_pos_clear(cbt); + /* If the cursor was active, deactivate it. */ if (F_ISSET(cbt, WT_CBT_ACTIVE)) { if (!F_ISSET(cbt, WT_CBT_NO_TXN)) @@ -111,12 +113,15 @@ __curfile_leave(WT_CURSOR_BTREE *cbt) F_CLR(cbt, WT_CBT_ACTIVE); } + /* If we're not holding a cursor reference, we're done. */ + if (cbt->ref == NULL) + return (0); + /* * If we were scanning and saw a lot of deleted records on this page, * try to evict the page when we release it. 
*/ - if (cbt->ref != NULL && - cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD) + if (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD) __wt_page_evict_soon(session, cbt->ref); cbt->page_deleted_count = 0; @@ -247,7 +252,7 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) #ifdef HAVE_DIAGNOSTIC __wt_cursor_key_order_reset(cbt); #endif - WT_RET(__curfile_leave(cbt)); + WT_RET(__cursor_reset(cbt)); } /* @@ -271,24 +276,6 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) return (0); } -/* - * __cursor_reset -- - * Reset the cursor. - */ -static inline int -__cursor_reset(WT_CURSOR_BTREE *cbt) -{ - WT_DECL_RET; - - /* - * The cursor is leaving the API, and no longer holds any position, - * generally called to clean up the cursor after an error. - */ - ret = __curfile_leave(cbt); - __cursor_pos_clear(cbt); - return (ret); -} - /* * __cursor_row_slot_return -- * Return a row-store leaf page slot's K/V pair. diff --git a/src/include/error.h b/src/include/error.h index bbb7f989332..c338acb370f 100644 --- a/src/include/error.h +++ b/src/include/error.h @@ -67,14 +67,16 @@ int __ret; \ if ((__ret = (a)) != 0 && \ (__ret == WT_PANIC || \ - ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \ + ret == 0 || ret == WT_DUPLICATE_KEY || \ + ret == WT_NOTFOUND || ret == WT_RESTART)) \ ret = __ret; \ } while (0) #define WT_TRET_ERROR_OK(a, e) do { \ int __ret; \ if ((__ret = (a)) != 0 && __ret != (e) && \ (__ret == WT_PANIC || \ - ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \ + ret == 0 || ret == WT_DUPLICATE_KEY || \ + ret == WT_NOTFOUND || ret == WT_RESTART)) \ ret = __ret; \ } while (0) #define WT_TRET_NOTFOUND_OK(a) WT_TRET_ERROR_OK(a, WT_NOTFOUND) diff --git a/src/include/extern.h b/src/include/extern.h index db718966426..c0aa21b7f4c 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -161,6 +161,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #endif ) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern bool __wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/misc.h b/src/include/misc.h index 66d43496e93..7aba397e173 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -140,6 +140,7 @@ #define F_CLR(p, mask) FLD_CLR((p)->flags, mask) #define F_ISSET(p, mask) FLD_ISSET((p)->flags, mask) +#define F_ISSET_ALL(p, mask) (FLD_MASK((p)->flags, mask) == (mask)) #define F_MASK(p, mask) FLD_MASK((p)->flags, mask) #define F_SET(p, mask) FLD_SET((p)->flags, mask) diff --git a/src/include/session.h b/src/include/session.h index f3092dc3c6c..dec97cff5d3 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -67,7 +67,6 @@ struct __wt_session_impl { TAILQ_HEAD(__dhandles, __wt_data_handle_cache) dhandles; time_t last_sweep; /* Last sweep for dead handles */ - WT_CURSOR *cursor; /* Current cursor */ /* Cursors closed with the session */ TAILQ_HEAD(__cursors, __wt_cursor) cursors; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index c148e759299..5dd9a720e31 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -427,7 +427,7 @@ struct __wt_cursor { * * 
@param cursor the cursor handle * @errors - * In particular, if \c overwrite is not configured and a record with + * In particular, if \c overwrite=false is configured and a record with * the specified key already exists, ::WT_DUPLICATE_KEY is returned. * Also, if \c in_memory is configured for the database and the insert * requires more than the configured cache size to complete, @@ -452,7 +452,9 @@ struct __wt_cursor { * * On success, the cursor ends positioned at the modified record; to * minimize cursor resources, the WT_CURSOR::reset method should be - * called as soon as the cursor no longer needs that position. + * called as soon as the cursor no longer needs that position. (The + * WT_CURSOR::insert method never keeps a cursor position and may be + * more efficient for that reason.) * * The maximum length of a single column stored in a table is not fixed * (as it partially depends on the underlying file configuration), but @@ -460,7 +462,7 @@ struct __wt_cursor { * * @param cursor the cursor handle * @errors - * In particular, if \c overwrite is not configured and no record with + * In particular, if \c overwrite=false is configured and no record with * the specified key exists, ::WT_NOTFOUND is returned. * Also, if \c in_memory is configured for the database and the insert * requires more than the configured cache size to complete, @@ -477,8 +479,18 @@ struct __wt_cursor { * * @snippet ex_all.c Remove a record * - * If the cursor was not configured with "overwrite=true", the key must - * be set and the key's record must exist; the record will be removed. + * If the cursor was configured with "overwrite=false" (not the + * default), the key must be set and the key's record must exist; the + * record will be removed. 
+ * + * Any cursor position does not change: if the cursor was positioned + * before the WT_CURSOR::remove call, the cursor remains positioned + * at the removed record; to minimize cursor resources, the + * WT_CURSOR::reset method should be called as soon as the cursor no + * longer needs that position. If the cursor was not positioned before + * the WT_CURSOR::remove call, the cursor ends with no position, and a + * subsequent call to the WT_CURSOR::next (WT_CURSOR::prev) method will + * iterate from the beginning (end) of the table. * * @snippet ex_all.c Remove a record and fail if DNE * @@ -486,14 +498,10 @@ struct __wt_cursor { * (that is, a store with an 'r' type key and 't' type value) is * identical to setting the record's value to 0. * - * On success, the cursor ends positioned at the removed record; to - * minimize cursor resources, the WT_CURSOR::reset method should be - * called as soon as the cursor no longer needs that position. - * * @param cursor the cursor handle * @errors - * In particular, if \c overwrite is not configured and no record with - * the specified key exists, ::WT_NOTFOUND is returned. + * In particular, if \c overwrite=false is configured and no record + * with the specified key exists, ::WT_NOTFOUND is returned. */ int __F(remove)(WT_CURSOR *cursor); /*! 
@} */ diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 116740f8f0c..77fa96ebdfd 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -1565,12 +1565,23 @@ __clsm_update(WT_CURSOR *cursor) WT_CURSOR_NEEDVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, true)); - if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) || - (ret = __clsm_lookup(clsm, &value)) == 0) { - WT_ERR(__clsm_deleted_encode( - session, &cursor->value, &value, &buf)); - ret = __clsm_put(session, clsm, &cursor->key, &value, true); - } + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) + WT_ERR(__clsm_lookup(clsm, &value)); + WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf)); + WT_ERR(__clsm_put(session, clsm, &cursor->key, &value, true)); + + /* + * Set the cursor to reference the internal key/value of the positioned + * cursor. + */ + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + WT_ITEM_SET(cursor->key, clsm->current->key); + WT_ITEM_SET(cursor->value, clsm->current->value); + WT_ASSERT(session, + F_MASK(clsm->current, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); + WT_ASSERT(session, + F_MASK(clsm->current, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); err: __wt_scr_free(session, &buf); __clsm_leave(clsm); @@ -1589,9 +1600,13 @@ __clsm_remove(WT_CURSOR *cursor) WT_DECL_RET; WT_ITEM value; WT_SESSION_IMPL *session; + bool positioned; clsm = (WT_CURSOR_LSM *)cursor; + /* Check if the cursor is positioned. 
*/ + positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT); + CURSOR_REMOVE_API_CALL(cursor, session, NULL); WT_CURSOR_NEEDKEY(cursor); WT_CURSOR_NOVALUE(cursor); @@ -1600,9 +1615,22 @@ __clsm_remove(WT_CURSOR *cursor) if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) || (ret = __clsm_lookup(clsm, &value)) == 0) ret = __clsm_put( - session, clsm, &cursor->key, &__tombstone, true); + session, clsm, &cursor->key, &__tombstone, positioned); err: __clsm_leave(clsm); + + /* + * If the cursor was positioned, it stays positioned with a key but no + * no value, otherwise, there's no position, key or value. This isn't + * just cosmetic, without a reset, iteration on this cursor won't start + * at the beginning/end of the table. + */ + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if (positioned) + F_SET(cursor, WT_CURSTD_KEY_INT); + else + WT_TRET(cursor->reset(cursor)); + CURSOR_UPDATE_API_END(session, ret); return (ret); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 748f4aa2473..ec150f39fc5 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -274,7 +274,6 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) btree = S2BT(session); /* Find out if we have to force a checkpoint. 
*/ - force = false; WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); force = cval.val != 0; if (!force) { diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index e2b72532703..10ab890f2f5 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -4,8 +4,13 @@ LDADD = $(top_builddir)/test/utility/libtest_util.la \ $(top_builddir)/libwiredtiger.la AM_LDFLAGS = -static +noinst_PROGRAMS= + +test_scope_SOURCES = scope/main.c +noinst_PROGRAMS += test_scope + test_wt1965_col_efficiency_SOURCES = wt1965_col_efficiency/main.c -noinst_PROGRAMS = test_wt1965_col_efficiency +noinst_PROGRAMS += test_wt1965_col_efficiency test_wt2403_lsm_workload_SOURCES = wt2403_lsm_workload/main.c noinst_PROGRAMS += test_wt2403_lsm_workload diff --git a/test/csuite/scope/main.c b/test/csuite/scope/main.c new file mode 100644 index 00000000000..15dabd97c40 --- /dev/null +++ b/test/csuite/scope/main.c @@ -0,0 +1,288 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +#define KEY "key" +#define VALUE "value" + +static int ignore_errors; + +static int +handle_error(WT_EVENT_HANDLER *handler, + WT_SESSION *session, int error, const char *message) +{ + (void)(handler); + + /* Skip the error messages we're expecting to see. */ + if (ignore_errors > 0 && + (strstr(message, "requires key be set") != NULL || + strstr(message, "requires value be set") != NULL)) { + --ignore_errors; + return (0); + } + + (void)fprintf(stderr, "%s: %s\n", + message, session->strerror(session, error)); + return (0); +} + +static WT_EVENT_HANDLER event_handler = { + handle_error, + NULL, + NULL, + NULL +}; + +static void +cursor_scope_ops(WT_SESSION *session, const char *uri) +{ + struct { + const char *op; + enum { INSERT, SEARCH, SEARCH_NEAR, + REMOVE, REMOVE_POS, RESERVE, UPDATE } func; + const char *config; + } *op, ops[] = { + /* + * The ops order is fixed and shouldn't change, that is, insert + * has to happen first so search, update and remove operations + * are possible, and remove has to be last. + */ + { "insert", INSERT, NULL, }, + { "search", SEARCH, NULL, }, + { "search", SEARCH_NEAR, NULL, }, +#if 0 + { "reserve", RESERVE, NULL, }, +#endif + { "update", UPDATE, NULL, }, + { "remove", REMOVE, NULL, }, + { "remove", REMOVE_POS, NULL, }, + { NULL, INSERT, NULL } + }; + WT_CURSOR *cursor; + uint64_t keyr; + const char *key, *value; + char keybuf[100], valuebuf[100]; + int exact; + bool recno; + + /* Reserve requires a running transaction. 
*/ + testutil_check(session->begin_transaction(session, NULL)); + + cursor = NULL; + for (op = ops; op->op != NULL; op++) { + key = value = NULL; + + /* Open a cursor. */ + if (cursor != NULL) + testutil_check(cursor->close(cursor)); + testutil_check(session->open_cursor( + session, uri, NULL, op->config, &cursor)); + recno = strcmp(cursor->key_format, "r") == 0; + + /* + * Set up application buffers so we can detect overwrites + * or failure to copy application information into library + * memory. + */ + if (recno) + cursor->set_key(cursor, (uint64_t)1); + else { + strcpy(keybuf, KEY); + cursor->set_key(cursor, keybuf); + } + strcpy(valuebuf, VALUE); + cursor->set_value(cursor, valuebuf); + + /* + * The application must keep key and value memory valid until + * the next operation that positions the cursor, modifies the + * data, or resets or closes the cursor. + * + * Modifying either the key or value buffers is not permitted. + */ + switch (op->func) { + case INSERT: + testutil_check(cursor->insert(cursor)); + break; + case SEARCH: + testutil_check(cursor->search(cursor)); + break; + case SEARCH_NEAR: + testutil_check(cursor->search_near(cursor, &exact)); + break; + case REMOVE_POS: + /* + * Remove has two modes, one where the remove is based + * on a cursor position, the other where it's based on + * a set key. The results are different, so test them + * separately. + */ + testutil_check(cursor->search(cursor)); + /* FALLTHROUGH */ + case REMOVE: + testutil_check(cursor->remove(cursor)); + break; + case RESERVE: +#if 0 + testutil_check(cursor->reserve(cursor)); +#endif + break; + case UPDATE: + testutil_check(cursor->update(cursor)); + break; + } + + /* + * The cursor should no longer reference application memory, + * and application buffers can be safely overwritten. + */ + memset(keybuf, 'K', sizeof(keybuf)); + memset(valuebuf, 'V', sizeof(valuebuf)); + + /* + * Check that get_key/get_value behave as expected after the + * operation. 
+ */ + switch (op->func) { + case INSERT: + case REMOVE: + /* + * Insert and remove configured with a search key do + * not position the cursor and have no key or value. + * + * There should be two error messages, ignore them. + */ + ignore_errors = 2; + if (recno) + testutil_assert( + cursor->get_key(cursor, &keyr) != 0); + else + testutil_assert( + cursor->get_key(cursor, &key) != 0); + testutil_assert(cursor->get_value(cursor, &value) != 0); + testutil_assert(ignore_errors == 0); + break; + case REMOVE_POS: + /* + * Remove configured with a cursor position has a key, + * but no value. + * + * There should be one error message, ignore it. + */ + if (recno) { + testutil_assert( + cursor->get_key(cursor, &keyr) == 0); + testutil_assert(keyr == 1); + } else { + testutil_assert( + cursor->get_key(cursor, &key) == 0); + testutil_assert(key != keybuf); + testutil_assert(strcmp(key, KEY) == 0); + } + ignore_errors = 1; + testutil_assert(cursor->get_value(cursor, &value) != 0); + testutil_assert(ignore_errors == 0); + break; + case RESERVE: + case SEARCH: + case SEARCH_NEAR: + case UPDATE: + /* + * Reserve, search, search-near and update position the + * cursor and have both a key and value. + * + * Any key/value should not reference application + * memory. + */ + if (recno) { + testutil_assert( + cursor->get_key(cursor, &keyr) == 0); + testutil_assert(keyr == 1); + } else { + testutil_assert( + cursor->get_key(cursor, &key) == 0); + testutil_assert(key != keybuf); + testutil_assert(strcmp(key, KEY) == 0); + } + testutil_assert(cursor->get_value(cursor, &value) == 0); + testutil_assert(value != valuebuf); + testutil_assert(strcmp(value, VALUE) == 0); + break; + } + + /* + * We have more than one remove operation, add the key back + * in. 
+ */ + if (op->func == REMOVE || op->func == REMOVE_POS) { + if (recno) + cursor->set_key(cursor, (uint64_t)1); + else { + cursor->set_key(cursor, KEY); + } + cursor->set_value(cursor, VALUE); + testutil_check(cursor->insert(cursor)); + } + } +} + +static void +run(WT_CONNECTION *conn, const char *uri, const char *config) +{ + WT_SESSION *session; + + testutil_check(conn->open_session(conn, NULL, NULL, &session)); + testutil_check(session->create(session, uri, config)); + cursor_scope_ops(session, uri); + testutil_check(session->close(session, NULL)); +} + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + testutil_check( + wiredtiger_open(opts->home, &event_handler, "create", &opts->conn)); + + run(opts->conn, "file:file.SS", "key_format=S,value_format=S"); + run(opts->conn, "file:file.rS", "key_format=r,value_format=S"); + run(opts->conn, "lsm:lsm.SS", "key_format=S,value_format=S"); + run(opts->conn, "lsm:lsm.rS", "key_format=r,value_format=S"); + run(opts->conn, "table:table.SS", "key_format=S,value_format=S"); + run(opts->conn, "table:table.rS", "key_format=r,value_format=S"); + + testutil_cleanup(opts); + + return (EXIT_SUCCESS); +} diff --git a/test/suite/test_cursor10.py b/test/suite/test_cursor10.py index b3cffeab4e9..6cabfde9f1f 100644 --- a/test/suite/test_cursor10.py +++ b/test/suite/test_cursor10.py @@ -31,11 +31,11 @@ from wtscenario import make_scenarios # test_cursor10.py # Cursors with projections. 
-class test_cursor04(wttest.WiredTigerTestCase): +class test_cursor10(wttest.WiredTigerTestCase): """ Test cursor search and search_near """ - table_name1 = 'test_cursor04' + table_name1 = 'test_cursor10' nentries = 20 scenarios = make_scenarios([ diff --git a/test/suite/test_cursor11.py b/test/suite/test_cursor11.py new file mode 100644 index 00000000000..e159ec499e6 --- /dev/null +++ b/test/suite/test_cursor11.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +import wiredtiger, wttest +from wtdataset import SimpleDataSet, SimpleIndexDataSet +from wtdataset import SimpleLSMDataSet, ComplexDataSet, ComplexLSMDataSet +from wtscenario import make_scenarios + +# test_cursor11.py +# WT_CURSOR position tests: remove (if not already positioned), and insert +# leave the cursor without position or information. +class test_cursor11(wttest.WiredTigerTestCase): + + keyfmt = [ + ('integer', dict(keyfmt='i')), + ('recno', dict(keyfmt='r')), + ('string', dict(keyfmt='S')), + ] + types = [ + ('file', dict(uri='file', ds=SimpleDataSet)), + ('lsm', dict(uri='lsm', ds=SimpleDataSet)), + ('table-complex', dict(uri='table', ds=ComplexDataSet)), + ('table-complex-lsm', dict(uri='table', ds=ComplexLSMDataSet)), + ('table-index', dict(uri='table', ds=SimpleIndexDataSet)), + ('table-simple', dict(uri='table', ds=SimpleDataSet)), + ('table-simple-lsm', dict(uri='table', ds=SimpleLSMDataSet)), + ] + scenarios = make_scenarios(types, keyfmt) + + def skip(self): + return self.keyfmt == 'r' and \ + (self.ds.is_lsm() or self.uri == 'lsm') + + # Do a remove using the cursor after setting a position, and confirm + # the key and position remain set but no value. + def test_cursor_remove_with_position(self): + if self.skip(): + return + + # Build an object. + uri = self.uri + ':test_cursor11' + ds = self.ds(self, uri, 50, key_format=self.keyfmt) + ds.populate() + s = self.conn.open_session() + c = s.open_cursor(uri, None) + + c.set_key(ds.key(25)) + self.assertEquals(c.search(), 0) + self.assertEquals(c.next(), 0) + self.assertEquals(c.get_key(), ds.key(26)) + c.remove() + self.assertEquals(c.get_key(), ds.key(26)) + msg = '/requires value be set/' + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, c.get_value, msg) + self.assertEquals(c.next(), 0) + self.assertEquals(c.get_key(), ds.key(27)) + + # Do a remove using the cursor without setting a position, and confirm + # no key, value or position remains. 
+ def test_cursor_remove_without_position(self): + if self.skip(): + return + + # Build an object. + uri = self.uri + ':test_cursor11' + ds = self.ds(self, uri, 50, key_format=self.keyfmt) + ds.populate() + s = self.conn.open_session() + c = s.open_cursor(uri, None) + + c.set_key(ds.key(25)) + c.remove() + msg = '/requires key be set/' + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, c.get_key, msg) + msg = '/requires value be set/' + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, c.get_value, msg) + self.assertEquals(c.next(), 0) + self.assertEquals(c.get_key(), ds.key(1)) + + # Do a remove using the key after also setting a position, and confirm + # no key, value or position remains. + def test_cursor_remove_with_key_and_position(self): + if self.skip(): + return + + # Build an object. + uri = self.uri + ':test_cursor11' + ds = self.ds(self, uri, 50, key_format=self.keyfmt) + ds.populate() + s = self.conn.open_session() + c = s.open_cursor(uri, None) + + c.set_key(ds.key(25)) + self.assertEquals(c.search(), 0) + c.set_key(ds.key(25)) + c.remove() + msg = '/requires key be set/' + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, c.get_key, msg) + msg = '/requires value be set/' + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, c.get_value, msg) + self.assertEquals(c.next(), 0) + self.assertEquals(c.get_key(), ds.key(1)) + + # Do an insert and confirm no key, value or position remains. + def test_cursor_insert(self): + if self.skip(): + return + + # Build an object. 
+ uri = self.uri + ':test_cursor11' + ds = self.ds(self, uri, 50, key_format=self.keyfmt) + ds.populate() + s = self.conn.open_session() + c = s.open_cursor(uri, None) + + c.set_key(ds.key(25)) + c.set_value(ds.value(300)) + c.insert() + msg = '/requires key be set/' + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, c.get_key, msg) + msg = '/requires value be set/' + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, c.get_value, msg) + self.assertEquals(c.next(), 0) + self.assertEquals(c.get_key(), ds.key(1)) + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1 From b77f9cc3b7fe7c15445c13df9bef74f1dd39b991 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Thu, 16 Mar 2017 15:03:31 +1100 Subject: WT-3218 Avoid adding duplicate handles to connection dhandle list (#3331) * Recheck for existence after acquiring write lock when creating a new dhandle. * Add a wtperf workload that reproduced the original failure. --- bench/wtperf/runners/many-table-stress.wtperf | 19 +++++++++++++++++++ src/conn/conn_dhandle.c | 8 ++++++++ 2 files changed, 27 insertions(+) create mode 100644 bench/wtperf/runners/many-table-stress.wtperf diff --git a/bench/wtperf/runners/many-table-stress.wtperf b/bench/wtperf/runners/many-table-stress.wtperf new file mode 100644 index 00000000000..51d0bb0dd9d --- /dev/null +++ b/bench/wtperf/runners/many-table-stress.wtperf @@ -0,0 +1,19 @@ +# Create a set of tables with uneven distribution of data +conn_config="cache_size=1G,eviction=(threads_max=8),file_manager=(close_idle_time=100000),checkpoint=(wait=20,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" +table_config="type=file" +table_count=5000 +icount=0 +random_range=1000000000 +pareto=10 +range_partition=true +report_interval=5 + +run_ops=1000000 +populate_threads=0 +icount=0 +threads=((count=60,inserts=1)) + +# Warn if a latency over 1 second is seen +max_latency=1000 +sample_interval=5 +sample_rate=1 diff --git a/src/conn/conn_dhandle.c 
b/src/conn/conn_dhandle.c index 6c8d66d63f8..c5480897494 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -42,6 +42,14 @@ __wt_conn_dhandle_alloc( WT_DECL_RET; uint64_t bucket; + /* + * Ensure no one beat us to creating the handle now that we hold the + * write lock. + */ + if ((ret = + __wt_conn_dhandle_find(session, uri, checkpoint)) != WT_NOTFOUND) + return (ret); + WT_RET(__wt_calloc_one(session, &dhandle)); __wt_rwlock_init(session, &dhandle->rwlock); -- cgit v1.2.1 From 51d22616094e0a0d34997d26aec925adf949fbdf Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Thu, 16 Mar 2017 16:26:49 +1100 Subject: WT-3206 Fix a race allocating split generations. (#3332) We use split generations to detect when readers may be looking at structures that are replaced by a split. For correctness, we should only increment the global split generation *after* a split becomes public. Only then can we safely check that no thread is still reading with the old generation. Previously, a split could increment the global split generation, then a thread could start reading with the new split generation but see the old index structure. This issue was introduced by WT 3088, where we wanted a way to ensure that newly-allocated pages don't split until it is safe. That is solved here by having the split code pin a split generation in the ordinary way (without allocating a new one) for the duration that splits of new pages need to be prevented. 
--- src/btree/bt_split.c | 71 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 45550ff627f..6b2100ec7e3 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -187,7 +187,7 @@ __split_safe_free(WT_SESSION_IMPL *session, exclusive = true; if (exclusive) { - __wt_free(session, p); + __wt_overwrite_and_free_len(session, p, s); return (0); } @@ -640,12 +640,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* Get a generation for this split, mark the root page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - root->pg_intl_split_gen = split_gen; - - /* Prepare the WT_REFs for the move. */ - __split_ref_prepare(session, alloc_index, split_gen, false); + /* + * Prepare the WT_REFs for the move: this requires a stable split + * generation to block splits in newly created pages, so get one. + */ + WT_ENTER_PAGE_INDEX(session); + __split_ref_prepare(session, alloc_index, session->split_gen, false); /* * Confirm the root page's index hasn't moved, then update it, which @@ -655,6 +655,16 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_INTL_INDEX_SET(root, alloc_index); alloc_index = NULL; + WT_LEAVE_PAGE_INDEX(session); + + /* + * Get a generation for this split, mark the root page. This must be + * after the new index is swapped into place in order to know that no + * readers are looking at the old index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + root->pg_intl_split_gen = split_gen; + #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, ret = __split_verify_root(session, root)); @@ -825,10 +835,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, /* Start making real changes to the tree, errors are fatal. 
*/ complete = WT_ERR_PANIC; - /* Get a generation for this split, mark the parent page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - parent->pg_intl_split_gen = split_gen; - /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -837,6 +843,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_INTL_INDEX_SET(parent, alloc_index); alloc_index = NULL; + /* + * Get a generation for this split, mark the page. This must be after + * the new index is swapped into place in order to know that no readers + * are looking at the old index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + parent->pg_intl_split_gen = split_gen; + /* * If discarding the page's original WT_REF field, reset it to split. * Threads cursoring through the tree were blocked because that WT_REF @@ -1154,23 +1168,34 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* Get a generation for this split, mark the page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - page->pg_intl_split_gen = split_gen; - - /* Prepare the WT_REFs for the move. */ - __split_ref_prepare(session, alloc_index, split_gen, true); + /* + * Prepare the WT_REFs for the move: this requires a stable split + * generation to block splits in newly created pages, so get one. + */ + WT_ENTER_PAGE_INDEX(session); + __split_ref_prepare(session, alloc_index, session->split_gen, true); /* Split into the parent. 
*/ - WT_ERR(__split_parent(session, page_ref, alloc_index->index, - alloc_index->entries, parent_incr, false, false)); + if ((ret = __split_parent(session, page_ref, alloc_index->index, + alloc_index->entries, parent_incr, false, false)) == 0) { + /* + * Confirm the page's index hasn't moved, then update it, which + * makes the split visible to threads descending the tree. + */ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); + WT_INTL_INDEX_SET(page, replace_index); + } + + WT_LEAVE_PAGE_INDEX(session); + WT_ERR(ret); /* - * Confirm the page's index hasn't moved, then update it, which makes - * the split visible to threads descending the tree. + * Get a generation for this split, mark the parent page. This must be + * after the new index is swapped into place in order to know that no + * readers are looking at the old index. */ - WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); - WT_INTL_INDEX_SET(page, replace_index); + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + page->pg_intl_split_gen = split_gen; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, -- cgit v1.2.1 From 92327e8fd19c92ca5687f6e093553fb96c742688 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Thu, 16 Mar 2017 15:03:31 +1100 Subject: WT-3218 Avoid adding duplicate handles to connection dhandle list (#3331) * Recheck for existence after acquiring write lock when creating a new dhandle. * Add a wtperf workload that reproduced the original failure. 
(cherry picked from commit b77f9cc3b7fe7c15445c13df9bef74f1dd39b991) --- bench/wtperf/runners/many-table-stress.wtperf | 19 +++++++++++++++++++ src/conn/conn_dhandle.c | 8 ++++++++ 2 files changed, 27 insertions(+) create mode 100644 bench/wtperf/runners/many-table-stress.wtperf diff --git a/bench/wtperf/runners/many-table-stress.wtperf b/bench/wtperf/runners/many-table-stress.wtperf new file mode 100644 index 00000000000..51d0bb0dd9d --- /dev/null +++ b/bench/wtperf/runners/many-table-stress.wtperf @@ -0,0 +1,19 @@ +# Create a set of tables with uneven distribution of data +conn_config="cache_size=1G,eviction=(threads_max=8),file_manager=(close_idle_time=100000),checkpoint=(wait=20,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" +table_config="type=file" +table_count=5000 +icount=0 +random_range=1000000000 +pareto=10 +range_partition=true +report_interval=5 + +run_ops=1000000 +populate_threads=0 +icount=0 +threads=((count=60,inserts=1)) + +# Warn if a latency over 1 second is seen +max_latency=1000 +sample_interval=5 +sample_rate=1 diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 866b8633f71..99213c5b557 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -38,6 +38,14 @@ __wt_conn_dhandle_alloc( WT_DECL_RET; uint64_t bucket; + /* + * Ensure no one beat us to creating the handle now that we hold the + * write lock. + */ + if ((ret = + __wt_conn_dhandle_find(session, uri, checkpoint)) != WT_NOTFOUND) + return (ret); + WT_RET(__wt_calloc_one(session, &dhandle)); __wt_rwlock_init(session, &dhandle->rwlock); -- cgit v1.2.1 From cc2f15f595b16479affd73791c207da334453bcc Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Thu, 16 Mar 2017 16:26:49 +1100 Subject: WT-3206 Fix a race allocating split generations. (#3332) We use split generations to detect when readers may be looking at structures that are replaced by a split. 
For correctness, we should only increment the global split generation *after* a split becomes public. Only then can we safely check that no thread is still reading with the old generation. Previously, a split could increment the global split generation, then a thread could start reading with the new split generation but see the old index structure. This issue was introduced by WT 3088, where we wanted a way to ensure that newly-allocated pages don't split until it is safe. That is solved here by having the split code pin a split generation in the ordinary way (without allocating a new one) for the duration that splits of new pages need to be prevented. (cherry picked from commit 51d22616094e0a0d34997d26aec925adf949fbdf) --- src/btree/bt_split.c | 71 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 45550ff627f..6b2100ec7e3 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -187,7 +187,7 @@ __split_safe_free(WT_SESSION_IMPL *session, exclusive = true; if (exclusive) { - __wt_free(session, p); + __wt_overwrite_and_free_len(session, p, s); return (0); } @@ -640,12 +640,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* Get a generation for this split, mark the root page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - root->pg_intl_split_gen = split_gen; - - /* Prepare the WT_REFs for the move. */ - __split_ref_prepare(session, alloc_index, split_gen, false); + /* + * Prepare the WT_REFs for the move: this requires a stable split + * generation to block splits in newly created pages, so get one. 
+ */ + WT_ENTER_PAGE_INDEX(session); + __split_ref_prepare(session, alloc_index, session->split_gen, false); /* * Confirm the root page's index hasn't moved, then update it, which @@ -655,6 +655,16 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_INTL_INDEX_SET(root, alloc_index); alloc_index = NULL; + WT_LEAVE_PAGE_INDEX(session); + + /* + * Get a generation for this split, mark the root page. This must be + * after the new index is swapped into place in order to know that no + * readers are looking at the old index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + root->pg_intl_split_gen = split_gen; + #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, ret = __split_verify_root(session, root)); @@ -825,10 +835,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* Get a generation for this split, mark the parent page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - parent->pg_intl_split_gen = split_gen; - /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -837,6 +843,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_INTL_INDEX_SET(parent, alloc_index); alloc_index = NULL; + /* + * Get a generation for this split, mark the page. This must be after + * the new index is swapped into place in order to know that no readers + * are looking at the old index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + parent->pg_intl_split_gen = split_gen; + /* * If discarding the page's original WT_REF field, reset it to split. * Threads cursoring through the tree were blocked because that WT_REF @@ -1154,23 +1168,34 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) /* Start making real changes to the tree, errors are fatal. 
*/ complete = WT_ERR_PANIC; - /* Get a generation for this split, mark the page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - page->pg_intl_split_gen = split_gen; - - /* Prepare the WT_REFs for the move. */ - __split_ref_prepare(session, alloc_index, split_gen, true); + /* + * Prepare the WT_REFs for the move: this requires a stable split + * generation to block splits in newly created pages, so get one. + */ + WT_ENTER_PAGE_INDEX(session); + __split_ref_prepare(session, alloc_index, session->split_gen, true); /* Split into the parent. */ - WT_ERR(__split_parent(session, page_ref, alloc_index->index, - alloc_index->entries, parent_incr, false, false)); + if ((ret = __split_parent(session, page_ref, alloc_index->index, + alloc_index->entries, parent_incr, false, false)) == 0) { + /* + * Confirm the page's index hasn't moved, then update it, which + * makes the split visible to threads descending the tree. + */ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); + WT_INTL_INDEX_SET(page, replace_index); + } + + WT_LEAVE_PAGE_INDEX(session); + WT_ERR(ret); /* - * Confirm the page's index hasn't moved, then update it, which makes - * the split visible to threads descending the tree. + * Get a generation for this split, mark the parent page. This must be + * after the new index is swapped into place in order to know that no + * readers are looking at the old index. 
*/ - WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); - WT_INTL_INDEX_SET(page, replace_index); + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + page->pg_intl_split_gen = split_gen; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, -- cgit v1.2.1 From 6203106c56504f194bab7093b28c45ae7beb9cac Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 16 Mar 2017 07:49:08 -0400 Subject: WT-3204 eviction changes cost LSM performance (#3325) * WT-3204 eviction changes cost LSM performance Modify LSM's primary chunk switching to match the new btree eviction semantics on object creation. We now create objects with eviction turned off, LSM should no longer have to turn eviction off when configuring the primary chunk. LSM previously set WT_BTREE.bulk_load_ok to false to ensure an insert into the tree wouldn't turn eviction on. That problem remains, but there's a race in the implementation if multiple threads are inserting at the same time (where a thread modifies WT_BTREE.bulk_load_ok and goes to sleep before configuring eviction, and another thread does an insert and turns off eviction), and there's a further race between threads doing F_ISSET/F_SET tests. Change the WT_BTREE_LSM_PRIMARY flag into a WT_BTREE.lsm_primary variable so there's no F_ISSET/F_SET race. Remove the test/set of bulk-load_ok, instead, test the lsm_primary value in the btree code before turning eviction off. When checkpointing an LSM chunk, move the code that turns off the chunk's primary flag in the chunk inside the single-threaded part of the function to ensure we don't race with other threads doing checkpoints. That makes the code to fix up the accounting single-threaded and safe. Simplify the LSM checkpoint code to call __wt_checkpoint directly, and use the same handle for turning off the chunk's primary flag as we use for the checkpoint. * Force a primary switch in LSM after an exclusive-handle operation has come through. 
Otherwise it's possible to attempt to use a file as the primary chunk without disabling eviction. * spelling * WT_BTREE.bulk_load_ok isn't a boolean, don't use true/false comparisons. * Only check for an empty tree the first time an LSM chunk is opened. The goal here is to make sure that LSM primary chunks start empty. Otherwise, we can't load into a skiplist in memory as required by LSM. If an operation such as verify closes a btree in order to check the on-disk state, the next time it is opened we have to check whether it is empty. It is safe to do this check without locking: what matters is that we always do the `lsm_primary` check before any update operation that would turn off `btree->bulk_load_ok`. * Rename WT_BTREE.bulk_load_ok to be WT_BTREE.original, it's used by LSM. * Fix a comment. --- src/btree/bt_cursor.c | 24 ++++++---- src/btree/bt_handle.c | 4 +- src/include/btree.h | 9 ++-- src/include/btree.i | 60 ++----------------------- src/lsm/lsm_cursor.c | 31 ++++++++----- src/lsm/lsm_work_unit.c | 109 ++++++++++++++++++++++++++++++++++------------ src/reconcile/rec_write.c | 3 +- src/txn/txn_ckpt.c | 2 +- 8 files changed, 130 insertions(+), 112 deletions(-) diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 3ae6e022906..d6dc0991d3f 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -126,15 +126,23 @@ static inline void __cursor_disable_bulk(WT_SESSION_IMPL *session, WT_BTREE *btree) { /* - * Once a tree is no longer empty, eviction should pay attention to it, - * and it's no longer possible to bulk-load into it. - * - * We use a compare-and-swap here to avoid races among the first - * inserts into a tree. Eviction is disabled when an empty tree is - * opened, it must only be enabled once. + * Once a tree (other than the LSM primary) is no longer empty, eviction + * should pay attention to it, and it's no longer possible to bulk-load + * into it. 
+ */ + if (!btree->original) + return; + if (btree->lsm_primary) { + btree->original = 0; /* Make the next test faster. */ + return; + } + + /* + * We use a compare-and-swap here to avoid races among the first inserts + * into a tree. Eviction is disabled when an empty tree is opened, and + * it must only be enabled once. */ - if (btree->bulk_load_ok && - __wt_atomic_cas8(&btree->bulk_load_ok, 1, 0)) + if (__wt_atomic_cas8(&btree->original, 1, 0)) __wt_evict_file_exclusive_off(session); } diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index ff199eb1e0e..f2bffee06da 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -188,7 +188,7 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) * Special operations don't enable eviction. (The underlying commands * may turn on eviction, but it's their decision.) */ - if (btree->bulk_load_ok || + if (btree->original || F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) WT_ERR(__wt_evict_file_exclusive_on(session)); @@ -562,7 +562,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation) * tree. */ if (creation) - btree->bulk_load_ok = 1; + btree->original = 1; /* * A note about empty trees: the initial tree is a single root page. 
diff --git a/src/include/btree.h b/src/include/btree.h index 857dc6694c5..15a68474fdf 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -118,11 +118,13 @@ struct __wt_btree { uint64_t last_recno; /* Column-store last record number */ - WT_REF root; /* Root page reference */ - bool modified; /* If the tree ever modified */ - uint8_t bulk_load_ok; /* Bulk-load is a possibility + WT_REF root; /* Root page reference */ + bool modified; /* If the tree ever modified */ + uint8_t original; /* Newly created: bulk-load possible (want a bool but needs atomic cas) */ + bool lsm_primary; /* Handle is/was the LSM primary */ + WT_BM *bm; /* Block manager reference */ u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */ @@ -160,7 +162,6 @@ struct __wt_btree { #define WT_BTREE_IGNORE_CACHE 0x000400 /* Cache-resident object */ #define WT_BTREE_IN_MEMORY 0x000800 /* Cache-resident object */ #define WT_BTREE_LOOKASIDE 0x001000 /* Look-aside table */ -#define WT_BTREE_LSM_PRIMARY 0x002000 /* Handle is current LSM primary */ #define WT_BTREE_NO_CHECKPOINT 0x004000 /* Disable checkpoints */ #define WT_BTREE_NO_LOGGING 0x008000 /* Disable logging */ #define WT_BTREE_NO_RECONCILE 0x010000 /* Allow splits, even with no evict */ diff --git a/src/include/btree.i b/src/include/btree.i index cec6f67e9bd..c0c5c7c5a8d 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -149,7 +149,7 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) if (WT_PAGE_IS_INTERNAL(page)) { (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); - } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { + } else if (!btree->lsm_primary) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } @@ -285,7 +285,7 @@ __wt_cache_page_byte_dirty_decr( decr, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl, decr, 
"WT_CACHE.bytes_dirty_intl"); - } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { + } else if (!btree->lsm_primary) { __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf, decr, "WT_BTREE.bytes_dirty_leaf"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_leaf, @@ -345,7 +345,7 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->pages_dirty_intl, 1); } else { - if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { + if (!btree->lsm_primary) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } @@ -444,7 +444,7 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_cache_decr_zero_uint64(session, &cache->bytes_dirty_intl, modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl"); - } else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { + } else if (!btree->lsm_primary) { __wt_cache_decr_zero_uint64(session, &btree->bytes_dirty_leaf, modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf"); @@ -1545,58 +1545,6 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) return (child->memory_footprint > maxsize); } -/* - * __wt_btree_lsm_switch_primary -- - * Switch a btree handle to/from the current primary chunk of an LSM tree. 
- */ -static inline int -__wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on) -{ - WT_BTREE *btree; - WT_CACHE *cache; - WT_PAGE *child, *root; - WT_PAGE_INDEX *pindex; - WT_REF *first; - size_t size; - - btree = S2BT(session); - cache = S2C(session)->cache; - root = btree->root.page; - - if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { - F_SET(btree, WT_BTREE_LSM_PRIMARY); - WT_RET(__wt_evict_file_exclusive_on(session)); - } - if (!on && F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { - pindex = WT_INTL_INDEX_GET_SAFE(root); - if (btree->evict_disabled == 0 || pindex->entries != 1) - return (0); - first = pindex->index[0]; - - /* - * We're reaching down into the page without a hazard pointer, - * but that's OK because we know that no-eviction is set so the - * page can't disappear. - * - * While this tree was the primary, its dirty bytes were not - * included in the cache accounting. Fix that now before we - * open it up for eviction. - */ - child = first->page; - if (first->state == WT_REF_MEM && - child->type == WT_PAGE_ROW_LEAF && - __wt_page_is_modified(child)) { - size = child->modify->bytes_dirty; - (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); - (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); - } - - F_CLR(btree, WT_BTREE_LSM_PRIMARY); - __wt_evict_file_exclusive_off(session); - } - return (0); -} - /* * __wt_split_descent_race -- * Return if we raced with an internal page split when descending the tree. diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 77fa96ebdfd..bd1daaa6915 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -688,20 +688,29 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && chunk->switch_txn == WT_TXN_NONE) { - clsm->primary_chunk = chunk; primary = clsm->chunks[clsm->nchunks - 1]->cursor; + btree = ((WT_CURSOR_BTREE *)primary)->btree; + /* - * Disable eviction for the in-memory chunk. 
Also clear the - * bulk load flag here, otherwise eviction will be enabled by - * the first update. + * If the primary is not yet set as the primary, do that now. + * Note that eviction was configured off when the underlying + * object was created, which is what we want, leave it alone. + * + * We don't have to worry about races here: every thread that + * modifies the tree will have to come through here, at worst + * we set the flag repeatedly. We don't use a WT_BTREE handle + * flag, however, because we could race doing the read-modify-write of + * the flags field. + * + * If something caused the chunk to be closed and reopened + * since it was created, we can no longer use it as a primary + * chunk and we need to force a switch. We detect the tree was + * created when it was opened by checking the "original" flag. */ - btree = ((WT_CURSOR_BTREE *)(primary))->btree; - if (btree->bulk_load_ok) { - btree->bulk_load_ok = false; - WT_WITH_BTREE(session, btree, - ret = __wt_btree_lsm_switch_primary(session, true)); - WT_ERR(ret); - } + if (!btree->lsm_primary && btree->original) + btree->lsm_primary = true; + if (btree->lsm_primary) + clsm->primary_chunk = chunk; } clsm->dsk_gen = lsm_tree->dsk_gen; diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index c9c350c5ac9..0b0801a8cca 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -255,6 +255,51 @@ err: return (ret); } +/* + * __lsm_switch_primary_off -- + * Switch when a btree handle is no longer the current primary chunk of + * an LSM tree. + */ +static void +__lsm_switch_primary_off(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + WT_PAGE *child, *root; + WT_PAGE_INDEX *pindex; + WT_REF *first; + size_t size; + + btree = S2BT(session); + cache = S2C(session)->cache; + root = btree->root.page; + pindex = WT_INTL_INDEX_GET_SAFE(root); + + /* Diagnostic: assert we've never split.
*/ + WT_ASSERT(session, pindex->entries == 1); + + /* + * We're reaching down into the page without a hazard pointer, + * but that's OK because we know that no-eviction is set so the + * page can't disappear. + * + * While this tree was the primary, its dirty bytes were not + * included in the cache accounting. Fix that now before we + * open it up for eviction. + */ + first = pindex->index[0]; + child = first->page; + if (first->state == WT_REF_MEM && + child->type == WT_PAGE_ROW_LEAF && __wt_page_is_modified(child)) { + size = child->modify->bytes_dirty; + (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); + (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); + } + + /* Configure eviction. */ + __wt_evict_file_exclusive_off(session); +} + /* * __wt_lsm_checkpoint_chunk -- * Flush a single LSM chunk to disk. @@ -263,11 +308,12 @@ int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { + WT_BTREE *btree; WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; - bool flush_set; + bool flush_set, release_btree; - flush_set = false; + flush_set = release_btree = false; /* * If the chunk is already checkpointed, make sure it is also evicted. @@ -318,20 +364,18 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, * We can wait here for checkpoints and fsyncs to complete, which can * take a long time. */ - if ((ret = __wt_session_get_btree( - session, chunk->uri, NULL, NULL, 0)) == 0) { - /* - * Set read-uncommitted: we have already checked that all of the - * updates in this chunk are globally visible, use the cheapest - * possible check in reconciliation. 
- */ - saved_isolation = session->txn.isolation; - session->txn.isolation = WT_ISO_READ_UNCOMMITTED; - ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES); - session->txn.isolation = saved_isolation; - WT_TRET(__wt_session_release_btree(session)); - } - WT_ERR(ret); + WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); + release_btree = true; + + /* + * Set read-uncommitted: we have already checked that all of the updates + * in this chunk are globally visible, use the cheapest possible check + * in reconciliation. + */ + saved_isolation = session->txn.isolation; + session->txn.isolation = WT_ISO_READ_UNCOMMITTED; + WT_ERR(__wt_cache_op(session, WT_SYNC_WRITE_LEAVES)); + session->txn.isolation = saved_isolation; __wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri); @@ -348,12 +392,28 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_ERR(__wt_meta_track_on(session)); WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_worker( - session, chunk->uri, __wt_checkpoint, NULL, NULL, 0))); + ret = __wt_checkpoint(session, NULL))); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); + /* + * If the chunk is the lsm primary, clear the no-eviction flag so it can + * be evicted and eventually closed. Only do once, and only do after the + * checkpoint has succeeded: otherwise, accessing the leaf page during + * the checkpoint can trigger forced eviction. + * + * We don't have to worry about races here, we're single-threaded. + */ + btree = S2BT(session); + if (btree->lsm_primary) { + __lsm_switch_primary_off(session); + btree->lsm_primary = false; + } + + release_btree = false; + WT_ERR(__wt_session_release_btree(session)); + /* Now the file is written, get the chunk size. 
*/ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); @@ -376,17 +436,6 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_PUBLISH(chunk->flushing, 0); flush_set = false; - /* - * Clear the no-eviction flag so the primary can be evicted and - * eventually closed. Only do this once the checkpoint has succeeded: - * otherwise, accessing the leaf page during the checkpoint can trigger - * forced eviction. - */ - WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); - WT_TRET(__wt_btree_lsm_switch_primary(session, false)); - WT_TRET(__wt_session_release_btree(session)); - WT_ERR(ret); - /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); @@ -403,6 +452,8 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, err: if (flush_set) WT_PUBLISH(chunk->flushing, 0); + if (release_btree) + WT_TRET(__wt_session_release_btree(session)); return (ret); } diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index a667a288187..88d4397fcb5 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -3583,11 +3583,12 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) uint64_t recno; btree = S2BT(session); + /* * Bulk-load is only permitted on newly created files, not any empty * file -- see the checkpoint code for a discussion. */ - if (!btree->bulk_load_ok) + if (!btree->original) WT_RET_MSG(session, EINVAL, "bulk-load is only possible for newly created trees"); diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index ec150f39fc5..80cdf1cd39b 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -1420,7 +1420,7 @@ __checkpoint_tree( * delete a physical checkpoint, and that will end in tears. 
*/ if (is_checkpoint) - if (btree->bulk_load_ok) { + if (btree->original) { fake_ckpt = true; goto fake; } -- cgit v1.2.1 From 6a3ee4ea9986ff2a7446c4774b04423673165c57 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 16 Mar 2017 11:42:07 -0400 Subject: WT-3225 WiredTiger won't build with clang on CentOS 7.3.1611 (#3333) * WT-3225 WiredTiger won't build with clang on CentOS 7.3.1611 Casting the call's return to int is because CentOS 7.3.1611 complains about syscall returning a long and the loss of integer precision in the assignment to ret. The cast should be a no-op everywhere. * On Centos 7.3.1611, system header files aren't compatible with -Wdisabled-macro-expansion. I don't see a big reason for having that warning, so I'm turning it off generally. Add -Wuninitialized to WiredTiger's gcc builds. --- build_posix/aclocal/strict.m4 | 5 +++++ dist/s_string.ok | 2 ++ src/include/os.h | 8 +++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/build_posix/aclocal/strict.m4 b/build_posix/aclocal/strict.m4 index c107dd017d7..659867fa69e 100644 --- a/build_posix/aclocal/strict.m4 +++ b/build_posix/aclocal/strict.m4 @@ -31,6 +31,7 @@ AC_DEFUN([AM_GCC_WARNINGS], [ w="$w -Wstrict-prototypes" w="$w -Wswitch-enum" w="$w -Wundef" + w="$w -Wuninitialized" w="$w -Wunreachable-code" w="$w -Wunsafe-loop-optimizations" w="$w -Wunused" @@ -66,6 +67,10 @@ AC_DEFUN([AM_CLANG_WARNINGS], [ # w="$w -Wno-error=cast-qual" w="$w -Wno-cast-qual" + # On Centos 7.3.1611, system header files aren't compatible with + # -Wdisabled-macro-expansion. 
+ w="$w -Wno-disabled-macro-expansion" + case "$1" in *Apple*clang*version*4.1*) # Apple clang has its own numbering system, and older OS X diff --git a/dist/s_string.ok b/dist/s_string.ok index cdfa4aec968..39b6b163cd9 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -68,6 +68,7 @@ CURSORs CURSTD CallsCustDate Castagnoli +CentOS Checkpointing Checksum Checksums @@ -1148,6 +1149,7 @@ sw sx sy sys +syscall sz t's tV diff --git a/src/include/os.h b/src/include/os.h index 7a8e47ed81f..8505649a1fd 100644 --- a/src/include/os.h +++ b/src/include/os.h @@ -11,8 +11,14 @@ * A call returning 0 indicates success; any call where \ * 0 is not the only successful return must provide an \ * expression evaluating to 0 in all successful cases. \ + * \ + * XXX \ + * Casting the call's return to int is because CentOS 7.3.1611 \ + * complains about syscall returning a long and the loss of \ + * integer precision in the assignment to ret. The cast should \ + * be a no-op everywhere. \ */ \ - if (((ret) = (call)) == 0) \ + if (((ret) = (int)(call)) == 0) \ break; \ /* \ * The call's error was either returned by the call or \ -- cgit v1.2.1 From c9b353c33631725e633e146f87c1c92e32a5def3 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Thu, 16 Mar 2017 14:38:05 -0400 Subject: WT-3218 Reduce to 2k tables so Jenkins doesn't hit open file ulimit. 
(#3334) --- bench/wtperf/runners/many-table-stress.wtperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/wtperf/runners/many-table-stress.wtperf b/bench/wtperf/runners/many-table-stress.wtperf index 51d0bb0dd9d..6cf1d5d2696 100644 --- a/bench/wtperf/runners/many-table-stress.wtperf +++ b/bench/wtperf/runners/many-table-stress.wtperf @@ -1,7 +1,7 @@ # Create a set of tables with uneven distribution of data conn_config="cache_size=1G,eviction=(threads_max=8),file_manager=(close_idle_time=100000),checkpoint=(wait=20,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000" table_config="type=file" -table_count=5000 +table_count=2000 icount=0 random_range=1000000000 pareto=10 -- cgit v1.2.1 From 4e47a53801a7bd54e323d9899905a69340ed8dfb Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Thu, 16 Mar 2017 23:45:17 -0400 Subject: WT-3212 Table cursors should not free memory owned by the table. (#3327) --- src/cursor/cur_table.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 98dbbec8981..72eec177449 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -1015,11 +1015,15 @@ __wt_curtable_open(WT_SESSION_IMPL *session, if (0) { err: if (*cursorp != NULL) { - if (*cursorp != cursor) - WT_TRET(__wt_cursor_close(*cursorp)); + /* + * When a dump cursor is opened, then *cursorp, not + * cursor, is the dump cursor. Close the dump cursor, + * and the table cursor will be closed as its child. + */ + cursor = *cursorp; *cursorp = NULL; } - WT_TRET(__curtable_close(cursor)); + WT_TRET(cursor->close(cursor)); } __wt_scr_free(session, &tmp); -- cgit v1.2.1 From 65ab67ed8d9777285dedf89cc506b9cffc52942e Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Thu, 16 Mar 2017 23:47:18 -0400 Subject: WT-2978 Python: make a pip-compatible installer. (#3320) * Build a static library with -fPIC objects, suitable for pulling into a dynamic library. 
Distribute our SWIG results, rather than running SWIG on the target machine. * Added builtin support for snappy and zlib. Made it easy to manage the list of builtins. --- lang/python/setup_pip.py | 408 +++++++++++++++++++++++++++++++++++++ lang/python/wiredtiger/pip_init.py | 48 +++++ 2 files changed, 456 insertions(+) create mode 100644 lang/python/setup_pip.py create mode 100644 lang/python/wiredtiger/pip_init.py diff --git a/lang/python/setup_pip.py b/lang/python/setup_pip.py new file mode 100644 index 00000000000..636eecab80a --- /dev/null +++ b/lang/python/setup_pip.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# + +# This script builds a Python source distribution that can be built and installed +# via pip install. This must be run in a git repository to determine the files +# to package. Also as a prerequisite, SWIG must be run as the generated files +# are part of the package. To create the distribution, in this directory, run +# "python setup_pip.py sdist", this creates a tar.gz file under ./dist . +from __future__ import print_function +import os, os.path, re, shutil, site, sys +from setuptools import setup, Distribution +from distutils.extension import Extension +import distutils.sysconfig +import distutils.ccompiler +from distutils.errors import CompileError, LinkError +import subprocess +from subprocess import call +import setuptools.command.install +import setuptools.command.build_ext + +# msg -- +# Print a message to stderr. +def msg(s): + print(os.path.basename(__file__) + ": " + s, file=sys.stderr) + +# die -- +# For failures, show a message and exit. +def die(s): + msg(s) + sys.exit(1) + +# build_commands -- +# Run a sequence of commands, and die if any fail. +def build_commands(commands, build_dir, build_env): + for command in commands: + callargs = [ 'sh', '-c', command ] + verbose_command = '"' + '" "'.join(callargs) + '"' + print('running: ' + verbose_command) + if call(callargs, cwd=build_dir, env=build_env) != 0: + die('build command failed: ' + verbose_command) + +# check_needed_dependencies -- +# Make a quick check of any needed library dependencies, and +# add to the library path and include path as needed. If a library +# is not found, it is not definitive.
+def check_needed_dependencies(builtins, inc_paths, lib_paths): + library_dirs = get_library_dirs() + compiler = distutils.ccompiler.new_compiler() + distutils.sysconfig.customize_compiler(compiler) + compiler.set_library_dirs(library_dirs) + missing = [] + for name, libname, instructions in builtins: + found = compiler.find_library_file(library_dirs, libname) + if found is None: + msg(libname + ": missing") + msg(instructions) + msg("after installing it, set LD_LIBRARY_PATH or DYLD_LIBRARY_PATH") + missing.append(libname) + else: + package_top = os.path.dirname(os.path.dirname(found)) + inc_paths.append(os.path.join(package_top, 'include')) + lib_paths.append(os.path.join(package_top, 'lib')) + + # XXX: we are not accounting for other directories that might be + # discoverable via /sbin/ldconfig. It might be better to write a tiny + # compile using -lsnappy, -lz... + # + #if len(missing) > 0: + # die("install packages for: " + str(missing)) + +# find_executable -- +# Locate an executable in the PATH. +def find_executable(exename, path): + p = subprocess.Popen(['which', exename ], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = p.communicate('') + out = str(out) # needed for Python3 + if out == '': + if err != '': + err = ': "' + err + '"' + die('"' + exename + '": not found in path' + err) + dirname = os.path.dirname(out) + if not dirname in path: + path.append(dirname) + +# get_build_path -- +# Create a PATH that can be used for installation. Apparently, +# installation commands are run with a restricted PATH, and +# autoreconf/aclocal will not normally be found. +def get_build_path(): + build_paths = [] + find_executable('autoreconf', build_paths) + find_executable('aclocal', build_paths) + build_path = os.environ['PATH'] + ':' + ':'.join(build_paths) + return build_path + +# get_compile_flags -- +# Get system specific compile flags. Return a triple: C preprocessor +# flags, C compilation flags and linker flags. 
+def get_compile_flags(inc_paths, lib_paths): + # Suppress warnings building SWIG generated code + if sys.platform == 'win32' and cc == 'msvc': + cflags = ['/arch:SSE2', '/EHsc'] + cppflags = [] + ldflags = [] + # Windows untested and incomplete, don't claim that it works. + die('Windows is not supported by this setup script') + else: + cflags = [ '-w', '-Wno-sign-conversion', '-std=c11' ] + cppflags = ['-I' + path for path in inc_paths] + cppflags.append('-DHAVE_CONFIG_H') + ldflags = ['-L' + path for path in lib_paths] + if sys.platform == 'darwin': + cflags.extend([ '-arch', 'x86_64' ]) + return (cppflags, cflags, ldflags) + +# get_sources_curdir -- +# Get a list of sources from the current directory +def get_sources_curdir(): + DEVNULL = open(os.devnull, 'w') + gitproc = subprocess.Popen( + ['git', 'ls-tree', '-r', '--name-only', 'HEAD^{tree}'], + stdin=DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + sources = [line.rstrip() for line in gitproc.stdout.readlines()] + err = gitproc.stderr.read() + gitproc.wait() + subret = gitproc.returncode + if subret != 0 or err: + msg("git command to get sources returned " + str(subret) + + ", error=" + str(err)) + die("this command must be run in a git repository") + return sources + +# get_wiredtiger_versions -- +# Read the version information from the RELEASE_INFO file. +def get_wiredtiger_versions(wt_dir): + v = {} + for l in open(os.path.join(wt_dir, 'RELEASE_INFO')): + if re.match(r'WIREDTIGER_VERSION_(?:MAJOR|MINOR|PATCH)=', l): + exec(l, v) + wt_ver = '%d.%d' % (v['WIREDTIGER_VERSION_MAJOR'], + v['WIREDTIGER_VERSION_MINOR']) + wt_full_ver = wt_ver + '.%d' % (v['WIREDTIGER_VERSION_PATCH']) + return (wt_ver, wt_full_ver) + +# get_library_dirs +# Build a plausible set of library directories. 
+def get_library_dirs(): + dirs = [] + dirs.append("/usr/local/lib") + dirs.append("/usr/local/lib64") + dirs.append("/lib/x86_64-linux-gnu") + dirs.append("/opt/local/lib") + dirs.append("/usr/lib") + dirs.append("/usr/lib64") + for path in ['LD_LIBRARY_PATH', 'DYLD_LIBRARY_PATH', 'LIBRARY_PATH']: + if path in os.environ: + dirs.extend(os.environ[path].split(':')) + dirs = list(set(filter(os.path.isdir, dirs))) + return dirs + +# source_filter +# Make any needed changes to the sources list. Any entry that +# needs to be moved is returned in a dictionary. +def source_filter(sources): + result = [] + movers = dict() + py_dir = os.path.join('lang', 'python') + pywt_dir = os.path.join(py_dir, 'wiredtiger') + pywt_prefix = pywt_dir + os.path.sep + for f in sources: + if not re.match(source_regex, f): + continue + src = f + dest = f + # move all lang/python files to the top level. + if dest.startswith(pywt_prefix): + dest = os.path.basename(dest) + if dest == 'pip_init.py': + dest = '__init__.py' + if dest != src: + movers[dest] = src + result.append(dest) + # Add SWIG generated files + result.append('wiredtiger.py') + movers['wiredtiger.py'] = os.path.join(pywt_dir, '__init__.py') + result.append(os.path.join(py_dir, 'wiredtiger_wrap.c')) + return result, movers + +################################################################ +# Do some initial setup and checks. 
+this_abs_script = os.path.abspath(__file__) +this_dir = os.path.dirname(this_abs_script) +pip_command = None +for arg in sys.argv[1:]: + if arg[0] != '-' and pip_command == None: + pip_command = arg + break + +if this_dir.endswith(os.sep + os.path.join('lang', 'python')): + wt_dir = os.path.dirname(os.path.dirname(this_dir)) + os.chdir(wt_dir) +elif os.path.isfile(os.path.join(this_dir, 'LICENSE')): + wt_dir = this_dir +else: + die('running from an unknown directory') + +python3 = (sys.version_info[0] > 2) +if python3: + die('Python3 is not yet supported') + +# Ensure that Extensions won't be built for 32 bit, +# that won't work with WiredTiger. +if sys.maxsize < 2**32: + die('need to be running on a 64 bit system, and have a 64 bit Python') + +python_rel_dir = os.path.join('lang', 'python') +build_dir = os.path.join(wt_dir, 'build_posix') +makefile = os.path.join(build_dir, 'Makefile') +built_sentinal = os.path.join(build_dir, 'built.txt') +conf_make_dir = 'build_posix' +wt_swig_lib_name = os.path.join(python_rel_dir, '_wiredtiger.so') + +################################################################ +# Put together build options for the WiredTiger extension. +short_description = 'high performance, scalable, production quality, ' + \ + 'NoSQL, Open Source extensible platform for data management' +long_description = 'WiredTiger is a ' + short_description + '.\n\n' + \ + open(os.path.join(wt_dir, 'README')).read() + +wt_ver, wt_full_ver = get_wiredtiger_versions(wt_dir) +build_path = get_build_path() + +# We only need a small set of directories to build a WT library, +# we also include any files at the top level. +source_regex = r'^(?:(?:api|build_posix|ext|lang/python|src|dist)/|[^/]*$)' + +# The builtins that we include in this distribution. 
+builtins = [ + # [ name, libname, instructions ] + [ 'snappy', 'snappy', + 'Note: a suitable version of snappy can be found at\n' + \ + ' https://github.com/google/snappy/releases/download/' + \ + '1.1.3/snappy-1.1.3.tar.gz\n' + \ + 'It can be installed via: yum install snappy snappy-devel' + \ + 'or via: apt-get install libsnappy-dev' ], + [ 'zlib', 'z', + 'Need to install zlib\n' + \ + 'It can be installed via: apt-get install zlib1g' ] +] +builtin_names = [b[0] for b in builtins] +builtin_libraries = [b[1] for b in builtins] + +# Here's the configure/make operations we perform before the python extension +# is linked. +configure_cmds = [ + './makemake --clean-and-make', + './reconf', + # force building a position independent library; it will be linked + # into a single shared library with the SWIG interface code. + 'CFLAGS="${CFLAGS:-} -fPIC -DPIC" ' + \ + '../configure --enable-python --with-builtins=' + ','.join(builtin_names) +] + +# build all the builtins, at the moment they are all compressors. +make_cmds = [] +for name in builtin_names: + make_cmds.append('(cd ext/compressors/' + name + '/; make)') +make_cmds.append('make libwiredtiger.la') + +inc_paths = [ os.path.join(build_dir, 'src', 'include'), build_dir, '.' ] +lib_paths = [ '.' ] # wiredtiger.so is moved into the top level directory + +check_needed_dependencies(builtins, inc_paths, lib_paths) + +cppflags, cflags, ldflags = get_compile_flags(inc_paths, lib_paths) + +# If we are creating a source distribution, create a staging directory +# with just the right sources. Put the result in the python dist directory. 
+if pip_command == 'sdist': + sources, movers = source_filter(get_sources_curdir()) + stage_dir = os.path.join(python_rel_dir, 'stage') + shutil.rmtree(stage_dir, True) + os.makedirs(stage_dir) + shutil.copy2(this_abs_script, os.path.join(stage_dir, 'setup.py')) + for f in sources: + d = os.path.join(stage_dir, os.path.dirname(f)) + if not os.path.isdir(d): + os.makedirs(d) + if f in movers: + src = movers[f] + else: + src = f + # Symlinks are not followed in setup, we need to use real files. + shutil.copy2(src, os.path.join(stage_dir, f)) + os.chdir(stage_dir) + sys.argv.append('--dist-dir=' + os.path.join('..', 'dist')) +else: + sources = [ os.path.join(python_rel_dir, 'wiredtiger_wrap.c') ] + +wt_ext = Extension('_wiredtiger', + sources = sources, + extra_compile_args = cflags + cppflags, + extra_link_args = ldflags, + libraries = builtin_libraries, + extra_objects = [ os.path.join(build_dir, '.libs', 'libwiredtiger.a') ], + include_dirs = inc_paths, + library_dirs = lib_paths, +) +extensions = [ wt_ext ] +env = { "CFLAGS" : ' '.join(cflags), + "CPPFLAGS" : ' '.join(cppflags), + "LDFLAGS" : ' '.join(ldflags), + "PATH" : build_path } + +class BinaryDistribution(Distribution): + def is_pure(self): + return False + +class WTInstall(setuptools.command.install.install): + def run(self): + self.run_command("build_ext") + return setuptools.command.install.install.run(self) + +class WTBuildExt(setuptools.command.build_ext.build_ext): + def __init__(self, *args, **kwargs): + setuptools.command.build_ext.build_ext.__init__(self, *args, **kwargs) + + def run(self): + # only run this once + if not os.path.isfile(built_sentinal): + try: + os.remove(makefile) + except OSError: + pass + self.execute( + lambda: build_commands(configure_cmds, conf_make_dir, env), [], + 'wiredtiger configure') + if not os.path.isfile(makefile): + die('configure failed, file does not exist: ' + makefile) + self.execute( + lambda: build_commands(make_cmds, conf_make_dir, env), [], + 'wiredtiger 
make') + open(built_sentinal, 'a').close() + return setuptools.command.build_ext.build_ext.run(self) + +setup( + name = 'wiredtiger', + version = wt_full_ver, + author = 'The WiredTiger Development Team, part of MongoDB', + author_email = 'info@wiredtiger.com', + description = short_description, + license='GPL2,GPL3,Commercial', + long_description = long_description, + url = 'http://source.wiredtiger.com/', + keywords = 'scalable NoSQL database datastore engine open source', + packages = ['wiredtiger'], + ext_package = 'wiredtiger', + ext_modules = extensions, + include_package_data = True, + distclass = BinaryDistribution, + package_dir = { 'wiredtiger' : '.' }, + cmdclass = { 'install': WTInstall, 'build_ext': WTBuildExt }, + package_data = { + 'wiredtiger' : [ wt_swig_lib_name, '*.py' ] + }, + classifiers=[ + 'Intended Audience :: Developers', + 'Programming Language :: C', + 'Programming Language :: C++', + 'Programming Language :: Python', + 'Programming Language :: Java', + 'Operating System :: MacOS :: MacOS X', + 'Operating System :: POSIX', + 'Operating System :: POSIX :: BSD', + 'Operating System :: POSIX :: Linux', + 'Operating System :: POSIX :: SunOS/Solaris', + ] +) + +if pip_command == 'sdist': + shutil.rmtree(os.path.join(this_dir, 'stage')) diff --git a/lang/python/wiredtiger/pip_init.py b/lang/python/wiredtiger/pip_init.py new file mode 100644 index 00000000000..d59c8218976 --- /dev/null +++ b/lang/python/wiredtiger/pip_init.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. 
+# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +# pip_init.py +# This is installed as __init__.py, and imports the file created by SWIG. +# This is needed because SWIG's import helper code created by certain SWIG +# versions may be broken, see: https://github.com/swig/swig/issues/769 . +# Importing indirectly seems to avoid these issues. +import os, sys +fname = os.path.basename(__file__) +if fname != '__init__.py' and fname != '__init__.pyc': + print(__file__ + ': this file is not yet installed') + sys.exit(1) + +# After importing the SWIG-generated file, copy all symbols from it +# to this module so they will appear in the wiredtiger namespace.
+me = sys.modules[__name__] +sys.path.append(os.path.dirname(__file__)) # needed for Python3 +import wiredtiger +for name in dir(wiredtiger): + value = getattr(wiredtiger, name) + setattr(me, name, value) -- cgit v1.2.1 From 360b43b33170a89587a737988477d0619008ec2a Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 16 Mar 2017 23:50:25 -0400 Subject: WT-3216 changes suggested by clang-tidy (#3328) --- dist/api_err.py | 2 +- src/async/async_api.c | 10 ++-- src/block/block_addr.c | 2 +- src/block/block_ext.c | 11 +++-- src/block/block_read.c | 2 +- src/block/block_vrfy.c | 2 +- src/btree/bt_cursor.c | 12 +---- src/btree/bt_debug.c | 2 +- src/btree/bt_io.c | 9 ++-- src/btree/bt_read.c | 6 ++- src/btree/bt_split.c | 6 +-- src/btree/bt_sync.c | 6 ++- src/btree/bt_vrfy_dsk.c | 8 ++-- src/config/config_api.c | 10 ++-- src/conn/conn_sweep.c | 2 +- src/cursor/cur_join.c | 5 +- src/cursor/cur_json.c | 87 +++++++++++++++++----------------- src/cursor/cur_metadata.c | 4 +- src/cursor/cur_stat.c | 1 - src/cursor/cur_table.c | 4 +- src/evict/evict_lru.c | 12 +++-- src/include/api.h | 11 +++-- src/include/bitstring.i | 2 +- src/include/btmem.h | 16 +++---- src/include/cell.i | 16 +++---- src/include/column.i | 4 +- src/include/connection.h | 4 +- src/include/cursor.h | 8 ++-- src/include/dhandle.h | 12 ++--- src/include/intpack.i | 30 ++++++------ src/include/lint.h | 14 +++--- src/include/log.h | 10 ++-- src/include/misc.h | 18 +++---- src/include/mutex.i | 4 +- src/include/os.h | 2 +- src/include/packing.i | 44 ++++++++--------- src/include/schema.h | 7 +-- src/include/session.h | 2 +- src/include/stat.h | 2 +- src/include/wiredtiger.in | 18 +++---- src/log/log.c | 14 ++---- src/lsm/lsm_cursor.c | 13 ++++-- src/lsm/lsm_merge.c | 2 +- src/lsm/lsm_work_unit.c | 8 ++-- src/reconcile/rec_write.c | 17 ++----- src/support/crypto.c | 1 - src/txn/txn_ckpt.c | 1 - src/txn/txn_recover.c | 2 +- src/utilities/util_dump.c | 11 +++-- src/utilities/util_load.c | 4 +- 
src/utilities/util_main.c | 117 ++++++++++++++++++++++++---------------------- 51 files changed, 304 insertions(+), 313 deletions(-) diff --git a/dist/api_err.py b/dist/api_err.py index 82f961a4ac9..bd379ac8d70 100644 --- a/dist/api_err.py +++ b/dist/api_err.py @@ -82,7 +82,7 @@ for line in open('../src/include/wiredtiger.in', 'r'): ''.join('\n * ' + l for l in textwrap.wrap( textwrap.dedent(err.long_desc).strip(), 77)) + '\n' if err.long_desc else '')) - tfile.write('#define\t%s\t%d\n' % (err.name, err.value)) + tfile.write('#define\t%s\t(%d)\n' % (err.name, err.value)) if 'undoc' in err.flags: tfile.write('/*! @endcond */\n') tfile.write('/*\n') diff --git a/src/async/async_api.c b/src/async/async_api.c index 026a008188c..b9cc995f5a5 100644 --- a/src/async/async_api.c +++ b/src/async/async_api.c @@ -338,17 +338,15 @@ __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]) * 2. If async is off, and the user wants it on, start it. * 3. If not a toggle and async is off, we're done. 
*/ - if (conn->async_cfg && !run) { - /* Case 1 */ + if (conn->async_cfg && !run) { /* Case 1 */ WT_TRET(__wt_async_flush(session)); ret = __wt_async_destroy(session); conn->async_cfg = false; return (ret); - } else if (!conn->async_cfg && run) - /* Case 2 */ + } + if (!conn->async_cfg && run) /* Case 2 */ return (__async_start(session)); - else if (!conn->async_cfg) - /* Case 3 */ + if (!conn->async_cfg) /* Case 3 */ return (0); /* diff --git a/src/block/block_addr.c b/src/block/block_addr.c index 580316bdfc6..a67efca62a3 100644 --- a/src/block/block_addr.c +++ b/src/block/block_addr.c @@ -226,7 +226,7 @@ __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, ci->discard.offset, ci->discard.size, ci->discard.checksum)); a = (uint64_t)ci->file_size; WT_RET(__wt_vpack_uint(pp, 0, a)); - a = (uint64_t)ci->ckpt_size; + a = ci->ckpt_size; WT_RET(__wt_vpack_uint(pp, 0, a)); return (0); diff --git a/src/block/block_ext.c b/src/block/block_ext.c index 26acc8c560f..e9357d73d1d 100644 --- a/src/block/block_ext.c +++ b/src/block/block_ext.c @@ -634,11 +634,11 @@ __wt_block_off_free( */ if ((ret = __wt_block_off_remove_overlap( session, block, &block->live.alloc, offset, size)) == 0) - ret = __block_merge(session, block, - &block->live.avail, offset, (wt_off_t)size); + ret = __block_merge( + session, block, &block->live.avail, offset, size); else if (ret == WT_NOTFOUND) - ret = __block_merge(session, block, - &block->live.discard, offset, (wt_off_t)size); + ret = __block_merge( + session, block, &block->live.discard, offset, size); return (ret); } @@ -1247,7 +1247,8 @@ __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_DECL_RET; WT_EXT *ext; WT_PAGE_HEADER *dsk; - size_t entries, size; + uint32_t entries; + size_t size; uint8_t *p; WT_RET(__block_extlist_dump(session, block, el, "write")); diff --git a/src/block/block_read.c b/src/block/block_read.c index 869a92b6ae1..8d4aec7df75 100644 --- a/src/block/block_read.c +++ b/src/block/block_read.c @@ -39,7 +39,7 @@ 
__wt_bm_preload( (uint8_t *)bm->map + offset, size, bm->mapped_cookie); if (!mapped && handle->fh_advise != NULL) ret = handle->fh_advise(handle, (WT_SESSION *)session, - (wt_off_t)offset, (wt_off_t)size, WT_FILE_HANDLE_WILLNEED); + offset, (wt_off_t)size, WT_FILE_HANDLE_WILLNEED); if (ret != EBUSY && ret != ENOTSUP) return (ret); diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c index 94824ad19f8..154765ed079 100644 --- a/src/block/block_vrfy.c +++ b/src/block/block_vrfy.c @@ -22,7 +22,7 @@ static int __verify_set_file_size(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *); ((off) / (block)->allocsize - 1) #ifdef HAVE_VERBOSE #define WT_FRAG_TO_OFF(block, frag) \ - (((wt_off_t)(frag + 1)) * (block)->allocsize) + (((wt_off_t)((frag) + 1)) * (block)->allocsize) #endif /* diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index d6dc0991d3f..48ae1ad6d76 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -1108,11 +1108,7 @@ retry: WT_RET(__wt_btcur_search(start)); WT_ASSERT(session, F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); - /* - * Reset ret each time through so that we don't loop forever in - * the cursor equals case. - */ - for (ret = 0;;) { + for (;;) { if ((ret = rmfunc(session, start, 1)) != 0) break; @@ -1176,11 +1172,7 @@ retry: WT_RET(__wt_btcur_search(start)); WT_ASSERT(session, F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); - /* - * Reset ret each time through so that we don't loop forever in - * the cursor equals case. 
- */ - for (ret = 0;;) { + for (;;) { value = (const uint8_t *)start->iface.value.data; if (*value != 0 && (ret = rmfunc(session, start, 1)) != 0) diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index d664da2ebd3..4989301468f 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -34,7 +34,7 @@ static const /* Output separator */ static int __debug_cell(WT_DBG *, const WT_PAGE_HEADER *, WT_CELL_UNPACK *); static int __debug_cell_data( - WT_DBG *, WT_PAGE *, int type, const char *, WT_CELL_UNPACK *); + WT_DBG *, WT_PAGE *, int, const char *, WT_CELL_UNPACK *); static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool); static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *); static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *); diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c index a8645f79dbe..b5e4d52394a 100644 --- a/src/btree/bt_io.c +++ b/src/btree/bt_io.c @@ -183,7 +183,7 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t dst_len, len, result_len, size, src_len; int compression_failed; /* Extension API, so not a bool. */ uint8_t *dst, *src; - bool data_checksum, encrypted; + bool data_checksum, encrypted, timer; btree = S2BT(session); bm = btree->bm; @@ -216,7 +216,7 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, &result_len)); WT_ASSERT(session, dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP); - ctmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP; + ctmp->size = result_len + WT_BLOCK_COMPRESS_SKIP; ip = ctmp; } else { WT_ASSERT(session, dsk->mem_size == buf->size); @@ -357,7 +357,8 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, data_checksum = !compressed; break; } - if (!F_ISSET(session, WT_SESSION_INTERNAL)) + timer = !F_ISSET(session, WT_SESSION_INTERNAL); + if (timer) __wt_epoch(session, &start); /* Call the block manager to write the block. 
*/ @@ -367,7 +368,7 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, bm, session, ip, addr, addr_sizep, data_checksum, checkpoint_io)); /* Update some statistics now that the write is done */ - if (!F_ISSET(session, WT_SESSION_INTERNAL)) { + if (timer) { __wt_epoch(session, &stop); WT_STAT_CONN_INCR(session, cache_write_app_count); WT_STAT_CONN_INCRV(session, cache_write_app_time, diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index e87ddc082f2..b170a9fb900 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -369,6 +369,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) size_t addr_size; uint32_t previous_state; const uint8_t *addr; + bool timer; btree = S2BT(session); page = NULL; @@ -408,10 +409,11 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) * There's an address, read or map the backing disk page and build an * in-memory version of the page. */ - if (!F_ISSET(session, WT_SESSION_INTERNAL)) + timer = !F_ISSET(session, WT_SESSION_INTERNAL); + if (timer) __wt_epoch(session, &start); WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); - if (!F_ISSET(session, WT_SESSION_INTERNAL)) { + if (timer) { __wt_epoch(session, &stop); WT_STAT_CONN_INCR(session, cache_read_app_count); WT_STAT_CONN_INCRV(session, cache_read_app_time, diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 6b2100ec7e3..b1bad760826 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -10,8 +10,8 @@ #define WT_MEM_TRANSFER(from_decr, to_incr, len) do { \ size_t __len = (len); \ - from_decr += __len; \ - to_incr += __len; \ + (from_decr) += __len; \ + (to_incr) += __len; \ } while (0) /* @@ -119,7 +119,7 @@ __wt_split_stash_discard(WT_SESSION_IMPL *session) ++i, ++stash) { if (stash->p == NULL) continue; - else if (stash->split_gen >= oldest) + if (stash->split_gen >= oldest) break; /* * It's a bad thing if another thread is in this memory after diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 7bf15baa67f..cdb27752fb7 100644 --- 
a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -78,6 +78,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_pinned_id; uint32_t flags; + bool timer; conn = S2C(session); btree = S2BT(session); @@ -88,7 +89,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; - if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) + timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT); + if (timer) __wt_epoch(session, &start); switch (syncop) { @@ -242,7 +244,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) break; } - if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { + if (timer) { __wt_epoch(session, &end); __wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote: %" PRIu64 diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c index 3a6fd8261ba..a4071c44aee 100644 --- a/src/btree/bt_vrfy_dsk.c +++ b/src/btree/bt_vrfy_dsk.c @@ -203,7 +203,8 @@ __verify_dsk_row( WT_ITEM *last; enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; void *huffman; - uint32_t cell_num, cell_type, i, key_cnt, prefix; + size_t prefix; + uint32_t cell_num, cell_type, i, key_cnt; uint8_t *end; int cmp; @@ -343,8 +344,9 @@ __verify_dsk_row( if (cell_num > 1 && prefix > last->size) WT_ERR_VRFY(session, "key %" PRIu32 " on page at %s has a prefix " - "compression count of %" PRIu32 ", larger than " - "the length of the previous key, %" WT_SIZET_FMT, + "compression count of %" WT_SIZET_FMT + ", larger than the length of the previous key, %" + WT_SIZET_FMT, cell_num, tag, prefix, last->size); /* diff --git a/src/config/config_api.c b/src/config/config_api.c index 05c5c1287a7..9f70ba65e9b 100644 --- a/src/config/config_api.c +++ b/src/config/config_api.c @@ -215,7 +215,7 @@ __wt_configure_method(WT_SESSION_IMPL *session, WT_CONFIG_ENTRY *entry; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - size_t cnt; + 
size_t cnt, len; char *newcheck_name, *p; /* @@ -276,11 +276,9 @@ __wt_configure_method(WT_SESSION_IMPL *session, */ WT_ERR(__wt_calloc_one(session, &entry)); entry->method = (*epp)->method; - WT_ERR(__wt_calloc_def(session, - strlen((*epp)->base) + strlen(",") + strlen(config) + 1, &p)); - (void)strcpy(p, (*epp)->base); - (void)strcat(p, ","); - (void)strcat(p, config); + len = strlen((*epp)->base) + strlen(",") + strlen(config) + 1; + WT_ERR(__wt_calloc_def(session, len, &p)); + snprintf(p, len, "%s,%s", (*epp)->base, config); entry->base = p; /* diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 8c186c63939..22d90b08438 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -10,7 +10,7 @@ #define WT_DHANDLE_CAN_DISCARD(dhandle) \ (!F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN) && \ - dhandle->session_inuse == 0 && dhandle->session_ref == 0) + (dhandle)->session_inuse == 0 && (dhandle)->session_ref == 0) /* * __sweep_mark -- diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c index 013a64ef2d5..8df8e201173 100644 --- a/src/cursor/cur_join.c +++ b/src/cursor/cur_join.c @@ -270,7 +270,7 @@ again: iter->positioned = true; return (ret); } - else if (ret == WT_NOTFOUND) { + if (ret == WT_NOTFOUND) { WT_RET(__curjoin_iter_close_all(iter->child)); entry->subjoin->iter = NULL; iter->child = NULL; @@ -518,8 +518,7 @@ __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, } if (disjunction && end == endmax) return (WT_NOTFOUND); - else - return (0); + return (0); } typedef struct { diff --git a/src/cursor/cur_json.c b/src/cursor/cur_json.c index 5870d14273e..0ad3c4f4201 100644 --- a/src/cursor/cur_json.c +++ b/src/cursor/cur_json.c @@ -23,20 +23,20 @@ static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *, bool, const char *, size_t *); #define WT_PACK_JSON_GET(session, pv, jstr) do { \ - switch (pv.type) { \ + switch ((pv).type) { \ case 'x': \ break; \ case 's': \ case 'S': \ - 
WT_RET(json_string_arg(session, &jstr, &pv.u.item)); \ - pv.type = pv.type == 's' ? 'j' : 'J'; \ + WT_RET(json_string_arg(session, &(jstr), &(pv).u.item));\ + (pv).type = (pv).type == 's' ? 'j' : 'J'; \ break; \ case 'b': \ case 'h': \ case 'i': \ case 'l': \ case 'q': \ - WT_RET(json_int_arg(session, &jstr, &pv.u.i)); \ + WT_RET(json_int_arg(session, &(jstr), &(pv).u.i)); \ break; \ case 'B': \ case 'H': \ @@ -46,11 +46,11 @@ static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *, case 'r': \ case 'R': \ case 't': \ - WT_RET(json_uint_arg(session, &jstr, &pv.u.u)); \ + WT_RET(json_uint_arg(session, &(jstr), &(pv).u.u)); \ break; \ case 'u': \ - WT_RET(json_string_arg(session, &jstr, &pv.u.item)); \ - pv.type = 'K'; \ + WT_RET(json_string_arg(session, &(jstr), &(pv).u.item));\ + (pv).type = 'K'; \ break; \ /* User format strings have already been validated. */ \ WT_ILLEGAL_VALUE(session); \ @@ -304,7 +304,6 @@ __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor) __wt_free(session, json->value_buf); __wt_free(session, json); } - return; } /* @@ -323,33 +322,32 @@ __wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode) if (bufsz >= 1) *buf = ch; return (1); - } else { - abbrev = '\0'; - switch (ch) { - case '\\': - case '"': - abbrev = ch; - break; - case '\f': - abbrev = 'f'; - break; - case '\n': - abbrev = 'n'; - break; - case '\r': - abbrev = 'r'; - break; - case '\t': - abbrev = 't'; - break; - } - if (abbrev != '\0') { - if (bufsz >= 2) { - *buf++ = '\\'; - *buf = abbrev; - } - return (2); + } + abbrev = '\0'; + switch (ch) { + case '\\': + case '"': + abbrev = ch; + break; + case '\f': + abbrev = 'f'; + break; + case '\n': + abbrev = 'n'; + break; + case '\r': + abbrev = 'r'; + break; + case '\t': + abbrev = 't'; + break; + } + if (abbrev != '\0') { + if (bufsz >= 2) { + *buf++ = '\\'; + *buf = abbrev; } + return (2); } } if (bufsz >= 6) { @@ -421,16 +419,16 @@ __wt_json_column_init(WT_CURSOR *cursor, 
const char *uri, const char *keyformat, #define MATCH_KEYWORD(session, in, result, keyword, matchval) do { \ size_t _kwlen = strlen(keyword); \ if (strncmp(in, keyword, _kwlen) == 0 && \ - !__wt_isalnum((u_char)in[_kwlen])) { \ - in += _kwlen; \ - result = matchval; \ + !__wt_isalnum((u_char)(in)[_kwlen])) { \ + (in) += _kwlen; \ + (result) = matchval; \ } else { \ - const char *_bad = in; \ - while (__wt_isalnum((u_char)*in)) \ - in++; \ + const char *_bad = (in); \ + while (__wt_isalnum((u_char)*(in))) \ + (in)++; \ WT_RET_MSG(session, EINVAL, \ "unknown keyword \"%.*s\" in JSON", \ - (int)(in - _bad), _bad); \ + (int)((in) - _bad), _bad); \ } \ } while (0) @@ -692,12 +690,13 @@ json_uint_arg(WT_SESSION_IMPL *session, const char **jstr, uint64_t *up) #define JSON_EXPECT_TOKEN_GET(session, jstr, tokval, start, sz) do { \ int __tok; \ - WT_RET(__wt_json_token((WT_SESSION *)session, jstr, &__tok, &start, &sz));\ - if (__tok != tokval) \ + WT_RET(__wt_json_token( \ + (WT_SESSION *)(session), jstr, &__tok, &(start), &(sz))); \ + if (__tok != (tokval)) \ WT_RET_MSG(session, EINVAL, \ "expected JSON %s, got %s", \ __wt_json_tokname(tokval), __wt_json_tokname(__tok)); \ - jstr = start + sz; \ + (jstr) = (start) + (sz); \ } while (0) #define JSON_EXPECT_TOKEN(session, jstr, tokval) do { \ diff --git a/src/cursor/cur_metadata.c b/src/cursor/cur_metadata.c index 10e2fdf28be..fbfc73956e2 100644 --- a/src/cursor/cur_metadata.c +++ b/src/cursor/cur_metadata.c @@ -16,7 +16,7 @@ WT_CURSOR_NEEDKEY(cursor); \ WT_ERR(__wt_buf_set(session, \ &((WT_CURSOR_METADATA *)(cursor))->file_cursor->key, \ - cursor->key.data, cursor->key.size)); \ + (cursor)->key.data, (cursor)->key.size)); \ F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \ WT_CURSTD_KEY_EXT); \ } while (0) @@ -25,7 +25,7 @@ WT_CURSOR_NEEDVALUE(cursor); \ WT_ERR(__wt_buf_set(session, \ &((WT_CURSOR_METADATA *)(cursor))->file_cursor->value, \ - cursor->value.data, cursor->value.size)); \ + (cursor)->value.data, 
(cursor)->value.size)); \ F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \ WT_CURSTD_VALUE_EXT); \ } while (0) diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index 5fde64c74ca..c5ccdb1b649 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -163,7 +163,6 @@ static void __curstat_set_value(WT_CURSOR *cursor, ...) { WT_UNUSED(cursor); - return; } /* diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 72eec177449..ef2c0ac5163 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -14,8 +14,8 @@ static int __curtable_update(WT_CURSOR *cursor); #define APPLY_CG(ctable, f) do { \ WT_CURSOR **__cp; \ u_int __i; \ - for (__i = 0, __cp = ctable->cg_cursors; \ - __i < WT_COLGROUPS(ctable->table); \ + for (__i = 0, __cp = (ctable)->cg_cursors; \ + __i < WT_COLGROUPS((ctable)->table); \ __i++, __cp++) \ WT_TRET((*__cp)->f(*__cp)); \ } while (0) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 6863533acfb..84c9990832d 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -934,7 +934,6 @@ __evict_tune_workers(WT_SESSION_IMPL *session) cache = conn->cache; WT_ASSERT(session, conn->evict_threads.threads[0]->session == session); - pgs_evicted_persec_cur = 0; if (conn->evict_tune_stable) return (0); @@ -966,7 +965,8 @@ __evict_tune_workers(WT_SESSION_IMPL *session) pgs_evicted_persec_cur = (delta_pages * WT_THOUSAND) / delta_msec; conn->evict_tune_num_points++; - /* Keep track of the maximum eviction throughput seen and the number + /* + * Keep track of the maximum eviction throughput seen and the number * of workers corresponding to that throughput. 
*/ if (pgs_evicted_persec_cur > conn->evict_tune_pg_sec_max) { @@ -2116,6 +2116,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; uint64_t init_evict_count, max_pages_evicted; + bool timer; conn = S2C(session); cache = conn->cache; @@ -2136,7 +2137,9 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) __wt_evict_server_wake(session); /* Track how long application threads spend doing eviction. */ - if (WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL)) + timer = + WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL); + if (timer) __wt_epoch(session, &enter); for (init_evict_count = cache->pages_evict;; ret = 0) { @@ -2202,8 +2205,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) } } -err: if (WT_STAT_ENABLED(session) && - !F_ISSET(session, WT_SESSION_INTERNAL)) { +err: if (timer) { __wt_epoch(session, &leave); WT_STAT_CONN_INCRV(session, application_cache_time, WT_TIMEDIFF_US(leave, enter)); diff --git a/src/include/api.h b/src/include/api.h index 1fa777ed5cc..a3636eb8040 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -19,7 +19,7 @@ __wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n) #define API_CALL(s, h, n, dh, config, cfg) do { \ - const char *cfg[] = \ + const char *(cfg)[] = \ { WT_CONFIG_BASE(s, h##_##n), config, NULL }; \ API_SESSION_INIT(s, h, n, dh); \ WT_ERR(WT_SESSION_CHECK_PANIC(s)); \ @@ -62,15 +62,16 @@ if (__autotxn) { \ if (F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT)) \ F_CLR(&(s)->txn, WT_TXN_AUTOCOMMIT); \ - else if (ret == 0 && !F_ISSET(&(s)->txn, WT_TXN_ERROR)) \ - ret = __wt_txn_commit((s), NULL); \ + else if ((ret) == 0 && \ + !F_ISSET(&(s)->txn, WT_TXN_ERROR)) \ + (ret) = __wt_txn_commit((s), NULL); \ else { \ if (retry) \ WT_TRET(__wt_session_copy_values(s)); \ WT_TRET(__wt_txn_rollback((s), NULL)); \ - if ((ret == 0 || ret == WT_ROLLBACK) && \ + if (((ret) 
== 0 || (ret) == WT_ROLLBACK) && \ (retry)) { \ - ret = 0; \ + (ret) = 0; \ continue; \ } \ WT_TRET(__wt_session_reset_cursors(s, false)); \ diff --git a/src/include/bitstring.i b/src/include/bitstring.i index 08746beb9b9..118dc0bba01 100644 --- a/src/include/bitstring.i +++ b/src/include/bitstring.i @@ -230,7 +230,7 @@ __bit_getv(uint8_t *bitf, uint64_t entry, uint8_t width) #define __BIT_GET(len, mask) \ case len: \ if (__bit_test(bitf, bit)) \ - value |= mask; \ + value |= (mask); \ ++bit \ /* FALLTHROUGH */ diff --git a/src/include/btmem.h b/src/include/btmem.h index 39ca223aebf..f1bb08d2699 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -507,7 +507,7 @@ struct __wt_page { #define WT_INTL_INDEX_GET_SAFE(page) \ ((page)->u.intl.__index) #define WT_INTL_INDEX_GET(session, page, pindex) do { \ - WT_ASSERT(session, session->split_gen != 0); \ + WT_ASSERT(session, (session)->split_gen != 0); \ (pindex) = WT_INTL_INDEX_GET_SAFE(page); \ } while (0) #define WT_INTL_INDEX_SET(page, v) do { \ @@ -868,7 +868,7 @@ struct __wt_col { * Return the 0-based array offset based on a WT_COL reference. */ #define WT_COL_SLOT(page, cip) \ - ((uint32_t)(((WT_COL *)cip) - (page)->pg_var)) + ((uint32_t)(((WT_COL *)(cip)) - (page)->pg_var)) /* * WT_IKEY -- @@ -977,10 +977,10 @@ struct __wt_insert { } key; } u; -#define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)ins)->u.key.size) +#define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)(ins))->u.key.size) #define WT_INSERT_KEY(ins) \ - ((void *)((uint8_t *)(ins) + ((WT_INSERT *)ins)->u.key.offset)) -#define WT_INSERT_RECNO(ins) (((WT_INSERT *)ins)->u.recno) + ((void *)((uint8_t *)(ins) + ((WT_INSERT *)(ins))->u.key.offset)) +#define WT_INSERT_RECNO(ins) (((WT_INSERT *)(ins))->u.recno) WT_INSERT *next[0]; /* forward-linked skip list */ }; @@ -989,9 +989,9 @@ struct __wt_insert { * Skiplist helper macros. */ #define WT_SKIP_FIRST(ins_head) \ - (((ins_head) == NULL) ? 
NULL : ((WT_INSERT_HEAD *)ins_head)->head[0]) + (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->head[0]) #define WT_SKIP_LAST(ins_head) \ - (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)ins_head)->tail[0]) + (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->tail[0]) #define WT_SKIP_NEXT(ins) ((ins)->next[0]) #define WT_SKIP_FOREACH(ins, ins_head) \ for ((ins) = WT_SKIP_FIRST(ins_head); \ @@ -1004,7 +1004,7 @@ struct __wt_insert { #define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \ if (((v) = (dest)) == NULL) { \ WT_ERR(__wt_calloc_def(s, count, &(v))); \ - if (__wt_atomic_cas_ptr(&dest, NULL, v)) \ + if (__wt_atomic_cas_ptr(&(dest), NULL, v)) \ __wt_cache_page_inmem_incr( \ s, page, (count) * sizeof(*(v))); \ else \ diff --git a/src/include/cell.i b/src/include/cell.i index c130768e595..71c2515daf0 100644 --- a/src/include/cell.i +++ b/src/include/cell.i @@ -361,14 +361,12 @@ __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size) cell->__chunk[0] = (uint8_t) ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT); return (1); - } else { - byte = (uint8_t)size; /* Type + length */ - cell->__chunk[0] = (uint8_t) - ((byte << WT_CELL_SHORT_SHIFT) | - WT_CELL_KEY_SHORT_PFX); - cell->__chunk[1] = prefix; /* Prefix */ - return (2); } + byte = (uint8_t)size; /* Type + length */ + cell->__chunk[0] = (uint8_t) + ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT_PFX); + cell->__chunk[1] = prefix; /* Prefix */ + return (2); } if (prefix == 0) { @@ -569,8 +567,8 @@ __wt_cell_unpack_safe( */ #define WT_CELL_LEN_CHK(t, len) do { \ if (start != NULL && \ - ((uint8_t *)t < (uint8_t *)start || \ - (((uint8_t *)t) + (len)) > (uint8_t *)end)) \ + ((uint8_t *)(t) < (uint8_t *)start || \ + (((uint8_t *)(t)) + (len)) > (uint8_t *)end)) \ return (WT_ERROR); \ } while (0) diff --git a/src/include/column.i b/src/include/column.i index c1b45a1f4e0..07b627315e6 100644 --- a/src/include/column.i +++ b/src/include/column.i @@ -108,7 +108,7 @@ 
__col_insert_search_match(WT_INSERT_HEAD *ins_head, uint64_t recno) /* Fast path the check for values at the end of the skiplist. */ if (recno > WT_INSERT_RECNO(ret_ins)) return (NULL); - else if (recno == WT_INSERT_RECNO(ret_ins)) + if (recno == WT_INSERT_RECNO(ret_ins)) return (ret_ins); /* @@ -127,7 +127,7 @@ __col_insert_search_match(WT_INSERT_HEAD *ins_head, uint64_t recno) if (cmp == 0) /* Exact match: return */ return (*insp); - else if (cmp > 0) /* Keep going at this level */ + if (cmp > 0) /* Keep going at this level */ insp = &(*insp)->next[i]; else { /* Drop down a level */ --i; diff --git a/src/include/connection.h b/src/include/connection.h index ce483d3291a..6c23492e926 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -127,7 +127,7 @@ struct __wt_named_extractor { F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \ TAILQ_INSERT_HEAD(&(conn)->dhqh, dhandle, q); \ TAILQ_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashq); \ - ++conn->dhandle_count; \ + ++(conn)->dhandle_count; \ } while (0) #define WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket) do { \ @@ -135,7 +135,7 @@ struct __wt_named_extractor { F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \ TAILQ_REMOVE(&(conn)->dhqh, dhandle, q); \ TAILQ_REMOVE(&(conn)->dhhash[bucket], dhandle, hashq); \ - --conn->dhandle_count; \ + --(conn)->dhandle_count; \ } while (0) /* diff --git a/src/include/cursor.h b/src/include/cursor.h index 31c8963a486..f32b4250d30 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -73,7 +73,7 @@ struct __wt_cursor_backup { #define WT_CURBACKUP_LOCKER 0x01 /* Hot-backup started */ uint8_t flags; }; -#define WT_CURSOR_BACKUP_ID(cursor) (((WT_CURSOR_BACKUP *)cursor)->maxid) +#define WT_CURSOR_BACKUP_ID(cursor) (((WT_CURSOR_BACKUP *)(cursor))->maxid) struct __wt_cursor_btree { WT_CURSOR iface; @@ -474,7 +474,7 @@ struct __wt_cursor_stat { * Return a reference to a statistic cursor's stats structures. 
*/ #define WT_CURSOR_STATS(cursor) \ - (((WT_CURSOR_STAT *)cursor)->stats) + (((WT_CURSOR_STAT *)(cursor))->stats) struct __wt_cursor_table { WT_CURSOR iface; @@ -493,7 +493,7 @@ struct __wt_cursor_table { }; #define WT_CURSOR_PRIMARY(cursor) \ - (((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]) + (((WT_CURSOR_TABLE *)(cursor))->cg_cursors[0]) #define WT_CURSOR_RECNO(cursor) WT_STREQ((cursor)->key_format, "r") @@ -550,4 +550,4 @@ struct __wt_cursor_table { } while (0) #define WT_CURSOR_RAW_OK \ - WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW + (WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW) diff --git a/src/include/dhandle.h b/src/include/dhandle.h index 4f318e7bccf..8861e96112b 100644 --- a/src/include/dhandle.h +++ b/src/include/dhandle.h @@ -38,20 +38,20 @@ (((WT_CURSOR_BTREE *)((s)->meta_cursor))->btree->dhandle) #define WT_DHANDLE_ACQUIRE(dhandle) \ - (void)__wt_atomic_add32(&dhandle->session_ref, 1) + (void)__wt_atomic_add32(&(dhandle)->session_ref, 1) #define WT_DHANDLE_RELEASE(dhandle) \ - (void)__wt_atomic_sub32(&dhandle->session_ref, 1) + (void)__wt_atomic_sub32(&(dhandle)->session_ref, 1) #define WT_DHANDLE_NEXT(session, dhandle, head, field) do { \ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));\ - if (dhandle == NULL) \ - dhandle = TAILQ_FIRST(head); \ + if ((dhandle) == NULL) \ + (dhandle) = TAILQ_FIRST(head); \ else { \ WT_DHANDLE_RELEASE(dhandle); \ - dhandle = TAILQ_NEXT(dhandle, field); \ + (dhandle) = TAILQ_NEXT(dhandle, field); \ } \ - if (dhandle != NULL) \ + if ((dhandle) != NULL) \ WT_DHANDLE_ACQUIRE(dhandle); \ } while (0) diff --git a/src/include/intpack.i b/src/include/intpack.i index e8bea58cede..a534de9d9a8 100644 --- a/src/include/intpack.i +++ b/src/include/intpack.i @@ -59,21 +59,21 @@ /* Count the leading zero bytes. */ #if defined(__GNUC__) #define WT_LEADING_ZEROS(x, i) \ - (i = (x == 0) ? (int)sizeof(x) : __builtin_clzll(x) >> 3) + ((i) = ((x) == 0) ? 
(int)sizeof(x) : __builtin_clzll(x) >> 3) #elif defined(_MSC_VER) #define WT_LEADING_ZEROS(x, i) do { \ - if (x == 0) i = (int)sizeof(x); \ + if ((x) == 0) (i) = (int)sizeof(x); \ else { \ unsigned long __index; \ _BitScanReverse64(&__index, x); \ __index = 63 ^ __index; \ - i = (int)(__index >> 3); } \ + (i) = (int)(__index >> 3); } \ } while (0) #else #define WT_LEADING_ZEROS(x, i) do { \ uint64_t __x = (x); \ uint64_t __m = (uint64_t)0xff << 56; \ - for (i = 0; !(__x & __m) && i != 8; i++) \ + for ((i) = 0; !(__x & __m) && (i) != 8; (i)++) \ __m >>= 8; \ } while (0) #endif @@ -231,7 +231,8 @@ __wt_vpack_int(uint8_t **pp, size_t maxlen, int64_t x) if (x < NEG_2BYTE_MIN) { *p = NEG_MULTI_MARKER; return (__wt_vpack_negint(pp, maxlen, (uint64_t)x)); - } else if (x < NEG_1BYTE_MIN) { + } + if (x < NEG_1BYTE_MIN) { WT_SIZE_CHECK_PACK(2, maxlen); x -= NEG_2BYTE_MIN; *p++ = NEG_2BYTE_MARKER | GET_BITS(x, 13, 8); @@ -358,12 +359,10 @@ __wt_vsize_uint(uint64_t x) { if (x <= POS_1BYTE_MAX) return (1); - else if (x <= POS_2BYTE_MAX + 1) { + if (x <= POS_2BYTE_MAX + 1) return (2); - } else { - x -= POS_2BYTE_MAX + 1; - return (__wt_vsize_posint(x)); - } + x -= POS_2BYTE_MAX + 1; + return (__wt_vsize_posint(x)); } /* @@ -373,13 +372,12 @@ __wt_vsize_uint(uint64_t x) static inline size_t __wt_vsize_int(int64_t x) { - if (x < NEG_2BYTE_MIN) { + if (x < NEG_2BYTE_MIN) return (__wt_vsize_negint((uint64_t)x)); - } else if (x < NEG_1BYTE_MIN) { + if (x < NEG_1BYTE_MIN) return (2); - } else if (x < 0) { + if (x < 0) return (1); - } else - /* For non-negative values, use the unsigned code above. */ - return (__wt_vsize_uint((uint64_t)x)); + /* For non-negative values, use the unsigned code above. 
*/ + return (__wt_vsize_uint((uint64_t)x)); } diff --git a/src/include/lint.h b/src/include/lint.h index e20a83144ee..2d0f47988b7 100644 --- a/src/include/lint.h +++ b/src/include/lint.h @@ -29,9 +29,9 @@ __wt_atomic_fetch_add##name(type *vp, type v) \ { \ type orig; \ \ - old = *vp; \ + orig = *vp; \ *vp += v; \ - return (old); \ + return (orig); \ } \ static inline ret \ __wt_atomic_store##name(type *vp, type v) \ @@ -40,7 +40,7 @@ __wt_atomic_store##name(type *vp, type v) \ \ orig = *vp; \ *vp = v; \ - return (old); \ + return (orig); \ } \ static inline ret \ __wt_atomic_sub##name(type *vp, type v) \ @@ -49,9 +49,9 @@ __wt_atomic_sub##name(type *vp, type v) \ return (*vp); \ } \ static inline bool \ -__wt_atomic_cas##name(type *vp, type old, type new) \ +__wt_atomic_cas##name(type *vp, type orig, type new) \ { \ - if (*vp == old) { \ + if (*vp == orig) { \ *vp = new; \ return (true); \ } \ @@ -75,8 +75,8 @@ WT_ATOMIC_FUNC(size, size_t, size_t) * Pointer compare and swap. */ static inline bool -__wt_atomic_cas_ptr(void *vp, void *old, void *new) { - if (*(void **)vp == old) { +__wt_atomic_cas_ptr(void *vp, void *orig, void *new) { + if (*(void **)vp == orig) { *(void **)vp = new; return (true); } diff --git a/src/include/log.h b/src/include/log.h index a6be3582b4d..f0999ba316b 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -86,8 +86,8 @@ union __wt_lsn { * The high bit is reserved for the special states. If the high bit is * set (WT_LOG_SLOT_RESERVED) then we are guaranteed to be in a special state. 
*/ -#define WT_LOG_SLOT_FREE -1 /* Not in use */ -#define WT_LOG_SLOT_WRITTEN -2 /* Slot data written, not processed */ +#define WT_LOG_SLOT_FREE (-1) /* Not in use */ +#define WT_LOG_SLOT_WRITTEN (-2) /* Slot data written, not processed */ /* * We allocate the buffer size, but trigger a slot switch when we cross @@ -144,8 +144,8 @@ union __wt_lsn { /* Slot is in use, but closed to new joins */ #define WT_LOG_SLOT_CLOSED(state) \ (WT_LOG_SLOT_ACTIVE(state) && \ - (FLD64_ISSET((uint64_t)state, WT_LOG_SLOT_CLOSE) && \ - !FLD64_ISSET((uint64_t)state, WT_LOG_SLOT_RESERVED))) + (FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) && \ + !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_RESERVED))) /* Slot is in use, all data copied into buffer */ #define WT_LOG_SLOT_INPROGRESS(state) \ (WT_LOG_SLOT_RELEASED(state) != WT_LOG_SLOT_JOINED(state)) @@ -185,7 +185,7 @@ struct __wt_logslot { #define WT_WITH_SLOT_LOCK(session, log, op) do { \ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); \ WT_WITH_LOCK_WAIT(session, \ - &log->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \ + &(log)->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \ } while (0) struct __wt_myslot { diff --git a/src/include/misc.h b/src/include/misc.h index 7aba397e173..9161a215fdc 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -63,7 +63,7 @@ #define WT_MAX(a, b) ((a) < (b) ? (b) : (a)) /* Elements in an array. */ -#define WT_ELEMENTS(a) (sizeof(a) / sizeof(a[0])) +#define WT_ELEMENTS(a) (sizeof(a) / sizeof((a)[0])) /* 10 level skip lists, 1/4 have a link to the next element. 
*/ #define WT_SKIP_MAXDEPTH 10 @@ -181,14 +181,14 @@ */ #define WT_BINARY_SEARCH(key, arrayp, n, found) do { \ uint32_t __base, __indx, __limit; \ - found = false; \ + (found) = false; \ for (__base = 0, __limit = (n); __limit != 0; __limit >>= 1) { \ __indx = __base + (__limit >> 1); \ - if ((arrayp)[__indx] < key) { \ + if ((arrayp)[__indx] < (key)) { \ __base = __indx + 1; \ --__limit; \ - } else if ((arrayp)[__indx] == key) { \ - found = true; \ + } else if ((arrayp)[__indx] == (key)) { \ + (found) = true; \ break; \ } \ } \ @@ -207,8 +207,8 @@ /* Check if a string matches a prefix. */ #define WT_PREFIX_MATCH(str, pfx) \ - (((const char *)(str))[0] == ((const char *)pfx)[0] && \ - strncmp((str), (pfx), strlen(pfx)) == 0) + (((const char *)(str))[0] == ((const char *)(pfx))[0] && \ + strncmp(str, pfx, strlen(pfx)) == 0) /* Check if a string matches a prefix, and move past it. */ #define WT_PREFIX_SKIP(str, pfx) \ @@ -225,8 +225,8 @@ /* Check if a string matches a byte string of len bytes. 
*/ #define WT_STRING_MATCH(str, bytes, len) \ - (((const char *)str)[0] == ((const char *)bytes)[0] && \ - strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0') + (((const char *)(str))[0] == ((const char *)(bytes))[0] && \ + strncmp(str, bytes, len) == 0 && (str)[len] == '\0') /* * Macro that produces a string literal that isn't wrapped in quotes, to avoid diff --git a/src/include/mutex.i b/src/include/mutex.i index 6b83cb280d3..640706284c3 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -316,6 +316,6 @@ __wt_spin_trylock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) stats = (int64_t **)S2C(session)->stats; stats[session->stat_bucket][t->stat_count_off]++; return (0); - } else - return (__wt_spin_trylock(session, t)); + } + return (__wt_spin_trylock(session, t)); } diff --git a/src/include/os.h b/src/include/os.h index 8505649a1fd..73d89268392 100644 --- a/src/include/os.h +++ b/src/include/os.h @@ -67,7 +67,7 @@ #define WT_TIMECMP(t1, t2) \ ((t1).tv_sec < (t2).tv_sec ? -1 : \ - (t1).tv_sec == (t2.tv_sec) ? \ + (t1).tv_sec == (t2).tv_sec ? \ (t1).tv_nsec < (t2).tv_nsec ? -1 : \ (t1).tv_nsec == (t2).tv_nsec ? 
0 : 1 : 1) diff --git a/src/include/packing.i b/src/include/packing.i index 8ba3dd536ac..d79afe6d4a2 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -206,43 +206,43 @@ next: if (pack->cur == pack->end) #define WT_PACK_GET(session, pv, ap) do { \ WT_ITEM *__item; \ - switch (pv.type) { \ + switch ((pv).type) { \ case 'x': \ break; \ case 's': \ case 'S': \ - pv.u.s = va_arg(ap, const char *); \ + (pv).u.s = va_arg(ap, const char *); \ break; \ case 'U': \ case 'u': \ __item = va_arg(ap, WT_ITEM *); \ - pv.u.item.data = __item->data; \ - pv.u.item.size = __item->size; \ + (pv).u.item.data = __item->data; \ + (pv).u.item.size = __item->size; \ break; \ case 'b': \ case 'h': \ case 'i': \ - pv.u.i = va_arg(ap, int); \ + (pv).u.i = va_arg(ap, int); \ break; \ case 'B': \ case 'H': \ case 'I': \ case 't': \ - pv.u.u = va_arg(ap, unsigned int); \ + (pv).u.u = va_arg(ap, unsigned int); \ break; \ case 'l': \ - pv.u.i = va_arg(ap, long); \ + (pv).u.i = va_arg(ap, long); \ break; \ case 'L': \ - pv.u.u = va_arg(ap, unsigned long); \ + (pv).u.u = va_arg(ap, unsigned long); \ break; \ case 'q': \ - pv.u.i = va_arg(ap, int64_t); \ + (pv).u.i = va_arg(ap, int64_t); \ break; \ case 'Q': \ case 'r': \ case 'R': \ - pv.u.u = va_arg(ap, uint64_t); \ + (pv).u.u = va_arg(ap, uint64_t); \ break; \ /* User format strings have already been validated. 
*/ \ WT_ILLEGAL_VALUE(session); \ @@ -556,47 +556,47 @@ __unpack_read(WT_SESSION_IMPL *session, #define WT_UNPACK_PUT(session, pv, ap) do { \ WT_ITEM *__item; \ - switch (pv.type) { \ + switch ((pv).type) { \ case 'x': \ break; \ case 's': \ case 'S': \ - *va_arg(ap, const char **) = pv.u.s; \ + *va_arg(ap, const char **) = (pv).u.s; \ break; \ case 'U': \ case 'u': \ __item = va_arg(ap, WT_ITEM *); \ - __item->data = pv.u.item.data; \ - __item->size = pv.u.item.size; \ + __item->data = (pv).u.item.data; \ + __item->size = (pv).u.item.size; \ break; \ case 'b': \ - *va_arg(ap, int8_t *) = (int8_t)pv.u.i; \ + *va_arg(ap, int8_t *) = (int8_t)(pv).u.i; \ break; \ case 'h': \ - *va_arg(ap, int16_t *) = (short)pv.u.i; \ + *va_arg(ap, int16_t *) = (short)(pv).u.i; \ break; \ case 'i': \ case 'l': \ - *va_arg(ap, int32_t *) = (int32_t)pv.u.i; \ + *va_arg(ap, int32_t *) = (int32_t)(pv).u.i; \ break; \ case 'q': \ - *va_arg(ap, int64_t *) = pv.u.i; \ + *va_arg(ap, int64_t *) = (pv).u.i; \ break; \ case 'B': \ case 't': \ - *va_arg(ap, uint8_t *) = (uint8_t)pv.u.u; \ + *va_arg(ap, uint8_t *) = (uint8_t)(pv).u.u; \ break; \ case 'H': \ - *va_arg(ap, uint16_t *) = (uint16_t)pv.u.u; \ + *va_arg(ap, uint16_t *) = (uint16_t)(pv).u.u; \ break; \ case 'I': \ case 'L': \ - *va_arg(ap, uint32_t *) = (uint32_t)pv.u.u; \ + *va_arg(ap, uint32_t *) = (uint32_t)(pv).u.u; \ break; \ case 'Q': \ case 'r': \ case 'R': \ - *va_arg(ap, uint64_t *) = pv.u.u; \ + *va_arg(ap, uint64_t *) = (pv).u.u; \ break; \ /* User format strings have already been validated. */ \ WT_ILLEGAL_VALUE(session); \ diff --git a/src/include/schema.h b/src/include/schema.h index 9a6e1e54e80..50e141d9921 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -107,10 +107,11 @@ struct __wt_table { * Acquire a lock if available, perform an operation, drop the lock. 
*/ #define WT_WITH_LOCK_NOWAIT(session, ret, lock, flag, op) do { \ - ret = 0; \ + (ret) = 0; \ if (F_ISSET(session, (flag))) { \ op; \ - } else if ((ret = __wt_spin_trylock_track(session, lock)) == 0) {\ + } else if (((ret) = \ + __wt_spin_trylock_track(session, lock)) == 0) { \ F_SET(session, (flag)); \ op; \ F_CLR(session, (flag)); \ @@ -248,7 +249,7 @@ struct __wt_table { WT_SESSION_LOCKED_HANDLE_LIST)); \ if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \ op; \ - } else if ((ret = __wt_try_writelock(session, \ + } else if (((ret) = __wt_try_writelock(session, \ &S2C(session)->table_lock)) == 0) { \ F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ op; \ diff --git a/src/include/session.h b/src/include/session.h index dec97cff5d3..674e92671b1 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -87,7 +87,7 @@ struct __wt_session_impl { void *meta_track_sub; /* Child transaction / save point */ size_t meta_track_alloc; /* Currently allocated */ int meta_track_nest; /* Nesting level of meta transaction */ -#define WT_META_TRACKING(session) (session->meta_track_next != NULL) +#define WT_META_TRACKING(session) ((session)->meta_track_next != NULL) /* * Each session keeps a cache of table handles. The set of handles diff --git a/src/include/stat.h b/src/include/stat.h index 8b2e78a4ed5..ed3d588b7d3 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -72,7 +72,7 @@ * and the session ID is a small, monotonically increasing number. */ #define WT_STATS_SLOT_ID(session) \ - ((session)->id) % WT_COUNTER_SLOTS + (((session)->id) % WT_COUNTER_SLOTS) /* * Statistic structures are arrays of int64_t's. 
We have functions to read/write diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 5dd9a720e31..7223aeae0f6 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -36,7 +36,7 @@ extern "C" { #if defined(DOXYGEN) || defined(SWIG) #define __F(func) func #else -#define __F(func) (*func) +#define __F(func) (*(func)) #endif #ifdef SWIG @@ -3073,27 +3073,27 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); * transaction is in progress, it should be rolled back and the operation * retried in a new transaction. */ -#define WT_ROLLBACK -31800 +#define WT_ROLLBACK (-31800) /*! * Attempt to insert an existing key. * This error is generated when the application attempts to insert a record with * the same key as an existing record without the 'overwrite' configuration to * WT_SESSION::open_cursor. */ -#define WT_DUPLICATE_KEY -31801 +#define WT_DUPLICATE_KEY (-31801) /*! * Non-specific WiredTiger error. * This error is returned when an error is not covered by a specific error * return. */ -#define WT_ERROR -31802 +#define WT_ERROR (-31802) /*! * Item not found. * This error indicates an operation did not find a value to return. This * includes cursor search and other operations where no record matched the * cursor's search key such as WT_CURSOR::update or WT_CURSOR::remove. */ -#define WT_NOTFOUND -31803 +#define WT_NOTFOUND (-31803) /*! * WiredTiger library panic. * This error indicates an underlying problem that requires the application exit @@ -3101,17 +3101,17 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); * returned from a WiredTiger interface, no further WiredTiger calls are * required. */ -#define WT_PANIC -31804 +#define WT_PANIC (-31804) /*! @cond internal */ /*! Restart the operation (internal). */ -#define WT_RESTART -31805 +#define WT_RESTART (-31805) /*! @endcond */ /*! * Recovery must be run to continue. 
* This error is generated when wiredtiger_open is configured to return an error * if recovery is required to use the database. */ -#define WT_RUN_RECOVERY -31806 +#define WT_RUN_RECOVERY (-31806) /*! * Operation would overflow cache. * This error is only generated when wiredtiger_open is configured to run in- @@ -3120,7 +3120,7 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); * progress, it should be rolled back and the operation retried in a new * transaction. */ -#define WT_CACHE_FULL -31807 +#define WT_CACHE_FULL (-31807) /* * Error return section: END * DO NOT EDIT: automatically built by dist/api_err.py. diff --git a/src/log/log.c b/src/log/log.c index 3477ca52502..05234619d32 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -309,14 +309,11 @@ void __wt_log_written_reset(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; - WT_LOG *log; conn = S2C(session); - if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) - return; - log = conn->log; - log->log_written = 0; - return; + + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + conn->log->log_written = 0; } /* @@ -1775,9 +1772,8 @@ advance: if (eol) /* Found a hole. This LSN is the end. */ break; - else - /* Last record in log. Look for more. */ - goto advance; + /* Last record in log. Look for more. 
*/ + goto advance; } rdup_len = __wt_rduppo2(reclen, allocsize); if (reclen > allocsize) { diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index bd1daaa6915..2a34240de46 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -10,10 +10,10 @@ #define WT_FORALL_CURSORS(clsm, c, i) \ for ((i) = (clsm)->nchunks; (i) > 0;) \ - if (((c) = (clsm)->chunks[--i]->cursor) != NULL) + if (((c) = (clsm)->chunks[--(i)]->cursor) != NULL) #define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \ - __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &cmp) + __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &(cmp)) static int __clsm_lookup(WT_CURSOR_LSM *, WT_ITEM *); static int __clsm_open_cursors(WT_CURSOR_LSM *, bool, u_int, uint32_t); @@ -1223,7 +1223,8 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value) WT_LSM_TREE_STAT_INCR( session, clsm->lsm_tree->bloom_miss); continue; - } else if (ret == 0) + } + if (ret == 0) WT_LSM_TREE_STAT_INCR( session, clsm->lsm_tree->bloom_hit); WT_ERR(ret); @@ -1328,7 +1329,8 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) { ret = 0; continue; - } else if (ret != 0) + } + if (ret != 0) goto err; /* Do we have an exact match? 
*/ @@ -1348,7 +1350,8 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) if ((ret = c->next(c)) == WT_NOTFOUND) { ret = 0; continue; - } else if (ret != 0) + } + if (ret != 0) goto err; } diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index ceb5f03a2f5..a06b736bf0a 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -187,7 +187,7 @@ __lsm_merge_span(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, continue; if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) || chunk->generation > 0) break; - else if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && + if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) break; } diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index 0b0801a8cca..358c43eab96 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -230,7 +230,7 @@ __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (__wt_atomic_cas32(&chunk->bloom_busy, 0, 1)) { if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) { ret = __lsm_bloom_create( - session, lsm_tree, chunk, (u_int)i); + session, lsm_tree, chunk, i); /* * Record if we were successful so that we can * later push a merge work unit. 
@@ -662,7 +662,8 @@ __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (drop_ret == EBUSY) { ++skipped; continue; - } else if (drop_ret != ENOENT) + } + if (drop_ret != ENOENT) WT_ERR(drop_ret); flush_metadata = true; @@ -673,7 +674,8 @@ __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (drop_ret == EBUSY) { ++skipped; continue; - } else if (drop_ret != ENOENT) + } + if (drop_ret != ENOENT) WT_ERR(drop_ret); flush_metadata = true; } diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 88d4397fcb5..23f654caa70 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -1395,7 +1395,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ #define WT_CHILD_RELEASE(session, hazard, ref) do { \ if (hazard) { \ - hazard = false; \ + (hazard) = false; \ WT_TRET( \ __wt_page_release(session, ref, WT_READ_NO_EVICT)); \ } \ @@ -1737,7 +1737,7 @@ __rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv) * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do * the copy in-line. */ - for (p = (uint8_t *)r->first_free, + for (p = r->first_free, t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len) *p++ = *t++; @@ -2889,7 +2889,7 @@ no_slots: len = WT_PTRDIFF( r->first_free, (uint8_t *)dsk + dsk_dst->mem_size); dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len); + (void)memmove(dsk_start, r->first_free - len, len); r->entries -= r->raw_entries[result_slots - 1]; r->first_free = dsk_start + len; @@ -3605,16 +3605,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) r = cbulk->reconcile; r->is_bulk_load = true; - recno = WT_RECNO_OOB; /* -Werror=maybe-uninitialized */ - switch (btree->type) { - case BTREE_COL_FIX: - case BTREE_COL_VAR: - recno = 1; - break; - case BTREE_ROW: - recno = WT_RECNO_OOB; - break; - } + recno = btree->type == BTREE_ROW ? 
WT_RECNO_OOB : 1; return (__rec_split_init( session, r, cbulk->leaf, recno, btree->maxleafpage)); diff --git a/src/support/crypto.c b/src/support/crypto.c index ab94ec2c829..cce0d228832 100644 --- a/src/support/crypto.c +++ b/src/support/crypto.c @@ -133,5 +133,4 @@ __wt_encrypt_size(WT_SESSION_IMPL *session, return; *sizep = incoming_size + kencryptor->size_const + WT_ENCRYPT_LEN_SIZE; - return; } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 80cdf1cd39b..6c97922f7e1 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -422,7 +422,6 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) __wt_sleep(0, stepdown_us / 10); __wt_epoch(session, &stop); current_us = WT_TIMEDIFF_US(stop, last); - total_ms = WT_TIMEDIFF_MS(stop, start); bytes_written_total = cache->bytes_written - bytes_written_start; diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index 2d8a77a69e6..30932195b1e 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -93,7 +93,7 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r, "%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 \ "/%" PRIu32, \ cursor == NULL ? "Skipping" : "Applying", \ - optype, fileid, lsnp->l.file, lsnp->l.offset); \ + optype, fileid, (lsnp)->l.file, (lsnp)->l.offset); \ if (cursor == NULL) \ break diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index cded40a8b45..947fa7bf9ef 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -504,17 +504,18 @@ dump_prefix(WT_SESSION *session, bool hex, bool json) (void)wiredtiger_version(&vmajor, &vminor, &vpatch); + if (json && printf( + " \"%s\" : \"%d (%d.%d.%d)\",\n", + DUMP_JSON_VERSION_MARKER, DUMP_JSON_CURRENT_VERSION, + vmajor, vminor, vpatch) < 0) + return (util_err(session, EIO, NULL)); + if (!json && (printf( "WiredTiger Dump (WiredTiger Version %d.%d.%d)\n", vmajor, vminor, vpatch) < 0 || printf("Format=%s\n", hex ? 
"hex" : "print") < 0 || printf("Header\n") < 0)) return (util_err(session, EIO, NULL)); - else if (json && printf( - " \"%s\" : \"%d (%d.%d.%d)\",\n", - DUMP_JSON_VERSION_MARKER, DUMP_JSON_CURRENT_VERSION, - vmajor, vminor, vpatch) < 0) - return (util_err(session, EIO, NULL)); return (0); } diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c index ca77643eb49..d31fa4c9d08 100644 --- a/src/utilities/util_load.c +++ b/src/utilities/util_load.c @@ -80,8 +80,8 @@ util_load(WT_SESSION *session, int argc, char *argv[]) if (no_overwrite) flags |= LOAD_JSON_NO_OVERWRITE; return (util_load_json(session, filename, flags)); - } else - return (load_dump(session)); + } + return (load_dump(session)); } /* diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index 7157f0d90fe..68e3b0f1bc5 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -20,7 +20,43 @@ static const char *command; /* Command name */ #define REC_LOGOFF "log=(enabled=false)" #define REC_RECOVER "log=(recover=on)" -static int usage(void); +static void +usage(void) +{ + fprintf(stderr, + "WiredTiger Data Engine (version %d.%d)\n", + WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR); + fprintf(stderr, + "global options:\n" + "\t" "-C\t" "wiredtiger_open configuration\n" + "\t" "-h\t" "database directory\n" + "\t" "-L\t" "turn logging off for debug-mode\n" + "\t" "-R\t" "run recovery if configured\n" + "\t" "-V\t" "display library version and exit\n" + "\t" "-v\t" "verbose\n"); + fprintf(stderr, + "commands:\n" + "\t" "alter\t alter an object\n" + "\t" "backup\t database backup\n" + "\t" "compact\t compact an object\n" + "\t" "copyright copyright information\n" + "\t" "create\t create an object\n" + "\t" "drop\t drop an object\n" + "\t" "dump\t dump an object\n" + "\t" "list\t list database objects\n" + "\t" "load\t load an object\n" + "\t" "loadtext load an object from a text file\n" + "\t" "printlog display the database log\n" + "\t" "read\t read values from an 
object\n" + "\t" "rebalance rebalance an object\n" + "\t" "rename\t rename an object\n" + "\t" "salvage\t salvage a file\n" + "\t" "stat\t display statistics for an object\n" + "\t" "truncate truncate an object, removing all content\n" + "\t" "upgrade\t upgrade an object\n" + "\t" "verify\t verify an object\n" + "\t" "write\t write values to an object\n"); +} int main(int argc, char *argv[]) @@ -73,8 +109,9 @@ main(int argc, char *argv[]) cmd_config = __wt_optarg; break; case 'E': /* secret key */ + free(secretkey); /* lint: set more than once */ if ((secretkey = strdup(__wt_optarg)) == NULL) { - ret = util_err(NULL, errno, NULL); + (void)util_err(NULL, errno, NULL); goto err; } memset(__wt_optarg, 0, strlen(__wt_optarg)); @@ -92,24 +129,27 @@ main(int argc, char *argv[]) break; case 'V': /* version */ printf("%s\n", wiredtiger_version(NULL, NULL, NULL)); - return (EXIT_SUCCESS); + goto done; case 'v': /* verbose */ verbose = true; break; case '?': default: - return (usage()); + usage(); + goto err; } if (logoff && recover) { fprintf(stderr, "Only one of -L and -R is allowed.\n"); - return (EXIT_FAILURE); + goto err; } argc -= __wt_optind; argv += __wt_optind; /* The next argument is the command name. */ - if (argc < 1) - return (usage()); + if (argc < 1) { + usage(); + goto err; + } command = argv[0]; /* Reset getopt. */ @@ -130,7 +170,7 @@ main(int argc, char *argv[]) func = util_compact; else if (strcmp(command, "copyright") == 0) { util_copyright(); - return (EXIT_SUCCESS); + goto done; } else if (strcmp(command, "create") == 0) { func = util_create; config = "create"; @@ -194,8 +234,10 @@ main(int argc, char *argv[]) default: break; } - if (func == NULL) - return (usage()); + if (func == NULL) { + usage(); + goto err; + } /* Build the configuration string. 
*/ len = 10; /* some slop */ @@ -212,7 +254,7 @@ main(int argc, char *argv[]) } len += strlen(rec_config); if ((p = malloc(len)) == NULL) { - ret = util_err(NULL, errno, NULL); + (void)util_err(NULL, errno, NULL); goto err; } (void)snprintf(p, len, "%s,%s,%s%s%s%s", @@ -223,19 +265,24 @@ main(int argc, char *argv[]) /* Open the database and a session. */ if ((ret = wiredtiger_open(home, verbose ? verbose_handler : NULL, config, &conn)) != 0) { - ret = util_err(NULL, ret, NULL); + (void)util_err(NULL, ret, NULL); goto err; } if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) { - ret = util_err(NULL, ret, NULL); + (void)util_err(NULL, ret, NULL); goto err; } /* Call the function. */ ret = func(session, argc, argv); + if (0) { +err: ret = 1; + } +done: + /* Close the database. */ -err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0) + if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0) ret = tret; free(p); @@ -244,46 +291,6 @@ err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0) return (ret == 0 ? 
EXIT_SUCCESS : EXIT_FAILURE); } -static int -usage(void) -{ - fprintf(stderr, - "WiredTiger Data Engine (version %d.%d)\n", - WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR); - fprintf(stderr, - "global options:\n" - "\t" "-C\t" "wiredtiger_open configuration\n" - "\t" "-h\t" "database directory\n" - "\t" "-L\t" "turn logging off for debug-mode\n" - "\t" "-R\t" "run recovery if configured\n" - "\t" "-V\t" "display library version and exit\n" - "\t" "-v\t" "verbose\n"); - fprintf(stderr, - "commands:\n" - "\t" "alter\t alter an object\n" - "\t" "backup\t database backup\n" - "\t" "compact\t compact an object\n" - "\t" "copyright copyright information\n" - "\t" "create\t create an object\n" - "\t" "drop\t drop an object\n" - "\t" "dump\t dump an object\n" - "\t" "list\t list database objects\n" - "\t" "load\t load an object\n" - "\t" "loadtext load an object from a text file\n" - "\t" "printlog display the database log\n" - "\t" "read\t read values from an object\n" - "\t" "rebalance rebalance an object\n" - "\t" "rename\t rename an object\n" - "\t" "salvage\t salvage a file\n" - "\t" "stat\t display statistics for an object\n" - "\t" "truncate truncate an object, removing all content\n" - "\t" "upgrade\t upgrade an object\n" - "\t" "verify\t verify an object\n" - "\t" "write\t write values to an object\n"); - - return (EXIT_FAILURE); -} - /* * util_uri -- * Build a name. @@ -314,7 +321,7 @@ util_uri(WT_SESSION *session, const char *s, const char *type) * the default type for the operation. */ if (strchr(s, ':') != NULL) - strcpy(name, s); + snprintf(name, len, "%s", s); else snprintf(name, len, "%s:%s", type, s); return (name); -- cgit v1.2.1 From 19fac80017eee9758d8109ab94796231d4995f33 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 17 Mar 2017 01:15:06 -0400 Subject: WT-3224 Prevent splits in LSM primaries (#3335) Move lsm_primary check near evict_disabled check. 
The assertion was caused by `WT_BTREE_NO_RECONCILE`, which allows in-memory splits even when eviction is disabled. Rename that flag `WT_BTREE_ALLOW_SPLITS` for clarity. --- src/btree/bt_read.c | 3 ++- src/btree/bt_sync.c | 4 ++-- src/evict/evict_page.c | 4 ++-- src/include/btree.h | 12 ++++++------ src/include/btree.i | 1 - 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index b170a9fb900..64874547b9c 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -592,8 +592,9 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || + btree->lsm_primary || (btree->evict_disabled > 0 && - !F_ISSET(btree, WT_BTREE_NO_RECONCILE))) + !F_ISSET(btree, WT_BTREE_ALLOW_SPLITS))) goto skip_evict; /* diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index cdb27752fb7..ead6ccc4ac0 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -188,9 +188,9 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * to grow significantly larger than the configured maximum * size. */ - F_SET(btree, WT_BTREE_NO_RECONCILE); + F_SET(btree, WT_BTREE_ALLOW_SPLITS); ret = __wt_evict_file_exclusive_on(session); - F_CLR(btree, WT_BTREE_NO_RECONCILE); + F_CLR(btree, WT_BTREE_ALLOW_SPLITS); WT_ERR(ret); __wt_evict_file_exclusive_off(session); diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 5b17a78a4dd..85689efd0b1 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -480,8 +480,8 @@ __evict_review( if (LF_ISSET(WT_EVICT_INMEM_SPLIT)) return (__wt_split_insert(session, ref)); - /* We are done if reconciliation is disabled. */ - if (F_ISSET(S2BT(session), WT_BTREE_NO_RECONCILE)) + /* If splits are the only permitted operation, we're done. 
*/ + if (F_ISSET(S2BT(session), WT_BTREE_ALLOW_SPLITS)) return (EBUSY); } diff --git a/src/include/btree.h b/src/include/btree.h index 15a68474fdf..88312f408cc 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -157,14 +157,14 @@ struct __wt_btree { WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ -#define WT_BTREE_BULK 0x000100 /* Bulk-load handle */ -#define WT_BTREE_CLOSED 0x000200 /* Handle closed */ -#define WT_BTREE_IGNORE_CACHE 0x000400 /* Cache-resident object */ -#define WT_BTREE_IN_MEMORY 0x000800 /* Cache-resident object */ -#define WT_BTREE_LOOKASIDE 0x001000 /* Look-aside table */ +#define WT_BTREE_ALLOW_SPLITS 0x000100 /* Allow splits, even with no evict */ +#define WT_BTREE_BULK 0x000200 /* Bulk-load handle */ +#define WT_BTREE_CLOSED 0x000400 /* Handle closed */ +#define WT_BTREE_IGNORE_CACHE 0x000800 /* Cache-resident object */ +#define WT_BTREE_IN_MEMORY 0x001000 /* Cache-resident object */ +#define WT_BTREE_LOOKASIDE 0x002000 /* Look-aside table */ #define WT_BTREE_NO_CHECKPOINT 0x004000 /* Disable checkpoints */ #define WT_BTREE_NO_LOGGING 0x008000 /* Disable logging */ -#define WT_BTREE_NO_RECONCILE 0x010000 /* Allow splits, even with no evict */ #define WT_BTREE_REBALANCE 0x020000 /* Handle is for rebalance */ #define WT_BTREE_SALVAGE 0x040000 /* Handle is for salvage */ #define WT_BTREE_SKIP_CKPT 0x080000 /* Handle skipped checkpoint */ diff --git a/src/include/btree.i b/src/include/btree.i index c0c5c7c5a8d..eefc2db075d 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1229,7 +1229,6 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * data in the last skiplist on the page. Split if there are enough * items and the skiplist does not fit within a single disk page. */ - ins_head = page->type == WT_PAGE_ROW_LEAF ? (page->entries == 0 ? 
WT_ROW_INSERT_SMALLEST(page) : -- cgit v1.2.1 From 5cf626fda029c966c5c1eea7916fa7e8d12e6330 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 17 Mar 2017 15:50:21 -0400 Subject: WT-3227 Python test suite inserts unnecessary whitespace in error output. (#3338) The Python test suite uses "XXX: " as its error prefix, and the WiredTiger error routines append a comma and space after the error prefix in error messages. This means the error messages come out "XXX: , YYY". Remove the colon and space from the declared error_prefix so the error messages come out "XXX, YYY". --- test/suite/test_compact02.py | 2 +- test/suite/test_encrypt04.py | 2 +- test/suite/test_shared_cache01.py | 2 +- test/suite/test_shared_cache02.py | 2 +- test/suite/test_txn07.py | 2 +- test/suite/wttest.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/suite/test_compact02.py b/test/suite/test_compact02.py index 7af76b5fd58..803600eea14 100644 --- a/test/suite/test_compact02.py +++ b/test/suite/test_compact02.py @@ -99,7 +99,7 @@ class test_compact02(wttest.WiredTigerTestCase): def ConnectionOpen(self, cacheSize): self.home = '.'
conn_params = 'create,' + \ - cacheSize + ',error_prefix="%s: ",' % self.shortid() + \ + cacheSize + ',error_prefix="%s",' % self.shortid() + \ 'statistics=(all),' + \ 'eviction_dirty_target=99,eviction_dirty_trigger=99' try: diff --git a/test/suite/test_encrypt04.py b/test/suite/test_encrypt04.py index 17777fc9564..19c0b85d427 100644 --- a/test/suite/test_encrypt04.py +++ b/test/suite/test_encrypt04.py @@ -113,7 +113,7 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): completed = False try: conn = self.wiredtiger_open(dir, - 'create,error_prefix="{0}: ",{1}{2}'.format( + 'create,error_prefix="{0}",{1}{2}'.format( self.shortid(), encarg, extarg)) except (BaseException) as err: # Capture the recognizable error created by rotn diff --git a/test/suite/test_shared_cache01.py b/test/suite/test_shared_cache01.py index 70560a625ee..c3bd946cc4b 100644 --- a/test/suite/test_shared_cache01.py +++ b/test/suite/test_shared_cache01.py @@ -73,7 +73,7 @@ class test_shared_cache01(wttest.WiredTigerTestCase): os.mkdir(name) next_conn = self.wiredtiger_open( name, - 'create,error_prefix="' + self.shortid() + ': "' + + 'create,error_prefix="%s",' % self.shortid() + pool_opts + extra_opts) self.conns.append(next_conn) self.sessions.append(next_conn.open_session(None)) diff --git a/test/suite/test_shared_cache02.py b/test/suite/test_shared_cache02.py index 7cde6c86695..67f9bf7c6b7 100644 --- a/test/suite/test_shared_cache02.py +++ b/test/suite/test_shared_cache02.py @@ -73,7 +73,7 @@ class test_shared_cache02(wttest.WiredTigerTestCase): os.mkdir(name) next_conn = self.wiredtiger_open( name, - 'create,error_prefix="' + self.shortid() + ': "' + + 'create,error_prefix="%s",' % self.shortid() + pool_opts + extra_opts) self.conns.append(next_conn) self.sessions.append(next_conn.open_session(None)) diff --git a/test/suite/test_txn07.py b/test/suite/test_txn07.py index e2986fb999a..e26cf5aaaea 100644 --- a/test/suite/test_txn07.py +++ b/test/suite/test_txn07.py @@ -76,7 
+76,7 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess): def conn_config(self): return 'log=(archive=false,enabled,file_max=%s,' % self.logmax + \ 'compressor=%s)' % self.compress + \ - ',create,error_prefix="%s: ",' % self.shortid() + \ + ',create,error_prefix="%s",' % self.shortid() + \ "statistics=(fast)," + \ 'transaction_sync="%s",' % \ self.sync_list[self.scenario_number % len(self.sync_list)] diff --git a/test/suite/wttest.py b/test/suite/wttest.py index 0dce51f07d5..e91838544b9 100644 --- a/test/suite/wttest.py +++ b/test/suite/wttest.py @@ -302,7 +302,7 @@ class WiredTigerTestCase(unittest.TestCase): # In case the open starts additional threads, flush first to # avoid confusion. sys.stdout.flush() - conn_param = 'create,error_prefix="%s: ",%s' % (self.shortid(), config) + conn_param = 'create,error_prefix="%s",%s' % (self.shortid(), config) try: conn = self.wiredtiger_open(home, conn_param) except wiredtiger.WiredTigerError as e: -- cgit v1.2.1 From 89c063c2acb0f901725f0cd838503c983687a49f Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Mon, 20 Mar 2017 16:58:17 +1100 Subject: WT-3196 Prevent eviction in LSM primaries after they are flushed. (#3336) Once an LSM primary is known to be on disk, we expect readers to use the checkpoint. The original page image for the primary will then be discarded by an LSM worker thread. We previously allowed the LSM primary to be evicted in between so that eviction workers can deal with cache pressure ahead of the LSM worker threads discarding the chunk. However, that leads to cases where application threads end up evicting a 100MB page, and also means that discarding the chunk needs to worry about split generations (the cause of the assertion failure here). The solution suggested here is simple: never enable eviction in LSM primaries, which also means we never need to fix up cache accounting.
--- src/lsm/lsm_work_unit.c | 69 +++---------------------------------------------- 1 file changed, 3 insertions(+), 66 deletions(-) diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index 358c43eab96..10b85d573aa 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -255,51 +255,6 @@ err: return (ret); } -/* - * __lsm_switch_primary_off -- - * Switch when a btree handle is no longer the current primary chunk of - * an LSM tree. - */ -static void -__lsm_switch_primary_off(WT_SESSION_IMPL *session) -{ - WT_BTREE *btree; - WT_CACHE *cache; - WT_PAGE *child, *root; - WT_PAGE_INDEX *pindex; - WT_REF *first; - size_t size; - - btree = S2BT(session); - cache = S2C(session)->cache; - root = btree->root.page; - pindex = WT_INTL_INDEX_GET_SAFE(root); - - /* Diagnostic: assert we've never split. */ - WT_ASSERT(session, pindex->entries == 1); - - /* - * We're reaching down into the page without a hazard pointer, - * but that's OK because we know that no-eviction is set so the - * page can't disappear. - * - * While this tree was the primary, its dirty bytes were not - * included in the cache accounting. Fix that now before we - * open it up for eviction. - */ - first = pindex->index[0]; - child = first->page; - if (first->state == WT_REF_MEM && - child->type == WT_PAGE_ROW_LEAF && __wt_page_is_modified(child)) { - size = child->modify->bytes_dirty; - (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); - (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); - } - - /* Configure eviction. */ - __wt_evict_file_exclusive_off(session); -} - /* * __wt_lsm_checkpoint_chunk -- * Flush a single LSM chunk to disk. 
@@ -308,7 +263,6 @@ int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { - WT_BTREE *btree; WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; bool flush_set, release_btree; @@ -322,9 +276,8 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { - WT_WITH_HANDLE_LIST_WRITE_LOCK(session, - ret = __lsm_discard_handle(session, chunk->uri, NULL)); - if (ret == 0) + if ((ret = + __lsm_discard_handle(session, chunk->uri, NULL)) == 0) chunk->evicted = 1; else if (ret == EBUSY) ret = 0; @@ -397,20 +350,6 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); - /* - * If the chunk is the lsm primary, clear the no-eviction flag so it can - * be evicted and eventually closed. Only do once, and only do after the - * checkpoint has succeeded: otherwise, accessing the leaf page during - * the checkpoint can trigger forced eviction. - * - * We don't have to worry about races here, we're single-threaded. - */ - btree = S2BT(session); - if (btree->lsm_primary) { - __lsm_switch_primary_off(session); - btree->lsm_primary = false; - } - release_btree = false; WT_ERR(__wt_session_release_btree(session)); @@ -569,9 +508,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) * * This will fail with EBUSY if the file is still in use. */ - WT_WITH_HANDLE_LIST_WRITE_LOCK(session, - ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT)); - WT_RET(ret); + WT_RET(__lsm_discard_handle(session, uri, WT_CHECKPOINT)); /* * Take the schema lock for the drop operation. Since __wt_schema_drop -- cgit v1.2.1 From cfdf4394aa39209d402a9006661810cda3bdb38d Mon Sep 17 00:00:00 2001 From: sueloverso Date: Mon, 20 Mar 2017 15:04:18 -0400 Subject: WT-2990 Restore use of dhandle lock in LSM. 
(#3342) --- src/lsm/lsm_work_unit.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index 10b85d573aa..e6a29666094 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -276,8 +276,9 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { - if ((ret = - __lsm_discard_handle(session, chunk->uri, NULL)) == 0) + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_discard_handle(session, chunk->uri, NULL)); + if (ret == 0) chunk->evicted = 1; else if (ret == EBUSY) ret = 0; @@ -508,7 +509,9 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) * * This will fail with EBUSY if the file is still in use. */ - WT_RET(__lsm_discard_handle(session, uri, WT_CHECKPOINT)); + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT)); + WT_RET(ret); /* * Take the schema lock for the drop operation. Since __wt_schema_drop -- cgit v1.2.1 From 620398dc9a90b401aa9e4437c834bfbb2a6d9a6d Mon Sep 17 00:00:00 2001 From: Sulabh Mahajan Date: Thu, 23 Mar 2017 11:57:44 +1100 Subject: WT-3202 Add in_memory config opt, do not reopen connection if db is in_memory (#3341) --- bench/wtperf/wtperf.c | 13 ++++++++++++- bench/wtperf/wtperf_opt.i | 2 ++ src/docs/wtperf.dox | 2 ++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 7f5e5ad3373..b5aff21bdbc 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -1655,6 +1655,9 @@ close_reopen(WTPERF *wtperf) opts = wtperf->opts; + if (opts->in_memory) + return (0); + if (!opts->readonly && !opts->reopen_connection) return (0); /* @@ -2568,7 +2571,8 @@ main(int argc, char *argv[]) /* Concatenate non-default configuration strings. 
*/ if ((opts->verbose > 1 && strlen(debug_cconfig) != 0) || user_cconfig != NULL || opts->session_count_idle > 0 || - wtperf->compress_ext != NULL || wtperf->async_config != NULL) { + wtperf->compress_ext != NULL || wtperf->async_config != NULL || + opts->in_memory) { req_len = 20; req_len += wtperf->async_config != NULL ? strlen(wtperf->async_config) : 0; @@ -2583,6 +2587,7 @@ main(int argc, char *argv[]) opts->session_count_idle + wtperf->workers_cnt + opts->populate_threads + 10); } + req_len += opts->in_memory ? strlen("in_memory=true") : 0; req_len += user_cconfig != NULL ? strlen(user_cconfig) : 0; req_len += debug_cconfig != NULL ? strlen(debug_cconfig) : 0; cc_buf = dmalloc(req_len); @@ -2603,6 +2608,12 @@ main(int argc, char *argv[]) append_comma, wtperf->compress_ext); append_comma = ","; } + if (opts->in_memory) { + pos += (size_t)snprintf( + cc_buf + pos, req_len - pos, "%s%s", + append_comma, "in_memory=true"); + append_comma = ","; + } if (sess_cfg != NULL && strlen(sess_cfg) != 0) { pos += (size_t)snprintf( cc_buf + pos, req_len - pos, "%s%s", diff --git a/bench/wtperf/wtperf_opt.i b/bench/wtperf/wtperf_opt.i index 63cef4c28fb..90f70457407 100644 --- a/bench/wtperf/wtperf_opt.i +++ b/bench/wtperf/wtperf_opt.i @@ -110,6 +110,8 @@ DEF_OPT_AS_UINT32(database_count, 1, DEF_OPT_AS_BOOL(drop_tables, 0, "Whether to drop all tables at the end of the run, and report time taken" " to do the drop.") +DEF_OPT_AS_BOOL(in_memory, 0, + "Whether to create the database in-memory.") DEF_OPT_AS_UINT32(icount, 5000, "number of records to initially populate. If multiple tables are " "configured the count is spread evenly across all tables.") diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox index 2eac0fef3f4..6bdcf5f4f8d 100644 --- a/src/docs/wtperf.dox +++ b/src/docs/wtperf.dox @@ -167,6 +167,8 @@ do population phase; false to use existing database number of WiredTiger databases to use. 
Each database will execute the workload using a separate home directory and complete set of worker threads @par drop_tables (boolean, default=false) Whether to drop all tables at the end of the run, and report time taken to do the drop. +@par in_memory (boolean, default=false) +Whether to create the database in-memory. @par icount (unsigned int, default=5000) number of records to initially populate. If multiple tables are configured the count is spread evenly across all tables. @par idle_table_cycle (unsigned int, default=0) -- cgit v1.2.1 From 6bd63027a6fd00db3f0f379acb929c22cd1b7f6f Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 23 Mar 2017 02:27:54 -0400 Subject: SERVER-28194 Missing WiredTiger.turtle file loses data (#3337) There's a two step process on Windows to rename files (including the turtle file), remove the original and then move the replacement into place -- a DeleteFileW followed by a MoveFileW. If we crash in the middle (and in SERVER-28194, it looks like there's a weirder failure mode, where the DeleteFileW succeeded, but the file was still there), we can be left without a turtle file, which will lose all of the data in the database. * Add the MOVEFILE_WRITE_THROUGH flag to the MoveFileEx call. If we somehow end up in a copy-then-delete path, that flag adds a disk flush after the copy phase, so the window of vulnerability is as short as possible. 
--- dist/s_string.ok | 5 +++-- src/os_win/os_fs.c | 23 ++++++++++------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/dist/s_string.ok b/dist/s_string.ok index 39b6b163cd9..1f7f7d9fd3a 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -225,8 +225,10 @@ MEMALIGN MERCHANTABILITY METADATA MONGODB +MOVEFILE MRXB MRXBOPC +MSDN MSVC MULTI MULTIBLOCK @@ -240,8 +242,7 @@ Metadata Mewhort Mitzenmacher MongoDB -MoveFile -MoveFileW +MoveFileExW Multi MultiByteToWideChar Multithreaded diff --git a/src/os_win/os_fs.c b/src/os_win/os_fs.c index 2f76fff04a5..5cf47ea5763 100644 --- a/src/os_win/os_fs.c +++ b/src/os_win/os_fs.c @@ -87,22 +87,19 @@ __win_fs_rename(WT_FILE_SYSTEM *file_system, WT_ERR(__wt_to_utf16_string(session, to, &to_wide)); /* - * Check if file exists since Windows does not override the file if - * it exists. + * We want an atomic rename, but that's not guaranteed by MoveFileExW + * (or by any MSDN API). Don't set the MOVEFILE_COPY_ALLOWED flag to + * prevent the system from falling back to a copy and delete process. + * Do set the MOVEFILE_WRITE_THROUGH flag so the window is as small + * as possible, just in case. WiredTiger renames are done in a single + * directory and we expect that to be an atomic metadata update on any + * modern filesystem. 
*/ - if (GetFileAttributesW(to_wide->data) != INVALID_FILE_ATTRIBUTES) - if (DeleteFileW(to_wide->data) == FALSE) { - windows_error = __wt_getlasterror(); - __wt_errx(session, - "%s: file-rename: DeleteFileW: %s", - to, __wt_formatmessage(session, windows_error)); - WT_ERR(__wt_map_windows_error(windows_error)); - } - - if (MoveFileW(from_wide->data, to_wide->data) == FALSE) { + if (MoveFileExW(from_wide->data, to_wide->data, + MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH) == FALSE) { windows_error = __wt_getlasterror(); __wt_errx(session, - "%s to %s: file-rename: MoveFileW: %s", + "%s to %s: file-rename: MoveFileExW: %s", from, to, __wt_formatmessage(session, windows_error)); WT_ERR(__wt_map_windows_error(windows_error)); } -- cgit v1.2.1 From 81df9eadd01427173e7b14525f53723a33a7235e Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 23 Mar 2017 21:16:42 -0400 Subject: WT-3234 Update WiredTiger build for clang 4.0. (#3345) * Update WiredTiger build for clang 4.0. ex_all.c:852:7: error: possible misuse of comma operator here [-Werror,-Wcomma] p1++, p2++; ^ ex_all.c:852:3: note: cast expression to void to silence warning p1++, p2++; ^~~~ (void)( ) 1 error generated. * wtperf.c:2670:4: error: code will never be executed [-Werror,-Wunreachable-code] pos += (size_t)snprintf( ^~~ wtperf.c:2669:23: note: silence by adding parentheses to mark code as explicitly dead if (opts->verbose > 1 && strlen(debug_tconfig) != 0) ^ /* DISABLES CODE */ ( ) wtperf.c:2630:4: error: code will never be executed [-Werror,-Wunreachable-code] pos += (size_t)snprintf( ^~~ wtperf.c:2629:23: note: silence by adding parentheses to mark code as explicitly dead if (opts->verbose > 1 && strlen(debug_cconfig) != 0) ^ /* DISABLES CODE */ ( ) 2 errors generated. 
--- bench/wtperf/wtperf.c | 18 ++---------------- examples/c/ex_all.c | 4 ++-- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index b5aff21bdbc..772dedac8c8 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -32,9 +32,6 @@ #define DEFAULT_HOME "WT_TEST" #define DEFAULT_MONITOR_DIR "WT_TEST" -static const char * const debug_cconfig = ""; -static const char * const debug_tconfig = ""; - static void *checkpoint_worker(void *); static int drop_all_tables(WTPERF *); static int execute_populate(WTPERF *); @@ -2569,8 +2566,7 @@ main(int argc, char *argv[]) __wt_stream_set_line_buffer(stdout); /* Concatenate non-default configuration strings. */ - if ((opts->verbose > 1 && strlen(debug_cconfig) != 0) || - user_cconfig != NULL || opts->session_count_idle > 0 || + if (user_cconfig != NULL || opts->session_count_idle > 0 || wtperf->compress_ext != NULL || wtperf->async_config != NULL || opts->in_memory) { req_len = 20; @@ -2589,7 +2585,6 @@ main(int argc, char *argv[]) } req_len += opts->in_memory ? strlen("in_memory=true") : 0; req_len += user_cconfig != NULL ? strlen(user_cconfig) : 0; - req_len += debug_cconfig != NULL ? strlen(debug_cconfig) : 0; cc_buf = dmalloc(req_len); pos = 0; @@ -2626,23 +2621,18 @@ main(int argc, char *argv[]) append_comma, user_cconfig); append_comma = ","; } - if (opts->verbose > 1 && strlen(debug_cconfig) != 0) - pos += (size_t)snprintf( - cc_buf + pos, req_len - pos, "%s%s", - append_comma, debug_cconfig); if (strlen(cc_buf) != 0 && (ret = config_opt_name_value(wtperf, "conn_config", cc_buf)) != 0) goto err; } - if ((opts->verbose > 1 && strlen(debug_tconfig) != 0) || opts->index || + if (opts->index || user_tconfig != NULL || wtperf->compress_table != NULL) { req_len = 20; req_len += wtperf->compress_table != NULL ? strlen(wtperf->compress_table) : 0; req_len += opts->index ? strlen(INDEX_COL_NAMES) : 0; req_len += user_tconfig != NULL ? 
strlen(user_tconfig) : 0; - req_len += debug_tconfig != NULL ? strlen(debug_tconfig) : 0; tc_buf = dmalloc(req_len); pos = 0; @@ -2666,10 +2656,6 @@ main(int argc, char *argv[]) append_comma, user_tconfig); append_comma = ","; } - if (opts->verbose > 1 && strlen(debug_tconfig) != 0) - pos += (size_t)snprintf( - tc_buf + pos, req_len - pos, "%s%s", - append_comma, debug_tconfig); if (strlen(tc_buf) != 0 && (ret = config_opt_name_value(wtperf, "table_config", tc_buf)) != 0) diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c index 8a1533011b2..82620673fe1 100644 --- a/examples/c/ex_all.c +++ b/examples/c/ex_all.c @@ -848,8 +848,8 @@ my_compare(WT_COLLATOR *collator, WT_SESSION *session, p1 = (const char *)value1->data; p2 = (const char *)value2->data; - while (*p1 != '\0' && *p1 == *p2) - p1++, p2++; + for (; *p1 != '\0' && *p1 == *p2; ++p1, ++p2) + ; *cmp = (int)*p2 - (int)*p1; return (0); -- cgit v1.2.1 From e4edaa7b73ca8583506f23a0c6fe701d6213d836 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 23 Mar 2017 21:39:38 -0400 Subject: WT-3228 Remove with overwrite shouldn't return WT_NOTFOUND (#3339) * Table cursors with overwrite configured wrongly treat not-found as an error, return success instead. * The LSM code clears WT_CURSTD_KEY_SET on unsuccessful searches, which breaks table cursors with indices doing searches on the set of cursors in order to delete old index keys, because there's no key set when it's time to do the update. 
--- src/cursor/cur_table.c | 12 ++++++++ src/lsm/lsm_cursor.c | 6 ++-- test/suite/test_overwrite.py | 67 +++++++++++++++++++++++++++++--------------- 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index ef2c0ac5163..3b72bb0730f 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -625,13 +625,25 @@ __curtable_remove(WT_CURSOR *cursor) /* Find the old record so it can be removed from indices */ if (ctable->table->nindices > 0) { APPLY_CG(ctable, search); + if (ret == WT_NOTFOUND) + goto notfound; WT_ERR(ret); WT_ERR(__apply_idx(ctable, offsetof(WT_CURSOR, remove), false)); } APPLY_CG(ctable, remove); + if (ret == WT_NOTFOUND) + goto notfound; WT_ERR(ret); +notfound: + /* + * If the cursor is configured to overwrite and the record is not found, + * that is exactly what we want. + */ + if (ret == WT_NOTFOUND && F_ISSET(primary, WT_CURSTD_OVERWRITE)) + ret = 0; + /* * If the cursor was positioned, it stays positioned with a key but no * no value, otherwise, there's no position, key or value. This isn't diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 2a34240de46..3f0b6df8eb0 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -1250,10 +1250,10 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value) WT_ERR(WT_NOTFOUND); done: -err: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - if (ret == 0) { - clsm->current = c; +err: if (ret == 0) { + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); F_SET(cursor, WT_CURSTD_KEY_INT); + clsm->current = c; if (value == &cursor->value) F_SET(cursor, WT_CURSTD_VALUE_INT); } else if (c != NULL) diff --git a/test/suite/test_overwrite.py b/test/suite/test_overwrite.py index 4739abaa578..c894de99bd0 100644 --- a/test/suite/test_overwrite.py +++ b/test/suite/test_overwrite.py @@ -27,32 +27,47 @@ # OTHER DEALINGS IN THE SOFTWARE. 
import wiredtiger, wttest -from wtdataset import SimpleDataSet +from wtdataset import SimpleDataSet, SimpleIndexDataSet +from wtdataset import SimpleLSMDataSet, ComplexDataSet, ComplexLSMDataSet from wtscenario import make_scenarios # test_overwrite.py # cursor overwrite configuration method class test_overwrite(wttest.WiredTigerTestCase): name = 'overwrite' - scenarios = make_scenarios([ - ('file-r', dict(type='file:', keyfmt='r', dataset=SimpleDataSet)), - ('file-S', dict(type='file:', keyfmt='S', dataset=SimpleDataSet)), - ('lsm-S', dict(type='lsm:', keyfmt='S', dataset=SimpleDataSet)), - ('table-r', dict(type='table:', keyfmt='r', dataset=SimpleDataSet)), - ('table-S', dict(type='table:', keyfmt='S', dataset=SimpleDataSet)), - ]) + keyfmt = [ + ('integer', dict(keyfmt='i')), + ('recno', dict(keyfmt='r')), + ('string', dict(keyfmt='S')), + ] + types = [ + ('file', dict(uri='file:', ds=SimpleDataSet)), + ('lsm', dict(uri='lsm:', ds=SimpleDataSet)), + ('table-complex', dict(uri='table:', ds=ComplexDataSet)), + ('table-complex-lsm', dict(uri='table:', ds=ComplexLSMDataSet)), + ('table-index', dict(uri='table:', ds=SimpleIndexDataSet)), + ('table-simple', dict(uri='table:', ds=SimpleDataSet)), + ('table-simple-lsm', dict(uri='table:', ds=SimpleLSMDataSet)), + ] + scenarios = make_scenarios(types, keyfmt) + def skip(self): + return self.keyfmt == 'r' and \ + (self.ds.is_lsm() or self.uri == 'lsm') # Confirm a cursor configured with/without overwrite correctly handles # non-existent records during insert, remove and update operations. def test_overwrite_insert(self): - uri = self.type + self.name - ds = self.dataset(self, uri, 100, key_format=self.keyfmt) + if self.skip(): + return + + uri = self.uri + self.name + ds = self.ds(self, uri, 100, key_format=self.keyfmt) ds.populate() # Insert of an existing record with overwrite off fails. 
cursor = self.session.open_cursor(uri, None, "overwrite=false") cursor.set_key(ds.key(5)) - cursor.set_value('XXXXXXXXXX') + cursor.set_value(ds.value(1000)) self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.insert()) # One additional test for the insert method: duplicate the cursor with @@ -63,30 +78,33 @@ class test_overwrite(wttest.WiredTigerTestCase): cursor = self.session.open_cursor(uri, None, "overwrite=false") cursor.set_key(ds.key(5)) dupc = self.session.open_cursor(None, cursor, "overwrite=true") - dupc.set_value('XXXXXXXXXX') + dupc.set_value(ds.value(1001)) self.assertEquals(dupc.insert(), 0) # Insert of an existing record with overwrite on succeeds. cursor = self.session.open_cursor(uri, None) cursor.set_key(ds.key(6)) - cursor.set_value('XXXXXXXXXX') + cursor.set_value(ds.value(1002)) self.assertEquals(cursor.insert(), 0) # Insert of a non-existent record with overwrite off succeeds. cursor = self.session.open_cursor(uri, None, "overwrite=false") cursor.set_key(ds.key(200)) - cursor.set_value('XXXXXXXXXX') + cursor.set_value(ds.value(1003)) self.assertEquals(cursor.insert(), 0) # Insert of a non-existent record with overwrite on succeeds. cursor = self.session.open_cursor(uri, None) cursor.set_key(ds.key(201)) - cursor.set_value('XXXXXXXXXX') + cursor.set_value(ds.value(1004)) self.assertEquals(cursor.insert(), 0) def test_overwrite_remove(self): - uri = self.type + self.name - ds = self.dataset(self, uri, 100, key_format=self.keyfmt) + if self.skip(): + return + + uri = self.uri + self.name + ds = self.ds(self, uri, 100, key_format=self.keyfmt) ds.populate() # Remove of an existing record with overwrite off succeeds. 
@@ -110,32 +128,35 @@ class test_overwrite(wttest.WiredTigerTestCase): self.assertEquals(cursor.remove(), 0) def test_overwrite_update(self): - uri = self.type + self.name - ds = self.dataset(self, uri, 100, key_format=self.keyfmt) + if self.skip(): + return + + uri = self.uri + self.name + ds = self.ds(self, uri, 100, key_format=self.keyfmt) ds.populate() # Update of an existing record with overwrite off succeeds. cursor = self.session.open_cursor(uri, None, "overwrite=false") cursor.set_key(ds.key(5)) - cursor.set_value('XXXXXXXXXX') + cursor.set_value(ds.value(1005)) self.assertEquals(cursor.update(), 0) # Update of an existing record with overwrite on succeeds. cursor = self.session.open_cursor(uri, None) cursor.set_key(ds.key(6)) - cursor.set_value('XXXXXXXXXX') + cursor.set_value(ds.value(1006)) self.assertEquals(cursor.update(), 0) # Update of a non-existent record with overwrite off fails. cursor = self.session.open_cursor(uri, None, "overwrite=false") cursor.set_key(ds.key(200)) - cursor.set_value('XXXXXXXXXX') + cursor.set_value(ds.value(1007)) self.assertEquals(cursor.update(), wiredtiger.WT_NOTFOUND) # Update of a non-existent record with overwrite on succeeds. 
cursor = self.session.open_cursor(uri, None) cursor.set_key(ds.key(201)) - cursor.set_value('XXXXXXXXXX') + cursor.set_value(ds.value(1008)) self.assertEquals(cursor.update(), 0) if __name__ == '__main__': -- cgit v1.2.1 From 54909d4c49019e6d9d007d3783cb8f3dbbccba84 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 24 Mar 2017 01:02:52 -0400 Subject: WT-98 Update the current cursor value without a search (#3330) --- src/btree/bt_curnext.c | 11 +- src/btree/bt_curprev.c | 11 +- src/btree/bt_cursor.c | 324 +++++++++++++++++++++++++++---------- src/btree/bt_random.c | 7 +- src/btree/bt_ret.c | 21 ++- src/cursor/cur_file.c | 135 ++++++++-------- src/cursor/cur_join.c | 4 +- src/evict/evict_lru.c | 4 +- src/include/cursor.i | 25 +-- src/include/extern.h | 4 +- src/include/packing.i | 2 +- src/include/wiredtiger.in | 4 + src/log/log.c | 4 +- src/lsm/lsm_cursor.c | 35 ++-- src/lsm/lsm_merge.c | 2 +- src/lsm/lsm_meta.c | 2 +- src/lsm/lsm_stat.c | 4 +- src/schema/schema_create.c | 2 +- src/schema/schema_worker.c | 4 +- src/session/session_api.c | 7 +- src/session/session_compact.c | 2 +- src/txn/txn.c | 2 +- src/txn/txn_ckpt.c | 2 +- test/format/config.c | 47 +++--- test/format/ops.c | 362 ++++++++++++++++++++++++++++-------------- test/suite/test_truncate01.py | 1 + 26 files changed, 663 insertions(+), 365 deletions(-) diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index ba5fceae7c7..21e575ffca9 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -579,20 +579,20 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) { + WT_CURSOR *cursor; WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; + cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_next); WT_STAT_DATA_INCR(session, cursor_next); - flags = WT_READ_SKIP_INTL; /* Tree walk flags. 
*/ - if (truncating) - LF_SET(WT_READ_TRUNCATE); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_RET(__cursor_func_init(cbt, false)); @@ -608,6 +608,9 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) * found. Then, move to the next page, until we reach the end of the * file. */ + flags = WT_READ_SKIP_INTL; /* tree walk flags */ + if (truncating) + LF_SET(WT_READ_TRUNCATE); for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? NULL : cbt->ref->page; @@ -676,6 +679,8 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, true)); #endif + if (ret == 0) + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index 602c01b60eb..bf4bdad6529 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -535,20 +535,20 @@ new_insert: if ((ins = cbt->ins) != NULL) { int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) { + WT_CURSOR *cursor; WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; + cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_prev); WT_STAT_DATA_INCR(session, cursor_prev); - flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */ - if (truncating) - LF_SET(WT_READ_TRUNCATE); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_RET(__cursor_func_init(cbt, false)); @@ -564,6 +564,9 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) * found. Then, move to the previous page, until we reach the start * of the file. */ + flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* tree walk flags */ + if (truncating) + LF_SET(WT_READ_TRUNCATE); for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? 
NULL : cbt->ref->page; @@ -631,6 +634,8 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, false)); #endif + if (ret == 0) + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 48ae1ad6d76..944e276fc01 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -9,32 +9,46 @@ #include "wt_internal.h" /* - * WT_CURFILE_OP_XXX - * If we're going to return an error, we need to restore the cursor to - * a valid state, the upper-level cursor code is likely to retry. The macros - * here are called to save and restore that state. + * When returning an error, we need to restore the cursor to a valid state, the + * upper-level cursor code is likely to retry. This structure and the associated + * functions are used save and restore the cursor state. */ -#define WT_CURFILE_OP_DECL \ - WT_ITEM __key_copy; \ - WT_ITEM __value_copy; \ - uint64_t __recno; \ - uint32_t __flags -#define WT_CURFILE_OP_PUSH do { \ - WT_ITEM_SET(__key_copy, cursor->key); \ - WT_ITEM_SET(__value_copy, cursor->value); \ - __recno = cursor->recno; \ - __flags = cursor->flags; \ -} while (0) -#define WT_CURFILE_OP_POP do { \ - cursor->recno = __recno; \ - if (FLD_ISSET(__flags, WT_CURSTD_KEY_EXT)) \ - WT_ITEM_SET(cursor->key, __key_copy); \ - if (FLD_ISSET(__flags, WT_CURSTD_VALUE_EXT)) \ - WT_ITEM_SET(cursor->value, __value_copy); \ - F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \ - F_SET(cursor, \ - FLD_MASK(__flags, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT));\ -} while (0) +typedef struct { + WT_ITEM key; + WT_ITEM value; + uint64_t recno; + uint32_t flags; +} WT_CURFILE_STATE; + +/* + * __cursor_state_save -- + * Save the cursor's external state. 
+ */ +static inline void +__cursor_state_save(WT_CURSOR *cursor, WT_CURFILE_STATE *state) +{ + WT_ITEM_SET(state->key, cursor->key); + WT_ITEM_SET(state->value, cursor->value); + state->recno = cursor->recno; + state->flags = cursor->flags; +} + +/* + * __cursor_state_restore -- + * Restore the cursor's external state. + */ +static inline void +__cursor_state_restore(WT_CURSOR *cursor, WT_CURFILE_STATE *state) +{ + if (F_ISSET(state, WT_CURSTD_KEY_EXT)) + WT_ITEM_SET(cursor->key, state->key); + if (F_ISSET(state, WT_CURSTD_VALUE_EXT)) + WT_ITEM_SET(cursor->value, state->value); + cursor->recno = state->recno; + F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + F_SET(cursor, F_MASK(state, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT)); + +} /* * __cursor_page_pinned -- @@ -377,13 +391,17 @@ __cursor_row_modify( int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) { + WT_CURSOR *cursor; WT_SESSION_IMPL *session; + cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_reset); WT_STAT_DATA_INCR(session, cursor_reset); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + return (__cursor_reset(cbt)); } @@ -395,6 +413,7 @@ int __wt_btcur_search(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; + WT_CURFILE_STATE state; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; @@ -409,6 +428,15 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_STAT_CONN_INCR(session, cursor_search); WT_STAT_DATA_INCR(session, cursor_search); + __cursor_state_save(cursor, &state); + + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key, then re-save the cursor state. 
+ */ + WT_ERR(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + /* * If we have a page pinned, search it; if we don't have a page pinned, * or the search of the pinned page doesn't find an exact match, search @@ -443,6 +471,8 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) cbt->v = 0; cursor->value.data = &cbt->v; cursor->value.size = 1; + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); } else ret = WT_NOTFOUND; @@ -451,8 +481,10 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(__wt_cursor_key_order_init(session, cbt)); #endif -err: if (ret != 0) +err: if (ret != 0) { WT_TRET(__cursor_reset(cbt)); + __cursor_state_restore(cursor, &state); + } return (ret); } @@ -464,6 +496,7 @@ int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) { WT_BTREE *btree; + WT_CURFILE_STATE state; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; @@ -480,6 +513,15 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_STAT_CONN_INCR(session, cursor_search_near); WT_STAT_DATA_INCR(session, cursor_search_near); + __cursor_state_save(cursor, &state); + + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key, then re-save the cursor state. 
+ */ + WT_ERR(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + /* * If we have a row-store page pinned, search it; if we don't have a * page pinned, or the search of the pinned page doesn't find an exact @@ -544,6 +586,8 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) cursor->value.data = &cbt->v; cursor->value.size = 1; exact = 0; + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); } else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) exact = 1; else { @@ -558,15 +602,18 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) exact = -1; } +err: if (ret == 0 && exactp != NULL) + *exactp = exact; + #ifdef HAVE_DIAGNOSTIC if (ret == 0) - WT_ERR(__wt_cursor_key_order_init(session, cbt)); + WT_TRET(__wt_cursor_key_order_init(session, cbt)); #endif -err: if (ret != 0) + if (ret != 0) { WT_TRET(__cursor_reset(cbt)); - if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND)) - *exactp = exact; + __cursor_state_restore(cursor, &state); + } return (ret); } @@ -578,9 +625,11 @@ int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; + WT_CURFILE_STATE state; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; + bool append_key; btree = cbt->btree; cursor = &cbt->iface; @@ -591,6 +640,8 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_STAT_DATA_INCRV(session, cursor_insert_bytes, cursor->key.size + cursor->value.size); + __cursor_state_save(cursor, &state); + if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); @@ -598,7 +649,58 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) /* It's no longer possible to bulk-load into the tree. */ __cursor_disable_bulk(session, btree); -retry: WT_RET(__cursor_func_init(cbt, true)); + /* + * Insert a new record if WT_CURSTD_APPEND configured, (ignoring any + * application set record number). 
Although append can't be configured + * for a row-store, this code would break if it were, and that's owned + * by the upper cursor layer, be cautious. + */ + append_key = + F_ISSET(cursor, WT_CURSTD_APPEND) && btree->type != BTREE_ROW; + + /* + * If inserting with overwrite configured, and positioned to an on-page + * key, the update doesn't require another search. The cursor won't be + * positioned on a page with an external key set, but be sure. Cursors + * configured for append aren't included, regardless of whether or not + * they meet all other criteria. + */ + if (__cursor_page_pinned(cbt) && + F_ISSET_ALL(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_OVERWRITE) && + !append_key) { + WT_ERR(__wt_txn_autocommit_check(session)); + /* + * The cursor position may not be exact (the cursor's comparison + * value not equal to zero). Correct to an exact match so we can + * update whatever we're pointing at. + */ + cbt->compare = 0; + ret = btree->type == BTREE_ROW ? + __cursor_row_modify(session, cbt, false) : + __cursor_col_modify(session, cbt, false); + if (ret == 0) + goto done; + + /* + * The pinned page goes away if we fail for any reason, make + * sure there's a local copy of any key. (Restart could still + * use the pinned page, but that's an unlikely path.) Re-save + * the cursor state: we may retry but eventually fail. + */ + WT_TRET(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + goto err; + } + + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key. Re-save the cursor state: we may retry but + * eventually fail. 
+ */ + WT_ERR(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + +retry: WT_ERR(__cursor_func_init(cbt, true)); if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(session, cbt, NULL, true)); @@ -613,11 +715,11 @@ retry: WT_RET(__cursor_func_init(cbt, true)); ret = __cursor_row_modify(session, cbt, false); } else { /* - * If WT_CURSTD_APPEND is set, insert a new record (ignoring - * the application's record number). The real record number - * is assigned by the serialized append operation. + * Optionally insert a new record (ignoring the application's + * record number). The real record number is allocated by the + * serialized append operation. */ - if (F_ISSET(cursor, WT_CURSTD_APPEND)) + if (append_key) cbt->iface.recno = WT_RECNO_OOB; WT_ERR(__cursor_col_search(session, cbt, NULL)); @@ -634,7 +736,8 @@ retry: WT_RET(__cursor_func_init(cbt, true)); WT_ERR(WT_DUPLICATE_KEY); WT_ERR(__cursor_col_modify(session, cbt, false)); - if (F_ISSET(cursor, WT_CURSTD_APPEND)) + + if (append_key) cbt->iface.recno = cbt->recno; } @@ -644,8 +747,16 @@ err: if (ret == WT_RESTART) { goto retry; } - /* Insert doesn't maintain a position across calls, clear resources. */ +done: /* Insert doesn't maintain a position across calls, clear resources. */ + if (ret == 0) { + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if (append_key) + F_SET(cursor, WT_CURSTD_KEY_INT); + } WT_TRET(__cursor_reset(cbt)); + if (ret != 0) + __cursor_state_restore(cursor, &state); + return (ret); } @@ -681,16 +792,15 @@ __curfile_update_check(WT_CURSOR_BTREE *cbt) } /* - * __wt_btcur_update_check -- + * __wt_btcur_insert_check -- * Check whether an update would conflict. * - * This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so - * they only check for conflicts without updating the tree. It is used to - * maintain snapshot isolation for transactions that span multiple chunks - * in an LSM tree. 
+ * This can replace WT_CURSOR::insert, so it only checks for conflicts without + * updating the tree. It is used to maintain snapshot isolation for transactions + * that span multiple chunks in an LSM tree. */ int -__wt_btcur_update_check(WT_CURSOR_BTREE *cbt) +__wt_btcur_insert_check(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; @@ -701,14 +811,20 @@ __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) btree = cbt->btree; session = (WT_SESSION_IMPL *)cursor->session; -retry: WT_RET(__cursor_func_init(cbt, true)); + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key. Unlike most of the btree cursor routines, + * we don't have to save/restore the cursor key state, none of the + * work done here changes the key state. + */ + WT_ERR(__cursor_copy_int_key(cursor)); + +retry: WT_ERR(__cursor_func_init(cbt, true)); if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(session, cbt, NULL, true)); - /* - * Just check for conflicts. - */ + /* Just check for conflicts. */ ret = __curfile_update_check(cbt); } else WT_ERR(__wt_illegal_value(session, NULL)); @@ -720,7 +836,10 @@ err: if (ret == WT_RESTART) { } /* Insert doesn't maintain a position across calls, clear resources. 
*/ + if (ret == 0) + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_TRET(__cursor_reset(cbt)); + return (ret); } @@ -732,7 +851,7 @@ int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; - WT_CURFILE_OP_DECL; + WT_CURFILE_STATE state; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; @@ -742,26 +861,27 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt) cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; - WT_CURFILE_OP_PUSH; - WT_STAT_CONN_INCR(session, cursor_remove); WT_STAT_DATA_INCR(session, cursor_remove); WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size); + __cursor_state_save(cursor, &state); + /* * WT_CURSOR.remove has a unique semantic, the cursor stays positioned * if it starts positioned, otherwise clear the cursor on completion. */ positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT); -retry: /* - * If removing with overwrite configured, and positioned to an on-page - * key, the update doesn't require another search. The cursor won't be - * positioned on a page with an external key set, but be sure. + * If remove positioned to an on-page key, the remove doesn't require + * another search. We don't care about the "overwrite" configuration + * because regardless of the overwrite setting, any existing record is + * removed, and the record must exist with a positioned cursor. The + * cursor won't be positioned on a page with an external key set, but + * be sure. */ - if (__cursor_page_pinned(cbt) && - F_ISSET_ALL(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_OVERWRITE)) { + if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) { WT_ERR(__wt_txn_autocommit_check(session)); /* @@ -773,6 +893,8 @@ retry: ret = btree->type == BTREE_ROW ? __cursor_row_modify(session, cbt, true) : __cursor_col_modify(session, cbt, true); + if (ret == 0) + goto done; /* * The pinned page goes away if we fail for any reason, make @@ -780,12 +902,9 @@ retry: * use the pinned page, but that's an unlikely path.) 
Re-save * the cursor state: we may retry but eventually fail. */ - if (ret != 0) { - WT_TRET(__cursor_copy_int_key(cursor)); - WT_CURFILE_OP_PUSH; - goto err; - } - goto done; + WT_TRET(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + goto err; } /* @@ -794,9 +913,9 @@ retry: * eventually fail. */ WT_ERR(__cursor_copy_int_key(cursor)); - WT_CURFILE_OP_PUSH; + __cursor_state_save(cursor, &state); - WT_ERR(__cursor_func_init(cbt, true)); +retry: WT_ERR(__cursor_func_init(cbt, true)); if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(session, cbt, NULL, false)); @@ -857,14 +976,12 @@ done: /* */ if (ret == 0) F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - if (ret == 0 && positioned) { + if (ret == 0 && positioned) WT_TRET(__wt_key_return(session, cbt)); - if (ret == 0) - F_SET(cursor, WT_CURSTD_KEY_INT); - } else + else WT_TRET(__cursor_reset(cbt)); if (ret != 0) - WT_CURFILE_OP_POP; + __cursor_state_restore(cursor, &state); return (ret); } @@ -877,6 +994,7 @@ int __wt_btcur_update(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; + WT_CURFILE_STATE state; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; @@ -889,6 +1007,8 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_STAT_DATA_INCR(session, cursor_update); WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size); + __cursor_state_save(cursor, &state); + if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); @@ -896,7 +1016,48 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) /* It's no longer possible to bulk-load into the tree. */ __cursor_disable_bulk(session, btree); -retry: WT_RET(__cursor_func_init(cbt, true)); + /* + * If update positioned to an on-page key, the update doesn't require + * another search. We don't care about the "overwrite" configuration + * because regardless of the overwrite setting, any existing record is + * updated, and the record must exist with a positioned cursor. 
The + * cursor won't be positioned on a page with an external key set, but + * be sure. + */ + if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) { + WT_ERR(__wt_txn_autocommit_check(session)); + /* + * The cursor position may not be exact (the cursor's comparison + * value not equal to zero). Correct to an exact match so we can + * update whatever we're pointing at. + */ + cbt->compare = 0; + ret = btree->type == BTREE_ROW ? + __cursor_row_modify(session, cbt, false) : + __cursor_col_modify(session, cbt, false); + if (ret == 0) + goto done; + + /* + * The pinned page goes away if we fail for any reason, make + * sure there's a local copy of any key. (Restart could still + * use the pinned page, but that's an unlikely path.) Re-save + * the cursor state: we may retry but eventually fail. + */ + WT_TRET(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + goto err; + } + + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key. Re-save the cursor state: we may retry but + * eventually fail. + */ + WT_ERR(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + +retry: WT_ERR(__cursor_func_init(cbt, true)); if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(session, cbt, NULL, true)); @@ -945,11 +1106,14 @@ err: if (ret == WT_RESTART) { * To make this work, we add a field to the btree cursor to pass back a * pointer to the modify function's allocated update structure. */ - if (ret == 0) +done: if (ret == 0) WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update)); - if (ret != 0) + if (ret != 0) { WT_TRET(__cursor_reset(cbt)); + __cursor_state_restore(cursor, &state); + } + return (ret); } @@ -1097,14 +1261,6 @@ __cursor_truncate(WT_SESSION_IMPL *session, * and we can proceed without concern. 
*/ retry: WT_RET(__wt_btcur_search(start)); - - /* - * XXX KEITH - * When the btree cursor code sets/clears the cursor flags (rather than - * the cursor layer), the set/clear goes away, only the assert remains. - */ - F_CLR((WT_CURSOR *)start, WT_CURSTD_KEY_SET); - F_SET((WT_CURSOR *)start, WT_CURSTD_KEY_INT); WT_ASSERT(session, F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); @@ -1161,14 +1317,6 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session, * refresh the page's modification information. */ retry: WT_RET(__wt_btcur_search(start)); - - /* - * XXX KEITH - * When the btree cursor code sets/clears the cursor flags (rather than - * the cursor layer), the set/clear goes away, only the assert remains. - */ - F_CLR((WT_CURSOR *)start, WT_CURSTD_KEY_SET); - F_SET((WT_CURSOR *)start, WT_CURSTD_KEY_INT); WT_ASSERT(session, F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c index 25ede0a09ac..c5948ec4ab5 100644 --- a/src/btree/bt_random.c +++ b/src/btree/bt_random.c @@ -292,14 +292,16 @@ int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; + WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; wt_off_t size; uint64_t n, skip; - session = (WT_SESSION_IMPL *)cbt->iface.session; btree = cbt->btree; + cursor = &cbt->iface; + session = (WT_SESSION_IMPL *)cbt->iface.session; /* * Only supports row-store: applications can trivially select a random @@ -312,6 +314,8 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_STAT_CONN_INCR(session, cursor_next); WT_STAT_DATA_INCR(session, cursor_next); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + #ifdef HAVE_DIAGNOSTIC /* * Under some conditions we end up using the underlying cursor.next to @@ -320,7 +324,6 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) */ __wt_cursor_key_order_reset(cbt); #endif - /* * If we don't have a current position in the tree, or if retrieving * random values without 
sampling, pick a roughly random leaf page in diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index 9fc457e2297..f17fa1b85d1 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -142,8 +142,20 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) int __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { - WT_RET(__key_return(session, cbt)); + WT_CURSOR *cursor; + + cursor = &cbt->iface; + /* + * We may already have an internal key, in which case the cursor may + * not be set up to get another copy (for example, when we rely on a + * search-function result). + */ + F_CLR(cursor, WT_CURSTD_KEY_EXT); + if (!F_ISSET(cursor, WT_CURSTD_KEY_INT)) { + WT_RET(__key_return(session, cbt)); + F_SET(cursor, WT_CURSTD_KEY_INT); + } return (0); } @@ -154,8 +166,15 @@ __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) { + WT_CURSOR *cursor; + + cursor = &cbt->iface; + WT_RET(__wt_key_return(session, cbt)); + + F_CLR(cursor, WT_CURSTD_VALUE_EXT); WT_RET(__value_return(session, cbt, upd)); + F_SET(cursor, WT_CURSTD_VALUE_INT); return (0); } diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index 274dc1e8f62..205afb607c3 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -8,29 +8,6 @@ #include "wt_internal.h" -/* - * WT_BTREE_CURSOR_SAVE_AND_RESTORE - * Save the cursor's key/value data/size fields, call an underlying btree - * function, and then consistently handle failure and success. 
- */ -#define WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, f, ret) do { \ - WT_ITEM __key_copy = (cursor)->key; \ - uint64_t __recno = (cursor)->recno; \ - WT_ITEM __value_copy = (cursor)->value; \ - if (((ret) = (f)) == 0) { \ - F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \ - F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \ - } else { \ - if (F_ISSET(cursor, WT_CURSTD_KEY_EXT)) { \ - (cursor)->recno = __recno; \ - WT_ITEM_SET((cursor)->key, __key_copy); \ - } \ - if (F_ISSET(cursor, WT_CURSTD_VALUE_EXT)) \ - WT_ITEM_SET((cursor)->value, __value_copy); \ - F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \ - } \ -} while (0) - /* * __curfile_compare -- * WT_CURSOR->compare method for the btree cursor type. @@ -109,9 +86,12 @@ __curfile_next(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, next, cbt->btree); - F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - if ((ret = __wt_btcur_next(cbt, false)) == 0) - F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + WT_ERR(__wt_btcur_next(cbt, false)); + + /* Next maintains a position, key and value. */ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: API_END_RET(session, ret); } @@ -131,9 +111,12 @@ __wt_curfile_next_random(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, next, cbt->btree); - F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - if ((ret = __wt_btcur_next_random(cbt)) == 0) - F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + WT_ERR(__wt_btcur_next_random(cbt)); + + /* Next-random maintains a position, key and value. 
*/ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: API_END_RET(session, ret); } @@ -152,9 +135,12 @@ __curfile_prev(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, prev, cbt->btree); - F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - if ((ret = __wt_btcur_prev(cbt, false)) == 0) - F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + WT_ERR(__wt_btcur_prev(cbt, false)); + + /* Prev maintains a position, key and value. */ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: API_END_RET(session, ret); } @@ -175,7 +161,10 @@ __curfile_reset(WT_CURSOR *cursor) ret = __wt_btcur_reset(cbt); - F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + /* Reset maintains no position, key or value. */ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == 0 && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == 0); err: API_END_RET(session, ret); } @@ -194,10 +183,15 @@ __curfile_search(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, search, cbt->btree); - WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_NOVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_search(cbt), ret); + WT_ERR(__wt_btcur_search(cbt)); + + /* Search maintains a position, key and value. 
*/ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: API_END_RET(session, ret); } @@ -216,11 +210,15 @@ __curfile_search_near(WT_CURSOR *cursor, int *exact) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, search_near, cbt->btree); - WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_NOVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE( - cursor, __wt_btcur_search_near(cbt, exact), ret); + WT_ERR(__wt_btcur_search_near(cbt, exact)); + + /* Search-near maintains a position, key and value. */ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: API_END_RET(session, ret); } @@ -238,38 +236,33 @@ __curfile_insert(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, insert, cbt->btree); + if (!F_ISSET(cursor, WT_CURSTD_APPEND)) - WT_CURSOR_NEEDKEY(cursor); - WT_CURSOR_NEEDVALUE(cursor); + WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_CHECKVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_insert(cbt), ret); + WT_ERR(__wt_btcur_insert(cbt)); /* - * Insert is the one cursor operation that doesn't end with the cursor - * pointing to an on-page item (except for column-store appends, where - * we are returning a key). That is, the application's cursor continues - * to reference the application's memory after a successful cursor call, - * which isn't true anywhere else. We don't want to have to explain that - * scoping corner case, so we reset the application's cursor so it can - * free the referenced memory and continue on without risking subsequent - * core dumps. + * Insert maintains no position, key or value (except for column-store + * appends, where we are returning a key). 
*/ - if (ret == 0) { - if (!F_ISSET(cursor, WT_CURSTD_APPEND)) - F_CLR(cursor, WT_CURSTD_KEY_INT); - F_CLR(cursor, WT_CURSTD_VALUE_INT); - } + WT_ASSERT(session, + (F_ISSET(cursor, WT_CURSTD_APPEND) && + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT) || + (!F_ISSET(cursor, WT_CURSTD_APPEND) && + F_MASK(cursor, WT_CURSTD_KEY_SET) == 0)); err: CURSOR_UPDATE_API_END(session, ret); return (ret); } /* - * __curfile_update -- - * WT_CURSOR->update method for the btree cursor type. + * __wt_curfile_insert_check -- + * WT_CURSOR->insert_check method for the btree cursor type. */ -static int -__curfile_update(WT_CURSOR *cursor) +int +__wt_curfile_insert_check(WT_CURSOR *cursor) { WT_CURSOR_BTREE *cbt; WT_DECL_RET; @@ -278,21 +271,21 @@ __curfile_update(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, update, cbt->btree); - WT_CURSOR_NEEDKEY(cursor); - WT_CURSOR_NEEDVALUE(cursor); + WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_NOVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_update(cbt), ret); + ret = __wt_btcur_insert_check(cbt); err: CURSOR_UPDATE_API_END(session, ret); return (ret); } /* - * __wt_curfile_update_check -- - * WT_CURSOR->update_check method for the btree cursor type. + * __curfile_update -- + * WT_CURSOR->update method for the btree cursor type. */ -int -__wt_curfile_update_check(WT_CURSOR *cursor) +static int +__curfile_update(WT_CURSOR *cursor) { WT_CURSOR_BTREE *cbt; WT_DECL_RET; @@ -301,11 +294,15 @@ __wt_curfile_update_check(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, update, cbt->btree); - WT_CURSOR_NEEDKEY(cursor); - WT_CURSOR_NOVALUE(cursor); + WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_CHECKVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE( - cursor, __wt_btcur_update_check(cbt), ret); + WT_ERR(__wt_btcur_update(cbt)); + + /* Update maintains a position, key and value. 
*/ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: CURSOR_UPDATE_API_END(session, ret); return (ret); diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c index 8df8e201173..6135132601b 100644 --- a/src/cursor/cur_join.c +++ b/src/cursor/cur_join.c @@ -974,8 +974,8 @@ __curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) { if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED) WT_ERR_MSG(session, EINVAL, - "join cursors with Bloom filters cannot be " - "used with read-uncommitted isolation"); + "join cursors with Bloom filters cannot be " + "used with read-uncommitted isolation"); if (je->bloom == NULL) { /* * Look for compatible filters to be shared, diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 84c9990832d..a957d245958 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -987,7 +987,7 @@ __evict_tune_workers(WT_SESSION_IMPL *session) if (conn->evict_tune_num_points >= conn->evict_tune_datapts_needed) { if ((conn->evict_tune_workers_best == conn->evict_threads.current_threads) && - (conn->evict_threads.current_threads < + (conn->evict_threads.current_threads < conn->evict_threads_max)) { /* * Keep adding workers. We will check again @@ -996,7 +996,7 @@ __evict_tune_workers(WT_SESSION_IMPL *session) conn->evict_tune_datapts_needed += WT_MIN(EVICT_TUNE_DATAPT_MIN, (conn->evict_threads_max - - conn->evict_threads.current_threads)/ + - conn->evict_threads.current_threads) / EVICT_TUNE_BATCH); } else { /* diff --git a/src/include/cursor.i b/src/include/cursor.i index 9cb9f5e7189..12044e0e228 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -75,23 +75,6 @@ __cursor_leave(WT_SESSION_IMPL *session) __wt_txn_read_last(session); } -/* - * __curfile_enter -- - * Activate a file cursor. 
- */ -static inline int -__curfile_enter(WT_CURSOR_BTREE *cbt) -{ - WT_SESSION_IMPL *session; - - session = (WT_SESSION_IMPL *)cbt->iface.session; - - if (!F_ISSET(cbt, WT_CBT_NO_TXN)) - WT_RET(__cursor_enter(session)); - F_SET(cbt, WT_CBT_ACTIVE); - return (0); -} - /* * __cursor_reset -- * Reset the cursor, it no longer holds any position. @@ -264,8 +247,12 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) /* If the transaction is idle, check that the cache isn't full. */ WT_RET(__wt_txn_idle_cache_check(session)); - if (!F_ISSET(cbt, WT_CBT_ACTIVE)) - WT_RET(__curfile_enter(cbt)); + /* Activate the file cursor. */ + if (!F_ISSET(cbt, WT_CBT_ACTIVE)) { + if (!F_ISSET(cbt, WT_CBT_NO_TXN)) + WT_RET(__cursor_enter(session)); + F_SET(cbt, WT_CBT_ACTIVE); + } /* * If this is an ordinary transactional cursor, make sure we are set up diff --git a/src/include/extern.h b/src/include/extern.h index c0aa21b7f4c..a7eb4b491a9 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -103,7 +103,7 @@ extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((wa extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -289,7 +289,7 @@ extern int __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const extern int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_curfile_next_random(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_curfile_update_check(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_curfile_insert_check(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_curjoin_joined(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/packing.i b/src/include/packing.i index d79afe6d4a2..6b4bcd49e04 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -198,7 +198,7 @@ next: if (pack->cur == pack->end) return (0); default: WT_RET_MSG(pack->session, EINVAL, - "Invalid type '%c' found in format '%.*s'", + "Invalid type '%c' found in format '%.*s'", pv->type, (int)(pack->end - pack->orig), pack->orig); } diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 7223aeae0f6..558e93d3de0 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1480,6 +1480,10 @@ struct __wt_session { * contains. * @snippet ex_all.c Truncate a range * + * Any specified cursors end with no position, and subsequent calls to + * the WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the + * beginning (end) of the table. + * * @param session the session handle * @param name the URI of the file or table to truncate * @param start optional cursor marking the first record discarded; diff --git a/src/log/log.c b/src/log/log.c index 05234619d32..1a27120710b 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -783,8 +783,8 @@ __log_openfile(WT_SESSION_IMPL *session, __wt_log_desc_byteswap(desc); if (desc->log_magic != WT_LOG_MAGIC) WT_PANIC_RET(session, WT_ERROR, - "log file %s corrupted: Bad magic number %" PRIu32, - (*fhp)->name, desc->log_magic); + "log file %s corrupted: Bad magic number %" PRIu32, + (*fhp)->name, desc->log_magic); if (desc->majorv > WT_LOG_MAJOR_VERSION || (desc->majorv == WT_LOG_MAJOR_VERSION && desc->minorv > WT_LOG_MINOR_VERSION)) diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 3f0b6df8eb0..0de39b38370 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -178,20 +178,12 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) if (reset) { WT_ASSERT(session, !F_ISSET(&clsm->iface, - WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT)); + WT_CURSTD_KEY_INT | 
WT_CURSTD_VALUE_INT)); WT_RET(__clsm_reset_cursors(clsm, NULL)); } for (;;) { - /* - * If the cursor looks up-to-date, check if the cache is full. - * In case this call blocks, the check will be repeated before - * proceeding. - */ - if (clsm->dsk_gen != lsm_tree->dsk_gen && - lsm_tree->nchunks != 0) - goto open; - + /* Check if the cursor looks up-to-date. */ if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0) goto open; @@ -666,7 +658,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { */ if (i != nchunks - 1) clsm->chunks[i]->cursor->insert = - __wt_curfile_update_check; + __wt_curfile_insert_check; if (!F_ISSET(clsm, WT_CLSM_MERGE) && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) @@ -852,8 +844,8 @@ __clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) WT_ERR_MSG(session, EINVAL, "comparison method cursors must reference the same object"); - WT_CURSOR_NEEDKEY(a); - WT_CURSOR_NEEDKEY(b); + WT_CURSOR_CHECKKEY(a); + WT_CURSOR_CHECKKEY(b); WT_ERR(__wt_compare( session, alsm->lsm_tree->collator, &a->key, &b->key, cmpp)); @@ -1529,7 +1521,7 @@ __clsm_insert(WT_CURSOR *cursor) clsm = (WT_CURSOR_LSM *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL); - WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_NEEDVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, true)); @@ -1573,7 +1565,7 @@ __clsm_update(WT_CURSOR *cursor) clsm = (WT_CURSOR_LSM *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, update, NULL); - WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_NEEDVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, true)); @@ -1620,16 +1612,14 @@ __clsm_remove(WT_CURSOR *cursor) positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT); CURSOR_REMOVE_API_CALL(cursor, session, NULL); - WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_NOVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, true)); - if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) || - (ret = __clsm_lookup(clsm, &value)) == 0) - ret = __clsm_put( - session, clsm, &cursor->key, 
&__tombstone, positioned); - -err: __clsm_leave(clsm); + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) + WT_ERR(__clsm_lookup(clsm, &value)); + WT_ERR(__clsm_put( + session, clsm, &cursor->key, &__tombstone, positioned)); /* * If the cursor was positioned, it stays positioned with a key but no @@ -1643,6 +1633,7 @@ err: __clsm_leave(clsm); else WT_TRET(cursor->reset(cursor)); +err: __clsm_leave(clsm); CURSOR_UPDATE_API_END(session, ret); return (ret); } diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index a06b736bf0a..8838638f388 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -625,7 +625,7 @@ err: if (locked) else __wt_verbose(session, WT_VERB_LSM, "Merge failed with %s", - __wt_strerror(session, ret, NULL, 0)); + __wt_strerror(session, ret, NULL, 0)); } F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c index 46ead6d6ac4..fc4dde82470 100644 --- a/src/lsm/lsm_meta.c +++ b/src/lsm/lsm_meta.c @@ -229,7 +229,7 @@ __lsm_meta_read_v1( cv.len -= 2; } WT_ERR(__wt_config_check(session, - WT_CONFIG_REF(session, WT_SESSION_create), cv.str, cv.len)); + WT_CONFIG_REF(session, WT_SESSION_create), cv.str, cv.len)); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->bloom_config)); WT_ERR(__wt_config_getones( session, lsmconf, "lsm.bloom_hash_count", &cv)); diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index 21e8991be94..ed760b6d5f3 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -29,8 +29,8 @@ __curstat_lsm_init( const char *cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL, NULL }; const char *disk_cfg[] = { - WT_CONFIG_BASE(session, WT_SESSION_open_cursor), - "checkpoint=" WT_CHECKPOINT, NULL, NULL }; + WT_CONFIG_BASE(session, WT_SESSION_open_cursor), + "checkpoint=" WT_CHECKPOINT, NULL, NULL }; locked = false; WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree)); diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index 
020d5e72c13..a77ca51f9d2 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -35,7 +35,7 @@ __wt_direct_io_size_check(WT_SESSION_IMPL *session, * units of its happy place. */ if (FLD_ISSET(conn->direct_io, - WT_DIRECT_IO_CHECKPOINT | WT_DIRECT_IO_DATA)) { + WT_DIRECT_IO_CHECKPOINT | WT_DIRECT_IO_DATA)) { align = (int64_t)conn->buffer_alignment; if (align != 0 && (cval.val < align || cval.val % align != 0)) WT_RET_MSG(session, EINVAL, diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index e5f71b5d56f..62cdd7d367b 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -112,10 +112,10 @@ __wt_schema_worker(WT_SESSION_IMPL *session, wt_session = (WT_SESSION *)session; if (file_func == __wt_salvage && dsrc->salvage != NULL) WT_ERR(dsrc->salvage( - dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); + dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else if (file_func == __wt_verify && dsrc->verify != NULL) WT_ERR(dsrc->verify( - dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); + dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else if (file_func == __wt_checkpoint) ; else if (file_func == __wt_checkpoint_get_handles) diff --git a/src/session/session_api.c b/src/session/session_api.c index 3d13287fbe6..51233e5e224 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -1206,10 +1206,15 @@ __wt_session_range_truncate(WT_SESSION_IMPL *session, done: err: /* - * Close any locally-opened start cursor. + * Close any locally-opened start cursor. Reset application cursors, + * they've possibly moved and the application cannot use them. 
*/ if (local_start) WT_TRET(start->close(start)); + else + WT_TRET(start->reset(start)); + if (stop != NULL) + WT_TRET(stop->reset(stop)); return (ret); } diff --git a/src/session/session_compact.c b/src/session/session_compact.c index 85214ae6d98..72c072e0fb8 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -210,7 +210,7 @@ __compact_checkpoint(WT_SESSION_IMPL *session) * work we need to have done is done in the underlying block manager. */ const char *checkpoint_cfg[] = { - WT_CONFIG_BASE(session, WT_SESSION_checkpoint), "force=1", NULL }; + WT_CONFIG_BASE(session, WT_SESSION_checkpoint), "force=1", NULL }; /* Checkpoints take a lot of time, check if we've run out. */ WT_RET(__wt_session_compact_check_timeout(session)); diff --git a/src/txn/txn.c b/src/txn/txn.c index e5e59c2b901..6eebf5ecf9f 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -713,7 +713,7 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) snapshot_pinned = txn_global->nsnap_oldest_id; WT_STAT_SET(session, stats, txn_pinned_range, - txn_global->current - txn_global->oldest_id); + txn_global->current - txn_global->oldest_id); WT_STAT_SET(session, stats, txn_pinned_snapshot_range, snapshot_pinned == WT_TXN_NONE ? 
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 6c97922f7e1..5ec8aa19e4c 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -306,7 +306,7 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR)); WT_RET(__wt_metadata_cursor(session, &meta_cursor)); meta_cursor->set_key(meta_cursor, session->dhandle->name); - ret = __wt_curfile_update_check(meta_cursor); + ret = __wt_curfile_insert_check(meta_cursor); if (ret == WT_ROLLBACK) { metadata_race = true; ret = 0; diff --git a/test/format/config.c b/test/format/config.c index cd9856d641e..535dcd677e2 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -63,39 +63,42 @@ config_setup(void) config_in_memory(); /* - * Choose a data source type and a file type: they're interrelated (LSM - * trees are only compatible with row-store) and other items depend on - * them. + * Choose a file format and a data source: they're interrelated (LSM is + * only compatible with row-store) and other items depend on them. */ + if (!config_is_perm("file_type")) { + if (config_is_perm("data_source") && DATASOURCE("lsm")) + config_single("file_type=row", 0); + else + switch (mmrand(NULL, 1, 10)) { + case 1: /* 10% */ + config_single("file_type=fix", 0); + break; + case 2: case 3: case 4: /* 30% */ + config_single("file_type=var", 0); + break; /* 60% */ + case 5: case 6: case 7: case 8: case 9: case 10: + config_single("file_type=row", 0); + break; + } + } + config_map_file_type(g.c_file_type, &g.type); + if (!config_is_perm("data_source")) switch (mmrand(NULL, 1, 3)) { case 1: config_single("data_source=file", 0); break; case 2: - if (!g.c_in_memory) { - config_single("data_source=lsm", 0); - break; - } - /* FALLTHROUGH */ - case 3: config_single("data_source=table", 0); break; - } - - if (!config_is_perm("file_type")) - switch (DATASOURCE("lsm") ? 
5 : mmrand(NULL, 1, 10)) { - case 1: - config_single("file_type=fix", 0); - break; - case 2: case 3: case 4: - config_single("file_type=var", 0); - break; - case 5: case 6: case 7: case 8: case 9: case 10: - config_single("file_type=row", 0); + case 3: + if (g.c_in_memory || g.type != ROW) + config_single("data_source=table", 0); + else + config_single("data_source=lsm", 0); break; } - config_map_file_type(g.c_file_type, &g.type); /* * If data_source and file_type were both "permanent", we may still diff --git a/test/format/ops.c b/test/format/ops.c index 1013d1da30b..05457ebb5a0 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -28,14 +28,17 @@ #include "format.h" -static int col_insert(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t *); -static int col_remove(WT_CURSOR *, WT_ITEM *, uint64_t); -static int col_update(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t); +static int col_insert(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t *); +static int col_remove(WT_CURSOR *, WT_ITEM *, uint64_t, bool); +static int col_update( + TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t, bool); static int nextprev(WT_CURSOR *, int); static void *ops(void *); -static int row_insert(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t); -static int row_remove(WT_CURSOR *, WT_ITEM *, uint64_t); -static int row_update(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t); +static int row_insert( + TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t, bool); +static int row_remove(WT_CURSOR *, WT_ITEM *, uint64_t, bool); +static int row_update( + TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t, bool); static void table_append_init(void); #ifdef HAVE_BERKELEY_DB @@ -243,6 +246,9 @@ typedef struct { bool insert; /* Insert operation */ } SNAP_OPS; +#define SNAP_TRACK \ + (snap != NULL && (size_t)(snap - snap_list) < WT_ELEMENTS(snap_list)) + /* * snap_track -- * Add a single snapshot isolation returned value to the list. 
@@ -395,15 +401,16 @@ snap_check(WT_CURSOR *cursor, static void * ops(void *arg) { + enum { INSERT, READ, REMOVE, UPDATE } op; SNAP_OPS *snap, snap_list[64]; TINFO *tinfo; WT_CONNECTION *conn; - WT_CURSOR *cursor, *cursor_insert; + WT_CURSOR *cursor; WT_DECL_RET; WT_ITEM *key, _key, *value, _value; WT_SESSION *session; uint64_t keyno, ckpt_op, reset_op, session_op; - uint32_t op, rnd; + uint32_t rnd; u_int i; int dir; char *ckpt_config, ckpt_name[64]; @@ -429,9 +436,9 @@ ops(void *arg) val_gen_setup(&tinfo->rnd, value); /* Set the first operation where we'll create sessions and cursors. */ - session_op = 0; + cursor = NULL; session = NULL; - cursor = cursor_insert = NULL; + session_op = 0; /* Set the first operation where we'll perform checkpoint operations. */ ckpt_op = g.c_checkpoints ? mmrand(&tinfo->rnd, 100, 10000) : 0; @@ -485,24 +492,11 @@ ops(void *arg) readonly = true; } else { /* - * Open two cursors: one for overwriting and one - * for append (if it's a column-store). - * - * The reason is when testing with existing - * records, we don't track if a record was - * deleted or not, which means we must use - * cursor->insert with overwriting configured. - * But, in column-store files where we're - * testing with new, appended records, we don't - * want to have to specify the record number, - * which requires an append configuration. + * Configure "append", in the case of column + * stores, we append when inserting new rows. */ - testutil_check(session->open_cursor(session, - g.uri, NULL, "overwrite", &cursor)); - if (g.type == FIX || g.type == VAR) - testutil_check(session->open_cursor( - session, g.uri, - NULL, "append", &cursor_insert)); + testutil_check(session->open_cursor( + session, g.uri, NULL, "append", &cursor)); /* Pick the next session/cursor close/open. */ session_op += mmrand(&tinfo->rnd, 100, 5000); @@ -600,111 +594,174 @@ skip_checkpoint: /* Pick the next checkpoint operation. */ intxn = true; } + /* Select a row. 
*/ keyno = mmrand(&tinfo->rnd, 1, (u_int)g.rows); positioned = false; + /* Select an operation. */ + op = READ; + if (!readonly) { + i = mmrand(&tinfo->rnd, 1, 100); + if (i < g.c_delete_pct) + op = REMOVE; + else if (i < g.c_delete_pct + g.c_insert_pct) + op = INSERT; + else if (i < + g.c_delete_pct + g.c_insert_pct + g.c_write_pct) + op = UPDATE; + else + op = READ; + } + /* - * Perform some number of operations: the percentage of deletes, - * inserts and writes are specified, reads are the rest. The - * percentages don't have to add up to 100, a high percentage - * of deletes will mean fewer inserts and writes. Modifications - * are always followed by a read to confirm it worked. + * Inserts, removes and updates can be done following a cursor + * set-key, or based on a cursor position taken from a previous + * search. If not already doing a read, position the cursor at + * an existing point in the tree 20% of the time. */ - op = readonly ? UINT32_MAX : mmrand(&tinfo->rnd, 1, 100); - if (op < g.c_delete_pct) { - ++tinfo->remove; + positioned = false; + if (op != READ && mmrand(&tinfo->rnd, 1, 5) == 1) { + ++tinfo->search; + ret = read_row(cursor, key, value, keyno); + if (ret == 0) { + positioned = true; + if (SNAP_TRACK) + snap_track(snap++, keyno, NULL, value); + } else { + positioned = false; + if (ret == WT_ROLLBACK && intxn) + goto deadlock; + testutil_assert(ret == WT_NOTFOUND); + } + } +#if 0 + /* Optionally reserve a row. 
*/ + if (!readonly && intxn && mmrand(&tinfo->rnd, 0, 20) == 1) { switch (g.type) { case ROW: - ret = row_remove(cursor, key, keyno); + ret = + row_reserve(cursor, key, keyno, positioned); break; case FIX: case VAR: - ret = col_remove(cursor, key, keyno); + ret = col_reserve(cursor, keyno, positioned); break; } if (ret == 0) { positioned = true; - if (snap != NULL && (size_t) - (snap - snap_list) < WT_ELEMENTS(snap_list)) + __wt_yield(); + } else { + positioned = false; + if (ret == WT_ROLLBACK && intxn) + goto deadlock; + testutil_assert(ret == WT_NOTFOUND); + } + } +#endif + /* Perform the operation. */ + switch (op) { + case REMOVE: + switch (g.type) { + case ROW: + ret = + row_remove(cursor, key, keyno, positioned); + break; + case FIX: + case VAR: + ret = + col_remove(cursor, key, keyno, positioned); + break; + } + if (ret == 0) { + ++tinfo->remove; + /* + * Don't set positioned: it's unchanged from the + * previous state, but not necessarily set. + */ + if (SNAP_TRACK) snap_track(snap++, keyno, NULL, NULL); } else { positioned = false; if (ret == WT_ROLLBACK && intxn) goto deadlock; + testutil_assert(ret == WT_NOTFOUND); } - } else if (op < g.c_delete_pct + g.c_insert_pct) { - ++tinfo->insert; + break; + case INSERT: switch (g.type) { case ROW: - key_gen_insert(&tinfo->rnd, key, keyno); - val_gen(&tinfo->rnd, value, keyno); - ret = row_insert(cursor, key, value, keyno); + ret = row_insert(tinfo, + cursor, key, value, keyno, positioned); break; case FIX: case VAR: /* - * We can only append so many new records, if - * we've reached that limit, update a record - * instead of doing an insert. + * We can only append so many new records, once + * we reach that limit, update a record instead + * of inserting. */ if (g.append_cnt >= g.append_max) - goto skip_insert; + goto update_instead_of_insert; - /* Insert, then reset the insert cursor. 
*/ - val_gen(&tinfo->rnd, value, g.rows + 1); ret = col_insert( - cursor_insert, key, value, &keyno); - testutil_check( - cursor_insert->reset(cursor_insert)); + tinfo, cursor, key, value, &keyno); break; } + + /* Insert never leaves the cursor positioned. */ positioned = false; if (ret == 0) { - if (snap != NULL && (size_t) - (snap - snap_list) < WT_ELEMENTS(snap_list)) + ++tinfo->insert; + if (SNAP_TRACK) snap_track(snap++, keyno, g.type == ROW ? key : NULL, value); - } else + } else { if (ret == WT_ROLLBACK && intxn) goto deadlock; - } else if ( - op < g.c_delete_pct + g.c_insert_pct + g.c_write_pct) { + testutil_assert(ret == 0); + } + break; + case UPDATE: +update_instead_of_insert: ++tinfo->update; + + /* Update the row. */ switch (g.type) { case ROW: - key_gen(key, keyno); - val_gen(&tinfo->rnd, value, keyno); - ret = row_update(cursor, key, value, keyno); + ret = row_update(tinfo, + cursor, key, value, keyno, positioned); break; case FIX: case VAR: -skip_insert: val_gen(&tinfo->rnd, value, keyno); - ret = col_update(cursor, key, value, keyno); + ret = col_update(tinfo, + cursor, key, value, keyno, positioned); break; } if (ret == 0) { positioned = true; - if (snap != NULL && (size_t) - (snap - snap_list) < WT_ELEMENTS(snap_list)) + if (SNAP_TRACK) snap_track(snap++, keyno, NULL, value); } else { positioned = false; if (ret == WT_ROLLBACK && intxn) goto deadlock; + testutil_assert(ret == 0); } - } else { + break; + case READ: ++tinfo->search; ret = read_row(cursor, key, value, keyno); if (ret == 0) { positioned = true; - if (snap != NULL && (size_t) - (snap - snap_list) < WT_ELEMENTS(snap_list)) + if (SNAP_TRACK) snap_track(snap++, keyno, NULL, value); } else { positioned = false; if (ret == WT_ROLLBACK && intxn) goto deadlock; + testutil_assert(ret == WT_NOTFOUND); } + break; } /* @@ -727,8 +784,8 @@ skip_insert: val_gen(&tinfo->rnd, value, keyno); testutil_check(cursor->reset(cursor)); /* - * If we're in a transaction, commit 40% of the time and - * 
rollback 10% of the time. + * Continue if not in a transaction, else add more operations + * to the transaction half the time. */ if (!intxn || (rnd = mmrand(&tinfo->rnd, 1, 10)) > 5) continue; @@ -741,6 +798,10 @@ skip_insert: val_gen(&tinfo->rnd, value, keyno); cursor, snap_list, snap, key, value)) == WT_ROLLBACK) goto deadlock; + /* + * If we're in a transaction, commit 40% of the time and + * rollback 10% of the time. + */ switch (rnd) { case 1: case 2: case 3: case 4: /* 40% */ testutil_check( @@ -1040,27 +1101,94 @@ nextprev(WT_CURSOR *cursor, int next) return (ret); } +#if 0 +/* + * row_reserve -- + * Reserve a row in a row-store file. + */ +static int +row_reserve(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, bool positioned) +{ + WT_DECL_RET; + + if (!positioned) { + key_gen(key, keyno); + cursor->set_key(cursor, key); + } + + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, cursor->session, + "%-10s{%.*s}", "reserve", (int)key->size, key->data); + + switch (ret = cursor->reserve(cursor)) { + case 0: + break; + case WT_CACHE_FULL: + case WT_ROLLBACK: + return (WT_ROLLBACK); + case WT_NOTFOUND: + return (WT_NOTFOUND); + default: + testutil_die(ret, + "row_reserve: reserve row %" PRIu64 " by key", keyno); + } + return (0); +} + +/* + * col_reserve -- + * Reserve a row in a column-store file. + */ +static int +col_reserve(WT_CURSOR *cursor, uint64_t keyno, bool positioned) +{ + WT_DECL_RET; + + if (!positioned) + cursor->set_key(cursor, keyno); + + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, cursor->session, + "%-10s%" PRIu64, "reserve", keyno); + + switch (ret = cursor->reserve(cursor)) { + case 0: + break; + case WT_CACHE_FULL: + case WT_ROLLBACK: + return (WT_ROLLBACK); + case WT_NOTFOUND: + return (WT_NOTFOUND); + default: + testutil_die(ret, "col_reserve: %" PRIu64, keyno); + } + return (0); +} +#endif + /* * row_update -- * Update a row in a row-store file. 
*/ static int -row_update(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno) +row_update(TINFO *tinfo, WT_CURSOR *cursor, + WT_ITEM *key, WT_ITEM *value, uint64_t keyno, bool positioned) { WT_DECL_RET; - WT_SESSION *session; - session = cursor->session; + if (!positioned) { + key_gen(key, keyno); + cursor->set_key(cursor, key); + } + val_gen(&tinfo->rnd, value, keyno); + cursor->set_value(cursor, value); - /* Log the operation */ if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf(g.wt_api, session, + (void)g.wt_api->msg_printf(g.wt_api, cursor->session, "%-10s{%.*s}, {%.*s}", "put", (int)key->size, key->data, (int)value->size, value->data); - cursor->set_key(cursor, key); - cursor->set_value(cursor, value); switch (ret = cursor->update(cursor)) { case 0: break; @@ -1086,32 +1214,32 @@ row_update(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno) * Update a row in a column-store file. */ static int -col_update(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno) +col_update(TINFO *tinfo, WT_CURSOR *cursor, + WT_ITEM *key, WT_ITEM *value, uint64_t keyno, bool positioned) { WT_DECL_RET; - WT_SESSION *session; - session = cursor->session; + if (!positioned) + cursor->set_key(cursor, keyno); + val_gen(&tinfo->rnd, value, keyno); + if (g.type == FIX) + cursor->set_value(cursor, *(uint8_t *)value->data); + else + cursor->set_value(cursor, value); - /* Log the operation */ if (g.logging == LOG_OPS) { if (g.type == FIX) - (void)g.wt_api->msg_printf(g.wt_api, session, + (void)g.wt_api->msg_printf(g.wt_api, cursor->session, "%-10s%" PRIu64 " {0x%02" PRIx8 "}", "update", keyno, ((uint8_t *)value->data)[0]); else - (void)g.wt_api->msg_printf(g.wt_api, session, + (void)g.wt_api->msg_printf(g.wt_api, cursor->session, "%-10s%" PRIu64 " {%.*s}", "update", keyno, (int)value->size, (char *)value->data); } - cursor->set_key(cursor, keyno); - if (g.type == FIX) - cursor->set_value(cursor, *(uint8_t *)value->data); - else - 
cursor->set_value(cursor, value); switch (ret = cursor->update(cursor)) { case 0: break; @@ -1238,22 +1366,29 @@ table_append(uint64_t keyno) * Insert a row in a row-store file. */ static int -row_insert(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno) +row_insert(TINFO *tinfo, WT_CURSOR *cursor, + WT_ITEM *key, WT_ITEM *value, uint64_t keyno, bool positioned) { WT_DECL_RET; - WT_SESSION *session; - session = cursor->session; + /* + * If we positioned the cursor already, it's a test of an update using + * the insert method. Otherwise, generate a unique key and insert. + */ + if (!positioned) { + key_gen_insert(&tinfo->rnd, key, keyno); + cursor->set_key(cursor, key); + } + val_gen(&tinfo->rnd, value, keyno); + cursor->set_value(cursor, value); /* Log the operation */ if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf(g.wt_api, session, + (void)g.wt_api->msg_printf(g.wt_api, cursor->session, "%-10s{%.*s}, {%.*s}", "insert", (int)key->size, key->data, (int)value->size, value->data); - cursor->set_key(cursor, key); - cursor->set_value(cursor, value); switch (ret = cursor->insert(cursor)) { case 0: break; @@ -1279,14 +1414,13 @@ row_insert(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno) * Insert an element in a column-store file. 
*/ static int -col_insert(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t *keynop) +col_insert(TINFO *tinfo, + WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t *keynop) { WT_DECL_RET; - WT_SESSION *session; uint64_t keyno; - session = cursor->session; - + val_gen(&tinfo->rnd, value, g.rows + 1); if (g.type == FIX) cursor->set_value(cursor, *(uint8_t *)value->data); else @@ -1307,12 +1441,12 @@ col_insert(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t *keynop) if (g.logging == LOG_OPS) { if (g.type == FIX) - (void)g.wt_api->msg_printf(g.wt_api, session, + (void)g.wt_api->msg_printf(g.wt_api, cursor->session, "%-10s%" PRIu64 " {0x%02" PRIx8 "}", "insert", keyno, ((uint8_t *)value->data)[0]); else - (void)g.wt_api->msg_printf(g.wt_api, session, + (void)g.wt_api->msg_printf(g.wt_api, cursor->session, "%-10s%" PRIu64 " {%.*s}", "insert", keyno, (int)value->size, (char *)value->data); @@ -1335,21 +1469,19 @@ col_insert(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t *keynop) * Remove an row from a row-store file. */ static int -row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno) +row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, bool positioned) { WT_DECL_RET; - WT_SESSION *session; - session = cursor->session; - - key_gen(key, keyno); + if (!positioned) { + key_gen(key, keyno); + cursor->set_key(cursor, key); + } - /* Log the operation */ if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf( - g.wt_api, session, "%-10s%" PRIu64, "remove", keyno); + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s%" PRIu64, "remove", keyno); - cursor->set_key(cursor, key); /* We use the cursor in overwrite mode, check for existence. */ if ((ret = cursor->search(cursor)) == 0) ret = cursor->remove(cursor); @@ -1385,19 +1517,17 @@ row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno) * Remove a row from a column-store file. 
*/ static int -col_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno) +col_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, bool positioned) { WT_DECL_RET; - WT_SESSION *session; - session = cursor->session; + if (!positioned) + cursor->set_key(cursor, keyno); - /* Log the operation */ if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf( - g.wt_api, session, "%-10s%" PRIu64, "remove", keyno); + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s%" PRIu64, "remove", keyno); - cursor->set_key(cursor, keyno); /* We use the cursor in overwrite mode, check for existence. */ if ((ret = cursor->search(cursor)) == 0) ret = cursor->remove(cursor); diff --git a/test/suite/test_truncate01.py b/test/suite/test_truncate01.py index 7d2b3862568..98b741ba6a4 100644 --- a/test/suite/test_truncate01.py +++ b/test/suite/test_truncate01.py @@ -128,6 +128,7 @@ class test_truncate_cursor_order(wttest.WiredTigerTestCase): msg = '/the start cursor position is after the stop cursor position/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.truncate(None, c1, c2, None), msg) + c1.set_key(ds.key(10)) c2.set_key(ds.key(20)) self.session.truncate(None, c1, c2, None) -- cgit v1.2.1 From 56fa32f25a0745b049789f31e7dd5128be9525a0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 24 Mar 2017 07:52:59 -0400 Subject: WT-98 Update the current cursor value without a search (#3346) * WT-98 Update the current cursor value without a search When running in-memory and insert/update fails, we should expect WT_ROLLBACK even when not running inside a transaction. * Order the operations alphabetically (they were ordered the way they were because of the order in which we used to choose operations, but that's no longer the case). 
--- test/format/ops.c | 86 +++++++++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/test/format/ops.c b/test/format/ops.c index 05457ebb5a0..5309edf81c0 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -660,33 +660,6 @@ skip_checkpoint: /* Pick the next checkpoint operation. */ #endif /* Perform the operation. */ switch (op) { - case REMOVE: - switch (g.type) { - case ROW: - ret = - row_remove(cursor, key, keyno, positioned); - break; - case FIX: - case VAR: - ret = - col_remove(cursor, key, keyno, positioned); - break; - } - if (ret == 0) { - ++tinfo->remove; - /* - * Don't set positioned: it's unchanged from the - * previous state, but not necessarily set. - */ - if (SNAP_TRACK) - snap_track(snap++, keyno, NULL, NULL); - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); - } - break; case INSERT: switch (g.type) { case ROW: @@ -718,7 +691,48 @@ skip_checkpoint: /* Pick the next checkpoint operation. */ } else { if (ret == WT_ROLLBACK && intxn) goto deadlock; - testutil_assert(ret == 0); + testutil_assert(ret == 0 || ret == WT_ROLLBACK); + } + break; + case READ: + ++tinfo->search; + ret = read_row(cursor, key, value, keyno); + if (ret == 0) { + positioned = true; + if (SNAP_TRACK) + snap_track(snap++, keyno, NULL, value); + } else { + positioned = false; + if (ret == WT_ROLLBACK && intxn) + goto deadlock; + testutil_assert(ret == WT_NOTFOUND); + } + break; + case REMOVE: + switch (g.type) { + case ROW: + ret = + row_remove(cursor, key, keyno, positioned); + break; + case FIX: + case VAR: + ret = + col_remove(cursor, key, keyno, positioned); + break; + } + if (ret == 0) { + ++tinfo->remove; + /* + * Don't set positioned: it's unchanged from the + * previous state, but not necessarily set. 
+ */ + if (SNAP_TRACK) + snap_track(snap++, keyno, NULL, NULL); + } else { + positioned = false; + if (ret == WT_ROLLBACK && intxn) + goto deadlock; + testutil_assert(ret == WT_NOTFOUND); } break; case UPDATE: @@ -745,21 +759,7 @@ update_instead_of_insert: positioned = false; if (ret == WT_ROLLBACK && intxn) goto deadlock; - testutil_assert(ret == 0); - } - break; - case READ: - ++tinfo->search; - ret = read_row(cursor, key, value, keyno); - if (ret == 0) { - positioned = true; - if (SNAP_TRACK) - snap_track(snap++, keyno, NULL, value); - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); + testutil_assert(ret == 0 || ret == WT_ROLLBACK); } break; } -- cgit v1.2.1 From 1ceddd4a972bf220db9585739e9fcb283d618da4 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 24 Mar 2017 08:16:21 -0400 Subject: WT-3136 bug fix: WiredTiger doesn't check sprintf calls for error return (#3340) * WT-3136 bug fix: WiredTiger doesn't check sprintf calls for error return Make a pass through the source base to check sprintf, snprintf, vsprintf and vsnprintf calls for errors. * A WiredTiger key is a uint64_t. Use sizeof(), don't hard-wire buffer sizes into the code. * More (u_int) vs. (uint64_t) fixes. * Use CONFIG_APPEND instead of FORMAT_APPEND, it makes more sense. * revert part of 4475ae9, there's an explicit allocation of the size of the buffer. * MSVC complaints: test\format\config.c(765): warning C4018: '<': signed/unsigned mismatch test\format\config.c(765): warning C4018: '>': signed/unsigned mismatch * Change Windows testing shim to correctly use __wt_snprintf * Change Windows test shim to use the __wt_XXX functions * MSDN's _vscprintf API returns the number of characters excluding the terminating nul byte, return that value. 
--- bench/wtperf/config.c | 11 +- bench/wtperf/idle_table_cycle.c | 4 +- bench/wtperf/misc.c | 4 +- bench/wtperf/track.c | 4 +- bench/wtperf/wtperf.c | 104 +++++++++--------- dist/filelist | 2 +- examples/c/ex_async.c | 6 +- examples/c/ex_backup.c | 28 ++--- examples/c/ex_encrypt.c | 4 +- examples/c/ex_log.c | 12 +-- examples/c/ex_sync.c | 20 ++-- src/block/block_ext.c | 4 +- src/bloom/bloom.c | 4 +- src/btree/bt_debug.c | 16 +-- src/config/config_api.c | 2 +- src/conn/conn_api.c | 9 +- src/cursor/cur_join.c | 15 +-- src/cursor/cur_json.c | 47 ++++---- src/cursor/cur_stat.c | 4 +- src/include/extern_posix.h | 3 +- src/include/extern_win.h | 3 +- src/include/misc.i | 91 ++++++++++++++++ src/include/os_windows.h | 22 ---- src/include/packing.i | 4 +- src/log/log.c | 7 +- src/lsm/lsm_stat.c | 4 +- src/os_common/filename.c | 4 +- src/os_common/os_errno.c | 2 +- src/os_common/os_fstream.c | 2 +- src/os_posix/os_snprintf.c | 27 +++++ src/os_posix/os_thread.c | 10 +- src/os_win/os_snprintf.c | 50 +++++++-- src/os_win/os_thread.c | 6 +- src/os_win/os_vsnprintf.c | 41 ------- src/schema/schema_create.c | 3 +- src/support/err.c | 119 ++++++++++---------- src/support/scratch.c | 9 +- src/utilities/util_backup.c | 15 +-- src/utilities/util_dump.c | 15 +-- src/utilities/util_load.c | 11 +- src/utilities/util_load_json.c | 23 ++-- src/utilities/util_main.c | 17 ++- src/utilities/util_misc.c | 5 +- src/utilities/util_stat.c | 5 +- src/utilities/util_verify.c | 7 +- src/utilities/util_write.c | 8 +- test/bloom/test_bloom.c | 4 +- test/checkpoint/checkpointer.c | 26 +++-- test/checkpoint/test_checkpoint.c | 4 +- test/checkpoint/workers.c | 22 ++-- test/csuite/wt1965_col_efficiency/main.c | 3 +- test/csuite/wt2246_col_append/main.c | 12 +-- test/csuite/wt2323_join_visibility/main.c | 34 +++--- test/csuite/wt2447_join_main_table/main.c | 12 ++- test/csuite/wt2592_join_schema/main.c | 9 +- test/csuite/wt2834_join_bloom_fix/main.c | 20 ++-- test/csuite/wt2853_perf/main.c | 15 +-- 
test/csuite/wt2909_checkpoint_integrity/main.c | 21 ++-- test/csuite/wt3120_filesys/main.c | 4 +- test/cursor_order/cursor_order.c | 8 +- test/cursor_order/cursor_order_file.c | 30 +++--- test/cursor_order/cursor_order_ops.c | 20 ++-- test/fops/file.c | 6 +- test/fops/t.c | 4 +- test/format/backup.c | 4 +- test/format/config.c | 23 ++-- test/format/ops.c | 10 +- test/format/rebalance.c | 16 +-- test/format/salvage.c | 25 +++-- test/format/util.c | 51 +++++---- test/format/wts.c | 143 ++++++++++++------------- test/manydbs/manydbs.c | 3 +- test/readonly/readonly.c | 45 ++++---- test/recovery/random-abort.c | 20 ++-- test/recovery/truncated-log.c | 14 +-- test/salvage/salvage.c | 26 ++--- test/thread/file.c | 35 +++--- test/thread/rw.c | 41 ++++--- test/thread/stats.c | 3 +- test/thread/t.c | 8 +- test/utility/misc.c | 8 +- test/utility/parse_opts.c | 6 +- test/utility/thread.c | 7 +- test/windows/windows_shim.h | 9 +- 84 files changed, 893 insertions(+), 671 deletions(-) create mode 100644 src/os_posix/os_snprintf.c delete mode 100644 src/os_win/os_vsnprintf.c diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c index 9eea99eeec4..e4eee66e4cb 100644 --- a/bench/wtperf/config.c +++ b/bench/wtperf/config.c @@ -438,14 +438,13 @@ config_opt(WTPERF *wtperf, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v) return (EINVAL); } strp = (char **)valueloc; - newlen = v->len + 1; if (*strp == NULL) begin = newstr = dstrdup(v->str); else { - newlen += strlen(*strp) + 1; - newstr = dcalloc(newlen, sizeof(char)); - snprintf(newstr, newlen, - "%s,%*s", *strp, (int)v->len, v->str); + newlen = strlen(*strp) + v->len + strlen(",") + 1; + newstr = dmalloc(newlen); + testutil_check(__wt_snprintf(newstr, newlen, + "%s,%.*s", *strp, (int)v->len, v->str)); /* Free the old value now we've copied it. 
*/ free(*strp); begin = &newstr[(newlen - 1) - v->len]; @@ -712,7 +711,7 @@ config_opt_name_value(WTPERF *wtperf, const char *name, const char *value) /* name="value" */ len = strlen(name) + strlen(value) + 4; optstr = dmalloc(len); - snprintf(optstr, len, "%s=\"%s\"", name, value); + testutil_check(__wt_snprintf(optstr, len, "%s=\"%s\"", name, value)); ret = config_opt_str(wtperf, optstr); free(optstr); return (ret); diff --git a/bench/wtperf/idle_table_cycle.c b/bench/wtperf/idle_table_cycle.c index bb44cfbde59..4387860cfb2 100644 --- a/bench/wtperf/idle_table_cycle.c +++ b/bench/wtperf/idle_table_cycle.c @@ -80,8 +80,8 @@ cycle_idle_tables(void *arg) } for (cycle_count = 0; wtperf->idle_cycle_run; ++cycle_count) { - snprintf(uri, sizeof(uri), - "%s_cycle%07d", wtperf->uris[0], cycle_count); + testutil_check(__wt_snprintf(uri, sizeof(uri), + "%s_cycle%07d", wtperf->uris[0], cycle_count)); /* Don't busy cycle in this loop. */ __wt_sleep(1, 0); diff --git a/bench/wtperf/misc.c b/bench/wtperf/misc.c index 24b3323a49a..0874794e01e 100644 --- a/bench/wtperf/misc.c +++ b/bench/wtperf/misc.c @@ -46,8 +46,8 @@ setup_log_file(WTPERF *wtperf) len = strlen(wtperf->monitor_dir) + strlen(opts->table_name) + strlen(".stat") + 2; fname = dmalloc(len); - snprintf(fname, len, - "%s/%s.stat", wtperf->monitor_dir, opts->table_name); + testutil_check(__wt_snprintf(fname, len, + "%s/%s.stat", wtperf->monitor_dir, opts->table_name)); if ((wtperf->logf = fopen(fname, "w")) == NULL) { ret = errno; fprintf(stderr, "%s: %s\n", fname, strerror(ret)); diff --git a/bench/wtperf/track.c b/bench/wtperf/track.c index 822bdaa4b4a..86a26120a6a 100644 --- a/bench/wtperf/track.c +++ b/bench/wtperf/track.c @@ -288,8 +288,8 @@ latency_print_single(WTPERF *wtperf, TRACK *total, const char *name) uint64_t cumops; char path[1024]; - snprintf(path, sizeof(path), - "%s/latency.%s", wtperf->monitor_dir, name); + testutil_check(__wt_snprintf(path, sizeof(path), + "%s/latency.%s", wtperf->monitor_dir, 
name)); if ((fp = fopen(path, "w")) == NULL) { lprintf(wtperf, errno, 0, "%s", path); return; diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 772dedac8c8..1eedaba4f32 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -551,7 +551,8 @@ worker(void *arg) goto err; } for (i = 0; i < opts->table_count_idle; i++) { - snprintf(buf, 512, "%s_idle%05d", wtperf->uris[0], (int)i); + testutil_check(__wt_snprintf( + buf, 512, "%s_idle%05d", wtperf->uris[0], (int)i)); if ((ret = session->open_cursor( session, buf, NULL, NULL, &tmp_cursor)) != 0) { lprintf(wtperf, ret, 0, @@ -1297,7 +1298,8 @@ monitor(void *arg) /* Open the logging file. */ len = strlen(wtperf->monitor_dir) + 100; path = dmalloc(len); - snprintf(path, len, "%s/monitor", wtperf->monitor_dir); + testutil_check(__wt_snprintf( + path, len, "%s/monitor", wtperf->monitor_dir)); if ((fp = fopen(path, "w")) == NULL) { lprintf(wtperf, errno, 0, "%s", path); goto err; @@ -1937,19 +1939,19 @@ create_uris(WTPERF *wtperf) /* If there is only one table, just use the base name. */ wtperf->uris[i] = dmalloc(len); if (opts->table_count == 1) - snprintf(wtperf->uris[i], - len, "table:%s", opts->table_name); + testutil_check(__wt_snprintf(wtperf->uris[i], + len, "table:%s", opts->table_name)); else - snprintf(wtperf->uris[i], - len, "table:%s%05d", opts->table_name, i); + testutil_check(__wt_snprintf(wtperf->uris[i], + len, "table:%s%05d", opts->table_name, i)); } /* Create the log-like-table URI. 
*/ len = strlen("table:") + strlen(opts->table_name) + strlen("_log_table") + 1; wtperf->log_table_uri = dmalloc(len); - snprintf( - wtperf->log_table_uri, len, "table:%s_log_table", opts->table_name); + testutil_check(__wt_snprintf(wtperf->log_table_uri, + len, "table:%s_log_table", opts->table_name)); } static int @@ -1971,7 +1973,8 @@ create_tables(WTPERF *wtperf) } for (i = 0; i < opts->table_count_idle; i++) { - snprintf(buf, 512, "%s_idle%05d", wtperf->uris[0], (int)i); + testutil_check(__wt_snprintf( + buf, 512, "%s_idle%05d", wtperf->uris[0], (int)i)); if ((ret = session->create( session, buf, opts->table_config)) != 0) { lprintf(wtperf, ret, 0, @@ -2000,8 +2003,9 @@ create_tables(WTPERF *wtperf) return (ret); } if (opts->index) { - snprintf(buf, 512, "index:%s:val_idx", - wtperf->uris[i] + strlen("table:")); + testutil_check(__wt_snprintf(buf, 512, + "index:%s:val_idx", + wtperf->uris[i] + strlen("table:"))); if ((ret = session->create( session, buf, "columns=(val)")) != 0) { lprintf(wtperf, ret, 0, @@ -2186,15 +2190,15 @@ start_all_runs(WTPERF *wtperf) */ len = strlen(wtperf->home) + 5; next_wtperf->home = dmalloc(len); - snprintf( - next_wtperf->home, len, "%s/D%02d", wtperf->home, (int)i); + testutil_check(__wt_snprintf( + next_wtperf->home, len, "%s/D%02d", wtperf->home, (int)i)); if (opts->create != 0) recreate_dir(next_wtperf->home); len = strlen(wtperf->monitor_dir) + 5; next_wtperf->monitor_dir = dmalloc(len); - snprintf(next_wtperf->monitor_dir, - len, "%s/D%02d", wtperf->monitor_dir, (int)i); + testutil_check(__wt_snprintf(next_wtperf->monitor_dir, + len, "%s/D%02d", wtperf->monitor_dir, (int)i)); if (opts->create != 0 && strcmp(next_wtperf->home, next_wtperf->monitor_dir) != 0) recreate_dir(next_wtperf->monitor_dir); @@ -2543,9 +2547,9 @@ main(int argc, char *argv[]) */ req_len = strlen(",async=(enabled=true,threads=)") + 4; wtperf->async_config = dmalloc(req_len); - snprintf(wtperf->async_config, req_len, + 
testutil_check(__wt_snprintf(wtperf->async_config, req_len, ",async=(enabled=true,threads=%" PRIu32 ")", - opts->async_threads); + opts->async_threads)); } if ((ret = config_compress(wtperf)) != 0) goto err; @@ -2578,10 +2582,10 @@ main(int argc, char *argv[]) sreq_len = strlen("session_max=") + 6; req_len += sreq_len; sess_cfg = dmalloc(sreq_len); - snprintf(sess_cfg, sreq_len, + testutil_check(__wt_snprintf(sess_cfg, sreq_len, "session_max=%" PRIu32, opts->session_count_idle + - wtperf->workers_cnt + opts->populate_threads + 10); + wtperf->workers_cnt + opts->populate_threads + 10)); } req_len += opts->in_memory ? strlen("in_memory=true") : 0; req_len += user_cconfig != NULL ? strlen(user_cconfig) : 0; @@ -2591,16 +2595,16 @@ main(int argc, char *argv[]) append_comma = ""; if (wtperf->async_config != NULL && strlen(wtperf->async_config) != 0) { - pos += (size_t)snprintf( - cc_buf + pos, req_len - pos, "%s%s", - append_comma, wtperf->async_config); + testutil_check(__wt_snprintf_len_incr( + cc_buf + pos, req_len - pos, &pos, "%s%s", + append_comma, wtperf->async_config)); append_comma = ","; } if (wtperf->compress_ext != NULL && strlen(wtperf->compress_ext) != 0) { - pos += (size_t)snprintf( - cc_buf + pos, req_len - pos, "%s%s", - append_comma, wtperf->compress_ext); + testutil_check(__wt_snprintf_len_incr( + cc_buf + pos, req_len - pos, &pos, "%s%s", + append_comma, wtperf->compress_ext)); append_comma = ","; } if (opts->in_memory) { @@ -2610,15 +2614,15 @@ main(int argc, char *argv[]) append_comma = ","; } if (sess_cfg != NULL && strlen(sess_cfg) != 0) { - pos += (size_t)snprintf( - cc_buf + pos, req_len - pos, "%s%s", - append_comma, sess_cfg); + testutil_check(__wt_snprintf_len_incr( + cc_buf + pos, req_len - pos, &pos, "%s%s", + append_comma, sess_cfg)); append_comma = ","; } if (user_cconfig != NULL && strlen(user_cconfig) != 0) { - pos += (size_t)snprintf( - cc_buf + pos, req_len - pos, "%s%s", - append_comma, user_cconfig); + 
testutil_check(__wt_snprintf_len_incr( + cc_buf + pos, req_len - pos, &pos, "%s%s", + append_comma, user_cconfig)); append_comma = ","; } @@ -2639,21 +2643,21 @@ main(int argc, char *argv[]) append_comma = ""; if (wtperf->compress_table != NULL && strlen(wtperf->compress_table) != 0) { - pos += (size_t)snprintf( - tc_buf + pos, req_len - pos, "%s%s", - append_comma, wtperf->compress_table); + testutil_check(__wt_snprintf_len_incr( + tc_buf + pos, req_len - pos, &pos, "%s%s", + append_comma, wtperf->compress_table)); append_comma = ","; } if (opts->index) { - pos += (size_t)snprintf( - tc_buf + pos, req_len - pos, "%s%s", - append_comma, INDEX_COL_NAMES); + testutil_check(__wt_snprintf_len_incr( + tc_buf + pos, req_len - pos, &pos, "%s%s", + append_comma, INDEX_COL_NAMES)); append_comma = ","; } if (user_tconfig != NULL && strlen(user_tconfig) != 0) { - pos += (size_t)snprintf( - tc_buf + pos, req_len - pos, "%s%s", - append_comma, user_tconfig); + testutil_check(__wt_snprintf_len_incr( + tc_buf + pos, req_len - pos, &pos, "%s%s", + append_comma, user_tconfig)); append_comma = ","; } @@ -2665,8 +2669,9 @@ main(int argc, char *argv[]) req_len = strlen(opts->table_config) + strlen(LOG_PARTIAL_CONFIG) + 1; wtperf->partial_config = dmalloc(req_len); - snprintf(wtperf->partial_config, req_len, "%s%s", - opts->table_config, LOG_PARTIAL_CONFIG); + testutil_check(__wt_snprintf( + wtperf->partial_config, req_len, "%s%s", + opts->table_config, LOG_PARTIAL_CONFIG)); } /* * Set the config for reopen. If readonly add in that string. 
@@ -2679,11 +2684,12 @@ main(int argc, char *argv[]) req_len = strlen(opts->conn_config) + 1; wtperf->reopen_config = dmalloc(req_len); if (opts->readonly) - snprintf(wtperf->reopen_config, req_len, "%s%s", - opts->conn_config, READONLY_CONFIG); + testutil_check(__wt_snprintf( + wtperf->reopen_config, req_len, "%s%s", + opts->conn_config, READONLY_CONFIG)); else - snprintf(wtperf->reopen_config, - req_len, "%s", opts->conn_config); + testutil_check(__wt_snprintf( + wtperf->reopen_config, req_len, "%s", opts->conn_config)); /* Sanity-check the configuration. */ if ((ret = config_sanity(wtperf)) != 0) @@ -2696,7 +2702,8 @@ main(int argc, char *argv[]) /* Write a copy of the config. */ req_len = strlen(wtperf->home) + strlen("/CONFIG.wtperf") + 1; path = dmalloc(req_len); - snprintf(path, req_len, "%s/CONFIG.wtperf", wtperf->home); + testutil_check(__wt_snprintf( + path, req_len, "%s/CONFIG.wtperf", wtperf->home)); config_opt_log(opts, path); free(path); @@ -2821,7 +2828,8 @@ recreate_dir(const char *name) len = strlen(name) * 2 + 100; buf = dmalloc(len); - (void)snprintf(buf, len, "rm -rf %s && mkdir %s", name, name); + testutil_check(__wt_snprintf( + buf, len, "rm -rf %s && mkdir %s", name, name)); testutil_checkfmt(system(buf), "system: %s", buf); free(buf); } diff --git a/dist/filelist b/dist/filelist index 3886035eaa9..5a3348b940a 100644 --- a/dist/filelist +++ b/dist/filelist @@ -133,6 +133,7 @@ src/os_posix/os_path.c POSIX_HOST src/os_posix/os_priv.c POSIX_HOST src/os_posix/os_setvbuf.c POSIX_HOST src/os_posix/os_sleep.c POSIX_HOST +src/os_posix/os_snprintf.c POSIX_HOST src/os_posix/os_thread.c POSIX_HOST src/os_posix/os_time.c POSIX_HOST src/os_posix/os_yield.c POSIX_HOST @@ -152,7 +153,6 @@ src/os_win/os_snprintf.c WINDOWS_HOST src/os_win/os_thread.c WINDOWS_HOST src/os_win/os_time.c WINDOWS_HOST src/os_win/os_utf8.c WINDOWS_HOST -src/os_win/os_vsnprintf.c WINDOWS_HOST src/os_win/os_winerr.c WINDOWS_HOST src/os_win/os_yield.c WINDOWS_HOST 
src/packing/pack_api.c diff --git a/examples/c/ex_async.c b/examples/c/ex_async.c index f7531a5c3d8..5cfafca0418 100644 --- a/examples/c/ex_async.c +++ b/examples/c/ex_async.c @@ -170,12 +170,12 @@ main(void) * an asynchronous insert. */ /*! [async set the operation's string key] */ - snprintf(k[i], sizeof(k), "key%d", i); + (void)snprintf(k[i], sizeof(k), "key%d", i); op->set_key(op, k[i]); /*! [async set the operation's string key] */ /*! [async set the operation's string value] */ - snprintf(v[i], sizeof(v), "value%d", i); + (void)snprintf(v[i], sizeof(v), "value%d", i); op->set_value(op, v[i]); /*! [async set the operation's string value] */ @@ -218,7 +218,7 @@ main(void) * Set the operation's string key and value, and then do * an asynchronous search. */ - snprintf(k[i], sizeof(k), "key%d", i); + (void)snprintf(k[i], sizeof(k), "key%d", i); op->set_key(op, k[i]); ret = op->search(op); /*! [async search] */ diff --git a/examples/c/ex_backup.c b/examples/c/ex_backup.c index 0697cbb3458..83cc9b22ecc 100644 --- a/examples/c/ex_backup.c +++ b/examples/c/ex_backup.c @@ -96,7 +96,7 @@ compare_backups(int i) if (i == 0) (void)strncpy(msg, "MAIN", sizeof(msg)); else - snprintf(msg, sizeof(msg), "%d", i); + (void)snprintf(msg, sizeof(msg), "%d", i); printf( "Iteration %s: Tables %s.%d and %s.%d %s\n", msg, full_out, i, incr_out, i, ret == 0 ? "identical" : "differ"); @@ -131,8 +131,8 @@ setup_directories(void) * For incremental backups we need 0-N. The 0 incremental * directory will compare with the original at the end. */ - snprintf(buf, sizeof(buf), "rm -rf %s.%d && mkdir %s.%d", - home_incr, i, home_incr, i); + (void)snprintf(buf, sizeof(buf), + "rm -rf %s.%d && mkdir %s.%d", home_incr, i, home_incr, i); if ((ret = system(buf)) != 0) { fprintf(stderr, "%s: failed ret %d\n", buf, ret); return (ret); @@ -142,8 +142,8 @@ setup_directories(void) /* * For full backups we need 1-N. 
*/ - snprintf(buf, sizeof(buf), "rm -rf %s.%d && mkdir %s.%d", - home_full, i, home_full, i); + (void)snprintf(buf, sizeof(buf), + "rm -rf %s.%d && mkdir %s.%d", home_full, i, home_full, i); if ((ret = system(buf)) != 0) { fprintf(stderr, "%s: failed ret %d\n", buf, ret); return (ret); @@ -164,8 +164,8 @@ add_work(WT_SESSION *session, int iter) * Perform some operations with individual auto-commit transactions. */ for (i = 0; i < MAX_KEYS; i++) { - snprintf(k, sizeof(k), "key.%d.%d", iter, i); - snprintf(v, sizeof(v), "value.%d.%d", iter, i); + (void)snprintf(k, sizeof(k), "key.%d.%d", iter, i); + (void)snprintf(v, sizeof(v), "value.%d.%d", iter, i); cursor->set_key(cursor, k); cursor->set_value(cursor, v); ret = cursor->insert(cursor); @@ -187,7 +187,7 @@ take_full_backup(WT_SESSION *session, int i) * directories. Otherwise only into the appropriate full directory. */ if (i != 0) { - snprintf(h, sizeof(h), "%s.%d", home_full, i); + (void)snprintf(h, sizeof(h), "%s.%d", home_full, i); hdir = h; } else hdir = home_incr; @@ -200,14 +200,15 @@ take_full_backup(WT_SESSION *session, int i) * Take a full backup into each incremental directory. */ for (j = 0; j < MAX_ITERATIONS; j++) { - snprintf(h, sizeof(h), "%s.%d", home_incr, j); + (void)snprintf(h, sizeof(h), + "%s.%d", home_incr, j); (void)snprintf(buf, sizeof(buf), "cp %s/%s %s/%s", home, filename, h, filename); ret = system(buf); } else { - snprintf(h, sizeof(h), "%s.%d", home_full, i); + (void)snprintf(h, sizeof(h), "%s.%d", home_full, i); (void)snprintf(buf, sizeof(buf), "cp %s/%s %s/%s", home, filename, hdir, filename); ret = system(buf); @@ -237,12 +238,12 @@ take_incr_backup(WT_SESSION *session, int i) * Copy into the 0 incremental directory and then each of the * incremental directories for this iteration and later. 
*/ - snprintf(h, sizeof(h), "%s.0", home_incr); + (void)snprintf(h, sizeof(h), "%s.0", home_incr); (void)snprintf(buf, sizeof(buf), "cp %s/%s %s/%s", home, filename, h, filename); ret = system(buf); for (j = i; j < MAX_ITERATIONS; j++) { - snprintf(h, sizeof(h), "%s.%d", home_incr, j); + (void)snprintf(h, sizeof(h), "%s.%d", home_incr, j); (void)snprintf(buf, sizeof(buf), "cp %s/%s %s/%s", home, filename, h, filename); ret = system(buf); @@ -270,7 +271,8 @@ main(void) int i, ret; char cmd_buf[256]; - snprintf(cmd_buf, sizeof(cmd_buf), "rm -rf %s && mkdir %s", home, home); + (void)snprintf(cmd_buf, sizeof(cmd_buf), + "rm -rf %s && mkdir %s", home, home); if ((ret = system(cmd_buf)) != 0) { fprintf(stderr, "%s: failed ret %d\n", cmd_buf, ret); return (EXIT_FAILURE); diff --git a/examples/c/ex_encrypt.c b/examples/c/ex_encrypt.c index 00dc66fc24d..1520bd286cd 100644 --- a/examples/c/ex_encrypt.c +++ b/examples/c/ex_encrypt.c @@ -507,12 +507,12 @@ main(void) * we decrypt on read. */ for (i = 0; i < MAX_KEYS; i++) { - snprintf(keybuf, sizeof(keybuf), "key%d", i); + (void)snprintf(keybuf, sizeof(keybuf), "key%d", i); c1->set_key(c1, keybuf); c2->set_key(c2, keybuf); nc->set_key(nc, keybuf); - snprintf(valbuf, sizeof(valbuf), "value%d", i); + (void)snprintf(valbuf, sizeof(valbuf), "value%d", i); c1->set_value(c1, valbuf); c2->set_value(c2, valbuf); nc->set_value(nc, valbuf); diff --git a/examples/c/ex_log.c b/examples/c/ex_log.c index fdbc39412ae..0d8fbf97233 100644 --- a/examples/c/ex_log.c +++ b/examples/c/ex_log.c @@ -291,8 +291,8 @@ main(void) char cmd_buf[256], k[16], v[16]; count_min = 0; - snprintf(cmd_buf, sizeof(cmd_buf), "rm -rf %s %s && mkdir %s %s", - home1, home2, home1, home2); + (void)snprintf(cmd_buf, sizeof(cmd_buf), + "rm -rf %s %s && mkdir %s %s", home1, home2, home1, home2); if ((ret = system(cmd_buf)) != 0) { fprintf(stderr, "%s: failed ret %d\n", cmd_buf, ret); return (EXIT_FAILURE); @@ -312,8 +312,8 @@ main(void) * Perform some operations with 
individual auto-commit transactions. */ for (record_count = 0, i = 0; i < MAX_KEYS; i++, record_count++) { - snprintf(k, sizeof(k), "key%d", i); - snprintf(v, sizeof(v), "value%d", i); + (void)snprintf(k, sizeof(k), "key%d", i); + (void)snprintf(v, sizeof(v), "value%d", i); cursor->set_key(cursor, k); cursor->set_value(cursor, v); ret = cursor->insert(cursor); @@ -324,8 +324,8 @@ main(void) * Perform some operations within a single transaction. */ for (i = MAX_KEYS; i < MAX_KEYS+5; i++, record_count++) { - snprintf(k, sizeof(k), "key%d", i); - snprintf(v, sizeof(v), "value%d", i); + (void)snprintf(k, sizeof(k), "key%d", i); + (void)snprintf(v, sizeof(v), "value%d", i); cursor->set_key(cursor, k); cursor->set_value(cursor, v); ret = cursor->insert(cursor); diff --git a/examples/c/ex_sync.c b/examples/c/ex_sync.c index 2c610b1e570..b2d74b52f7f 100644 --- a/examples/c/ex_sync.c +++ b/examples/c/ex_sync.c @@ -59,8 +59,8 @@ main(void) char cmd_buf[256], k[16], v[16]; const char *conf; - snprintf(cmd_buf, sizeof(cmd_buf), "rm -rf %s && mkdir %s", - home, home); + (void)snprintf(cmd_buf, sizeof(cmd_buf), + "rm -rf %s && mkdir %s", home, home); if ((ret = system(cmd_buf)) != 0) { fprintf(stderr, "%s: failed ret %d\n", cmd_buf, ret); return (EXIT_FAILURE); @@ -98,8 +98,8 @@ main(void) ret = session->commit_transaction(session, conf); ret = session->begin_transaction(session, NULL); } - snprintf(k, sizeof(k), "key%d", i); - snprintf(v, sizeof(v), "value%d", i); + (void)snprintf(k, sizeof(k), "key%d", i); + (void)snprintf(v, sizeof(v), "value%d", i); cursor->set_key(cursor, k); cursor->set_value(cursor, v); ret = cursor->insert(cursor); @@ -113,8 +113,8 @@ main(void) * Perform some operations within a single transaction. 
*/ for (i = MAX_KEYS; i < MAX_KEYS+5; i++, record_count++) { - snprintf(k, sizeof(k), "key%d", i); - snprintf(v, sizeof(v), "value%d", i); + (void)snprintf(k, sizeof(k), "key%d", i); + (void)snprintf(v, sizeof(v), "value%d", i); cursor->set_key(cursor, k); cursor->set_value(cursor, v); ret = cursor->insert(cursor); @@ -129,8 +129,8 @@ main(void) * Demonstrate using log_flush to force the log to disk. */ for (i = 0; i < MAX_KEYS; i++, record_count++) { - snprintf(k, sizeof(k), "key%d", record_count); - snprintf(v, sizeof(v), "value%d", record_count); + (void)snprintf(k, sizeof(k), "key%d", record_count); + (void)snprintf(v, sizeof(v), "value%d", record_count); cursor->set_key(cursor, k); cursor->set_value(cursor, v); ret = cursor->insert(cursor); @@ -138,8 +138,8 @@ main(void) ret = session->log_flush(session, "sync=on"); for (i = 0; i < MAX_KEYS; i++, record_count++) { - snprintf(k, sizeof(k), "key%d", record_count); - snprintf(v, sizeof(v), "value%d", record_count); + (void)snprintf(k, sizeof(k), "key%d", record_count); + (void)snprintf(v, sizeof(v), "value%d", record_count); cursor->set_key(cursor, k); cursor->set_value(cursor, v); ret = cursor->insert(cursor); diff --git a/src/block/block_ext.c b/src/block/block_ext.c index e9357d73d1d..da7a06d873d 100644 --- a/src/block/block_ext.c +++ b/src/block/block_ext.c @@ -1378,8 +1378,8 @@ __wt_block_extlist_init(WT_SESSION_IMPL *session, size = (name == NULL ? 0 : strlen(name)) + strlen(".") + (extname == NULL ? 0 : strlen(extname) + 1); WT_RET(__wt_calloc_def(session, size, &el->name)); - (void)snprintf(el->name, size, "%s.%s", - name == NULL ? "" : name, extname == NULL ? "" : extname); + WT_RET(__wt_snprintf(el->name, size, "%s.%s", + name == NULL ? "" : name, extname == NULL ? 
"" : extname)); el->offset = WT_BLOCK_INVALID_OFFSET; el->track_size = track_size; diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c index be3230437d3..b8d75678835 100644 --- a/src/bloom/bloom.c +++ b/src/bloom/bloom.c @@ -37,8 +37,8 @@ __bloom_init(WT_SESSION_IMPL *session, len += strlen(config); WT_ERR(__wt_calloc_def(session, len, &bloom->config)); /* Add the standard config at the end, so it overrides user settings. */ - (void)snprintf(bloom->config, len, - "%s,%s", config == NULL ? "" : config, WT_BLOOM_TABLE_CONFIG); + WT_ERR(__wt_snprintf(bloom->config, len, + "%s,%s", config == NULL ? "" : config, WT_BLOOM_TABLE_CONFIG)); bloom->session = session; diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 4989301468f..d3f02e29b90 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -64,7 +64,7 @@ __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v) const char *cfg[2] = { NULL, NULL }; char buf[256]; - snprintf(buf, sizeof(buf), "verbose=[%s]", v); + WT_RET(__wt_snprintf(buf, sizeof(buf), "verbose=[%s]", v)); cfg[0] = buf; return (__wt_verbose_config(session, cfg)); } @@ -87,6 +87,7 @@ __debug_hex_byte(WT_DBG *ds, uint8_t v) static int __dmsg_event(WT_DBG *ds, const char *fmt, ...) { + WT_DECL_RET; WT_ITEM *msg; WT_SESSION_IMPL *session; size_t len, space; @@ -107,8 +108,9 @@ __dmsg_event(WT_DBG *ds, const char *fmt, ...) p = (char *)msg->mem + msg->size; space = msg->memsize - msg->size; va_start(ap, fmt); - len = (size_t)vsnprintf(p, space, fmt, ap); + ret = __wt_vsnprintf_len_set(p, space, &len, fmt, ap); va_end(ap); + WT_RET(ret); /* Check if there was enough space. 
*/ if (len < space) { @@ -447,13 +449,14 @@ __debug_tree_shape_info(WT_PAGE *page) v = page->memory_footprint; if (v >= WT_GIGABYTE) - snprintf(buf, sizeof(buf), + (void)__wt_snprintf(buf, sizeof(buf), "(%p %" PRIu64 "G)", (void *)page, v / WT_GIGABYTE); else if (v >= WT_MEGABYTE) - snprintf(buf, sizeof(buf), + (void)__wt_snprintf(buf, sizeof(buf), "(%p %" PRIu64 "M)", (void *)page, v / WT_MEGABYTE); else - snprintf(buf, sizeof(buf), "(%p %" PRIu64 ")", (void *)page, v); + (void)__wt_snprintf(buf, sizeof(buf), + "(%p %" PRIu64 ")", (void *)page, v); return (buf); } @@ -838,7 +841,8 @@ __debug_page_col_var(WT_DBG *ds, WT_REF *ref) __wt_cell_unpack(cell, unpack); rle = __wt_cell_rle(unpack); } - snprintf(tag, sizeof(tag), "%" PRIu64 " %" PRIu64, recno, rle); + WT_RET(__wt_snprintf( + tag, sizeof(tag), "%" PRIu64 " %" PRIu64, recno, rle)); WT_RET( __debug_cell_data(ds, page, WT_PAGE_COL_VAR, tag, unpack)); diff --git a/src/config/config_api.c b/src/config/config_api.c index 9f70ba65e9b..88e173459f9 100644 --- a/src/config/config_api.c +++ b/src/config/config_api.c @@ -278,7 +278,7 @@ __wt_configure_method(WT_SESSION_IMPL *session, entry->method = (*epp)->method; len = strlen((*epp)->base) + strlen(",") + strlen(config) + 1; WT_ERR(__wt_calloc_def(session, len, &p)); - snprintf(p, len, "%s,%s", (*epp)->base, config); + WT_ERR(__wt_snprintf(p, len, "%s,%s", (*epp)->base, config)); entry->base = p; /* diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 124250a7a7d..68d45678965 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -1662,8 +1662,8 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR_MSG(session, EINVAL, "Creating a new database is incompatible with " "read-only configuration"); - len = (size_t)snprintf(buf, sizeof(buf), - "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING); + WT_ERR(__wt_snprintf_len_set(buf, sizeof(buf), &len, + "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING)); WT_ERR(__wt_write(session, fh, 
(wt_off_t)0, len, buf)); WT_ERR(__wt_fsync(session, fh, true)); } else { @@ -2250,10 +2250,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_scr_alloc(session, 0, &i3)); cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open_all); cfg[1] = NULL; - WT_ERR_TEST(snprintf(version, sizeof(version), + WT_ERR(__wt_snprintf(version, sizeof(version), "version=(major=%d,minor=%d)", - WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR) >= - (int)sizeof(version), ENOMEM); + WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR)); __conn_config_append(cfg, version); /* Ignore the base_config file if config_base_set is false. */ diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c index 6135132601b..80afaf798dc 100644 --- a/src/cursor/cur_join.c +++ b/src/cursor/cur_join.c @@ -185,7 +185,7 @@ __curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *iter, u_int entry_pos) size = strlen(to_dup->internal_uri) + 3; WT_ERR(__wt_calloc(session, size, 1, &uri)); - snprintf(uri, size, "%s()", to_dup->internal_uri); + WT_ERR(__wt_snprintf(uri, size, "%s()", to_dup->internal_uri)); if ((c = iter->cursor) == NULL || !WT_STREQ(c->uri, uri)) { iter->cursor = NULL; if (c != NULL) @@ -929,7 +929,7 @@ __curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, if ((proj = cjoin->projection) != NULL) { size = strlen(urimain) + strlen(proj) + 1; WT_ERR(__wt_calloc(session, size, 1, &mainbuf)); - snprintf(mainbuf, size, "%s%s", urimain, proj); + WT_ERR(__wt_snprintf(mainbuf, size, "%s%s", urimain, proj)); urimain = mainbuf; } WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config, @@ -1148,8 +1148,8 @@ __curjoin_open_main(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, newsize = strlen(cjoin->table->name) + idx->colconf.len + 1; WT_ERR(__wt_calloc(session, 1, newsize, &main_uri)); - snprintf(main_uri, newsize, "%s%.*s", - cjoin->table->name, (int)idx->colconf.len, idx->colconf.str); + WT_ERR(__wt_snprintf(main_uri, newsize, "%s%.*s", + 
cjoin->table->name, (int)idx->colconf.len, idx->colconf.str)); WT_ERR(__wt_open_cursor(session, main_uri, (WT_CURSOR *)cjoin, raw_cfg, &entry->main)); if (idx->extractor == NULL) { @@ -1162,7 +1162,8 @@ __curjoin_open_main(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, */ len = strlen(entry->main->value_format) + 3; WT_ERR(__wt_calloc(session, len, 1, &newformat)); - snprintf(newformat, len, "%s0x", entry->main->value_format); + WT_ERR(__wt_snprintf( + newformat, len, "%s0x", entry->main->value_format)); __wt_free(session, entry->main->value_format); entry->main->value_format = newformat; } @@ -1531,8 +1532,8 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, len = strlen(cindex->iface.key_format) + 3; WT_RET(__wt_calloc(session, len, 1, &entry->repack_format)); - snprintf(entry->repack_format, len, "%s0x", - cindex->iface.key_format); + WT_RET(__wt_snprintf(entry->repack_format, + len, "%s0x", cindex->iface.key_format)); } } return (0); diff --git a/src/cursor/cur_json.c b/src/cursor/cur_json.c index 0ad3c4f4201..e8ddb767863 100644 --- a/src/cursor/cur_json.c +++ b/src/cursor/cur_json.c @@ -8,8 +8,8 @@ #include "wt_internal.h" -static size_t __json_unpack_put(WT_SESSION_IMPL *, void *, u_char *, size_t, - WT_CONFIG_ITEM *); +static int __json_unpack_put( + WT_SESSION_IMPL *, void *, u_char *, size_t, WT_CONFIG_ITEM *, size_t *); static inline int __json_struct_size(WT_SESSION_IMPL *, const void *, size_t, const char *, WT_CONFIG_ITEM *, bool, size_t *); static inline int __json_struct_unpackv(WT_SESSION_IMPL *, const void *, size_t, @@ -61,22 +61,22 @@ static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *, * __json_unpack_put -- * Calculate the size of a packed byte string as formatted for JSON. 
*/ -static size_t +static int __json_unpack_put(WT_SESSION_IMPL *session, void *voidpv, - u_char *buf, size_t bufsz, WT_CONFIG_ITEM *name) + u_char *buf, size_t bufsz, WT_CONFIG_ITEM *name, size_t *retsizep) { WT_PACK_VALUE *pv; const u_char *p, *end; size_t s, n; pv = (WT_PACK_VALUE *)voidpv; - s = (size_t)snprintf((char *)buf, bufsz, "\"%.*s\" : ", - (int)name->len, name->str); + + WT_RET(__wt_snprintf_len_set( + (char *)buf, bufsz, &s, "\"%.*s\" : ", (int)name->len, name->str)); if (s <= bufsz) { bufsz -= s; buf += s; - } - else + } else bufsz = 0; switch (pv->type) { @@ -118,7 +118,8 @@ __json_unpack_put(WT_SESSION_IMPL *session, void *voidpv, } if (bufsz > 0) *buf++ = '"'; - return (s); + *retsizep += s; + return (0); case 'U': case 'u': s += 2; @@ -140,14 +141,17 @@ __json_unpack_put(WT_SESSION_IMPL *session, void *voidpv, } if (bufsz > 0) *buf++ = '"'; - return (s); + *retsizep += s; + return (0); case 'b': case 'h': case 'i': case 'l': case 'q': - return (s + - (size_t)snprintf((char *)buf, bufsz, "%" PRId64, pv->u.i)); + WT_RET(__wt_snprintf_len_incr( + (char *)buf, bufsz, &s, "%" PRId64, pv->u.i)); + *retsizep += s; + return (0); case 'B': case 't': case 'H': @@ -156,11 +160,14 @@ __json_unpack_put(WT_SESSION_IMPL *session, void *voidpv, case 'Q': case 'r': case 'R': - return (s + - (size_t)snprintf((char *)buf, bufsz, "%" PRId64, pv->u.u)); + WT_RET(__wt_snprintf_len_incr( + (char *)buf, bufsz, &s, "%" PRId64, pv->u.u)); + *retsizep += s; + return (0); } - __wt_err(session, EINVAL, "unknown pack-value type: %c", (int)pv->type); - return ((size_t)-1); + + WT_RET_MSG(session, EINVAL, + "unknown pack-value type: %c", (int)pv->type); } /* @@ -194,7 +201,8 @@ __json_struct_size(WT_SESSION_IMPL *session, const void *buffer, needcr = true; WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); WT_RET(__pack_name_next(&packname, &name)); - result += __json_unpack_put(session, &pv, NULL, 0, &name); + WT_RET( + __json_unpack_put(session, &pv, NULL, 0, &name, 
&result)); } if (ret == WT_NOTFOUND) ret = 0; @@ -243,8 +251,9 @@ __json_struct_unpackv(WT_SESSION_IMPL *session, needcr = true; WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); WT_RET(__pack_name_next(&packname, &name)); - jsize = __json_unpack_put(session, - (u_char *)&pv, jbuf, jbufsize, &name); + jsize = 0; + WT_RET(__json_unpack_put(session, + (u_char *)&pv, jbuf, jbufsize, &name, &jsize)); WT_ASSERT(session, jsize <= jbufsize); jbuf += jsize; jbufsize -= jsize; diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index c5ccdb1b649..0bff642370d 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -477,8 +477,8 @@ __curstat_join_desc(WT_CURSOR_STAT *cst, int slot, const char **resultp) len = strlen("join: ") + strlen(sgrp->desc_prefix) + strlen(static_desc) + 1; WT_RET(__wt_realloc(session, NULL, len, &cst->desc_buf)); - snprintf(cst->desc_buf, len, "join: %s%s", sgrp->desc_prefix, - static_desc); + WT_RET(__wt_snprintf( + cst->desc_buf, len, "join: %s%s", sgrp->desc_prefix, static_desc)); *resultp = cst->desc_buf; return (0); } diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h index fed7835ada1..57d94e392d1 100644 --- a/src/include/extern_posix.h +++ b/src/include/extern_posix.h @@ -24,8 +24,9 @@ extern bool __wt_has_priv(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden") extern void __wt_stream_set_line_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_stream_set_no_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void 
*arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_yield(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); diff --git a/src/include/extern_win.h b/src/include/extern_win.h index 0bfc821c7a6..43127a0c79f 100644 --- a/src/include/extern_win.h +++ b/src/include/extern_win.h @@ -22,9 +22,10 @@ extern bool __wt_has_priv(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden") extern void __wt_stream_set_line_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_stream_set_no_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_thread_id(char *buf, size_t buflen) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_to_utf16_string( WT_SESSION_IMPL *session, const char*utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_to_utf8_string( WT_SESSION_IMPL *session, const wchar_t*wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/misc.i b/src/include/misc.i index d5692a3f9cf..7040886cf82 100644 --- a/src/include/misc.i +++ b/src/include/misc.i @@ -86,3 +86,94 @@ __wt_verbose(WT_SESSION_IMPL *session, int flag, const char *fmt, ...) WT_UNUSED(fmt); #endif } + +/* + * __wt_snprintf -- + * snprintf convenience function, ignoring the returned size. + */ +static inline int +__wt_snprintf(char *buf, size_t size, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4))) +{ + WT_DECL_RET; + size_t len; + va_list ap; + + len = 0; + + va_start(ap, fmt); + ret = __wt_vsnprintf_len_incr(buf, size, &len, fmt, ap); + va_end(ap); + WT_RET(ret); + + /* It's an error if the buffer couldn't hold everything. */ + return (len >= size ? ERANGE : 0); +} + +/* + * __wt_vsnprintf -- + * vsnprintf convenience function, ignoring the returned size. + */ +static inline int +__wt_vsnprintf(char *buf, size_t size, const char *fmt, va_list ap) +{ + size_t len; + + len = 0; + + WT_RET(__wt_vsnprintf_len_incr(buf, size, &len, fmt, ap)); + + /* It's an error if the buffer couldn't hold everything. */ + return (len >= size ? ERANGE : 0); +} + +/* + * __wt_snprintf_len_set -- + * snprintf convenience function, setting the returned size. 
+ */ +static inline int +__wt_snprintf_len_set( + char *buf, size_t size, size_t *retsizep, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 4, 5))) +{ + WT_DECL_RET; + va_list ap; + + *retsizep = 0; + + va_start(ap, fmt); + ret = __wt_vsnprintf_len_incr(buf, size, retsizep, fmt, ap); + va_end(ap); + return (ret); +} + +/* + * __wt_vsnprintf_len_set -- + * vsnprintf convenience function, setting the returned size. + */ +static inline int +__wt_vsnprintf_len_set( + char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) +{ + *retsizep = 0; + + return (__wt_vsnprintf_len_incr(buf, size, retsizep, fmt, ap)); +} + +/* + * __wt_snprintf_len_incr -- + * snprintf convenience function, incrementing the returned size. + */ +static inline int +__wt_snprintf_len_incr( + char *buf, size_t size, size_t *retsizep, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 4, 5))) +{ + WT_DECL_RET; + va_list ap; + + va_start(ap, fmt); + ret = __wt_vsnprintf_len_incr(buf, size, retsizep, fmt, ap); + va_end(ap); + return (ret); +} diff --git a/src/include/os_windows.h b/src/include/os_windows.h index 65938ac9f17..c1e5f788dc6 100644 --- a/src/include/os_windows.h +++ b/src/include/os_windows.h @@ -43,16 +43,6 @@ typedef uint32_t u_int; typedef unsigned char u_char; typedef uint64_t u_long; -/* <= VS 2013 is not C99 compat */ -#if _MSC_VER < 1900 -#define snprintf _wt_snprintf - -_Check_return_opt_ int __cdecl _wt_snprintf( - _Out_writes_(_MaxCount) char * _DstBuf, - _In_ size_t _MaxCount, - _In_z_ _Printf_format_string_ const char * _Format, ...); -#endif - /* * Windows does have ssize_t * Python headers declare also though so we need to guard it @@ -61,18 +51,6 @@ _Check_return_opt_ int __cdecl _wt_snprintf( typedef int ssize_t; #endif -/* - * Provide a custom version of vsnprintf that returns the - * needed buffer length instead of -1 on truncation - */ -#define vsnprintf _wt_vsnprintf - -_Check_return_opt_ int __cdecl _wt_vsnprintf( - 
_Out_writes_(_MaxCount) char * _DstBuf, - _In_ size_t _MaxCount, - _In_z_ _Printf_format_string_ const char * _Format, - va_list _ArgList); - /* Provide a custom version of localtime_r */ struct tm *localtime_r(const time_t* timer, struct tm* result); diff --git a/src/include/packing.i b/src/include/packing.i index 6b4bcd49e04..0eadb2f2027 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -104,8 +104,8 @@ __pack_name_next(WT_PACK_NAME *pn, WT_CONFIG_ITEM *name) WT_CONFIG_ITEM ignore; if (pn->genname) { - (void)snprintf(pn->buf, sizeof(pn->buf), - (pn->iskey ? "key%d" : "value%d"), pn->count); + WT_RET(__wt_snprintf(pn->buf, sizeof(pn->buf), + (pn->iskey ? "key%d" : "value%d"), pn->count)); WT_CLEAR(*name); name->str = pn->buf; name->len = strlen(pn->buf); diff --git a/src/log/log.c b/src/log/log.c index 1a27120710b..5b24250fffc 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -2246,8 +2246,10 @@ __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) return (0); va_copy(ap_copy, ap); - len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1; + len = 1; + ret = __wt_vsnprintf_len_incr(NULL, 0, &len, fmt, ap_copy); va_end(ap_copy); + WT_RET(ret); WT_RET( __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec)); @@ -2264,7 +2266,8 @@ __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) rec_fmt, rectype)); logrec->size += (uint32_t)header_size; - (void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap); + WT_ERR(__wt_vsnprintf( + (char *)logrec->data + logrec->size, len, fmt, ap)); __wt_verbose(session, WT_VERB_LOG, "log_printf: %s", (char *)logrec->data + logrec->size); diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index ed760b6d5f3..411655878af 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -38,13 +38,13 @@ __curstat_lsm_init( /* Propagate all, fast and/or clear to the cursors we open. 
*/ if (cst->flags != 0) { - (void)snprintf(config, sizeof(config), + WT_ERR(__wt_snprintf(config, sizeof(config), "statistics=(%s%s%s%s)", F_ISSET(cst, WT_STAT_TYPE_ALL) ? "all," : "", F_ISSET(cst, WT_STAT_CLEAR) ? "clear," : "", !F_ISSET(cst, WT_STAT_TYPE_ALL) && F_ISSET(cst, WT_STAT_TYPE_FAST) ? "fast," : "", - F_ISSET(cst, WT_STAT_TYPE_SIZE) ? "size," : ""); + F_ISSET(cst, WT_STAT_TYPE_SIZE) ? "size," : "")); cfg[1] = disk_cfg[1] = config; } diff --git a/src/os_common/filename.c b/src/os_common/filename.c index 5aeb64bb51e..f803144a3fb 100644 --- a/src/os_common/filename.c +++ b/src/os_common/filename.c @@ -43,8 +43,8 @@ __wt_nfilename( else { len = strlen(S2C(session)->home) + 1 + namelen + 1; WT_RET(__wt_calloc(session, 1, len, &buf)); - snprintf(buf, len, "%s%s%.*s", S2C(session)->home, - __wt_path_separator(), (int)namelen, name); + WT_RET(__wt_snprintf(buf, len, "%s%s%.*s", S2C(session)->home, + __wt_path_separator(), (int)namelen, name)); *path = buf; } diff --git a/src/os_common/os_errno.c b/src/os_common/os_errno.c index a8e56b7f1aa..7ac89536e79 100644 --- a/src/os_common/os_errno.c +++ b/src/os_common/os_errno.c @@ -44,7 +44,7 @@ __wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen) * Fallback to a generic message. 
*/ if (session == NULL && - snprintf(errbuf, errlen, "error return: %d", error) > 0) + __wt_snprintf(errbuf, errlen, "error return: %d", error) == 0) return (errbuf); if (session != NULL && __wt_buf_fmt( session, &session->err, "error return: %d", error) == 0) diff --git a/src/os_common/os_fstream.c b/src/os_common/os_fstream.c index 5a368ea75e6..744da732d84 100644 --- a/src/os_common/os_fstream.c +++ b/src/os_common/os_fstream.c @@ -144,7 +144,7 @@ __fstream_printf( p = (char *)((uint8_t *)buf->mem + buf->size); WT_ASSERT(session, buf->memsize >= buf->size); space = buf->memsize - buf->size; - len = (size_t)vsnprintf(p, space, fmt, ap_copy); + WT_RET(__wt_vsnprintf_len_set(p, space, &len, fmt, ap_copy)); va_end(ap_copy); if (len < space) { diff --git a/src/os_posix/os_snprintf.c b/src/os_posix/os_snprintf.c new file mode 100644 index 00000000000..390e2e0334a --- /dev/null +++ b/src/os_posix/os_snprintf.c @@ -0,0 +1,27 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_vsnprintf_len_incr -- + * POSIX vsnprintf convenience function, incrementing the returned size. + */ +int +__wt_vsnprintf_len_incr( + char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + WT_DECL_RET; + + if ((ret = vsnprintf(buf, size, fmt, ap)) >= 0) { + *retsizep += (size_t)ret; + return (0); + } + return (__wt_errno()); +} diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c index 9bf36cc2686..85d43f10a33 100644 --- a/src/os_posix/os_thread.c +++ b/src/os_posix/os_thread.c @@ -45,7 +45,7 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) * __wt_thread_id -- * Fill in a printable version of the process and thread IDs. 
*/ -void +int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { @@ -57,10 +57,10 @@ __wt_thread_id(char *buf, size_t buflen) */ self = pthread_self(); #ifdef __sun - (void)snprintf(buf, buflen, - "%" PRIuMAX ":%u", (uintmax_t)getpid(), self); + return (__wt_snprintf(buf, buflen, + "%" PRIuMAX ":%u", (uintmax_t)getpid(), self)); #else - (void)snprintf(buf, buflen, - "%" PRIuMAX ":%p", (uintmax_t)getpid(), (void *)self); + return (__wt_snprintf(buf, buflen, + "%" PRIuMAX ":%p", (uintmax_t)getpid(), (void *)self)); #endif } diff --git a/src/os_win/os_snprintf.c b/src/os_win/os_snprintf.c index a6056ff9342..f3025b12a60 100644 --- a/src/os_win/os_snprintf.c +++ b/src/os_win/os_snprintf.c @@ -8,17 +8,47 @@ #include "wt_internal.h" -_Check_return_opt_ int __cdecl _wt_snprintf( - _Out_writes_(_MaxCount) char * _DstBuf, - _In_ size_t _MaxCount, - _In_z_ _Printf_format_string_ const char * _Format, ...) +/* + * __wt_vsnprintf_len_incr -- + * POSIX vsnprintf convenience function, incrementing the returned size. + */ +int +__wt_vsnprintf_len_incr( + char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) { - va_list args; - WT_DECL_RET; + int len; + + /* + * WiredTiger calls with length 0 to get the needed buffer size. Call + * the count only version in this case, _vsnprintf_s will invoke the + * invalid parameter handler if count is less than or equal to zero. + */ + if (size == 0) { + *retsizep += (size_t)_vscprintf(fmt, ap); + return (0); + } + + /* + * Additionally, the invalid parameter handler is invoked if buffer or + * format is a NULL pointer. + */ + if (buf == NULL || fmt == NULL) + return (EINVAL); + + /* + * If the storage required to store the data and a terminating null + * exceeds size, the invalid parameter handler is invoked, unless + * count is _TRUNCATE, in which case as much of the string as will + * fit in the buffer is written and -1 returned. 
+ */ + if ((len = _vsnprintf_s(buf, size, _TRUNCATE, fmt, ap)) >= 0) { + *retsizep += (size_t)len; + return (0); + } - va_start(args, _Format); - ret = _wt_vsnprintf(_DstBuf, _MaxCount, _Format, args); - va_end(args); + /* Return the buffer size required. */ + if (len == -1) + *retsizep += (size_t)_vscprintf(fmt, ap); - return (ret); + return (0); } diff --git a/src/os_win/os_thread.c b/src/os_win/os_thread.c index a34dff776b6..7442fb08a36 100644 --- a/src/os_win/os_thread.c +++ b/src/os_win/os_thread.c @@ -58,10 +58,10 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) * __wt_thread_id -- * Fill in a printable version of the process and thread IDs. */ -void +int __wt_thread_id(char *buf, size_t buflen) { - (void)snprintf(buf, buflen, + return (__wt_snprintf(buf, buflen, "%" PRIu64 ":%" PRIu64, - (uint64_t)GetCurrentProcessId(), (uint64_t)GetCurrentThreadId); + (uint64_t)GetCurrentProcessId(), (uint64_t)GetCurrentThreadId)); } diff --git a/src/os_win/os_vsnprintf.c b/src/os_win/os_vsnprintf.c deleted file mode 100644 index 63f96e79d5b..00000000000 --- a/src/os_win/os_vsnprintf.c +++ /dev/null @@ -1,41 +0,0 @@ -/*- - * Copyright (c) 2014-2016 MongoDB, Inc. - * Copyright (c) 2008-2014 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "wt_internal.h" - -_Check_return_opt_ int __cdecl _wt_vsnprintf( - _Out_writes_(_MaxCount) char * _DstBuf, - _In_ size_t _MaxCount, - _In_z_ _Printf_format_string_ const char * _Format, - va_list _ArgList) -{ - int len; - - /* - * WiredTiger will call with length 0 to get the needed buffer size - * We call the count only version in this case since vsnprintf_s assumes - * length is greater than zero or else it triggers the invalid_parameter - * handler. 
- */ - if (_MaxCount == 0) { - return _vscprintf(_Format, _ArgList); - } - - len = (size_t)_vsnprintf_s( - _DstBuf, _MaxCount, _TRUNCATE, _Format, _ArgList); - - /* - * The MSVC implementation returns -1 on truncation instead of what - * it would have written. We could let callers iteratively grow the - * buffer, or just ask us how big a buffer they would like. - */ - if (len == -1) - len = _vscprintf(_Format, _ArgList) + 1; - - return (len); -} diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index a77ca51f9d2..0677fa711a5 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -601,7 +601,8 @@ __create_table(WT_SESSION_IMPL *session, if (ncolgroups == 0) { cgsize = strlen("colgroup:") + strlen(tablename) + 1; WT_ERR(__wt_calloc_def(session, cgsize, &cgname)); - snprintf(cgname, cgsize, "colgroup:%s", tablename); + WT_ERR(__wt_snprintf( + cgname, cgsize, "colgroup:%s", tablename)); WT_ERR(__create_colgroup( session, cgname, exclusive, config)); } diff --git a/src/support/err.c b/src/support/err.c index 369997d38c0..57efde72b23 100644 --- a/src/support/err.c +++ b/src/support/err.c @@ -102,9 +102,10 @@ __handler_failure(WT_SESSION_IMPL *session, */ char s[256]; - (void)snprintf(s, sizeof(s), + if (__wt_snprintf(s, sizeof(s), "application %s event handler failed: %s", - which, __wt_strerror(session, error, NULL, 0)); + which, __wt_strerror(session, error, NULL, 0)) != 0) + return; /* * Use the error handler to report the failure, unless it was the error @@ -148,6 +149,23 @@ __wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler) session->event_handler = handler; } +#define WT_ERROR_APPEND(p, remain, ...) do { \ + size_t __len; \ + WT_ERR(__wt_snprintf_len_set(p, remain, &__len, __VA_ARGS__)); \ + if (__len > remain) \ + __len = remain; \ + p += __len; \ + remain -= __len; \ +} while (0) +#define WT_ERROR_APPEND_AP(p, remain, ...) 
do { \ + size_t __len; \ + WT_ERR(__wt_vsnprintf_len_set(p, remain, &__len, __VA_ARGS__)); \ + if (__len > remain) \ + __len = remain; \ + p += __len; \ + remain -= __len; \ +} while (0) + /* * __wt_eventv -- * Report a message to an event handler. @@ -161,9 +179,9 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, WT_DECL_RET; WT_SESSION *wt_session; struct timespec ts; - size_t len, remain, wlen; + size_t len, remain; const char *err, *prefix; - char *end, *p, tid[128]; + char *p, tid[128]; /* * We're using a stack buffer because we want error messages no matter @@ -174,6 +192,8 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * Buffer placed at the end of the stack in case snprintf overflows. */ char s[2048]; + p = s; + remain = sizeof(s); /* * !!! @@ -185,24 +205,8 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * first session, but if the allocation of the first session fails, for * example, we can end up here without a session.) */ - if (session == NULL) { - if (fprintf(stderr, - "WiredTiger Error%s%s: ", - error == 0 ? "" : ": ", - error == 0 ? "" : - __wt_strerror(session, error, NULL, 0)) < 0) - ret = EIO; - if (vfprintf(stderr, fmt, ap) < 0) - ret = EIO; - if (fprintf(stderr, "\n") < 0) - ret = EIO; - if (fflush(stderr) != 0) - ret = EIO; - return (ret); - } - - p = s; - end = s + sizeof(s); + if (session == NULL) + goto err; /* * We have several prefixes for the error message: a timestamp and the @@ -211,42 +215,24 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * followed by a colon. */ __wt_epoch(session, &ts); - __wt_thread_id(tid, sizeof(tid)); - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, "[%" PRIuMAX ":%" PRIuMAX "][%s]", + WT_ERR(__wt_thread_id(tid, sizeof(tid))); + WT_ERROR_APPEND(p, remain, + "[%" PRIuMAX ":%" PRIuMAX "][%s]", (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / WT_THOUSAND, tid); - p = wlen >= remain ? 
end : p + wlen; - if ((prefix = S2C(session)->error_prefix) != NULL) { - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, ", %s", prefix); - p = wlen >= remain ? end : p + wlen; - } + if ((prefix = S2C(session)->error_prefix) != NULL) + WT_ERROR_APPEND(p, remain, ", %s", prefix); prefix = session->dhandle == NULL ? NULL : session->dhandle->name; - if (prefix != NULL) { - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, ", %s", prefix); - p = wlen >= remain ? end : p + wlen; - } - if ((prefix = session->name) != NULL) { - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, ", %s", prefix); - p = wlen >= remain ? end : p + wlen; - } - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, ": "); - p = wlen >= remain ? end : p + wlen; - - if (file_name != NULL) { - remain = WT_PTRDIFF(end, p); - wlen = (size_t) - snprintf(p, remain, "%s, %d: ", file_name, line_number); - p = wlen >= remain ? end : p + wlen; - } + if (prefix != NULL) + WT_ERROR_APPEND(p, remain, ", %s", prefix); + if ((prefix = session->name) != NULL) + WT_ERROR_APPEND(p, remain, ", %s", prefix); + WT_ERROR_APPEND(p, remain, ": "); + + if (file_name != NULL) + WT_ERROR_APPEND(p, remain, "%s, %d: ", file_name, line_number); - remain = WT_PTRDIFF(end, p); - wlen = (size_t)vsnprintf(p, remain, fmt, ap); - p = wlen >= remain ? 
end : p + wlen; + WT_ERROR_APPEND_AP(p, remain, fmt, ap); if (error != 0) { /* @@ -261,10 +247,8 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, */ err = __wt_strerror(session, error, NULL, 0); len = strlen(err); - if (WT_PTRDIFF(p, s) < len || strcmp(p - len, err) != 0) { - remain = WT_PTRDIFF(end, p); - (void)snprintf(p, remain, ": %s", err); - } + if (WT_PTRDIFF(p, s) < len || strcmp(p - len, err) != 0) + WT_ERROR_APPEND(p, remain, ": %s", err); } /* @@ -279,7 +263,7 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * * If an application-specified error message handler fails, complain * using the default error handler. If the default error handler fails, - * there's nothing to do. + * fallback to stderr. */ wt_session = (WT_SESSION *)session; handler = session->event_handler; @@ -293,6 +277,21 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, __handler_failure(session, ret, "error", true); } + if (ret != 0) { +err: if (fprintf(stderr, + "WiredTiger Error%s%s: ", + error == 0 ? "" : ": ", + error == 0 ? "" : + __wt_strerror(session, error, NULL, 0)) < 0) + WT_TRET(EIO); + if (vfprintf(stderr, fmt, ap) < 0) + WT_TRET(EIO); + if (fprintf(stderr, "\n") < 0) + WT_TRET(EIO); + if (fflush(stderr) != 0) + WT_TRET(EIO); + } + return (ret); } @@ -376,7 +375,7 @@ info_msg(WT_SESSION_IMPL *session, const char *fmt, va_list ap) */ char s[2048]; - (void)vsnprintf(s, sizeof(s), fmt, ap); + WT_RET(__wt_vsnprintf(s, sizeof(s), fmt, ap)); wt_session = (WT_SESSION *)session; handler = session->event_handler; diff --git a/src/support/scratch.c b/src/support/scratch.c index 69987ebc852..485cea90e89 100644 --- a/src/support/scratch.c +++ b/src/support/scratch.c @@ -69,13 +69,16 @@ int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) 
WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4))) { + WT_DECL_RET; va_list ap; size_t len; for (;;) { va_start(ap, fmt); - len = (size_t)vsnprintf(buf->mem, buf->memsize, fmt, ap); + ret = __wt_vsnprintf_len_set( + buf->mem, buf->memsize, &len, fmt, ap); va_end(ap); + WT_RET(ret); /* Check if there was enough space. */ if (len < buf->memsize) { @@ -100,6 +103,7 @@ int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4))) { + WT_DECL_RET; va_list ap; size_t len, space; char *p; @@ -117,8 +121,9 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) p = (char *)((uint8_t *)buf->mem + buf->size); WT_ASSERT(session, buf->memsize >= buf->size); space = buf->memsize - buf->size; - len = (size_t)vsnprintf(p, space, fmt, ap); + ret = __wt_vsnprintf_len_set(p, space, &len, fmt, ap); va_end(ap); + WT_RET(ret); /* Check if there was enough space. */ if (len < space) { diff --git a/src/utilities/util_backup.c b/src/utilities/util_backup.c index 5dc9671fb45..f1b31f7621a 100644 --- a/src/utilities/util_backup.c +++ b/src/utilities/util_backup.c @@ -109,9 +109,14 @@ copy(WT_SESSION *session, const char *directory, const char *name) /* Build the target pathname. 
*/ len = strlen(directory) + strlen(name) + 2; - if ((to = malloc(len)) == NULL) - goto memerr; - (void)snprintf(to, len, "%s/%s", directory, name); + if ((to = malloc(len)) == NULL) { + fprintf(stderr, "%s: %s\n", progname, strerror(errno)); + return (1); + } + if ((ret = __wt_snprintf(to, len, "%s/%s", directory, name)) != 0) { + fprintf(stderr, "%s: %s\n", progname, strerror(ret)); + goto err; + } if (verbose && printf("Backing up %s/%s to %s\n", home, name, to) < 0) { fprintf(stderr, "%s: %s\n", progname, strerror(EIO)); @@ -126,11 +131,7 @@ copy(WT_SESSION *session, const char *directory, const char *name) fprintf(stderr, "%s/%s to %s: backup copy: %s\n", home, name, to, session->strerror(session, ret)); - if (0) { -memerr: fprintf(stderr, "%s: %s\n", progname, strerror(errno)); - } err: free(to); - return (ret); } diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index 947fa7bf9ef..238e2757099 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -259,14 +259,15 @@ dump_add_config(WT_SESSION *session, char **bufp, size_t *leftp, const char *fmt, ...) 
WT_GCC_FUNC_ATTRIBUTE((format (printf, 4, 5))) { - int n; + WT_DECL_RET; + size_t n; va_list ap; va_start(ap, fmt); - n = vsnprintf(*bufp, *leftp, fmt, ap); + ret = __wt_vsnprintf_len_set(*bufp, *leftp, &n, fmt, ap); va_end(ap); - if (n < 0) - return (util_err(session, EINVAL, NULL)); + if (ret != 0) + return (util_err(session, ret, NULL)); *bufp += n; *leftp -= (size_t)n; return (0); @@ -435,9 +436,9 @@ dump_table_parts_config(WT_SESSION *session, WT_CURSOR *cursor, len = strlen(entry) + strlen(name) + 1; if ((uriprefix = malloc(len)) == NULL) - return util_err(session, errno, NULL); - - snprintf(uriprefix, len, "%s%s", entry, name); + return (util_err(session, errno, NULL)); + if ((ret = __wt_snprintf(uriprefix, len, "%s%s", entry, name)) != 0) + return (util_err(session, ret, NULL)); /* * Search the file looking for column group and index key/value pairs: diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c index d31fa4c9d08..d2f00402217 100644 --- a/src/utilities/util_load.c +++ b/src/utilities/util_load.c @@ -120,10 +120,12 @@ load_dump(WT_SESSION *session) goto err; /* Open the insert cursor. */ - (void)snprintf(config, sizeof(config), + if ((ret = __wt_snprintf(config, sizeof(config), "dump=%s%s%s", hex ? "hex" : "print", - append ? ",append" : "", no_overwrite ? ",overwrite=false" : ""); + append ? ",append" : "", + no_overwrite ? ",overwrite=false" : "")) != 0) + return (util_err(session, ret, NULL)); if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { ret = util_err(session, ret, "%s: session.open_cursor", uri); @@ -472,6 +474,7 @@ config_update(WT_SESSION *session, char **list) static int config_rename(WT_SESSION *session, char **urip, const char *name) { + WT_DECL_RET; size_t len; char *buf, *p; @@ -490,7 +493,9 @@ config_rename(WT_SESSION *session, char **urip, const char *name) } *p = '\0'; p = strchr(p + 1, ':'); - snprintf(buf, len, "%s:%s%s", *urip, name, p == NULL ? 
"" : p); + if ((ret = __wt_snprintf( + buf, len, "%s:%s%s", *urip, name, p == NULL ? "" : p)) != 0) + return (util_err(session, ret, NULL)); *urip = buf; return (0); diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c index 1189d49a483..af5c2576b26 100644 --- a/src/utilities/util_load_json.c +++ b/src/utilities/util_load_json.c @@ -145,6 +145,7 @@ static int json_kvraw_append(WT_SESSION *session, JSON_INPUT_STATE *ins, const char *str, size_t len) { + WT_DECL_RET; size_t needsize; char *tmp; @@ -152,7 +153,9 @@ json_kvraw_append(WT_SESSION *session, needsize = strlen(ins->kvraw) + len + 2; if ((tmp = malloc(needsize)) == NULL) return (util_err(session, errno, NULL)); - snprintf(tmp, needsize, "%s %.*s", ins->kvraw, (int)len, str); + if ((ret = __wt_snprintf( + tmp, needsize, "%s %.*s", ins->kvraw, (int)len, str)) != 0) + return (util_err(session, ret, NULL)); free(ins->kvraw); ins->kvraw = tmp; } @@ -181,7 +184,7 @@ json_strdup(WT_SESSION *session, JSON_INPUT_STATE *ins, char **resultp) goto err; } resultlen += 1; - if ((result = (char *)malloc((size_t)resultlen)) == NULL) { + if ((result = malloc((size_t)resultlen)) == NULL) { ret = util_err(session, errno, NULL); goto err; } @@ -236,10 +239,13 @@ json_data(WT_SESSION *session, goto err; uri = clp->list[0]; - (void)snprintf(config, sizeof(config), + if ((ret = __wt_snprintf(config, sizeof(config), "dump=json%s%s", LF_ISSET(LOAD_JSON_APPEND) ? ",append" : "", - LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : ""); + LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? 
",overwrite=false" : "")) != 0) { + ret = util_err(session, ret, NULL); + goto err; + } if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { ret = util_err(session, ret, "%s: session.open_cursor", uri); @@ -256,7 +262,7 @@ json_data(WT_SESSION *session, nfield = 0; JSON_EXPECT(session, ins, '{'); if (ins->kvraw == NULL) { - if ((ins->kvraw = (char *)malloc(1)) == NULL) { + if ((ins->kvraw = malloc(1)) == NULL) { ret = util_err(session, errno, NULL); goto err; } @@ -358,8 +364,11 @@ json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags) while (json_peek(session, ins) == 's') { JSON_EXPECT(session, ins, 's'); tableuri = realloc(tableuri, ins->toklen); - snprintf(tableuri, ins->toklen, "%.*s", - (int)(ins->toklen - 2), ins->tokstart + 1); + if ((ret = __wt_snprintf(tableuri, ins->toklen, + "%.*s", (int)(ins->toklen - 2), ins->tokstart + 1)) != 0) { + ret = util_err(session, ret, NULL); + goto err; + } JSON_EXPECT(session, ins, ':'); if (!hasversion) { if (strcmp(tableuri, DUMP_JSON_VERSION_MARKER) != 0) { diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index 68e3b0f1bc5..2b4ef36081a 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -257,9 +257,13 @@ main(int argc, char *argv[]) (void)util_err(NULL, errno, NULL); goto err; } - (void)snprintf(p, len, "%s,%s,%s%s%s%s", + if ((ret = __wt_snprintf(p, len, "%s,%s,%s%s%s%s", config == NULL ? "" : config, - cmd_config == NULL ? "" : cmd_config, rec_config, p1, p2, p3); + cmd_config == NULL ? "" : cmd_config, + rec_config, p1, p2, p3)) != 0) { + (void)util_err(NULL, ret, NULL); + goto err; + } config = p; /* Open the database and a session. */ @@ -298,6 +302,7 @@ done: char * util_uri(WT_SESSION *session, const char *s, const char *type) { + WT_DECL_RET; size_t len; char *name; @@ -321,8 +326,12 @@ util_uri(WT_SESSION *session, const char *s, const char *type) * the default type for the operation. 
*/ if (strchr(s, ':') != NULL) - snprintf(name, len, "%s", s); + ret = __wt_snprintf(name, len, "%s", s); else - snprintf(name, len, "%s:%s", type, s); + ret = __wt_snprintf(name, len, "%s:%s", type, s); + if (ret != 0) { + (void)util_err(session, ret, NULL); + return (NULL); + } return (name); } diff --git a/src/utilities/util_misc.c b/src/utilities/util_misc.c index 0905bfa97be..e26185a0096 100644 --- a/src/utilities/util_misc.c +++ b/src/utilities/util_misc.c @@ -140,7 +140,10 @@ util_flush(WT_SESSION *session, const char *uri) if ((buf = malloc(len)) == NULL) return (util_err(session, errno, NULL)); - (void)snprintf(buf, len, "target=(\"%s\")", uri); + if ((ret = __wt_snprintf(buf, len, "target=(\"%s\")", uri)) != 0) { + free(buf); + return (util_err(session, ret, NULL)); + } ret = session->checkpoint(session, buf); free(buf); diff --git a/src/utilities/util_stat.c b/src/utilities/util_stat.c index 1b75d9ea8bf..0692afe2819 100644 --- a/src/utilities/util_stat.c +++ b/src/utilities/util_stat.c @@ -68,7 +68,10 @@ util_stat(WT_SESSION *session, int argc, char *argv[]) fprintf(stderr, "%s: %s\n", progname, strerror(errno)); goto err; } - snprintf(uri, urilen, "statistics:%s", objname); + if ((ret = __wt_snprintf(uri, urilen, "statistics:%s", objname)) != 0) { + fprintf(stderr, "%s: %s\n", progname, strerror(ret)); + goto err; + } if ((ret = session->open_cursor(session, uri, NULL, config, &cursor)) != 0) { diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c index d0587fcfc8c..ace1be7a5de 100644 --- a/src/utilities/util_verify.c +++ b/src/utilities/util_verify.c @@ -72,7 +72,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) ret = util_err(session, errno, NULL); goto err; } - snprintf(config, size, + if ((ret = __wt_snprintf(config, size, "%s%s%s%s%s%s%s", dump_address ? "dump_address," : "", dump_blocks ? "dump_blocks," : "", @@ -80,7 +80,10 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) dump_offsets != NULL ? 
"dump_offsets=[" : "", dump_offsets != NULL ? dump_offsets : "", dump_offsets != NULL ? "]," : "", - dump_pages ? "dump_pages," : ""); + dump_pages ? "dump_pages," : "")) != 0) { + (void)util_err(session, ret, NULL); + goto err; + } } if ((ret = session->verify(session, uri, config)) != 0) (void)util_err(session, ret, "session.verify: %s", uri); diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c index b931fad064d..1d3e6937f8d 100644 --- a/src/utilities/util_write.c +++ b/src/utilities/util_write.c @@ -54,8 +54,12 @@ util_write(WT_SESSION *session, int argc, char *argv[]) * Open the object; free allocated memory immediately to simplify * future error handling. */ - (void)snprintf(config, sizeof(config), "%s,%s", - append ? "append=true" : "", overwrite ? "overwrite=true" : ""); + if ((ret = __wt_snprintf(config, sizeof(config), "%s,%s", + append ? "append=true" : "", + overwrite ? "overwrite=true" : "")) != 0) { + free(uri); + return (util_err(session, ret, NULL)); + } if ((ret = session->open_cursor(session, uri, NULL, config, &cursor)) != 0) (void)util_err(session, ret, "%s: session.open_cursor", uri); diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c index bef509e01d8..b6299bbbadc 100644 --- a/test/bloom/test_bloom.c +++ b/test/bloom/test_bloom.c @@ -121,9 +121,9 @@ setup(void) * Open configuration -- put command line configuration options at the * end so they can override "standard" configuration. */ - snprintf(config, sizeof(config), + testutil_check(__wt_snprintf(config, sizeof(config), "create,error_prefix=\"%s\",cache_size=%" PRIu32 "MB,%s", - progname, g.c_cache, g.config_open == NULL ? "" : g.config_open); + progname, g.c_cache, g.config_open == NULL ? 
"" : g.config_open)); testutil_check(wiredtiger_open(NULL, NULL, config, &conn)); diff --git a/test/checkpoint/checkpointer.c b/test/checkpoint/checkpointer.c index ef49a9492ce..84d2765843a 100644 --- a/test/checkpoint/checkpointer.c +++ b/test/checkpoint/checkpointer.c @@ -74,7 +74,7 @@ checkpointer(void *arg) WT_UNUSED(arg); - __wt_thread_id(tid, sizeof(tid)); + testutil_check(__wt_thread_id(tid, sizeof(tid))); printf("checkpointer thread starting: tid: %s\n", tid); (void)real_checkpointer(); @@ -107,8 +107,9 @@ real_checkpointer(void) "WiredTigerCheckpoint", strlen("WiredTigerCheckpoint")) == 0) checkpoint_config = NULL; else { + testutil_check(__wt_snprintf( + _buf, sizeof(_buf), "name=%s", g.checkpoint_name)); checkpoint_config = _buf; - snprintf(checkpoint_config, 128, "name=%s", g.checkpoint_name); } while (g.running) { /* Execute a checkpoint */ @@ -147,7 +148,8 @@ verify_checkpoint(WT_SESSION *session) ret = t_ret = 0; key_count = 0; - snprintf(ckpt, 128, "checkpoint=%s", g.checkpoint_name); + testutil_check(__wt_snprintf( + ckpt, sizeof(ckpt), "checkpoint=%s", g.checkpoint_name)); cursors = calloc((size_t)g.ntables, sizeof(*cursors)); if (cursors == NULL) return (log_print_err("verify_checkpoint", ENOMEM, 1)); @@ -159,7 +161,8 @@ verify_checkpoint(WT_SESSION *session) */ if (g.cookies[i].type == LSM) continue; - snprintf(next_uri, 128, "table:__wt%04d", i); + testutil_check(__wt_snprintf( + next_uri, sizeof(next_uri), "table:__wt%04d", i)); if ((ret = session->open_cursor( session, next_uri, NULL, ckpt, &cursors[i])) != 0) { (void)log_print_err( @@ -296,7 +299,8 @@ diagnose_key_error( session = cursor1->session; key1_orig = key2_orig = 0; - snprintf(ckpt, 128, "checkpoint=%s", g.checkpoint_name); + testutil_check(__wt_snprintf( + ckpt, sizeof(ckpt), "checkpoint=%s", g.checkpoint_name)); /* Save the failed keys. 
*/ if (cursor1->get_key(cursor1, &key1_orig) != 0 || @@ -338,7 +342,8 @@ diagnose_key_error( * Now try opening new cursors on the checkpoints and see if we * get the same missing key via searching. */ - snprintf(next_uri, 128, "table:__wt%04d", index1); + testutil_check(__wt_snprintf( + next_uri, sizeof(next_uri), "table:__wt%04d", index1)); if (session->open_cursor(session, next_uri, NULL, ckpt, &c) != 0) return (1); c->set_key(c, key1_orig); @@ -350,7 +355,8 @@ diagnose_key_error( if (c->close(c) != 0) return (1); - snprintf(next_uri, 128, "table:__wt%04d", index2); + testutil_check(__wt_snprintf( + next_uri, sizeof(next_uri), "table:__wt%04d", index2)); if (session->open_cursor(session, next_uri, NULL, ckpt, &c) != 0) return (1); c->set_key(c, key1_orig); @@ -367,7 +373,8 @@ live_check: * Now try opening cursors on the live checkpoint to see if we get the * same missing key via searching. */ - snprintf(next_uri, 128, "table:__wt%04d", index1); + testutil_check(__wt_snprintf( + next_uri, sizeof(next_uri), "table:__wt%04d", index1)); if (session->open_cursor(session, next_uri, NULL, NULL, &c) != 0) return (1); c->set_key(c, key1_orig); @@ -376,7 +383,8 @@ live_check: if (c->close(c) != 0) return (1); - snprintf(next_uri, 128, "table:__wt%04d", index2); + testutil_check(__wt_snprintf( + next_uri, sizeof(next_uri), "table:__wt%04d", index2)); if (session->open_cursor(session, next_uri, NULL, NULL, &c) != 0) return (1); c->set_key(c, key2_orig); diff --git a/test/checkpoint/test_checkpoint.c b/test/checkpoint/test_checkpoint.c index c7132b433d2..e7e1a0b81a5 100644 --- a/test/checkpoint/test_checkpoint.c +++ b/test/checkpoint/test_checkpoint.c @@ -199,11 +199,11 @@ wt_connect(const char *config_open) testutil_make_work_dir(g.home); - snprintf(config, sizeof(config), + testutil_check(__wt_snprintf(config, sizeof(config), "create,statistics=(fast),error_prefix=\"%s\",cache_size=1GB%s%s", progname, config_open == NULL ? "" : ",", - config_open == NULL ? 
"" : config_open); + config_open == NULL ? "" : config_open)); if ((ret = wiredtiger_open( g.home, &event_handler, config, &g.conn)) != 0) diff --git a/test/checkpoint/workers.c b/test/checkpoint/workers.c index e4fe7bd1b29..82d1b8685c4 100644 --- a/test/checkpoint/workers.c +++ b/test/checkpoint/workers.c @@ -39,14 +39,12 @@ static int create_table(WT_SESSION *session, COOKIE *cookie) { int ret; - char *p, *end, config[128]; + char config[128]; - p = config; - end = config + sizeof(config); - p += snprintf(p, (size_t)(end - p), - "key_format=%s,value_format=S", cookie->type == COL ? "r" : "q"); - if (cookie->type == LSM) - (void)snprintf(p, (size_t)(end - p), ",type=lsm"); + testutil_check(__wt_snprintf(config, sizeof(config), + "key_format=%s,value_format=S,%s", + cookie->type == COL ? "r" : "q", + cookie->type == LSM ? ",type=lsm" : "")); if ((ret = session->create(session, cookie->uri, config)) != 0) if (ret != EEXIST) @@ -88,8 +86,9 @@ start_workers(table_type type) (table_type)((i % MAX_TABLE_TYPE) + 1); else g.cookies[i].type = type; - (void)snprintf(g.cookies[i].uri, 128, - "%s%04d", URI_BASE, g.cookies[i].id); + testutil_check(__wt_snprintf( + g.cookies[i].uri, sizeof(g.cookies[i].uri), + "%s%04d", URI_BASE, g.cookies[i].id)); /* Should probably be atomic to avoid races. 
*/ if ((ret = create_table(session, &g.cookies[i])) != 0) @@ -132,7 +131,8 @@ worker_op(WT_CURSOR *cursor, uint64_t keyno, u_int new_val) char valuebuf[64]; cursor->set_key(cursor, keyno); - (void)snprintf(valuebuf, sizeof(valuebuf), "%037u", new_val); + testutil_check(__wt_snprintf( + valuebuf, sizeof(valuebuf), "%037u", new_val)); cursor->set_value(cursor, valuebuf); if ((ret = cursor->insert(cursor)) != 0) { if (ret == WT_ROLLBACK) @@ -153,7 +153,7 @@ worker(void *arg) WT_UNUSED(arg); - __wt_thread_id(tid, sizeof(tid)); + testutil_check(__wt_thread_id(tid, sizeof(tid))); printf("worker thread starting: tid: %s\n", tid); (void)real_worker(); diff --git a/test/csuite/wt1965_col_efficiency/main.c b/test/csuite/wt1965_col_efficiency/main.c index a7235d81b31..e5b73d5e642 100644 --- a/test/csuite/wt1965_col_efficiency/main.c +++ b/test/csuite/wt1965_col_efficiency/main.c @@ -132,7 +132,8 @@ main(int argc, char *argv[]) testutil_check(opts->conn->open_session( opts->conn, NULL, NULL, &session)); - sprintf(table_format, "key_format=r,value_format="); + testutil_check(__wt_snprintf( + table_format, sizeof(table_format), "key_format=r,value_format=")); for (i = 0; i < NR_FIELDS; i++) strcat(table_format, "Q"); diff --git a/test/csuite/wt2246_col_append/main.c b/test/csuite/wt2246_col_append/main.c index 976e2269da6..9876582fffa 100644 --- a/test/csuite/wt2246_col_append/main.c +++ b/test/csuite/wt2246_col_append/main.c @@ -68,8 +68,8 @@ page_init(uint64_t n) else { if (recno % 3 == 0) ++vrecno; - snprintf(buf, - sizeof(buf), "%" PRIu64 " VALUE ------", vrecno); + testutil_check(__wt_snprintf(buf, + sizeof(buf), "%" PRIu64 " VALUE ------", vrecno)); cursor->set_value(cursor, buf); } testutil_check(cursor->insert(cursor)); @@ -112,19 +112,19 @@ main(int argc, char *argv[]) testutil_check(testutil_parse_opts(argc, argv, opts)); testutil_make_work_dir(opts->home); - snprintf(buf, sizeof(buf), + testutil_check(__wt_snprintf(buf, sizeof(buf), "create," "cache_size=%s," 
"eviction=(threads_max=5)," "statistics=(fast)", - opts->table_type == TABLE_FIX ? "500MB" : "2GB"); + opts->table_type == TABLE_FIX ? "500MB" : "2GB")); testutil_check(wiredtiger_open(opts->home, NULL, buf, &opts->conn)); testutil_check( opts->conn->open_session(opts->conn, NULL, NULL, &session)); - snprintf(buf, sizeof(buf), + testutil_check(__wt_snprintf(buf, sizeof(buf), "key_format=r,value_format=%s," "allocation_size=4K,leaf_page_max=64K", - opts->table_type == TABLE_FIX ? "8t" : "S"); + opts->table_type == TABLE_FIX ? "8t" : "S")); testutil_check(session->create(session, opts->uri, buf)); testutil_check(session->close(session, NULL)); diff --git a/test/csuite/wt2323_join_visibility/main.c b/test/csuite/wt2323_join_visibility/main.c index a61f707e008..617490fec4d 100644 --- a/test/csuite/wt2323_join_visibility/main.c +++ b/test/csuite/wt2323_join_visibility/main.c @@ -106,14 +106,18 @@ main(int argc, char *argv[]) tablename = strchr(opts->uri, ':'); testutil_assert(tablename != NULL); tablename++; - snprintf(sharedopts->posturi, sizeof(sharedopts->posturi), - "index:%s:post", tablename); - snprintf(sharedopts->baluri, sizeof(sharedopts->baluri), - "index:%s:bal", tablename); - snprintf(sharedopts->flaguri, sizeof(sharedopts->flaguri), - "index:%s:flag", tablename); - snprintf(sharedopts->joinuri, sizeof(sharedopts->joinuri), - "join:%s", opts->uri); + testutil_check(__wt_snprintf( + sharedopts->posturi, sizeof(sharedopts->posturi), + "index:%s:post", tablename)); + testutil_check(__wt_snprintf( + sharedopts->baluri, sizeof(sharedopts->baluri), + "index:%s:bal", tablename)); + testutil_check(__wt_snprintf( + sharedopts->flaguri, sizeof(sharedopts->flaguri), + "index:%s:flag", tablename)); + testutil_check(__wt_snprintf( + sharedopts->joinuri, sizeof(sharedopts->joinuri), + "join:%s", opts->uri)); testutil_check(wiredtiger_open(opts->home, NULL, "create,cache_size=1G", &opts->conn)); @@ -350,19 +354,21 @@ static void *thread_join(void *arg) 
balcur->set_key(balcur, 0); testutil_check(balcur->search(balcur)); if (sharedopts->bloom) - sprintf(cfg, "compare=lt,strategy=bloom,count=%d", - N_RECORDS); + testutil_check(__wt_snprintf(cfg, sizeof(cfg), + "compare=lt,strategy=bloom,count=%d", N_RECORDS)); else - sprintf(cfg, "compare=lt"); + testutil_check(__wt_snprintf( + cfg, sizeof(cfg), "compare=lt")); testutil_check(session->join(session, joincur, balcur, cfg)); flagcur->set_key(flagcur, 0); testutil_check(flagcur->search(flagcur)); if (sharedopts->bloom) - sprintf(cfg, "compare=eq,strategy=bloom,count=%d", - N_RECORDS); + testutil_check(__wt_snprintf(cfg, sizeof(cfg), + "compare=eq,strategy=bloom,count=%d", N_RECORDS)); else - sprintf(cfg, "compare=eq"); + testutil_check(__wt_snprintf( + cfg, sizeof(cfg), "compare=eq")); testutil_check(session->join(session, joincur, flagcur, cfg)); /* Expect no values returned */ diff --git a/test/csuite/wt2447_join_main_table/main.c b/test/csuite/wt2447_join_main_table/main.c index 1368e7c8c09..656cea04145 100644 --- a/test/csuite/wt2447_join_main_table/main.c +++ b/test/csuite/wt2447_join_main_table/main.c @@ -102,9 +102,12 @@ main(int argc, char *argv[]) tablename = strchr(opts->uri, ':'); testutil_assert(tablename != NULL); tablename++; - snprintf(index1uri, sizeof(index1uri), "index:%s:index1", tablename); - snprintf(index2uri, sizeof(index2uri), "index:%s:index2", tablename); - snprintf(joinuri, sizeof(joinuri), "join:%s", opts->uri); + testutil_check(__wt_snprintf( + index1uri, sizeof(index1uri), "index:%s:index1", tablename)); + testutil_check(__wt_snprintf( + index2uri, sizeof(index2uri), "index:%s:index2", tablename)); + testutil_check(__wt_snprintf( + joinuri, sizeof(joinuri), "join:%s", opts->uri)); testutil_check(wiredtiger_open(opts->home, NULL, "statistics=(all),create", &opts->conn)); @@ -150,7 +153,8 @@ main(int argc, char *argv[]) cursor2->set_key(cursor2, half + 1); testutil_check(cursor2->search(cursor2)); - sprintf(bloom_cfg, 
"compare=lt,strategy=bloom,count=%d", half); + testutil_check(__wt_snprintf(bloom_cfg, sizeof(bloom_cfg), + "compare=lt,strategy=bloom,count=%d", half)); testutil_check(session->open_cursor(session, joinuri, NULL, NULL, &jcursor)); diff --git a/test/csuite/wt2592_join_schema/main.c b/test/csuite/wt2592_join_schema/main.c index 0ec1c765d99..be3eff6136c 100644 --- a/test/csuite/wt2592_join_schema/main.c +++ b/test/csuite/wt2592_join_schema/main.c @@ -82,9 +82,12 @@ main(int argc, char *argv[]) tablename = strchr(opts->uri, ':'); testutil_assert(tablename != NULL); tablename++; - snprintf(countryuri, sizeof(countryuri), "index:%s:country", tablename); - snprintf(yearuri, sizeof(yearuri), "index:%s:year", tablename); - snprintf(joinuri, sizeof(joinuri), "join:%s", opts->uri); + testutil_check(__wt_snprintf( + countryuri, sizeof(countryuri), "index:%s:country", tablename)); + testutil_check(__wt_snprintf( + yearuri, sizeof(yearuri), "index:%s:year", tablename)); + testutil_check(__wt_snprintf( + joinuri, sizeof(joinuri), "join:%s", opts->uri)); testutil_check(wiredtiger_open(opts->home, NULL, "create,cache_size=200M", &opts->conn)); diff --git a/test/csuite/wt2834_join_bloom_fix/main.c b/test/csuite/wt2834_join_bloom_fix/main.c index f2c54b942be..e128df29f41 100644 --- a/test/csuite/wt2834_join_bloom_fix/main.c +++ b/test/csuite/wt2834_join_bloom_fix/main.c @@ -83,10 +83,14 @@ main(int argc, char *argv[]) tablename = strchr(opts->uri, ':'); testutil_assert(tablename != NULL); tablename++; - snprintf(posturi, sizeof(posturi), "index:%s:post", tablename); - snprintf(balanceuri, sizeof(balanceuri), "index:%s:balance", tablename); - snprintf(flaguri, sizeof(flaguri), "index:%s:flag", tablename); - snprintf(joinuri, sizeof(joinuri), "join:%s", opts->uri); + testutil_check(__wt_snprintf( + posturi, sizeof(posturi), "index:%s:post", tablename)); + testutil_check(__wt_snprintf( + balanceuri, sizeof(balanceuri), "index:%s:balance", tablename)); + testutil_check(__wt_snprintf( + 
flaguri, sizeof(flaguri), "index:%s:flag", tablename)); + testutil_check(__wt_snprintf( + joinuri, sizeof(joinuri), "join:%s", opts->uri)); testutil_check(session->create(session, posturi, "columns=(post)")); testutil_check(session->create(session, balanceuri, @@ -126,14 +130,14 @@ main(int argc, char *argv[]) balancecur->set_key(balancecur, 0); testutil_check(balancecur->search(balancecur)); - sprintf(cfg, "compare=lt,strategy=bloom,count=%d", - N_RECORDS / 100); + testutil_check(__wt_snprintf(cfg, sizeof(cfg), + "compare=lt,strategy=bloom,count=%d", N_RECORDS / 100)); testutil_check(session->join(session, joincur, balancecur, cfg)); flagcur->set_key(flagcur, 0); testutil_check(flagcur->search(flagcur)); - sprintf(cfg, "compare=eq,strategy=bloom,count=%d", - N_RECORDS / 100); + testutil_check(__wt_snprintf(cfg, sizeof(cfg), + "compare=eq,strategy=bloom,count=%d", N_RECORDS / 100)); testutil_check(session->join(session, joincur, flagcur, cfg)); /* Expect no values returned */ diff --git a/test/csuite/wt2853_perf/main.c b/test/csuite/wt2853_perf/main.c index b365b03493a..46ba71372e5 100644 --- a/test/csuite/wt2853_perf/main.c +++ b/test/csuite/wt2853_perf/main.c @@ -114,12 +114,15 @@ main(int argc, char *argv[]) tablename = strchr(opts->uri, ':'); testutil_assert(tablename != NULL); tablename++; - snprintf(sharedopts->posturi, sizeof(sharedopts->posturi), - "index:%s:post", tablename); - snprintf(sharedopts->baluri, sizeof(sharedopts->baluri), - "index:%s:bal", tablename); - snprintf(sharedopts->flaguri, sizeof(sharedopts->flaguri), - "index:%s:flag", tablename); + testutil_check(__wt_snprintf( + sharedopts->posturi, sizeof(sharedopts->posturi), + "index:%s:post", tablename)); + testutil_check(__wt_snprintf( + sharedopts->baluri, sizeof(sharedopts->baluri), + "index:%s:bal", tablename)); + testutil_check(__wt_snprintf( + sharedopts->flaguri, sizeof(sharedopts->flaguri), + "index:%s:flag", tablename)); testutil_check(session->create(session, sharedopts->posturi, 
"columns=(post)")); diff --git a/test/csuite/wt2909_checkpoint_integrity/main.c b/test/csuite/wt2909_checkpoint_integrity/main.c index 0ae81543050..ce7bd72fa3f 100644 --- a/test/csuite/wt2909_checkpoint_integrity/main.c +++ b/test/csuite/wt2909_checkpoint_integrity/main.c @@ -267,9 +267,11 @@ enable_failures(uint64_t allow_writes, uint64_t allow_reads) char value[100]; testutil_check(setenv("WT_FAIL_FS_ENABLE", "1", 1)); - snprintf(value, sizeof(value), "%" PRIu64, allow_writes); + testutil_check(__wt_snprintf( + value, sizeof(value), "%" PRIu64, allow_writes)); testutil_check(setenv("WT_FAIL_FS_WRITE_ALLOW", value, 1)); - snprintf(value, sizeof(value), "%" PRIu64, allow_reads); + testutil_check(__wt_snprintf( + value, sizeof(value), "%" PRIu64, allow_reads)); testutil_check(setenv("WT_FAIL_FS_READ_ALLOW", value, 1)); } @@ -325,10 +327,11 @@ run_check_subtest(TEST_OPTS *opts, const char *debugger, uint64_t nops, subtest_args[narg++] = (char *)"-v"; /* subtest is always verbose */ subtest_args[narg++] = (char *)"-p"; subtest_args[narg++] = (char *)"-o"; - snprintf(sarg, sizeof(sarg), "%" PRIu64, nops); + testutil_check(__wt_snprintf(sarg, sizeof(sarg), "%" PRIu64, nops)); subtest_args[narg++] = sarg; /* number of operations */ subtest_args[narg++] = (char *)"-n"; - snprintf(rarg, sizeof(rarg), "%" PRIu64, opts->nrecords); + testutil_check(__wt_snprintf( + rarg, sizeof(rarg), "%" PRIu64, opts->nrecords)); subtest_args[narg++] = rarg; /* number of records */ subtest_args[narg++] = NULL; testutil_assert(narg <= MAX_ARGS); @@ -463,15 +466,17 @@ subtest_main(int argc, char *argv[], bool close_test) testutil_make_work_dir(opts->home); /* Redirect stderr, stdout. 
*/ - sprintf(filename, "%s/%s", opts->home, STDERR_FILE); + testutil_check(__wt_snprintf( + filename, sizeof(filename), "%s/%s", opts->home, STDERR_FILE)); testutil_assert(freopen(filename, "a", stderr) != NULL); - sprintf(filename, "%s/%s", opts->home, STDOUT_FILE); + testutil_check(__wt_snprintf( + filename, sizeof(filename), "%s/%s", opts->home, STDOUT_FILE)); testutil_assert(freopen(filename, "a", stdout) != NULL); - snprintf(config, sizeof(config), + testutil_check(__wt_snprintf(config, sizeof(config), "create,cache_size=250M,log=(enabled)," "transaction_sync=(enabled,method=none),extensions=(" WT_FAIL_FS_LIB - "=(early_load,config={environment=true,verbose=true})]"); + "=(early_load,config={environment=true,verbose=true})]")); testutil_check(wiredtiger_open(opts->home, NULL, config, &opts->conn)); testutil_check( diff --git a/test/csuite/wt3120_filesys/main.c b/test/csuite/wt3120_filesys/main.c index 09dce624066..2fae85017d4 100644 --- a/test/csuite/wt3120_filesys/main.c +++ b/test/csuite/wt3120_filesys/main.c @@ -52,8 +52,8 @@ main(int argc, char *argv[]) testutil_check(testutil_parse_opts(argc, argv, opts)); testutil_make_work_dir(opts->home); - snprintf(buf, sizeof(buf), - "create,extensions=(" WT_FAIL_FS_LIB "=(early_load=true))"); + testutil_check(__wt_snprintf(buf, sizeof(buf), + "create,extensions=(" WT_FAIL_FS_LIB "=(early_load=true))")); testutil_check(wiredtiger_open(opts->home, NULL, buf, &opts->conn)); testutil_check( opts->conn->open_session(opts->conn, NULL, NULL, &session)); diff --git a/test/cursor_order/cursor_order.c b/test/cursor_order/cursor_order.c index 62777f552bf..d3c64b54ab5 100644 --- a/test/cursor_order/cursor_order.c +++ b/test/cursor_order/cursor_order.c @@ -181,19 +181,15 @@ wt_connect(SHARED_CONFIG *cfg, char *config_open) }; int ret; char config[512]; - size_t print_count; testutil_clean_work_dir(home); testutil_make_work_dir(home); - print_count = (size_t)snprintf(config, sizeof(config), + testutil_check(__wt_snprintf(config, 
sizeof(config), "create,statistics=(all),error_prefix=\"%s\",%s%s", progname, config_open == NULL ? "" : ",", - config_open == NULL ? "" : config_open); - - if (print_count >= sizeof(config)) - testutil_die(EINVAL, "Config string too long"); + config_open == NULL ? "" : config_open)); if ((ret = wiredtiger_open( home, &event_handler, config, &cfg->conn)) != 0) diff --git a/test/cursor_order/cursor_order_file.c b/test/cursor_order/cursor_order_file.c index 5dc7194b5fb..42d7af54de4 100644 --- a/test/cursor_order/cursor_order_file.c +++ b/test/cursor_order/cursor_order_file.c @@ -34,23 +34,21 @@ file_create(SHARED_CONFIG *cfg, const char *name) WT_CONNECTION *conn; WT_SESSION *session; int ret; - char *p, *end, config[128]; + char config[128]; conn = cfg->conn; if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) testutil_die(ret, "conn.session"); - p = config; - end = config + sizeof(config); - p += snprintf(p, (size_t)(end - p), + testutil_check(__wt_snprintf(config, sizeof(config), "key_format=%s," "internal_page_max=%d," "split_deepen_min_child=200," - "leaf_page_max=%d,", - cfg->ftype == ROW ? "S" : "r", 16 * 1024, 128 * 1024); - if (cfg->ftype == FIX) - (void)snprintf(p, (size_t)(end - p), ",value_format=3t"); + "leaf_page_max=%d," + "%s", + cfg->ftype == ROW ? "S" : "r", 16 * 1024, 128 * 1024, + cfg->ftype == FIX ? 
",value_format=3t" : "")); if ((ret = session->create(session, name, config)) != 0) if (ret != EEXIST) @@ -67,9 +65,10 @@ load(SHARED_CONFIG *cfg, const char *name) WT_CURSOR *cursor; WT_ITEM *value, _value; WT_SESSION *session; - char keybuf[64], valuebuf[64]; - int64_t keyno; + size_t len; + uint64_t keyno; int ret; + char keybuf[64], valuebuf[64]; conn = cfg->conn; @@ -83,9 +82,10 @@ load(SHARED_CONFIG *cfg, const char *name) testutil_die(ret, "cursor.open"); value = &_value; - for (keyno = 1; keyno <= (int64_t)cfg->nkeys; ++keyno) { + for (keyno = 1; keyno <= cfg->nkeys; ++keyno) { if (cfg->ftype == ROW) { - snprintf(keybuf, sizeof(keybuf), "%016u", (u_int)keyno); + testutil_check(__wt_snprintf( + keybuf, sizeof(keybuf), "%016" PRIu64, keyno)); cursor->set_key(cursor, keybuf); } else cursor->set_key(cursor, (uint32_t)keyno); @@ -93,8 +93,10 @@ load(SHARED_CONFIG *cfg, const char *name) if (cfg->ftype == FIX) cursor->set_value(cursor, 0x01); else { - value->size = (uint32_t)snprintf( - valuebuf, sizeof(valuebuf), "%37u", (u_int)keyno); + testutil_check(__wt_snprintf_len_set( + valuebuf, sizeof(valuebuf), + &len, "%37" PRIu64, keyno)); + value->size = (uint32_t)len; cursor->set_value(cursor, value); } if ((ret = cursor->insert(cursor)) != 0) diff --git a/test/cursor_order/cursor_order_ops.c b/test/cursor_order/cursor_order_ops.c index 58da49b2991..299f22684c9 100644 --- a/test/cursor_order/cursor_order_ops.c +++ b/test/cursor_order/cursor_order_ops.c @@ -69,7 +69,8 @@ ops_start(SHARED_CONFIG *cfg) run_info[i].cfg = cfg; if (i == 0 || cfg->multiple_files) { run_info[i].name = dmalloc(64); - snprintf(run_info[i].name, 64, FNAME, (int)i); + testutil_check(__wt_snprintf( + run_info[i].name, 64, FNAME, (int)i)); /* Vary by orders of magnitude */ if (cfg->vary_nops) @@ -93,8 +94,8 @@ ops_start(SHARED_CONFIG *cfg) run_info[offset].name = dmalloc(64); /* Have reverse scans read from tables with writes. 
*/ name_index = i % cfg->append_inserters; - snprintf( - run_info[offset].name, 64, FNAME, (int)name_index); + testutil_check(__wt_snprintf( + run_info[offset].name, 64, FNAME, (int)name_index)); /* Vary by orders of magnitude */ if (cfg->vary_nops) @@ -231,7 +232,7 @@ reverse_scan(void *arg) id = (uintmax_t)arg; s = &run_info[id]; cfg = s->cfg; - __wt_thread_id(tid, sizeof(tid)); + testutil_check(__wt_thread_id(tid, sizeof(tid))); __wt_random_init(&s->rnd); printf(" reverse scan thread %2" PRIuMAX @@ -272,6 +273,7 @@ append_insert_op( { WT_ITEM *value, _value; uint64_t keyno; + size_t len; int ret; char keybuf[64], valuebuf[64]; @@ -281,7 +283,8 @@ append_insert_op( keyno = __wt_atomic_add64(&cfg->key_range, 1); if (cfg->ftype == ROW) { - snprintf(keybuf, sizeof(keybuf), "%016u", (u_int)keyno); + testutil_check(__wt_snprintf( + keybuf, sizeof(keybuf), "%016" PRIu64, keyno)); cursor->set_key(cursor, keybuf); } else cursor->set_key(cursor, (uint32_t)keyno); @@ -291,8 +294,9 @@ append_insert_op( if (cfg->ftype == FIX) cursor->set_value(cursor, 0x10); else { - value->size = (uint32_t)snprintf( - valuebuf, sizeof(valuebuf), "XXX %37u", (u_int)keyno); + testutil_check(__wt_snprintf_len_set( + valuebuf, sizeof(valuebuf), &len, "XXX %37" PRIu64, keyno)); + value->size = (uint32_t)len; cursor->set_value(cursor, value); } if ((ret = cursor->insert(cursor)) != 0) @@ -318,7 +322,7 @@ append_insert(void *arg) id = (uintmax_t)arg; s = &run_info[id]; cfg = s->cfg; - __wt_thread_id(tid, sizeof(tid)); + testutil_check(__wt_thread_id(tid, sizeof(tid))); __wt_random_init(&s->rnd); printf("write thread %2" PRIuMAX " starting: tid: %s, file: %s\n", diff --git a/test/fops/file.c b/test/fops/file.c index 66c23dfed3c..d1cd22ab391 100644 --- a/test/fops/file.c +++ b/test/fops/file.c @@ -71,7 +71,8 @@ obj_bulk_unique(int force) /* Generate a unique object name. 
*/ if ((ret = pthread_rwlock_wrlock(&single)) != 0) testutil_die(ret, "pthread_rwlock_wrlock single"); - (void)snprintf(new_uri, sizeof(new_uri), "%s.%u", uri, ++uid); + testutil_check(__wt_snprintf( + new_uri, sizeof(new_uri), "%s.%u", uri, ++uid)); if ((ret = pthread_rwlock_unlock(&single)) != 0) testutil_die(ret, "pthread_rwlock_unlock single"); @@ -152,7 +153,8 @@ obj_create_unique(int force) /* Generate a unique object name. */ if ((ret = pthread_rwlock_wrlock(&single)) != 0) testutil_die(ret, "pthread_rwlock_wrlock single"); - (void)snprintf(new_uri, sizeof(new_uri), "%s.%u", uri, ++uid); + testutil_check(__wt_snprintf( + new_uri, sizeof(new_uri), "%s.%u", uri, ++uid)); if ((ret = pthread_rwlock_unlock(&single)) != 0) testutil_die(ret, "pthread_rwlock_unlock single"); diff --git a/test/fops/t.c b/test/fops/t.c index 469d5acd33a..07ac07349e3 100644 --- a/test/fops/t.c +++ b/test/fops/t.c @@ -157,11 +157,11 @@ wt_startup(char *config_open) testutil_make_work_dir(home); - snprintf(config_buf, sizeof(config_buf), + testutil_check(__wt_snprintf(config_buf, sizeof(config_buf), "create,error_prefix=\"%s\",cache_size=5MB%s%s", progname, config_open == NULL ? "" : ",", - config_open == NULL ? "" : config_open); + config_open == NULL ? 
"" : config_open)); if ((ret = wiredtiger_open( home, &event_handler, config_buf, &conn)) != 0) testutil_die(ret, "wiredtiger_open"); diff --git a/test/format/backup.c b/test/format/backup.c index 69fdf771de9..8aa614fa970 100644 --- a/test/format/backup.c +++ b/test/format/backup.c @@ -63,7 +63,7 @@ copy_file(WT_SESSION *session, const char *name) len = strlen("BACKUP") + strlen(name) + 10; first = dmalloc(len); - (void)snprintf(first, len, "BACKUP/%s", name); + testutil_check(__wt_snprintf(first, len, "BACKUP/%s", name)); testutil_check(__wt_copy_and_sync(session, name, first)); /* @@ -72,7 +72,7 @@ copy_file(WT_SESSION *session, const char *name) */ len = strlen("BACKUP_COPY") + strlen(name) + 10; second = dmalloc(len); - (void)snprintf(second, len, "BACKUP_COPY/%s", name); + testutil_check(__wt_snprintf(second, len, "BACKUP_COPY/%s", name)); testutil_check(__wt_copy_and_sync(session, first, second)); free(first); diff --git a/test/format/config.c b/test/format/config.c index 535dcd677e2..22b40f7164d 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -257,8 +257,8 @@ config_compression(const char *conf_name) */ cstr = "none"; if (strcmp(conf_name, "logging_compression") == 0 && g.c_logging == 0) { - (void)snprintf( - confbuf, sizeof(confbuf), "%s=%s", conf_name, cstr); + testutil_check(__wt_snprintf( + confbuf, sizeof(confbuf), "%s=%s", conf_name, cstr)); config_single(confbuf, 0); return; } @@ -302,7 +302,8 @@ config_compression(const char *conf_name) break; } - (void)snprintf(confbuf, sizeof(confbuf), "%s=%s", conf_name, cstr); + testutil_check(__wt_snprintf( + confbuf, sizeof(confbuf), "%s=%s", conf_name, cstr)); config_single(confbuf, 0); } @@ -678,7 +679,8 @@ void config_single(const char *s, int perm) { CONFIG *cp; - long v; + long vlong; + uint32_t v; char *p; const char *ep; @@ -743,21 +745,22 @@ config_single(const char *s, int perm) return; } - v = -1; + vlong = -1; if (F_ISSET(cp, C_BOOL)) { if (strncmp(ep, "off", strlen("off")) == 0) - v 
= 0; + vlong = 0; else if (strncmp(ep, "on", strlen("on")) == 0) - v = 1; + vlong = 1; } - if (v == -1) { - v = strtol(ep, &p, 10); + if (vlong == -1) { + vlong = strtol(ep, &p, 10); if (*p != '\0') { fprintf(stderr, "%s: %s: illegal numeric value\n", progname, s); exit(EXIT_FAILURE); } } + v = (uint32_t)vlong; if (F_ISSET(cp, C_BOOL)) { if (v != 0 && v != 1) { fprintf(stderr, "%s: %s: value of boolean not 0 or 1\n", @@ -770,7 +773,7 @@ config_single(const char *s, int perm) progname, s, cp->min, cp->maxset); exit(EXIT_FAILURE); } - *cp->v = (uint32_t)v; + *cp->v = v; } /* diff --git a/test/format/ops.c b/test/format/ops.c index 5309edf81c0..72e885bd0d6 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -530,8 +530,9 @@ ops(void *arg) pthread_rwlock_trywrlock(&g.backup_lock) == EBUSY) ckpt_config = NULL; else { - (void)snprintf(ckpt_name, sizeof(ckpt_name), - "name=thread-%d", tinfo->id); + testutil_check(__wt_snprintf( + ckpt_name, sizeof(ckpt_name), + "name=thread-%d", tinfo->id)); ckpt_config = ckpt_name; } @@ -557,8 +558,9 @@ ops(void *arg) strcpy(ckpt_name, "checkpoint=WiredTigerCheckpoint"); else - (void)snprintf(ckpt_name, sizeof(ckpt_name), - "checkpoint=thread-%d", tinfo->id); + testutil_check(__wt_snprintf( + ckpt_name, sizeof(ckpt_name), + "checkpoint=thread-%d", tinfo->id)); ckpt_available = true; skip_checkpoint: /* Pick the next checkpoint operation. */ diff --git a/test/format/rebalance.c b/test/format/rebalance.c index 9849b7df82b..e35c62e7255 100644 --- a/test/format/rebalance.c +++ b/test/format/rebalance.c @@ -41,10 +41,10 @@ wts_rebalance(void) track("rebalance", 0ULL, NULL); /* Dump the current object. */ - (void)snprintf(cmd, sizeof(cmd), + testutil_check(__wt_snprintf(cmd, sizeof(cmd), ".." DIR_DELIM_STR ".." DIR_DELIM_STR "wt" " -h %s dump -f %s/rebalance.orig %s", - g.home, g.home, g.uri); + g.home, g.home, g.uri)); testutil_checkfmt(system(cmd), "command failed: %s", cmd); /* Rebalance, then verify the object. 
*/ @@ -66,21 +66,21 @@ wts_rebalance(void) wts_verify("post-rebalance verify"); wts_close(); - (void)snprintf(cmd, sizeof(cmd), + testutil_check(__wt_snprintf(cmd, sizeof(cmd), ".." DIR_DELIM_STR ".." DIR_DELIM_STR "wt" " -h %s dump -f %s/rebalance.new %s", - g.home, g.home, g.uri); + g.home, g.home, g.uri)); testutil_checkfmt(system(cmd), "command failed: %s", cmd); /* Compare the old/new versions of the object. */ #ifdef _WIN32 - (void)snprintf(cmd, sizeof(cmd), + testutil_check(__wt_snprintf(cmd, sizeof(cmd), "fc /b %s\\rebalance.orig %s\\rebalance.new > NUL", - g.home, g.home); + g.home, g.home)); #else - (void)snprintf(cmd, sizeof(cmd), + testutil_check(__wt_snprintf(cmd, sizeof(cmd), "cmp %s/rebalance.orig %s/rebalance.new > /dev/null", - g.home, g.home); + g.home, g.home)); #endif testutil_checkfmt(system(cmd), "command failed: %s", cmd); } diff --git a/test/format/salvage.c b/test/format/salvage.c index 69805fb1018..f82dc34dd5f 100644 --- a/test/format/salvage.c +++ b/test/format/salvage.c @@ -70,29 +70,31 @@ corrupt(void) * It's a little tricky: if the data source is a file, we're looking * for "wt", if the data source is a table, we're looking for "wt.wt". 
*/ - (void)snprintf(buf, sizeof(buf), "%s/%s", g.home, WT_NAME); + testutil_check(__wt_snprintf( + buf, sizeof(buf), "%s/%s", g.home, WT_NAME)); if ((fd = open(buf, O_RDWR)) != -1) { #ifdef _WIN32 - (void)snprintf(copycmd, sizeof(copycmd), + testutil_check(__wt_snprintf(copycmd, sizeof(copycmd), "copy %s\\%s %s\\slvg.copy\\%s.corrupted", - g.home, WT_NAME, g.home, WT_NAME); + g.home, WT_NAME, g.home, WT_NAME)); #else - (void)snprintf(copycmd, sizeof(copycmd), + testutil_check(__wt_snprintf(copycmd, sizeof(copycmd), "cp %s/%s %s/slvg.copy/%s.corrupted", - g.home, WT_NAME, g.home, WT_NAME); + g.home, WT_NAME, g.home, WT_NAME)); #endif goto found; } - (void)snprintf(buf, sizeof(buf), "%s/%s.wt", g.home, WT_NAME); + testutil_check(__wt_snprintf( + buf, sizeof(buf), "%s/%s.wt", g.home, WT_NAME)); if ((fd = open(buf, O_RDWR)) != -1) { #ifdef _WIN32 - (void)snprintf(copycmd, sizeof(copycmd), + testutil_check(__wt_snprintf(copycmd, sizeof(copycmd), "copy %s\\%s.wt %s\\slvg.copy\\%s.wt.corrupted", - g.home, WT_NAME, g.home, WT_NAME); + g.home, WT_NAME, g.home, WT_NAME)); #else - (void)snprintf(copycmd, sizeof(copycmd), + testutil_check(__wt_snprintf(copycmd, sizeof(copycmd), "cp %s/%s.wt %s/slvg.copy/%s.wt.corrupted", - g.home, WT_NAME, g.home, WT_NAME); + g.home, WT_NAME, g.home, WT_NAME)); #endif goto found; } @@ -103,7 +105,8 @@ found: if (fstat(fd, &sb) == -1) offset = mmrand(NULL, 0, (u_int)sb.st_size); len = (size_t)(20 + (sb.st_size / 100) * 2); - (void)snprintf(buf, sizeof(buf), "%s/slvg.corrupt", g.home); + testutil_check(__wt_snprintf( + buf, sizeof(buf), "%s/slvg.corrupt", g.home)); if ((fp = fopen(buf, "w")) == NULL) testutil_die(errno, "salvage-corrupt: open: %s", buf); (void)fprintf(fp, diff --git a/test/format/util.c b/test/format/util.c index b9788f1ac75..983d03e2525 100644 --- a/test/format/util.c +++ b/test/format/util.c @@ -241,20 +241,23 @@ val_gen(WT_RAND_STATE *rnd, WT_ITEM *value, uint64_t keyno) void track(const char *tag, uint64_t cnt, TINFO *tinfo) 
{ - static int lastlen = 0; - int len; + static size_t lastlen = 0; + size_t len; char msg[128]; if (g.c_quiet || tag == NULL) return; if (tinfo == NULL && cnt == 0) - len = snprintf(msg, sizeof(msg), "%4d: %s", g.run_cnt, tag); + testutil_check(__wt_snprintf_len_set( + msg, sizeof(msg), &len, "%4d: %s", g.run_cnt, tag)); else if (tinfo == NULL) - len = snprintf( - msg, sizeof(msg), "%4d: %s: %" PRIu64, g.run_cnt, tag, cnt); + testutil_check(__wt_snprintf_len_set( + msg, sizeof(msg), &len, + "%4d: %s: %" PRIu64, g.run_cnt, tag, cnt)); else - len = snprintf(msg, sizeof(msg), + testutil_check(__wt_snprintf_len_set( + msg, sizeof(msg), &len, "%4d: %s: " "search %" PRIu64 "%s, " "insert %" PRIu64 "%s, " @@ -268,7 +271,7 @@ track(const char *tag, uint64_t cnt, TINFO *tinfo) tinfo->update > M(9) ? tinfo->update / M(1) : tinfo->update, tinfo->update > M(9) ? "M" : "", tinfo->remove > M(9) ? tinfo->remove / M(1) : tinfo->remove, - tinfo->remove > M(9) ? "M" : ""); + tinfo->remove > M(9) ? "M" : "")); if (lastlen > len) { memset(msg + len, ' ', (size_t)(lastlen - len)); @@ -297,27 +300,30 @@ path_setup(const char *home) /* Log file. */ len = strlen(g.home) + strlen("log") + 2; g.home_log = dmalloc(len); - snprintf(g.home_log, len, "%s/%s", g.home, "log"); + testutil_check(__wt_snprintf(g.home_log, len, "%s/%s", g.home, "log")); /* RNG log file. */ len = strlen(g.home) + strlen("rand") + 2; g.home_rand = dmalloc(len); - snprintf(g.home_rand, len, "%s/%s", g.home, "rand"); + testutil_check(__wt_snprintf( + g.home_rand, len, "%s/%s", g.home, "rand")); /* Run file. */ len = strlen(g.home) + strlen("CONFIG") + 2; g.home_config = dmalloc(len); - snprintf(g.home_config, len, "%s/%s", g.home, "CONFIG"); + testutil_check(__wt_snprintf( + g.home_config, len, "%s/%s", g.home, "CONFIG")); /* Statistics file. 
*/ len = strlen(g.home) + strlen("stats") + 2; g.home_stats = dmalloc(len); - snprintf(g.home_stats, len, "%s/%s", g.home, "stats"); + testutil_check(__wt_snprintf( + g.home_stats, len, "%s/%s", g.home, "stats")); /* BDB directory. */ len = strlen(g.home) + strlen("bdb") + 2; g.home_bdb = dmalloc(len); - snprintf(g.home_bdb, len, "%s/%s", g.home, "bdb"); + testutil_check(__wt_snprintf(g.home_bdb, len, "%s/%s", g.home, "bdb")); /* * Home directory initialize command: create the directory if it doesn't @@ -336,21 +342,23 @@ path_setup(const char *home) "cd %s & mkdir KVS" len = strlen(g.home) * 7 + strlen(CMD) + 1; g.home_init = dmalloc(len); - snprintf(g.home_init, len, CMD, - g.home, g.home, g.home, g.home, g.home, g.home, g.home); + testutil_check(__wt_snprintf(g.home_init, len, CMD, + g.home, g.home, g.home, g.home, g.home, g.home, g.home)); #else #define CMD "test -e %s || mkdir %s; " \ "cd %s > /dev/null && rm -rf `ls | sed /rand/d`; " \ "mkdir KVS" len = strlen(g.home) * 3 + strlen(CMD) + 1; g.home_init = dmalloc(len); - snprintf(g.home_init, len, CMD, g.home, g.home, g.home); + testutil_check(__wt_snprintf( + g.home_init, len, CMD, g.home, g.home, g.home)); #endif /* Primary backup directory. 
*/ len = strlen(g.home) + strlen("BACKUP") + 2; g.home_backup = dmalloc(len); - snprintf(g.home_backup, len, "%s/%s", g.home, "BACKUP"); + testutil_check(__wt_snprintf( + g.home_backup, len, "%s/%s", g.home, "BACKUP")); /* * Backup directory initialize command, remove and re-create the primary @@ -365,9 +373,9 @@ path_setup(const char *home) len = strlen(g.home) * 4 + strlen("BACKUP") * 2 + strlen("BACKUP_COPY") * 2 + strlen(CMD) + 1; g.home_backup_init = dmalloc(len); - snprintf(g.home_backup_init, len, CMD, + testutil_check(__wt_snprintf(g.home_backup_init, len, CMD, g.home, "BACKUP", g.home, "BACKUP_COPY", - g.home, "BACKUP", g.home, "BACKUP_COPY"); + g.home, "BACKUP", g.home, "BACKUP_COPY")); /* * Salvage command, save the interesting files so we can replay the @@ -390,7 +398,7 @@ path_setup(const char *home) #endif len = strlen(g.home) + strlen(CMD) + 1; g.home_salvage_copy = dmalloc(len); - snprintf(g.home_salvage_copy, len, CMD, g.home); + testutil_check(__wt_snprintf(g.home_salvage_copy, len, CMD, g.home)); } /* @@ -489,8 +497,9 @@ alter(void *arg) while (!g.workers_finished) { period = mmrand(NULL, 1, 10); - snprintf(buf, sizeof(buf), - "access_pattern_hint=%s", access_value ? "random" : "none"); + testutil_check(__wt_snprintf(buf, sizeof(buf), + "access_pattern_hint=%s", + access_value ? "random" : "none")); access_value = !access_value; if (session->alter(session, g.uri, buf) != 0) break; diff --git a/test/format/wts.c b/test/format/wts.c index a87aa5b9f88..6aa4784d1c1 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -120,8 +120,15 @@ static WT_EVENT_HANDLER event_handler = { NULL /* Close handler. */ }; -#undef REMAIN -#define REMAIN(p, end) (size_t)((p) >= (end) ? 0 : (end) - (p)) +#define CONFIG_APPEND(p, ...) 
do { \ + size_t __len; \ + testutil_check( \ + __wt_snprintf_len_set(p, max, &__len, __VA_ARGS__)); \ + if (__len > max) \ + __len = max; \ + p += __len; \ + max -= __len; \ +} while (0) /* * wts_open -- @@ -132,14 +139,15 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) { WT_CONNECTION *conn; WT_DECL_RET; - char *config, *end, *p, helium_config[1024]; + size_t max; + char *config, *p, helium_config[1024]; *connp = NULL; config = p = g.wiredtiger_open_config; - end = config + sizeof(g.wiredtiger_open_config); + max = sizeof(g.wiredtiger_open_config); - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, "create=true," "cache_size=%" PRIu32 "MB," "checkpoint_sync=false," @@ -148,26 +156,25 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) /* In-memory configuration. */ if (g.c_in_memory != 0) - p += snprintf(p, REMAIN(p, end), ",in_memory=1"); + CONFIG_APPEND(p, ",in_memory=1"); /* LSM configuration. */ if (DATASOURCE("lsm")) - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",lsm_manager=(worker_thread_max=%" PRIu32 "),", g.c_lsm_worker_threads); - if (DATASOURCE("lsm") || g.c_cache < 20) { - p += snprintf(p, REMAIN(p, end), ",eviction_dirty_trigger=95"); - } + if (DATASOURCE("lsm") || g.c_cache < 20) + CONFIG_APPEND(p, ",eviction_dirty_trigger=95"); /* Eviction worker configuration. */ if (g.c_evict_max != 0) - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",eviction=(threads_max=%" PRIu32 ")", g.c_evict_max); /* Logging configuration. */ if (g.c_logging) - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",log=(enabled=true,archive=%d,prealloc=%d" ",compressor=\"%s\")", g.c_logging_archive ? 1 : 0, @@ -175,21 +182,21 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) compressor(g.c_logging_compression_flag)); if (g.c_encryption) - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",encryption=(name=%s)", encryptor(g.c_encryption_flag)); /* Miscellaneous. 
*/ #ifdef HAVE_POSIX_MEMALIGN - p += snprintf(p, REMAIN(p, end), ",buffer_alignment=512"); + CONFIG_APPEND(p, ",buffer_alignment=512"); #endif - p += snprintf(p, REMAIN(p, end), ",mmap=%d", g.c_mmap ? 1 : 0); + CONFIG_APPEND(p, ",mmap=%d", g.c_mmap ? 1 : 0); if (g.c_direct_io) - p += snprintf(p, REMAIN(p, end), ",direct_io=(data)"); + CONFIG_APPEND(p, ",direct_io=(data)"); if (g.c_data_extend) - p += snprintf(p, REMAIN(p, end), ",file_extend=(data=8MB)"); + CONFIG_APPEND(p, ",file_extend=(data=8MB)"); /* * Run the statistics server and/or maintain statistics in the engine. @@ -198,18 +205,18 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) if (g.c_statistics_server) { if (mmrand(NULL, 0, 5) == 1 && memcmp(g.uri, "file:", strlen("file:")) == 0) - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",statistics=(fast)" ",statistics_log=(wait=5,sources=(\"file:\"))"); else - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",statistics=(fast),statistics_log=(wait=5)"); } else - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",statistics=(%s)", g.c_statistics ? "fast" : "none"); /* Extensions. */ - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",extensions=[" "\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"],", g.c_reverse ? REVERSE_PATH : "", @@ -227,11 +234,11 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) * override the standard configuration. 
*/ if (g.c_config_open != NULL) - p += snprintf(p, REMAIN(p, end), ",%s", g.c_config_open); + CONFIG_APPEND(p, ",%s", g.c_config_open); if (g.config_open != NULL) - p += snprintf(p, REMAIN(p, end), ",%s", g.config_open); + CONFIG_APPEND(p, ",%s", g.config_open); - if (REMAIN(p, end) == 0) + if (max == 0) testutil_die(ENOMEM, "wiredtiger_open configuration buffer too small"); @@ -259,12 +266,13 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) if (DATASOURCE("helium")) { if (g.helium_mount == NULL) testutil_die(EINVAL, "no Helium mount point specified"); - (void)snprintf(helium_config, sizeof(helium_config), + testutil_check( + __wt_snprintf(helium_config, sizeof(helium_config), "entry=wiredtiger_extension_init,config=[" "helium_verbose=0," "dev1=[helium_devices=\"he://./%s\"," "helium_o_volume_truncate=1]]", - g.helium_mount); + g.helium_mount)); if ((ret = conn->load_extension( conn, HELIUM_PATH, helium_config)) != 0) testutil_die(ret, @@ -299,13 +307,13 @@ wts_init(void) { WT_CONNECTION *conn; WT_SESSION *session; + size_t max; uint32_t maxintlpage, maxintlkey, maxleafpage, maxleafkey, maxleafvalue; - char config[4096], *end, *p; + char config[4096], *p; conn = g.wts_conn; - p = config; - end = config + sizeof(config); + max = sizeof(config); /* * Ensure that we can service at least one operation per-thread @@ -326,7 +334,7 @@ wts_init(void) if (maxleafpage > 512) maxleafpage >>= 1; } - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, "key_format=%s," "allocation_size=512,%s" "internal_page_max=%" PRIu32 ",leaf_page_max=%" PRIu32, @@ -340,43 +348,35 @@ wts_init(void) */ maxintlkey = mmrand(NULL, maxintlpage / 50, maxintlpage / 40); if (maxintlkey > 20) - p += snprintf(p, REMAIN(p, end), - ",internal_key_max=%" PRIu32, maxintlkey); + CONFIG_APPEND(p, ",internal_key_max=%" PRIu32, maxintlkey); maxleafkey = mmrand(NULL, maxleafpage / 50, maxleafpage / 40); if (maxleafkey > 20) - p += snprintf(p, REMAIN(p, end), - ",leaf_key_max=%" PRIu32, 
maxleafkey); + CONFIG_APPEND(p, ",leaf_key_max=%" PRIu32, maxleafkey); maxleafvalue = mmrand(NULL, maxleafpage * 10, maxleafpage / 40); if (maxleafvalue > 40 && maxleafvalue < 100 * 1024) - p += snprintf(p, REMAIN(p, end), - ",leaf_value_max=%" PRIu32, maxleafvalue); + CONFIG_APPEND(p, ",leaf_value_max=%" PRIu32, maxleafvalue); switch (g.type) { case FIX: - p += snprintf(p, REMAIN(p, end), - ",value_format=%" PRIu32 "t", g.c_bitcnt); + CONFIG_APPEND(p, ",value_format=%" PRIu32 "t", g.c_bitcnt); break; case ROW: if (g.c_huffman_key) - p += snprintf(p, REMAIN(p, end), - ",huffman_key=english"); + CONFIG_APPEND(p, ",huffman_key=english"); if (g.c_prefix_compression) - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",prefix_compression_min=%" PRIu32, g.c_prefix_compression_min); else - p += snprintf(p, REMAIN(p, end), - ",prefix_compression=false"); + CONFIG_APPEND(p, ",prefix_compression=false"); if (g.c_reverse) - p += snprintf(p, REMAIN(p, end), - ",collator=reverse"); + CONFIG_APPEND(p, ",collator=reverse"); /* FALLTHROUGH */ case VAR: if (g.c_huffman_value) - p += snprintf(p, REMAIN(p, end), - ",huffman_value=english"); + CONFIG_APPEND(p, ",huffman_value=english"); if (g.c_dictionary) - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",dictionary=%" PRIu32, mmrand(NULL, 123, 517)); break; } @@ -384,66 +384,63 @@ wts_init(void) /* Configure checksums. */ switch (g.c_checksum_flag) { case CHECKSUM_OFF: - p += snprintf(p, REMAIN(p, end), ",checksum=\"off\""); + CONFIG_APPEND(p, ",checksum=\"off\""); break; case CHECKSUM_ON: - p += snprintf(p, REMAIN(p, end), ",checksum=\"on\""); + CONFIG_APPEND(p, ",checksum=\"on\""); break; case CHECKSUM_UNCOMPRESSED: - p += snprintf(p, REMAIN(p, end), ",checksum=\"uncompressed\""); + CONFIG_APPEND(p, ",checksum=\"uncompressed\""); break; } /* Configure compression. 
*/ if (g.c_compression_flag != COMPRESS_NONE) - p += snprintf(p, REMAIN(p, end), ",block_compressor=\"%s\"", + CONFIG_APPEND(p, ",block_compressor=\"%s\"", compressor(g.c_compression_flag)); /* Configure Btree internal key truncation. */ - p += snprintf(p, REMAIN(p, end), ",internal_key_truncate=%s", + CONFIG_APPEND(p, ",internal_key_truncate=%s", g.c_internal_key_truncation ? "true" : "false"); /* Configure Btree page key gap. */ - p += snprintf(p, REMAIN(p, end), ",key_gap=%" PRIu32, g.c_key_gap); + CONFIG_APPEND(p, ",key_gap=%" PRIu32, g.c_key_gap); /* Configure Btree split page percentage. */ - p += snprintf(p, REMAIN(p, end), ",split_pct=%" PRIu32, g.c_split_pct); + CONFIG_APPEND(p, ",split_pct=%" PRIu32, g.c_split_pct); /* Configure LSM and data-sources. */ if (DATASOURCE("helium")) - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",type=helium,helium_o_compress=%d,helium_o_truncate=1", g.c_compression_flag == COMPRESS_NONE ? 0 : 1); if (DATASOURCE("kvsbdb")) - p += snprintf(p, REMAIN(p, end), ",type=kvsbdb"); + CONFIG_APPEND(p, ",type=kvsbdb"); if (DATASOURCE("lsm")) { - p += snprintf(p, REMAIN(p, end), ",type=lsm,lsm=("); - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, ",type=lsm,lsm=("); + CONFIG_APPEND(p, "auto_throttle=%s,", g.c_auto_throttle ? "true" : "false"); - p += snprintf(p, REMAIN(p, end), - "chunk_size=%" PRIu32 "MB,", g.c_chunk_size); + CONFIG_APPEND(p, "chunk_size=%" PRIu32 "MB,", g.c_chunk_size); /* * We can't set bloom_oldest without bloom, and we want to test * with Bloom filters on most of the time anyway. */ if (g.c_bloom_oldest) g.c_bloom = 1; - p += snprintf(p, REMAIN(p, end), - "bloom=%s,", g.c_bloom ? "true" : "false"); - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, "bloom=%s,", g.c_bloom ? 
"true" : "false"); + CONFIG_APPEND(p, "bloom_bit_count=%" PRIu32 ",", g.c_bloom_bit_count); - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, "bloom_hash_count=%" PRIu32 ",", g.c_bloom_hash_count); - p += snprintf(p, REMAIN(p, end), + CONFIG_APPEND(p, "bloom_oldest=%s,", g.c_bloom_oldest ? "true" : "false"); - p += snprintf(p, REMAIN(p, end), - "merge_max=%" PRIu32 ",", g.c_merge_max); - p += snprintf(p, REMAIN(p, end), ",)"); + CONFIG_APPEND(p, "merge_max=%" PRIu32 ",", g.c_merge_max); + CONFIG_APPEND(p, ",)"); } - if (REMAIN(p, end) == 0) + if (max == 0) testutil_die(ENOMEM, "WT_SESSION.create configuration buffer too small"); @@ -490,14 +487,14 @@ wts_dump(const char *tag, int dump_bdb) len = strlen(g.home) + strlen(BERKELEY_DB_PATH) + strlen(g.uri) + 100; cmd = dmalloc(len); - (void)snprintf(cmd, len, + testutil_check(__wt_snprintf(cmd, len, "sh s_dumpcmp -h %s %s %s %s %s %s", g.home, dump_bdb ? "-b " : "", dump_bdb ? BERKELEY_DB_PATH : "", g.type == FIX || g.type == VAR ? "-c" : "", g.uri == NULL ? "" : "-n", - g.uri == NULL ? "" : g.uri); + g.uri == NULL ? 
"" : g.uri)); testutil_checkfmt(system(cmd), "%s: dump comparison failed", tag); free(cmd); @@ -587,7 +584,7 @@ wts_stats(void) fprintf(fp, "\n\n====== Data source statistics:\n"); len = strlen("statistics:") + strlen(g.uri) + 1; stat_name = dmalloc(len); - snprintf(stat_name, len, "statistics:%s", g.uri); + testutil_check(__wt_snprintf(stat_name, len, "statistics:%s", g.uri)); testutil_check(session->open_cursor( session, stat_name, NULL, NULL, &cursor)); free(stat_name); diff --git a/test/manydbs/manydbs.c b/test/manydbs/manydbs.c index 345c470ba90..42020d6ce9a 100644 --- a/test/manydbs/manydbs.c +++ b/test/manydbs/manydbs.c @@ -168,7 +168,8 @@ main(int argc, char *argv[]) testutil_make_work_dir(home); __wt_random_init(&rnd); for (i = 0; i < dbs; ++i) { - snprintf(hometmp, HOME_SIZE, "%s/%s.%d", home, HOME_BASE, i); + testutil_check(__wt_snprintf( + hometmp, HOME_SIZE, "%s/%s.%d", home, HOME_BASE, i)); testutil_make_work_dir(hometmp); /* * Open each database. Rotate different configurations diff --git a/test/readonly/readonly.c b/test/readonly/readonly.c index 746aecbf6c5..66c7a0ca692 100644 --- a/test/readonly/readonly.c +++ b/test/readonly/readonly.c @@ -206,10 +206,12 @@ main(int argc, char *argv[]) * Set up all the directory names. */ testutil_work_dir_from_path(home, sizeof(home), working_dir); - (void)snprintf(home_wr, sizeof(home_wr), "%s%s", home, HOME_WR_SUFFIX); - (void)snprintf(home_rd, sizeof(home_rd), "%s%s", home, HOME_RD_SUFFIX); - (void)snprintf( - home_rd2, sizeof(home_rd2), "%s%s", home, HOME_RD2_SUFFIX); + testutil_check(__wt_snprintf( + home_wr, sizeof(home_wr), "%s%s", home, HOME_WR_SUFFIX)); + testutil_check(__wt_snprintf( + home_rd, sizeof(home_rd), "%s%s", home, HOME_RD_SUFFIX)); + testutil_check(__wt_snprintf( + home_rd2, sizeof(home_rd2), "%s%s", home, HOME_RD2_SUFFIX)); if (!child) { testutil_make_work_dir(home); testutil_make_work_dir(home_wr); @@ -268,22 +270,22 @@ main(int argc, char *argv[]) * Copy the database. 
Remove any lock file from one copy * and chmod the copies to be read-only permissions. */ - (void)snprintf(cmd, sizeof(cmd), + testutil_check(__wt_snprintf(cmd, sizeof(cmd), "cp -rp %s/* %s; rm -f %s/WiredTiger.lock", - home, home_wr, home_wr); + home, home_wr, home_wr)); if ((status = system(cmd)) < 0) testutil_die(status, "system: %s", cmd); - (void)snprintf(cmd, sizeof(cmd), + testutil_check(__wt_snprintf(cmd, sizeof(cmd), "cp -rp %s/* %s; chmod 0555 %s; chmod -R 0444 %s/*", - home, home_rd, home_rd, home_rd); + home, home_rd, home_rd, home_rd)); if ((status = system(cmd)) < 0) testutil_die(status, "system: %s", cmd); - (void)snprintf(cmd, sizeof(cmd), + testutil_check(__wt_snprintf(cmd, sizeof(cmd), "cp -rp %s/* %s; rm -f %s/WiredTiger.lock; " "chmod 0555 %s; chmod -R 0444 %s/*", - home, home_rd2, home_rd2, home_rd2, home_rd2); + home, home_rd2, home_rd2, home_rd2, home_rd2)); if ((status = system(cmd)) < 0) testutil_die(status, "system: %s", cmd); @@ -327,8 +329,8 @@ main(int argc, char *argv[]) * * The child will exit with success if its test passes. */ - (void)snprintf( - cmd, sizeof(cmd), "%s -h %s -R", saved_argv0, working_dir); + testutil_check(__wt_snprintf( + cmd, sizeof(cmd), "%s -h %s -R", saved_argv0, working_dir)); if ((status = system(cmd)) < 0) testutil_die(status, "system: %s", cmd); if (WEXITSTATUS(status) != 0) @@ -337,8 +339,8 @@ main(int argc, char *argv[]) /* * Scenario 2. Run child with writable config. */ - (void)snprintf( - cmd, sizeof(cmd), "%s -h %s -W", saved_argv0, working_dir); + testutil_check(__wt_snprintf( + cmd, sizeof(cmd), "%s -h %s -W", saved_argv0, working_dir)); if ((status = system(cmd)) < 0) testutil_die(status, "system: %s", cmd); if (WEXITSTATUS(status) != 0) @@ -358,8 +360,8 @@ main(int argc, char *argv[]) /* * Scenario 3. Child read-only. 
*/ - (void)snprintf( - cmd, sizeof(cmd), "%s -h %s -R", saved_argv0, working_dir); + testutil_check(__wt_snprintf( + cmd, sizeof(cmd), "%s -h %s -R", saved_argv0, working_dir)); if ((status = system(cmd)) < 0) testutil_die(status, "system: %s", cmd); if (WEXITSTATUS(status) != 0) @@ -368,8 +370,8 @@ main(int argc, char *argv[]) /* * Scenario 4. Run child with writable config. */ - (void)snprintf( - cmd, sizeof(cmd), "%s -h %s -W", saved_argv0, working_dir); + testutil_check(__wt_snprintf( + cmd, sizeof(cmd), "%s -h %s -W", saved_argv0, working_dir)); if ((status = system(cmd)) < 0) testutil_die(status, "system: %s", cmd); if (WEXITSTATUS(status) != 0) @@ -390,11 +392,12 @@ main(int argc, char *argv[]) * We need to chmod the read-only databases back so that they can * be removed by scripts. */ - (void)snprintf(cmd, sizeof(cmd), "chmod 0777 %s %s", home_rd, home_rd2); + testutil_check(__wt_snprintf( + cmd, sizeof(cmd), "chmod 0777 %s %s", home_rd, home_rd2)); if ((status = system(cmd)) < 0) testutil_die(status, "system: %s", cmd); - (void)snprintf(cmd, sizeof(cmd), "chmod -R 0666 %s/* %s/*", - home_rd, home_rd2); + testutil_check(__wt_snprintf( + cmd, sizeof(cmd), "chmod -R 0666 %s/* %s/*", home_rd, home_rd2)); if ((status = system(cmd)) < 0) testutil_die(status, "system: %s", cmd); printf(" *** Readonly test successful ***\n"); diff --git a/test/recovery/random-abort.c b/test/recovery/random-abort.c index 1d6599ce1b3..febe6530534 100644 --- a/test/recovery/random-abort.c +++ b/test/recovery/random-abort.c @@ -94,14 +94,16 @@ thread_run(void *arg) /* * The value is the name of the record file with our id appended. */ - snprintf(buf, sizeof(buf), RECORDS_FILE, td->id); + testutil_check(__wt_snprintf(buf, sizeof(buf), RECORDS_FILE, td->id)); /* * Set up a large value putting our id in it. Write it in there a * bunch of times, but the rest of the buffer can just be zero. 
*/ - snprintf(lgbuf, sizeof(lgbuf), "th-%" PRIu32, td->id); + testutil_check(__wt_snprintf( + lgbuf, sizeof(lgbuf), "th-%" PRIu32, td->id)); for (i = 0; i < 128; i += strlen(lgbuf)) - snprintf(&large[i], lsize - i, "%s", lgbuf); + testutil_check(__wt_snprintf( + &large[i], lsize - i, "%s", lgbuf)); /* * Keep a separate file with the records we wrote for checking. */ @@ -124,7 +126,8 @@ thread_run(void *arg) * Write our portion of the key space until we're killed. */ for (i = td->start; ; ++i) { - snprintf(kname, sizeof(kname), "%" PRIu64, i); + testutil_check(__wt_snprintf( + kname, sizeof(kname), "%" PRIu64, i)); cursor->set_key(cursor, kname); /* * Every 30th record write a very large record that exceeds the @@ -313,7 +316,8 @@ main(int argc, char *argv[]) * still exists in case the child aborts for some reason we * don't stay in this loop forever. */ - snprintf(statname, sizeof(statname), "%s/%s", home, fs_main); + testutil_check(__wt_snprintf( + statname, sizeof(statname), "%s/%s", home, fs_main)); while (stat(statname, &sb) != 0 && kill(pid, 0) == 0) sleep(1); sleep(timeout); @@ -348,7 +352,8 @@ main(int argc, char *argv[]) fatal = false; for (i = 0; i < nth; ++i) { middle = 0; - snprintf(fname, sizeof(fname), RECORDS_FILE, i); + testutil_check(__wt_snprintf( + fname, sizeof(fname), RECORDS_FILE, i)); if ((fp = fopen(fname, "r")) == NULL) testutil_die(errno, "fopen: %s", fname); @@ -376,7 +381,8 @@ main(int argc, char *argv[]) fname, key, last_key); break; } - snprintf(kname, sizeof(kname), "%" PRIu64, key); + testutil_check(__wt_snprintf( + kname, sizeof(kname), "%" PRIu64, key)); cursor->set_key(cursor, kname); if ((ret = cursor->search(cursor)) != 0) { if (ret != WT_NOTFOUND) diff --git a/test/recovery/truncated-log.c b/test/recovery/truncated-log.c index 1f0a0f7a7bd..a127d8c1c63 100644 --- a/test/recovery/truncated-log.c +++ b/test/recovery/truncated-log.c @@ -30,11 +30,6 @@ #include -#ifdef _WIN32 -/* snprintf is not supported on <= VS2013 */ -#define 
snprintf _snprintf -#endif - static char home[1024]; /* Program working dir */ static const char * const uri = "table:main"; @@ -137,7 +132,8 @@ usage(void) * Child process creates the database and table, and then writes data into * the table until it is killed by the parent. */ -static void fill_db(void)WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); +static void fill_db(void) + WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); static void fill_db(void) { @@ -193,9 +189,9 @@ fill_db(void) max_key = min_key * 2; first = true; for (i = 0; i < max_key; ++i) { - snprintf(k, sizeof(k), "key%03d", (int)i); - snprintf(v, sizeof(v), "value%0*d", - (int)(V_SIZE - strlen("value")), (int)i); + testutil_check(__wt_snprintf(k, sizeof(k), "key%03d", (int)i)); + testutil_check(__wt_snprintf(v, sizeof(v), "value%0*d", + (int)(V_SIZE - (strlen("value") + 1)), (int)i)); cursor->set_key(cursor, k); cursor->set_value(cursor, v); if ((ret = cursor->insert(cursor)) != 0) diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c index 942f7faba03..83f9c6349bc 100644 --- a/test/salvage/salvage.c +++ b/test/salvage/salvage.c @@ -440,7 +440,8 @@ run(int r) process(); - snprintf(buf, sizeof(buf), "cmp %s %s > /dev/null", DUMP, RSLT); + testutil_check(__wt_snprintf( + buf, sizeof(buf), "cmp %s %s > /dev/null", DUMP, RSLT)); if (system(buf)) { fprintf(stderr, "check failed, salvage results were incorrect\n"); @@ -485,28 +486,28 @@ build(int ikey, int ivalue, int cnt) switch (page_type) { case WT_PAGE_COL_FIX: - (void)snprintf(config, sizeof(config), + testutil_check(__wt_snprintf(config, sizeof(config), "key_format=r,value_format=7t," "allocation_size=%d," "internal_page_max=%d,internal_item_max=%d," "leaf_page_max=%d,leaf_item_max=%d", - PSIZE, PSIZE, OSIZE, PSIZE, OSIZE); + PSIZE, PSIZE, OSIZE, PSIZE, OSIZE)); break; case WT_PAGE_COL_VAR: - (void)snprintf(config, sizeof(config), + testutil_check(__wt_snprintf(config, sizeof(config), "key_format=r," "allocation_size=%d," 
"internal_page_max=%d,internal_item_max=%d," "leaf_page_max=%d,leaf_item_max=%d", - PSIZE, PSIZE, OSIZE, PSIZE, OSIZE); + PSIZE, PSIZE, OSIZE, PSIZE, OSIZE)); break; case WT_PAGE_ROW_LEAF: - (void)snprintf(config, sizeof(config), + testutil_check(__wt_snprintf(config, sizeof(config), "key_format=u," "allocation_size=%d," "internal_page_max=%d,internal_item_max=%d," "leaf_page_max=%d,leaf_item_max=%d", - PSIZE, PSIZE, OSIZE, PSIZE, OSIZE); + PSIZE, PSIZE, OSIZE, PSIZE, OSIZE)); break; default: assert(0); @@ -520,7 +521,8 @@ build(int ikey, int ivalue, int cnt) case WT_PAGE_COL_VAR: break; case WT_PAGE_ROW_LEAF: - snprintf(kbuf, sizeof(kbuf), "%010d KEY------", ikey); + testutil_check(__wt_snprintf( + kbuf, sizeof(kbuf), "%010d KEY------", ikey)); key.data = kbuf; key.size = 20; cursor->set_key(cursor, &key); @@ -533,8 +535,8 @@ build(int ikey, int ivalue, int cnt) break; case WT_PAGE_COL_VAR: case WT_PAGE_ROW_LEAF: - snprintf(vbuf, sizeof(vbuf), - "%010d VALUE----", value_unique ? ivalue : 37); + testutil_check(__wt_snprintf(vbuf, sizeof(vbuf), + "%010d VALUE----", value_unique ? ivalue : 37)); value.data = vbuf; value.size = 20; cursor->set_value(cursor, &value); @@ -621,9 +623,9 @@ process(void) /* Salvage. 
*/ config[0] = '\0'; if (verbose) - snprintf(config, sizeof(config), + testutil_check(__wt_snprintf(config, sizeof(config), "error_prefix=\"%s\",verbose=[salvage,verify],", - progname); + progname)); strcat(config, "log=(enabled=false),"); CHECK(wiredtiger_open(NULL, NULL, config, &conn) == 0); diff --git a/test/thread/file.c b/test/thread/file.c index 81ec6ad44f8..7a7d16c4cd6 100644 --- a/test/thread/file.c +++ b/test/thread/file.c @@ -33,20 +33,18 @@ file_create(const char *name) { WT_SESSION *session; int ret; - char *p, *end, config[128]; + char config[128]; if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) testutil_die(ret, "conn.session"); - p = config; - end = config + sizeof(config); - p += snprintf(p, (size_t)(end - p), + testutil_check(__wt_snprintf(config, sizeof(config), "key_format=%s," "internal_page_max=%d," - "leaf_page_max=%d,", - ftype == ROW ? "u" : "r", 16 * 1024, 128 * 1024); - if (ftype == FIX) - (void)snprintf(p, (size_t)(end - p), ",value_format=3t"); + "leaf_page_max=%d," + "%s", + ftype == ROW ? "u" : "r", 16 * 1024, 128 * 1024, + ftype == FIX ? 
",value_format=3t" : "")); if ((ret = session->create(session, name, config)) != 0) if (ret != EEXIST) @@ -62,9 +60,10 @@ load(const char *name) WT_CURSOR *cursor; WT_ITEM *key, _key, *value, _value; WT_SESSION *session; - char keybuf[64], valuebuf[64]; - u_int keyno; + uint64_t keyno; + size_t len; int ret; + char keybuf[64], valuebuf[64]; file_create(name); @@ -79,18 +78,22 @@ load(const char *name) value = &_value; for (keyno = 1; keyno <= nkeys; ++keyno) { if (ftype == ROW) { + testutil_check(__wt_snprintf_len_set( + keybuf, sizeof(keybuf), + &len, "%017" PRIu64, keyno)); key->data = keybuf; - key->size = (uint32_t) - snprintf(keybuf, sizeof(keybuf), "%017u", keyno); + key->size = (uint32_t)len; cursor->set_key(cursor, key); } else - cursor->set_key(cursor, (uint32_t)keyno); - value->data = valuebuf; + cursor->set_key(cursor, keyno); if (ftype == FIX) cursor->set_value(cursor, 0x01); else { - value->size = (uint32_t) - snprintf(valuebuf, sizeof(valuebuf), "%37u", keyno); + testutil_check(__wt_snprintf_len_set( + valuebuf, sizeof(valuebuf), + &len, "%37" PRIu64, keyno)); + value->data = valuebuf; + value->size = (uint32_t)len; cursor->set_value(cursor, value); } if ((ret = cursor->insert(cursor)) != 0) diff --git a/test/thread/rw.c b/test/thread/rw.c index c6107a06c49..e8a2650ca51 100644 --- a/test/thread/rw.c +++ b/test/thread/rw.c @@ -66,7 +66,8 @@ rw_start(u_int readers, u_int writers) for (i = 0; i < writers; ++i) { if (i == 0 || multiple_files) { run_info[i].name = dmalloc(64); - snprintf(run_info[i].name, 64, FNAME, i); + testutil_check(__wt_snprintf( + run_info[i].name, 64, FNAME, i)); /* Vary by orders of magnitude */ if (vary_nops) @@ -88,8 +89,8 @@ rw_start(u_int readers, u_int writers) run_info[offset].name = dmalloc(64); /* Have readers read from tables with writes. 
*/ name_index = i % writers; - snprintf( - run_info[offset].name, 64, FNAME, name_index); + testutil_check(__wt_snprintf( + run_info[offset].name, 64, FNAME, name_index)); /* Vary by orders of magnitude */ if (vary_nops) @@ -158,7 +159,8 @@ static inline void reader_op(WT_SESSION *session, WT_CURSOR *cursor, INFO *s) { WT_ITEM *key, _key; - u_int keyno; + size_t len; + uint64_t keyno; int ret; char keybuf[64]; @@ -166,17 +168,18 @@ reader_op(WT_SESSION *session, WT_CURSOR *cursor, INFO *s) keyno = __wt_random(&s->rnd) % nkeys + 1; if (ftype == ROW) { + testutil_check(__wt_snprintf_len_set( + keybuf, sizeof(keybuf), &len, "%017" PRIu64, keyno)); key->data = keybuf; - key->size = (uint32_t) - snprintf(keybuf, sizeof(keybuf), "%017u", keyno); + key->size = (uint32_t)len; cursor->set_key(cursor, key); } else - cursor->set_key(cursor, (uint32_t)keyno); + cursor->set_key(cursor, keyno); if ((ret = cursor->search(cursor)) != 0 && ret != WT_NOTFOUND) testutil_die(ret, "cursor.search"); if (log_print) testutil_check(session->log_printf(session, - "Reader Thread %p key %017u", pthread_self(), keyno)); + "Reader Thread %p key %017" PRIu64, pthread_self(), keyno)); } /* @@ -195,7 +198,7 @@ reader(void *arg) id = (int)(uintptr_t)arg; s = &run_info[id]; - __wt_thread_id(tid, sizeof(tid)); + testutil_check(__wt_thread_id(tid, sizeof(tid))); __wt_random_init(&s->rnd); printf(" read thread %2d starting: tid: %s, file: %s\n", @@ -242,7 +245,8 @@ static inline void writer_op(WT_SESSION *session, WT_CURSOR *cursor, INFO *s) { WT_ITEM *key, _key, *value, _value; - u_int keyno; + uint64_t keyno; + size_t len; int ret; char keybuf[64], valuebuf[64]; @@ -251,12 +255,13 @@ writer_op(WT_SESSION *session, WT_CURSOR *cursor, INFO *s) keyno = __wt_random(&s->rnd) % nkeys + 1; if (ftype == ROW) { + testutil_check(__wt_snprintf_len_set( + keybuf, sizeof(keybuf), &len, "%017" PRIu64, keyno)); key->data = keybuf; - key->size = (uint32_t) - snprintf(keybuf, sizeof(keybuf), "%017u", keyno); + 
key->size = (uint32_t)len; cursor->set_key(cursor, key); } else - cursor->set_key(cursor, (uint32_t)keyno); + cursor->set_key(cursor, keyno); if (keyno % 5 == 0) { ++s->remove; if ((ret = @@ -268,8 +273,10 @@ writer_op(WT_SESSION *session, WT_CURSOR *cursor, INFO *s) if (ftype == FIX) cursor->set_value(cursor, 0x10); else { - value->size = (uint32_t)snprintf( - valuebuf, sizeof(valuebuf), "XXX %37u", keyno); + testutil_check(__wt_snprintf_len_set( + valuebuf, sizeof(valuebuf), + &len, "XXX %37" PRIu64, keyno)); + value->size = (uint32_t)len; cursor->set_value(cursor, value); } if ((ret = cursor->update(cursor)) != 0) @@ -277,7 +284,7 @@ writer_op(WT_SESSION *session, WT_CURSOR *cursor, INFO *s) } if (log_print) testutil_check(session->log_printf(session, - "Writer Thread %p key %017u", pthread_self(), keyno)); + "Writer Thread %p key %017" PRIu64, pthread_self(), keyno)); } /* @@ -296,7 +303,7 @@ writer(void *arg) id = (int)(uintptr_t)arg; s = &run_info[id]; - __wt_thread_id(tid, sizeof(tid)); + testutil_check(__wt_thread_id(tid, sizeof(tid))); __wt_random_init(&s->rnd); printf("write thread %2d starting: tid: %s, file: %s\n", diff --git a/test/thread/stats.c b/test/thread/stats.c index 67a2c02719b..839d65e8a4d 100644 --- a/test/thread/stats.c +++ b/test/thread/stats.c @@ -65,7 +65,8 @@ stats(void) /* File statistics. 
*/ if (!multiple_files) { - (void)snprintf(name, sizeof(name), "statistics:" FNAME, 0); + testutil_check(__wt_snprintf( + name, sizeof(name), "statistics:" FNAME, 0)); if ((ret = session->open_cursor( session, name, NULL, NULL, &cursor)) != 0) testutil_die(ret, "session.open_cursor"); diff --git a/test/thread/t.c b/test/thread/t.c index 9dfd02bdad2..d2ed4c74bb7 100644 --- a/test/thread/t.c +++ b/test/thread/t.c @@ -185,19 +185,15 @@ wt_connect(char *config_open) }; int ret; char config[512]; - size_t print_count; testutil_clean_work_dir(home); testutil_make_work_dir(home); - print_count = (size_t)snprintf(config, sizeof(config), + testutil_check(__wt_snprintf(config, sizeof(config), "create,statistics=(all),error_prefix=\"%s\",%s%s", progname, config_open == NULL ? "" : ",", - config_open == NULL ? "" : config_open); - - if (print_count >= sizeof(config)) - testutil_die(EINVAL, "Config string too long"); + config_open == NULL ? "" : config_open)); if ((ret = wiredtiger_open(home, &event_handler, config, &conn)) != 0) testutil_die(ret, "wiredtiger_open"); diff --git a/test/utility/misc.c b/test/utility/misc.c index 61dad3d76c2..934dac86a7b 100644 --- a/test/utility/misc.c +++ b/test/utility/misc.c @@ -108,14 +108,14 @@ testutil_clean_work_dir(const char *dir) if ((buf = malloc(len)) == NULL) testutil_die(ENOMEM, "Failed to allocate memory"); - snprintf(buf, len, "%s %s %s %s", DIR_EXISTS_COMMAND, dir, - RM_COMMAND, dir); + testutil_check(__wt_snprintf( + buf, len, "%s %s %s %s", DIR_EXISTS_COMMAND, dir, RM_COMMAND, dir)); #else len = strlen(dir) + strlen(RM_COMMAND) + 1; if ((buf = malloc(len)) == NULL) testutil_die(ENOMEM, "Failed to allocate memory"); - snprintf(buf, len, "%s%s", RM_COMMAND, dir); + testutil_check(__wt_snprintf(buf, len, "%s%s", RM_COMMAND, dir)); #endif if ((ret = system(buf)) != 0 && ret != ENOENT) @@ -142,7 +142,7 @@ testutil_make_work_dir(char *dir) testutil_die(ENOMEM, "Failed to allocate memory"); /* mkdir shares syntax between Windows and 
Linux */ - snprintf(buf, len, "%s%s", MKDIR_COMMAND, dir); + testutil_check(__wt_snprintf(buf, len, "%s%s", MKDIR_COMMAND, dir)); if ((ret = system(buf)) != 0) testutil_die(ret, "%s", buf); free(buf); diff --git a/test/utility/parse_opts.c b/test/utility/parse_opts.c index af9256b199a..c3eff3360de 100644 --- a/test/utility/parse_opts.c +++ b/test/utility/parse_opts.c @@ -115,13 +115,15 @@ testutil_parse_opts(int argc, char * const *argv, TEST_OPTS *opts) if (opts->home == NULL) { len = strlen("WT_TEST.") + strlen(opts->progname) + 10; opts->home = dmalloc(len); - snprintf(opts->home, len, "WT_TEST.%s", opts->progname); + testutil_check(__wt_snprintf( + opts->home, len, "WT_TEST.%s", opts->progname)); } /* Setup the default URI string */ len = strlen("table:") + strlen(opts->progname) + 10; opts->uri = dmalloc(len); - snprintf(opts->uri, len, "table:%s", opts->progname); + testutil_check(__wt_snprintf( + opts->uri, len, "table:%s", opts->progname)); return (0); } diff --git a/test/utility/thread.c b/test/utility/thread.c index 38465b2f02b..122ad554442 100644 --- a/test/utility/thread.c +++ b/test/utility/thread.c @@ -57,8 +57,8 @@ thread_append(void *arg) if (opts->table_type == TABLE_FIX) cursor->set_value(cursor, buf[0]); else { - snprintf(buf, sizeof(buf), - "%" PRIu64 " VALUE ------", recno); + testutil_check(__wt_snprintf(buf, sizeof(buf), + "%" PRIu64 " VALUE ------", recno)); cursor->set_value(cursor, buf); } testutil_check(cursor->insert(cursor)); @@ -94,7 +94,8 @@ thread_insert_append(void *arg) session, opts->uri, NULL, NULL, &cursor)); for (i = 0; i < opts->nrecords; ++i) { - snprintf(kbuf, sizeof(kbuf), "%010d KEY------", (int)i); + testutil_check(__wt_snprintf( + kbuf, sizeof(kbuf), "%010d KEY------", (int)i)); cursor->set_key(cursor, kbuf); cursor->set_value(cursor, "========== VALUE ======="); testutil_check(cursor->insert(cursor)); diff --git a/test/windows/windows_shim.h b/test/windows/windows_shim.h index 648b991b1a2..8985904fb19 100644 --- 
a/test/windows/windows_shim.h +++ b/test/windows/windows_shim.h @@ -36,6 +36,8 @@ #include #include +#include "wt_internal.h" + #define inline __inline /* Define some POSIX types */ @@ -52,12 +54,7 @@ typedef int u_int; /* snprintf does not exist on <= VS 2013 */ #if _MSC_VER < 1900 -#define snprintf _wt_snprintf - -_Check_return_opt_ int __cdecl _wt_snprintf( - _Out_writes_(_MaxCount) char * _DstBuf, - _In_ size_t _MaxCount, - _In_z_ _Printf_format_string_ const char * _Format, ...); +#define snprintf __wt_snprintf #endif /* -- cgit v1.2.1 From 0641cc7b36a130111c19c955875862ed989a1beb Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 24 Mar 2017 08:59:59 -0400 Subject: WT-3136 bug fix: WiredTiger doesn't check sprintf calls for error return (#3347) Add a style check for use of the snprintf/vsnprintf calls rather than the WiredTiger library replacements. Fix a wtperf snprintf call I missed. --- bench/wtperf/wtperf.c | 4 ++-- dist/s_style | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 1eedaba4f32..80416cfdd5c 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -2608,8 +2608,8 @@ main(int argc, char *argv[]) append_comma = ","; } if (opts->in_memory) { - pos += (size_t)snprintf( - cc_buf + pos, req_len - pos, "%s%s", + testutil_check(__wt_snprintf_len_incr( + cc_buf + pos, req_len - pos, &pos, "%s%s", append_comma, "in_memory=true"); append_comma = ","; } diff --git a/dist/s_style b/dist/s_style index 8e755224ee2..388a481ef56 100755 --- a/dist/s_style +++ b/dist/s_style @@ -93,6 +93,14 @@ else cat $t fi + if ! expr "$f" : 'examples/c/*' > /dev/null && + ! expr "$f" : 'ext/*' > /dev/null && + ! expr "$f" : 'src/os_posix/os_snprintf.c' > /dev/null && + egrep '[^a-z_]snprintf\(|[^a-z_]vsnprintf\(' $f > $t; then + echo "$f: snprintf call, use WiredTiger library replacements" + cat $t + fi + # Alignment directive before "struct". 
egrep 'WT_COMPILER_TYPE_ALIGN.*struct' $f > $t test -s $t && { -- cgit v1.2.1 From aba8062d15b6a255542e68b5266fcb61aaa2838c Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 24 Mar 2017 09:37:36 -0400 Subject: WT-3136 bug fix: WiredTiger doesn't check sprintf calls for error return (#3348) Fix a typo. --- bench/wtperf/wtperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 80416cfdd5c..bdc0b0f3b3c 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -2610,7 +2610,7 @@ main(int argc, char *argv[]) if (opts->in_memory) { testutil_check(__wt_snprintf_len_incr( cc_buf + pos, req_len - pos, &pos, "%s%s", - append_comma, "in_memory=true"); + append_comma, "in_memory=true")); append_comma = ","; } if (sess_cfg != NULL && strlen(sess_cfg) != 0) { -- cgit v1.2.1 From e552b240c997dba9434cd3d2d5b563bec7df5b96 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 24 Mar 2017 11:24:02 -0400 Subject: WT-98 Update the current cursor value without a search Revert "Change LSM WT_CURSOR.{compare,insert,update,remove} to accept an internal key instead of copying the key into WiredTiger-owned memory (in other words, replace WT_CURSOR_NEEDKEY calls with WT_CURSOR_CHECKKEY)." This reverts commit af2c787. 
--- src/lsm/lsm_cursor.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 0de39b38370..52265f02e62 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -844,8 +844,8 @@ __clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) WT_ERR_MSG(session, EINVAL, "comparison method cursors must reference the same object"); - WT_CURSOR_CHECKKEY(a); - WT_CURSOR_CHECKKEY(b); + WT_CURSOR_NEEDKEY(a); + WT_CURSOR_NEEDKEY(b); WT_ERR(__wt_compare( session, alsm->lsm_tree->collator, &a->key, &b->key, cmpp)); @@ -1521,7 +1521,7 @@ __clsm_insert(WT_CURSOR *cursor) clsm = (WT_CURSOR_LSM *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL); - WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_NEEDKEY(cursor); WT_CURSOR_NEEDVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, true)); @@ -1565,7 +1565,7 @@ __clsm_update(WT_CURSOR *cursor) clsm = (WT_CURSOR_LSM *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, update, NULL); - WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_NEEDKEY(cursor); WT_CURSOR_NEEDVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, true)); @@ -1612,7 +1612,7 @@ __clsm_remove(WT_CURSOR *cursor) positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT); CURSOR_REMOVE_API_CALL(cursor, session, NULL); - WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_NEEDKEY(cursor); WT_CURSOR_NOVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, true)); -- cgit v1.2.1 From c2bde1ea5a810f47f26fb7a6e70fe9612ea15f1f Mon Sep 17 00:00:00 2001 From: sueloverso Date: Sun, 26 Mar 2017 21:48:14 -0400 Subject: WT-3207 Use config to determine checkpoint force value. 
(#3350) --- src/txn/txn_ckpt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 5ec8aa19e4c..f4ccf5eacd0 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -1599,7 +1599,9 @@ __checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[]) int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) { + WT_CONFIG_ITEM cval; WT_DECL_RET; + bool force; /* Should not be called with a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); @@ -1608,8 +1610,10 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || F_ISSET(session, WT_SESSION_LOCKED_METADATA)); + WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); + force = cval.val != 0; WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree( - session, true, false, true, cfg)); + session, true, force, true, cfg)); WT_RET(ret); if (F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT)) return (0); -- cgit v1.2.1 From e36d8cdb2748ad5b6713b824bbe7be0c8f11c14d Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 27 Mar 2017 09:18:34 -0400 Subject: WT-3240 Coverity reports (#3354) * WT-3240 Coverity reports Coverity report 1373075: allocated memory is leaked if __wt_snprintf fails. * Coverity report 1373074: allocated memory is leaked if __wt_snprintf fails. * Coverity report 1373073: allocated memory is leaked if __wt_snprintf fails. * Coverity report 1373072: allocated memory is leaked if __wt_snprintf fails. * Coverity report 1373071: allocated memory is leaked if __wt_snprintf fails. * Coverity report 1369053: CID 1369053 (#1 of 1): Unused value (UNUSED_VALUE) assigned_pointer: Assigning value from "," to append_comma here, but that stored value is overwritten before it can be used. 
--- bench/wtperf/wtperf.c | 2 -- src/config/config_api.c | 2 +- src/os_common/filename.c | 18 ++++++++++-------- src/utilities/util_dump.c | 4 +++- src/utilities/util_load_json.c | 8 +++++--- src/utilities/util_main.c | 12 ++++++------ 6 files changed, 25 insertions(+), 21 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index bdc0b0f3b3c..6d79eebe8b2 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -2623,7 +2623,6 @@ main(int argc, char *argv[]) testutil_check(__wt_snprintf_len_incr( cc_buf + pos, req_len - pos, &pos, "%s%s", append_comma, user_cconfig)); - append_comma = ","; } if (strlen(cc_buf) != 0 && (ret = @@ -2658,7 +2657,6 @@ main(int argc, char *argv[]) testutil_check(__wt_snprintf_len_incr( tc_buf + pos, req_len - pos, &pos, "%s%s", append_comma, user_tconfig)); - append_comma = ","; } if (strlen(tc_buf) != 0 && (ret = diff --git a/src/config/config_api.c b/src/config/config_api.c index 88e173459f9..c1299baaafe 100644 --- a/src/config/config_api.c +++ b/src/config/config_api.c @@ -278,8 +278,8 @@ __wt_configure_method(WT_SESSION_IMPL *session, entry->method = (*epp)->method; len = strlen((*epp)->base) + strlen(",") + strlen(config) + 1; WT_ERR(__wt_calloc_def(session, len, &p)); - WT_ERR(__wt_snprintf(p, len, "%s,%s", (*epp)->base, config)); entry->base = p; + WT_ERR(__wt_snprintf(p, len, "%s,%s", (*epp)->base, config)); /* * There may be a default value in the config argument passed in (for diff --git a/src/os_common/filename.c b/src/os_common/filename.c index f803144a3fb..d5695f63d91 100644 --- a/src/os_common/filename.c +++ b/src/os_common/filename.c @@ -29,6 +29,7 @@ int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path) { + WT_DECL_RET; size_t len; char *buf; @@ -39,16 +40,17 @@ __wt_nfilename( * the exists API which is used by the test utilities. 
*/ if (session == NULL || __wt_absolute_path(name)) - WT_RET(__wt_strndup(session, name, namelen, path)); - else { - len = strlen(S2C(session)->home) + 1 + namelen + 1; - WT_RET(__wt_calloc(session, 1, len, &buf)); - WT_RET(__wt_snprintf(buf, len, "%s%s%.*s", S2C(session)->home, - __wt_path_separator(), (int)namelen, name)); - *path = buf; - } + return (__wt_strndup(session, name, namelen, path)); + len = strlen(S2C(session)->home) + 1 + namelen + 1; + WT_RET(__wt_calloc(session, 1, len, &buf)); + WT_ERR(__wt_snprintf(buf, len, "%s%s%.*s", + S2C(session)->home, __wt_path_separator(), (int)namelen, name)); + *path = buf; return (0); + +err: __wt_free(session, buf); + return (ret); } /* diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index 238e2757099..955148b7d46 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -437,8 +437,10 @@ dump_table_parts_config(WT_SESSION *session, WT_CURSOR *cursor, len = strlen(entry) + strlen(name) + 1; if ((uriprefix = malloc(len)) == NULL) return (util_err(session, errno, NULL)); - if ((ret = __wt_snprintf(uriprefix, len, "%s%s", entry, name)) != 0) + if ((ret = __wt_snprintf(uriprefix, len, "%s%s", entry, name)) != 0) { + free(uriprefix); return (util_err(session, ret, NULL)); + } /* * Search the file looking for column group and index key/value pairs: diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c index af5c2576b26..c693e2b7651 100644 --- a/src/utilities/util_load_json.c +++ b/src/utilities/util_load_json.c @@ -153,13 +153,15 @@ json_kvraw_append(WT_SESSION *session, needsize = strlen(ins->kvraw) + len + 2; if ((tmp = malloc(needsize)) == NULL) return (util_err(session, errno, NULL)); - if ((ret = __wt_snprintf( - tmp, needsize, "%s %.*s", ins->kvraw, (int)len, str)) != 0) - return (util_err(session, ret, NULL)); + WT_ERR(__wt_snprintf( + tmp, needsize, "%s %.*s", ins->kvraw, (int)len, str)); free(ins->kvraw); ins->kvraw = tmp; } return (0); + +err: free(tmp); + 
return (util_err(session, ret, NULL)); } /* diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index 2b4ef36081a..c6f225bb667 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -326,12 +326,12 @@ util_uri(WT_SESSION *session, const char *s, const char *type) * the default type for the operation. */ if (strchr(s, ':') != NULL) - ret = __wt_snprintf(name, len, "%s", s); + WT_ERR(__wt_snprintf(name, len, "%s", s)); else - ret = __wt_snprintf(name, len, "%s:%s", type, s); - if (ret != 0) { - (void)util_err(session, ret, NULL); - return (NULL); - } + WT_ERR(__wt_snprintf(name, len, "%s:%s", type, s)); return (name); + +err: free(name); + (void)util_err(session, ret, NULL); + return (NULL); } -- cgit v1.2.1 From a5b3166ab7bcdb365b60686246b8e5624efeca84 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 27 Mar 2017 09:44:45 -0400 Subject: SERVER-28168 Cannot start or repair mongodb after unexpected shutdown. (#3353) Panic if there's an error in reading/writing from/to the turtle file, there's no point in continuing. This change avoids user confusion when the turtle file is corrupted or zero'd out by the filesystem. --- src/meta/meta_turtle.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c index 66e34c728f2..5a089471059 100644 --- a/src/meta/meta_turtle.c +++ b/src/meta/meta_turtle.c @@ -242,7 +242,7 @@ __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep) WT_DECL_ITEM(buf); WT_DECL_RET; WT_FSTREAM *fs; - bool exist, match; + bool exist; *valuep = NULL; @@ -258,22 +258,19 @@ __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep) __metadata_config(session, valuep) : WT_NOTFOUND); WT_RET(__wt_fopen(session, WT_METADATA_TURTLE, 0, WT_STREAM_READ, &fs)); - /* Search for the key. */ WT_ERR(__wt_scr_alloc(session, 512, &buf)); - for (match = false;;) { + + /* Search for the key. 
*/ + do { WT_ERR(__wt_getline(session, fs, buf)); if (buf->size == 0) WT_ERR(WT_NOTFOUND); - if (strcmp(key, buf->data) == 0) - match = true; + } while (strcmp(key, buf->data) != 0); - /* Key matched: read the subsequent line for the value. */ - WT_ERR(__wt_getline(session, fs, buf)); - if (buf->size == 0) - WT_ERR(__wt_illegal_value(session, WT_METADATA_TURTLE)); - if (match) - break; - } + /* Key matched: read the subsequent line for the value. */ + WT_ERR(__wt_getline(session, fs, buf)); + if (buf->size == 0) + WT_ERR(WT_NOTFOUND); /* Copy the value for the caller. */ WT_ERR(__wt_strdup(session, buf->data, valuep)); @@ -283,7 +280,12 @@ err: WT_TRET(__wt_fclose(session, &fs)); if (ret != 0) __wt_free(session, *valuep); - return (ret); + + /* + * A file error or a missing key/value pair in the turtle file means + * something has gone horribly wrong -- we're done. + */ + return (ret == 0 ? 0 : __wt_illegal_value(session, WT_METADATA_TURTLE)); } /* @@ -322,5 +324,9 @@ __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value) err: WT_TRET(__wt_fclose(session, &fs)); WT_TRET(__wt_remove_if_exists(session, WT_METADATA_TURTLE_SET, false)); - return (ret); + /* + * An error updating the turtle file means something has gone horribly + * wrong -- we're done. + */ + return (ret == 0 ? 0 : __wt_illegal_value(session, WT_METADATA_TURTLE)); } -- cgit v1.2.1 From d5a10d2e97853e7db6bb4c2635b97febf13607c5 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Tue, 28 Mar 2017 20:08:23 -0400 Subject: WT-3238 Java: Fix Cursor.compare and Cursor.equals to return int values. (#3355) Non-zero int values for these functions should not raise exceptions. 
--- lang/java/Makefile.am | 1 + lang/java/wiredtiger.i | 10 +- test/java/com/wiredtiger/test/CursorTest03.java | 175 +++++++++++++++++++++ test/java/com/wiredtiger/test/WiredTigerSuite.java | 1 + 4 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 test/java/com/wiredtiger/test/CursorTest03.java diff --git a/lang/java/Makefile.am b/lang/java/Makefile.am index 7184fe610dc..2ff822a5d08 100644 --- a/lang/java/Makefile.am +++ b/lang/java/Makefile.am @@ -49,6 +49,7 @@ JAVA_JUNIT = \ $(JAVATEST)/ConcurrentCloseTest.java \ $(JAVATEST)/CursorTest.java \ $(JAVATEST)/CursorTest02.java \ + $(JAVATEST)/CursorTest03.java \ $(JAVATEST)/ExceptionTest.java \ $(JAVATEST)/PackTest.java \ $(JAVATEST)/PackTest02.java \ diff --git a/lang/java/wiredtiger.i b/lang/java/wiredtiger.i index efc512f2f5a..275b708090c 100644 --- a/lang/java/wiredtiger.i +++ b/lang/java/wiredtiger.i @@ -319,6 +319,15 @@ WT_CLASS(struct __wt_async_op, WT_ASYNC_OP, op) %rename (getValueFormat) __wt_async_op::getValue_format; %rename (getType) __wt_async_op::get_type; +/* + * Special cases: override the out typemap, return checking is done in the + * wrapper. + */ +%typemap(out) int __wt_cursor::compare_wrap, + int __wt_cursor::equals_wrap %{ + $result = $1; +%} + /* SWIG magic to turn Java byte strings into data / size. */ %apply (char *STRING, int LENGTH) { (char *data, int size) }; @@ -529,7 +538,6 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler}; %} %extend __wt_async_op { - %javamethodmodifiers get_key_wrap "protected"; WT_ITEM get_key_wrap(JNIEnv *jenv) { WT_ITEM k; diff --git a/test/java/com/wiredtiger/test/CursorTest03.java b/test/java/com/wiredtiger/test/CursorTest03.java new file mode 100644 index 00000000000..64f33f4d7b6 --- /dev/null +++ b/test/java/com/wiredtiger/test/CursorTest03.java @@ -0,0 +1,175 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. 
+ * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package com.wiredtiger.test; + +import com.wiredtiger.db.Connection; +import com.wiredtiger.db.Cursor; +import com.wiredtiger.db.SearchStatus; +import com.wiredtiger.db.Session; +import com.wiredtiger.db.WiredTigerPackingException; +import com.wiredtiger.db.WiredTigerException; +import com.wiredtiger.db.wiredtiger; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; +import org.junit.Assert; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/* + * Test cases for WT-3238. + * + * Most WiredTiger methods return int, and our SWIG typemaps for Java add + * checking that throws exceptions for non-zero returns. Certain methods + * (Cursor.compare, Cursor.equals) are declared as returning int in Java, + * but should not throw exceptions for normal returns (which may be + * non-zero). 
+ */ +public class CursorTest03 { + Connection conn; + Session s; + static String values[] = { "key0", "key1" }; + + @Test + public void cursor_int_methods() + throws WiredTigerPackingException { + setup(); + + Cursor c1 = s.open_cursor("table:t", null, null); + Cursor c2 = s.open_cursor("table:t", null, null); + for (String s : values) { + c1.putKeyString(s); + c1.putValueString(s); + c1.insert(); + } + c1.reset(); + + // "key1" compared to "key1" + c1.putKeyString(values[1]); + Assert.assertEquals(c1.search_near(), SearchStatus.FOUND); + c2.putKeyString(values[1]); + Assert.assertEquals(c2.search_near(), SearchStatus.FOUND); + Assert.assertEquals(c1.compare(c2), 0); + Assert.assertEquals(c2.compare(c1), 0); + Assert.assertEquals(c1.compare(c1), 0); + Assert.assertEquals(c1.equals(c2), 1); + Assert.assertEquals(c2.equals(c1), 1); + Assert.assertEquals(c1.equals(c1), 1); + + // "key0" compared to "key1" + c1.putKeyString(values[0]); + Assert.assertEquals(c1.search_near(), SearchStatus.FOUND); + Assert.assertEquals(c1.compare(c2), -1); + Assert.assertEquals(c2.compare(c1), 1); + Assert.assertEquals(c1.equals(c2), 0); + Assert.assertEquals(c2.equals(c1), 0); + + c1.close(); + c2.close(); + teardown(); + } + + public void expectException(Cursor c1, Cursor c2) + { + boolean caught = false; + try { + c1.compare(c2); + } + catch (WiredTigerException wte) { + caught = true; + } + Assert.assertTrue(caught); + + caught = false; + try { + c1.equals(c2); + } + catch (WiredTigerException wte) { + caught = true; + } + Assert.assertTrue(caught); + } + + @Test + public void cursor_int_methods_errors() + throws WiredTigerPackingException { + setup(); + + Cursor c1 = s.open_cursor("table:t", null, null); + Cursor c2 = s.open_cursor("table:t", null, null); + Cursor cx = s.open_cursor("table:t2", null, null); + for (String s : values) { + c1.putKeyString(s); + c1.putValueString(s); + c1.insert(); + cx.putKeyString(s); + cx.putValueString(s); + cx.insert(); + } + c1.reset(); + 
cx.reset(); + + // With both cursors not set, should be an exception. + expectException(c1, c2); + expectException(c1, c2); + + // With any one cursor not set, should be an exception. + c1.putKeyString(values[1]); + Assert.assertEquals(c1.search_near(), SearchStatus.FOUND); + expectException(c1, c2); + expectException(c1, c2); + + // With two cursors from different tables, should be an exception. + cx.putKeyString(values[1]); + Assert.assertEquals(cx.search_near(), SearchStatus.FOUND); + expectException(c1, cx); + expectException(c1, cx); + + c1.close(); + c2.close(); + cx.close(); + teardown(); + } + + private void setup() { + conn = wiredtiger.open("WT_HOME", "create"); + s = conn.open_session(null); + s.create("table:t", "key_format=S,value_format=S"); + s.create("table:t2", "key_format=S,value_format=S"); + } + + private void teardown() { + s.drop("table:t", ""); + s.drop("table:t2", ""); + s.close(""); + conn.close(""); + } + +} + diff --git a/test/java/com/wiredtiger/test/WiredTigerSuite.java b/test/java/com/wiredtiger/test/WiredTigerSuite.java index 5bd98d53fac..9322d30671a 100644 --- a/test/java/com/wiredtiger/test/WiredTigerSuite.java +++ b/test/java/com/wiredtiger/test/WiredTigerSuite.java @@ -38,6 +38,7 @@ import org.junit.runners.Suite; ConfigTest.class, CursorTest.class, CursorTest02.class, + CursorTest03.class, ExceptionTest.class, PackTest.class, PackTest02.class, -- cgit v1.2.1 From 1c41c7735b3529521b7bd34180f80584caee7f59 Mon Sep 17 00:00:00 2001 From: Sulabh Mahajan Date: Wed, 29 Mar 2017 15:38:35 +1100 Subject: WT-2439 Improve page layout: keep pages more than half full (#3277) * Changes `split_pct` to have a minimum of 50%. 
--- dist/api_data.py | 4 +- src/btree/bt_handle.c | 9 +- src/config/config_def.c | 16 +- src/include/btree.h | 6 + src/include/wiredtiger.in | 4 +- src/reconcile/rec_write.c | 954 ++++++++++++++++++++++++---------------------- test/format/config.h | 2 +- 7 files changed, 532 insertions(+), 463 deletions(-) diff --git a/dist/api_data.py b/dist/api_data.py index 1d669fa7fe0..22600dd5e29 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -295,12 +295,12 @@ file_config = format_meta + file_runtime_config + [ Config('split_deepen_per_child', '0', r''' entries allocated per child when deepening the tree''', type='int', undoc=True), - Config('split_pct', '75', r''' + Config('split_pct', '90', r''' the Btree page split size as a percentage of the maximum Btree page size, that is, when a Btree page is split, it will be split into smaller pages, where each page is the specified percentage of the maximum Btree page size''', - min='25', max='100'), + min='50', max='100'), ] # File metadata, including both configurable and non-configurable (internal) diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index f2bffee06da..98c246fb897 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -780,9 +780,16 @@ __btree_page_sizes(WT_SESSION_IMPL *session) * Get the split percentage (reconciliation splits pages into smaller * than the maximum page size chunks so we don't split every time a * new entry is added). Determine how large newly split pages will be. + * Set to the minimum, if the read value is less than that. 
*/ WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval)); - btree->split_pct = (int)cval.val; + if (cval.val < WT_BTREE_MIN_SPLIT_PCT) { + btree->split_pct = WT_BTREE_MIN_SPLIT_PCT; + WT_RET(__wt_msg(session, + "Re-setting split_pct for %s to the minimum allowed of " + "%d%%.", session->dhandle->name, WT_BTREE_MIN_SPLIT_PCT)); + } else + btree->split_pct = (int)cval.val; intl_split_size = __wt_split_page_size(btree, btree->maxintlpage); leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage); diff --git a/src/config/config_def.c b/src/config/config_def.c index b11a8d63fdb..f152fbacad4 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -294,7 +294,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = { { "source", "string", NULL, NULL, NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "type", "string", NULL, NULL, NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, @@ -466,7 +466,7 @@ static const WT_CONFIG_CHECK confchk_file_config[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, @@ -530,7 +530,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 
}, @@ -614,7 +614,7 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, @@ -1119,7 +1119,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,source=,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=75,type=file,value_format=u", + "split_deepen_per_child=0,split_pct=90,type=file,value_format=u", confchk_WT_SESSION_create, 42 }, { "WT_SESSION.drop", @@ -1213,7 +1213,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=75,value_format=u", + "split_deepen_per_child=0,split_pct=90,value_format=u", confchk_file_config, 35 }, { "file.meta", @@ -1228,7 +1228,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0," "log=(enabled=true),memory_page_max=5MB,os_cache_dirty_max=0," "os_cache_max=0,prefix_compression=false,prefix_compression_min=4" - ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," + ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90," "value_format=u,version=(major=0,minor=0)", confchk_file_meta, 39 }, @@ -1253,7 +1253,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "merge_min=0),memory_page_max=5MB,old_chunks=," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,split_deepen_min_child=0," - 
"split_deepen_per_child=0,split_pct=75,value_format=u", + "split_deepen_per_child=0,split_pct=90,value_format=u", confchk_lsm_meta, 39 }, { "table.meta", diff --git a/src/include/btree.h b/src/include/btree.h index 88312f408cc..28fe1b94b23 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -57,6 +57,12 @@ /* Evict pages if we see this many consecutive deleted records. */ #define WT_BTREE_DELETE_THRESHOLD 1000 +/* + * Minimum size of the chunks (in percentage of the page size) a page gets split + * into during reconciliation. + */ +#define WT_BTREE_MIN_SPLIT_PCT 50 + /* * WT_BTREE -- * A btree handle. diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 558e93d3de0..707159ef6ae 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1242,8 +1242,8 @@ struct __wt_session { * @config{split_pct, the Btree page split size as a percentage of the * maximum Btree page size\, that is\, when a Btree page is split\, it * will be split into smaller pages\, where each page is the specified - * percentage of the maximum Btree page size., an integer between 25 and - * 100; default \c 75.} + * percentage of the maximum Btree page size., an integer between 50 and + * 100; default \c 90.} * @config{type, set the type of data source used to store a column * group\, index or simple table. By default\, a \c "file:" URI is * derived from the object name. 
The \c type configuration can be used diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 23f654caa70..e18d44f96ff 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -26,6 +26,11 @@ typedef struct { uint32_t flags; /* Caller's configuration */ WT_ITEM disk_image; /* Temporary disk-image buffer */ + /* + * Temporary buffer used to write out a disk image when managing two + * chunks worth of data in memory + */ + WT_ITEM *interim_buf; /* * Track start/stop write generation to decide if all changes to the @@ -127,6 +132,7 @@ typedef struct { * repeatedly split a packed page. */ uint32_t split_size; /* Split page size */ + uint32_t min_split_size; /* Minimum split page size */ /* * The problem with splits is we've done a lot of work by the time we @@ -151,16 +157,6 @@ typedef struct { */ size_t offset; /* Split's first byte */ - /* - * The recno and entries fields are the starting record number - * of the split chunk (for column-store splits), and the number - * of entries in the split chunk. These fields are used both - * to write the split chunk, and to create a new internal page - * to reference the split pages. - */ - uint64_t recno; /* Split's starting record */ - uint32_t entries; /* Split's entries */ - WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t checksum; /* Split's checksum */ @@ -182,39 +178,42 @@ typedef struct { size_t supd_allocated; /* + * While reconciling pages, at any given time, we maintain two + * split chunks in the memory to be written out as pages. As we + * get to the last two chunks, if the last one turns out to be + * smaller than the minimum split size, we go back into the + * penultimate chunk and split at this minimum split size + * boundary. This moves some data from the penultimate chunk to + * the last chunk, hence increasing the size of the last page + * written without decreasing the penultimate page size beyond + * the minimum split size. 
For this reason, we maintain both a + * maximum split percentage boundary and a minimum split + * percentage boundary. + * + * The recno and entries fields are the starting record number + * of the split chunk (for column-store splits), and the number + * of entries in the split chunk. These fields are used both to + * write the split chunk, and to create a new internal page to + * reference the split pages. + * * The key for a row-store page; no column-store key is needed * because the page's recno, stored in the recno field, is the * column-store key. */ - WT_ITEM key; /* Promoted row-store key */ + uint32_t max_bnd_entries; + uint64_t max_bnd_recno; + WT_ITEM max_bnd_key; + + size_t min_bnd_offset; + uint32_t min_bnd_entries; + uint64_t min_bnd_recno; + WT_ITEM min_bnd_key; } *bnd; /* Saved boundaries */ uint32_t bnd_next; /* Next boundary slot */ uint32_t bnd_next_max; /* Maximum boundary slots used */ size_t bnd_entries; /* Total boundary slots */ size_t bnd_allocated; /* Bytes allocated */ - /* - * We track the total number of page entries copied into split chunks - * so we can easily figure out how many entries in the current split - * chunk. - */ - uint32_t total_entries; /* Total entries in splits */ - - /* - * And there's state information as to where in this process we are: - * (1) tracking split boundaries because we can still fit more split - * chunks into the maximum page size, (2) tracking the maximum page - * size boundary because we can't fit any more split chunks into the - * maximum page size, (3) not performing boundary checks because it's - * either not useful with the current page size configuration, or - * because we've already been forced to split. 
- */ - enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */ - SPLIT_MAX=1, /* Next: the maximum page boundary */ - SPLIT_TRACKING_OFF=2, /* No boundary checks */ - SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */ - bnd_state; - /* * We track current information about the current record number, the * number of entries copied into the temporary buffer, where we are @@ -293,6 +292,14 @@ typedef struct { uint32_t tested_ref_state; /* Debugging information */ } WT_RECONCILE; +#define WT_CROSSING_MIN_BND(r, next_len) \ + ((r)->bnd[(r)->bnd_next].min_bnd_offset == 0 && \ + ((r)->space_avail - (next_len)) < \ + ((r)->split_size - (r)->min_split_size)) +#define WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail) +#define WT_CHECK_CROSSING_BND(r, next_len) \ + (WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len)) + static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, bool); static void __rec_cell_build_addr(WT_SESSION_IMPL *, WT_RECONCILE *, const void *, size_t, u_int, uint64_t); @@ -314,6 +321,7 @@ static int __rec_col_var(WT_SESSION_IMPL *, static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *, WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t); static int __rec_destroy_session(WT_SESSION_IMPL *); +static uint32_t __rec_min_split_page_size(WT_BTREE *, uint32_t); static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_row_leaf(WT_SESSION_IMPL *, @@ -323,7 +331,6 @@ static int __rec_row_leaf_insert( static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *); -static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_row_promote( WT_SESSION_IMPL 
*, WT_RECONCILE *, WT_ITEM *, uint8_t); @@ -968,6 +975,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) *(WT_RECONCILE **)reconcilep = NULL; __wt_buf_free(session, &r->disk_image); + __wt_scr_free(session, &r->interim_buf); __wt_free(session, r->raw_entries); __wt_free(session, r->raw_offsets); @@ -1032,7 +1040,8 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy) __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->disk_image); __wt_free(session, bnd->supd); - __wt_buf_free(session, &bnd->key); + __wt_buf_free(session, &bnd->max_bnd_key); + __wt_buf_free(session, &bnd->min_bnd_key); } __wt_free(session, r->bnd); r->bnd_next = 0; @@ -1927,8 +1936,8 @@ static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { bnd->offset = 0; - bnd->recno = WT_RECNO_OOB; - bnd->entries = 0; + bnd->max_bnd_recno = WT_RECNO_OOB; + bnd->max_bnd_entries = 0; __wt_free(session, bnd->addr.addr); WT_CLEAR(bnd->addr); @@ -1943,6 +1952,10 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) bnd->already_compressed = false; + bnd->min_bnd_offset = 0; + bnd->min_bnd_entries = 0; + bnd->min_bnd_recno = WT_RECNO_OOB; + /* * Don't touch the key, we re-use that memory in each new * reconciliation. @@ -1974,39 +1987,63 @@ __rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * __wt_split_page_size -- - * Split page size calculation: we don't want to repeatedly split every - * time a new entry is added, so we split to a smaller-than-maximum page size. + * __rec_split_page_size_from_pct -- + * Given a split percentage, calculate split page size in bytes. 
*/ -uint32_t -__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) -{ +static uint32_t +__rec_split_page_size_from_pct( + int split_pct, uint32_t maxpagesize, uint32_t allocsize) { uintmax_t a; uint32_t split_size; /* * Ideally, the split page size is some percentage of the maximum page - * size rounded to an allocation unit (round to an allocation unit so - * we don't waste space when we write). + * size rounded to an allocation unit (round to an allocation unit so we + * don't waste space when we write). */ a = maxpagesize; /* Don't overflow. */ split_size = (uint32_t)WT_ALIGN_NEAREST( - (a * (u_int)btree->split_pct) / 100, btree->allocsize); + (a * (u_int)split_pct) / 100, allocsize); /* - * Respect the configured split percentage if the calculated split - * size is either zero or a full page. The user has either configured - * an allocation size that matches the page size, or a split - * percentage that is close to zero or one hundred. Rounding is going - * to provide a worse outcome than having a split point that doesn't - * fall on an allocation size boundary in those cases. + * Respect the configured split percentage if the calculated split size + * is either zero or a full page. The user has either configured an + * allocation size that matches the page size, or a split percentage + * that is close to zero or one hundred. Rounding is going to provide a + * worse outcome than having a split point that doesn't fall on an + * allocation size boundary in those cases. */ if (split_size == 0 || split_size == maxpagesize) - split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100); + split_size = (uint32_t)((a * (u_int)split_pct) / 100); return (split_size); } +/* + * __wt_split_page_size -- + * Split page size calculation: we don't want to repeatedly split every + * time a new entry is added, so we split to a smaller-than-maximum page size. 
+ */ +uint32_t +__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ + return (__rec_split_page_size_from_pct( + btree->split_pct, maxpagesize, btree->allocsize)); +} + +/* + * __rec_min_split_page_size -- + * Minimum split size boundary calculation: To track a boundary at the + * minimum split size that we could have split at instead of splitting at + * the split page size. + */ +static uint32_t +__rec_min_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ + return (__rec_split_page_size_from_pct( + WT_BTREE_MIN_SPLIT_PCT, maxpagesize, btree->allocsize)); +} + /* * __rec_split_init -- * Initialization for the reconciliation split functions. @@ -2018,7 +2055,7 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_BM *bm; WT_BTREE *btree; WT_PAGE_HEADER *dsk; - size_t corrected_page_size; + size_t corrected_page_size, disk_img_buf_size; btree = S2BT(session); bm = btree->bm; @@ -2053,33 +2090,6 @@ __rec_split_init(WT_SESSION_IMPL *session, r->max_raw_page_size = r->page_size = (uint32_t)WT_MIN(r->page_size * 10, WT_MAX(r->page_size, btree->maxmempage / 2)); - - /* - * Ensure the disk image buffer is large enough for the max object, as - * corrected by the underlying block manager. - */ - corrected_page_size = r->page_size; - WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size)); - - /* - * Clear the disk page header to ensure all of it is initialized, even - * the unused fields. - * - * In the case of fixed-length column-store, clear the entire buffer: - * fixed-length column-store sets bits in bytes, where the bytes are - * assumed to initially be 0. - */ - memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? - corrected_page_size : WT_PAGE_HEADER_SIZE); - - /* - * Set the page type (the type doesn't change, and setting it later - * would require additional code in a few different places). 
- */ - dsk = r->disk_image.mem; - dsk->type = page->type; - /* * If we have to split, we want to choose a smaller page size for the * split pages, because otherwise we could end up splitting one large @@ -2099,22 +2109,28 @@ __rec_split_init(WT_SESSION_IMPL *session, * creating overflow items and compacted data, for example, as those * items have already been written to disk). So, the loop calls the * helper functions when approaching a split boundary, and we save the - * information at that point. That allows us to go back and split the - * page at the boundary points if we eventually overflow the maximum - * page size. + * information at that point. We also save the boundary information at + * the minimum split size. We maintain two chunks (each boundary + * represents a chunk that gets written as a page) in the memory, + * writing out the older one to the disk as a page when we need to make + * space for a new chunk. On reaching the last chunk, if it turns out to + * be smaller than the minimum split size, we go back into the + * penultimate chunk and split at this minimum split size boundary. This + * moves some data from the penultimate chunk to the last chunk, hence + * increasing the size of the last page written without decreasing the + * penultimate page size beyond the minimum split size. * * Finally, all this doesn't matter for fixed-size column-store pages, * raw compression, and salvage. Fixed-size column store pages can * split under (very) rare circumstances, but they're allocated at a * fixed page size, never anything smaller. In raw compression, the - * underlying compression routine decides when we split, so it's not - * our problem. In salvage, as noted above, we can't split at all. + * underlying compression routine decides when we split, so it's not our + * problem. In salvage, as noted above, we can't split at all. 
*/ if (r->raw_compression || r->salvage != NULL) { r->split_size = 0; r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - } - else if (page->type == WT_PAGE_COL_FIX) { + } else if (page->type == WT_PAGE_COL_FIX) { r->split_size = r->page_size; r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); @@ -2122,32 +2138,53 @@ __rec_split_init(WT_SESSION_IMPL *session, r->split_size = __wt_split_page_size(btree, r->page_size); r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + r->min_split_size = + __rec_min_split_page_size(btree, r->page_size); } + + /* + * Ensure the disk image buffer is large enough for the max object, as + * corrected by the underlying block manager. + * + * The buffer that we build disk image in, needs to hold two chunks + * worth of data. Since we want to support split_size more than the page + * size (to allow for adjustments based on the compression), this buffer + * should be greater of twice of split_size and page_size. + */ + corrected_page_size = r->page_size; + disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size); + WT_RET(bm->write_size(bm, session, &corrected_page_size)); + WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size)); + + /* + * Clear the disk page header to ensure all of it is initialized, even + * the unused fields. + * + * In the case of fixed-length column-store, clear the entire buffer: + * fixed-length column-store sets bits in bytes, where the bytes are + * assumed to initially be 0. + */ + memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? + disk_img_buf_size : WT_PAGE_HEADER_SIZE); + + /* + * Set the page type (the type doesn't change, and setting it later + * would require additional code in a few different places). + */ + dsk = r->disk_image.mem; + dsk->type = page->type; + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); /* Initialize the first boundary. 
*/ r->bnd_next = 0; WT_RET(__rec_split_bnd_grow(session, r)); __rec_split_bnd_init(session, &r->bnd[0]); - r->bnd[0].recno = recno; + r->bnd[0].max_bnd_recno = recno; r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree); - /* - * If the maximum page size is the same as the split page size, either - * because of the object type or application configuration, there isn't - * any need to maintain split boundaries within a larger page. - * - * No configuration for salvage here, because salvage can't split. - */ - if (r->raw_compression) - r->bnd_state = SPLIT_TRACKING_RAW; - else if (max == r->split_size) - r->bnd_state = SPLIT_TRACKING_OFF; - else - r->bnd_state = SPLIT_BOUNDARY; - - /* Initialize the entry counters. */ - r->entries = r->total_entries = 0; + /* Initialize the entry counter. */ + r->entries = 0; /* Initialize the starting record number. */ r->recno = recno; @@ -2350,19 +2387,112 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) { WT_BM *bm; WT_BTREE *btree; - size_t corrected_page_size, len; + size_t corrected_page_size, inuse, len; btree = S2BT(session); bm = btree->bm; len = WT_PTRDIFF(r->first_free, r->disk_image.mem); - corrected_page_size = len + add_len; + inuse = (len - r->bnd[r->bnd_next].offset) + + WT_PAGE_HEADER_BYTE_SIZE(btree); + corrected_page_size = inuse + add_len; + WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_grow(session, &r->disk_image, corrected_page_size)); + /* Need to account for buffer carrying two chunks worth of data */ + WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size)); + r->first_free = (uint8_t *)r->disk_image.mem + len; - WT_ASSERT(session, corrected_page_size >= len); - r->space_avail = corrected_page_size - len; + WT_ASSERT(session, corrected_page_size >= inuse); + r->space_avail = corrected_page_size - inuse; WT_ASSERT(session, r->space_avail >= add_len); + + return (0); +} + +/* + * __rec_split_write_prev_and_shift_cur -- + * Write the 
previous split chunk to the disk as a page. Shift the contents + * of the current chunk to the start of the buffer, making space for a new + * chunk to be written. + * If the caller asks for a chunk resizing, the boundary between the two + * chunks is readjusted to the minimum split size boundary details stored + * in the previous chunk, letting the current chunk grow at the cost of the + * previous chunk. + */ +static int +__rec_split_write_prev_and_shift_cur( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks) +{ + WT_BM *bm; + WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk, *dsk_tmp; + size_t cur_len, len; + uint8_t *dsk_start; + + WT_ASSERT(session, r->bnd_next != 0); + + btree = S2BT(session); + bm = btree->bm; + bnd_cur = &r->bnd[r->bnd_next]; + bnd_prev = bnd_cur - 1; + dsk = r->disk_image.mem; + cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + + /* + * Resize chunks if the current is smaller than the minimum, and there + * are details on the minimum split size boundary available in the + * previous boundary details. + * + * There is a possibility that we do not have a minimum boundary set, in + * such a case we skip chunk resizing. Such a condition is possible for + * instance when we are building the image in the buffer and the first + * K/V pair is large enough that it surpasses both the minimum split + * size and the split size the application has set. In such a case we + * split the chunk without saving any minimum boundary. 
+ */ + if (resize_chunks && + cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) { + bnd_cur->offset = bnd_prev->min_bnd_offset; + bnd_cur->max_bnd_entries += + bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries; + bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; + bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; + + WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key, + bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size)); + + /* Update current chunk's length */ + cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + } + + /* + * Create an interim buffer if not already done to prepare the previous + * chunk's disk image. + */ + len = bnd_cur->offset; + WT_RET(bm->write_size(bm, session, &len)); + if (r->interim_buf == NULL) + WT_RET(__wt_scr_alloc(session, len, &r->interim_buf)); + else + WT_RET(__wt_buf_init(session, r->interim_buf, len)); + + dsk_tmp = r->interim_buf->mem; + memcpy(dsk_tmp, dsk, bnd_cur->offset); + dsk_tmp->recno = bnd_prev->max_bnd_recno; + dsk_tmp->u.entries = bnd_prev->max_bnd_entries; + dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset); + r->interim_buf->size = dsk_tmp->mem_size; + WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false)); + + /* Shift the current chunk to the start of the buffer */ + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len); + + /* Fix boundary offset */ + bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree); + /* Fix where free points */ + r->first_free = dsk_start + cur_len; return (0); } @@ -2382,6 +2512,9 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) btree = S2BT(session); dsk = r->disk_image.mem; + /* Fixed length col store can call with next_len 0 */ + WT_ASSERT(session, next_len == 0 || r->space_avail < next_len); + /* * We should never split during salvage, and we're about to drop core * because there's no parent page. 
@@ -2391,147 +2524,58 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); - /* Hitting a page boundary resets the dictionary, in all cases. */ - __rec_dictionary_reset(r); - - inuse = WT_PTRDIFF(r->first_free, dsk); - switch (r->bnd_state) { - case SPLIT_BOUNDARY: - /* - * We can get here if the first key/value pair won't fit. - * Additionally, grow the buffer to contain the current item if - * we haven't already consumed a reasonable portion of a split - * chunk. - */ - if (inuse < r->split_size / 2) - break; - - /* - * About to cross a split boundary but not yet forced to split - * into multiple pages. If we have to split, this is one of the - * split points, save information about where we are when the - * split would have happened. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - last = &r->bnd[r->bnd_next++]; - next = last + 1; - - /* Set the number of entries for the just finished chunk. */ - last->entries = r->entries - r->total_entries; - r->total_entries = r->entries; - - /* Set the key for the next chunk. */ - next->recno = r->recno; - if (dsk->type == WT_PAGE_ROW_INT || - dsk->type == WT_PAGE_ROW_LEAF) - WT_RET(__rec_split_row_promote( - session, r, &next->key, dsk->type)); - - /* - * Set the starting buffer offset and clear the entries (the - * latter not required, but cleaner). - */ - next->offset = WT_PTRDIFF(r->first_free, dsk); - next->entries = 0; - - /* Set the space available to another split-size chunk. */ - r->space_avail = - r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - - /* - * Adjust the space available to handle two cases: - * - We don't have enough room for another full split-size - * chunk on the page. - * - We chose to fill past a page boundary because of a - * large item. - */ - if (inuse + r->space_avail > r->page_size) { - r->space_avail = - r->page_size > inuse ? 
(r->page_size - inuse) : 0; - - /* There are no further boundary points. */ - r->bnd_state = SPLIT_MAX; - } - - /* - * Return if the next object fits into this page, else we have - * to split the page. - */ - if (r->space_avail >= next_len) - return (0); - - /* FALLTHROUGH */ - case SPLIT_MAX: - /* - * We're going to have to split and create multiple pages. - * - * Cycle through the saved split-point information, writing the - * split chunks we have tracked. The underlying fixup function - * sets the space available and other information, and copied - * any unwritten chunk of data to the beginning of the buffer. - */ - WT_RET(__rec_split_fixup(session, r)); - - /* We're done saving split chunks. */ - r->bnd_state = SPLIT_TRACKING_OFF; - break; - case SPLIT_TRACKING_OFF: - /* - * We can get here if the first key/value pair won't fit. - * Additionally, grow the buffer to contain the current item if - * we haven't already consumed a reasonable portion of a split - * chunk. - */ - if (inuse < r->split_size / 2) - break; + last = &r->bnd[r->bnd_next]; + inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) + + WT_PAGE_HEADER_BYTE_SIZE(btree); - /* - * The key/value pairs didn't fit into a single page, but either - * we've already noticed that and are now processing the rest of - * the pairs at split size boundaries, or the split size was the - * same as the page size, and we never bothered with split point - * information at all. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - last = &r->bnd[r->bnd_next++]; - next = last + 1; + /* + * We can get here if the first key/value pair won't fit. + * Additionally, grow the buffer to contain the current item if we + * haven't already consumed a reasonable portion of a split chunk. + */ + if (inuse < r->split_size / 2) + goto done; - /* - * Set the key for the next chunk (before writing the block, a - * key range is needed in that code). 
- */ - next->recno = r->recno; - if (dsk->type == WT_PAGE_ROW_INT || - dsk->type == WT_PAGE_ROW_LEAF) - WT_RET(__rec_split_row_promote( - session, r, &next->key, dsk->type)); + /* Hitting a page boundary resets the dictionary, in all cases. */ + __rec_dictionary_reset(r); - /* Clear the entries (not required, but cleaner). */ - next->entries = 0; + /* Set the number of entries for the just finished chunk. */ + last->max_bnd_entries = r->entries; - /* Finalize the header information and write the page. */ - dsk->recno = last->recno; - dsk->u.entries = r->entries; - dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); + /* + * In case of bulk load, write out chunks as we get them. + * In other cases, we keep two chunks in memory at a given time. So, if + * there is a previous chunk, write it out, making space in the buffer + * for the next chunk to be written. + */ + if (r->is_bulk_load) { + dsk->recno = last->max_bnd_recno; + dsk->u.entries = last->max_bnd_entries; + dsk->mem_size = (uint32_t)inuse; r->disk_image.size = dsk->mem_size; - WT_RET( - __rec_split_write(session, r, last, &r->disk_image, false)); - - /* - * Set the caller's entry count and buffer information for the - * next chunk. We only get here if we're not splitting or have - * already split, so it's split-size chunks from here on out. - */ - r->entries = 0; + WT_RET(__rec_split_write( + session, r, last, &r->disk_image, false)); + /* Fix where free points */ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); - r->space_avail = - r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - break; - case SPLIT_TRACKING_RAW: - return (__wt_illegal_value(session, NULL)); - } + } else if (r->bnd_next != 0) + WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false)); - /* + /* Prepare the next boundary */ + WT_RET(__rec_split_bnd_grow(session, r)); + r->bnd_next++; + next = &r->bnd[r->bnd_next]; + next->offset = WT_PTRDIFF(r->first_free, dsk); + /* Set the key for the next chunk. 
*/ + next->max_bnd_recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &next->max_bnd_key, dsk->type)); + + r->entries = 0; + /* Set the space available to another split-size chunk. */ + r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + +done: /* * Overflow values can be larger than the maximum page size but still be * "on-page". If the next key/value pair is larger than space available * after a split has happened (in other words, larger than the maximum @@ -2548,6 +2592,66 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) return (0); } +/* + * __rec_split_crossing_bnd -- + * Save the details for the minimum split size boundary or call for a + * split. + */ +static inline int +__rec_split_crossing_bnd( + WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) +{ + WT_BOUNDARY *bnd; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk; + size_t min_bnd_offset; + + WT_ASSERT(session, WT_CHECK_CROSSING_BND(r, next_len)); + + /* + * If crossing the minimum split size boundary, store the boundary + * details at the current location in the buffer. If we are crossing the + * split boundary at the same time, possible when the next record is + * large enough, just split at this point. + */ + if (WT_CROSSING_MIN_BND(r, next_len) && + !WT_CROSSING_SPLIT_BND(r, next_len)) { + btree = S2BT(session); + bnd = &r->bnd[r->bnd_next]; + dsk = r->disk_image.mem; + min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) - + bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree); + if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree)) + /* + * This is possible if the first record doesn't fit in + * the minimum split size, we write this record without + * setting up any boundary here. We will get the + * opportunity to setup a boundary before writing out + * the next record. 
+ */ + return (0); + + WT_ASSERT(session, bnd->min_bnd_offset == 0); + + /* + * Hitting a page boundary resets the dictionary, in all cases. + */ + __rec_dictionary_reset(r); + + bnd->min_bnd_offset = min_bnd_offset; + bnd->min_bnd_entries = r->entries; + bnd->min_bnd_recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || + dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &bnd->min_bnd_key, dsk->type)); + return (0); + } + + /* We are crossing a split boundary */ + return (__rec_split(session, r, next_len)); +} + /* * __rec_split_raw_worker -- * Handle the raw compression page reconciliation bookkeeping. @@ -2626,7 +2730,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, */ recno = WT_RECNO_OOB; if (dsk->type == WT_PAGE_COL_VAR) - recno = last->recno; + recno = last->max_bnd_recno; entry = max_image_slot = slots = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { @@ -2853,7 +2957,7 @@ no_slots: */ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP; dsk_dst = dst->mem; - dsk_dst->recno = last->recno; + dsk_dst->recno = last->max_bnd_recno; dsk_dst->mem_size = r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP; dsk_dst->u.entries = r->raw_entries[result_slots - 1]; @@ -2873,7 +2977,7 @@ no_slots: WT_RET(__wt_strndup(session, dsk, dsk_dst->mem_size, &last->disk_image)); disk_image = last->disk_image; - disk_image->recno = last->recno; + disk_image->recno = last->max_bnd_recno; disk_image->mem_size = dsk_dst->mem_size; disk_image->u.entries = dsk_dst->u.entries; } @@ -2903,14 +3007,14 @@ no_slots: */ switch (dsk->type) { case WT_PAGE_COL_INT: - next->recno = r->raw_recnos[result_slots]; + next->max_bnd_recno = r->raw_recnos[result_slots]; break; case WT_PAGE_COL_VAR: - next->recno = r->raw_recnos[result_slots - 1]; + next->max_bnd_recno = r->raw_recnos[result_slots - 1]; break; case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - next->recno = WT_RECNO_OOB; + next->max_bnd_recno = WT_RECNO_OOB; if (!last_block) { /* * Confirm there was 
uncompressed data remaining @@ -2919,7 +3023,7 @@ no_slots: */ WT_ASSERT(session, len > 0); WT_RET(__rec_split_row_promote_cell( - session, dsk, &next->key)); + session, dsk, &next->max_bnd_key)); } break; } @@ -2931,7 +3035,7 @@ no_slots: */ WT_STAT_DATA_INCR(session, compress_raw_fail); - dsk->recno = last->recno; + dsk->recno = last->max_bnd_recno; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); dsk->u.entries = r->entries; r->disk_image.size = dsk->mem_size; @@ -3008,35 +3112,9 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) static int __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BOUNDARY *bnd; + WT_BOUNDARY *bnd_cur, *bnd_prev; WT_PAGE_HEADER *dsk; - - /* Adjust the boundary information based on our split status. */ - switch (r->bnd_state) { - case SPLIT_BOUNDARY: - case SPLIT_MAX: - /* - * We never split, the reconciled page fit into a maximum page - * size. Change the first boundary slot to represent the full - * page (the first boundary slot is largely correct, just update - * the number of entries). - */ - r->bnd_next = 0; - break; - case SPLIT_TRACKING_OFF: - /* - * If we have already split, or aren't tracking boundaries, put - * the remaining data in the next boundary slot. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - break; - case SPLIT_TRACKING_RAW: - /* - * We were configured for raw compression, and either we never - * wrote anything, or there's a remaindered block of data. - */ - break; - } + bool grow_bnd; /* * We may arrive here with no entries to write if the page was entirely @@ -3063,20 +3141,66 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); } - /* Set the boundary reference and increment the count. */ - bnd = &r->bnd[r->bnd_next++]; - bnd->entries = r->entries; - - /* Finalize the header information. 
*/ dsk = r->disk_image.mem; - dsk->recno = bnd->recno; - dsk->u.entries = r->entries; + + /* Set the number of entries for the just finished chunk. */ + bnd_cur = &r->bnd[r->bnd_next]; + bnd_cur->max_bnd_entries = r->entries; + + grow_bnd = true; + /* + * We can reach here even with raw_compression when the last split chunk + * is too small to be sent for raw compression. + */ + if (!r->is_bulk_load && !r->raw_compression) { + if (WT_PTRDIFF(r->first_free, dsk) > r->page_size && + r->bnd_next != 0) { + /* + * We hold two boundaries worth of data in the buffer, + * and this data doesn't fit in a single page. If the + * last chunk is too small, readjust the boundary to a + * pre-computed minimum. + * Write out the penultimate chunk to the disk as a page + */ + WT_RET(__rec_split_write_prev_and_shift_cur( + session, r, true)); + } else + if (r->bnd_next != 0) { + /* + * We have two boundaries, but the data in the + * buffer can fit a single page. Merge the + * boundaries to create a single chunk. + */ + bnd_prev = bnd_cur - 1; + bnd_prev->max_bnd_entries += + bnd_cur->max_bnd_entries; + r->bnd_next--; + grow_bnd = false; + } + } + + /* + * We already have space for an extra boundary if we merged two + * boundaries above, in that case we do not need to grow the boundary + * structure. + */ + if (grow_bnd) + WT_RET(__rec_split_bnd_grow(session, r)); + bnd_cur = &r->bnd[r->bnd_next]; + r->bnd_next++; + + /* + * Current boundary now has all the remaining data/last page now. + * Let's write it to the disk + */ + dsk->recno = bnd_cur->max_bnd_recno; + dsk->u.entries = bnd_cur->max_bnd_entries; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); r->disk_image.size = dsk->mem_size; /* If this is a checkpoint, we're done, otherwise write the page. */ - return (__rec_is_checkpoint(session, r, bnd) ? - 0 : __rec_split_write(session, r, bnd, &r->disk_image, true)); + return (__rec_is_checkpoint(session, r, bnd_cur) ? 
+ 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true)); } /* @@ -3109,98 +3233,6 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (__rec_split_finish_std(session, r)); } -/* - * __rec_split_fixup -- - * Fix up after crossing the maximum page boundary. - */ -static int -__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) -{ - WT_BOUNDARY *bnd; - WT_BTREE *btree; - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_PAGE_HEADER *dsk; - size_t i, len; - uint8_t *dsk_start, *p; - - /* - * When we overflow physical limits of the page, we walk the list of - * split chunks we've created and write those pages out, then update - * the caller's information. - */ - btree = S2BT(session); - - /* - * The data isn't laid out on a page boundary or nul padded; copy it to - * a clean, aligned, padded buffer before writing it. - * - * Allocate a scratch buffer to hold the new disk image. Copy the disk - * page's header and block-manager space into the scratch buffer, most - * of the header information remains unchanged between the pages. - */ - WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp)); - dsk = tmp->mem; - memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_BYTE_SIZE(btree)); - - /* - * For each split chunk we've created, update the disk image and copy - * it into place. - */ - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) { - /* Copy the page contents to the temporary buffer. */ - len = (bnd + 1)->offset - bnd->offset; - memcpy(dsk_start, - (uint8_t *)r->disk_image.mem + bnd->offset, len); - - /* Finalize the header information and write the page. 
*/ - dsk->recno = bnd->recno; - dsk->u.entries = bnd->entries; - tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len; - dsk->mem_size = WT_STORE_SIZE(tmp->size); - WT_ERR(__rec_split_write(session, r, bnd, tmp, false)); - } - - /* - * There is probably a remnant in the working buffer that didn't get - * written, copy it down to the beginning of the working buffer. - * - * Confirm the remnant is no larger than a split-sized chunk, including - * header. We know that's the maximum sized remnant because we only have - * remnants if split switches from accumulating to a split boundary to - * accumulating to the end of the page (the other path here is when we - * hit a split boundary, there was room for another split chunk in the - * page, and the next item still wouldn't fit, in which case there is no - * remnant). So: we were accumulating to the end of the page and created - * a remnant. We know the remnant cannot be as large as a split-sized - * chunk, including header, because if there was room for that large a - * remnant, we wouldn't have switched from accumulating to a page end. - */ - p = (uint8_t *)r->disk_image.mem + bnd->offset; - len = WT_PTRDIFF(r->first_free, p); - if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - WT_PANIC_ERR(session, EINVAL, - "Reconciliation remnant too large for the split buffer"); - dsk = r->disk_image.mem; - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, p, len); - - /* - * Fix up our caller's information, including updating the starting - * record number. - */ - r->entries -= r->total_entries; - r->first_free = dsk_start + len; - WT_ASSERT(session, - r->page_size >= (WT_PAGE_HEADER_BYTE_SIZE(btree) + len)); - r->space_avail = - r->split_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); - -err: __wt_scr_free(session, &tmp); - return (ret); -} - /* * __rec_split_write -- * Write a disk block out for the split helper functions. 
@@ -3238,8 +3270,6 @@ __rec_split_write(WT_SESSION_IMPL *session, F_SET(dsk, WT_PAGE_EMPTY_V_NONE); } - bnd->entries = r->entries; - /* Initialize the address (set the page type for the parent). */ switch (dsk->type) { case WT_PAGE_COL_FIX: @@ -3285,7 +3315,8 @@ __rec_split_write(WT_SESSION_IMPL *session, switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: - if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno) + if (WT_INSERT_RECNO(supd->ins) >= + (bnd + 1)->max_bnd_recno) goto supd_check_complete; break; case WT_PAGE_ROW_LEAF: @@ -3296,8 +3327,8 @@ __rec_split_write(WT_SESSION_IMPL *session, key->data = WT_INSERT_KEY(supd->ins); key->size = WT_INSERT_KEY_SIZE(supd->ins); } - WT_ERR(__wt_compare(session, - btree->collator, key, &(bnd + 1)->key, &cmp)); + WT_ERR(__wt_compare(session, btree->collator, + key, &(bnd + 1)->max_bnd_key, &cmp)); if (cmp >= 0) goto supd_check_complete; break; @@ -3387,14 +3418,14 @@ supd_check_complete: #ifdef HAVE_VERBOSE /* Output a verbose message if we create a page without many entries */ - if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && r->entries < 6) + if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && + bnd->max_bnd_entries < 6) __wt_verbose(session, WT_VERB_SPLIT, "Reconciliation creating a page with %" PRIu32 " entries, memory footprint %" WT_SIZET_FMT - ", page count %" PRIu32 ", %s, split state: %d", - r->entries, r->page->memory_footprint, r->bnd_next, - F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint", - r->bnd_state); + ", page count %" PRIu32 ", %s", bnd->max_bnd_entries, + r->page->memory_footprint, r->bnd_next, + F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint"); #endif WT_ERR(__wt_bt_write(session, buf, addr, &addr_size, @@ -3680,11 +3711,12 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) cursor->value.data, cursor->value.size, (uint64_t)0)); /* Boundary: split or write the page. 
*/ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) - WT_RET( - __rec_split_raw(session, r, key->len + val->len)); - else { + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) + WT_RET(__rec_split_raw( + session, r, key->len + val->len)); + } else + if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) { /* * Turn off prefix compression until a full key written * to the new page, and (unless already working with an @@ -3696,10 +3728,9 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_cell_build_leaf_key( session, r, NULL, 0, &ovfl_key)); } - - WT_RET(__rec_split(session, r, key->len + val->len)); + WT_RET(__rec_split_crossing_bnd( + session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -3740,6 +3771,10 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) * split. * * Boundary: split or write the page. + * + * No need to have a minimum split size boundary, all + * pages are filled 100% except the last, allowing it to + * grow in the future. */ __rec_incr(session, r, cbulk->entry, __bitstr_size( @@ -3844,10 +3879,12 @@ __wt_bulk_insert_var( r, cbulk->last.data, cbulk->last.size, cbulk->rle)); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CROSSING_SPLIT_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd(session, r, val->len)); /* Copy the value onto the page. */ if (btree->dictionary) @@ -3983,10 +4020,13 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_CHILD_RELEASE_ERR(session, hazard, ref); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_ERR(r->raw_compression ? 
- __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_ERR(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_ERR(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4028,10 +4068,13 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), r->recno); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4139,6 +4182,10 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) * split. * * Boundary: split or write the page. + * + * No need to have a minimum split size boundary, all + * pages are filled 100% except the last, allowing it to + * grow in the future. */ __rec_incr(session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); @@ -4295,10 +4342,13 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, session, r, value->data, value->size, rle)); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. 
*/ if (!deleted && !overflow_type && btree->dictionary) @@ -4961,11 +5011,12 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->cell_zero = false; /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* * In one path above, we copied address blocks * from the page rather than building the actual @@ -4977,10 +5028,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_IKEY_DATA(ikey), ikey->size)); key_onpage_ovfl = false; } - WT_ERR(__rec_split( + + WT_ERR(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5030,10 +5081,14 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, key->len + val->len) : - __rec_split(session, r, key->len + val->len)); + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) + WT_RET(__rec_split_raw( + session, r, key->len + val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, key->len + val->len)); /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5362,16 +5417,17 @@ build: } /* Boundary: split or write the page. 
*/ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* - * In one path above, we copied address blocks - * from the page rather than building the actual - * key. In that case, we have to build the key - * now because we are about to promote it. + * If we copied address blocks from the page + * rather than building the actual key, we have + * to build the key now because we are about to + * promote it. */ if (key_onpage_ovfl) { WT_ERR(__wt_dsk_cell_data_ref(session, @@ -5390,14 +5446,13 @@ build: if (!ovfl_key) WT_ERR( __rec_cell_build_leaf_key( - session, - r, NULL, 0, &ovfl_key)); + session, r, NULL, 0, + &ovfl_key)); } - WT_ERR(__rec_split( + WT_ERR(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -5460,11 +5515,12 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_RET(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* * Turn off prefix compression until a full key * written to the new page, and (unless already @@ -5476,14 +5532,13 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) if (!ovfl_key) WT_RET( __rec_cell_build_leaf_key( - session, - r, NULL, 0, &ovfl_key)); + session, r, NULL, 0, + &ovfl_key)); } - WT_RET(__rec_split( + WT_RET(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. 
*/ __rec_copy_incr(session, r, key); @@ -5595,13 +5650,14 @@ __rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r) __wt_verbose(session, WT_VERB_SPLIT, "starting key %s", __wt_buf_set_printable( - session, bnd->key.data, bnd->key.size, tkey)); + session, bnd->max_bnd_key.data, + bnd->max_bnd_key.size, tkey)); break; case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: __wt_verbose(session, WT_VERB_SPLIT, - "starting recno %" PRIu64, bnd->recno); + "starting recno %" PRIu64, bnd->max_bnd_recno); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -5863,10 +5919,10 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* We never set the first page's key, grab it from the original page. */ ref = r->ref; if (__wt_ref_is_root(ref)) - WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1)); + WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, "", 1)); else { __wt_ref_key(ref->home, ref, &p, &size); - WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size)); + WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, p, size)); } /* Allocate, then initialize the array of replacement blocks. */ @@ -5874,8 +5930,8 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - WT_RET(__wt_row_ikey_alloc(session, 0, - bnd->key.data, bnd->key.size, &multi->key.ikey)); + WT_RET(__wt_row_ikey_alloc(session, 0, bnd->max_bnd_key.data, + bnd->max_bnd_key.size, &multi->key.ikey)); /* * Copy any disk image. Don't take saved updates without a @@ -5922,7 +5978,7 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - multi->key.recno = bnd->recno; + multi->key.recno = bnd->max_bnd_recno; /* * Copy any disk image. 
Don't take saved updates without a diff --git a/test/format/config.h b/test/format/config.h index e3e1e73a786..b5feb7a5321 100644 --- a/test/format/config.h +++ b/test/format/config.h @@ -284,7 +284,7 @@ static CONFIG c[] = { { "split_pct", "page split size as a percentage of the maximum page size", - 0x0, 40, 85, 85, &g.c_split_pct, NULL }, + 0x0, 50, 100, 100, &g.c_split_pct, NULL }, { "statistics", "maintain statistics", /* 20% */ -- cgit v1.2.1 From ebff498af45a3e64fab05cd6360b117c010634b9 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Wed, 29 Mar 2017 15:55:04 +1100 Subject: Revert "WT-2439 Improve page layout: keep pages more than half full (#3277)" This reverts commit 1c41c7735b3529521b7bd34180f80584caee7f59. --- dist/api_data.py | 4 +- src/btree/bt_handle.c | 9 +- src/config/config_def.c | 16 +- src/include/btree.h | 6 - src/include/wiredtiger.in | 4 +- src/reconcile/rec_write.c | 954 ++++++++++++++++++++++------------------------ test/format/config.h | 2 +- 7 files changed, 463 insertions(+), 532 deletions(-) diff --git a/dist/api_data.py b/dist/api_data.py index 22600dd5e29..1d669fa7fe0 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -295,12 +295,12 @@ file_config = format_meta + file_runtime_config + [ Config('split_deepen_per_child', '0', r''' entries allocated per child when deepening the tree''', type='int', undoc=True), - Config('split_pct', '90', r''' + Config('split_pct', '75', r''' the Btree page split size as a percentage of the maximum Btree page size, that is, when a Btree page is split, it will be split into smaller pages, where each page is the specified percentage of the maximum Btree page size''', - min='50', max='100'), + min='25', max='100'), ] # File metadata, including both configurable and non-configurable (internal) diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 98c246fb897..f2bffee06da 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -780,16 +780,9 @@ __btree_page_sizes(WT_SESSION_IMPL 
*session) * Get the split percentage (reconciliation splits pages into smaller * than the maximum page size chunks so we don't split every time a * new entry is added). Determine how large newly split pages will be. - * Set to the minimum, if the read value is less than that. */ WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval)); - if (cval.val < WT_BTREE_MIN_SPLIT_PCT) { - btree->split_pct = WT_BTREE_MIN_SPLIT_PCT; - WT_RET(__wt_msg(session, - "Re-setting split_pct for %s to the minimum allowed of " - "%d%%.", session->dhandle->name, WT_BTREE_MIN_SPLIT_PCT)); - } else - btree->split_pct = (int)cval.val; + btree->split_pct = (int)cval.val; intl_split_size = __wt_split_page_size(btree, btree->maxintlpage); leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage); diff --git a/src/config/config_def.c b/src/config/config_def.c index f152fbacad4..b11a8d63fdb 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -294,7 +294,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = { { "source", "string", NULL, NULL, NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, { "type", "string", NULL, NULL, NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, @@ -466,7 +466,7 @@ static const WT_CONFIG_CHECK confchk_file_config[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, @@ -530,7 +530,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { 
"split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, @@ -614,7 +614,7 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, @@ -1119,7 +1119,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,source=,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=90,type=file,value_format=u", + "split_deepen_per_child=0,split_pct=75,type=file,value_format=u", confchk_WT_SESSION_create, 42 }, { "WT_SESSION.drop", @@ -1213,7 +1213,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=90,value_format=u", + "split_deepen_per_child=0,split_pct=75,value_format=u", confchk_file_config, 35 }, { "file.meta", @@ -1228,7 +1228,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0," "log=(enabled=true),memory_page_max=5MB,os_cache_dirty_max=0," "os_cache_max=0,prefix_compression=false,prefix_compression_min=4" - ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90," + ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," 
"value_format=u,version=(major=0,minor=0)", confchk_file_meta, 39 }, @@ -1253,7 +1253,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "merge_min=0),memory_page_max=5MB,old_chunks=," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=90,value_format=u", + "split_deepen_per_child=0,split_pct=75,value_format=u", confchk_lsm_meta, 39 }, { "table.meta", diff --git a/src/include/btree.h b/src/include/btree.h index 28fe1b94b23..88312f408cc 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -57,12 +57,6 @@ /* Evict pages if we see this many consecutive deleted records. */ #define WT_BTREE_DELETE_THRESHOLD 1000 -/* - * Minimum size of the chunks (in percentage of the page size) a page gets split - * into during reconciliation. - */ -#define WT_BTREE_MIN_SPLIT_PCT 50 - /* * WT_BTREE -- * A btree handle. diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 707159ef6ae..558e93d3de0 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1242,8 +1242,8 @@ struct __wt_session { * @config{split_pct, the Btree page split size as a percentage of the * maximum Btree page size\, that is\, when a Btree page is split\, it * will be split into smaller pages\, where each page is the specified - * percentage of the maximum Btree page size., an integer between 50 and - * 100; default \c 90.} + * percentage of the maximum Btree page size., an integer between 25 and + * 100; default \c 75.} * @config{type, set the type of data source used to store a column * group\, index or simple table. By default\, a \c "file:" URI is * derived from the object name. 
The \c type configuration can be used diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index e18d44f96ff..23f654caa70 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -26,11 +26,6 @@ typedef struct { uint32_t flags; /* Caller's configuration */ WT_ITEM disk_image; /* Temporary disk-image buffer */ - /* - * Temporary buffer used to write out a disk image when managing two - * chunks worth of data in memory - */ - WT_ITEM *interim_buf; /* * Track start/stop write generation to decide if all changes to the @@ -132,7 +127,6 @@ typedef struct { * repeatedly split a packed page. */ uint32_t split_size; /* Split page size */ - uint32_t min_split_size; /* Minimum split page size */ /* * The problem with splits is we've done a lot of work by the time we @@ -157,6 +151,16 @@ typedef struct { */ size_t offset; /* Split's first byte */ + /* + * The recno and entries fields are the starting record number + * of the split chunk (for column-store splits), and the number + * of entries in the split chunk. These fields are used both + * to write the split chunk, and to create a new internal page + * to reference the split pages. + */ + uint64_t recno; /* Split's starting record */ + uint32_t entries; /* Split's entries */ + WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t checksum; /* Split's checksum */ @@ -178,42 +182,39 @@ typedef struct { size_t supd_allocated; /* - * While reconciling pages, at any given time, we maintain two - * split chunks in the memory to be written out as pages. As we - * get to the last two chunks, if the last one turns out to be - * smaller than the minimum split size, we go back into the - * penultimate chunk and split at this minimum split size - * boundary. This moves some data from the penultimate chunk to - * the last chunk, hence increasing the size of the last page - * written without decreasing the penultimate page size beyond - * the minimum split size. 
For this reason, we maintain both a - * maximum split percentage boundary and a minimum split - * percentage boundary. - * - * The recno and entries fields are the starting record number - * of the split chunk (for column-store splits), and the number - * of entries in the split chunk. These fields are used both to - * write the split chunk, and to create a new internal page to - * reference the split pages. - * * The key for a row-store page; no column-store key is needed * because the page's recno, stored in the recno field, is the * column-store key. */ - uint32_t max_bnd_entries; - uint64_t max_bnd_recno; - WT_ITEM max_bnd_key; - - size_t min_bnd_offset; - uint32_t min_bnd_entries; - uint64_t min_bnd_recno; - WT_ITEM min_bnd_key; + WT_ITEM key; /* Promoted row-store key */ } *bnd; /* Saved boundaries */ uint32_t bnd_next; /* Next boundary slot */ uint32_t bnd_next_max; /* Maximum boundary slots used */ size_t bnd_entries; /* Total boundary slots */ size_t bnd_allocated; /* Bytes allocated */ + /* + * We track the total number of page entries copied into split chunks + * so we can easily figure out how many entries in the current split + * chunk. + */ + uint32_t total_entries; /* Total entries in splits */ + + /* + * And there's state information as to where in this process we are: + * (1) tracking split boundaries because we can still fit more split + * chunks into the maximum page size, (2) tracking the maximum page + * size boundary because we can't fit any more split chunks into the + * maximum page size, (3) not performing boundary checks because it's + * either not useful with the current page size configuration, or + * because we've already been forced to split. 
+ */ + enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */ + SPLIT_MAX=1, /* Next: the maximum page boundary */ + SPLIT_TRACKING_OFF=2, /* No boundary checks */ + SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */ + bnd_state; + /* * We track current information about the current record number, the * number of entries copied into the temporary buffer, where we are @@ -292,14 +293,6 @@ typedef struct { uint32_t tested_ref_state; /* Debugging information */ } WT_RECONCILE; -#define WT_CROSSING_MIN_BND(r, next_len) \ - ((r)->bnd[(r)->bnd_next].min_bnd_offset == 0 && \ - ((r)->space_avail - (next_len)) < \ - ((r)->split_size - (r)->min_split_size)) -#define WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail) -#define WT_CHECK_CROSSING_BND(r, next_len) \ - (WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len)) - static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, bool); static void __rec_cell_build_addr(WT_SESSION_IMPL *, WT_RECONCILE *, const void *, size_t, u_int, uint64_t); @@ -321,7 +314,6 @@ static int __rec_col_var(WT_SESSION_IMPL *, static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *, WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t); static int __rec_destroy_session(WT_SESSION_IMPL *); -static uint32_t __rec_min_split_page_size(WT_BTREE *, uint32_t); static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_row_leaf(WT_SESSION_IMPL *, @@ -331,6 +323,7 @@ static int __rec_row_leaf_insert( static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *); +static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_row_promote( WT_SESSION_IMPL 
*, WT_RECONCILE *, WT_ITEM *, uint8_t); @@ -975,7 +968,6 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) *(WT_RECONCILE **)reconcilep = NULL; __wt_buf_free(session, &r->disk_image); - __wt_scr_free(session, &r->interim_buf); __wt_free(session, r->raw_entries); __wt_free(session, r->raw_offsets); @@ -1040,8 +1032,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy) __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->disk_image); __wt_free(session, bnd->supd); - __wt_buf_free(session, &bnd->max_bnd_key); - __wt_buf_free(session, &bnd->min_bnd_key); + __wt_buf_free(session, &bnd->key); } __wt_free(session, r->bnd); r->bnd_next = 0; @@ -1936,8 +1927,8 @@ static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { bnd->offset = 0; - bnd->max_bnd_recno = WT_RECNO_OOB; - bnd->max_bnd_entries = 0; + bnd->recno = WT_RECNO_OOB; + bnd->entries = 0; __wt_free(session, bnd->addr.addr); WT_CLEAR(bnd->addr); @@ -1952,10 +1943,6 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) bnd->already_compressed = false; - bnd->min_bnd_offset = 0; - bnd->min_bnd_entries = 0; - bnd->min_bnd_recno = WT_RECNO_OOB; - /* * Don't touch the key, we re-use that memory in each new * reconciliation. @@ -1987,63 +1974,39 @@ __rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * __rec_split_page_size_from_pct -- - * Given a split percentage, calculate split page size in bytes. + * __wt_split_page_size -- + * Split page size calculation: we don't want to repeatedly split every + * time a new entry is added, so we split to a smaller-than-maximum page size. 
*/ -static uint32_t -__rec_split_page_size_from_pct( - int split_pct, uint32_t maxpagesize, uint32_t allocsize) { +uint32_t +__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ uintmax_t a; uint32_t split_size; /* * Ideally, the split page size is some percentage of the maximum page - * size rounded to an allocation unit (round to an allocation unit so we - * don't waste space when we write). + * size rounded to an allocation unit (round to an allocation unit so + * we don't waste space when we write). */ a = maxpagesize; /* Don't overflow. */ split_size = (uint32_t)WT_ALIGN_NEAREST( - (a * (u_int)split_pct) / 100, allocsize); + (a * (u_int)btree->split_pct) / 100, btree->allocsize); /* - * Respect the configured split percentage if the calculated split size - * is either zero or a full page. The user has either configured an - * allocation size that matches the page size, or a split percentage - * that is close to zero or one hundred. Rounding is going to provide a - * worse outcome than having a split point that doesn't fall on an - * allocation size boundary in those cases. + * Respect the configured split percentage if the calculated split + * size is either zero or a full page. The user has either configured + * an allocation size that matches the page size, or a split + * percentage that is close to zero or one hundred. Rounding is going + * to provide a worse outcome than having a split point that doesn't + * fall on an allocation size boundary in those cases. */ if (split_size == 0 || split_size == maxpagesize) - split_size = (uint32_t)((a * (u_int)split_pct) / 100); + split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100); return (split_size); } -/* - * __wt_split_page_size -- - * Split page size calculation: we don't want to repeatedly split every - * time a new entry is added, so we split to a smaller-than-maximum page size. 
- */ -uint32_t -__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) -{ - return (__rec_split_page_size_from_pct( - btree->split_pct, maxpagesize, btree->allocsize)); -} - -/* - * __rec_min_split_page_size -- - * Minimum split size boundary calculation: To track a boundary at the - * minimum split size that we could have split at instead of splitting at - * the split page size. - */ -static uint32_t -__rec_min_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) -{ - return (__rec_split_page_size_from_pct( - WT_BTREE_MIN_SPLIT_PCT, maxpagesize, btree->allocsize)); -} - /* * __rec_split_init -- * Initialization for the reconciliation split functions. @@ -2055,7 +2018,7 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_BM *bm; WT_BTREE *btree; WT_PAGE_HEADER *dsk; - size_t corrected_page_size, disk_img_buf_size; + size_t corrected_page_size; btree = S2BT(session); bm = btree->bm; @@ -2090,6 +2053,33 @@ __rec_split_init(WT_SESSION_IMPL *session, r->max_raw_page_size = r->page_size = (uint32_t)WT_MIN(r->page_size * 10, WT_MAX(r->page_size, btree->maxmempage / 2)); + + /* + * Ensure the disk image buffer is large enough for the max object, as + * corrected by the underlying block manager. + */ + corrected_page_size = r->page_size; + WT_RET(bm->write_size(bm, session, &corrected_page_size)); + WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size)); + + /* + * Clear the disk page header to ensure all of it is initialized, even + * the unused fields. + * + * In the case of fixed-length column-store, clear the entire buffer: + * fixed-length column-store sets bits in bytes, where the bytes are + * assumed to initially be 0. + */ + memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? + corrected_page_size : WT_PAGE_HEADER_SIZE); + + /* + * Set the page type (the type doesn't change, and setting it later + * would require additional code in a few different places). 
+ */ + dsk = r->disk_image.mem; + dsk->type = page->type; + /* * If we have to split, we want to choose a smaller page size for the * split pages, because otherwise we could end up splitting one large @@ -2109,28 +2099,22 @@ __rec_split_init(WT_SESSION_IMPL *session, * creating overflow items and compacted data, for example, as those * items have already been written to disk). So, the loop calls the * helper functions when approaching a split boundary, and we save the - * information at that point. We also save the boundary information at - * the minimum split size. We maintain two chunks (each boundary - * represents a chunk that gets written as a page) in the memory, - * writing out the older one to the disk as a page when we need to make - * space for a new chunk. On reaching the last chunk, if it turns out to - * be smaller than the minimum split size, we go back into the - * penultimate chunk and split at this minimum split size boundary. This - * moves some data from the penultimate chunk to the last chunk, hence - * increasing the size of the last page written without decreasing the - * penultimate page size beyond the minimum split size. + * information at that point. That allows us to go back and split the + * page at the boundary points if we eventually overflow the maximum + * page size. * * Finally, all this doesn't matter for fixed-size column-store pages, * raw compression, and salvage. Fixed-size column store pages can * split under (very) rare circumstances, but they're allocated at a * fixed page size, never anything smaller. In raw compression, the - * underlying compression routine decides when we split, so it's not our - * problem. In salvage, as noted above, we can't split at all. + * underlying compression routine decides when we split, so it's not + * our problem. In salvage, as noted above, we can't split at all. 
*/ if (r->raw_compression || r->salvage != NULL) { r->split_size = 0; r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - } else if (page->type == WT_PAGE_COL_FIX) { + } + else if (page->type == WT_PAGE_COL_FIX) { r->split_size = r->page_size; r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); @@ -2138,53 +2122,32 @@ __rec_split_init(WT_SESSION_IMPL *session, r->split_size = __wt_split_page_size(btree, r->page_size); r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - r->min_split_size = - __rec_min_split_page_size(btree, r->page_size); } - - /* - * Ensure the disk image buffer is large enough for the max object, as - * corrected by the underlying block manager. - * - * The buffer that we build disk image in, needs to hold two chunks - * worth of data. Since we want to support split_size more than the page - * size (to allow for adjustments based on the compression), this buffer - * should be greater of twice of split_size and page_size. - */ - corrected_page_size = r->page_size; - disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size); - WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size)); - - /* - * Clear the disk page header to ensure all of it is initialized, even - * the unused fields. - * - * In the case of fixed-length column-store, clear the entire buffer: - * fixed-length column-store sets bits in bytes, where the bytes are - * assumed to initially be 0. - */ - memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? - disk_img_buf_size : WT_PAGE_HEADER_SIZE); - - /* - * Set the page type (the type doesn't change, and setting it later - * would require additional code in a few different places). - */ - dsk = r->disk_image.mem; - dsk->type = page->type; - r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); /* Initialize the first boundary. 
*/ r->bnd_next = 0; WT_RET(__rec_split_bnd_grow(session, r)); __rec_split_bnd_init(session, &r->bnd[0]); - r->bnd[0].max_bnd_recno = recno; + r->bnd[0].recno = recno; r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree); - /* Initialize the entry counter. */ - r->entries = 0; + /* + * If the maximum page size is the same as the split page size, either + * because of the object type or application configuration, there isn't + * any need to maintain split boundaries within a larger page. + * + * No configuration for salvage here, because salvage can't split. + */ + if (r->raw_compression) + r->bnd_state = SPLIT_TRACKING_RAW; + else if (max == r->split_size) + r->bnd_state = SPLIT_TRACKING_OFF; + else + r->bnd_state = SPLIT_BOUNDARY; + + /* Initialize the entry counters. */ + r->entries = r->total_entries = 0; /* Initialize the starting record number. */ r->recno = recno; @@ -2387,112 +2350,19 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) { WT_BM *bm; WT_BTREE *btree; - size_t corrected_page_size, inuse, len; + size_t corrected_page_size, len; btree = S2BT(session); bm = btree->bm; len = WT_PTRDIFF(r->first_free, r->disk_image.mem); - inuse = (len - r->bnd[r->bnd_next].offset) + - WT_PAGE_HEADER_BYTE_SIZE(btree); - corrected_page_size = inuse + add_len; - + corrected_page_size = len + add_len; WT_RET(bm->write_size(bm, session, &corrected_page_size)); - /* Need to account for buffer carrying two chunks worth of data */ - WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size)); - + WT_RET(__wt_buf_grow(session, &r->disk_image, corrected_page_size)); r->first_free = (uint8_t *)r->disk_image.mem + len; - WT_ASSERT(session, corrected_page_size >= inuse); - r->space_avail = corrected_page_size - inuse; + WT_ASSERT(session, corrected_page_size >= len); + r->space_avail = corrected_page_size - len; WT_ASSERT(session, r->space_avail >= add_len); - - return (0); -} - -/* - * __rec_split_write_prev_and_shift_cur -- - * Write the 
previous split chunk to the disk as a page. Shift the contents - * of the current chunk to the start of the buffer, making space for a new - * chunk to be written. - * If the caller asks for a chunk resizing, the boundary between the two - * chunks is readjusted to the minimum split size boundary details stored - * in the previous chunk, letting the current chunk grow at the cost of the - * previous chunk. - */ -static int -__rec_split_write_prev_and_shift_cur( - WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks) -{ - WT_BM *bm; - WT_BOUNDARY *bnd_cur, *bnd_prev; - WT_BTREE *btree; - WT_PAGE_HEADER *dsk, *dsk_tmp; - size_t cur_len, len; - uint8_t *dsk_start; - - WT_ASSERT(session, r->bnd_next != 0); - - btree = S2BT(session); - bm = btree->bm; - bnd_cur = &r->bnd[r->bnd_next]; - bnd_prev = bnd_cur - 1; - dsk = r->disk_image.mem; - cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; - - /* - * Resize chunks if the current is smaller than the minimum, and there - * are details on the minimum split size boundary available in the - * previous boundary details. - * - * There is a possibility that we do not have a minimum boundary set, in - * such a case we skip chunk resizing. Such a condition is possible for - * instance when we are building the image in the buffer and the first - * K/V pair is large enough that it surpasses both the minimum split - * size and the split size the application has set. In such a case we - * split the chunk without saving any minimum boundary. 
- */ - if (resize_chunks && - cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) { - bnd_cur->offset = bnd_prev->min_bnd_offset; - bnd_cur->max_bnd_entries += - bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries; - bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; - bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; - - WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key, - bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size)); - - /* Update current chunk's length */ - cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; - } - - /* - * Create an interim buffer if not already done to prepare the previous - * chunk's disk image. - */ - len = bnd_cur->offset; - WT_RET(bm->write_size(bm, session, &len)); - if (r->interim_buf == NULL) - WT_RET(__wt_scr_alloc(session, len, &r->interim_buf)); - else - WT_RET(__wt_buf_init(session, r->interim_buf, len)); - - dsk_tmp = r->interim_buf->mem; - memcpy(dsk_tmp, dsk, bnd_cur->offset); - dsk_tmp->recno = bnd_prev->max_bnd_recno; - dsk_tmp->u.entries = bnd_prev->max_bnd_entries; - dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset); - r->interim_buf->size = dsk_tmp->mem_size; - WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false)); - - /* Shift the current chunk to the start of the buffer */ - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len); - - /* Fix boundary offset */ - bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree); - /* Fix where free points */ - r->first_free = dsk_start + cur_len; return (0); } @@ -2512,9 +2382,6 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) btree = S2BT(session); dsk = r->disk_image.mem; - /* Fixed length col store can call with next_len 0 */ - WT_ASSERT(session, next_len == 0 || r->space_avail < next_len); - /* * We should never split during salvage, and we're about to drop core * because there's no parent page. 
@@ -2524,58 +2391,147 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); - last = &r->bnd[r->bnd_next]; - inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) + - WT_PAGE_HEADER_BYTE_SIZE(btree); - - /* - * We can get here if the first key/value pair won't fit. - * Additionally, grow the buffer to contain the current item if we - * haven't already consumed a reasonable portion of a split chunk. - */ - if (inuse < r->split_size / 2) - goto done; - /* Hitting a page boundary resets the dictionary, in all cases. */ __rec_dictionary_reset(r); - /* Set the number of entries for the just finished chunk. */ - last->max_bnd_entries = r->entries; + inuse = WT_PTRDIFF(r->first_free, dsk); + switch (r->bnd_state) { + case SPLIT_BOUNDARY: + /* + * We can get here if the first key/value pair won't fit. + * Additionally, grow the buffer to contain the current item if + * we haven't already consumed a reasonable portion of a split + * chunk. + */ + if (inuse < r->split_size / 2) + break; - /* - * In case of bulk load, write out chunks as we get them. - * In other cases, we keep two chunks in memory at a given time. So, if - * there is a previous chunk, write it out, making space in the buffer - * for the next chunk to be written. - */ - if (r->is_bulk_load) { - dsk->recno = last->max_bnd_recno; - dsk->u.entries = last->max_bnd_entries; - dsk->mem_size = (uint32_t)inuse; + /* + * About to cross a split boundary but not yet forced to split + * into multiple pages. If we have to split, this is one of the + * split points, save information about where we are when the + * split would have happened. + */ + WT_RET(__rec_split_bnd_grow(session, r)); + last = &r->bnd[r->bnd_next++]; + next = last + 1; + + /* Set the number of entries for the just finished chunk. 
*/ + last->entries = r->entries - r->total_entries; + r->total_entries = r->entries; + + /* Set the key for the next chunk. */ + next->recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || + dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &next->key, dsk->type)); + + /* + * Set the starting buffer offset and clear the entries (the + * latter not required, but cleaner). + */ + next->offset = WT_PTRDIFF(r->first_free, dsk); + next->entries = 0; + + /* Set the space available to another split-size chunk. */ + r->space_avail = + r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + + /* + * Adjust the space available to handle two cases: + * - We don't have enough room for another full split-size + * chunk on the page. + * - We chose to fill past a page boundary because of a + * large item. + */ + if (inuse + r->space_avail > r->page_size) { + r->space_avail = + r->page_size > inuse ? (r->page_size - inuse) : 0; + + /* There are no further boundary points. */ + r->bnd_state = SPLIT_MAX; + } + + /* + * Return if the next object fits into this page, else we have + * to split the page. + */ + if (r->space_avail >= next_len) + return (0); + + /* FALLTHROUGH */ + case SPLIT_MAX: + /* + * We're going to have to split and create multiple pages. + * + * Cycle through the saved split-point information, writing the + * split chunks we have tracked. The underlying fixup function + * sets the space available and other information, and copied + * any unwritten chunk of data to the beginning of the buffer. + */ + WT_RET(__rec_split_fixup(session, r)); + + /* We're done saving split chunks. */ + r->bnd_state = SPLIT_TRACKING_OFF; + break; + case SPLIT_TRACKING_OFF: + /* + * We can get here if the first key/value pair won't fit. + * Additionally, grow the buffer to contain the current item if + * we haven't already consumed a reasonable portion of a split + * chunk. 
+ */ + if (inuse < r->split_size / 2) + break; + + /* + * The key/value pairs didn't fit into a single page, but either + * we've already noticed that and are now processing the rest of + * the pairs at split size boundaries, or the split size was the + * same as the page size, and we never bothered with split point + * information at all. + */ + WT_RET(__rec_split_bnd_grow(session, r)); + last = &r->bnd[r->bnd_next++]; + next = last + 1; + + /* + * Set the key for the next chunk (before writing the block, a + * key range is needed in that code). + */ + next->recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || + dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &next->key, dsk->type)); + + /* Clear the entries (not required, but cleaner). */ + next->entries = 0; + + /* Finalize the header information and write the page. */ + dsk->recno = last->recno; + dsk->u.entries = r->entries; + dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); r->disk_image.size = dsk->mem_size; - WT_RET(__rec_split_write( - session, r, last, &r->disk_image, false)); - /* Fix where free points */ + WT_RET( + __rec_split_write(session, r, last, &r->disk_image, false)); + + /* + * Set the caller's entry count and buffer information for the + * next chunk. We only get here if we're not splitting or have + * already split, so it's split-size chunks from here on out. + */ + r->entries = 0; r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); - } else if (r->bnd_next != 0) - WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false)); + r->space_avail = + r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + break; + case SPLIT_TRACKING_RAW: + return (__wt_illegal_value(session, NULL)); + } - /* Prepare the next boundary */ - WT_RET(__rec_split_bnd_grow(session, r)); - r->bnd_next++; - next = &r->bnd[r->bnd_next]; - next->offset = WT_PTRDIFF(r->first_free, dsk); - /* Set the key for the next chunk. 
*/ - next->max_bnd_recno = r->recno; - if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF) - WT_RET(__rec_split_row_promote( - session, r, &next->max_bnd_key, dsk->type)); - - r->entries = 0; - /* Set the space available to another split-size chunk. */ - r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - -done: /* + /* * Overflow values can be larger than the maximum page size but still be * "on-page". If the next key/value pair is larger than space available * after a split has happened (in other words, larger than the maximum @@ -2592,66 +2548,6 @@ done: /* return (0); } -/* - * __rec_split_crossing_bnd -- - * Save the details for the minimum split size boundary or call for a - * split. - */ -static inline int -__rec_split_crossing_bnd( - WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) -{ - WT_BOUNDARY *bnd; - WT_BTREE *btree; - WT_PAGE_HEADER *dsk; - size_t min_bnd_offset; - - WT_ASSERT(session, WT_CHECK_CROSSING_BND(r, next_len)); - - /* - * If crossing the minimum split size boundary, store the boundary - * details at the current location in the buffer. If we are crossing the - * split boundary at the same time, possible when the next record is - * large enough, just split at this point. - */ - if (WT_CROSSING_MIN_BND(r, next_len) && - !WT_CROSSING_SPLIT_BND(r, next_len)) { - btree = S2BT(session); - bnd = &r->bnd[r->bnd_next]; - dsk = r->disk_image.mem; - min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) - - bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree); - if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree)) - /* - * This is possible if the first record doesn't fit in - * the minimum split size, we write this record without - * setting up any boundary here. We will get the - * opportunity to setup a boundary before writing out - * the next record. - */ - return (0); - - WT_ASSERT(session, bnd->min_bnd_offset == 0); - - /* - * Hitting a page boundary resets the dictionary, in all cases. 
- */ - __rec_dictionary_reset(r); - - bnd->min_bnd_offset = min_bnd_offset; - bnd->min_bnd_entries = r->entries; - bnd->min_bnd_recno = r->recno; - if (dsk->type == WT_PAGE_ROW_INT || - dsk->type == WT_PAGE_ROW_LEAF) - WT_RET(__rec_split_row_promote( - session, r, &bnd->min_bnd_key, dsk->type)); - return (0); - } - - /* We are crossing a split boundary */ - return (__rec_split(session, r, next_len)); -} - /* * __rec_split_raw_worker -- * Handle the raw compression page reconciliation bookkeeping. @@ -2730,7 +2626,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, */ recno = WT_RECNO_OOB; if (dsk->type == WT_PAGE_COL_VAR) - recno = last->max_bnd_recno; + recno = last->recno; entry = max_image_slot = slots = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { @@ -2957,7 +2853,7 @@ no_slots: */ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP; dsk_dst = dst->mem; - dsk_dst->recno = last->max_bnd_recno; + dsk_dst->recno = last->recno; dsk_dst->mem_size = r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP; dsk_dst->u.entries = r->raw_entries[result_slots - 1]; @@ -2977,7 +2873,7 @@ no_slots: WT_RET(__wt_strndup(session, dsk, dsk_dst->mem_size, &last->disk_image)); disk_image = last->disk_image; - disk_image->recno = last->max_bnd_recno; + disk_image->recno = last->recno; disk_image->mem_size = dsk_dst->mem_size; disk_image->u.entries = dsk_dst->u.entries; } @@ -3007,14 +2903,14 @@ no_slots: */ switch (dsk->type) { case WT_PAGE_COL_INT: - next->max_bnd_recno = r->raw_recnos[result_slots]; + next->recno = r->raw_recnos[result_slots]; break; case WT_PAGE_COL_VAR: - next->max_bnd_recno = r->raw_recnos[result_slots - 1]; + next->recno = r->raw_recnos[result_slots - 1]; break; case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - next->max_bnd_recno = WT_RECNO_OOB; + next->recno = WT_RECNO_OOB; if (!last_block) { /* * Confirm there was uncompressed data remaining @@ -3023,7 +2919,7 @@ no_slots: */ WT_ASSERT(session, len > 0); WT_RET(__rec_split_row_promote_cell( - session, dsk, 
&next->max_bnd_key)); + session, dsk, &next->key)); } break; } @@ -3035,7 +2931,7 @@ no_slots: */ WT_STAT_DATA_INCR(session, compress_raw_fail); - dsk->recno = last->max_bnd_recno; + dsk->recno = last->recno; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); dsk->u.entries = r->entries; r->disk_image.size = dsk->mem_size; @@ -3112,9 +3008,35 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) static int __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BOUNDARY *bnd; WT_PAGE_HEADER *dsk; - bool grow_bnd; + + /* Adjust the boundary information based on our split status. */ + switch (r->bnd_state) { + case SPLIT_BOUNDARY: + case SPLIT_MAX: + /* + * We never split, the reconciled page fit into a maximum page + * size. Change the first boundary slot to represent the full + * page (the first boundary slot is largely correct, just update + * the number of entries). + */ + r->bnd_next = 0; + break; + case SPLIT_TRACKING_OFF: + /* + * If we have already split, or aren't tracking boundaries, put + * the remaining data in the next boundary slot. + */ + WT_RET(__rec_split_bnd_grow(session, r)); + break; + case SPLIT_TRACKING_RAW: + /* + * We were configured for raw compression, and either we never + * wrote anything, or there's a remaindered block of data. + */ + break; + } /* * We may arrive here with no entries to write if the page was entirely @@ -3141,66 +3063,20 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); } - dsk = r->disk_image.mem; + /* Set the boundary reference and increment the count. */ + bnd = &r->bnd[r->bnd_next++]; + bnd->entries = r->entries; - /* Set the number of entries for the just finished chunk. */ - bnd_cur = &r->bnd[r->bnd_next]; - bnd_cur->max_bnd_entries = r->entries; - - grow_bnd = true; - /* - * We can reach here even with raw_compression when the last split chunk - * is too small to be sent for raw compression. 
- */ - if (!r->is_bulk_load && !r->raw_compression) { - if (WT_PTRDIFF(r->first_free, dsk) > r->page_size && - r->bnd_next != 0) { - /* - * We hold two boundaries worth of data in the buffer, - * and this data doesn't fit in a single page. If the - * last chunk is too small, readjust the boundary to a - * pre-computed minimum. - * Write out the penultimate chunk to the disk as a page - */ - WT_RET(__rec_split_write_prev_and_shift_cur( - session, r, true)); - } else - if (r->bnd_next != 0) { - /* - * We have two boundaries, but the data in the - * buffer can fit a single page. Merge the - * boundaries to create a single chunk. - */ - bnd_prev = bnd_cur - 1; - bnd_prev->max_bnd_entries += - bnd_cur->max_bnd_entries; - r->bnd_next--; - grow_bnd = false; - } - } - - /* - * We already have space for an extra boundary if we merged two - * boundaries above, in that case we do not need to grow the boundary - * structure. - */ - if (grow_bnd) - WT_RET(__rec_split_bnd_grow(session, r)); - bnd_cur = &r->bnd[r->bnd_next]; - r->bnd_next++; - - /* - * Current boundary now has all the remaining data/last page now. - * Let's write it to the disk - */ - dsk->recno = bnd_cur->max_bnd_recno; - dsk->u.entries = bnd_cur->max_bnd_entries; + /* Finalize the header information. */ + dsk = r->disk_image.mem; + dsk->recno = bnd->recno; + dsk->u.entries = r->entries; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); r->disk_image.size = dsk->mem_size; /* If this is a checkpoint, we're done, otherwise write the page. */ - return (__rec_is_checkpoint(session, r, bnd_cur) ? - 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true)); + return (__rec_is_checkpoint(session, r, bnd) ? + 0 : __rec_split_write(session, r, bnd, &r->disk_image, true)); } /* @@ -3233,6 +3109,98 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (__rec_split_finish_std(session, r)); } +/* + * __rec_split_fixup -- + * Fix up after crossing the maximum page boundary. 
+ */ +static int +__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_BOUNDARY *bnd; + WT_BTREE *btree; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_PAGE_HEADER *dsk; + size_t i, len; + uint8_t *dsk_start, *p; + + /* + * When we overflow physical limits of the page, we walk the list of + * split chunks we've created and write those pages out, then update + * the caller's information. + */ + btree = S2BT(session); + + /* + * The data isn't laid out on a page boundary or nul padded; copy it to + * a clean, aligned, padded buffer before writing it. + * + * Allocate a scratch buffer to hold the new disk image. Copy the disk + * page's header and block-manager space into the scratch buffer, most + * of the header information remains unchanged between the pages. + */ + WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp)); + dsk = tmp->mem; + memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_BYTE_SIZE(btree)); + + /* + * For each split chunk we've created, update the disk image and copy + * it into place. + */ + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) { + /* Copy the page contents to the temporary buffer. */ + len = (bnd + 1)->offset - bnd->offset; + memcpy(dsk_start, + (uint8_t *)r->disk_image.mem + bnd->offset, len); + + /* Finalize the header information and write the page. */ + dsk->recno = bnd->recno; + dsk->u.entries = bnd->entries; + tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len; + dsk->mem_size = WT_STORE_SIZE(tmp->size); + WT_ERR(__rec_split_write(session, r, bnd, tmp, false)); + } + + /* + * There is probably a remnant in the working buffer that didn't get + * written, copy it down to the beginning of the working buffer. + * + * Confirm the remnant is no larger than a split-sized chunk, including + * header. 
We know that's the maximum sized remnant because we only have + * remnants if split switches from accumulating to a split boundary to + * accumulating to the end of the page (the other path here is when we + * hit a split boundary, there was room for another split chunk in the + * page, and the next item still wouldn't fit, in which case there is no + * remnant). So: we were accumulating to the end of the page and created + * a remnant. We know the remnant cannot be as large as a split-sized + * chunk, including header, because if there was room for that large a + * remnant, we wouldn't have switched from accumulating to a page end. + */ + p = (uint8_t *)r->disk_image.mem + bnd->offset; + len = WT_PTRDIFF(r->first_free, p); + if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) + WT_PANIC_ERR(session, EINVAL, + "Reconciliation remnant too large for the split buffer"); + dsk = r->disk_image.mem; + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + (void)memmove(dsk_start, p, len); + + /* + * Fix up our caller's information, including updating the starting + * record number. + */ + r->entries -= r->total_entries; + r->first_free = dsk_start + len; + WT_ASSERT(session, + r->page_size >= (WT_PAGE_HEADER_BYTE_SIZE(btree) + len)); + r->space_avail = + r->split_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); + +err: __wt_scr_free(session, &tmp); + return (ret); +} + /* * __rec_split_write -- * Write a disk block out for the split helper functions. @@ -3270,6 +3238,8 @@ __rec_split_write(WT_SESSION_IMPL *session, F_SET(dsk, WT_PAGE_EMPTY_V_NONE); } + bnd->entries = r->entries; + /* Initialize the address (set the page type for the parent). 
*/ switch (dsk->type) { case WT_PAGE_COL_FIX: @@ -3315,8 +3285,7 @@ __rec_split_write(WT_SESSION_IMPL *session, switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: - if (WT_INSERT_RECNO(supd->ins) >= - (bnd + 1)->max_bnd_recno) + if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno) goto supd_check_complete; break; case WT_PAGE_ROW_LEAF: @@ -3327,8 +3296,8 @@ __rec_split_write(WT_SESSION_IMPL *session, key->data = WT_INSERT_KEY(supd->ins); key->size = WT_INSERT_KEY_SIZE(supd->ins); } - WT_ERR(__wt_compare(session, btree->collator, - key, &(bnd + 1)->max_bnd_key, &cmp)); + WT_ERR(__wt_compare(session, + btree->collator, key, &(bnd + 1)->key, &cmp)); if (cmp >= 0) goto supd_check_complete; break; @@ -3418,14 +3387,14 @@ supd_check_complete: #ifdef HAVE_VERBOSE /* Output a verbose message if we create a page without many entries */ - if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && - bnd->max_bnd_entries < 6) + if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && r->entries < 6) __wt_verbose(session, WT_VERB_SPLIT, "Reconciliation creating a page with %" PRIu32 " entries, memory footprint %" WT_SIZET_FMT - ", page count %" PRIu32 ", %s", bnd->max_bnd_entries, - r->page->memory_footprint, r->bnd_next, - F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint"); + ", page count %" PRIu32 ", %s, split state: %d", + r->entries, r->page->memory_footprint, r->bnd_next, + F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint", + r->bnd_state); #endif WT_ERR(__wt_bt_write(session, buf, addr, &addr_size, @@ -3711,12 +3680,11 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) cursor->value.data, cursor->value.size, (uint64_t)0)); /* Boundary: split or write the page. 
*/ - if (r->raw_compression) { - if (key->len + val->len > r->space_avail) - WT_RET(__rec_split_raw( - session, r, key->len + val->len)); - } else - if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) { + if (key->len + val->len > r->space_avail) { + if (r->raw_compression) + WT_RET( + __rec_split_raw(session, r, key->len + val->len)); + else { /* * Turn off prefix compression until a full key written * to the new page, and (unless already working with an @@ -3728,9 +3696,10 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_cell_build_leaf_key( session, r, NULL, 0, &ovfl_key)); } - WT_RET(__rec_split_crossing_bnd( - session, r, key->len + val->len)); + + WT_RET(__rec_split(session, r, key->len + val->len)); } + } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -3771,10 +3740,6 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) * split. * * Boundary: split or write the page. - * - * No need to have a minimum split size boundary, all - * pages are filled 100% except the last, allowing it to - * grow in the future. */ __rec_incr(session, r, cbulk->entry, __bitstr_size( @@ -3879,12 +3844,10 @@ __wt_bulk_insert_var( r, cbulk->last.data, cbulk->last.size, cbulk->rle)); /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (val->len > r->space_avail) - WT_RET(__rec_split_raw(session, r, val->len)); - } else - if (WT_CROSSING_SPLIT_BND(r, val->len)) - WT_RET(__rec_split_crossing_bnd(session, r, val->len)); + if (val->len > r->space_avail) + WT_RET(r->raw_compression ? + __rec_split_raw(session, r, val->len) : + __rec_split(session, r, val->len)); /* Copy the value onto the page. */ if (btree->dictionary) @@ -4020,13 +3983,10 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_CHILD_RELEASE_ERR(session, hazard, ref); /* Boundary: split or write the page. 
*/ - if (r->raw_compression) { - if (val->len > r->space_avail) - WT_ERR(__rec_split_raw(session, r, val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, val->len)) - WT_ERR(__rec_split_crossing_bnd( - session, r, val->len)); + if (val->len > r->space_avail) + WT_ERR(r->raw_compression ? + __rec_split_raw(session, r, val->len) : + __rec_split(session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4068,13 +4028,10 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), r->recno); /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (val->len > r->space_avail) - WT_RET(__rec_split_raw(session, r, val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, val->len)) - WT_RET(__rec_split_crossing_bnd( - session, r, val->len)); + if (val->len > r->space_avail) + WT_RET(r->raw_compression ? + __rec_split_raw(session, r, val->len) : + __rec_split(session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4182,10 +4139,6 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) * split. * * Boundary: split or write the page. - * - * No need to have a minimum split size boundary, all - * pages are filled 100% except the last, allowing it to - * grow in the future. */ __rec_incr(session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); @@ -4342,13 +4295,10 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, session, r, value->data, value->size, rle)); /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (val->len > r->space_avail) - WT_RET(__rec_split_raw(session, r, val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, val->len)) - WT_RET(__rec_split_crossing_bnd( - session, r, val->len)); + if (val->len > r->space_avail) + WT_RET(r->raw_compression ? 
+ __rec_split_raw(session, r, val->len) : + __rec_split(session, r, val->len)); /* Copy the value onto the page. */ if (!deleted && !overflow_type && btree->dictionary) @@ -5011,12 +4961,11 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->cell_zero = false; /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (key->len + val->len > r->space_avail) + if (key->len + val->len > r->space_avail) { + if (r->raw_compression) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { + else { /* * In one path above, we copied address blocks * from the page rather than building the actual @@ -5028,10 +4977,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_IKEY_DATA(ikey), ikey->size)); key_onpage_ovfl = false; } - - WT_ERR(__rec_split_crossing_bnd( + WT_ERR(__rec_split( session, r, key->len + val->len)); } + } /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5081,14 +5030,10 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (key->len + val->len > r->space_avail) - WT_RET(__rec_split_raw( - session, r, key->len + val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) - WT_RET(__rec_split_crossing_bnd( - session, r, key->len + val->len)); + if (key->len + val->len > r->space_avail) + WT_RET(r->raw_compression ? + __rec_split_raw(session, r, key->len + val->len) : + __rec_split(session, r, key->len + val->len)); /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5417,17 +5362,16 @@ build: } /* Boundary: split or write the page. 
*/ - if (r->raw_compression) { - if (key->len + val->len > r->space_avail) + if (key->len + val->len > r->space_avail) { + if (r->raw_compression) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { + else { /* - * If we copied address blocks from the page - * rather than building the actual key, we have - * to build the key now because we are about to - * promote it. + * In one path above, we copied address blocks + * from the page rather than building the actual + * key. In that case, we have to build the key + * now because we are about to promote it. */ if (key_onpage_ovfl) { WT_ERR(__wt_dsk_cell_data_ref(session, @@ -5446,13 +5390,14 @@ build: if (!ovfl_key) WT_ERR( __rec_cell_build_leaf_key( - session, r, NULL, 0, - &ovfl_key)); + session, + r, NULL, 0, &ovfl_key)); } - WT_ERR(__rec_split_crossing_bnd( + WT_ERR(__rec_split( session, r, key->len + val->len)); } + } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -5515,12 +5460,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); /* Boundary: split or write the page. */ - if (r->raw_compression) { - if (key->len + val->len > r->space_avail) + if (key->len + val->len > r->space_avail) { + if (r->raw_compression) WT_RET(__rec_split_raw( session, r, key->len + val->len)); - } else - if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { + else { /* * Turn off prefix compression until a full key * written to the new page, and (unless already @@ -5532,13 +5476,14 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) if (!ovfl_key) WT_RET( __rec_cell_build_leaf_key( - session, r, NULL, 0, - &ovfl_key)); + session, + r, NULL, 0, &ovfl_key)); } - WT_RET(__rec_split_crossing_bnd( + WT_RET(__rec_split( session, r, key->len + val->len)); } + } /* Copy the key/value pair onto the page. 
*/ __rec_copy_incr(session, r, key); @@ -5650,14 +5595,13 @@ __rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r) __wt_verbose(session, WT_VERB_SPLIT, "starting key %s", __wt_buf_set_printable( - session, bnd->max_bnd_key.data, - bnd->max_bnd_key.size, tkey)); + session, bnd->key.data, bnd->key.size, tkey)); break; case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: __wt_verbose(session, WT_VERB_SPLIT, - "starting recno %" PRIu64, bnd->max_bnd_recno); + "starting recno %" PRIu64, bnd->recno); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -5919,10 +5863,10 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* We never set the first page's key, grab it from the original page. */ ref = r->ref; if (__wt_ref_is_root(ref)) - WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, "", 1)); + WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1)); else { __wt_ref_key(ref->home, ref, &p, &size); - WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, p, size)); + WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size)); } /* Allocate, then initialize the array of replacement blocks. */ @@ -5930,8 +5874,8 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - WT_RET(__wt_row_ikey_alloc(session, 0, bnd->max_bnd_key.data, - bnd->max_bnd_key.size, &multi->key.ikey)); + WT_RET(__wt_row_ikey_alloc(session, 0, + bnd->key.data, bnd->key.size, &multi->key.ikey)); /* * Copy any disk image. Don't take saved updates without a @@ -5978,7 +5922,7 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - multi->key.recno = bnd->max_bnd_recno; + multi->key.recno = bnd->recno; /* * Copy any disk image. 
Don't take saved updates without a diff --git a/test/format/config.h b/test/format/config.h index b5feb7a5321..e3e1e73a786 100644 --- a/test/format/config.h +++ b/test/format/config.h @@ -284,7 +284,7 @@ static CONFIG c[] = { { "split_pct", "page split size as a percentage of the maximum page size", - 0x0, 50, 100, 100, &g.c_split_pct, NULL }, + 0x0, 40, 85, 85, &g.c_split_pct, NULL }, { "statistics", "maintain statistics", /* 20% */ -- cgit v1.2.1 From 0a0bfa94d912933bede4f2550ac34a69916cb416 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 29 Mar 2017 16:26:57 -0400 Subject: WT-3244 Turn off in-memory cache-full checks on the metadata file (#3359) This avoids metadata operations failing in in-memory configurations. --- src/btree/bt_handle.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index f2bffee06da..57e0a3422f2 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -359,6 +359,14 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) } else F_CLR(btree, WT_BTREE_IGNORE_CACHE); + /* + * The metadata isn't blocked by in-memory cache limits because metadata + * "unroll" is performed by updates that are potentially blocked by the + * cache-full checks. + */ + if (WT_IS_METADATA(btree->dhandle)) + F_SET(btree, WT_BTREE_IGNORE_CACHE); + WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); if (cval.val) F_CLR(btree, WT_BTREE_NO_LOGGING); -- cgit v1.2.1 From 2874db3364248da2e96ca0bde45fa08482445b57 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Thu, 30 Mar 2017 07:38:32 +1100 Subject: WT-3208 Don't count page rewrites as eviction making progress. 
(#3356) --- src/btree/bt_discard.c | 33 +++++++++++++++++++++++++++------ src/btree/bt_split.c | 2 +- src/include/btree.i | 12 ++++++++++-- src/include/extern.h | 1 + 4 files changed, 39 insertions(+), 9 deletions(-) diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index d2beb84fee9..bab7b8145d6 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -16,13 +16,14 @@ static void __free_skip_array( WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t, bool); static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *, bool); static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t, bool); +static void __page_out_int(WT_SESSION_IMPL *, WT_PAGE **, bool); /* - * __wt_ref_out -- + * __wt_ref_out_int -- * Discard an in-memory page, freeing all memory associated with it. */ void -__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_ref_out_int(WT_SESSION_IMPL *session, WT_REF *ref, bool rewrite) { /* * A version of the page-out function that allows us to make additional @@ -56,15 +57,25 @@ __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) } #endif - __wt_page_out(session, &ref->page); + __page_out_int(session, &ref->page, rewrite); } /* - * __wt_page_out -- + * __wt_ref_out -- * Discard an in-memory page, freeing all memory associated with it. */ void -__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) +__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) +{ + __wt_ref_out_int(session, ref, false); +} + +/* + * __page_out_int -- + * Discard an in-memory page, freeing all memory associated with it. + */ +static void +__page_out_int(WT_SESSION_IMPL *session, WT_PAGE **pagep, bool rewrite) { WT_PAGE *page; WT_PAGE_HEADER *dsk; @@ -103,7 +114,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) } /* Update the cache's information. 
*/ - __wt_cache_page_evict(session, page); + __wt_cache_page_evict(session, page, rewrite); dsk = (WT_PAGE_HEADER *)page->dsk; if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) @@ -147,6 +158,16 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) __wt_overwrite_and_free(session, page); } +/* + * __wt_page_out -- + * Discard an in-memory page, freeing all memory associated with it. + */ +void +__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) +{ + __page_out_int(session, pagep, false); +} + /* * __free_page_modify -- * Discard the page's associated modification structures. diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index b1bad760826..49043c8bab4 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -2274,7 +2274,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) * reconciliation, do it now. */ __wt_page_modify_clear(session, page); - __wt_ref_out(session, ref); + __wt_ref_out_int(session, ref, true); /* Swap the new page into place. */ ref->page = new->page; diff --git a/src/include/btree.i b/src/include/btree.i index eefc2db075d..a4d88d5fda1 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -413,7 +413,7 @@ __wt_cache_page_image_incr(WT_SESSION_IMPL *session, uint32_t size) * Evict pages from the cache. */ static inline void -__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) +__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page, bool rewrite) { WT_BTREE *btree; WT_CACHE *cache; @@ -456,7 +456,15 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) /* Update pages and bytes evicted. */ (void)__wt_atomic_add64(&cache->bytes_evict, page->memory_footprint); - (void)__wt_atomic_addv64(&cache->pages_evict, 1); + + /* + * Don't count rewrites as eviction: there's no guarantee we are making + * real progress. 
+ */ + if (rewrite) + (void)__wt_atomic_subv64(&cache->pages_inmem, 1); + else + (void)__wt_atomic_addv64(&cache->pages_evict, 1); } /* diff --git a/src/include/extern.h b/src/include/extern.h index a7eb4b491a9..2759ac1dec3 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -126,6 +126,7 @@ extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_ref_out_int(WT_SESSION_IMPL *session, WT_REF *ref, bool rewrite) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pages) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -- cgit v1.2.1 From f379b4be6881ebda712f79053b6dc1e13938e59a Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 29 Mar 2017 17:15:25 -0400 Subject: WT-3155 Remove WT_CONN_SERVER_RUN flag (#3344) Set WT_CONN_CLOSING earlier in the connection close process (before calling the async close functions). This requires removing the assert in btree handle open that close hasn't yet been called. Add a barrier after setting the connection close flag to ensure the write is flushed. 
LSM workers checked both the WT_CONN_SERVER_RUN and WT_LSM_WORKER_RUN flags because the LSM destroy path (__lsm_manager_worker_shutdown) didn't clear the WT_LSM_WORKER_RUN flag. Add that clear, change __lsm_worker to only check WT_LSM_WORKER_RUN. Previously, the LSM manager checked the WT_CONN_SERVER_RUN flag in the LSM destroy path and connection shutdown waited on the LSM manager to stop and clear WT_CONN_SERVER_LSM. Flip that process: the LSM shutdown path now clears WT_CONN_SERVER_LSM, and the LSM manager stops when it sees WT_CONN_SERVER_LSM is cleared. The LSM manager sets a new flag, WT_LSM_MANAGER_SHUTDOWN, when it's stopped, and the shutdown process waits on that new flag. Add memory barriers to the thread create and join functions. WiredTiger typically sets (clears) state and expects threads to see the state and start (stop). It's simpler and safer if we imply a barrier in the thread API. * Rename WT_CONN_LOG_SERVER_RUN to WT_CONN_SERVER_LOG to match the other server flags. * Once the async and LSM servers have exited, assert no more files are opened. * Instead of using a barrier to ensure the worker run state isn't cached, declare the structure field volatile. Use a stand-alone structure field instead of a set of flags; it's a simpler "volatile" story. * In one of two places, when shutting down worker threads, we signalled the condition variable to wake the worker thread. For consistency, remove the signal (we're only sleeping for a 100th of a second, the wake isn't buying us anything). * Restore the assertion in __open_session() that we're not in the "closing" path; returning an error is more dangerous, it might cause a thread to panic, and then we have a panic racing with the close. * A wt_thread_t (POSIX pthread_t) is an opaque type, and can't be assigned to 0 or tested against an integral value portably. Add a bool WT_LSM_WORKER_ARGS.tid_set field instead of assigning or testing the wt_thread_t.
We already have an __wt_lsm_start function, add a __wt_lsm_stop function and move the setting/clearing of the WT_LSM_WORKER_ARGS.{running,tid_set} fields into those functions so we ensure the ordering is correct. --- dist/flags.py | 4 ++-- src/conn/conn_dhandle.c | 3 ++- src/conn/conn_log.c | 12 +++++------ src/conn/conn_open.c | 19 ++++++++++------- src/include/extern.h | 1 + src/include/flags.h | 14 ++++++------ src/include/lsm.h | 10 +++++++-- src/lsm/lsm_manager.c | 54 +++++++++++++++++++++++++---------------------- src/lsm/lsm_worker.c | 23 +++++++++++++++----- src/os_posix/os_thread.c | 14 ++++++++++++ src/os_win/os_thread.c | 14 ++++++++++++ src/session/session_api.c | 4 ++-- 12 files changed, 114 insertions(+), 58 deletions(-) diff --git a/dist/flags.py b/dist/flags.py index b20a7181532..64b5d789e72 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -96,19 +96,19 @@ flags = { 'CONN_CACHE_POOL', 'CONN_CKPT_SYNC', 'CONN_CLOSING', + 'CONN_CLOSING_NO_MORE_OPENS', 'CONN_EVICTION_RUN', 'CONN_IN_MEMORY', 'CONN_LAS_OPEN', 'CONN_LEAK_MEMORY', - 'CONN_LOG_SERVER_RUN', 'CONN_LSM_MERGE', 'CONN_PANIC', 'CONN_READONLY', 'CONN_RECOVERING', 'CONN_SERVER_ASYNC', 'CONN_SERVER_CHECKPOINT', + 'CONN_SERVER_LOG', 'CONN_SERVER_LSM', - 'CONN_SERVER_RUN', 'CONN_SERVER_STATISTICS', 'CONN_SERVER_SWEEP', 'CONN_WAS_BACKUP', diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index c5480897494..657cdebf7ee 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -314,7 +314,8 @@ __wt_conn_btree_open( F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && !LF_ISSET(WT_DHANDLE_LOCK_ONLY)); - WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING)); + WT_ASSERT(session, + !F_ISSET(S2C(session), WT_CONN_CLOSING_NO_MORE_OPENS)); /* * If the handle is already open, it has to be closed so it can be diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index c6dd795389d..b8b5bd2a908 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -341,7 +341,7 @@ 
__wt_log_truncate_files( conn = S2C(session); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); - if (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN) && + if (F_ISSET(conn, WT_CONN_SERVER_LOG) && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) WT_RET_MSG(session, EINVAL, "Attempt to archive manually while a server is running"); @@ -382,7 +382,7 @@ __log_file_server(void *arg) conn = S2C(session); log = conn->log; locked = false; - while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { + while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { /* * If there is a log file to close, make sure any outstanding * write operations have completed, then fsync and close it. @@ -708,7 +708,7 @@ __log_wrlsn_server(void *arg) log = conn->log; yield = 0; WT_INIT_LSN(&prev); - while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { + while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { /* * Write out any log record buffers if anything was done * since last time. Only call the function to walk the @@ -783,7 +783,7 @@ __log_server(void *arg) * takes to sync out an earlier file. */ did_work = true; - while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { + while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { /* * Slots depend on future activity. Force out buffered * writes in case we are idle. This cannot be part of the @@ -923,7 +923,7 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); - F_SET(conn, WT_CONN_LOG_SERVER_RUN); + F_SET(conn, WT_CONN_SERVER_LOG); /* * Start the log close thread. It is not configurable. 
@@ -995,7 +995,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn = S2C(session); - F_CLR(conn, WT_CONN_LOG_SERVER_RUN); + F_CLR(conn, WT_CONN_SERVER_LOG); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) { /* diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 5b20377d437..eb3c79422a0 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -21,12 +21,6 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) session = conn->default_session; WT_ASSERT(session, session->iface.connection == &conn->iface); - /* - * Tell internal server threads to run: this must be set before opening - * any sessions. - */ - F_SET(conn, WT_CONN_SERVER_RUN); - /* WT_SESSION_IMPL array. */ WT_RET(__wt_calloc(session, conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions)); @@ -100,6 +94,10 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) __wt_yield(); } + /* Shut down the subsystems, ensuring workers see the state change. */ + F_SET(conn, WT_CONN_CLOSING); + WT_FULL_BARRIER(); + /* * Clear any pending async operations and shut down the async worker * threads and system before closing LSM. @@ -113,10 +111,15 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * btree handles, so take care in ordering shutdown to make sure they * exit before files are closed. */ - F_CLR(conn, WT_CONN_SERVER_RUN); WT_TRET(__wt_lsm_manager_destroy(session)); - F_SET(conn, WT_CONN_CLOSING); + /* + * Once the async and LSM threads exit, we shouldn't be opening any + * more files. 
+ */ + F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS); + WT_FULL_BARRIER(); + WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, true)); WT_TRET(__wt_sweep_destroy(session)); diff --git a/src/include/extern.h b/src/include/extern.h index 2759ac1dec3..47b4e03a7b7 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -458,6 +458,7 @@ extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_lsm_worker_stop(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_apply_all(WT_SESSION_IMPL *session, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint, WT_CKPT *ckpt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session, const char *fname, const char **namep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/flags.h b/src/include/flags.h index 
c1fff920e3b..f26a45c68f5 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -6,19 +6,19 @@ #define WT_CONN_CACHE_POOL 0x00000001 #define WT_CONN_CKPT_SYNC 0x00000002 #define WT_CONN_CLOSING 0x00000004 -#define WT_CONN_EVICTION_RUN 0x00000008 -#define WT_CONN_IN_MEMORY 0x00000010 -#define WT_CONN_LAS_OPEN 0x00000020 -#define WT_CONN_LEAK_MEMORY 0x00000040 -#define WT_CONN_LOG_SERVER_RUN 0x00000080 +#define WT_CONN_CLOSING_NO_MORE_OPENS 0x00000008 +#define WT_CONN_EVICTION_RUN 0x00000010 +#define WT_CONN_IN_MEMORY 0x00000020 +#define WT_CONN_LAS_OPEN 0x00000040 +#define WT_CONN_LEAK_MEMORY 0x00000080 #define WT_CONN_LSM_MERGE 0x00000100 #define WT_CONN_PANIC 0x00000200 #define WT_CONN_READONLY 0x00000400 #define WT_CONN_RECOVERING 0x00000800 #define WT_CONN_SERVER_ASYNC 0x00001000 #define WT_CONN_SERVER_CHECKPOINT 0x00002000 -#define WT_CONN_SERVER_LSM 0x00004000 -#define WT_CONN_SERVER_RUN 0x00008000 +#define WT_CONN_SERVER_LOG 0x00004000 +#define WT_CONN_SERVER_LSM 0x00008000 #define WT_CONN_SERVER_STATISTICS 0x00010000 #define WT_CONN_SERVER_SWEEP 0x00020000 #define WT_CONN_WAS_BACKUP 0x00040000 diff --git a/src/include/lsm.h b/src/include/lsm.h index 2bbb813bad2..e3f6897ef9d 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -23,11 +23,14 @@ struct __wt_lsm_worker_cookie { struct __wt_lsm_worker_args { WT_SESSION_IMPL *session; /* Session */ WT_CONDVAR *work_cond; /* Owned by the manager */ + wt_thread_t tid; /* Thread id */ + bool tid_set; /* Thread id set */ + u_int id; /* My manager slot id */ uint32_t type; /* Types of operations handled */ -#define WT_LSM_WORKER_RUN 0x01 - uint32_t flags; /* Worker flags */ + + volatile bool running; /* Worker is running */ }; /* @@ -162,6 +165,9 @@ struct __wt_lsm_manager { #define WT_LSM_MAX_WORKERS 20 #define WT_LSM_MIN_WORKERS 3 WT_LSM_WORKER_ARGS lsm_worker_cookies[WT_LSM_MAX_WORKERS]; + +#define WT_LSM_MANAGER_SHUTDOWN 0x01 /* Manager has shut down */ + uint32_t flags; }; /* diff --git 
a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index 6dc06146179..e33e119aa41 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -89,7 +89,6 @@ __lsm_general_worker_start(WT_SESSION_IMPL *session) if (manager->lsm_workers % 2 == 0) FLD_SET(worker_args->type, WT_LSM_WORK_MERGE); } - F_SET(worker_args, WT_LSM_WORKER_RUN); WT_RET(__wt_lsm_worker_start(session, worker_args)); } @@ -129,17 +128,13 @@ __lsm_stop_workers(WT_SESSION_IMPL *session) manager->lsm_workers--) { worker_args = &manager->lsm_worker_cookies[manager->lsm_workers - 1]; - /* - * Clear this worker's flag so it stops. - */ - F_CLR(worker_args, WT_LSM_WORKER_RUN); - WT_ASSERT(session, worker_args->tid != 0); - WT_RET(__wt_thread_join(session, worker_args->tid)); - worker_args->tid = 0; + WT_ASSERT(session, worker_args->tid_set); + + WT_RET(__wt_lsm_worker_stop(session, worker_args)); worker_args->type = 0; - worker_args->flags = 0; + /* - * We do not clear the session because they are allocated + * We do not clear the other fields because they are allocated * statically when the connection was opened. */ } @@ -237,12 +232,12 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) manager->lsm_worker_cookies[i].session = worker_session; } + F_SET(conn, WT_CONN_SERVER_LSM); + /* Start the LSM manager thread. */ WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid, __lsm_worker_manager, &manager->lsm_worker_cookies[0])); - F_SET(conn, WT_CONN_SERVER_LSM); - if (0) { err: for (i = 0; (worker_session = @@ -289,13 +284,18 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) manager = &conn->lsm_manager; removed = 0; + /* + * Clear the LSM server flag and flush to ensure running threads see + * the state change. + */ + F_CLR(conn, WT_CONN_SERVER_LSM); + WT_FULL_BARRIER(); + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || manager->lsm_workers == 0); if (manager->lsm_workers > 0) { - /* - * Stop the main LSM manager thread first. 
- */ - while (F_ISSET(conn, WT_CONN_SERVER_LSM)) + /* Wait for the main LSM manager thread to finish. */ + while (!F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN)) __wt_yield(); /* Clean up open LSM handles. */ @@ -303,7 +303,6 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) WT_TRET(__wt_thread_join( session, manager->lsm_worker_cookies[0].tid)); - manager->lsm_worker_cookies[0].tid = 0; /* Release memory from any operations left on the queue. */ while ((current = TAILQ_FIRST(&manager->switchqh)) != NULL) { @@ -342,7 +341,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) /* * __lsm_manager_worker_shutdown -- - * Shutdown the LSM manager and worker threads. + * Shutdown the LSM worker threads. */ static int __lsm_manager_worker_shutdown(WT_SESSION_IMPL *session) @@ -354,14 +353,13 @@ __lsm_manager_worker_shutdown(WT_SESSION_IMPL *session) manager = &S2C(session)->lsm_manager; /* - * Wait for the rest of the LSM workers to shutdown. Stop at index + * Wait for the rest of the LSM workers to shutdown. Start at index * one - since we (the manager) are at index 0. 
*/ for (i = 1; i < manager->lsm_workers; i++) { - WT_ASSERT(session, manager->lsm_worker_cookies[i].tid != 0); - __wt_cond_signal(session, manager->work_cond); - WT_TRET(__wt_thread_join( - session, manager->lsm_worker_cookies[i].tid)); + WT_ASSERT(session, manager->lsm_worker_cookies[i].tid_set); + WT_TRET(__wt_lsm_worker_stop( + session, &manager->lsm_worker_cookies[i])); } return (ret); } @@ -383,7 +381,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) conn = S2C(session); dhandle_locked = false; - while (F_ISSET(conn, WT_CONN_SERVER_RUN)) { + while (F_ISSET(conn, WT_CONN_SERVER_LSM)) { __wt_sleep(0, 10000); if (TAILQ_EMPTY(&conn->lsmqh)) continue; @@ -469,11 +467,13 @@ static WT_THREAD_RET __lsm_worker_manager(void *arg) { WT_DECL_RET; + WT_LSM_MANAGER *manager; WT_LSM_WORKER_ARGS *cookie; WT_SESSION_IMPL *session; cookie = (WT_LSM_WORKER_ARGS *)arg; session = cookie->session; + manager = &S2C(session)->lsm_manager; WT_ERR(__lsm_general_worker_start(session)); WT_ERR(__lsm_manager_run_server(session)); @@ -482,7 +482,11 @@ __lsm_worker_manager(void *arg) if (ret != 0) { err: WT_PANIC_MSG(session, ret, "LSM worker manager thread error"); } - F_CLR(S2C(session), WT_CONN_SERVER_LSM); + + /* Connection close waits on us to shutdown, let it know we're done. 
*/ + F_SET(manager, WT_LSM_MANAGER_SHUTDOWN); + WT_FULL_BARRIER(); + return (WT_THREAD_RET_VALUE); } diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index ffa00c0a5e7..1cabbd4888d 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -21,7 +21,23 @@ __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) { __wt_verbose(session, WT_VERB_LSM_MANAGER, "Start LSM worker %u type %#" PRIx32, args->id, args->type); - return (__wt_thread_create(session, &args->tid, __lsm_worker, args)); + + args->running = true; + WT_RET(__wt_thread_create(session, &args->tid, __lsm_worker, args)); + args->tid_set = true; + return (0); +} + +/* + * __wt_lsm_worker_stop -- + * A wrapper around the LSM worker thread stop. + */ +int +__wt_lsm_worker_stop(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) +{ + args->running = false; + args->tid_set = false; + return (__wt_thread_join(session, args->tid)); } /* @@ -84,7 +100,6 @@ err: __wt_lsm_manager_free_work_unit(session, entry); static WT_THREAD_RET __lsm_worker(void *arg) { - WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LSM_WORK_UNIT *entry; WT_LSM_WORKER_ARGS *cookie; @@ -93,11 +108,9 @@ __lsm_worker(void *arg) cookie = (WT_LSM_WORKER_ARGS *)arg; session = cookie->session; - conn = S2C(session); entry = NULL; - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(cookie, WT_LSM_WORKER_RUN)) { + while (cookie->running) { progress = false; /* diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c index 85d43f10a33..18e4c347436 100644 --- a/src/os_posix/os_thread.c +++ b/src/os_posix/os_thread.c @@ -18,6 +18,13 @@ __wt_thread_create(WT_SESSION_IMPL *session, { WT_DECL_RET; + /* + * Creating a thread isn't a memory barrier, but WiredTiger commonly + * sets flags and or state and then expects worker threads to start. + * Include a barrier to ensure safety in those cases. + */ + WT_FULL_BARRIER(); + /* Spawn a new thread of control. 
*/ WT_SYSCALL_RETRY(pthread_create(tidret, NULL, func, arg), ret); if (ret == 0) @@ -34,6 +41,13 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) { WT_DECL_RET; + /* + * Joining a thread isn't a memory barrier, but WiredTiger commonly + * sets flags and or state and then expects worker threads to halt. + * Include a barrier to ensure safety in those cases. + */ + WT_FULL_BARRIER(); + WT_SYSCALL(pthread_join(tid, NULL), ret); if (ret == 0) return (0); diff --git a/src/os_win/os_thread.c b/src/os_win/os_thread.c index 7442fb08a36..4c8f212bb4f 100644 --- a/src/os_win/os_thread.c +++ b/src/os_win/os_thread.c @@ -16,6 +16,13 @@ int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) { + /* + * Creating a thread isn't a memory barrier, but WiredTiger commonly + * sets flags and or state and then expects worker threads to start. + * Include a barrier to ensure safety in those cases. + */ + WT_FULL_BARRIER(); + /* Spawn a new thread of control. */ *tidret = (HANDLE)_beginthreadex(NULL, 0, func, arg, 0, NULL); if (*tidret != 0) @@ -33,6 +40,13 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) { DWORD windows_error; + /* + * Joining a thread isn't a memory barrier, but WiredTiger commonly + * sets flags and or state and then expects worker threads to halt. + * Include a barrier to ensure safety in those cases. 
+ */ + WT_FULL_BARRIER(); + if ((windows_error = WaitForSingleObject(tid, INFINITE)) != WAIT_OBJECT_0) { if (windows_error == WAIT_FAILED) diff --git a/src/session/session_api.c b/src/session/session_api.c index 51233e5e224..b7daf0e2e02 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -1502,7 +1502,7 @@ __transaction_sync_run_chk(WT_SESSION_IMPL *session) conn = S2C(session); - return (FLD_ISSET(conn->flags, WT_CONN_LOG_SERVER_RUN)); + return (FLD_ISSET(conn->flags, WT_CONN_SERVER_LOG)); } /* @@ -1812,7 +1812,7 @@ __open_session(WT_CONNECTION_IMPL *conn, * closes the connection. This is particularly intended to catch * cases where server threads open sessions. */ - WT_ASSERT(session, F_ISSET(conn, WT_CONN_SERVER_RUN)); + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_CLOSING)); /* Find the first inactive session slot. */ for (session_ret = conn->sessions, -- cgit v1.2.1 From 478c69bde8244349bf2b41505a877889fef3c500 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 30 Mar 2017 01:39:11 -0400 Subject: WT-2439 Enhance reconciliation page layout (#3358) * Set minimum split pct to 50. * The leaf-page value dictionary stores cell offsets in the disk image, which implies a dictionary reset any time we hit a boundary or grow the disk image buffer. Recent changes broke that, we weren't resetting the dictionary when the disk image buffer was resized. Instead of clearing the dictionary on buffer resize, switch to using cell offsets in the dictionary instead of cell pointers. It's unlikely to be a big win for many workloads, but it might help some, and it's cleaner than resetting the dictionary more often. Add a verify of disk images we don't write: the I/O routines verify any image we write, but we need to verify any image we create. 
--- dist/api_data.py | 4 +- src/btree/bt_handle.c | 9 +- src/config/config_def.c | 16 +- src/include/btree.h | 6 + src/include/wiredtiger.in | 4 +- src/reconcile/rec_write.c | 1029 +++++++++++++++++++++++++-------------------- test/format/config.h | 2 +- 7 files changed, 591 insertions(+), 479 deletions(-) diff --git a/dist/api_data.py b/dist/api_data.py index 1d669fa7fe0..22600dd5e29 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -295,12 +295,12 @@ file_config = format_meta + file_runtime_config + [ Config('split_deepen_per_child', '0', r''' entries allocated per child when deepening the tree''', type='int', undoc=True), - Config('split_pct', '75', r''' + Config('split_pct', '90', r''' the Btree page split size as a percentage of the maximum Btree page size, that is, when a Btree page is split, it will be split into smaller pages, where each page is the specified percentage of the maximum Btree page size''', - min='25', max='100'), + min='50', max='100'), ] # File metadata, including both configurable and non-configurable (internal) diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 57e0a3422f2..d76720b19ae 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -788,9 +788,16 @@ __btree_page_sizes(WT_SESSION_IMPL *session) * Get the split percentage (reconciliation splits pages into smaller * than the maximum page size chunks so we don't split every time a * new entry is added). Determine how large newly split pages will be. + * Set to the minimum, if the read value is less than that. 
*/ WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval)); - btree->split_pct = (int)cval.val; + if (cval.val < WT_BTREE_MIN_SPLIT_PCT) { + btree->split_pct = WT_BTREE_MIN_SPLIT_PCT; + WT_RET(__wt_msg(session, + "Re-setting split_pct for %s to the minimum allowed of " + "%d%%.", session->dhandle->name, WT_BTREE_MIN_SPLIT_PCT)); + } else + btree->split_pct = (int)cval.val; intl_split_size = __wt_split_page_size(btree, btree->maxintlpage); leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage); diff --git a/src/config/config_def.c b/src/config/config_def.c index b11a8d63fdb..f152fbacad4 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -294,7 +294,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = { { "source", "string", NULL, NULL, NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "type", "string", NULL, NULL, NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, @@ -466,7 +466,7 @@ static const WT_CONFIG_CHECK confchk_file_config[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, @@ -530,7 +530,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 
}, @@ -614,7 +614,7 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, @@ -1119,7 +1119,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,source=,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=75,type=file,value_format=u", + "split_deepen_per_child=0,split_pct=90,type=file,value_format=u", confchk_WT_SESSION_create, 42 }, { "WT_SESSION.drop", @@ -1213,7 +1213,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=75,value_format=u", + "split_deepen_per_child=0,split_pct=90,value_format=u", confchk_file_config, 35 }, { "file.meta", @@ -1228,7 +1228,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0," "log=(enabled=true),memory_page_max=5MB,os_cache_dirty_max=0," "os_cache_max=0,prefix_compression=false,prefix_compression_min=4" - ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," + ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90," "value_format=u,version=(major=0,minor=0)", confchk_file_meta, 39 }, @@ -1253,7 +1253,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "merge_min=0),memory_page_max=5MB,old_chunks=," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,split_deepen_min_child=0," - 
"split_deepen_per_child=0,split_pct=75,value_format=u", + "split_deepen_per_child=0,split_pct=90,value_format=u", confchk_lsm_meta, 39 }, { "table.meta", diff --git a/src/include/btree.h b/src/include/btree.h index 88312f408cc..28fe1b94b23 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -57,6 +57,12 @@ /* Evict pages if we see this many consecutive deleted records. */ #define WT_BTREE_DELETE_THRESHOLD 1000 +/* + * Minimum size of the chunks (in percentage of the page size) a page gets split + * into during reconciliation. + */ +#define WT_BTREE_MIN_SPLIT_PCT 50 + /* * WT_BTREE -- * A btree handle. diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 558e93d3de0..707159ef6ae 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1242,8 +1242,8 @@ struct __wt_session { * @config{split_pct, the Btree page split size as a percentage of the * maximum Btree page size\, that is\, when a Btree page is split\, it * will be split into smaller pages\, where each page is the specified - * percentage of the maximum Btree page size., an integer between 25 and - * 100; default \c 75.} + * percentage of the maximum Btree page size., an integer between 50 and + * 100; default \c 90.} * @config{type, set the type of data source used to store a column * group\, index or simple table. By default\, a \c "file:" URI is * derived from the object name. 
The \c type configuration can be used diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 23f654caa70..6f95b84d292 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -26,6 +26,11 @@ typedef struct { uint32_t flags; /* Caller's configuration */ WT_ITEM disk_image; /* Temporary disk-image buffer */ + /* + * Temporary buffer used to write out a disk image when managing two + * chunks worth of data in memory + */ + WT_ITEM *interim_buf; /* * Track start/stop write generation to decide if all changes to the @@ -127,6 +132,7 @@ typedef struct { * repeatedly split a packed page. */ uint32_t split_size; /* Split page size */ + uint32_t min_split_size; /* Minimum split page size */ /* * The problem with splits is we've done a lot of work by the time we @@ -151,16 +157,6 @@ typedef struct { */ size_t offset; /* Split's first byte */ - /* - * The recno and entries fields are the starting record number - * of the split chunk (for column-store splits), and the number - * of entries in the split chunk. These fields are used both - * to write the split chunk, and to create a new internal page - * to reference the split pages. - */ - uint64_t recno; /* Split's starting record */ - uint32_t entries; /* Split's entries */ - WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t checksum; /* Split's checksum */ @@ -182,39 +178,42 @@ typedef struct { size_t supd_allocated; /* + * While reconciling pages, at any given time, we maintain two + * split chunks in the memory to be written out as pages. As we + * get to the last two chunks, if the last one turns out to be + * smaller than the minimum split size, we go back into the + * penultimate chunk and split at this minimum split size + * boundary. This moves some data from the penultimate chunk to + * the last chunk, hence increasing the size of the last page + * written without decreasing the penultimate page size beyond + * the minimum split size. 
For this reason, we maintain both a + * maximum split percentage boundary and a minimum split + * percentage boundary. + * + * The recno and entries fields are the starting record number + * of the split chunk (for column-store splits), and the number + * of entries in the split chunk. These fields are used both to + * write the split chunk, and to create a new internal page to + * reference the split pages. + * * The key for a row-store page; no column-store key is needed * because the page's recno, stored in the recno field, is the * column-store key. */ - WT_ITEM key; /* Promoted row-store key */ + uint32_t max_bnd_entries; + uint64_t max_bnd_recno; + WT_ITEM max_bnd_key; + + size_t min_bnd_offset; + uint32_t min_bnd_entries; + uint64_t min_bnd_recno; + WT_ITEM min_bnd_key; } *bnd; /* Saved boundaries */ uint32_t bnd_next; /* Next boundary slot */ uint32_t bnd_next_max; /* Maximum boundary slots used */ size_t bnd_entries; /* Total boundary slots */ size_t bnd_allocated; /* Bytes allocated */ - /* - * We track the total number of page entries copied into split chunks - * so we can easily figure out how many entries in the current split - * chunk. - */ - uint32_t total_entries; /* Total entries in splits */ - - /* - * And there's state information as to where in this process we are: - * (1) tracking split boundaries because we can still fit more split - * chunks into the maximum page size, (2) tracking the maximum page - * size boundary because we can't fit any more split chunks into the - * maximum page size, (3) not performing boundary checks because it's - * either not useful with the current page size configuration, or - * because we've already been forced to split. 
- */ - enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */ - SPLIT_MAX=1, /* Next: the maximum page boundary */ - SPLIT_TRACKING_OFF=2, /* No boundary checks */ - SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */ - bnd_state; - /* * We track current information about the current record number, the * number of entries copied into the temporary buffer, where we are @@ -226,6 +225,8 @@ typedef struct { uint32_t entries; /* Current number of entries */ uint8_t *first_free; /* Current first free byte */ size_t space_avail; /* Remaining space in this chunk */ + /* Remaining space in this chunk to put a minimum size boundary */ + size_t min_space_avail; /* * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and @@ -247,15 +248,14 @@ typedef struct { /* * WT_DICTIONARY -- - * We optionally build a dictionary of row-store values for leaf - * pages. Where two value cells are identical, only write the value - * once, the second and subsequent copies point to the original cell. - * The dictionary is fixed size, but organized in a skip-list to make - * searches faster. + * We optionally build a dictionary of values for leaf pages. Where + * two value cells are identical, only write the value once, the second + * and subsequent copies point to the original cell. The dictionary is + * fixed size, but organized in a skip-list to make searches faster. 
*/ struct __rec_dictionary { uint64_t hash; /* Hash value */ - void *cell; /* Matching cell */ + uint32_t offset; /* Matching cell */ u_int depth; /* Skiplist */ WT_DICTIONARY *next[0]; @@ -293,6 +293,13 @@ typedef struct { uint32_t tested_ref_state; /* Debugging information */ } WT_RECONCILE; +#define WT_CROSSING_MIN_BND(r, next_len) \ + ((r)->bnd[(r)->bnd_next].min_bnd_offset == 0 && \ + (next_len) > (r)->min_space_avail) +#define WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail) +#define WT_CHECK_CROSSING_BND(r, next_len) \ + (WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len)) + static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, bool); static void __rec_cell_build_addr(WT_SESSION_IMPL *, WT_RECONCILE *, const void *, size_t, u_int, uint64_t); @@ -314,6 +321,7 @@ static int __rec_col_var(WT_SESSION_IMPL *, static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *, WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t); static int __rec_destroy_session(WT_SESSION_IMPL *); +static uint32_t __rec_min_split_page_size(WT_BTREE *, uint32_t); static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_row_leaf(WT_SESSION_IMPL *, @@ -323,7 +331,6 @@ static int __rec_row_leaf_insert( static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *); -static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_row_promote( WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t); @@ -968,6 +975,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) *(WT_RECONCILE **)reconcilep = NULL; __wt_buf_free(session, &r->disk_image); + __wt_scr_free(session, &r->interim_buf); 
__wt_free(session, r->raw_entries); __wt_free(session, r->raw_offsets); @@ -1032,7 +1040,8 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy) __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->disk_image); __wt_free(session, bnd->supd); - __wt_buf_free(session, &bnd->key); + __wt_buf_free(session, &bnd->max_bnd_key); + __wt_buf_free(session, &bnd->min_bnd_key); } __wt_free(session, r->bnd); r->bnd_next = 0; @@ -1717,6 +1726,17 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) r->entries += v; r->space_avail -= size; r->first_free += size; + + /* + * If offset for the minimum split size boundary is not set, we have not + * yet reached the minimum boundary, reduce the space available for it. + */ + if (r->bnd[r->bnd_next].min_bnd_offset == 0) { + if (r->min_space_avail >= size) + r->min_space_avail -= size; + else + r->min_space_avail = 0; + } } /* @@ -1781,16 +1801,22 @@ __rec_dict_replace( return (0); /* - * If the dictionary cell reference is not set, we're creating a new - * entry in the dictionary, update its location. + * If the dictionary offset isn't set, we're creating a new entry in the + * dictionary, set its location. * - * If the dictionary cell reference is set, we have a matching value. - * Create a copy cell instead. + * If the dictionary offset is set, we have a matching value. Create a + * copy cell instead. */ - if (dp->cell == NULL) - dp->cell = r->first_free; + if (dp->offset == 0) + dp->offset = WT_PTRDIFF32(r->first_free, r->disk_image.mem); else { - offset = WT_PTRDIFF(r->first_free, dp->cell); + /* + * The offset is the byte offset from this cell to the previous, + * matching cell, NOT the byte offset from the beginning of the + * page. 
+ */ + offset = (uint64_t)WT_PTRDIFF(r->first_free, + (uint8_t *)r->disk_image.mem + dp->offset); val->len = val->cell_len = __wt_cell_pack_copy(&val->cell, rle, offset); val->buf.data = NULL; @@ -1927,8 +1953,8 @@ static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { bnd->offset = 0; - bnd->recno = WT_RECNO_OOB; - bnd->entries = 0; + bnd->max_bnd_recno = WT_RECNO_OOB; + bnd->max_bnd_entries = 0; __wt_free(session, bnd->addr.addr); WT_CLEAR(bnd->addr); @@ -1943,6 +1969,10 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) bnd->already_compressed = false; + bnd->min_bnd_offset = 0; + bnd->min_bnd_entries = 0; + bnd->min_bnd_recno = WT_RECNO_OOB; + /* * Don't touch the key, we re-use that memory in each new * reconciliation. @@ -1974,39 +2004,63 @@ __rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * __wt_split_page_size -- - * Split page size calculation: we don't want to repeatedly split every - * time a new entry is added, so we split to a smaller-than-maximum page size. + * __rec_split_page_size_from_pct -- + * Given a split percentage, calculate split page size in bytes. */ -uint32_t -__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) -{ +static uint32_t +__rec_split_page_size_from_pct( + int split_pct, uint32_t maxpagesize, uint32_t allocsize) { uintmax_t a; uint32_t split_size; /* * Ideally, the split page size is some percentage of the maximum page - * size rounded to an allocation unit (round to an allocation unit so - * we don't waste space when we write). + * size rounded to an allocation unit (round to an allocation unit so we + * don't waste space when we write). */ a = maxpagesize; /* Don't overflow. */ split_size = (uint32_t)WT_ALIGN_NEAREST( - (a * (u_int)btree->split_pct) / 100, btree->allocsize); + (a * (u_int)split_pct) / 100, allocsize); /* - * Respect the configured split percentage if the calculated split - * size is either zero or a full page. 
The user has either configured - * an allocation size that matches the page size, or a split - * percentage that is close to zero or one hundred. Rounding is going - * to provide a worse outcome than having a split point that doesn't - * fall on an allocation size boundary in those cases. + * Respect the configured split percentage if the calculated split size + * is either zero or a full page. The user has either configured an + * allocation size that matches the page size, or a split percentage + * that is close to zero or one hundred. Rounding is going to provide a + * worse outcome than having a split point that doesn't fall on an + * allocation size boundary in those cases. */ if (split_size == 0 || split_size == maxpagesize) - split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100); + split_size = (uint32_t)((a * (u_int)split_pct) / 100); return (split_size); } +/* + * __wt_split_page_size -- + * Split page size calculation: we don't want to repeatedly split every + * time a new entry is added, so we split to a smaller-than-maximum page size. + */ +uint32_t +__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ + return (__rec_split_page_size_from_pct( + btree->split_pct, maxpagesize, btree->allocsize)); +} + +/* + * __rec_min_split_page_size -- + * Minimum split size boundary calculation: To track a boundary at the + * minimum split size that we could have split at instead of splitting at + * the split page size. + */ +static uint32_t +__rec_min_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ + return (__rec_split_page_size_from_pct( + WT_BTREE_MIN_SPLIT_PCT, maxpagesize, btree->allocsize)); +} + /* * __rec_split_init -- * Initialization for the reconciliation split functions. 
@@ -2018,7 +2072,7 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_BM *bm; WT_BTREE *btree; WT_PAGE_HEADER *dsk; - size_t corrected_page_size; + size_t corrected_page_size, disk_img_buf_size; btree = S2BT(session); bm = btree->bm; @@ -2053,33 +2107,6 @@ __rec_split_init(WT_SESSION_IMPL *session, r->max_raw_page_size = r->page_size = (uint32_t)WT_MIN(r->page_size * 10, WT_MAX(r->page_size, btree->maxmempage / 2)); - - /* - * Ensure the disk image buffer is large enough for the max object, as - * corrected by the underlying block manager. - */ - corrected_page_size = r->page_size; - WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size)); - - /* - * Clear the disk page header to ensure all of it is initialized, even - * the unused fields. - * - * In the case of fixed-length column-store, clear the entire buffer: - * fixed-length column-store sets bits in bytes, where the bytes are - * assumed to initially be 0. - */ - memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? - corrected_page_size : WT_PAGE_HEADER_SIZE); - - /* - * Set the page type (the type doesn't change, and setting it later - * would require additional code in a few different places). - */ - dsk = r->disk_image.mem; - dsk->type = page->type; - /* * If we have to split, we want to choose a smaller page size for the * split pages, because otherwise we could end up splitting one large @@ -2099,22 +2126,28 @@ __rec_split_init(WT_SESSION_IMPL *session, * creating overflow items and compacted data, for example, as those * items have already been written to disk). So, the loop calls the * helper functions when approaching a split boundary, and we save the - * information at that point. That allows us to go back and split the - * page at the boundary points if we eventually overflow the maximum - * page size. + * information at that point. We also save the boundary information at + * the minimum split size. 
We maintain two chunks (each boundary + * represents a chunk that gets written as a page) in the memory, + * writing out the older one to the disk as a page when we need to make + * space for a new chunk. On reaching the last chunk, if it turns out to + * be smaller than the minimum split size, we go back into the + * penultimate chunk and split at this minimum split size boundary. This + * moves some data from the penultimate chunk to the last chunk, hence + * increasing the size of the last page written without decreasing the + * penultimate page size beyond the minimum split size. * * Finally, all this doesn't matter for fixed-size column-store pages, * raw compression, and salvage. Fixed-size column store pages can * split under (very) rare circumstances, but they're allocated at a * fixed page size, never anything smaller. In raw compression, the - * underlying compression routine decides when we split, so it's not - * our problem. In salvage, as noted above, we can't split at all. + * underlying compression routine decides when we split, so it's not our + * problem. In salvage, as noted above, we can't split at all. */ if (r->raw_compression || r->salvage != NULL) { r->split_size = 0; r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - } - else if (page->type == WT_PAGE_COL_FIX) { + } else if (page->type == WT_PAGE_COL_FIX) { r->split_size = r->page_size; r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); @@ -2122,32 +2155,55 @@ __rec_split_init(WT_SESSION_IMPL *session, r->split_size = __wt_split_page_size(btree, r->page_size); r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + r->min_split_size = + __rec_min_split_page_size(btree, r->page_size); + r->min_space_avail = + r->min_split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); } + + /* + * Ensure the disk image buffer is large enough for the max object, as + * corrected by the underlying block manager. 
+ * + * The buffer that we build disk image in, needs to hold two chunks + * worth of data. Since we want to support split_size more than the page + * size (to allow for adjustments based on the compression), this buffer + * should be greater of twice of split_size and page_size. + */ + corrected_page_size = r->page_size; + disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size); + WT_RET(bm->write_size(bm, session, &corrected_page_size)); + WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size)); + + /* + * Clear the disk page header to ensure all of it is initialized, even + * the unused fields. + * + * In the case of fixed-length column-store, clear the entire buffer: + * fixed-length column-store sets bits in bytes, where the bytes are + * assumed to initially be 0. + */ + memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? + disk_img_buf_size : WT_PAGE_HEADER_SIZE); + + /* + * Set the page type (the type doesn't change, and setting it later + * would require additional code in a few different places). + */ + dsk = r->disk_image.mem; + dsk->type = page->type; + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); /* Initialize the first boundary. */ r->bnd_next = 0; WT_RET(__rec_split_bnd_grow(session, r)); __rec_split_bnd_init(session, &r->bnd[0]); - r->bnd[0].recno = recno; + r->bnd[0].max_bnd_recno = recno; r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree); - /* - * If the maximum page size is the same as the split page size, either - * because of the object type or application configuration, there isn't - * any need to maintain split boundaries within a larger page. - * - * No configuration for salvage here, because salvage can't split. - */ - if (r->raw_compression) - r->bnd_state = SPLIT_TRACKING_RAW; - else if (max == r->split_size) - r->bnd_state = SPLIT_TRACKING_OFF; - else - r->bnd_state = SPLIT_BOUNDARY; - - /* Initialize the entry counters. */ - r->entries = r->total_entries = 0; + /* Initialize the entry counter. 
*/ + r->entries = 0; /* Initialize the starting record number. */ r->recno = recno; @@ -2350,19 +2406,112 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) { WT_BM *bm; WT_BTREE *btree; - size_t corrected_page_size, len; + size_t corrected_page_size, inuse, len; btree = S2BT(session); bm = btree->bm; len = WT_PTRDIFF(r->first_free, r->disk_image.mem); - corrected_page_size = len + add_len; + inuse = (len - r->bnd[r->bnd_next].offset) + + WT_PAGE_HEADER_BYTE_SIZE(btree); + corrected_page_size = inuse + add_len; + WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_grow(session, &r->disk_image, corrected_page_size)); + /* Need to account for buffer carrying two chunks worth of data */ + WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size)); + r->first_free = (uint8_t *)r->disk_image.mem + len; - WT_ASSERT(session, corrected_page_size >= len); - r->space_avail = corrected_page_size - len; + WT_ASSERT(session, corrected_page_size >= inuse); + r->space_avail = corrected_page_size - inuse; WT_ASSERT(session, r->space_avail >= add_len); + + return (0); +} + +/* + * __rec_split_write_prev_and_shift_cur -- + * Write the previous split chunk to the disk as a page. Shift the contents + * of the current chunk to the start of the buffer, making space for a new + * chunk to be written. + * If the caller asks for a chunk resizing, the boundary between the two + * chunks is readjusted to the minimum split size boundary details stored + * in the previous chunk, letting the current chunk grow at the cost of the + * previous chunk. 
+ */ +static int +__rec_split_write_prev_and_shift_cur( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks) +{ + WT_BM *bm; + WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk, *dsk_tmp; + size_t cur_len, len; + uint8_t *dsk_start; + + WT_ASSERT(session, r->bnd_next != 0); + + btree = S2BT(session); + bm = btree->bm; + bnd_cur = &r->bnd[r->bnd_next]; + bnd_prev = bnd_cur - 1; + dsk = r->disk_image.mem; + cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + + /* + * Resize chunks if the current is smaller than the minimum, and there + * are details on the minimum split size boundary available in the + * previous boundary details. + * + * There is a possibility that we do not have a minimum boundary set, in + * such a case we skip chunk resizing. Such a condition is possible for + * instance when we are building the image in the buffer and the first + * K/V pair is large enough that it surpasses both the minimum split + * size and the split size the application has set. In such a case we + * split the chunk without saving any minimum boundary. + */ + if (resize_chunks && + cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) { + bnd_cur->offset = bnd_prev->min_bnd_offset; + bnd_cur->max_bnd_entries += + bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries; + bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; + bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; + + WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key, + bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size)); + + /* Update current chunk's length */ + cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + } + + /* + * Create an interim buffer if not already done to prepare the previous + * chunk's disk image. 
+ */ + len = bnd_cur->offset; + WT_RET(bm->write_size(bm, session, &len)); + if (r->interim_buf == NULL) + WT_RET(__wt_scr_alloc(session, len, &r->interim_buf)); + else + WT_RET(__wt_buf_init(session, r->interim_buf, len)); + + dsk_tmp = r->interim_buf->mem; + memcpy(dsk_tmp, dsk, bnd_cur->offset); + dsk_tmp->recno = bnd_prev->max_bnd_recno; + dsk_tmp->u.entries = bnd_prev->max_bnd_entries; + dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset); + r->interim_buf->size = dsk_tmp->mem_size; + WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false)); + + /* Shift the current chunk to the start of the buffer */ + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len); + + /* Fix boundary offset */ + bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree); + /* Fix where free points */ + r->first_free = dsk_start + cur_len; return (0); } @@ -2382,6 +2531,9 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) btree = S2BT(session); dsk = r->disk_image.mem; + /* Fixed length col store can call with next_len 0 */ + WT_ASSERT(session, next_len == 0 || r->space_avail < next_len); + /* * We should never split during salvage, and we're about to drop core * because there's no parent page. @@ -2391,147 +2543,63 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); - /* Hitting a page boundary resets the dictionary, in all cases. */ - __rec_dictionary_reset(r); - - inuse = WT_PTRDIFF(r->first_free, dsk); - switch (r->bnd_state) { - case SPLIT_BOUNDARY: - /* - * We can get here if the first key/value pair won't fit. - * Additionally, grow the buffer to contain the current item if - * we haven't already consumed a reasonable portion of a split - * chunk. 
- */ - if (inuse < r->split_size / 2) - break; - - /* - * About to cross a split boundary but not yet forced to split - * into multiple pages. If we have to split, this is one of the - * split points, save information about where we are when the - * split would have happened. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - last = &r->bnd[r->bnd_next++]; - next = last + 1; - - /* Set the number of entries for the just finished chunk. */ - last->entries = r->entries - r->total_entries; - r->total_entries = r->entries; - - /* Set the key for the next chunk. */ - next->recno = r->recno; - if (dsk->type == WT_PAGE_ROW_INT || - dsk->type == WT_PAGE_ROW_LEAF) - WT_RET(__rec_split_row_promote( - session, r, &next->key, dsk->type)); - - /* - * Set the starting buffer offset and clear the entries (the - * latter not required, but cleaner). - */ - next->offset = WT_PTRDIFF(r->first_free, dsk); - next->entries = 0; - - /* Set the space available to another split-size chunk. */ - r->space_avail = - r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - - /* - * Adjust the space available to handle two cases: - * - We don't have enough room for another full split-size - * chunk on the page. - * - We chose to fill past a page boundary because of a - * large item. - */ - if (inuse + r->space_avail > r->page_size) { - r->space_avail = - r->page_size > inuse ? (r->page_size - inuse) : 0; - - /* There are no further boundary points. */ - r->bnd_state = SPLIT_MAX; - } - - /* - * Return if the next object fits into this page, else we have - * to split the page. - */ - if (r->space_avail >= next_len) - return (0); - - /* FALLTHROUGH */ - case SPLIT_MAX: - /* - * We're going to have to split and create multiple pages. - * - * Cycle through the saved split-point information, writing the - * split chunks we have tracked. The underlying fixup function - * sets the space available and other information, and copied - * any unwritten chunk of data to the beginning of the buffer. 
- */ - WT_RET(__rec_split_fixup(session, r)); - - /* We're done saving split chunks. */ - r->bnd_state = SPLIT_TRACKING_OFF; - break; - case SPLIT_TRACKING_OFF: - /* - * We can get here if the first key/value pair won't fit. - * Additionally, grow the buffer to contain the current item if - * we haven't already consumed a reasonable portion of a split - * chunk. - */ - if (inuse < r->split_size / 2) - break; + last = &r->bnd[r->bnd_next]; + inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) + + WT_PAGE_HEADER_BYTE_SIZE(btree); - /* - * The key/value pairs didn't fit into a single page, but either - * we've already noticed that and are now processing the rest of - * the pairs at split size boundaries, or the split size was the - * same as the page size, and we never bothered with split point - * information at all. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - last = &r->bnd[r->bnd_next++]; - next = last + 1; + /* + * We can get here if the first key/value pair won't fit. + * Additionally, grow the buffer to contain the current item if we + * haven't already consumed a reasonable portion of a split chunk. + */ + if (inuse < r->split_size / 2) + goto done; - /* - * Set the key for the next chunk (before writing the block, a - * key range is needed in that code). - */ - next->recno = r->recno; - if (dsk->type == WT_PAGE_ROW_INT || - dsk->type == WT_PAGE_ROW_LEAF) - WT_RET(__rec_split_row_promote( - session, r, &next->key, dsk->type)); + /* All page boundaries reset the dictionary. */ + __rec_dictionary_reset(r); - /* Clear the entries (not required, but cleaner). */ - next->entries = 0; + /* Set the number of entries for the just finished chunk. */ + last->max_bnd_entries = r->entries; - /* Finalize the header information and write the page. */ - dsk->recno = last->recno; - dsk->u.entries = r->entries; - dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); + /* + * In case of bulk load, write out chunks as we get them. 
Otherwise we + * keep two chunks in memory at a given time. So, if there is a previous + * chunk, write it out, making space in the buffer for the next chunk to + * be written. + */ + if (r->is_bulk_load) { + dsk->recno = last->max_bnd_recno; + dsk->u.entries = last->max_bnd_entries; + dsk->mem_size = (uint32_t)inuse; r->disk_image.size = dsk->mem_size; - WT_RET( - __rec_split_write(session, r, last, &r->disk_image, false)); - - /* - * Set the caller's entry count and buffer information for the - * next chunk. We only get here if we're not splitting or have - * already split, so it's split-size chunks from here on out. - */ - r->entries = 0; + WT_RET(__rec_split_write( + session, r, last, &r->disk_image, false)); + /* Fix where free points */ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); - r->space_avail = - r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - break; - case SPLIT_TRACKING_RAW: - return (__wt_illegal_value(session, NULL)); - } + } else if (r->bnd_next != 0) + WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false)); - /* + /* Prepare the next boundary */ + WT_RET(__rec_split_bnd_grow(session, r)); + r->bnd_next++; + next = &r->bnd[r->bnd_next]; + next->offset = WT_PTRDIFF(r->first_free, dsk); + /* Set the key for the next chunk. */ + next->max_bnd_recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &next->max_bnd_key, dsk->type)); + + r->entries = 0; + /* + * Set the space available to another split-size and minimum split-size + * chunk. + */ + r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + r->min_space_avail = + r->min_split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + +done: /* * Overflow values can be larger than the maximum page size but still be * "on-page". 
If the next key/value pair is larger than space available * after a split has happened (in other words, larger than the maximum @@ -2548,6 +2616,64 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) return (0); } +/* + * __rec_split_crossing_bnd -- + * Save the details for the minimum split size boundary or call for a + * split. + */ +static inline int +__rec_split_crossing_bnd( + WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) +{ + WT_BOUNDARY *bnd; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk; + size_t min_bnd_offset; + + WT_ASSERT(session, WT_CHECK_CROSSING_BND(r, next_len)); + + /* + * If crossing the minimum split size boundary, store the boundary + * details at the current location in the buffer. If we are crossing the + * split boundary at the same time, possible when the next record is + * large enough, just split at this point. + */ + if (WT_CROSSING_MIN_BND(r, next_len) && + !WT_CROSSING_SPLIT_BND(r, next_len)) { + btree = S2BT(session); + bnd = &r->bnd[r->bnd_next]; + dsk = r->disk_image.mem; + min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) - + bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree); + if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree)) + /* + * This is possible if the first record doesn't fit in + * the minimum split size, we write this record without + * setting up any boundary here. We will get the + * opportunity to setup a boundary before writing out + * the next record. + */ + return (0); + + WT_ASSERT(session, bnd->min_bnd_offset == 0); + + /* All page boundaries reset the dictionary. 
*/ + __rec_dictionary_reset(r); + + bnd->min_bnd_offset = min_bnd_offset; + bnd->min_bnd_entries = r->entries; + bnd->min_bnd_recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || + dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &bnd->min_bnd_key, dsk->type)); + return (0); + } + + /* We are crossing a split boundary */ + return (__rec_split(session, r, next_len)); +} + /* * __rec_split_raw_worker -- * Handle the raw compression page reconciliation bookkeeping. @@ -2626,7 +2752,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, */ recno = WT_RECNO_OOB; if (dsk->type == WT_PAGE_COL_VAR) - recno = last->recno; + recno = last->max_bnd_recno; entry = max_image_slot = slots = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { @@ -2853,7 +2979,7 @@ no_slots: */ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP; dsk_dst = dst->mem; - dsk_dst->recno = last->recno; + dsk_dst->recno = last->max_bnd_recno; dsk_dst->mem_size = r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP; dsk_dst->u.entries = r->raw_entries[result_slots - 1]; @@ -2873,7 +2999,7 @@ no_slots: WT_RET(__wt_strndup(session, dsk, dsk_dst->mem_size, &last->disk_image)); disk_image = last->disk_image; - disk_image->recno = last->recno; + disk_image->recno = last->max_bnd_recno; disk_image->mem_size = dsk_dst->mem_size; disk_image->u.entries = dsk_dst->u.entries; } @@ -2903,14 +3029,14 @@ no_slots: */ switch (dsk->type) { case WT_PAGE_COL_INT: - next->recno = r->raw_recnos[result_slots]; + next->max_bnd_recno = r->raw_recnos[result_slots]; break; case WT_PAGE_COL_VAR: - next->recno = r->raw_recnos[result_slots - 1]; + next->max_bnd_recno = r->raw_recnos[result_slots - 1]; break; case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - next->recno = WT_RECNO_OOB; + next->max_bnd_recno = WT_RECNO_OOB; if (!last_block) { /* * Confirm there was uncompressed data remaining @@ -2919,7 +3045,7 @@ no_slots: */ WT_ASSERT(session, len > 0); WT_RET(__rec_split_row_promote_cell( - session, dsk, 
&next->key)); + session, dsk, &next->max_bnd_key)); } break; } @@ -2931,7 +3057,7 @@ no_slots: */ WT_STAT_DATA_INCR(session, compress_raw_fail); - dsk->recno = last->recno; + dsk->recno = last->max_bnd_recno; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); dsk->u.entries = r->entries; r->disk_image.size = dsk->mem_size; @@ -3008,35 +3134,9 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) static int __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BOUNDARY *bnd; + WT_BOUNDARY *bnd_cur, *bnd_prev; WT_PAGE_HEADER *dsk; - - /* Adjust the boundary information based on our split status. */ - switch (r->bnd_state) { - case SPLIT_BOUNDARY: - case SPLIT_MAX: - /* - * We never split, the reconciled page fit into a maximum page - * size. Change the first boundary slot to represent the full - * page (the first boundary slot is largely correct, just update - * the number of entries). - */ - r->bnd_next = 0; - break; - case SPLIT_TRACKING_OFF: - /* - * If we have already split, or aren't tracking boundaries, put - * the remaining data in the next boundary slot. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - break; - case SPLIT_TRACKING_RAW: - /* - * We were configured for raw compression, and either we never - * wrote anything, or there's a remaindered block of data. - */ - break; - } + bool grow_bnd; /* * We may arrive here with no entries to write if the page was entirely @@ -3063,20 +3163,66 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); } - /* Set the boundary reference and increment the count. */ - bnd = &r->bnd[r->bnd_next++]; - bnd->entries = r->entries; - - /* Finalize the header information. */ dsk = r->disk_image.mem; - dsk->recno = bnd->recno; - dsk->u.entries = r->entries; + + /* Set the number of entries for the just finished chunk. 
*/ + bnd_cur = &r->bnd[r->bnd_next]; + bnd_cur->max_bnd_entries = r->entries; + + grow_bnd = true; + /* + * We can reach here even with raw_compression when the last split chunk + * is too small to be sent for raw compression. + */ + if (!r->is_bulk_load && !r->raw_compression) { + if (WT_PTRDIFF(r->first_free, dsk) > r->page_size && + r->bnd_next != 0) { + /* + * We hold two boundaries worth of data in the buffer, + * and this data doesn't fit in a single page. If the + * last chunk is too small, readjust the boundary to a + * pre-computed minimum. + * Write out the penultimate chunk to the disk as a page + */ + WT_RET(__rec_split_write_prev_and_shift_cur( + session, r, true)); + } else + if (r->bnd_next != 0) { + /* + * We have two boundaries, but the data in the + * buffer can fit a single page. Merge the + * boundaries to create a single chunk. + */ + bnd_prev = bnd_cur - 1; + bnd_prev->max_bnd_entries += + bnd_cur->max_bnd_entries; + r->bnd_next--; + grow_bnd = false; + } + } + + /* + * We already have space for an extra boundary if we merged two + * boundaries above, in that case we do not need to grow the boundary + * structure. + */ + if (grow_bnd) + WT_RET(__rec_split_bnd_grow(session, r)); + bnd_cur = &r->bnd[r->bnd_next]; + r->bnd_next++; + + /* + * Current boundary now has all the remaining data/last page now. + * Let's write it to the disk + */ + dsk->recno = bnd_cur->max_bnd_recno; + dsk->u.entries = bnd_cur->max_bnd_entries; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); r->disk_image.size = dsk->mem_size; /* If this is a checkpoint, we're done, otherwise write the page. */ - return (__rec_is_checkpoint(session, r, bnd) ? - 0 : __rec_split_write(session, r, bnd, &r->disk_image, true)); + return (__rec_is_checkpoint(session, r, bnd_cur) ? 
+ 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true)); } /* @@ -3109,98 +3255,6 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (__rec_split_finish_std(session, r)); } -/* - * __rec_split_fixup -- - * Fix up after crossing the maximum page boundary. - */ -static int -__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) -{ - WT_BOUNDARY *bnd; - WT_BTREE *btree; - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_PAGE_HEADER *dsk; - size_t i, len; - uint8_t *dsk_start, *p; - - /* - * When we overflow physical limits of the page, we walk the list of - * split chunks we've created and write those pages out, then update - * the caller's information. - */ - btree = S2BT(session); - - /* - * The data isn't laid out on a page boundary or nul padded; copy it to - * a clean, aligned, padded buffer before writing it. - * - * Allocate a scratch buffer to hold the new disk image. Copy the disk - * page's header and block-manager space into the scratch buffer, most - * of the header information remains unchanged between the pages. - */ - WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp)); - dsk = tmp->mem; - memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_BYTE_SIZE(btree)); - - /* - * For each split chunk we've created, update the disk image and copy - * it into place. - */ - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) { - /* Copy the page contents to the temporary buffer. */ - len = (bnd + 1)->offset - bnd->offset; - memcpy(dsk_start, - (uint8_t *)r->disk_image.mem + bnd->offset, len); - - /* Finalize the header information and write the page. 
*/ - dsk->recno = bnd->recno; - dsk->u.entries = bnd->entries; - tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len; - dsk->mem_size = WT_STORE_SIZE(tmp->size); - WT_ERR(__rec_split_write(session, r, bnd, tmp, false)); - } - - /* - * There is probably a remnant in the working buffer that didn't get - * written, copy it down to the beginning of the working buffer. - * - * Confirm the remnant is no larger than a split-sized chunk, including - * header. We know that's the maximum sized remnant because we only have - * remnants if split switches from accumulating to a split boundary to - * accumulating to the end of the page (the other path here is when we - * hit a split boundary, there was room for another split chunk in the - * page, and the next item still wouldn't fit, in which case there is no - * remnant). So: we were accumulating to the end of the page and created - * a remnant. We know the remnant cannot be as large as a split-sized - * chunk, including header, because if there was room for that large a - * remnant, we wouldn't have switched from accumulating to a page end. - */ - p = (uint8_t *)r->disk_image.mem + bnd->offset; - len = WT_PTRDIFF(r->first_free, p); - if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - WT_PANIC_ERR(session, EINVAL, - "Reconciliation remnant too large for the split buffer"); - dsk = r->disk_image.mem; - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, p, len); - - /* - * Fix up our caller's information, including updating the starting - * record number. - */ - r->entries -= r->total_entries; - r->first_free = dsk_start + len; - WT_ASSERT(session, - r->page_size >= (WT_PAGE_HEADER_BYTE_SIZE(btree) + len)); - r->space_avail = - r->split_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); - -err: __wt_scr_free(session, &tmp); - return (ret); -} - /* * __rec_split_write -- * Write a disk block out for the split helper functions. 
@@ -3222,11 +3276,17 @@ __rec_split_write(WT_SESSION_IMPL *session, int cmp; uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE]; bool need_image; +#ifdef HAVE_DIAGNOSTIC + bool verify_image; +#endif btree = S2BT(session); dsk = buf->mem; page = r->page; mod = page->modify; +#ifdef HAVE_DIAGNOSTIC + verify_image = true; +#endif /* Set the zero-length value flag in the page header. */ if (dsk->type == WT_PAGE_ROW_LEAF) { @@ -3238,8 +3298,6 @@ __rec_split_write(WT_SESSION_IMPL *session, F_SET(dsk, WT_PAGE_EMPTY_V_NONE); } - bnd->entries = r->entries; - /* Initialize the address (set the page type for the parent). */ switch (dsk->type) { case WT_PAGE_COL_FIX: @@ -3285,7 +3343,8 @@ __rec_split_write(WT_SESSION_IMPL *session, switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: - if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno) + if (WT_INSERT_RECNO(supd->ins) >= + (bnd + 1)->max_bnd_recno) goto supd_check_complete; break; case WT_PAGE_ROW_LEAF: @@ -3296,8 +3355,8 @@ __rec_split_write(WT_SESSION_IMPL *session, key->data = WT_INSERT_KEY(supd->ins); key->size = WT_INSERT_KEY_SIZE(supd->ins); } - WT_ERR(__wt_compare(session, - btree->collator, key, &(bnd + 1)->key, &cmp)); + WT_ERR(__wt_compare(session, btree->collator, + key, &(bnd + 1)->max_bnd_key, &cmp)); if (cmp >= 0) goto supd_check_complete; break; @@ -3387,18 +3446,21 @@ supd_check_complete: #ifdef HAVE_VERBOSE /* Output a verbose message if we create a page without many entries */ - if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && r->entries < 6) + if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && + bnd->max_bnd_entries < 6) __wt_verbose(session, WT_VERB_SPLIT, "Reconciliation creating a page with %" PRIu32 " entries, memory footprint %" WT_SIZET_FMT - ", page count %" PRIu32 ", %s, split state: %d", - r->entries, r->page->memory_footprint, r->bnd_next, - F_ISSET(r, WT_EVICTING) ? 
"evict" : "checkpoint", - r->bnd_state); + ", page count %" PRIu32 ", %s", bnd->max_bnd_entries, + r->page->memory_footprint, r->bnd_next, + F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint"); #endif WT_ERR(__wt_bt_write(session, buf, addr, &addr_size, false, F_ISSET(r, WT_CHECKPOINTING), bnd->already_compressed)); +#ifdef HAVE_DIAGNOSTIC + verify_image = false; +#endif WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr)); bnd->addr.size = (uint8_t)addr_size; @@ -3425,9 +3487,20 @@ copy_image: */ need_image = F_ISSET(r, WT_EVICT_SCRUB) || (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL); - if (need_image && bnd->disk_image == NULL) + if (need_image && bnd->disk_image == NULL) { +#ifdef HAVE_DIAGNOSTIC + /* + * The I/O routines verify all disk images we write, but there + * are paths in reconciliation that don't do I/O. Verify those + * images, too. + */ + WT_ASSERT(session, verify_image == false || + __wt_verify_dsk_image( + session, "[reconcile-image]", buf->data, 0, true) == 0); +#endif WT_ERR(__wt_strndup( session, buf->data, buf->size, &bnd->disk_image)); + } if (!need_image) __wt_free(session, bnd->disk_image); @@ -3680,11 +3753,12 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) cursor->value.data, cursor->value.size, (uint64_t)0)); /* Boundary: split or write the page. 
*/ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) - WT_RET( - __rec_split_raw(session, r, key->len + val->len)); - else { + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) + WT_RET(__rec_split_raw( + session, r, key->len + val->len)); + } else + if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) { /* * Turn off prefix compression until a full key written * to the new page, and (unless already working with an @@ -3696,10 +3770,9 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_cell_build_leaf_key( session, r, NULL, 0, &ovfl_key)); } - - WT_RET(__rec_split(session, r, key->len + val->len)); + WT_RET(__rec_split_crossing_bnd( + session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -3740,6 +3813,10 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) * split. * * Boundary: split or write the page. + * + * No need to have a minimum split size boundary, all + * pages are filled 100% except the last, allowing it to + * grow in the future. */ __rec_incr(session, r, cbulk->entry, __bitstr_size( @@ -3844,10 +3921,12 @@ __wt_bulk_insert_var( r, cbulk->last.data, cbulk->last.size, cbulk->rle)); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CROSSING_SPLIT_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd(session, r, val->len)); /* Copy the value onto the page. */ if (btree->dictionary) @@ -3983,10 +4062,13 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_CHILD_RELEASE_ERR(session, hazard, ref); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_ERR(r->raw_compression ? 
- __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_ERR(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_ERR(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4028,10 +4110,13 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), r->recno); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4139,6 +4224,10 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) * split. * * Boundary: split or write the page. + * + * No need to have a minimum split size boundary, all + * pages are filled 100% except the last, allowing it to + * grow in the future. */ __rec_incr(session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); @@ -4295,10 +4384,13 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, session, r, value->data, value->size, rle)); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. 
*/ if (!deleted && !overflow_type && btree->dictionary) @@ -4961,11 +5053,12 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->cell_zero = false; /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* * In one path above, we copied address blocks * from the page rather than building the actual @@ -4977,10 +5070,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_IKEY_DATA(ikey), ikey->size)); key_onpage_ovfl = false; } - WT_ERR(__rec_split( + + WT_ERR(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5030,10 +5123,14 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, key->len + val->len) : - __rec_split(session, r, key->len + val->len)); + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) + WT_RET(__rec_split_raw( + session, r, key->len + val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, key->len + val->len)); /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5362,16 +5459,17 @@ build: } /* Boundary: split or write the page. 
*/ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* - * In one path above, we copied address blocks - * from the page rather than building the actual - * key. In that case, we have to build the key - * now because we are about to promote it. + * If we copied address blocks from the page + * rather than building the actual key, we have + * to build the key now because we are about to + * promote it. */ if (key_onpage_ovfl) { WT_ERR(__wt_dsk_cell_data_ref(session, @@ -5390,14 +5488,13 @@ build: if (!ovfl_key) WT_ERR( __rec_cell_build_leaf_key( - session, - r, NULL, 0, &ovfl_key)); + session, r, NULL, 0, + &ovfl_key)); } - WT_ERR(__rec_split( + WT_ERR(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -5460,11 +5557,12 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_RET(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* * Turn off prefix compression until a full key * written to the new page, and (unless already @@ -5476,14 +5574,13 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) if (!ovfl_key) WT_RET( __rec_cell_build_leaf_key( - session, - r, NULL, 0, &ovfl_key)); + session, r, NULL, 0, + &ovfl_key)); } - WT_RET(__rec_split( + WT_RET(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. 
*/ __rec_copy_incr(session, r, key); @@ -5595,13 +5692,14 @@ __rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r) __wt_verbose(session, WT_VERB_SPLIT, "starting key %s", __wt_buf_set_printable( - session, bnd->key.data, bnd->key.size, tkey)); + session, bnd->max_bnd_key.data, + bnd->max_bnd_key.size, tkey)); break; case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: __wt_verbose(session, WT_VERB_SPLIT, - "starting recno %" PRIu64, bnd->recno); + "starting recno %" PRIu64, bnd->max_bnd_recno); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -5863,10 +5961,10 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* We never set the first page's key, grab it from the original page. */ ref = r->ref; if (__wt_ref_is_root(ref)) - WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1)); + WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, "", 1)); else { __wt_ref_key(ref->home, ref, &p, &size); - WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size)); + WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, p, size)); } /* Allocate, then initialize the array of replacement blocks. */ @@ -5874,8 +5972,8 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - WT_RET(__wt_row_ikey_alloc(session, 0, - bnd->key.data, bnd->key.size, &multi->key.ikey)); + WT_RET(__wt_row_ikey_alloc(session, 0, bnd->max_bnd_key.data, + bnd->max_bnd_key.size, &multi->key.ikey)); /* * Copy any disk image. Don't take saved updates without a @@ -5922,7 +6020,7 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - multi->key.recno = bnd->recno; + multi->key.recno = bnd->max_bnd_recno; /* * Copy any disk image. 
Don't take saved updates without a @@ -6399,7 +6497,8 @@ __rec_dictionary_lookup( for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash); dp != NULL && dp->hash == hash; dp = dp->next[0]) { WT_RET(__wt_cell_pack_data_match( - dp->cell, &val->cell, val->buf.data, &match)); + (WT_CELL *)((uint8_t *)r->disk_image.mem + dp->offset), + &val->cell, val->buf.data, &match)); if (match) { WT_STAT_DATA_INCR(session, rec_dictionary); *dpp = dp; @@ -6425,7 +6524,7 @@ __rec_dictionary_lookup( * know where on the page it will be written). */ next = r->dictionary[r->dictionary_next++]; - next->cell = NULL; /* Not necessary, just cautious. */ + next->offset = 0; /* Not necessary, just cautious. */ next->hash = hash; __rec_dictionary_skip_insert(r->dictionary_head, next, hash); *dpp = next; diff --git a/test/format/config.h b/test/format/config.h index e3e1e73a786..b5feb7a5321 100644 --- a/test/format/config.h +++ b/test/format/config.h @@ -284,7 +284,7 @@ static CONFIG c[] = { { "split_pct", "page split size as a percentage of the maximum page size", - 0x0, 40, 85, 85, &g.c_split_pct, NULL }, + 0x0, 50, 100, 100, &g.c_split_pct, NULL }, { "statistics", "maintain statistics", /* 20% */ -- cgit v1.2.1 From d2dd272da04d8ca33f23eac11de953e3c16f9a95 Mon Sep 17 00:00:00 2001 From: "Alexandra (Sasha) Fedorova" Date: Thu, 30 Mar 2017 14:38:30 -0700 Subject: WT-3190 perform a complete re-tune of eviction workers every 30 seconds. (#3324) Otherwise the number of workers wouldn't adjust when the workload changed. 
--- dist/stat_data.py | 1 + src/evict/evict_lru.c | 83 ++++++++--- src/include/stat.h | 1 + src/include/wiredtiger.in | 362 +++++++++++++++++++++++----------------------- src/support/stat.c | 4 + 5 files changed, 250 insertions(+), 201 deletions(-) diff --git a/dist/stat_data.py b/dist/stat_data.py index a4d92345f88..8fed3f3ac4a 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -206,6 +206,7 @@ connection_stats = [ CacheStat('cache_eviction_force', 'pages evicted because they exceeded the in-memory maximum'), CacheStat('cache_eviction_force_delete', 'pages evicted because they had chains of deleted items'), CacheStat('cache_eviction_force_fail', 'failed eviction of pages that exceeded the in-memory maximum'), + CacheStat('cache_eviction_force_retune', 'force re-tuning of eviction workers once in a while'), CacheStat('cache_eviction_get_ref', 'eviction calls to get a page'), CacheStat('cache_eviction_get_ref_empty', 'eviction calls to get a page found queue empty'), CacheStat('cache_eviction_get_ref_empty2', 'eviction calls to get a page found queue empty after locking'), diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index a957d245958..3ce35c60f2e 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -900,24 +900,32 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) } #define EVICT_TUNE_BATCH 1 /* Max workers to add each period */ -#define EVICT_TUNE_DATAPT_MIN 3 /* Data points needed before deciding - if we should keep adding workers or - settle on an earlier value. */ +/* + * Data points needed before deciding if we should keep adding workers or settle + * on an earlier value. + */ +#define EVICT_TUNE_DATAPT_MIN 3 #define EVICT_TUNE_PERIOD 1 /* Tune period in seconds */ +/* + * We will do a fresh re-tune every that many seconds to adjust to + * significant phase changes. + */ +#define EVICT_FORCE_RETUNE 30 + /* * __evict_tune_workers -- * Find the right number of eviction workers. 
Gradually ramp up the number of * workers increasing the number in batches indicated by the setting above. - * Store the number of workers that gave us the best throughput so far and - * the number of data points we have tried. + * Store the number of workers that gave us the best throughput so far and the + * number of data points we have tried. * - * Every once in a while when we have the minimum number of data points - * we check whether the eviction throughput achieved with the current number - * of workers is the best we have seen so far. If so, we will keep increasing - * the number of workers. If not, we are past the infliction point on the - * eviction throughput curve. In that case, we will set the number of workers - * to the best observed so far and settle into a stable state. + * Every once in a while when we have the minimum number of data points we check + * whether the eviction throughput achieved with the current number of workers + * is the best we have seen so far. If so, we will keep increasing the number of + * workers. If not, we are past the infliction point on the eviction throughput + * curve. In that case, we will set the number of workers to the best observed + * so far and settle into a stable state. 
*/ static int __evict_tune_workers(WT_SESSION_IMPL *session) @@ -927,27 +935,60 @@ __evict_tune_workers(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_DECL_RET; uint64_t cur_threads, delta_msec, delta_pages, i, target_threads; - uint64_t pgs_evicted_cur, pgs_evicted_persec_cur; + uint64_t pgs_evicted_cur, pgs_evicted_persec_cur, time_diff; uint32_t thread_surplus; conn = S2C(session); cache = conn->cache; WT_ASSERT(session, conn->evict_threads.threads[0]->session == session); - - if (conn->evict_tune_stable) - return (0); + pgs_evicted_cur = pgs_evicted_persec_cur = 0; __wt_epoch(session, ¤t_time); + time_diff = WT_TIMEDIFF_SEC(current_time, conn->evict_tune_last_time); /* - * Every EVICT_TUNE_PERIOD seconds record the number of - * pages evicted per second observed in the previous period. + * If we have reached the stable state and have not run long enough to + * surpass the forced re-tuning threshold, return. */ - if (WT_TIMEDIFF_SEC( - current_time, conn->evict_tune_last_time) < EVICT_TUNE_PERIOD) - return (0); + if (conn->evict_tune_stable) { + if (time_diff < EVICT_FORCE_RETUNE) + return (0); + + /* + * Stable state was reached a long time ago. Let's re-tune. + * Reset all the state. 
+ */ + conn->evict_tune_stable = 0; + conn->evict_tune_last_action_time.tv_sec = 0; + conn->evict_tune_pgs_last = 0; + conn->evict_tune_num_points = 0; + conn->evict_tune_pg_sec_max = 0; + conn->evict_tune_workers_best = 0; + + /* Reduce the number of eviction workers to the minimum */ + thread_surplus = conn->evict_threads.current_threads - + conn->evict_threads_min; + for (i = 0; i < thread_surplus; i++) { + WT_ERR(__wt_thread_group_stop_one( + session, &conn->evict_threads, false)); + WT_STAT_CONN_INCR(session, + cache_eviction_worker_removed); + } + WT_STAT_CONN_INCR(session, cache_eviction_force_retune); + } else + if (time_diff < EVICT_TUNE_PERIOD) + /* + * If we have not reached stable state, don't do + * anything unless enough time has passed since the last + * time we have taken any action in this function. + */ + return (0); + /* + * Measure the number of evicted pages so far. Eviction rate correlates + * to performance, so this is our metric of success. + */ pgs_evicted_cur = cache->pages_evict; /* @@ -1025,7 +1066,7 @@ __evict_tune_workers(WT_SESSION_IMPL *session) conn->evict_tune_stable = true; WT_STAT_CONN_SET(session, cache_eviction_active_workers, conn->evict_threads.current_threads); - return (0); + goto err; } } diff --git a/src/include/stat.h b/src/include/stat.h index ed3d588b7d3..bc7a7cab7ce 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -318,6 +318,7 @@ struct __wt_connection_stats { int64_t cache_eviction_force_fail; int64_t cache_eviction_walks_active; int64_t cache_eviction_walks_started; + int64_t cache_eviction_force_retune; int64_t cache_eviction_hazard; int64_t cache_hazard_checks; int64_t cache_hazard_walks; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 707159ef6ae..ced6df3d29d 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -4460,384 +4460,386 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1059 
/*! cache: files with new eviction walks started */ #define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1060 +/*! cache: force re-tuning of eviction workers once in a while */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_RETUNE 1061 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1061 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1062 /*! cache: hazard pointer check calls */ -#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1062 +#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1063 /*! cache: hazard pointer check entries walked */ -#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1063 +#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1064 /*! cache: hazard pointer maximum array length */ -#define WT_STAT_CONN_CACHE_HAZARD_MAX 1064 +#define WT_STAT_CONN_CACHE_HAZARD_MAX 1065 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1065 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1066 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1066 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1067 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1067 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1068 /*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1068 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1069 /*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1069 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1070 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1070 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1071 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1071 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1072 /*! cache: maximum bytes configured */ -#define WT_STAT_CONN_CACHE_BYTES_MAX 1072 +#define WT_STAT_CONN_CACHE_BYTES_MAX 1073 /*! 
cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1073 +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1074 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1074 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1075 /*! cache: modified pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1075 +#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1076 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1076 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1077 /*! cache: overflow values cached in memory */ -#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1077 +#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1078 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1078 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1079 /*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1079 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1080 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1080 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1081 /*! cache: pages evicted because they exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1081 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1082 /*! cache: pages evicted because they had chains of deleted items */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1082 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1083 /*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1083 +#define WT_STAT_CONN_CACHE_EVICTION_APP 1084 /*! cache: pages queued for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1084 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1085 /*! 
cache: pages queued for urgent eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1085 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1086 /*! cache: pages queued for urgent eviction during walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1086 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1087 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1087 +#define WT_STAT_CONN_CACHE_READ 1088 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1088 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1089 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1089 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1090 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1090 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1091 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1091 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1092 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1092 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1093 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1093 +#define WT_STAT_CONN_CACHE_WRITE 1094 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1094 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1095 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1095 +#define WT_STAT_CONN_CACHE_OVERHEAD 1096 /*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1096 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1097 /*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1097 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1098 /*! 
cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1098 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1099 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1099 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1100 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1100 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1101 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1101 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1102 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1102 +#define WT_STAT_CONN_COND_AUTO_WAIT 1103 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1103 +#define WT_STAT_CONN_FILE_OPEN 1104 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1104 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1105 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1105 +#define WT_STAT_CONN_MEMORY_FREE 1106 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1106 +#define WT_STAT_CONN_MEMORY_GROW 1107 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1107 +#define WT_STAT_CONN_COND_WAIT 1108 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1108 +#define WT_STAT_CONN_RWLOCK_READ 1109 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1109 +#define WT_STAT_CONN_RWLOCK_WRITE 1110 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1110 +#define WT_STAT_CONN_FSYNC_IO 1111 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1111 +#define WT_STAT_CONN_READ_IO 1112 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1112 +#define WT_STAT_CONN_WRITE_IO 1113 /*! 
cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1113 +#define WT_STAT_CONN_CURSOR_CREATE 1114 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1114 +#define WT_STAT_CONN_CURSOR_INSERT 1115 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1115 +#define WT_STAT_CONN_CURSOR_NEXT 1116 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1116 +#define WT_STAT_CONN_CURSOR_PREV 1117 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1117 +#define WT_STAT_CONN_CURSOR_REMOVE 1118 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1118 +#define WT_STAT_CONN_CURSOR_RESET 1119 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1119 +#define WT_STAT_CONN_CURSOR_RESTART 1120 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1120 +#define WT_STAT_CONN_CURSOR_SEARCH 1121 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1121 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1122 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1122 +#define WT_STAT_CONN_CURSOR_UPDATE 1123 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1123 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1124 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1124 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1125 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1125 +#define WT_STAT_CONN_DH_SWEEP_REF 1126 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1126 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1127 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1127 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1128 /*! 
data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1128 +#define WT_STAT_CONN_DH_SWEEP_TOD 1129 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1129 +#define WT_STAT_CONN_DH_SWEEPS 1130 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1130 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1131 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1131 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1132 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1132 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1133 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1133 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1134 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1134 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1135 /*! lock: handle-list lock eviction thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_EVICTION 1135 +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_EVICTION 1136 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1136 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1137 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1137 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1138 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1138 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1139 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1139 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1140 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1140 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1141 /*! 
lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1141 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1142 /*! lock: table lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_COUNT 1142 +#define WT_STAT_CONN_LOCK_TABLE_COUNT 1143 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1143 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1144 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1144 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1145 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1145 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1146 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1146 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1147 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1147 +#define WT_STAT_CONN_LOG_SLOT_RACES 1148 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1148 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1149 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1149 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1150 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1150 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1151 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1151 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1152 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1152 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1153 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1153 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1154 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1154 +#define WT_STAT_CONN_LOG_FLUSH 1155 /*! 
log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1155 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1156 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1156 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1157 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1157 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1158 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1158 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1159 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1159 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1160 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1160 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1161 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1161 +#define WT_STAT_CONN_LOG_SCANS 1162 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1162 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1163 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1163 +#define WT_STAT_CONN_LOG_WRITE_LSN 1164 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1164 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1165 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1165 +#define WT_STAT_CONN_LOG_SYNC 1166 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1166 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1167 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1167 +#define WT_STAT_CONN_LOG_SYNC_DIR 1168 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1168 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1169 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1169 +#define WT_STAT_CONN_LOG_WRITES 1170 /*! 
log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1170 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1171 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1171 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1172 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1172 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1173 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1173 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1174 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1174 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1175 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1175 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1176 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1176 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1177 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1177 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1178 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1178 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1179 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1179 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1180 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1180 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1181 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1181 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1182 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1182 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1183 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1183 +#define WT_STAT_CONN_REC_PAGES 1184 /*! 
reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1184 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1185 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1185 +#define WT_STAT_CONN_REC_PAGE_DELETE 1186 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1186 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1187 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1187 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1188 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1188 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1189 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1189 +#define WT_STAT_CONN_SESSION_OPEN 1190 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1190 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1191 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1191 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1192 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1192 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1193 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1193 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1194 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1194 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1195 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1195 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1196 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1196 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1197 /*! 
session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1197 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1198 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1198 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1199 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1199 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1200 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1200 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1201 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1201 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1202 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1202 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1203 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1203 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1204 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1204 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1205 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1205 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1206 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1206 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1207 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1207 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1208 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1208 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1209 /*! 
thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1209 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1210 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1210 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1211 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1211 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1212 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1212 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1213 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1213 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1214 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1214 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1215 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1215 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1216 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1216 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1217 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1217 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1218 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1218 +#define WT_STAT_CONN_PAGE_SLEEP 1219 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1219 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1220 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1220 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1221 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1221 +#define WT_STAT_CONN_TXN_BEGIN 1222 /*! 
transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1222 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1223 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1223 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1224 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1224 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1225 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1225 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1226 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1226 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1227 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1227 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1228 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1228 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1229 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1229 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1230 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1230 +#define WT_STAT_CONN_TXN_CHECKPOINT 1231 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1231 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1232 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1232 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1233 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1233 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1234 /*! 
* transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1234 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1235 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1235 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1236 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1236 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1237 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1237 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1238 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1238 +#define WT_STAT_CONN_TXN_SYNC 1239 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1239 +#define WT_STAT_CONN_TXN_COMMIT 1240 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1240 +#define WT_STAT_CONN_TXN_ROLLBACK 1241 /*! 
* @} diff --git a/src/support/stat.c b/src/support/stat.c index fd38e1b79ee..57c1ee06000 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -685,6 +685,7 @@ static const char * const __stats_connection_desc[] = { "cache: failed eviction of pages that exceeded the in-memory maximum", "cache: files with active eviction walks", "cache: files with new eviction walks started", + "cache: force re-tuning of eviction workers once in a while", "cache: hazard pointer blocked page eviction", "cache: hazard pointer check calls", "cache: hazard pointer check entries walked", @@ -968,6 +969,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_eviction_force_fail = 0; /* not clearing cache_eviction_walks_active */ stats->cache_eviction_walks_started = 0; + stats->cache_eviction_force_retune = 0; stats->cache_eviction_hazard = 0; stats->cache_hazard_checks = 0; stats->cache_hazard_walks = 0; @@ -1252,6 +1254,8 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, cache_eviction_walks_active); to->cache_eviction_walks_started += WT_STAT_READ(from, cache_eviction_walks_started); + to->cache_eviction_force_retune += + WT_STAT_READ(from, cache_eviction_force_retune); to->cache_eviction_hazard += WT_STAT_READ(from, cache_eviction_hazard); to->cache_hazard_checks += WT_STAT_READ(from, cache_hazard_checks); -- cgit v1.2.1 From 423f4e11050f7644b1a8d2b6b1cc60c35ef915c8 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Thu, 30 Mar 2017 21:01:05 -0400 Subject: WT-3243 Reorder log slot release so joins don't wait on IO (#3360) --- dist/stat_data.py | 2 + src/include/extern.h | 1 - src/include/log.h | 3 +- src/include/stat.h | 2 + src/include/wiredtiger.in | 192 ++++++++++++++++++++++--------------------- src/log/log_slot.c | 205 ++++++++++++++++++++++++---------------------- src/support/stat.c | 8 ++ 7 files changed, 221 insertions(+), 192 deletions(-) diff --git a/dist/stat_data.py b/dist/stat_data.py index 8fed3f3ac4a..ac79ffd029a 100644 --- 
a/dist/stat_data.py +++ b/dist/stat_data.py @@ -324,10 +324,12 @@ connection_stats = [ LogStat('log_scan_records', 'records processed by log scan'), LogStat('log_scan_rereads', 'log scan records requiring two reads'), LogStat('log_scans', 'log scan operations'), + LogStat('log_slot_active_closed', 'consolidated slot join active slot closed'), LogStat('log_slot_closes', 'consolidated slot closures'), LogStat('log_slot_coalesced', 'written slots coalesced'), LogStat('log_slot_consolidated', 'logging bytes consolidated', 'size'), LogStat('log_slot_joins', 'consolidated slot joins'), + LogStat('log_slot_no_free_slots', 'consolidated slot transitions unable to find free slot'), LogStat('log_slot_races', 'consolidated slot join races'), LogStat('log_slot_switch_busy', 'busy returns attempting to switch slots'), LogStat('log_slot_transitions', 'consolidated slot join transitions'), diff --git a/src/include/extern.h b/src/include/extern.h index 47b4e03a7b7..c0a6087e9b1 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -405,7 +405,6 @@ extern int __wt_logop_row_truncate_print(WT_SESSION_IMPL *session, const uint8_t extern int __wt_txn_op_printlog(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_log_slot_new(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/log.h b/src/include/log.h index f0999ba316b..fb3c961417f 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -193,7 +193,8 @@ struct __wt_myslot { wt_off_t end_offset; /* My end offset in buffer */ wt_off_t offset; /* Slot buffer offset */ #define WT_MYSLOT_CLOSE 0x01 /* This thread is closing the slot */ -#define WT_MYSLOT_UNBUFFERED 0x02 /* Write directly */ +#define WT_MYSLOT_NEEDS_RELEASE 0x02 /* This thread is releasing the slot */ +#define WT_MYSLOT_UNBUFFERED 0x04 /* Write directly */ uint32_t flags; /* Flags */ }; diff --git a/src/include/stat.h b/src/include/stat.h index bc7a7cab7ce..6c274484bcb 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -405,9 +405,11 @@ struct __wt_connection_stats { int64_t lock_table_wait_internal; int64_t log_slot_switch_busy; int64_t log_slot_closes; + int64_t log_slot_active_closed; int64_t log_slot_races; int64_t log_slot_transitions; int64_t log_slot_joins; + int64_t log_slot_no_free_slots; int64_t log_slot_unbuffered; int64_t log_bytes_payload; int64_t log_bytes_written; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index ced6df3d29d..ddecb2ac765 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -4640,206 +4640,210 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1146 /*! log: consolidated slot closures */ #define WT_STAT_CONN_LOG_SLOT_CLOSES 1147 +/*! log: consolidated slot join active slot closed */ +#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1148 /*! 
log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1148 +#define WT_STAT_CONN_LOG_SLOT_RACES 1149 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1149 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1150 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1150 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1151 +/*! log: consolidated slot transitions unable to find free slot */ +#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1152 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1151 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1153 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1152 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1154 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1153 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1155 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1154 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1156 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1155 +#define WT_STAT_CONN_LOG_FLUSH 1157 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1156 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1158 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1157 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1159 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1158 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1160 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1159 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1161 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1160 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1162 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1161 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1163 /*! 
log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1162 +#define WT_STAT_CONN_LOG_SCANS 1164 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1163 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1165 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1164 +#define WT_STAT_CONN_LOG_WRITE_LSN 1166 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1165 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1167 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1166 +#define WT_STAT_CONN_LOG_SYNC 1168 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1167 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1169 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1168 +#define WT_STAT_CONN_LOG_SYNC_DIR 1170 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1169 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1171 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1170 +#define WT_STAT_CONN_LOG_WRITES 1172 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1171 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1173 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1172 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1174 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1173 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1175 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1174 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1176 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1175 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1177 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1176 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1178 /*! 
log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1177 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1179 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1178 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1180 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1179 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1181 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1180 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1182 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1181 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1183 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1182 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1184 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1183 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1185 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1184 +#define WT_STAT_CONN_REC_PAGES 1186 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1185 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1187 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1186 +#define WT_STAT_CONN_REC_PAGE_DELETE 1188 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1187 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1189 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1188 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1190 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1189 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1191 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1190 +#define WT_STAT_CONN_SESSION_OPEN 1192 /*! 
session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1191 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1193 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1192 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1194 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1193 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1195 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1194 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1196 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1195 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1197 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1196 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1198 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1197 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1199 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1198 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1200 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1199 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1201 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1200 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1202 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1201 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1203 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1202 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1204 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1203 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1205 /*! 
session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1204 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1206 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1205 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1207 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1206 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1208 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1207 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1209 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1208 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1210 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1209 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1211 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1210 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1212 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1211 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1213 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1212 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1214 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1213 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1215 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1214 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1216 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1215 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1217 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1216 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1218 /*! 
thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1217 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1219 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1218 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1220 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1219 +#define WT_STAT_CONN_PAGE_SLEEP 1221 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1220 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1222 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1221 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1223 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1222 +#define WT_STAT_CONN_TXN_BEGIN 1224 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1223 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1225 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1224 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1226 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1225 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1227 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1226 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1228 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1227 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1229 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1228 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1230 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1229 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1231 /*! 
transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1230 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1232 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1231 +#define WT_STAT_CONN_TXN_CHECKPOINT 1233 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1232 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1234 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1233 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1235 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1234 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1236 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1235 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1237 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1236 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1238 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1237 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1239 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1238 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1240 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1239 +#define WT_STAT_CONN_TXN_SYNC 1241 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1240 +#define WT_STAT_CONN_TXN_COMMIT 1242 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1241 +#define WT_STAT_CONN_TXN_ROLLBACK 1243 /*! 
* @} diff --git a/src/log/log_slot.c b/src/log/log_slot.c index c685aec3ffc..512a84dbd13 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -195,103 +195,12 @@ retry: } /* - * __log_slot_switch_internal -- - * Switch out the current slot and set up a new one. - */ -static int -__log_slot_switch_internal( - WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool forced) -{ - WT_DECL_RET; - WT_LOG *log; - WT_LOGSLOT *slot; - bool free_slot, release; - - log = S2C(session)->log; - release = false; - slot = myslot->slot; - - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); - - /* - * If someone else raced us to closing this specific slot, we're - * done here. - */ - if (slot != log->active_slot) - return (0); - - WT_RET(WT_SESSION_CHECK_PANIC(session)); - /* - * We may come through here multiple times if we were able to close - * a slot but could not set up a new one. If we closed it already, - * don't try to do it again but still set up the new slot. - */ - if (!F_ISSET(myslot, WT_MYSLOT_CLOSE)) { - ret = __log_slot_close(session, slot, &release, forced); - /* - * If close returns WT_NOTFOUND it means that someone else - * is processing the slot change. - */ - if (ret == WT_NOTFOUND) - return (0); - WT_RET(ret); - if (release) { - WT_RET(__wt_log_release(session, slot, &free_slot)); - if (free_slot) - __wt_log_slot_free(session, slot); - } - } - /* - * Set that we have closed this slot because we may call in here - * multiple times if we retry creating a new slot. - */ - F_SET(myslot, WT_MYSLOT_CLOSE); - WT_RET(__wt_log_slot_new(session)); - F_CLR(myslot, WT_MYSLOT_CLOSE); - return (0); -} - -/* - * __wt_log_slot_switch -- - * Switch out the current slot and set up a new one. - */ -int -__wt_log_slot_switch( - WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced) -{ - WT_DECL_RET; - WT_LOG *log; - - log = S2C(session)->log; - /* - * !!! 
Since the WT_WITH_SLOT_LOCK macro is a do-while loop, the - * compiler does not like it combined directly with the while loop - * here. - * - * The loop conditional is a bit complex. We have to retry if we - * closed the slot but were unable to set up a new slot. In that - * case the flag indicating we have closed the slot will still be set. - * We have to retry in that case regardless of the retry setting - * because we are responsible for setting up the new slot. - */ - do { - WT_WITH_SLOT_LOCK(session, log, - ret = __log_slot_switch_internal(session, myslot, forced)); - if (ret == EBUSY) { - WT_STAT_CONN_INCR(session, log_slot_switch_busy); - __wt_yield(); - } - } while (F_ISSET(myslot, WT_MYSLOT_CLOSE) || (retry && ret == EBUSY)); - return (ret); -} - -/* - * __wt_log_slot_new -- + * __log_slot_new -- * Find a free slot and switch it as the new active slot. * Must be called holding the slot lock. */ -int -__wt_log_slot_new(WT_SESSION_IMPL *session) +static int +__log_slot_new(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_LOG *log; @@ -351,6 +260,7 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) /* * If we didn't find any free slots signal the worker thread. */ + WT_STAT_CONN_INCR(session, log_slot_no_free_slots); __wt_cond_signal(session, conn->log_wrlsn_cond); __wt_yield(); #ifdef HAVE_DIAGNOSTIC @@ -370,6 +280,108 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) /* NOTREACHED */ } +/* + * __log_slot_switch_internal -- + * Switch out the current slot and set up a new one. + */ +static int +__log_slot_switch_internal( + WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool forced) +{ + WT_DECL_RET; + WT_LOG *log; + WT_LOGSLOT *slot; + bool free_slot, release; + + log = S2C(session)->log; + release = false; + slot = myslot->slot; + + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + + /* + * If someone else raced us to closing this specific slot, we're + * done here. 
+ */ + if (slot != log->active_slot) + return (0); + WT_RET(WT_SESSION_CHECK_PANIC(session)); + + /* + * We may come through here multiple times if we were not able to + * set up a new one. If we closed it already, + * don't try to do it again but still set up the new slot. + */ + if (!F_ISSET(myslot, WT_MYSLOT_CLOSE)) { + ret = __log_slot_close(session, slot, &release, forced); + /* + * If close returns WT_NOTFOUND it means that someone else + * is processing the slot change. + */ + if (ret == WT_NOTFOUND) + return (0); + WT_RET(ret); + /* + * Set that we have closed this slot because we may call in here + * multiple times if we retry creating a new slot. Similarly + * set retain whether this slot needs releasing so that we don't + * lose that information if we retry. + */ + F_SET(myslot, WT_MYSLOT_CLOSE); + if (release) + F_SET(myslot, WT_MYSLOT_NEEDS_RELEASE); + } + /* + * Now that the slot is closed, set up a new one so that joining + * threads don't have to wait on writing the previous slot if we + * release it. Release after setting a new one. + */ + WT_RET(__log_slot_new(session)); + F_CLR(myslot, WT_MYSLOT_CLOSE); + if (F_ISSET(myslot, WT_MYSLOT_NEEDS_RELEASE)) { + WT_RET(__wt_log_release(session, slot, &free_slot)); + F_CLR(myslot, WT_MYSLOT_NEEDS_RELEASE); + if (free_slot) + __wt_log_slot_free(session, slot); + } + return (ret); +} + +/* + * __wt_log_slot_switch -- + * Switch out the current slot and set up a new one. + */ +int +__wt_log_slot_switch( + WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced) +{ + WT_DECL_RET; + WT_LOG *log; + + log = S2C(session)->log; + + /* + * !!! Since the WT_WITH_SLOT_LOCK macro is a do-while loop, the + * compiler does not like it combined directly with the while loop + * here. + * + * The loop conditional is a bit complex. We have to retry if we + * closed the slot but were unable to set up a new slot. In that + * case the flag indicating we have closed the slot will still be set. 
+ * We have to retry in that case regardless of the retry setting + * because we are responsible for setting up the new slot. + */ + do { + WT_WITH_SLOT_LOCK(session, log, + ret = __log_slot_switch_internal(session, myslot, forced)); + if (ret == EBUSY) { + WT_STAT_CONN_INCR(session, log_slot_switch_busy); + __wt_yield(); + } + } while (F_ISSET(myslot, WT_MYSLOT_CLOSE) || (retry && ret == EBUSY)); + return (ret); +} + /* * __wt_log_slot_init -- * Initialize the slot array. @@ -531,12 +543,13 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, if (__wt_atomic_casiv64( &slot->slot_state, old_state, new_state)) break; - } + WT_STAT_CONN_INCR(session, log_slot_races); + } else + WT_STAT_CONN_INCR(session, log_slot_active_closed); /* * The slot is no longer open or we lost the race to * update it. Yield and try again. */ - WT_STAT_CONN_INCR(session, log_slot_races); __wt_yield(); } /* diff --git a/src/support/stat.c b/src/support/stat.c index 57c1ee06000..2c2217f8c20 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -772,9 +772,11 @@ static const char * const __stats_connection_desc[] = { "lock: table lock internal thread time waiting for the table lock (usecs)", "log: busy returns attempting to switch slots", "log: consolidated slot closures", + "log: consolidated slot join active slot closed", "log: consolidated slot join races", "log: consolidated slot join transitions", "log: consolidated slot joins", + "log: consolidated slot transitions unable to find free slot", "log: consolidated slot unbuffered writes", "log: log bytes of payload data", "log: log bytes written", @@ -1056,9 +1058,11 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->lock_table_wait_internal = 0; stats->log_slot_switch_busy = 0; stats->log_slot_closes = 0; + stats->log_slot_active_closed = 0; stats->log_slot_races = 0; stats->log_slot_transitions = 0; stats->log_slot_joins = 0; + stats->log_slot_no_free_slots = 0; stats->log_slot_unbuffered = 0; 
stats->log_bytes_payload = 0; stats->log_bytes_written = 0; @@ -1370,9 +1374,13 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, lock_table_wait_internal); to->log_slot_switch_busy += WT_STAT_READ(from, log_slot_switch_busy); to->log_slot_closes += WT_STAT_READ(from, log_slot_closes); + to->log_slot_active_closed += + WT_STAT_READ(from, log_slot_active_closed); to->log_slot_races += WT_STAT_READ(from, log_slot_races); to->log_slot_transitions += WT_STAT_READ(from, log_slot_transitions); to->log_slot_joins += WT_STAT_READ(from, log_slot_joins); + to->log_slot_no_free_slots += + WT_STAT_READ(from, log_slot_no_free_slots); to->log_slot_unbuffered += WT_STAT_READ(from, log_slot_unbuffered); to->log_bytes_payload += WT_STAT_READ(from, log_bytes_payload); to->log_bytes_written += WT_STAT_READ(from, log_bytes_written); -- cgit v1.2.1 From 871889c0b87dcd2560704248eba0a4b119ca26f1 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Tue, 4 Apr 2017 00:31:05 +1000 Subject: WT-3250 Fix spinlock statistics tracking on Windows. (#3363) MongoDB user on Windows noticed the "LSM: application work units currently queued" statistic was changing in a configuration that involved no LSM code. This was caused by a bug in code that tracks time spent in spinlocks incrementing the wrong statistic. In particular, spinlocks contain fields describing which statistics should be used to track time spent in that spinlock. A value of -1 indicates that the spinlock should not be tracked, but a value of zero is the first statistic in the array for a connection, which happens to be the "LSM: application work units currently queued" statistic. The Windows implementation of spinlocks was not setting these fields to -1, leading to the bug. This bug was introduced by WT 2955 and also meant that every WiredTiger spinlock on Windows was being timed, which may have negatively impacted Windows performance. 
--- src/include/mutex.i | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/include/mutex.i b/src/include/mutex.i index 640706284c3..eb95d76a1a2 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -32,7 +32,9 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) WT_UNUSED(name); t->lock = 0; + t->name = name; t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; + t->initialized = 1; return (0); } @@ -196,6 +198,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) } t->name = name; + t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; t->initialized = 1; return (0); } -- cgit v1.2.1 From e8efd76093d126a8d7b8e21c650123e96e9d6f13 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Mon, 3 Apr 2017 12:51:14 -0400 Subject: WT-3250 Have one function initializing the WT portion of the spinlock. (#3364) Unify spinlock structures. --- src/include/mutex.h | 28 ++++------------------------ src/include/mutex.i | 26 +++++++++++++++----------- 2 files changed, 19 insertions(+), 35 deletions(-) diff --git a/src/include/mutex.h b/src/include/mutex.h index 06b8c4a3304..910eb7af5b9 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -62,31 +62,17 @@ union __wt_rwlock { /* Read/write lock */ #define SPINLOCK_PTHREAD_MUTEX 2 #define SPINLOCK_PTHREAD_MUTEX_ADAPTIVE 3 -#if SPINLOCK_TYPE == SPINLOCK_GCC - struct __wt_spinlock { WT_CACHE_LINE_PAD_BEGIN +#if SPINLOCK_TYPE == SPINLOCK_GCC volatile int lock; - - /* - * We track acquisitions and time spent waiting for some locks. For - * performance reasons and to make it possible to write generic code - * that tracks statistics for different locks, we store the offset - * of the statistics fields to be updated during lock acquisition. 
- */ - int16_t stat_count_off; /* acquisitions offset */ - int16_t stat_app_usecs_off; /* waiting application threads offset */ - int16_t stat_int_usecs_off; /* waiting server threads offset */ - WT_CACHE_LINE_PAD_END -}; - #elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\ SPINLOCK_TYPE == SPINLOCK_MSVC - -struct __wt_spinlock { - WT_CACHE_LINE_PAD_BEGIN wt_mutex_t lock; +#else +#error Unknown spinlock type +#endif const char *name; /* Mutex name */ @@ -103,9 +89,3 @@ struct __wt_spinlock { int8_t initialized; /* Lock initialized, for cleanup */ WT_CACHE_LINE_PAD_END }; - -#else - -#error Unknown spinlock type - -#endif diff --git a/src/include/mutex.i b/src/include/mutex.i index eb95d76a1a2..2d483972ed2 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -14,6 +14,18 @@ * of instructions. */ +/* + * __spin_init_internal -- + * Initialize the WT portion of a spinlock. + */ +static inline void +__spin_init_internal(WT_SPINLOCK *t, const char *name) +{ + t->name = name; + t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; + t->initialized = 1; +} + #if SPINLOCK_TYPE == SPINLOCK_GCC /* Default to spinning 1000 times before yielding. 
*/ @@ -29,12 +41,9 @@ static inline int __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) { WT_UNUSED(session); - WT_UNUSED(name); t->lock = 0; - t->name = name; - t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; - t->initialized = 1; + __spin_init_internal(t, name); return (0); } @@ -112,10 +121,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) #else WT_RET(pthread_mutex_init(&t->lock, NULL)); #endif - - t->name = name; - t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; - t->initialized = 1; + __spin_init_internal(t, name); WT_UNUSED(session); return (0); @@ -197,9 +203,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) return (__wt_map_windows_error(windows_error)); } - t->name = name; - t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; - t->initialized = 1; + __spin_init_internal(t, name); return (0); } -- cgit v1.2.1 From 27b483289376e8441da87723a5b6a2ec420ad858 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Tue, 4 Apr 2017 15:10:52 -0400 Subject: WT-3254 Fix typo in reconfig string (#3366) --- test/suite/test_reconfig02.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/suite/test_reconfig02.py b/test/suite/test_reconfig02.py index 8054b2a6ab5..042d3bbe71f 100644 --- a/test/suite/test_reconfig02.py +++ b/test/suite/test_reconfig02.py @@ -62,7 +62,7 @@ class test_reconfig02(wttest.WiredTigerTestCase): self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.conn.reconfigure("log=(path=foo)"), msg) self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.conn.reconfigure("log=(recovery=true)"), msg) + lambda: self.conn.reconfigure("log=(recover=true)"), msg) # Logging starts on, but prealloc is off. Verify it is off. 
# Reconfigure it on and run again, making sure that log files -- cgit v1.2.1 From adbe2ec5cd6dc2da2af913087b53c402b2f0b87c Mon Sep 17 00:00:00 2001 From: sueloverso Date: Tue, 4 Apr 2017 15:48:22 -0400 Subject: WT-3249 Look at slot_state during force while holding lock. (#3365) We could race an in-progress switch that set a new, empty active slot but has not yet released the previously active slot and get an incorrect LSN. --- src/include/extern.h | 2 +- src/log/log.c | 12 ++---------- src/log/log_slot.c | 22 ++++++++++++++++++---- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/include/extern.h b/src/include/extern.h index c0a6087e9b1..55ba1bada7c 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -404,7 +404,7 @@ extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8 extern int __wt_logop_row_truncate_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_op_printlog(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern 
int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/log/log.c b/src/log/log.c index 5b24250fffc..803d3e8dfab 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -1919,7 +1919,6 @@ __wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work) { WT_LOG *log; WT_MYSLOT myslot; - uint32_t joined; log = S2C(session)->log; memset(&myslot, 0, sizeof(myslot)); @@ -1927,14 +1926,7 @@ __wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work) if (did_work != NULL) *did_work = true; myslot.slot = log->active_slot; - joined = WT_LOG_SLOT_JOINED(log->active_slot->slot_state); - if (joined == 0) { - WT_STAT_CONN_INCR(session, log_force_write_skip); - if (did_work != NULL) - *did_work = false; - return (0); - } - return (__wt_log_slot_switch(session, &myslot, retry, true)); + return (__wt_log_slot_switch(session, &myslot, retry, true, did_work)); } /* @@ -2146,7 +2138,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ret = 0; if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX || F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force) - ret = __wt_log_slot_switch(session, &myslot, true, false); + ret = __wt_log_slot_switch(session, &myslot, true, false, NULL); if (ret == 0) ret = __log_fill(session, &myslot, false, record, &lsn); release_size = __wt_log_slot_release( diff --git a/src/log/log_slot.c b/src/log/log_slot.c index 512a84dbd13..97e317ce68c 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -286,12 +286,13 @@ __log_slot_new(WT_SESSION_IMPL *session) */ static int __log_slot_switch_internal( - WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool forced) + WT_SESSION_IMPL *session, WT_MYSLOT *myslot, 
bool forced, bool *did_work) { WT_DECL_RET; WT_LOG *log; WT_LOGSLOT *slot; bool free_slot, release; + uint32_t joined; log = S2C(session)->log; release = false; @@ -305,6 +306,18 @@ __log_slot_switch_internal( */ if (slot != log->active_slot) return (0); + /* + * If the current active slot is unused and this is a forced switch, + * we're done. If this is a non-forced switch we always switch + * because the slot could be part of an unbuffered operation. + */ + joined = WT_LOG_SLOT_JOINED(slot->slot_state); + if (joined == 0 && forced) { + WT_STAT_CONN_INCR(session, log_force_write_skip); + if (did_work != NULL) + *did_work = false; + return (0); + } WT_RET(WT_SESSION_CHECK_PANIC(session)); /* @@ -352,8 +365,8 @@ __log_slot_switch_internal( * Switch out the current slot and set up a new one. */ int -__wt_log_slot_switch( - WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced) +__wt_log_slot_switch(WT_SESSION_IMPL *session, + WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work) { WT_DECL_RET; WT_LOG *log; @@ -373,7 +386,8 @@ __wt_log_slot_switch( */ do { WT_WITH_SLOT_LOCK(session, log, - ret = __log_slot_switch_internal(session, myslot, forced)); + ret = __log_slot_switch_internal( + session, myslot, forced, did_work)); if (ret == EBUSY) { WT_STAT_CONN_INCR(session, log_slot_switch_busy); __wt_yield(); -- cgit v1.2.1 From 7a3e2484ec1ced43653cf33f4c68b0ebc8a0ee55 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Sat, 8 Apr 2017 00:56:28 +1000 Subject: WT-3262 Don't check if the cache is full when accessing metadata. (#3376) Also don't check for a full cache while holding the table lock (we're likely reading the metadata in that case, just being extra careful). 
--- src/include/cache.i | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/include/cache.i b/src/include/cache.i index d71978ccf35..90dd1bcdda8 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -360,11 +360,13 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) /* * LSM sets the no-cache-check flag when holding the LSM tree lock, in - * that case, or when holding the schema or handle list locks (which - * block eviction), we don't want to highjack the thread for eviction. + * that case, or when holding the handle list, schema or table locks + * (which can block checkpoints and eviction), don't block the thread + * for eviction. */ if (F_ISSET(session, WT_SESSION_NO_EVICTION | - WT_SESSION_LOCKED_HANDLE_LIST_WRITE | WT_SESSION_LOCKED_SCHEMA)) + WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA | + WT_SESSION_LOCKED_TABLE)) return (0); /* In memory configurations don't block when the cache is full. */ @@ -372,11 +374,14 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) return (0); /* - * Threads operating on cache-resident trees are ignored because they're - * not contributing to the problem. + * Threads operating on cache-resident trees are ignored because + * they're not contributing to the problem. We also don't block while + * reading metadata because we're likely to be holding some other + * resources that could block checkpoints or eviction. */ btree = S2BT_SAFE(session); - if (btree != NULL && F_ISSET(btree, WT_BTREE_IN_MEMORY)) + if (btree != NULL && (F_ISSET(btree, WT_BTREE_IN_MEMORY) || + WT_IS_METADATA(session->dhandle))) return (0); /* Check if eviction is needed. */ -- cgit v1.2.1 From d3ed5e9585a33af75c1c32b65e234bbb97b393b4 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Sat, 8 Apr 2017 00:58:05 +1000 Subject: WT-3265 Allow eviction of recently split pages when tree is locked. 
(#3372) (cherry picked from commit: 84e6ac0e67019bba22af87b99b40bb0bc0e21157) When pages split in WiredTiger, internal pages cannot be evicted immediately because there is a chance that a reader is still looking at an index pointing to the page. We check for this when considering pages for eviction, and assert that we never evict an internal page in an active generation. However, if a page splits and then we try to get exclusive access to the tree (e.g., to verify it), we could fail to evict the tree from cache even though we have guaranteed exclusive access to it. Relax the check on internal pages to allow eviction from trees that are locked exclusive. --- src/include/btree.i | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/include/btree.i b/src/include/btree.i index a4d88d5fda1..1d6fcd6272c 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1354,8 +1354,13 @@ __wt_page_can_evict( * the original parent page's index, because evicting an internal page * discards its WT_REF array, and a thread traversing the original * parent page index might see a freed WT_REF. + * + * One special case where we know this is safe is if the handle is + * locked exclusive (e.g., when the whole tree is being evicted). In + * that case, no readers can be looking at an old index. */ - if (WT_PAGE_IS_INTERNAL(page) && !__wt_split_obsolete( + if (!F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE) && + WT_PAGE_IS_INTERNAL(page) && !__wt_split_obsolete( session, page->pg_intl_split_gen)) return (false); -- cgit v1.2.1 From cb16839cfbdf338af95bed43ca40979ae6e32f54 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Mon, 10 Apr 2017 23:54:54 +1000 Subject: WT-3271 Prevent integer overflow in eviction tuning. 
(#3379) (cherry picked from: 8f371403f0ccfae0188d7e4c2e6d629ade697b13) --- src/evict/evict_lru.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 3ce35c60f2e..26bbf9f679b 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -934,9 +934,9 @@ __evict_tune_workers(WT_SESSION_IMPL *session) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - uint64_t cur_threads, delta_msec, delta_pages, i, target_threads; + uint64_t delta_msec, delta_pages; uint64_t pgs_evicted_cur, pgs_evicted_persec_cur, time_diff; - uint32_t thread_surplus; + int32_t cur_threads, i, target_threads, thread_surplus; conn = S2C(session); cache = conn->cache; @@ -967,8 +967,10 @@ __evict_tune_workers(WT_SESSION_IMPL *session) conn->evict_tune_workers_best = 0; /* Reduce the number of eviction workers to the minimum */ - thread_surplus = conn->evict_threads.current_threads - - conn->evict_threads_min; + thread_surplus = + (int32_t)conn->evict_threads.current_threads - + (int32_t)conn->evict_threads_min; + for (i = 0; i < thread_surplus; i++) { WT_ERR(__wt_thread_group_stop_one( session, &conn->evict_threads, false)); @@ -1026,18 +1028,18 @@ __evict_tune_workers(WT_SESSION_IMPL *session) * settle into a stable state. */ if (conn->evict_tune_num_points >= conn->evict_tune_datapts_needed) { - if ((conn->evict_tune_workers_best == - conn->evict_threads.current_threads) && - (conn->evict_threads.current_threads < - conn->evict_threads_max)) { + if (conn->evict_tune_workers_best == + conn->evict_threads.current_threads && + conn->evict_threads.current_threads < + conn->evict_threads_max) { /* * Keep adding workers. We will check again * at the next check point. 
*/ - conn->evict_tune_datapts_needed += - WT_MIN(EVICT_TUNE_DATAPT_MIN, - (conn->evict_threads_max - - conn->evict_threads.current_threads) / + conn->evict_tune_datapts_needed += WT_MIN( + EVICT_TUNE_DATAPT_MIN, + (conn->evict_threads_max - + conn->evict_threads.current_threads) / EVICT_TUNE_BATCH); } else { /* @@ -1046,8 +1048,8 @@ __evict_tune_workers(WT_SESSION_IMPL *session) * settle into a stable state. */ thread_surplus = - conn->evict_threads.current_threads - - conn->evict_tune_workers_best; + (int32_t)conn->evict_threads.current_threads - + (int32_t)conn->evict_tune_workers_best; for (i = 0; i < thread_surplus; i++) { /* @@ -1082,13 +1084,13 @@ __evict_tune_workers(WT_SESSION_IMPL *session) conn->evict_threads.current_threads) / EVICT_TUNE_BATCH); if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) { - cur_threads = conn->evict_threads.current_threads; + cur_threads = (int32_t)conn->evict_threads.current_threads; target_threads = WT_MIN(cur_threads + EVICT_TUNE_BATCH, - conn->evict_threads_max); + (int32_t)conn->evict_threads_max); /* * Start the new threads. */ - for (i = 0; i < (target_threads - cur_threads); ++i) { + for (i = cur_threads; i < target_threads; ++i) { /* * If we get an error, it should be because we were * unable to acquire the thread group lock. Break out -- cgit v1.2.1