From 2764dd76aebbf6b71b61bf574b01a8028526731d Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Mon, 23 Jan 2017 00:19:30 -0500 Subject: WT-3120 Fix ordering problem in connection_close for custom filesystem loaded via shared lib (#3239) Also add fail_fs extension, as well as a simple test for it. --- build_posix/Make.subdirs | 1 + dist/s_void | 5 + ext/test/fail_fs/Makefile.am | 9 + ext/test/fail_fs/fail_fs.c | 703 ++++++++++++++++++++++++++++++++++++++ src/conn/conn_handle.c | 11 +- src/conn/conn_open.c | 25 +- src/include/extern.h | 2 +- test/csuite/Makefile.am | 3 + test/csuite/wt3120_filesys/main.c | 98 ++++++ 9 files changed, 837 insertions(+), 20 deletions(-) create mode 100644 ext/test/fail_fs/Makefile.am create mode 100644 ext/test/fail_fs/fail_fs.c create mode 100644 test/csuite/wt3120_filesys/main.c diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs index 01f23dcbbc1..4ecec37ca6c 100644 --- a/build_posix/Make.subdirs +++ b/build_posix/Make.subdirs @@ -17,6 +17,7 @@ ext/encryptors/nop ext/encryptors/rotn ext/extractors/csv ext/test/kvs_bdb HAVE_BERKELEY_DB +ext/test/fail_fs . 
api/leveldb LEVELDB examples/c diff --git a/dist/s_void b/dist/s_void index 025f6d4c7eb..4a6b4ad91a2 100755 --- a/dist/s_void +++ b/dist/s_void @@ -78,6 +78,11 @@ func_ok() -e '/int demo_file_sync$/d' \ -e '/int demo_fs_directory_list_free$/d' \ -e '/int demo_fs_exist$/d' \ + -e '/int fail_file_lock$/d' \ + -e '/int fail_file_sync$/d' \ + -e '/int fail_fs_directory_list_free$/d' \ + -e '/int fail_fs_exist$/d' \ + -e '/int fail_fs_terminate$/d' \ -e '/int handle_message$/d' \ -e '/int handle_progress$/d' \ -e '/int helium_cursor_reset$/d' \ diff --git a/ext/test/fail_fs/Makefile.am b/ext/test/fail_fs/Makefile.am new file mode 100644 index 00000000000..f31f5395cd1 --- /dev/null +++ b/ext/test/fail_fs/Makefile.am @@ -0,0 +1,9 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include + +noinst_LTLIBRARIES = libwiredtiger_fail_fs.la +libwiredtiger_fail_fs_la_SOURCES = fail_fs.c + +# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well +# as installation, it will only build static libraries. As far as I can tell, +# the "approved" libtool way to turn them back on is by adding -rpath. +libwiredtiger_fail_fs_la_LDFLAGS = -avoid-version -module -rpath /nowhere diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c new file mode 100644 index 00000000000..e2538023a2c --- /dev/null +++ b/ext/test/fail_fs/fail_fs.c @@ -0,0 +1,703 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. 
We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "queue.h" + +#define FAIL_FS_GIGABYTE (1024 * 1024 * 1024) + +/* + * A "fail file system", that is, a file system extension that fails when we + * want it to. This is only used in test frameworks, this fact allows us + * to simplify some error paths. + */ +typedef struct { + WT_FILE_SYSTEM iface; + /* + * WiredTiger performs schema and I/O operations in parallel, all file + * system and file handle access must be thread-safe. This extension + * uses a single, global file system lock. + */ + pthread_rwlock_t lock; /* Lock */ + int64_t read_ops; + int64_t write_ops; + int64_t allow_reads; + int64_t allow_writes; + /* Queue of file handles */ + TAILQ_HEAD(fail_file_handle_qh, fail_file_handle) fileq; + WT_EXTENSION_API *wtext; /* Extension functions */ +} FAIL_FILE_SYSTEM; + +typedef struct fail_file_handle { + WT_FILE_HANDLE iface; + + /* + * Track the system file descriptor for each file. 
+ */ + FAIL_FILE_SYSTEM *fail_fs; /* Enclosing file system */ + TAILQ_ENTRY(fail_file_handle) q; /* Queue of handles */ + int fd; /* System file descriptor */ +} FAIL_FILE_HANDLE; + +static int fail_file_close(WT_FILE_HANDLE *, WT_SESSION *); +static void fail_file_handle_remove(WT_SESSION *, FAIL_FILE_HANDLE *); +static int fail_file_lock(WT_FILE_HANDLE *, WT_SESSION *, bool); +static int fail_file_read( + WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, void *); +static int fail_file_size( + WT_FILE_HANDLE *, WT_SESSION *, wt_off_t *); +static int fail_file_sync(WT_FILE_HANDLE *, WT_SESSION *); +static int fail_file_truncate(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t); +static int fail_file_write( + WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, const void *); +static bool fail_fs_arg( + const char *match, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, + int64_t *argp); +static int fail_fs_directory_list(WT_FILE_SYSTEM *, WT_SESSION *, + const char *, const char *, char ***, uint32_t *); +static int fail_fs_directory_list_free( + WT_FILE_SYSTEM *, WT_SESSION *, char **, uint32_t); +static int fail_fs_exist(WT_FILE_SYSTEM *, WT_SESSION *, const char *, bool *); +static int fail_fs_open(WT_FILE_SYSTEM *, WT_SESSION *, + const char *, WT_FS_OPEN_FILE_TYPE, uint32_t, WT_FILE_HANDLE **); +static int fail_fs_remove( + WT_FILE_SYSTEM *, WT_SESSION *, const char *, uint32_t); +static int fail_fs_rename( + WT_FILE_SYSTEM *, WT_SESSION *, const char *, const char *, uint32_t); +static int fail_fs_size( + WT_FILE_SYSTEM *, WT_SESSION *, const char *, wt_off_t *); +static int fail_fs_terminate(WT_FILE_SYSTEM *, WT_SESSION *); + +/* + * We use pthread functions for portable locking. + * Assert on errors for simplicity. 
+ */ +static void +fail_fs_allocate_lock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_init(lockp, NULL) == 0); +} + +static void +fail_fs_destroy_lock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_destroy(lockp) == 0); +} + +static void +fail_fs_lock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_wrlock(lockp) == 0); +} + +static void +fail_fs_unlock(pthread_rwlock_t *lockp) +{ + assert(pthread_rwlock_unlock(lockp) == 0); +} + +/* + * fail_file_close -- + * ANSI C close. + */ +static int +fail_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) +{ + FAIL_FILE_HANDLE *fail_fh; + int ret; + + (void)session; /* Unused */ + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + + if (fail_fh->fd < 0) + return (EINVAL); + ret = close(fail_fh->fd); + fail_fh->fd = -1; + fail_file_handle_remove(session, fail_fh); + return (ret); +} + +/* + * fail_file_handle_remove -- + * Destroy an in-memory file handle. Should only happen on remove or + * shutdown. + */ +static void +fail_file_handle_remove(WT_SESSION *session, FAIL_FILE_HANDLE *fail_fh) +{ + FAIL_FILE_SYSTEM *fail_fs; + + (void)session; /* Unused */ + fail_fs = fail_fh->fail_fs; + + TAILQ_REMOVE(&fail_fs->fileq, fail_fh, q); + + free(fail_fh->iface.name); + free(fail_fh); +} + +/* + * fail_file_lock -- + * Lock/unlock a file. + */ +static int +fail_file_lock(WT_FILE_HANDLE *file_handle, WT_SESSION *session, bool lock) +{ + /* Locks are always granted. */ + (void)file_handle; /* Unused */ + (void)session; /* Unused */ + (void)lock; /* Unused */ + + return (0); +} + +/* + * fail_file_read -- + * POSIX pread. 
+ */ +static int +fail_file_read(WT_FILE_HANDLE *file_handle, + WT_SESSION *session, wt_off_t offset, size_t len, void *buf) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; + int64_t read_ops; + int ret; + size_t chunk; + ssize_t nr; + uint8_t *addr; + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + fail_fs = fail_fh->fail_fs; + wtext = fail_fs->wtext; + ret = 0; + + fail_fs_lock(&fail_fs->lock); + read_ops = ++fail_fs->read_ops; + fail_fs_unlock(&fail_fs->lock); + + if (fail_fs->allow_reads != 0 && read_ops % fail_fs->allow_reads == 0) { + (void)wtext->msg_printf(wtext, session, + "fail_fs: %s: simulated failure after %" PRId64 + " reads\n", fail_fh->iface.name, read_ops); + return (EIO); + } + + for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { + chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; + if ((nr = pread(fail_fh->fd, addr, chunk, offset)) <= 0) { + (void)wtext->err_printf(wtext, session, + "%s: handle-read: failed to read %" PRIu64 + " bytes at offset %" PRIu64 ": %s", + fail_fh->iface.name, (uint64_t)len, + (uint64_t)offset, wtext->strerror(wtext, NULL, nr)); + ret = (nr == 0 ? WT_ERROR : errno); + break; + } + } + return (ret); +} + +/* + * fail_file_size -- + * Get the size of a file in bytes, by file handle. + */ +static int +fail_file_size( + WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t *sizep) +{ + FAIL_FILE_HANDLE *fail_fh; + struct stat statbuf; + int ret; + + (void)session; /* Unused */ + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + ret = 0; + + if ((ret = fstat(fail_fh->fd, &statbuf)) != 0) + return (ret); + *sizep = statbuf.st_size; + return (0); +} + +/* + * fail_file_sync -- + * Ensure the content of the file is stable. This is a no-op in our + * memory backed file system. 
+ */ +static int +fail_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *session) +{ + (void)file_handle; /* Unused */ + (void)session; /* Unused */ + + return (0); +} + +/* + * fail_file_truncate -- + * POSIX ftruncate. + */ +static int +fail_file_truncate( + WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t offset) +{ + FAIL_FILE_HANDLE *fail_fh; + + (void)session; /* Unused */ + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + return (ftruncate(fail_fh->fd, offset)); +} + +/* + * fail_file_write -- + * POSIX pwrite. + */ +static int +fail_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session, + wt_off_t offset, size_t len, const void *buf) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; + int64_t write_ops; + int ret; + size_t chunk; + ssize_t nr; + const uint8_t *addr; + + fail_fh = (FAIL_FILE_HANDLE *)file_handle; + fail_fs = fail_fh->fail_fs; + wtext = fail_fs->wtext; + ret = 0; + + fail_fs_lock(&fail_fs->lock); + write_ops = ++fail_fs->write_ops; + fail_fs_unlock(&fail_fs->lock); + + if (fail_fs->allow_writes != 0 && + write_ops % fail_fs->allow_writes == 0) { + (void)wtext->msg_printf(wtext, session, + "fail_fs: %s: simulated failure after %" PRId64 + " writes\n", fail_fh->iface.name, write_ops); + return (EIO); + } + + /* Break writes larger than 1GB into 1GB chunks. */ + for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { + chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; + if ((nr = pwrite(fail_fh->fd, addr, chunk, offset)) <= 0) { + (void)wtext->err_printf(wtext, session, + "%s: handle-write: failed to write %" PRIu64 + " bytes at offset %" PRIu64 ": %s", + fail_fh->iface.name, (uint64_t)len, + (uint64_t)offset, wtext->strerror(wtext, NULL, nr)); + ret = (nr == 0 ? WT_ERROR : errno); + break; + } + } + return (ret); +} + +/* + * fail_fs_arg -- + * If the key matches, return the value interpreted as an integer. 
+ */ +static bool +fail_fs_arg(const char *match, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, + int64_t *argp) +{ + char *s; + int64_t result; + + if (strncmp(match, key->str, key->len) == 0 && + match[key->len] == '\0') { + s = (char *)value->str; + result = strtoll(s, &s, 10); + if ((size_t)(s - (char *)value->str) == value->len) { + *argp = result; + return (true); + } + } + return (false); +} + +/* + * fail_fs_directory_list -- + * Return a list of files in a given sub-directory. + */ +static int +fail_fs_directory_list(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *directory, + const char *prefix, char ***dirlistp, uint32_t *countp) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + size_t len, prefix_len; + uint32_t allocated, count; + int ret; + char *name, **entries; + + (void)session; /* Unused */ + + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + ret = 0; + *dirlistp = NULL; + *countp = 0; + + entries = NULL; + allocated = count = 0; + len = strlen(directory); + prefix_len = prefix == NULL ? 0 : strlen(prefix); + + fail_fs_lock(&fail_fs->lock); + TAILQ_FOREACH(fail_fh, &fail_fs->fileq, q) { + name = fail_fh->iface.name; + if (strncmp(name, directory, len) != 0 || + (prefix != NULL && strncmp(name, prefix, prefix_len) != 0)) + continue; + + /* + * Increase the list size in groups of 10, it doesn't + * matter if the list is a bit longer than necessary. 
+ */ + if (count >= allocated) { + entries = realloc( + entries, (allocated + 10) * sizeof(char *)); + if (entries == NULL) { + ret = ENOMEM; + goto err; + } + memset(entries + allocated * sizeof(char *), + 0, 10 * sizeof(char *)); + allocated += 10; + } + entries[count++] = strdup(name); + } + + *dirlistp = entries; + *countp = count; + +err: fail_fs_unlock(&fail_fs->lock); + if (ret == 0) + return (0); + + if (entries != NULL) { + while (count > 0) + free(entries[--count]); + free(entries); + } + + return (ret); +} + +/* + * fail_fs_directory_list_free -- + * Free memory allocated by fail_fs_directory_list. + */ +static int +fail_fs_directory_list_free(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, char **dirlist, uint32_t count) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + + if (dirlist != NULL) { + while (count > 0) + free(dirlist[--count]); + free(dirlist); + } + return (0); +} + +/* + * fail_fs_exist -- + * Return if the file exists. + */ +static int +fail_fs_exist(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *name, bool *existp) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + + *existp = (access(name, 0) == 0); + return (0); +} + +/* + * fail_fs_open -- + * fopen for the fail file system. 
+ */ +static int +fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, + const char *name, WT_FS_OPEN_FILE_TYPE file_type, uint32_t flags, + WT_FILE_HANDLE **file_handlep) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + WT_FILE_HANDLE *file_handle; + int open_flags; + int ret; + + (void)file_type; /* Unused */ + (void)session; /* Unused */ + + *file_handlep = NULL; + ret = 0; + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + fail_fh = NULL; + + fail_fs_lock(&fail_fs->lock); + + open_flags = 0; + if ((flags & WT_FS_OPEN_CREATE) != 0) + open_flags |= O_CREAT; + if ((flags & WT_FS_OPEN_EXCLUSIVE) != 0) + open_flags |= O_EXCL; + if ((flags & WT_FS_OPEN_READONLY) != 0) + open_flags |= O_RDONLY; + else + open_flags |= O_RDWR; + + if ((ret = open(name, open_flags, 0666)) < 0) + goto err; + + /* We create a handle structure for each open. */ + if ((fail_fh = calloc(1, sizeof(FAIL_FILE_HANDLE))) == NULL) { + ret = ENOMEM; + goto err; + } + + /* Initialize private information. */ + fail_fh->fail_fs = fail_fs; + fail_fh->fd = ret; + ret = 0; + + /* Initialize public information. */ + file_handle = (WT_FILE_HANDLE *)fail_fh; + if ((file_handle->name = strdup(name)) == NULL) { + ret = ENOMEM; + goto err; + } + + /* Setup the function call table. 
*/ + file_handle->close = fail_file_close; + file_handle->fh_advise = NULL; + file_handle->fh_extend = NULL; + file_handle->fh_extend_nolock = NULL; + file_handle->fh_lock = fail_file_lock; + file_handle->fh_map = NULL; + file_handle->fh_map_discard = NULL; + file_handle->fh_map_preload = NULL; + file_handle->fh_unmap = NULL; + file_handle->fh_read = fail_file_read; + file_handle->fh_size = fail_file_size; + file_handle->fh_sync = fail_file_sync; + file_handle->fh_sync_nowait = NULL; + file_handle->fh_truncate = fail_file_truncate; + file_handle->fh_write = fail_file_write; + + TAILQ_INSERT_HEAD(&fail_fs->fileq, fail_fh, q); + + *file_handlep = file_handle; + + if (0) { +err: free(fail_fh); + } + + fail_fs_unlock(&fail_fs->lock); + return (ret); +} + +/* + * fail_fs_remove -- + * POSIX remove. + */ +static int +fail_fs_remove(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *name, uint32_t flags) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + (void)flags; /* Unused */ + + return (unlink(name)); +} + +/* + * fail_fs_rename -- + * POSIX rename. + */ +static int +fail_fs_rename(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *from, const char *to, uint32_t flags) +{ + (void)file_system; /* Unused */ + (void)session; /* Unused */ + (void)flags; /* Unused */ + + return (rename(from, to)); +} + +/* + * fail_fs_size -- + * Get the size of a file in bytes, by file name. 
+ */ +static int +fail_fs_size(WT_FILE_SYSTEM *file_system, + WT_SESSION *session, const char *name, wt_off_t *sizep) +{ + struct stat statbuf; + int ret; + + (void)file_system; /* Unused */ + (void)session; /* Unused */ + + ret = 0; + if ((ret = stat(name, &statbuf)) != 0) + return (ret); + *sizep = statbuf.st_size; + return (0); +} + +/* + * fail_fs_terminate -- + * Discard any resources on termination + */ +static int +fail_fs_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *session) +{ + FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; + + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + + while ((fail_fh = TAILQ_FIRST(&fail_fs->fileq)) != NULL) + fail_file_handle_remove(session, fail_fh); + + fail_fs_destroy_lock(&fail_fs->lock); + free(fail_fs); + + return (0); +} + +/* + * wiredtiger_extension_init -- + * WiredTiger fail filesystem extension. + */ +int +wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config) +{ + FAIL_FILE_SYSTEM *fail_fs; + WT_CONFIG_ITEM k, v; + WT_CONFIG_PARSER *config_parser; + WT_EXTENSION_API *wtext; + WT_FILE_SYSTEM *file_system; + int ret; + + ret = 0; + wtext = conn->get_extension_api(conn); + if ((fail_fs = calloc(1, sizeof(FAIL_FILE_SYSTEM))) == NULL) { + (void)wtext->err_printf(wtext, NULL, + "fail_file_system extension_init: %s", + wtext->strerror(wtext, NULL, ENOMEM)); + return (ENOMEM); + } + fail_fs->wtext = wtext; + file_system = (WT_FILE_SYSTEM *)fail_fs; + + /* Get any configuration values. 
*/ + if ((ret = wtext->config_parser_open_arg( + wtext, NULL, config, &config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_EXTENSION_API.config_parser_open: config: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + while ((ret = config_parser->next(config_parser, &k, &v)) == 0) { + if (fail_fs_arg("allow_writes", &k, &v, &fail_fs->allow_writes)) + continue; + if (fail_fs_arg("allow_reads", &k, &v, &fail_fs->allow_reads)) + continue; + + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: unexpected configuration " + "information: %.*s=%.*s: %s", + (int)k.len, k.str, (int)v.len, v.str, + wtext->strerror(wtext, NULL, ret)); + goto err; + } + if (ret != WT_NOTFOUND) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.next: config: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + if ((ret = config_parser->close(config_parser)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONFIG_PARSER.close: config: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + + fail_fs_allocate_lock(&fail_fs->lock); + /* Initialize the in-memory jump table. 
*/ + file_system->fs_directory_list = fail_fs_directory_list; + file_system->fs_directory_list_free = fail_fs_directory_list_free; + file_system->fs_exist = fail_fs_exist; + file_system->fs_open_file = fail_fs_open; + file_system->fs_remove = fail_fs_remove; + file_system->fs_rename = fail_fs_rename; + file_system->fs_size = fail_fs_size; + file_system->terminate = fail_fs_terminate; + if ((ret = conn->set_file_system(conn, file_system, NULL)) != 0) { + (void)wtext->err_printf(wtext, NULL, + "WT_CONNECTION.set_file_system: %s", + wtext->strerror(wtext, NULL, ret)); + goto err; + } + return (0); + +err: free(fail_fs); + return (ret); +} diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 3f7fc9bb2a7..7203b75e4ae 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -109,16 +109,15 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) * __wt_connection_destroy -- * Destroy the connection's underlying WT_CONNECTION_IMPL structure. */ -int +void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) { - WT_DECL_RET; WT_SESSION_IMPL *session; u_int i; /* Check there's something to destroy. */ if (conn == NULL) - return (0); + return; session = conn->default_session; @@ -149,11 +148,6 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->page_lock[i]); __wt_free(session, conn->page_lock); - /* Destroy the file-system configuration. */ - if (conn->file_system != NULL && conn->file_system->terminate != NULL) - WT_TRET(conn->file_system->terminate( - conn->file_system, (WT_SESSION *)session)); - /* Free allocated memory. 
*/ __wt_free(session, conn->cfg); __wt_free(session, conn->home); @@ -162,5 +156,4 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_stat_connection_discard(session, conn); __wt_free(NULL, conn); - return (ret); } diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index d4ace127bb2..f8029f2c728 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -159,15 +159,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* Discard transaction state. */ __wt_txn_global_destroy(session); - /* Close extensions, first calling any unload entry point. */ - while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { - TAILQ_REMOVE(&conn->dlhqh, dlh, q); - - if (dlh->terminate != NULL) - WT_TRET(dlh->terminate(wt_conn)); - WT_TRET(__wt_dlclose(session, dlh)); - } - /* Close the lock file, opening up the database to other connections. */ if (conn->lock_fh != NULL) WT_TRET(__wt_close(session, &conn->lock_fh)); @@ -199,8 +190,22 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) __wt_free(session, s->hazard); } + /* Destroy the file-system configuration. */ + if (conn->file_system != NULL && conn->file_system->terminate != NULL) + WT_TRET(conn->file_system->terminate( + conn->file_system, (WT_SESSION *)session)); + + /* Close extensions, first calling any unload entry point. */ + while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { + TAILQ_REMOVE(&conn->dlhqh, dlh, q); + + if (dlh->terminate != NULL) + WT_TRET(dlh->terminate(wt_conn)); + WT_TRET(__wt_dlclose(session, dlh)); + } + /* Destroy the handle. 
*/ - WT_TRET(__wt_connection_destroy(conn)); + __wt_connection_destroy(conn); return (ret); } diff --git a/src/include/extern.h b/src/include/extern.h index 566eb386c29..16b3c916b24 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -262,7 +262,7 @@ extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *ur extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index a96492c1e71..bcdbf120d67 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -40,6 +40,9 @@ noinst_PROGRAMS += test_wt2853_perf test_wt2999_join_extractor_SOURCES = wt2999_join_extractor/main.c noinst_PROGRAMS += test_wt2999_join_extractor +test_wt3120_filesys_SOURCES = 
wt3120_filesys/main.c +noinst_PROGRAMS += test_wt3120_filesys + # Run this during a "make check" smoke test. TESTS = $(noinst_PROGRAMS) LOG_COMPILER = $(TEST_WRAPPER) diff --git a/test/csuite/wt3120_filesys/main.c b/test/csuite/wt3120_filesys/main.c new file mode 100644 index 00000000000..abf660db046 --- /dev/null +++ b/test/csuite/wt3120_filesys/main.c @@ -0,0 +1,98 @@ +/*- + * Public Domain 2014-2017 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +/* + * JIRA ticket reference: WT-3120 + * Test case description: A simple file system extension built into + * a shared library. + * Failure mode: Loading the file system and closing the connection + * is enough to evoke the failure. 
This test does slightly more + * than that. + */ + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_CURSOR *cursor; + WT_SESSION *session; + char *kstr, *vstr; + char buf[100]; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + snprintf(buf, sizeof(buf), + "create,extensions=" + "[\"../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so\"]"); + testutil_check(wiredtiger_open(opts->home, NULL, buf, &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + testutil_check(session->create(session, opts->uri, + "key_format=S,value_format=S")); + + testutil_check(session->open_cursor(session, opts->uri, NULL, NULL, + &cursor)); + cursor->set_key(cursor, "a"); + cursor->set_value(cursor, "0"); + testutil_check(cursor->insert(cursor)); + cursor->set_key(cursor, "b"); + cursor->set_value(cursor, "1"); + testutil_check(cursor->insert(cursor)); + testutil_check(cursor->close(cursor)); + testutil_check(session->close(session, NULL)); + + /* Force to disk and re-open. 
*/ + testutil_check(opts->conn->close(opts->conn, NULL)); + testutil_check(wiredtiger_open(opts->home, NULL, NULL, &opts->conn)); + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + testutil_check(session->open_cursor(session, opts->uri, NULL, NULL, + &cursor)); + testutil_check(cursor->next(cursor)); + cursor->get_key(cursor, &kstr); + cursor->get_value(cursor, &vstr); + testutil_assert(strcmp(kstr, "a") == 0); + testutil_assert(strcmp(vstr, "0") == 0); + testutil_check(cursor->next(cursor)); + cursor->get_key(cursor, &kstr); + cursor->get_value(cursor, &vstr); + testutil_assert(strcmp(kstr, "b") == 0); + testutil_assert(strcmp(vstr, "1") == 0); + testutil_assert(cursor->next(cursor) == WT_NOTFOUND); + testutil_check(cursor->close(cursor)); + testutil_check(session->close(session, NULL)); + printf("Success\n"); + + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} -- cgit v1.2.1 From 52171b4c668528c80d1e2084183899f294d4c797 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 23 Jan 2017 00:51:14 -0500 Subject: WT-3144 Print WT_REF instead of WT_REF.page in verbose/debugging output. 
(#3258) --- src/btree/bt_debug.c | 2 +- src/btree/bt_split.c | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index b62125e069d..a89eca230fd 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -652,7 +652,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) page = ref->page; mod = page->modify; - WT_RET(ds->f(ds, "%p", (void *)page)); + WT_RET(ds->f(ds, "%p", (void *)ref)); switch (page->type) { case WT_PAGE_COL_INT: diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 6b0b8a08c02..7cfcd08f931 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -2086,8 +2086,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref); WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard)); if ((ret = __split_insert(session, ref)) != 0) { @@ -2178,8 +2177,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref); WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); if ((ret = __split_multi(session, ref, closing)) != 0 || closing) { @@ -2207,8 +2205,7 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref); WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); ret = __split_parent(session, ref, NULL, 0, 0, false, true); @@ -2229,8 +2226,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) page = ref->page; - __wt_verbose( - session, 
WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref); /* * This isn't a split: a reconciliation failed because we couldn't write -- cgit v1.2.1 From 5e6ffcc7ef98a609e4bbc0ecfef58dade45de1d7 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 23 Jan 2017 00:53:47 -0500 Subject: WT-3144 Make it less likely for random lookups to return WT_NOTFOUND (#3259) There may be empty pages in the tree, and they're useless to us when trying to find random samples. If we don't find a non-empty page in "entries" random guesses, take the first non-empty page in the tree. If the search page contains nothing other than empty pages, restart from the root some number of times before giving up. --- src/btree/row_srch.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index aa299a161da..5b3f1195784 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -792,9 +792,11 @@ __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *current, *descent; + uint32_t i, entries, retry; btree = S2BT(session); current = NULL; + retry = 100; if (0) { restart: /* @@ -812,8 +814,32 @@ restart: /* break; WT_INTL_INDEX_GET(session, page, pindex); - descent = pindex->index[ - __wt_random(&session->rnd) % pindex->entries]; + entries = pindex->entries; + + /* + * There may be empty pages in the tree, and they're useless to + * us. If we don't find a non-empty page in "entries" random + * guesses, take the first non-empty page in the tree. If the + * search page contains nothing other than empty pages, restart + * from the root some number of times before giving up. 
+ */ + for (i = 0; i < entries; ++i) { + descent = + pindex->index[__wt_random(&session->rnd) % entries]; + if (descent->state != WT_REF_DELETED) + break; + } + if (i == entries) + for (i = 0; i < entries; ++i) { + descent = pindex->index[i]; + if (descent->state != WT_REF_DELETED) + break; + } + if (i == entries) { + if (--retry > 0) + goto restart; + return (WT_NOTFOUND); + } /* * Swap the current page for the child page. If the page splits -- cgit v1.2.1 From f214daa45a860021f107c498ddfd1328b6b3f517 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 23 Jan 2017 07:49:41 -0500 Subject: WT-3144 bug fix: random cursor returns not-found when descending to an empty page. clang 3.8 complains descent might be left uninitialized in some case. I don't think that's possible, but it's a simple change. --- src/btree/row_srch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 5b3f1195784..1c3d5ad5daa 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -823,6 +823,7 @@ restart: /* * search page contains nothing other than empty pages, restart * from the root some number of times before giving up. */ + descent = NULL; for (i = 0; i < entries; ++i) { descent = pindex->index[__wt_random(&session->rnd) % entries]; @@ -835,7 +836,7 @@ restart: /* if (descent->state != WT_REF_DELETED) break; } - if (i == entries) { + if (i == entries || descent == NULL) { if (--retry > 0) goto restart; return (WT_NOTFOUND); -- cgit v1.2.1 From b2ab33d476c657120c56ed31aa05f54557f010e0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 23 Jan 2017 11:34:06 -0500 Subject: WT-3120 Fix ordering problem in connection_close for filesystem loaded in an extension (#3261) This commit represents fixes for Coverity errors, LeakSanitizer errors, and additional cleanup: * pread/pwrite return value is -1 on error, but the error is in errno. * Convert size_t and off_t to uintmax_t/PRIuMAX, not uint64_t/PRIu64. 
* Coverity ID 1369085 (#1 of 1): Extra sizeof expression (SIZEOF_MISMATCH) suspicious_pointer_arithmetic: Adding allocated * 8UL /* sizeof (char *) */ to pointer entries of type char ** is suspicious because adding an integral value to this pointer automatically scales that value by the size, 8 bytes, of the pointed-to type, char *. Most likely, the multiplication by sizeof (char *) in this expression is extraneous and should be eliminated. * CID 1369084 (#1 of 1): Resource leak (RESOURCE_LEAK) 9. overwrite_var: Overwriting handle ret in ret = 12 leaks the handle. * CID 1369083 (#1 of 1): Logically dead code (DEADCODE) dead_error_line: Execution cannot reach this statement: while (count > 0U) null: At condition entries != NULL, the value of entries must be NULL. dead_error_condition: The condition entries != NULL cannot be true. * Custom filesystems have to configure early-load, otherwise we'll have already configured a default filesystem by the time the extension is loaded. * Add early-load configuration to the wt3120_filesys test. * Add code to WiredTiger that fails if a custom filesystem is configured after we've already configured a default filesystem. --- examples/c/ex_file_system.c | 13 ++++++---- ext/test/fail_fs/fail_fs.c | 50 +++++++++++++++++++++++---------------- src/conn/conn_api.c | 10 ++++++++ test/csuite/wt3120_filesys/main.c | 7 +++--- 4 files changed, 51 insertions(+), 29 deletions(-) diff --git a/examples/c/ex_file_system.c b/examples/c/ex_file_system.c index 56869171558..e807ac54d3b 100644 --- a/examples/c/ex_file_system.c +++ b/examples/c/ex_file_system.c @@ -399,6 +399,7 @@ demo_fs_directory_list(WT_FILE_SYSTEM *file_system, uint32_t allocated, count; int ret = 0; char *name, **entries; + void *p; (void)session; /* Unused */ @@ -424,14 +425,16 @@ demo_fs_directory_list(WT_FILE_SYSTEM *file_system, * matter if the list is a bit longer than necessary. 
*/ if (count >= allocated) { - entries = realloc( - entries, (allocated + 10) * sizeof(char *)); - if (entries == NULL) { + p = realloc( + entries, (allocated + 10) * sizeof(*entries)); + if (p == NULL) { ret = ENOMEM; goto err; } - memset(entries + allocated * sizeof(char *), - 0, 10 * sizeof(char *)); + + entries = p; + memset(entries + allocated * sizeof(*entries), + 0, 10 * sizeof(*entries)); allocated += 10; } entries[count++] = strdup(name); diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index e2538023a2c..29d469768c5 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -224,10 +224,11 @@ fail_file_read(WT_FILE_HANDLE *file_handle, chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; if ((nr = pread(fail_fh->fd, addr, chunk, offset)) <= 0) { (void)wtext->err_printf(wtext, session, - "%s: handle-read: failed to read %" PRIu64 - " bytes at offset %" PRIu64 ": %s", - fail_fh->iface.name, (uint64_t)len, - (uint64_t)offset, wtext->strerror(wtext, NULL, nr)); + "%s: handle-read: failed to read %" PRIuMAX + " bytes at offset %" PRIuMAX ": %s", + fail_fh->iface.name, + (uintmax_t)len, (uintmax_t)offset, + wtext->strerror(wtext, NULL, errno)); ret = (nr == 0 ? WT_ERROR : errno); break; } @@ -327,10 +328,11 @@ fail_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session, chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; if ((nr = pwrite(fail_fh->fd, addr, chunk, offset)) <= 0) { (void)wtext->err_printf(wtext, session, - "%s: handle-write: failed to write %" PRIu64 - " bytes at offset %" PRIu64 ": %s", - fail_fh->iface.name, (uint64_t)len, - (uint64_t)offset, wtext->strerror(wtext, NULL, nr)); + "%s: handle-write: failed to write %" PRIuMAX + " bytes at offset %" PRIuMAX ": %s", + fail_fh->iface.name, + (uintmax_t)len, (uintmax_t)offset, + wtext->strerror(wtext, NULL, errno)); ret = (nr == 0 ? 
WT_ERROR : errno); break; } @@ -376,6 +378,7 @@ fail_fs_directory_list(WT_FILE_SYSTEM *file_system, uint32_t allocated, count; int ret; char *name, **entries; + void *p; (void)session; /* Unused */ @@ -401,14 +404,15 @@ fail_fs_directory_list(WT_FILE_SYSTEM *file_system, * matter if the list is a bit longer than necessary. */ if (count >= allocated) { - entries = realloc( - entries, (allocated + 10) * sizeof(char *)); - if (entries == NULL) { + p = realloc( + entries, (allocated + 10) * sizeof(*entries)); + if (p == NULL) { ret = ENOMEM; goto err; } - memset(entries + allocated * sizeof(char *), - 0, 10 * sizeof(char *)); + entries = p; + memset(entries + allocated * sizeof(*entries), + 0, 10 * sizeof(*entries)); allocated += 10; } entries[count++] = strdup(name); @@ -476,16 +480,17 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, FAIL_FILE_HANDLE *fail_fh; FAIL_FILE_SYSTEM *fail_fs; WT_FILE_HANDLE *file_handle; - int open_flags; - int ret; + int fd, open_flags, ret; (void)file_type; /* Unused */ (void)session; /* Unused */ *file_handlep = NULL; - ret = 0; - fail_fs = (FAIL_FILE_SYSTEM *)file_system; + fail_fh = NULL; + fail_fs = (FAIL_FILE_SYSTEM *)file_system; + fd = -1; + ret = 0; fail_fs_lock(&fail_fs->lock); @@ -499,8 +504,10 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, else open_flags |= O_RDWR; - if ((ret = open(name, open_flags, 0666)) < 0) + if ((fd = open(name, open_flags, 0666)) < 0) { + ret = errno; goto err; + } /* We create a handle structure for each open. */ if ((fail_fh = calloc(1, sizeof(FAIL_FILE_HANDLE))) == NULL) { @@ -510,8 +517,7 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, /* Initialize private information. */ fail_fh->fail_fs = fail_fs; - fail_fh->fd = ret; - ret = 0; + fail_fh->fd = fd; /* Initialize public information. 
*/ file_handle = (WT_FILE_HANDLE *)fail_fh; @@ -542,7 +548,9 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, *file_handlep = file_handle; if (0) { -err: free(fail_fh); +err: if (fd != -1) + (void)close(fd); + free(fail_fh); } fail_fs_unlock(&fail_fs->lock); diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index f691a76b1f2..d76e08067b5 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -1987,6 +1987,16 @@ __conn_set_file_system( CONNECTION_API_CALL(conn, session, set_file_system, config, cfg); WT_UNUSED(cfg); + /* + * You can only configure a file system once, and attempting to do it + * again probably means the extension argument didn't have early-load + * set and we've already configured the default file system. + */ + if (conn->file_system != NULL) + WT_ERR_MSG(session, EPERM, + "filesystem already configured; custom filesystems should " + "enable \"early_load\" configuration"); + conn->file_system = file_system; err: API_END_RET(session, ret); diff --git a/test/csuite/wt3120_filesys/main.c b/test/csuite/wt3120_filesys/main.c index abf660db046..a4b830d6a70 100644 --- a/test/csuite/wt3120_filesys/main.c +++ b/test/csuite/wt3120_filesys/main.c @@ -36,6 +36,8 @@ * than that. 
*/ +#define WT_FAIL_FS_LIB "../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so" + int main(int argc, char *argv[]) { @@ -43,7 +45,7 @@ main(int argc, char *argv[]) WT_CURSOR *cursor; WT_SESSION *session; char *kstr, *vstr; - char buf[100]; + char buf[1024]; opts = &_opts; memset(opts, 0, sizeof(*opts)); @@ -51,8 +53,7 @@ main(int argc, char *argv[]) testutil_make_work_dir(opts->home); snprintf(buf, sizeof(buf), - "create,extensions=" - "[\"../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so\"]"); + "create,extensions=(" WT_FAIL_FS_LIB "=(early_load=true))"); testutil_check(wiredtiger_open(opts->home, NULL, buf, &opts->conn)); testutil_check( opts->conn->open_session(opts->conn, NULL, NULL, &session)); -- cgit v1.2.1 From d7dc59045b87a37f029c0046082489af557c7018 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Mon, 23 Jan 2017 17:49:50 -0500 Subject: WT-2790 Fix a text case false positive in test_sweep01. (#3263) --- test/suite/test_sweep01.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py index 71f8fcb180e..5559190caca 100644 --- a/test/suite/test_sweep01.py +++ b/test/suite/test_sweep01.py @@ -116,10 +116,15 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): # Give slow machines time to process files. stat_cursor = self.session.open_cursor('statistics:', None, None) this_nfile = stat_cursor[stat.conn.file_open][2] + removed = stat_cursor[stat.conn.dh_sweep_remove][2] stat_cursor.close() self.pr("==== loop " + str(sleep)) self.pr("this_nfile " + str(this_nfile)) - if this_nfile == final_nfile: + self.pr("removed " + str(removed)) + # On slow machines there can be a lag where files get closed but + # the sweep server cannot yet remove the handles. So wait for the + # removed statistic to indicate forward progress too. 
+ if this_nfile == final_nfile and removed != remove1: break c.close() self.pr("Sweep loop took " + str(sleep)) -- cgit v1.2.1 From 75345eabdf5e54aa56fa51134fc53d5ae75aa7d8 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 23 Jan 2017 18:05:36 -0500 Subject: WT-3120 Add error handling to get_key/get_value in a test (#3262) --- test/csuite/wt3120_filesys/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/csuite/wt3120_filesys/main.c b/test/csuite/wt3120_filesys/main.c index a4b830d6a70..09dce624066 100644 --- a/test/csuite/wt3120_filesys/main.c +++ b/test/csuite/wt3120_filesys/main.c @@ -80,13 +80,13 @@ main(int argc, char *argv[]) testutil_check(session->open_cursor(session, opts->uri, NULL, NULL, &cursor)); testutil_check(cursor->next(cursor)); - cursor->get_key(cursor, &kstr); - cursor->get_value(cursor, &vstr); + testutil_check(cursor->get_key(cursor, &kstr)); + testutil_check(cursor->get_value(cursor, &vstr)); testutil_assert(strcmp(kstr, "a") == 0); testutil_assert(strcmp(vstr, "0") == 0); testutil_check(cursor->next(cursor)); - cursor->get_key(cursor, &kstr); - cursor->get_value(cursor, &vstr); + testutil_check(cursor->get_key(cursor, &kstr)); + testutil_check(cursor->get_value(cursor, &vstr)); testutil_assert(strcmp(kstr, "b") == 0); testutil_assert(strcmp(vstr, "1") == 0); testutil_assert(cursor->next(cursor) == WT_NOTFOUND); -- cgit v1.2.1 From 314675c75a777f18995cbac6303b3065c88f5e06 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Tue, 24 Jan 2017 01:30:09 -0500 Subject: WT-3137 Fix a hang in logging due to a race condition (#3223) --- src/include/log.h | 1 + src/log/log_slot.c | 199 +++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 156 insertions(+), 44 deletions(-) diff --git a/src/include/log.h b/src/include/log.h index d9fea892c68..82fcbf1be58 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -254,6 +254,7 @@ struct __wt_log { #define WT_SLOT_POOL 128 WT_LOGSLOT *active_slot; /* 
Active slot */ WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ + int32_t pool_index; /* Index into slot pool */ size_t slot_buf_size; /* Buffer size for slots */ #ifdef HAVE_DIAGNOSTIC uint64_t write_calls; /* Calls to log_write */ diff --git a/src/log/log_slot.c b/src/log/log_slot.c index a29a34e5652..cb44cadcb70 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -8,6 +8,49 @@ #include "wt_internal.h" +#ifdef HAVE_DIAGNOSTIC +/* + * __log_slot_dump -- + * Dump the entire slot state. + */ +static void +__log_slot_dump(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LOGSLOT *slot; + int32_t earliest, i; + + conn = S2C(session); + log = conn->log; + earliest = 0; + for (i = 0; i < WT_SLOT_POOL; i++) { + slot = &log->slot_pool[i]; + if (__wt_log_cmp(&slot->slot_release_lsn, + &log->slot_pool[earliest].slot_release_lsn) < 0) + earliest = i; + __wt_errx(session, "Slot %d:", i); + __wt_errx(session, " State: %" PRIx64 " Flags: %" PRIx32, + slot->slot_state, slot->flags); + __wt_errx(session, " Start LSN: %" PRIu32 "/%" PRIu32, + slot->slot_start_lsn.l.file, slot->slot_start_lsn.l.offset); + __wt_errx(session, " End LSN: %" PRIu32 "/%" PRIu32, + slot->slot_end_lsn.l.file, slot->slot_end_lsn.l.offset); + __wt_errx(session, " Release LSN: %" PRIu32 "/%" PRIu32, + slot->slot_release_lsn.l.file, + slot->slot_release_lsn.l.offset); + __wt_errx(session, " Offset: start: %" PRIu32 + " last:%" PRIu32, (uint32_t)slot->slot_start_offset, + (uint32_t)slot->slot_last_offset); + __wt_errx(session, " Unbuffered: %" PRId64 + " error: %" PRId32, slot->slot_unbuffered, + slot->slot_error); + } + __wt_errx(session, "Earliest slot: %d", earliest); + +} +#endif + /* * __wt_log_slot_activate -- * Initialize a slot to become active. @@ -21,7 +64,6 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) conn = S2C(session); log = conn->log; - slot->slot_state = 0; /* * !!! 
slot_release_lsn must be set outside this function because * this function may be called after a log file switch and the @@ -30,12 +72,19 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) * set for closing the file handle on a log file switch. The flags * are reset when the slot is freed. See log_slot_free. */ + slot->slot_unbuffered = 0; slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn; slot->slot_start_offset = log->alloc_lsn.l.offset; slot->slot_last_offset = log->alloc_lsn.l.offset; slot->slot_fh = log->log_fh; slot->slot_error = 0; - slot->slot_unbuffered = 0; + WT_DIAGNOSTIC_YIELD; + /* + * Set the slot state last. Other threads may have a stale pointer + * to this slot and could try to alter the state and other fields once + * they see the state cleared. + */ + WT_PUBLISH(slot->slot_state, 0); } /* @@ -50,6 +99,10 @@ __log_slot_close( WT_CONNECTION_IMPL *conn; WT_LOG *log; int64_t end_offset, new_state, old_state; +#ifdef HAVE_DIAGNOSTIC + struct timespec begin, now; + int count; +#endif WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); WT_ASSERT(session, releasep != NULL); @@ -101,9 +154,32 @@ retry: * that value. If the state is unbuffered, wait for the unbuffered * size to be set. 
*/ - while (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state) && - slot->slot_unbuffered == 0) - __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + count = 0; + __wt_epoch(session, &begin); +#endif + if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) { + while (slot->slot_unbuffered == 0) { + __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + ++count; + if (count > WT_MILLION) { + __wt_epoch(session, &now); + if (WT_TIMEDIFF_SEC(now, begin) > 10) { + __wt_errx(session, "SLOT_CLOSE: Slot %" + PRIu32 " Timeout unbuffered, state 0x%" + PRIx64 " unbuffered %" PRIu64, + (uint32_t)(slot - &log->slot_pool[0]), + slot->slot_state, + slot->slot_unbuffered); + __log_slot_dump(session); + __wt_abort(session); + } + count = 0; + } +#endif + } + } end_offset = WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered; @@ -218,7 +294,11 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LOGSLOT *slot; - int32_t i; + int32_t i, pool_i; +#ifdef HAVE_DIAGNOSTIC + struct timespec begin, now; + int count; +#endif WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); conn = S2C(session); @@ -232,16 +312,22 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) WT_LOG_SLOT_OPEN(slot->slot_state)) return (0); +#ifdef HAVE_DIAGNOSTIC + count = 0; + __wt_epoch(session, &begin); +#endif /* * Keep trying until we can find a free slot. */ for (;;) { /* - * For now just restart at 0. We could use log->pool_index - * if that is inefficient. + * Rotate among the slots to lessen collisions. 
*/ - for (i = 0; i < WT_SLOT_POOL; i++) { - slot = &log->slot_pool[i]; + for (i = 0, pool_i = log->pool_index; i < WT_SLOT_POOL; + i++, pool_i++) { + if (pool_i >= WT_SLOT_POOL) + pool_i = 0; + slot = &log->slot_pool[pool_i]; if (slot->slot_state == WT_LOG_SLOT_FREE) { /* * Acquire our starting position in the @@ -256,6 +342,7 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) WT_STAT_CONN_INCR(session, log_slot_transitions); log->active_slot = slot; + log->pool_index = pool_i; return (0); } } @@ -264,6 +351,19 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) */ __wt_cond_auto_signal(session, conn->log_wrlsn_cond); __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + ++count; + if (count > WT_MILLION) { + __wt_epoch(session, &now); + if (WT_TIMEDIFF_SEC(now, begin) > 10) { + __wt_errx(session, + "SLOT_NEW: Timeout free slot"); + __log_slot_dump(session); + __wt_abort(session); + } + count = 0; + } +#endif } /* NOTREACHED */ } @@ -311,10 +411,13 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) /* * We cannot initialize the release LSN in the activate function * because that function can be called after a log file switch. + * The release LSN is usually the same as the slot_start_lsn except + * around a log file switch. */ slot->slot_release_lsn = log->alloc_lsn; __wt_log_slot_activate(session, slot); log->active_slot = slot; + log->pool_index = 0; if (0) { err: while (--i >= 0) @@ -370,53 +473,62 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, WT_LOGSLOT *slot; int64_t flag_state, new_state, old_state, released; int32_t join_offset, new_join; -#ifdef HAVE_DIAGNOSTIC - bool unbuf_force; -#endif + bool unbuffered, yld; conn = S2C(session); log = conn->log; WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + WT_ASSERT(session, mysize != 0); /* * There should almost always be a slot open. 
*/ + unbuffered = false; #ifdef HAVE_DIAGNOSTIC - unbuf_force = (++log->write_calls % WT_THOUSAND) == 0; + yld = (++log->write_calls % 7) == 0; + if ((log->write_calls % WT_THOUSAND) == 0 || + mysize > WT_LOG_SLOT_BUF_MAX) { +#else + yld = false; + if (mysize > WT_LOG_SLOT_BUF_MAX) { #endif + unbuffered = true; + F_SET(myslot, WT_MYSLOT_UNBUFFERED); + } for (;;) { WT_BARRIER(); slot = log->active_slot; old_state = slot->slot_state; - /* - * Try to join our size into the existing size and - * atomically write it back into the state. - */ - flag_state = WT_LOG_SLOT_FLAGS(old_state); - released = WT_LOG_SLOT_RELEASED(old_state); - join_offset = WT_LOG_SLOT_JOINED(old_state); -#ifdef HAVE_DIAGNOSTIC - if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) { -#else - if (mysize > WT_LOG_SLOT_BUF_MAX) { -#endif - new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; - F_SET(myslot, WT_MYSLOT_UNBUFFERED); - myslot->slot = slot; - } else - new_join = join_offset + (int32_t)mysize; - new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( - (int64_t)new_join, (int64_t)released, (int64_t)flag_state); - - /* - * Check if the slot is open for joining and we are able to - * swap in our size into the state. - */ - if (WT_LOG_SLOT_OPEN(old_state) && - __wt_atomic_casiv64( - &slot->slot_state, old_state, new_state)) - break; + if (WT_LOG_SLOT_OPEN(old_state)) { + /* + * Try to join our size into the existing size and + * atomically write it back into the state. + */ + flag_state = WT_LOG_SLOT_FLAGS(old_state); + released = WT_LOG_SLOT_RELEASED(old_state); + join_offset = WT_LOG_SLOT_JOINED(old_state); + if (unbuffered) + new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; + else + new_join = join_offset + (int32_t)mysize; + new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( + (int64_t)new_join, (int64_t)released, + (int64_t)flag_state); + + /* + * Braces used due to potential empty body warning. + */ + if (yld) { + WT_DIAGNOSTIC_YIELD; + } + /* + * Attempt to swap our size into the state. 
+ */ + if (__wt_atomic_casiv64( + &slot->slot_state, old_state, new_state)) + break; + } /* * The slot is no longer open or we lost the race to * update it. Yield and try again. @@ -428,8 +540,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, * We joined this slot. Fill in our information to return to * the caller. */ - if (mysize != 0) - WT_STAT_CONN_INCR(session, log_slot_joins); + WT_STAT_CONN_INCR(session, log_slot_joins); if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) F_SET(slot, WT_SLOT_SYNC_DIR); if (LF_ISSET(WT_LOG_FLUSH)) -- cgit v1.2.1 From 3695a0dd4dbb1612518ed3f68a2e3c6e7550e0ed Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 24 Jan 2017 09:09:24 -0500 Subject: WT-3137 Fix a hang in logging due to a race condition (#3266) Lint: Don't print int32_t's with %d. WT_LOGSLOT.slot_error is an int, not an int32_t. Don't print off_t's as 32-bits, use the maximum size unsigned object. --- src/include/log.h | 2 +- src/log/log_slot.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/include/log.h b/src/include/log.h index 82fcbf1be58..a6be3582b4d 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -163,7 +163,7 @@ struct __wt_logslot { WT_CACHE_LINE_PAD_BEGIN volatile int64_t slot_state; /* Slot state */ int64_t slot_unbuffered; /* Unbuffered data in this slot */ - int32_t slot_error; /* Error value */ + int slot_error; /* Error value */ wt_off_t slot_start_offset; /* Starting file offset */ wt_off_t slot_last_offset; /* Last record offset */ WT_LSN slot_release_lsn; /* Slot release LSN */ diff --git a/src/log/log_slot.c b/src/log/log_slot.c index cb44cadcb70..d70c0d689be 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -19,7 +19,7 @@ __log_slot_dump(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LOGSLOT *slot; - int32_t earliest, i; + int earliest, i; conn = S2C(session); log = conn->log; @@ -39,9 +39,9 @@ __log_slot_dump(WT_SESSION_IMPL *session) __wt_errx(session, " Release 
LSN: %" PRIu32 "/%" PRIu32, slot->slot_release_lsn.l.file, slot->slot_release_lsn.l.offset); - __wt_errx(session, " Offset: start: %" PRIu32 - " last:%" PRIu32, (uint32_t)slot->slot_start_offset, - (uint32_t)slot->slot_last_offset); + __wt_errx(session, " Offset: start: %" PRIuMAX + " last:%" PRIuMAX, (uintmax_t)slot->slot_start_offset, + (uintmax_t)slot->slot_last_offset); __wt_errx(session, " Unbuffered: %" PRId64 " error: %" PRId32, slot->slot_unbuffered, slot->slot_error); -- cgit v1.2.1 From d5ae763f990af5ba5522b07c18b9b37fdaae0e88 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 24 Jan 2017 20:28:32 -0500 Subject: WT-3113 Add a verbose mode to dump the cache when eviction is stuck. (#3234) --- dist/api_data.py | 1 + dist/flags.py | 1 + src/config/config_def.c | 60 +++---- src/conn/conn_api.c | 1 + src/evict/evict_lru.c | 370 ++++++++++++++++-------------------------- src/include/cache.h | 2 +- src/include/extern.h | 3 +- src/include/flags.h | 45 ++--- src/include/wiredtiger.in | 26 +-- src/txn/txn.c | 95 +++++++++++ test/suite/test_reconfig04.py | 2 - 11 files changed, 305 insertions(+), 301 deletions(-) diff --git a/dist/api_data.py b/dist/api_data.py index 324d1e4f281..b1332320a7c 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -524,6 +524,7 @@ connection_runtime_config = [ 'checkpoint', 'compact', 'evict', + 'evict_stuck', 'evictserver', 'fileops', 'handleops', diff --git a/dist/flags.py b/dist/flags.py index 70e18712839..55ce233e60d 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -64,6 +64,7 @@ flags = { 'VERB_COMPACT', 'VERB_EVICT', 'VERB_EVICTSERVER', + 'VERB_EVICT_STUCK', 'VERB_FILEOPS', 'VERB_HANDLEOPS', 'VERB_LOG', diff --git a/src/config/config_def.c b/src/config/config_def.c index 6a93c1d05e2..b11a8d63fdb 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -147,12 +147,12 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { confchk_WT_CONNECTION_reconfigure_statistics_log_subconfigs, 5 }, { 
"verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," - "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," - "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," - "\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -750,12 +750,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "use_environment_priv", "boolean", NULL, NULL, NULL, 0 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," - "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," - "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," - "\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -837,12 +837,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "use_environment_priv", "boolean", NULL, NULL, NULL, 0 }, { "verbose", "list", 
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," - "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," - "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," - "\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -919,12 +919,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," - "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," - "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," - "\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -1001,12 +1001,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { 
confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\"," - "\"recovery_progress\",\"salvage\",\"shared_cache\",\"split\"," - "\"temporary\",\"thread_group\",\"transaction\",\"verify\"," - "\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index d76e08067b5..124250a7a7d 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -1798,6 +1798,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "checkpoint", WT_VERB_CHECKPOINT }, { "compact", WT_VERB_COMPACT }, { "evict", WT_VERB_EVICT }, + { "evict_stuck", WT_VERB_EVICT_STUCK }, { "evictserver", WT_VERB_EVICTSERVER }, { "fileops", WT_VERB_FILEOPS }, { "handleops", WT_VERB_HANDLEOPS }, diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 9b969de9a9e..0cf746f84eb 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -285,7 +285,7 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) conn = S2C(session); cache = conn->cache; -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) /* * Ensure the cache stuck timer is initialized when starting eviction. 
*/ @@ -353,12 +353,12 @@ err: WT_PANIC_MSG(session, ret, "cache eviction thread error"); static int __evict_server(WT_SESSION_IMPL *session, bool *did_work) { +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) + struct timespec now; +#endif WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; -#ifdef HAVE_DIAGNOSTIC - struct timespec now; -#endif uint64_t orig_pages_evicted; conn = S2C(session); @@ -395,11 +395,15 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) cache->pages_evicted = 0; } else if (cache->pages_evicted != cache->pages_evict) { cache->pages_evicted = cache->pages_evict; -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) __wt_epoch(session, &cache->stuck_ts); } else if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) { /* - * After being stuck for 5 minutes, give up. + * If we're stuck for 5 minutes in diagnostic mode, or the + * verbose evict_stuck flag is configured, log the cache + * and transaction state. + * + * If we're stuck for 5 minutes in diagnostic mode, give up. * * We don't do this check for in-memory workloads because * application threads are not blocked by the cache being full. @@ -408,11 +412,22 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) */ __wt_epoch(session, &now); if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) { - ret = ETIMEDOUT; - __wt_err(session, ret, +#if defined(HAVE_DIAGNOSTIC) + __wt_err(session, ETIMEDOUT, "Cache stuck for too long, giving up"); - WT_TRET(__wt_dump_stuck_info(session, NULL)); + ret = ETIMEDOUT; + WT_TRET(__wt_verbose_dump_txn(session)); + WT_TRET(__wt_verbose_dump_cache(session)); return (ret); +#elif defined(HAVE_VERBOSE) + if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) { + WT_RET(__wt_verbose_dump_txn(session)); + WT_RET(__wt_verbose_dump_cache(session)); + + /* Reset the timer. 
*/ + __wt_epoch(session, &cache->stuck_ts); + } +#endif } #endif } @@ -2184,226 +2199,138 @@ __wt_evict_priority_clear(WT_SESSION_IMPL *session) S2BT(session)->evict_priority = 0; } -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) /* - * __dump_txn_state -- - * Output debugging information about the global transaction state. + * __verbose_dump_cache_single -- + * Output diagnostic information about a single file in the cache. */ static int -__dump_txn_state(WT_SESSION_IMPL *session, FILE *fp) +__verbose_dump_cache_single(WT_SESSION_IMPL *session, + uint64_t *total_bytesp, uint64_t *total_dirty_bytesp) { - WT_CONNECTION_IMPL *conn; - WT_TXN_GLOBAL *txn_global; - WT_TXN *txn; - WT_TXN_STATE *s; - const char *iso_tag; - uint64_t id; - uint32_t i, session_cnt; - - conn = S2C(session); - txn_global = &conn->txn_global; - WT_ORDERED_READ(session_cnt, conn->session_cnt); - - /* Note: odd string concatenation avoids spelling errors. */ - if (fprintf(fp, "==========\n" "transaction state dump\n") < 0) - return (EIO); - - if (fprintf(fp, - "current ID: %" PRIu64 "\n" - "last running ID: %" PRIu64 "\n" - "oldest ID: %" PRIu64 "\n" - "oldest named snapshot ID: %" PRIu64 "\n", - txn_global->current, txn_global->last_running, - txn_global->oldest_id, txn_global->nsnap_oldest_id) < 0) - return (EIO); - - if (fprintf(fp, - "checkpoint running? %s\n" - "checkpoint generation: %" PRIu64 "\n" - "checkpoint pinned ID: %" PRIu64 "\n" - "checkpoint txn ID: %" PRIu64 "\n" - "session count: %" PRIu32 "\n", - txn_global->checkpoint_running ? "yes" : "no", - txn_global->checkpoint_gen, - txn_global->checkpoint_pinned, - txn_global->checkpoint_txnid, - session_cnt) < 0) - return (EIO); - - if (fprintf(fp, "Dumping transaction state of active sessions\n") < 0) - return (EIO); - - /* - * Walk each session transaction state and dump information. 
Accessing - * the content of session handles is not thread safe, so some - * information may change while traversing if other threads are active - * at the same time, which is OK since this is diagnostic code. - */ - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* Skip sessions with no active transaction */ - if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE) - continue; + WT_DATA_HANDLE *dhandle; + WT_PAGE *page; + WT_REF *next_walk; + size_t size; + uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes; + uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages; + uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes; + uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages; - txn = &conn->sessions[i].txn; - iso_tag = "INVALID"; - switch (txn->isolation) { - case WT_ISO_READ_COMMITTED: - iso_tag = "WT_ISO_READ_COMMITTED"; - break; - case WT_ISO_READ_UNCOMMITTED: - iso_tag = "WT_ISO_READ_UNCOMMITTED"; - break; - case WT_ISO_SNAPSHOT: - iso_tag = "WT_ISO_SNAPSHOT"; - break; + intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; + intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; + leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; + leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; + + next_walk = NULL; + while (__wt_tree_walk(session, &next_walk, + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && + next_walk != NULL) { + page = next_walk->page; + size = page->memory_footprint; + + if (WT_PAGE_IS_INTERNAL(page)) { + ++intl_pages; + intl_bytes += size; + intl_bytes_max = WT_MAX(intl_bytes_max, size); + if (__wt_page_is_modified(page)) { + ++intl_dirty_pages; + intl_dirty_bytes += size; + intl_dirty_bytes_max = + WT_MAX(intl_dirty_bytes_max, size); + } + } else { + ++leaf_pages; + leaf_bytes += size; + leaf_bytes_max = WT_MAX(leaf_bytes_max, size); + if (__wt_page_is_modified(page)) { + ++leaf_dirty_pages; + leaf_dirty_bytes += size; + leaf_dirty_bytes_max = + WT_MAX(leaf_dirty_bytes_max, size); + 
} } - - if (fprintf(fp, - "ID: %6" PRIu64 - ", mod count: %u" - ", pinned ID: %" PRIu64 - ", snap min: %" PRIu64 - ", snap max: %" PRIu64 - ", metadata pinned ID: %" PRIu64 - ", flags: 0x%08" PRIx32 - ", name: %s" - ", isolation: %s" "\n", - id, - txn->mod_count, - s->pinned_id, - txn->snap_min, - txn->snap_max, - s->metadata_pinned, - txn->flags, - conn->sessions[i].name == NULL ? - "EMPTY" : conn->sessions[i].name, - iso_tag) < 0) - return (EIO); } + dhandle = session->dhandle; + if (dhandle->checkpoint == NULL) + WT_RET(__wt_msg(session, "%s():", dhandle->name)); + else + WT_RET(__wt_msg(session, "%s(checkpoint=%s):", + dhandle->name, dhandle->checkpoint)); + if (intl_pages != 0) + WT_RET(__wt_msg(session, + "internal: " + "%" PRIu64 " pages, " + "%" PRIu64 "MB, " + "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " + "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " + "%" PRIu64 "MB max page, " + "%" PRIu64 "MB max dirty page", + intl_pages, + intl_bytes / WT_MEGABYTE, + intl_pages - intl_dirty_pages, + intl_dirty_pages, + (intl_bytes - intl_dirty_bytes) / WT_MEGABYTE, + intl_dirty_bytes / WT_MEGABYTE, + intl_bytes_max / WT_MEGABYTE, + intl_dirty_bytes_max / WT_MEGABYTE)); + if (leaf_pages != 0) + WT_RET(__wt_msg(session, + "leaf: " + "%" PRIu64 " pages, " + "%" PRIu64 "MB, " + "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " + "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " + "%" PRIu64 "MB max page, " + "%" PRIu64 "MB max dirty page", + leaf_pages, + leaf_bytes / WT_MEGABYTE, + leaf_pages - leaf_dirty_pages, + leaf_dirty_pages, + (leaf_bytes - leaf_dirty_bytes) / WT_MEGABYTE, + leaf_dirty_bytes / WT_MEGABYTE, + leaf_bytes_max / WT_MEGABYTE, + leaf_dirty_bytes_max / WT_MEGABYTE)); + + *total_bytesp += intl_bytes + leaf_bytes; + *total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes; + return (0); } /* - * __dump_cache -- - * Output debugging information about the size of the files in cache. + * __wt_verbose_dump_cache -- + * Output diagnostic information about the cache. 
*/ -static int -__dump_cache(WT_SESSION_IMPL *session, FILE *fp) +int +__wt_verbose_dump_cache(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle, *saved_dhandle; - WT_PAGE *page; - WT_REF *next_walk; - uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes; - uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages; - uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes; - uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; uint64_t total_bytes, total_dirty_bytes; - size_t size; conn = S2C(session); total_bytes = total_dirty_bytes = 0; - /* Note: odd string concatenation avoids spelling errors. */ - if (fprintf(fp, "==========\n" "cache dump\n") < 0) - return (EIO); + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + WT_RET(__wt_msg(session, "cache dump")); - saved_dhandle = session->dhandle; + __wt_spin_lock(session, &conn->dhandle_lock); TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; - intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; - intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; - leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; - leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; - - next_walk = NULL; - session->dhandle = dhandle; - while (__wt_tree_walk(session, &next_walk, - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && - next_walk != NULL) { - page = next_walk->page; - size = page->memory_footprint; - - if (WT_PAGE_IS_INTERNAL(page)) { - ++intl_pages; - intl_bytes += size; - intl_bytes_max = WT_MAX(intl_bytes_max, size); - if (__wt_page_is_modified(page)) { - ++intl_dirty_pages; - intl_dirty_bytes += size; - intl_dirty_bytes_max = - WT_MAX(intl_dirty_bytes_max, size); - } - } else { - ++leaf_pages; - leaf_bytes += size; - leaf_bytes_max = WT_MAX(leaf_bytes_max, size); - if (__wt_page_is_modified(page)) { - ++leaf_dirty_pages; - leaf_dirty_bytes += size; 
- leaf_dirty_bytes_max = - WT_MAX(leaf_dirty_bytes_max, size); - } - } - } - session->dhandle = NULL; - - if (dhandle->checkpoint == NULL) { - if (fprintf(fp, - "%s(): \n", dhandle->name) < 0) - return (EIO); - } else { - if (fprintf(fp, "%s(checkpoint=%s): \n", - dhandle->name, dhandle->checkpoint) < 0) - return (EIO); - } - if (intl_pages != 0) { - if (fprintf(fp, - "\t" "internal: " - "%" PRIu64 " pages, " - "%" PRIu64 "MB, " - "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " - "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " - "%" PRIu64 "MB max page, " - "%" PRIu64 "MB max dirty page\n", - intl_pages, - intl_bytes >> 20, - intl_pages - intl_dirty_pages, - intl_dirty_pages, - (intl_bytes - intl_dirty_bytes) >> 20, - intl_dirty_bytes >> 20, - intl_bytes_max >> 20, - intl_dirty_bytes_max >> 20) < 0) - return (EIO); - } - if (leaf_pages != 0) { - if (fprintf(fp, - "\t" "leaf: " - "%" PRIu64 " pages, " - "%" PRIu64 "MB, " - "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " - "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " - "%" PRIu64 "MB max page, " - "%" PRIu64 "MB max dirty page\n", - leaf_pages, - leaf_bytes >> 20, - leaf_pages - leaf_dirty_pages, - leaf_dirty_pages, - (leaf_bytes - leaf_dirty_bytes) >> 20, - leaf_dirty_bytes >> 20, - leaf_bytes_max >> 20, - leaf_dirty_bytes_max >> 20) < 0) - return (EIO); - } - - total_bytes += intl_bytes + leaf_bytes; - total_dirty_bytes += intl_dirty_bytes + leaf_dirty_bytes; + WT_WITH_DHANDLE(session, dhandle, + ret = __verbose_dump_cache_single( + session, &total_bytes, &total_dirty_bytes)); + if (ret != 0) + break; } - session->dhandle = saved_dhandle; + __wt_spin_unlock(session, &conn->dhandle_lock); + WT_RET(ret); /* * Apply the overhead percentage so our total bytes are comparable with @@ -2411,39 +2338,16 @@ __dump_cache(WT_SESSION_IMPL *session, FILE *fp) */ total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes); - if (fprintf(fp, + WT_RET(__wt_msg(session, "cache dump: " - "total found: %" PRIu64 "MB vs tracked inuse 
%" PRIu64 "MB\n" - "total dirty bytes: %" PRIu64 "MB\n", - total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20, - total_dirty_bytes >> 20) < 0) - return (EIO); - if (fprintf(fp, "==========\n") < 0) - return (EIO); + "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB", + total_bytes / WT_MEGABYTE, + __wt_cache_bytes_inuse(conn->cache) / WT_MEGABYTE)); + WT_RET(__wt_msg(session, + "total dirty bytes: %" PRIu64 "MB", + total_dirty_bytes / WT_MEGABYTE)); + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); return (0); } - -/* - * __wt_dump_stuck_info -- - * Dump debugging information to a file (default stderr) about the state - * of WiredTiger when we have determined that the cache is stuck full. - */ -int -__wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile) -{ - FILE *fp; - WT_DECL_RET; - - if (ofile == NULL) - fp = stderr; - else if ((fp = fopen(ofile, "w")) == NULL) - return (EIO); - - WT_ERR(__dump_txn_state(session, fp)); - WT_ERR(__dump_cache(session, fp)); -err: if (ofile != NULL && fclose(fp) != 0) - return (EIO); - return (ret); -} #endif diff --git a/src/include/cache.h b/src/include/cache.h index 70f6169200d..abd5a1901f7 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -83,7 +83,7 @@ struct __wt_cache { uint64_t worker_evicts; /* Pages evicted by worker threads */ uint64_t evict_max_page_size; /* Largest page seen at eviction */ -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) struct timespec stuck_ts; /* Stuck timestamp */ #endif diff --git a/src/include/extern.h b/src/include/extern.h index 16b3c916b24..88fb8823930 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -352,7 +352,7 @@ extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_dump_stuck_info(WT_SESSION_IMPL *session, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -741,6 +741,7 @@ extern void __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATT extern void __wt_txn_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/flags.h b/src/include/flags.h index 2f0c207078a..0b92a12c686 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -90,28 +90,29 @@ #define WT_VERB_COMPACT 0x00000008 #define WT_VERB_EVICT 0x00000010 #define WT_VERB_EVICTSERVER 0x00000020 -#define WT_VERB_FILEOPS 0x00000040 -#define WT_VERB_HANDLEOPS 0x00000080 -#define WT_VERB_LOG 0x00000100 -#define WT_VERB_LSM 0x00000200 -#define WT_VERB_LSM_MANAGER 0x00000400 -#define WT_VERB_METADATA 0x00000800 -#define WT_VERB_MUTEX 0x00001000 -#define WT_VERB_OVERFLOW 0x00002000 -#define WT_VERB_READ 0x00004000 -#define WT_VERB_REBALANCE 0x00008000 -#define WT_VERB_RECONCILE 0x00010000 -#define WT_VERB_RECOVERY 0x00020000 -#define WT_VERB_RECOVERY_PROGRESS 0x00040000 -#define WT_VERB_SALVAGE 0x00080000 -#define WT_VERB_SHARED_CACHE 0x00100000 -#define WT_VERB_SPLIT 0x00200000 -#define WT_VERB_TEMPORARY 0x00400000 -#define WT_VERB_THREAD_GROUP 0x00800000 -#define WT_VERB_TRANSACTION 0x01000000 -#define WT_VERB_VERIFY 0x02000000 -#define WT_VERB_VERSION 0x04000000 -#define WT_VERB_WRITE 0x08000000 +#define WT_VERB_EVICT_STUCK 0x00000040 +#define WT_VERB_FILEOPS 0x00000080 +#define WT_VERB_HANDLEOPS 0x00000100 +#define WT_VERB_LOG 0x00000200 +#define WT_VERB_LSM 0x00000400 +#define WT_VERB_LSM_MANAGER 0x00000800 +#define WT_VERB_METADATA 0x00001000 +#define WT_VERB_MUTEX 0x00002000 +#define WT_VERB_OVERFLOW 0x00004000 +#define WT_VERB_READ 0x00008000 +#define WT_VERB_REBALANCE 0x00010000 +#define WT_VERB_RECONCILE 0x00020000 +#define WT_VERB_RECOVERY 0x00040000 +#define WT_VERB_RECOVERY_PROGRESS 0x00080000 +#define WT_VERB_SALVAGE 0x00100000 +#define WT_VERB_SHARED_CACHE 0x00200000 +#define WT_VERB_SPLIT 0x00400000 +#define WT_VERB_TEMPORARY 0x00800000 +#define 
WT_VERB_THREAD_GROUP 0x01000000 +#define WT_VERB_TRANSACTION 0x02000000 +#define WT_VERB_VERIFY 0x04000000 +#define WT_VERB_VERSION 0x08000000 +#define WT_VERB_WRITE 0x10000000 #define WT_VISIBILITY_ERR 0x00000080 /* * flags section: END diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 90989cc679d..03bff7cd04f 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1982,12 +1982,13 @@ struct __wt_connection { * as a list\, such as "verbose=[evictserver\,read]"., a * list\, with values chosen from the following options: \c "api"\, \c * "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c - * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, - * \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c - * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c - * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, - * \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c "verify"\, - * \c "version"\, \c "write"; default empty.} + * "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c "handleops"\, \c + * "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c + * "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c + * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c + * "shared_cache"\, \c "split"\, \c "temporary"\, \c "thread_group"\, \c + * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default + * empty.} * @configend * @errors */ @@ -2513,12 +2514,13 @@ struct __wt_connection { * WiredTiger is configured with --enable-verbose. 
Options are given as a * list\, such as "verbose=[evictserver\,read]"., a list\, with * values chosen from the following options: \c "api"\, \c "block"\, \c - * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, - * \c "handleops"\, \c "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c - * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c - * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c - * "split"\, \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c - * "verify"\, \c "version"\, \c "write"; default empty.} + * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evict_stuck"\, \c + * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, \c + * "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c + * "rebalance"\, \c "reconcile"\, \c "recovery"\, \c "recovery_progress"\, \c + * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c + * "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c "write"; + * default empty.} * @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to * files. Ignored on non-Windows systems. Options are given as a list\, such * as "write_through=[data]". Configuring \c write_through requires diff --git a/src/txn/txn.c b/src/txn/txn.c index 660d37b17d5..e5e59c2b901 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -803,3 +803,98 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock); __wt_free(session, txn_global->states); } + +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) +/* + * __wt_verbose_dump_txn -- + * Output diagnostic information about the global transaction state. 
+ */ +int +__wt_verbose_dump_txn(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + WT_TXN *txn; + WT_TXN_STATE *s; + const char *iso_tag; + uint64_t id; + uint32_t i, session_cnt; + + conn = S2C(session); + txn_global = &conn->txn_global; + + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + WT_RET(__wt_msg(session, "transaction state dump")); + + WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current)); + WT_RET(__wt_msg(session, + "last running ID: %" PRIu64, txn_global->last_running)); + WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id)); + WT_RET(__wt_msg(session, + "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id)); + + WT_RET(__wt_msg(session, "checkpoint running? %s", + txn_global->checkpoint_running ? "yes" : "no")); + WT_RET(__wt_msg(session, + "checkpoint generation: %" PRIu64, txn_global->checkpoint_gen)); + WT_RET(__wt_msg(session, + "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_pinned)); + WT_RET(__wt_msg(session, + "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_txnid)); + + WT_ORDERED_READ(session_cnt, conn->session_cnt); + WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt)); + + WT_RET(__wt_msg(session, "Transaction state of active sessions:")); + + /* + * Walk each session transaction state and dump information. Accessing + * the content of session handles is not thread safe, so some + * information may change while traversing if other threads are active + * at the same time, which is OK since this is diagnostic code. 
+ */ + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + /* Skip sessions with no active transaction */ + if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE) + continue; + + txn = &conn->sessions[i].txn; + iso_tag = "INVALID"; + switch (txn->isolation) { + case WT_ISO_READ_COMMITTED: + iso_tag = "WT_ISO_READ_COMMITTED"; + break; + case WT_ISO_READ_UNCOMMITTED: + iso_tag = "WT_ISO_READ_UNCOMMITTED"; + break; + case WT_ISO_SNAPSHOT: + iso_tag = "WT_ISO_SNAPSHOT"; + break; + } + + WT_RET(__wt_msg(session, + "ID: %6" PRIu64 + ", mod count: %u" + ", pinned ID: %" PRIu64 + ", snap min: %" PRIu64 + ", snap max: %" PRIu64 + ", metadata pinned ID: %" PRIu64 + ", flags: 0x%08" PRIx32 + ", name: %s" + ", isolation: %s", + id, + txn->mod_count, + s->pinned_id, + txn->snap_min, + txn->snap_max, + s->metadata_pinned, + txn->flags, + conn->sessions[i].name == NULL ? + "EMPTY" : conn->sessions[i].name, + iso_tag)); + } + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + + return (0); +} +#endif diff --git a/test/suite/test_reconfig04.py b/test/suite/test_reconfig04.py index be5e6d3729e..51d9b91c1f4 100644 --- a/test/suite/test_reconfig04.py +++ b/test/suite/test_reconfig04.py @@ -26,9 +26,7 @@ # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. 
-import fnmatch, os, time import wiredtiger, wttest -from wtdataset import SimpleDataSet # test_reconfig04.py # Test WT_SESSION::reconfigure -- cgit v1.2.1 From 8aa3922883e7f3d4a9003211faf595250c3bbfdd Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 24 Jan 2017 22:07:16 -0500 Subject: WT-3097 Avoid waiting for threads to timeout during close (#3253) * Add run-time flags checking to __wt_cond_wait_signal(), and its wrappers (__wt_cond_wait(), __wt_cond_auto_wait_signal() and __wt_cond_auto_wait()) so callers of those functions can configure a check that ensures that if the waiting thread races with a waking thread that's turned off flags so the waiting thread quits, the waiting thread returns immediately. * Rework the WT_SESSION.transaction_sync code to wait for the entire time it's configured to wait, it will be awoken if the log reaches stability before that. * Assert we're not waiting longer than a second if not checking the run status. * Set/Clear WT_CONN_LOG_SERVER_RUN in __wt_logmgr_open/__wt_logmgr_destroy rather than in the connection open code. (It's the only server-run flag that gets set in the connection-open code, and I can't see any reason for that exception.) 
--- dist/api_data.py | 2 +- dist/s_string.ok | 4 +++ src/async/async_api.c | 5 ++- src/async/async_worker.c | 2 +- src/conn/conn_cache.c | 6 ++-- src/conn/conn_cache_pool.c | 8 ++--- src/conn/conn_ckpt.c | 26 +++++++++++---- src/conn/conn_handle.c | 2 +- src/conn/conn_log.c | 50 +++++++++++++---------------- src/conn/conn_open.c | 17 ++++++---- src/conn/conn_stat.c | 25 +++++++++++---- src/conn/conn_sweep.c | 24 +++++++++++--- src/evict/evict_lru.c | 16 ++++++---- src/include/extern.h | 8 ++--- src/include/extern_posix.h | 4 +-- src/include/extern_win.h | 4 +-- src/include/misc.i | 5 +-- src/include/mutex.h | 4 +-- src/include/wiredtiger.in | 2 +- src/log/log.c | 21 ++++++------ src/log/log_slot.c | 2 +- src/lsm/lsm_worker.c | 2 +- src/os_posix/os_mtx_cond.c | 28 ++++++++++++---- src/os_win/os_mtx_cond.c | 43 +++++++++++++++++-------- src/session/session_api.c | 38 ++++++++++++++-------- src/support/cond_auto.c | 80 ++++++++++------------------------------------ src/support/thread_group.c | 2 +- 27 files changed, 237 insertions(+), 193 deletions(-) diff --git a/dist/api_data.py b/dist/api_data.py index b1332320a7c..1d669fa7fe0 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -718,7 +718,7 @@ wiredtiger_open_common =\ ]), Config('extensions', '', r''' list of shared library extensions to load (using dlopen). 
- Any values specified to an library extension are passed to + Any values specified to a library extension are passed to WT_CONNECTION::load_extension as the \c config parameter (for example, extensions=(/path/ext.so={entry=my_entry}))''', diff --git a/dist/s_string.ok b/dist/s_string.ok index 2b998c27813..bb0cacd9d5d 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -1217,6 +1217,7 @@ upg uri uri's uris +usec usecs usedp userbad @@ -1247,6 +1248,9 @@ vunpack vw vxr waitpid +waker +wakeup +wakeups walk's warmup wb diff --git a/src/async/async_api.c b/src/async/async_api.c index 54bcb7cd26c..026a008188c 100644 --- a/src/async/async_api.c +++ b/src/async/async_api.c @@ -240,8 +240,7 @@ __async_start(WT_SESSION_IMPL *session) async = conn->async; TAILQ_INIT(&async->formatqh); WT_RET(__wt_spin_init(session, &async->ops_lock, "ops")); - WT_RET(__wt_cond_alloc( - session, "async flush", false, &async->flush_cond)); + WT_RET(__wt_cond_alloc(session, "async flush", &async->flush_cond)); WT_RET(__wt_async_op_init(session)); /* @@ -541,7 +540,7 @@ retry: async->flush_op.state = WT_ASYNCOP_READY; WT_RET(__wt_async_op_enqueue(session, &async->flush_op)); while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE) - __wt_cond_wait(session, async->flush_cond, 100000); + __wt_cond_wait(session, async->flush_cond, 100000, NULL); /* * Flush is done. Clear the flags. 
*/ diff --git a/src/async/async_worker.c b/src/async/async_worker.c index b1bc3902f7c..11f59ed14f1 100644 --- a/src/async/async_worker.c +++ b/src/async/async_worker.c @@ -107,7 +107,7 @@ __async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen) { while (async->flush_state == WT_ASYNC_FLUSHING && async->flush_gen == my_gen) - __wt_cond_wait(session, async->flush_cond, 10000); + __wt_cond_wait(session, async->flush_cond, 10000, NULL); } /* diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 2b0e5081f04..28dd06332e0 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -187,8 +187,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET_MSG(session, EINVAL, "eviction target must be lower than the eviction trigger"); - WT_RET(__wt_cond_auto_alloc(session, "cache eviction server", - false, 10000, WT_MILLION, &cache->evict_cond)); + WT_RET(__wt_cond_auto_alloc(session, + "cache eviction server", 10000, WT_MILLION, &cache->evict_cond)); WT_RET(__wt_spin_init(session, &cache->evict_pass_lock, "evict pass")); WT_RET(__wt_spin_init(session, &cache->evict_queue_lock, "cache eviction queue")); @@ -312,7 +312,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) cache->bytes_dirty_intl + cache->bytes_dirty_leaf, cache->pages_dirty_intl + cache->pages_dirty_leaf); - WT_TRET(__wt_cond_auto_destroy(session, &cache->evict_cond)); + WT_TRET(__wt_cond_destroy(session, &cache->evict_cond)); __wt_spin_destroy(session, &cache->evict_pass_lock); __wt_spin_destroy(session, &cache->evict_queue_lock); __wt_spin_destroy(session, &cache->evict_walk_lock); diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index 79c2fc23da5..49b766f4602 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -32,7 +32,7 @@ */ #define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3 #define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6 -#define WT_CACHE_POOL_READ_MULTIPLIER 1 +#define WT_CACHE_POOL_READ_MULTIPLIER 1 static void 
__cache_pool_adjust( WT_SESSION_IMPL *, uint64_t, uint64_t, bool, bool *); @@ -104,8 +104,8 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) TAILQ_INIT(&cp->cache_pool_qh); WT_ERR(__wt_spin_init( session, &cp->cache_pool_lock, "cache shared pool")); - WT_ERR(__wt_cond_alloc(session, - "cache pool server", false, &cp->cache_pool_cond)); + WT_ERR(__wt_cond_alloc( + session, "cache pool server", &cp->cache_pool_cond)); __wt_process.cache_pool = cp; __wt_verbose(session, @@ -733,7 +733,7 @@ __wt_cache_pool_server(void *arg) F_ISSET(cache, WT_CACHE_POOL_RUN)) { if (cp->currently_used <= cp->size) __wt_cond_wait( - session, cp->cache_pool_cond, WT_MILLION); + session, cp->cache_pool_cond, WT_MILLION, NULL); /* * Re-check pool run flag - since we want to avoid getting the diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index faeef4e71a2..7797ed4421c 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -62,6 +62,16 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp) return (0); } +/* + * __ckpt_server_run_chk -- + * Check to decide if the checkpoint server should continue running. + */ +static bool +__ckpt_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_CHECKPOINT)); +} + /* * __ckpt_server -- * The checkpoint server thread. @@ -78,14 +88,18 @@ __ckpt_server(void *arg) conn = S2C(session); wt_session = (WT_SESSION *)session; - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) { + for (;;) { /* * Wait... * NOTE: If the user only configured logsize, then usecs * will be 0 and this wait won't return until signalled. */ - __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs); + __wt_cond_wait(session, + conn->ckpt_cond, conn->ckpt_usecs, __ckpt_server_run_chk); + + /* Check if we're quitting or being reconfigured. 
*/ + if (!__ckpt_server_run_chk(session)) + break; /* * Checkpoint the database if the connection is marked dirty. @@ -113,7 +127,8 @@ __ckpt_server(void *arg) * it so we don't do another checkpoint * immediately. */ - __wt_cond_wait(session, conn->ckpt_cond, 1); + __wt_cond_wait( + session, conn->ckpt_cond, 1, NULL); } } else WT_STAT_CONN_INCR(session, txn_checkpoint_skipped); @@ -152,8 +167,7 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn) "checkpoint-server", true, session_flags, &conn->ckpt_session)); session = conn->ckpt_session; - WT_RET(__wt_cond_alloc( - session, "checkpoint server", false, &conn->ckpt_cond)); + WT_RET(__wt_cond_alloc(session, "checkpoint server", &conn->ckpt_cond)); /* * Start the thread. diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 7203b75e4ae..54bcfd98aba 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -79,7 +79,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init( session, &conn->lsm_manager.switch_lock, "LSM switch queue lock")); WT_RET(__wt_cond_alloc( - session, "LSM worker cond", false, &conn->lsm_manager.work_cond)); + session, "LSM worker cond", &conn->lsm_manager.work_cond)); /* * Generation numbers. 
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 8f8f8614ba8..c6dd795389d 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -174,7 +174,7 @@ __logmgr_config( WT_RET(__logmgr_sync_cfg(session, cfg)); if (conn->log_cond != NULL) - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); return (0); } @@ -341,7 +341,7 @@ __wt_log_truncate_files( conn = S2C(session); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); - if (F_ISSET(conn, WT_CONN_SERVER_RUN) && + if (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN) && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) WT_RET_MSG(session, EINVAL, "Attempt to archive manually while a server is running"); @@ -505,8 +505,7 @@ __log_file_server(void *arg) locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } else { - __wt_cond_auto_signal( - session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); /* * We do not want to wait potentially a second * to process this. Yield to give the wrlsn @@ -517,8 +516,9 @@ __log_file_server(void *arg) continue; } } + /* Wait until the next event. */ - __wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10); + __wt_cond_wait(session, conn->log_file_cond, 100000, NULL); } if (0) { @@ -730,12 +730,8 @@ __log_wrlsn_server(void *arg) if (yield++ < WT_THOUSAND) __wt_yield(); else - /* - * Send in false because if we did any work we would - * not be on this path. - */ __wt_cond_auto_wait( - session, conn->log_wrlsn_cond, did_work); + session, conn->log_wrlsn_cond, did_work, NULL); } /* * On close we need to do this one more time because there could @@ -840,10 +836,9 @@ __log_server(void *arg) } /* Wait until the next event. 
*/ - __wt_epoch(session, &start); - __wt_cond_auto_wait_signal(session, - conn->log_cond, did_work, &signalled); + __wt_cond_auto_wait_signal( + session, conn->log_cond, did_work, NULL, &signalled); __wt_epoch(session, &now); timediff = WT_TIMEDIFF_MS(now, start); } @@ -904,10 +899,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_INIT_LSN(&log->write_lsn); WT_INIT_LSN(&log->write_start_lsn); log->fileid = 0; - WT_RET(__wt_cond_alloc( - session, "log sync", false, &log->log_sync_cond)); - WT_RET(__wt_cond_alloc( - session, "log write", false, &log->log_write_cond)); + WT_RET(__wt_cond_alloc(session, "log sync", &log->log_sync_cond)); + WT_RET(__wt_cond_alloc(session, "log write", &log->log_write_cond)); WT_RET(__wt_log_open(session)); WT_RET(__wt_log_slot_init(session)); @@ -930,6 +923,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); + F_SET(conn, WT_CONN_LOG_SERVER_RUN); + /* * Start the log close thread. It is not configurable. * If logging is enabled, this thread runs. @@ -937,8 +932,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) session_flags = WT_SESSION_NO_DATA_HANDLES; WT_RET(__wt_open_internal_session(conn, "log-close-server", false, session_flags, &conn->log_file_session)); - WT_RET(__wt_cond_alloc(conn->log_file_session, - "log close server", false, &conn->log_file_cond)); + WT_RET(__wt_cond_alloc( + conn->log_file_session, "log close server", &conn->log_file_cond)); /* * Start the log file close thread. 
@@ -954,8 +949,7 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server", false, session_flags, &conn->log_wrlsn_session)); WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session, - "log write lsn server", false, 10000, WT_MILLION, - &conn->log_wrlsn_cond)); + "log write lsn server", 10000, WT_MILLION, &conn->log_wrlsn_cond)); WT_RET(__wt_thread_create(conn->log_wrlsn_session, &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session)); conn->log_wrlsn_tid_set = true; @@ -969,13 +963,13 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (conn->log_session != NULL) { WT_ASSERT(session, conn->log_cond != NULL); WT_ASSERT(session, conn->log_tid_set == true); - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); } else { /* The log server gets its own session. */ WT_RET(__wt_open_internal_session(conn, "log-server", false, session_flags, &conn->log_session)); WT_RET(__wt_cond_auto_alloc(conn->log_session, - "log server", false, 50000, WT_MILLION, &conn->log_cond)); + "log server", 50000, WT_MILLION, &conn->log_cond)); /* * Start the thread. 
@@ -1001,6 +995,8 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn = S2C(session); + F_CLR(conn, WT_CONN_LOG_SERVER_RUN); + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) { /* * We always set up the log_path so printlog can work without @@ -1011,7 +1007,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) return (0); } if (conn->log_tid_set) { - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); WT_TRET(__wt_thread_join(session, conn->log_tid)); conn->log_tid_set = false; } @@ -1026,7 +1022,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_file_session = NULL; } if (conn->log_wrlsn_tid_set) { - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); conn->log_wrlsn_tid_set = false; } @@ -1047,9 +1043,9 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) } /* Destroy the condition variables now that all threads are stopped */ - WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); - WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond)); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index f8029f2c728..5b20377d437 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -25,7 +25,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) * Tell internal server threads to run: this must be set before opening * any sessions. */ - F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN); + F_SET(conn, WT_CONN_SERVER_RUN); /* WT_SESSION_IMPL array. 
*/ WT_RET(__wt_calloc(session, @@ -100,8 +100,12 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) __wt_yield(); } - /* Clear any pending async ops. */ + /* + * Clear any pending async operations and shut down the async worker + * threads and system before closing LSM. + */ WT_TRET(__wt_async_flush(session)); + WT_TRET(__wt_async_destroy(session)); /* * Shut down server threads other than the eviction server, which is @@ -110,14 +114,14 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * exit before files are closed. */ F_CLR(conn, WT_CONN_SERVER_RUN); - WT_TRET(__wt_async_destroy(session)); WT_TRET(__wt_lsm_manager_destroy(session)); - WT_TRET(__wt_sweep_destroy(session)); F_SET(conn, WT_CONN_CLOSING); - WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, true)); + WT_TRET(__wt_sweep_destroy(session)); + + /* The eviction server is shut down last. */ WT_TRET(__wt_evict_destroy(session)); /* Shut down the lookaside table, after all eviction is complete. */ @@ -126,7 +130,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(session)); - /* Shut down metadata tracking, required before creating tables. */ + /* Shut down metadata tracking. */ WT_TRET(__wt_meta_track_destroy(session)); /* @@ -140,7 +144,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE)) WT_TRET(__wt_txn_checkpoint_log( session, true, WT_TXN_LOG_CKPT_STOP, NULL)); - F_CLR(conn, WT_CONN_LOG_SERVER_RUN); WT_TRET(__wt_logmgr_destroy(session)); /* Free memory for collators, compressors, data sources. 
*/ diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 3bcdfd7ecb1..31dc9c45992 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -485,8 +485,7 @@ __statlog_on_close(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE)) return (0); - if (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) + if (F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) WT_RET_MSG(session, EINVAL, "Attempt to log statistics while a server is running"); @@ -497,6 +496,16 @@ err: __wt_scr_free(session, &tmp); return (ret); } +/* + * __statlog_server_run_chk -- + * Check to decide if the statistics log server should continue running. + */ +static bool +__statlog_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_STATISTICS)); +} + /* * __statlog_server -- * The statistics server thread. @@ -525,10 +534,14 @@ __statlog_server(void *arg) WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128)); WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128)); - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) { + for (;;) { /* Wait until the next event. */ - __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs); + __wt_cond_wait(session, conn->stat_cond, + conn->stat_usecs, __statlog_server_run_chk); + + /* Check if we're quitting or being reconfigured. */ + if (!__statlog_server_run_chk(session)) + break; if (WT_STAT_ENABLED(session)) WT_ERR(__statlog_log_one(session, &path, &tmp)); @@ -563,7 +576,7 @@ __statlog_start(WT_CONNECTION_IMPL *conn) session = conn->stat_session; WT_RET(__wt_cond_alloc( - session, "statistics log server", false, &conn->stat_cond)); + session, "statistics log server", &conn->stat_cond)); /* * Start the thread. 
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 7d5cb7d7c72..f9b7305c7d8 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -245,6 +245,16 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) return (ret == EBUSY ? 0 : ret); } +/* + * __sweep_server_run_chk -- + * Check to decide if the sweep server should continue running. + */ +static bool +__sweep_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_SWEEP)); +} + /* * __sweep_server -- * The handle sweep server thread. @@ -266,11 +276,15 @@ __sweep_server(void *arg) /* * Sweep for dead and excess handles. */ - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_SWEEP)) { + for (;;) { /* Wait until the next event. */ - __wt_cond_wait(session, - conn->sweep_cond, conn->sweep_interval * WT_MILLION); + __wt_cond_wait(session, conn->sweep_cond, + conn->sweep_interval * WT_MILLION, __sweep_server_run_chk); + + /* Check if we're quitting or being reconfigured. 
*/ + if (!__sweep_server_run_chk(session)) + break; + __wt_seconds(session, &now); WT_STAT_CONN_INCR(session, dh_sweeps); @@ -390,7 +404,7 @@ __wt_sweep_create(WT_SESSION_IMPL *session) session = conn->sweep_session; WT_RET(__wt_cond_alloc( - session, "handle sweep server", false, &conn->sweep_cond)); + session, "handle sweep server", &conn->sweep_cond)); WT_RET(__wt_thread_create( session, &conn->sweep_tid, __sweep_server, session)); diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 0cf746f84eb..48ea1ccb02b 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -267,7 +267,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session) } #endif - __wt_cond_auto_signal(session, cache->evict_cond); + __wt_cond_signal(session, cache->evict_cond); } /* @@ -311,9 +311,10 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) __wt_spin_unlock(session, &cache->evict_pass_lock); WT_ERR(ret); __wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"); + /* Don't rely on signals: check periodically. */ __wt_cond_auto_wait( - session, cache->evict_cond, did_work); + session, cache->evict_cond, did_work, NULL); __wt_verbose(session, WT_VERB_EVICTSERVER, "waking"); } else WT_ERR(__evict_lru_pages(session, false)); @@ -712,8 +713,8 @@ __evict_pass(WT_SESSION_IMPL *session) */ WT_STAT_CONN_INCR(session, cache_eviction_server_slept); - __wt_cond_wait( - session, cache->evict_cond, WT_THOUSAND); + __wt_cond_wait(session, + cache->evict_cond, WT_THOUSAND, NULL); continue; } @@ -1102,7 +1103,8 @@ __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server) /* If a worker thread found the queue empty, pause. */ if (ret == WT_NOTFOUND && !is_server && F_ISSET(S2C(session), WT_CONN_EVICTION_RUN)) - __wt_cond_wait(session, conn->evict_threads.wait_cond, 10000); + __wt_cond_wait( + session, conn->evict_threads.wait_cond, 10000, NULL); return (ret == WT_NOTFOUND ? 
0 : ret); } @@ -2102,8 +2104,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) break; case WT_NOTFOUND: /* Allow the queue to re-populate before retrying. */ - __wt_cond_wait( - session, conn->evict_threads.wait_cond, 10000); + __wt_cond_wait(session, + conn->evict_threads.wait_cond, 10000, NULL); cache->app_waits++; break; default: diff --git a/src/include/extern.h b/src/include/extern.h index 88fb8823930..eb2f9a0e784 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -613,11 +613,9 @@ extern void __wt_session_close_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_auto_alloc( WT_SESSION_IMPL *session, const char *name, bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_auto_alloc(WT_SESSION_IMPL *session, const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_auto_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_encrypt(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_encrypt_size(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h index 5acb7b0ed27..fed7835ada1 100644 --- a/src/include/extern_posix.h +++ b/src/include/extern_posix.h @@ -12,8 +12,8 @@ extern int __wt_posix_map(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapp extern int __wt_posix_map_preload(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, const void *map, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_posix_map_discard(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *map, size_t length, void *mapped_cookie) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapped_region, size_t len, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/extern_win.h b/src/include/extern_win.h index 11b45f11304..0bfc821c7a6 100644 --- a/src/include/extern_win.h +++ b/src/include/extern_win.h @@ -10,8 +10,8 @@ extern int __wt_os_win(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((war extern int __wt_getenv(WT_SESSION_IMPL *session, const char *variable, const char **envp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_win_map(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_regionp, size_t *lenp, void *mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_region, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/misc.i b/src/include/misc.i index f36be32d6a2..d5692a3f9cf 100644 --- a/src/include/misc.i +++ b/src/include/misc.i @@ -11,11 +11,12 @@ * Wait on a mutex, optionally timing out. 
*/ static inline void -__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) +__wt_cond_wait(WT_SESSION_IMPL *session, + WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *)) { bool notused; - __wt_cond_wait_signal(session, cond, usecs, &notused); + __wt_cond_wait_signal(session, cond, usecs, run_func, &notused); } /* diff --git a/src/include/mutex.h b/src/include/mutex.h index 727a690bb1c..06b8c4a3304 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -21,8 +21,8 @@ struct __wt_condvar { int waiters; /* Numbers of waiters, or -1 if signalled with no waiters. */ /* - * The following fields are only used for automatically adjusting - * condition variables. They could be in a separate structure. + * The following fields are used for automatically adjusting condition + * variable wait times. */ uint64_t min_wait; /* Minimum wait duration */ uint64_t max_wait; /* Maximum wait duration */ diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 03bff7cd04f..f05d3d4ab55 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -2362,7 +2362,7 @@ struct __wt_connection { * @config{exclusive, fail if the database already exists\, generally used with * the \c create option., a boolean flag; default \c false.} * @config{extensions, list of shared library extensions to load (using dlopen). 
- * Any values specified to an library extension are passed to + * Any values specified to a library extension are passed to * WT_CONNECTION::load_extension as the \c config parameter (for example\, * extensions=(/path/ext.so={entry=my_entry}))., a list of strings; * default empty.} diff --git a/src/log/log.c b/src/log/log.c index da500a74e87..614ae1a9b6d 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -43,11 +43,11 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) */ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); if (++yield_count < WT_THOUSAND) __wt_yield(); else - __wt_cond_wait(session, log->log_write_cond, 200); + __wt_cond_wait(session, log->log_write_cond, 200, NULL); if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_lock(session, &log->log_slot_lock); } @@ -89,7 +89,7 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) log = conn->log; log->ckpt_lsn = *ckp_lsn; if (conn->log_cond != NULL) - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); } /* @@ -170,7 +170,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) */ while (log->sync_lsn.l.file < min_lsn->l.file) { __wt_cond_signal(session, S2C(session)->log_file_cond); - __wt_cond_wait(session, log->log_sync_cond, 10000); + __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); } __wt_spin_lock(session, &log->log_sync_lock); WT_ASSERT(session, log->log_dir_fh != NULL); @@ -915,7 +915,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) else { WT_STAT_CONN_INCR(session, log_prealloc_missed); if (conn->log_cond != NULL) - __wt_cond_auto_signal( + __wt_cond_signal( session, conn->log_cond); } } @@ -1490,7 +1490,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) */ if (log->sync_lsn.l.file < 
slot->slot_end_lsn.l.file || __wt_spin_trylock(session, &log->log_sync_lock) != 0) { - __wt_cond_wait(session, log->log_sync_cond, 10000); + __wt_cond_wait( + session, log->log_sync_cond, 10000, NULL); continue; } locked = true; @@ -2160,7 +2161,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, * XXX I've seen times when conditions are NULL. */ if (conn->log_cond != NULL) { - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); __wt_yield(); } else WT_ERR(__wt_log_force_write(session, 1, NULL)); @@ -2169,12 +2170,14 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, /* Wait for our writes to reach the OS */ while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) - __wt_cond_wait(session, log->log_write_cond, 10000); + __wt_cond_wait( + session, log->log_write_cond, 10000, NULL); } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) - __wt_cond_wait(session, log->log_sync_cond, 10000); + __wt_cond_wait( + session, log->log_sync_cond, 10000, NULL); } /* diff --git a/src/log/log_slot.c b/src/log/log_slot.c index d70c0d689be..d6e692f8c51 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -349,7 +349,7 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) /* * If we didn't find any free slots signal the worker thread. */ - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); __wt_yield(); #ifdef HAVE_DIAGNOSTIC ++count; diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index b0d0758775d..ffa00c0a5e7 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -154,7 +154,7 @@ __lsm_worker(void *arg) /* Don't busy wait if there was any work to do. 
*/ if (!progress) { - __wt_cond_wait(session, cookie->work_cond, 10000); + __wt_cond_wait(session, cookie->work_cond, 10000, NULL); continue; } } diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c index be8b1abda31..a5ee78f9e3e 100644 --- a/src/os_posix/os_mtx_cond.c +++ b/src/os_posix/os_mtx_cond.c @@ -13,8 +13,7 @@ * Allocate and initialize a condition variable. */ int -__wt_cond_alloc(WT_SESSION_IMPL *session, - const char *name, bool is_signalled, WT_CONDVAR **condp) +__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) { WT_CONDVAR *cond; WT_DECL_RET; @@ -27,7 +26,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, WT_ERR(pthread_cond_init(&cond->cond, NULL)); cond->name = name; - cond->waiters = is_signalled ? -1 : 0; + cond->waiters = 0; *condp = cond; return (0); @@ -42,8 +41,8 @@ err: __wt_free(session, cond); * out period expires, let the caller know. */ void -__wt_cond_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) +__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { struct timespec ts; WT_DECL_RET; @@ -62,6 +61,23 @@ __wt_cond_wait_signal( WT_ERR(pthread_mutex_lock(&cond->mtx)); locked = true; + /* + * It's possible to race with threads waking us up. That's not a problem + * if there are multiple wakeups because the next wakeup will get us, or + * if we're only pausing for a short period. It's a problem if there's + * only a single wakeup, our waker is likely waiting for us to exit. + * After acquiring the mutex (so we're guaranteed to be awakened by any + * future wakeup call), optionally check if we're OK to keep running. + * This won't ensure our caller won't just loop and call us again, but + * at least it's not our fault. + * + * Assert we're not waiting longer than a second if not checking the + * run status. 
+ */ + WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION); + if (run_func != NULL && !run_func(session)) + goto skipping; + if (usecs > 0) { __wt_epoch(session, &ts); ts.tv_sec += (time_t) @@ -81,7 +97,7 @@ __wt_cond_wait_signal( ret == ETIME || #endif ret == ETIMEDOUT) { - *signalled = false; +skipping: *signalled = false; ret = 0; } diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c index 79c62ccd7f2..0001c6c2322 100644 --- a/src/os_win/os_mtx_cond.c +++ b/src/os_win/os_mtx_cond.c @@ -13,8 +13,7 @@ * Allocate and initialize a condition variable. */ int -__wt_cond_alloc(WT_SESSION_IMPL *session, - const char *name, bool is_signalled, WT_CONDVAR **condp) +__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) { WT_CONDVAR *cond; @@ -26,7 +25,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, InitializeConditionVariable(&cond->cond); cond->name = name; - cond->waiters = is_signalled ? -1 : 0; + cond->waiters = 0; *condp = cond; return (0); @@ -38,8 +37,8 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, * out period expires, let the caller know. */ void -__wt_cond_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) +__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { BOOL sleepret; DWORD milliseconds, windows_error; @@ -59,8 +58,26 @@ __wt_cond_wait_signal( EnterCriticalSection(&cond->mtx); locked = true; + /* + * It's possible to race with threads waking us up. That's not a problem + * if there are multiple wakeups because the next wakeup will get us, or + * if we're only pausing for a short period. It's a problem if there's + * only a single wakeup, our waker is likely waiting for us to exit. + * After acquiring the mutex (so we're guaranteed to be awakened by any + * future wakeup call), optionally check if we're OK to keep running. 
+ * This won't ensure our caller won't just loop and call us again, but + * at least it's not our fault. + * + * Assert we're not waiting longer than a second if not checking the + * run status. + */ + WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION); + + if (run_func != NULL && !run_func(session)) + goto skipping; + if (usecs > 0) { - milliseconds64 = usecs / 1000; + milliseconds64 = usecs / WT_THOUSAND; /* * Check for 32-bit unsigned integer overflow @@ -90,7 +107,7 @@ __wt_cond_wait_signal( if (sleepret == 0) { windows_error = __wt_getlasterror(); if (windows_error == ERROR_TIMEOUT) { - *signalled = false; +skipping: *signalled = false; sleepret = 1; } } @@ -117,17 +134,17 @@ void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) { WT_DECL_RET; - bool locked; - - locked = false; __wt_verbose(session, WT_VERB_MUTEX, "signal %s", cond->name); /* - * Our callers are often setting flags to cause a thread to exit. Add - * a barrier to ensure the flags are seen by the threads. + * Our callers often set flags to cause a thread to exit. Add a barrier + * to ensure exit flags are seen by the sleeping threads, otherwise we + * can wake up a thread, it immediately goes back to sleep, and we'll + * hang. Use a full barrier (we may not write before waiting on thread + * join). */ - WT_WRITE_BARRIER(); + WT_FULL_BARRIER(); /* * Fast path if we are in (or can enter), a state where the next waiter diff --git a/src/session/session_api.c b/src/session/session_api.c index fcbfa8809b3..71626e098cb 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -1488,6 +1488,20 @@ __session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange) err: API_END_RET(session, ret); } +/* + * __transaction_sync_run_chk -- + * Check to decide if the transaction sync call should continue running. 
+ */ +static bool +__transaction_sync_run_chk(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + return (FLD_ISSET(conn->flags, WT_CONN_LOG_SERVER_RUN)); +} + /* * __session_transaction_sync -- * WT_SESSION->transaction_sync method. @@ -1502,7 +1516,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) WT_SESSION_IMPL *session; WT_TXN *txn; struct timespec now, start; - uint64_t timeout_ms, waited_ms; + uint64_t remaining_usec, timeout_ms, waited_ms; bool forever; session = (WT_SESSION_IMPL *)wt_session; @@ -1555,22 +1569,20 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) __wt_epoch(session, &start); /* * Keep checking the LSNs until we find it is stable or we reach - * our timeout. + * our timeout, or there's some other reason to quit. */ while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { + if (!__transaction_sync_run_chk(session)) + WT_ERR(ETIMEDOUT); + __wt_cond_signal(session, conn->log_file_cond); __wt_epoch(session, &now); waited_ms = WT_TIMEDIFF_MS(now, start); - if (forever || waited_ms < timeout_ms) - /* - * Note, we will wait an increasing amount of time - * each iteration, likely doubling. Also note that - * the function timeout value is in usecs (we are - * computing the wait time in msecs and passing that - * in, unchanged, as the usecs to wait). 
- */ - __wt_cond_wait(session, log->log_sync_cond, waited_ms); - else + if (forever || waited_ms < timeout_ms) { + remaining_usec = (timeout_ms - waited_ms) * WT_THOUSAND; + __wt_cond_wait(session, log->log_sync_cond, + remaining_usec, __transaction_sync_run_chk); + } else WT_ERR(ETIMEDOUT); } @@ -1825,7 +1837,7 @@ __open_session(WT_CONNECTION_IMPL *conn, session_ret->name = NULL; session_ret->id = i; - WT_ERR(__wt_cond_alloc(session, "session", false, &session_ret->cond)); + WT_ERR(__wt_cond_alloc(session, "session", &session_ret->cond)); if (WT_SESSION_FIRST_USE(session_ret)) __wt_random_init(&session_ret->rnd); diff --git a/src/support/cond_auto.c b/src/support/cond_auto.c index a3ae67f5baa..600e5eab0ff 100644 --- a/src/support/cond_auto.c +++ b/src/support/cond_auto.c @@ -1,29 +1,9 @@ /*- - * Public Domain 2014-2016 MongoDB, Inc. - * Public Domain 2008-2014 WiredTiger, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. * - * This is free and unencumbered software released into the public domain. - * - * Anyone is free to copy, modify, publish, use, compile, sell, or - * distribute this software, either in source code form or as a compiled - * binary, for any purpose, commercial or non-commercial, and by any - * means. - * - * In jurisdictions that recognize copyright laws, the author or authors - * of this software dedicate any and all copyright interest in the - * software to the public domain. We make this dedication for the benefit - * of the public at large and to the detriment of our heirs and - * successors. We intend this dedication to be an overt act of - * relinquishment in perpetuity of all present and future rights to this - * software under copyright law. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. + * See the file LICENSE for redistribution information. */ #include "wt_internal.h" @@ -38,13 +18,12 @@ * Allocate and initialize an automatically adjusting condition variable. */ int -__wt_cond_auto_alloc( - WT_SESSION_IMPL *session, const char *name, - bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) +__wt_cond_auto_alloc(WT_SESSION_IMPL *session, + const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp) { WT_CONDVAR *cond; - WT_RET(__wt_cond_alloc(session, name, is_signalled, condp)); + WT_RET(__wt_cond_alloc(session, name, condp)); cond = *condp; cond->min_wait = min; @@ -54,34 +33,20 @@ __wt_cond_auto_alloc( return (0); } -/* - * __wt_cond_auto_signal -- - * Signal a condition variable. - */ -void -__wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) -{ - - WT_ASSERT(session, cond->min_wait != 0); - __wt_cond_signal(session, cond); -} - /* * __wt_cond_auto_wait_signal -- * Wait on a mutex, optionally timing out. If we get it before the time * out period expires, let the caller know. - * TODO: Can this version of the API be removed, now that we have the - * auto adjusting condition variables? */ void -__wt_cond_auto_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) +__wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { uint64_t delta; /* * Catch cases where this function is called with a condition variable - * that was initialized non-auto. + * that wasn't initialized to do automatic adjustments. 
*/ WT_ASSERT(session, cond->min_wait != 0); @@ -94,7 +59,8 @@ __wt_cond_auto_wait_signal( cond->max_wait, cond->prev_wait + delta); } - __wt_cond_wait_signal(session, cond, cond->prev_wait, signalled); + __wt_cond_wait_signal( + session, cond, cond->prev_wait, run_func, signalled); if (progress || *signalled) WT_STAT_CONN_INCR(session, cond_auto_wait_reset); @@ -108,24 +74,10 @@ __wt_cond_auto_wait_signal( * out period expires, let the caller know. */ void -__wt_cond_auto_wait( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) +__wt_cond_auto_wait(WT_SESSION_IMPL *session, + WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *)) { - bool signalled; - - /* - * Call the signal version so the wait period is reset if the - * condition is woken explicitly. - */ - __wt_cond_auto_wait_signal(session, cond, progress, &signalled); -} + bool notused; -/* - * __wt_cond_auto_destroy -- - * Destroy a condition variable. - */ -int -__wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) -{ - return (__wt_cond_destroy(session, condp)); + __wt_cond_auto_wait_signal(session, cond, progress, run_func, &notused); } diff --git a/src/support/thread_group.c b/src/support/thread_group.c index beb143e63e2..2b4b7ad4e61 100644 --- a/src/support/thread_group.c +++ b/src/support/thread_group.c @@ -259,7 +259,7 @@ __wt_thread_group_create( __wt_rwlock_init(session, &group->lock); WT_ERR(__wt_cond_alloc( - session, "Thread group cond", false, &group->wait_cond)); + session, "thread group cond", &group->wait_cond)); cond_alloced = true; __wt_writelock(session, &group->lock); -- cgit v1.2.1 From 0a70661a0d33c9705509955baafded2855054a29 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Thu, 26 Jan 2017 16:54:46 -0500 Subject: WT-3156 Add check in assertions for errors.
(#3271) --- src/log/log.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/log/log.c b/src/log/log.c index 614ae1a9b6d..1482cc0aca1 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -2202,12 +2202,12 @@ err: /* * If one of the sync flags is set, assert the proper LSN has moved to - * match. + * match on success. */ - WT_ASSERT(session, !LF_ISSET(WT_LOG_FLUSH) || + WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) || __wt_log_cmp(&log->write_lsn, &lsn) >= 0); - WT_ASSERT(session, - !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); + WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) || + __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); return (ret); } -- cgit v1.2.1 From 1e24579efee68f6fdb6a4c582275a50d95d7eb81 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Wed, 1 Feb 2017 12:11:48 +1100 Subject: WT-3115 Convert the dhandle list lock into a read/write lock. (#3236) It was a spinlock, but most acquirers only need shared access and it can be a contention point in many-table workloads. Split uses of the handle list lock into small operations. In particular, only hold the handle list lock to get the "next" handle, not for loops over all the handles in the system. Update statistics around handle list lock and corresponding doc. 
--- dist/flags.py | 3 +- dist/s_stat | 3 - dist/stat_data.py | 4 +- src/conn/conn_dhandle.c | 55 ++++++----- src/conn/conn_handle.c | 4 +- src/conn/conn_stat.c | 8 +- src/conn/conn_sweep.c | 2 +- src/cursor/cur_backup.c | 8 +- src/docs/upgrading.dox | 6 ++ src/evict/evict_lru.c | 50 +++++----- src/evict/evict_stat.c | 2 +- src/include/cache.i | 2 +- src/include/connection.h | 6 +- src/include/dhandle.h | 18 ++++ src/include/extern.h | 1 + src/include/flags.h | 33 +++---- src/include/schema.h | 72 +++++++++++--- src/include/stat.h | 4 +- src/include/wiredtiger.in | 218 +++++++++++++++++++++--------------------- src/lsm/lsm_cursor.c | 4 +- src/lsm/lsm_manager.c | 12 +-- src/lsm/lsm_stat.c | 4 +- src/lsm/lsm_tree.c | 63 ++++++------ src/lsm/lsm_work_unit.c | 4 +- src/schema/schema_drop.c | 2 +- src/schema/schema_rename.c | 2 +- src/schema/schema_worker.c | 2 +- src/session/session_dhandle.c | 43 +++++---- src/support/stat.c | 16 +--- src/txn/txn_ckpt.c | 5 +- 30 files changed, 359 insertions(+), 297 deletions(-) diff --git a/dist/flags.py b/dist/flags.py index 55ce233e60d..216f7c29e0a 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -117,7 +117,8 @@ flags = { 'SESSION_CAN_WAIT', 'SESSION_INTERNAL', 'SESSION_LOCKED_CHECKPOINT', - 'SESSION_LOCKED_HANDLE_LIST', + 'SESSION_LOCKED_HANDLE_LIST_READ', + 'SESSION_LOCKED_HANDLE_LIST_WRITE', 'SESSION_LOCKED_METADATA', 'SESSION_LOCKED_PASS', 'SESSION_LOCKED_SCHEMA', diff --git a/dist/s_stat b/dist/s_stat index 5d5937e1833..6aeeca6faa6 100755 --- a/dist/s_stat +++ b/dist/s_stat @@ -25,9 +25,6 @@ cat << UNUSED_STAT_FIELDS lock_checkpoint_count lock_checkpoint_wait_application lock_checkpoint_wait_internal -lock_handle_list_count -lock_handle_list_wait_application -lock_handle_list_wait_internal lock_metadata_count lock_metadata_wait_application lock_metadata_wait_internal diff --git a/dist/stat_data.py b/dist/stat_data.py index 0af5d6d017e..a4d92345f88 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -288,9 +288,7 @@ 
connection_stats = [ LockStat('lock_checkpoint_count', 'checkpoint lock acquisitions'), LockStat('lock_checkpoint_wait_application', 'checkpoint lock application thread wait time (usecs)'), LockStat('lock_checkpoint_wait_internal', 'checkpoint lock internal thread wait time (usecs)'), - LockStat('lock_handle_list_count', 'handle-list lock acquisitions'), - LockStat('lock_handle_list_wait_application', 'handle-list lock application thread wait time (usecs)'), - LockStat('lock_handle_list_wait_internal', 'handle-list lock internal thread wait time (usecs)'), + LockStat('lock_handle_list_wait_eviction', 'handle-list lock eviction thread wait time (usecs)'), LockStat('lock_metadata_count', 'metadata lock acquisitions'), LockStat('lock_metadata_wait_application', 'metadata lock application thread wait time (usecs)'), LockStat('lock_metadata_wait_internal', 'metadata lock internal thread wait time (usecs)'), diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index b2f4bb04ce4..866b8633f71 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -25,21 +25,19 @@ __conn_dhandle_destroy(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) } /* - * __conn_dhandle_alloc -- + * __wt_conn_dhandle_alloc -- * Allocate a new data handle and return it linked into the connection's * list. 
*/ -static int -__conn_dhandle_alloc(WT_SESSION_IMPL *session, - const char *uri, const char *checkpoint, WT_DATA_HANDLE **dhandlep) +int +__wt_conn_dhandle_alloc( + WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; uint64_t bucket; - *dhandlep = NULL; - WT_RET(__wt_calloc_one(session, &dhandle)); __wt_rwlock_init(session, &dhandle->rwlock); @@ -75,7 +73,7 @@ __conn_dhandle_alloc(WT_SESSION_IMPL *session, bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; WT_CONN_DHANDLE_INSERT(S2C(session), dhandle, bucket); - *dhandlep = dhandle; + session->dhandle = dhandle; return (0); err: __conn_dhandle_destroy(session, dhandle); @@ -122,10 +120,7 @@ __wt_conn_dhandle_find( } } - WT_RET(__conn_dhandle_alloc(session, uri, checkpoint, &dhandle)); - - session->dhandle = dhandle; - return (0); + return (WT_NOTFOUND); } /* @@ -419,12 +414,11 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; + WT_DECL_RET; uint64_t bucket; conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - /* * If we're given a URI, then we walk only the hash list for that * name. If we don't have a URI we walk the entire dhandle list. 
@@ -432,29 +426,42 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, if (uri != NULL) { bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; - TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) { + + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, + &conn->dhhash[bucket], hashq)); + if (dhandle == NULL) + return (0); + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || F_ISSET(dhandle, WT_DHANDLE_DEAD) || dhandle->checkpoint != NULL || strcmp(uri, dhandle->name) != 0) continue; - WT_RET(__conn_btree_apply_internal( - session, dhandle, file_func, name_func, cfg)); + WT_ERR(__conn_btree_apply_internal(session, + dhandle, file_func, name_func, cfg)); } } else { - TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); + if (dhandle == NULL) + return (0); + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || F_ISSET(dhandle, WT_DHANDLE_DEAD) || dhandle->checkpoint != NULL || !WT_PREFIX_MATCH(dhandle->name, "file:") || WT_IS_METADATA(dhandle)) continue; - WT_RET(__conn_btree_apply_internal( - session, dhandle, file_func, name_func, cfg)); + WT_ERR(__conn_btree_apply_internal(session, + dhandle, file_func, name_func, cfg)); } } - return (0); +err: WT_DHANDLE_RELEASE(dhandle); + return (ret); } /* @@ -473,7 +480,8 @@ __wt_conn_dhandle_close_all( conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); WT_ASSERT(session, session->dhandle == NULL); bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; @@ -534,7 +542,8 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, bool final) dhandle = session->dhandle; bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, 
WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); WT_ASSERT(session, dhandle != conn->cache->evict_file_next); /* Check if the handle was reacquired by a session while we waited. */ @@ -583,7 +592,7 @@ __wt_conn_dhandle_discard_single( } /* Try to remove the handle, protected by the data handle lock. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __conn_dhandle_remove(session, final)); if (set_pass_intr) (void)__wt_atomic_subv32(&S2C(session)->cache->pass_intr, 1); diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 54bcfd98aba..4f8d89fa9d2 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -53,7 +53,6 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) /* Spinlocks. */ WT_RET(__wt_spin_init(session, &conn->api_lock, "api")); WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint); - WT_SPIN_INIT_TRACKED(session, &conn->dhandle_lock, handle_list); WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor")); WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); @@ -64,6 +63,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file")); /* Read-write locks */ + __wt_rwlock_init(session, &conn->dhandle_lock); __wt_rwlock_init(session, &conn->hot_backup_lock); WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); @@ -134,7 +134,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->api_lock); __wt_spin_destroy(session, &conn->block_lock); __wt_spin_destroy(session, &conn->checkpoint_lock); - __wt_spin_destroy(session, &conn->dhandle_lock); + __wt_rwlock_destroy(session, &conn->dhandle_lock); __wt_spin_destroy(session, &conn->encryptor_lock); __wt_spin_destroy(session, &conn->fh_lock); __wt_rwlock_destroy(session, &conn->hot_backup_lock); diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 
31dc9c45992..d89392b66c6 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -409,7 +409,6 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) struct timespec ts; struct tm *tm, _tm; WT_CONNECTION_IMPL *conn; - WT_DECL_RET; WT_FSTREAM *log_stream; conn = S2C(session); @@ -446,12 +445,9 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) * Lock the schema and walk the list of open handles, dumping * any that match the list of object sources. */ - if (conn->stat_sources != NULL) { - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_btree_apply( + if (conn->stat_sources != NULL) + WT_RET(__wt_conn_btree_apply( session, NULL, __statlog_apply, NULL, NULL)); - WT_RET(ret); - } /* * Walk the list of open LSM trees, dumping any that match the diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index f9b7305c7d8..8c186c63939 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -233,7 +233,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) if (!WT_DHANDLE_CAN_DISCARD(dhandle)) continue; - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __sweep_remove_one(session, dhandle)); if (ret == 0) WT_STAT_CONN_INCR(session, dh_sweep_remove); diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index 08b15e6ca5e..61ced8d11e7 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -346,13 +346,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) static int __backup_all(WT_SESSION_IMPL *session) { - WT_DECL_RET; - /* Build a list of the file objects that need to be copied. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, ret = - __wt_meta_apply_all(session, NULL, __backup_list_uri_append, NULL)); - - return (ret); + return (__wt_meta_apply_all( + session, NULL, __backup_list_uri_append, NULL)); } /* diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 4a356f7da61..f463e6bc615 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -7,6 +7,12 @@ The WiredTiger Utility can now \c truncate an object. Removing all contents from the specified object. +
Handle list lock statistics
+
+In the 2.9.1 release we added statistics tracking handle list lock timing. We +have since switched that lock from a spin lock to a read-write lock, and consequently +changed the statistics tracking lock-related wait time. +
@section version_291 Upgrading to Version 2.9.1 diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 48ea1ccb02b..de1cff85816 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -24,40 +24,40 @@ static int __evict_walk_file( (S2C(s)->evict_threads.current_threads > 1) /* - * __evict_lock_dhandle -- - * Try to get the dhandle lock, with yield and sleep back off. + * __evict_lock_handle_list -- + * Try to get the handle list lock, with yield and sleep back off. * Keep timing statistics overall. */ static int -__evict_lock_dhandle(WT_SESSION_IMPL *session) +__evict_lock_handle_list(WT_SESSION_IMPL *session) { struct timespec enter, leave; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - WT_SPINLOCK *dh_lock; - int64_t **stats; + WT_RWLOCK *dh_lock; u_int spins; bool dh_stats; conn = S2C(session); cache = conn->cache; dh_lock = &conn->dhandle_lock; - stats = (int64_t **)conn->stats; - dh_stats = WT_STAT_ENABLED(session) && dh_lock->stat_count_off != -1; /* - * Maintain lock acquisition timing statistics as if this were a - * regular lock acquisition. + * Setup tracking of handle lock acquisition wait time if statistics + * are enabled. */ + dh_stats = WT_STAT_ENABLED(session); + if (dh_stats) __wt_epoch(session, &enter); + /* * Use a custom lock acquisition back off loop so the eviction server * notices any interrupt quickly. 
*/ for (spins = 0; - (ret = __wt_spin_trylock_track(session, dh_lock)) == EBUSY && + (ret = __wt_try_readlock(session, dh_lock)) == EBUSY && cache->pass_intr == 0; spins++) { if (spins < WT_THOUSAND) __wt_yield(); @@ -70,8 +70,9 @@ __evict_lock_dhandle(WT_SESSION_IMPL *session) WT_RET(ret); if (dh_stats) { __wt_epoch(session, &leave); - stats[session->stat_bucket][dh_lock->stat_int_usecs_off] += - (int64_t)WT_TIMEDIFF_US(leave, enter); + WT_STAT_CONN_INCRV( + session, lock_handle_list_wait_eviction, + (int64_t)WT_TIMEDIFF_US(leave, enter)); } return (0); } @@ -379,18 +380,17 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) * otherwise we can block applications evicting large pages. */ if (!__wt_cache_stuck(session)) { - /* - * If we gave up acquiring the lock, that indicates a - * session is waiting for us to clear walks. Do that - * as part of a normal pass (without the handle list + * Try to get the handle list lock: if we give up, that + * indicates a session is waiting for us to clear walks. Do + * that as part of a normal pass (without the handle list * lock) to avoid deadlock. */ - if ((ret = __evict_lock_dhandle(session)) == EBUSY) + if ((ret = __evict_lock_handle_list(session)) == EBUSY) return (0); WT_RET(ret); ret = __evict_clear_all_walks(session); - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); WT_RET(ret); cache->pages_evicted = 0; @@ -1321,7 +1321,7 @@ retry: while (slot < max_entries) { * reference count to keep it alive while we sweep. 
*/ if (!dhandle_locked) { - WT_ERR(__evict_lock_dhandle(session)); + WT_ERR(__evict_lock_handle_list(session)); dhandle_locked = true; } @@ -1400,7 +1400,7 @@ retry: while (slot < max_entries) { (void)__wt_atomic_addi32(&dhandle->session_inuse, 1); incr = true; - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); dhandle_locked = false; /* @@ -1447,7 +1447,7 @@ retry: while (slot < max_entries) { } err: if (dhandle_locked) { - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); dhandle_locked = false; } @@ -2319,8 +2319,11 @@ __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); WT_RET(__wt_msg(session, "cache dump")); - __wt_spin_lock(session, &conn->dhandle_lock); - TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); + if (dhandle == NULL) + break; if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; @@ -2331,7 +2334,6 @@ __wt_verbose_dump_cache(WT_SESSION_IMPL *session) if (ret != 0) break; } - __wt_spin_unlock(session, &conn->dhandle_lock); WT_RET(ret); /* diff --git a/src/evict/evict_stat.c b/src/evict/evict_stat.c index 2dd3b1e83a0..7c2d5722a63 100644 --- a/src/evict/evict_stat.c +++ b/src/evict/evict_stat.c @@ -134,5 +134,5 @@ __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_STAT_DATA_SET(session, cache_state_root_size, btree->root.page->memory_footprint); - WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session)); + __evict_stat_walk(session); } diff --git a/src/include/cache.i b/src/include/cache.i index 17ab39e97d2..d71978ccf35 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -364,7 +364,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) * block eviction), we don't want to highjack the thread for eviction. 
*/ if (F_ISSET(session, WT_SESSION_NO_EVICTION | - WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA)) + WT_SESSION_LOCKED_HANDLE_LIST_WRITE | WT_SESSION_LOCKED_SCHEMA)) return (0); /* In memory configurations don't block when the cache is full. */ diff --git a/src/include/connection.h b/src/include/connection.h index 64ac4271db1..3a719e59608 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -123,12 +123,16 @@ struct __wt_named_extractor { * main queue and the hashed queue. */ #define WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket) do { \ + WT_ASSERT(session, \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \ TAILQ_INSERT_HEAD(&(conn)->dhqh, dhandle, q); \ TAILQ_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashq); \ ++conn->dhandle_count; \ } while (0) #define WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket) do { \ + WT_ASSERT(session, \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \ TAILQ_REMOVE(&(conn)->dhqh, dhandle, q); \ TAILQ_REMOVE(&(conn)->dhhash[bucket], dhandle, hashq); \ --conn->dhandle_count; \ @@ -163,13 +167,13 @@ struct __wt_connection_impl { WT_SPINLOCK api_lock; /* Connection API spinlock */ WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */ - WT_SPINLOCK dhandle_lock; /* Data handle list spinlock */ WT_SPINLOCK fh_lock; /* File handle queue spinlock */ WT_SPINLOCK metadata_lock; /* Metadata update spinlock */ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ WT_SPINLOCK table_lock; /* Table creation spinlock */ WT_SPINLOCK turtle_lock; /* Turtle file spinlock */ + WT_RWLOCK dhandle_lock; /* Data handle list lock */ /* * We distribute the btree page locks across a set of spin locks. 
Don't diff --git a/src/include/dhandle.h b/src/include/dhandle.h index dcc788f0839..4f318e7bccf 100644 --- a/src/include/dhandle.h +++ b/src/include/dhandle.h @@ -37,6 +37,24 @@ #define WT_SESSION_META_DHANDLE(s) \ (((WT_CURSOR_BTREE *)((s)->meta_cursor))->btree->dhandle) +#define WT_DHANDLE_ACQUIRE(dhandle) \ + (void)__wt_atomic_add32(&dhandle->session_ref, 1) + +#define WT_DHANDLE_RELEASE(dhandle) \ + (void)__wt_atomic_sub32(&dhandle->session_ref, 1) + +#define WT_DHANDLE_NEXT(session, dhandle, head, field) do { \ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));\ + if (dhandle == NULL) \ + dhandle = TAILQ_FIRST(head); \ + else { \ + WT_DHANDLE_RELEASE(dhandle); \ + dhandle = TAILQ_NEXT(dhandle, field); \ + } \ + if (dhandle != NULL) \ + WT_DHANDLE_ACQUIRE(dhandle); \ +} while (0) + /* * WT_DATA_HANDLE -- * A handle for a generic named data source. diff --git a/src/include/extern.h b/src/include/extern.h index eb2f9a0e784..d7d58c58048 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -254,6 +254,7 @@ extern WT_THREAD_RET __wt_cache_pool_server(void *arg) WT_GCC_FUNC_DECL_ATTRIBUT extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_conn_dhandle_alloc( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/flags.h b/src/include/flags.h index 0b92a12c686..5219bf33ed6 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -53,22 +53,23 @@ #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_INTERNAL 0x00000002 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000004 -#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000008 -#define WT_SESSION_LOCKED_METADATA 0x00000010 -#define WT_SESSION_LOCKED_PASS 0x00000020 -#define WT_SESSION_LOCKED_SCHEMA 0x00000040 -#define WT_SESSION_LOCKED_SLOT 0x00000080 -#define WT_SESSION_LOCKED_TABLE 0x00000100 -#define WT_SESSION_LOCKED_TURTLE 0x00000200 -#define WT_SESSION_LOGGING_INMEM 0x00000400 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800 -#define WT_SESSION_NO_CACHE 0x00001000 -#define WT_SESSION_NO_DATA_HANDLES 0x00002000 -#define WT_SESSION_NO_EVICTION 0x00004000 -#define WT_SESSION_NO_LOGGING 0x00008000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000 -#define WT_SESSION_SERVER_ASYNC 0x00040000 +#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000008 +#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000010 +#define WT_SESSION_LOCKED_METADATA 0x00000020 +#define WT_SESSION_LOCKED_PASS 0x00000040 +#define WT_SESSION_LOCKED_SCHEMA 0x00000080 +#define WT_SESSION_LOCKED_SLOT 0x00000100 +#define WT_SESSION_LOCKED_TABLE 0x00000200 +#define WT_SESSION_LOCKED_TURTLE 0x00000400 +#define WT_SESSION_LOGGING_INMEM 0x00000800 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00001000 +#define WT_SESSION_NO_CACHE 0x00002000 +#define WT_SESSION_NO_DATA_HANDLES 
0x00004000 +#define WT_SESSION_NO_EVICTION 0x00008000 +#define WT_SESSION_NO_LOGGING 0x00010000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00020000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00040000 +#define WT_SESSION_SERVER_ASYNC 0x00080000 #define WT_STAT_CLEAR 0x00000001 #define WT_STAT_JSON 0x00000002 #define WT_STAT_ON_CLOSE 0x00000004 diff --git a/src/include/schema.h b/src/include/schema.h index bb116e5cf2f..fff57951c0e 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -78,6 +78,11 @@ struct __wt_table { */ #define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1) +/* Make it simple to check a generic locked state on the handle list lock */ +#define WT_SESSION_LOCKED_HANDLE_LIST \ + (WT_SESSION_LOCKED_HANDLE_LIST_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST_WRITE) + /* * WT_WITH_LOCK_WAIT -- * Wait for a lock, perform an operation, drop the lock. @@ -122,16 +127,47 @@ struct __wt_table { &S2C(session)->checkpoint_lock, WT_SESSION_LOCKED_CHECKPOINT, op) /* - * WT_WITH_HANDLE_LIST_LOCK -- - * Acquire the data handle list lock, perform an operation, drop the lock. + * WT_WITH_HANDLE_LIST_READ_LOCK -- + * Acquire the data handle list lock in shared mode, perform an operation, + * drop the lock. The handle list lock is a read-write lock so the + * implementation is different to the other lock macros. * * Note: always waits because some operations need the handle list lock to * discard handles, and we only expect it to be held across short * operations. 
*/ -#define WT_WITH_HANDLE_LIST_LOCK(session, op) \ - WT_WITH_LOCK_WAIT(session, \ - &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op) +#define WT_WITH_HANDLE_LIST_READ_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { \ + op; \ + } else { \ + __wt_readlock(session, &S2C(session)->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + __wt_readunlock(session, &S2C(session)->dhandle_lock); \ + } \ +} while (0) + +/* + * WT_WITH_HANDLE_LIST_WRITE_LOCK -- + * Acquire the data handle list lock in exclusive mode, perform an operation, + * drop the lock. The handle list lock is a read-write lock so the + * implementation is different to the other lock macros. + * The read lock must not already be held: there is no upgrade to write. + */ +#define WT_WITH_HANDLE_LIST_WRITE_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ));\ + __wt_writelock(session, &S2C(session)->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + __wt_writeunlock(session, &S2C(session)->dhandle_lock); \ + } \ +} while (0) /* * WT_WITH_METADATA_LOCK -- @@ -192,15 +228,21 @@ struct __wt_table { WT_CONNECTION_IMPL *__conn = S2C(session); \ bool __checkpoint_locked = \ F_ISSET(session, WT_SESSION_LOCKED_CHECKPOINT); \ - bool __handle_locked = \ - F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ + bool __handle_read_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + bool __handle_write_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ bool __table_locked = \ F_ISSET(session, WT_SESSION_LOCKED_TABLE); \ bool __schema_locked = \ F_ISSET(session, WT_SESSION_LOCKED_SCHEMA); \ - if (__handle_locked) { \ - F_CLR(session, 
WT_SESSION_LOCKED_HANDLE_LIST); \ - __wt_spin_unlock(session, &__conn->dhandle_lock); \ + if (__handle_read_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + __wt_readunlock(session, &__conn->dhandle_lock); \ + } \ + if (__handle_write_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + __wt_writeunlock(session, &__conn->dhandle_lock); \ } \ if (__table_locked) { \ F_CLR(session, WT_SESSION_LOCKED_TABLE); \ @@ -227,8 +269,12 @@ struct __wt_table { __wt_spin_lock(session, &__conn->table_lock); \ F_SET(session, WT_SESSION_LOCKED_TABLE); \ } \ - if (__handle_locked) { \ - __wt_spin_lock(session, &__conn->dhandle_lock); \ - F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ + if (__handle_read_locked) { \ + __wt_readlock(session, &__conn->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + } \ + if (__handle_write_locked) { \ + __wt_writelock(session, &__conn->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ } \ } while (0) diff --git a/src/include/stat.h b/src/include/stat.h index fd3e3290d95..8b2e78a4ed5 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -392,9 +392,7 @@ struct __wt_connection_stats { int64_t lock_checkpoint_count; int64_t lock_checkpoint_wait_application; int64_t lock_checkpoint_wait_internal; - int64_t lock_handle_list_count; - int64_t lock_handle_list_wait_application; - int64_t lock_handle_list_wait_internal; + int64_t lock_handle_list_wait_eviction; int64_t lock_metadata_count; int64_t lock_metadata_wait_application; int64_t lock_metadata_wait_internal; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index f05d3d4ab55..d1e3d383396 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -4595,240 +4595,236 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1133 /*! 
lock: checkpoint lock internal thread wait time (usecs) */ #define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1134 -/*! lock: handle-list lock acquisitions */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_COUNT 1135 -/*! lock: handle-list lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_APPLICATION 1136 -/*! lock: handle-list lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_INTERNAL 1137 +/*! lock: handle-list lock eviction thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_EVICTION 1135 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1138 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1136 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1139 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1137 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1140 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1138 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1141 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1139 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1142 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1140 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1143 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1141 /*! lock: table lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_COUNT 1144 +#define WT_STAT_CONN_LOCK_TABLE_COUNT 1142 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1145 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1143 /*! 
* lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1146 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1144 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1147 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1145 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1148 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1146 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1149 +#define WT_STAT_CONN_LOG_SLOT_RACES 1147 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1150 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1148 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1151 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1149 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1152 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1150 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1153 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1151 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1154 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1152 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1155 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1153 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1156 +#define WT_STAT_CONN_LOG_FLUSH 1154 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1157 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1155 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1158 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1156 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1159 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1157 /*! 
log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1160 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1158 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1161 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1159 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1162 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1160 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1163 +#define WT_STAT_CONN_LOG_SCANS 1161 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1164 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1162 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1165 +#define WT_STAT_CONN_LOG_WRITE_LSN 1163 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1166 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1164 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1167 +#define WT_STAT_CONN_LOG_SYNC 1165 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1168 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1166 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1169 +#define WT_STAT_CONN_LOG_SYNC_DIR 1167 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1170 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1168 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1171 +#define WT_STAT_CONN_LOG_WRITES 1169 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1172 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1170 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1173 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1171 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1174 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1172 /*! 
log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1175 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1173 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1176 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1174 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1177 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1175 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1178 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1176 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1179 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1177 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1180 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1178 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1181 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1179 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1182 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1180 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1183 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1181 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1184 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1182 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1185 +#define WT_STAT_CONN_REC_PAGES 1183 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1186 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1184 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1187 +#define WT_STAT_CONN_REC_PAGE_DELETE 1185 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1188 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1186 /*! 
reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1189 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1187 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1190 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1188 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1191 +#define WT_STAT_CONN_SESSION_OPEN 1189 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1192 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1190 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1193 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1191 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1194 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1192 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1195 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1193 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1196 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1194 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1197 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1195 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1198 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1196 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1199 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1197 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1200 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1198 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1201 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1199 /*! 
session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1202 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1200 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1203 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1201 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1204 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1202 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1205 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1203 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1206 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1204 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1207 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1205 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1208 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1206 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1209 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1207 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1210 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1208 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1211 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1209 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1212 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1210 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1213 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1211 /*! 
thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1214 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1212 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1215 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1213 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1216 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1214 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1217 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1215 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1218 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1216 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1219 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1217 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1220 +#define WT_STAT_CONN_PAGE_SLEEP 1218 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1221 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1219 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1222 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1220 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1223 +#define WT_STAT_CONN_TXN_BEGIN 1221 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1224 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1222 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1225 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1223 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1226 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1224 /*! 
transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1227 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1225 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1228 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1226 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1229 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1227 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1230 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1228 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1231 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1229 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1232 +#define WT_STAT_CONN_TXN_CHECKPOINT 1230 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1233 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1231 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1234 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1232 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1235 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1233 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1236 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1234 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1237 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1235 /*! 
transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1238 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1236 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1239 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1237 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1240 +#define WT_STAT_CONN_TXN_SYNC 1238 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1241 +#define WT_STAT_CONN_TXN_COMMIT 1239 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1242 +#define WT_STAT_CONN_TXN_ROLLBACK 1240 /*! * @} diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index a2511f48e2b..60afbc99ade 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -1692,8 +1692,8 @@ __wt_clsm_open(WT_SESSION_IMPL *session, bulk = cval.val != 0; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree)); + ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree); + /* * Check whether the exclusive open for a bulk load succeeded, and * if it did ensure that it's safe to bulk load into the tree. 
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index cbd83a5cd30..6dc06146179 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -387,8 +387,8 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) __wt_sleep(0, 10000); if (TAILQ_EMPTY(&conn->lsmqh)) continue; - __wt_spin_lock(session, &conn->dhandle_lock); - F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readlock(session, &conn->dhandle_lock); + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); dhandle_locked = true; TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { if (!lsm_tree->active) @@ -448,14 +448,14 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) session, WT_LSM_WORK_MERGE, 0, lsm_tree)); } } - __wt_spin_unlock(session, &conn->dhandle_lock); - F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readunlock(session, &conn->dhandle_lock); + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); dhandle_locked = false; } err: if (dhandle_locked) { - __wt_spin_unlock(session, &conn->dhandle_lock); - F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readunlock(session, &conn->dhandle_lock); + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); } return (ret); } diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index 150de968722..21e8991be94 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -33,9 +33,7 @@ __curstat_lsm_init( "checkpoint=" WT_CHECKPOINT, NULL, NULL }; locked = false; - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree)); WT_ERR(__wt_scr_alloc(session, 0, &uribuf)); /* Propagate all, fast and/or clear to the cursors we open. */ diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 71a981a6284..a9275976023 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -38,7 +38,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) /* We may be destroying an lsm_tree before it was added. 
*/ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) { WT_ASSERT(session, final || - F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q); } @@ -321,9 +321,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, metadata = NULL; /* If the tree can be opened, it already exists. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - if (ret == 0) { + if ((ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)) == 0) { __wt_lsm_tree_release(session, lsm_tree); return (exclusive ? EEXIST : 0); } @@ -339,7 +337,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_tree_open(session, uri, true, &lsm_tree)); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); @@ -404,6 +402,9 @@ __lsm_tree_find(WT_SESSION_IMPL *session, } *treep = lsm_tree; + + WT_ASSERT(session, lsm_tree->excl_session == + (exclusive ? session : NULL)); return (0); } @@ -456,7 +457,8 @@ __lsm_tree_open(WT_SESSION_IMPL *session, conn = S2C(session); lsm_tree = NULL; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); /* Start the LSM manager thread if it isn't running. */ if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1)) @@ -520,14 +522,21 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session, { WT_DECL_RET; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - - ret = __lsm_tree_find(session, uri, exclusive, treep); + /* + * Dropping and re-acquiring the lock is safe here, since the tree open + * call checks to see if another thread beat it to opening the tree + * before proceeding. 
+ */ + if (exclusive) + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_tree_find(session, uri, exclusive, treep)); + else + WT_WITH_HANDLE_LIST_READ_LOCK(session, + ret = __lsm_tree_find(session, uri, exclusive, treep)); if (ret == WT_NOTFOUND) - ret = __lsm_tree_open(session, uri, exclusive, treep); + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_tree_open(session, uri, exclusive, treep)); - WT_ASSERT(session, ret != 0 || - (*treep)->excl_session == (exclusive ? session : NULL)); return (ret); } @@ -857,9 +866,7 @@ __wt_lsm_tree_alter( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -899,9 +906,7 @@ __wt_lsm_tree_drop( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree)); WT_ASSERT(session, !lsm_tree->active); /* Prevent any new opens. */ @@ -934,7 +939,7 @@ __wt_lsm_tree_drop( WT_ASSERT(session, !lsm_tree->active); err: if (locked) __wt_lsm_tree_writeunlock(session, lsm_tree); - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); return (ret); @@ -960,9 +965,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -1007,7 +1010,7 @@ err: if (locked) * Discard this LSM tree structure. The first operation on the renamed * tree will create a new one. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); return (ret); @@ -1032,9 +1035,7 @@ __wt_lsm_tree_truncate( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -1068,7 +1069,7 @@ err: if (locked) * the last good version of the metadata will be used, resulting * in a valid (not truncated) tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); } @@ -1157,9 +1158,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) /* Tell __wt_schema_worker not to look inside the LSM tree. */ *skipp = true; - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, false, &lsm_tree)); if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) WT_ERR_MSG(session, EINVAL, @@ -1356,9 +1355,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session, locked = false; exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE); - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); /* * We mark that we're busy using the tree to coordinate diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index d9c185a3f58..4349acf7b55 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -276,7 +276,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { - WT_WITH_HANDLE_LIST_LOCK(session, + 
WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_discard_handle(session, chunk->uri, NULL)); if (ret == 0) chunk->evicted = 1; @@ -517,7 +517,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) * * This will fail with EBUSY if the file is still in use. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT)); WT_RET(ret); diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c index c1a4f257648..49801e4e5f9 100644 --- a/src/schema/schema_drop.c +++ b/src/schema/schema_drop.c @@ -30,7 +30,7 @@ __drop_file( WT_RET(__wt_schema_backup_check(session, filename)); /* Close all btree handles associated with this file. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all(session, uri, force)); WT_RET(ret); diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c index f512482c162..a374f4c2831 100644 --- a/src/schema/schema_rename.c +++ b/src/schema/schema_rename.c @@ -33,7 +33,7 @@ __rename_file( WT_RET(__wt_schema_backup_check(session, filename)); WT_RET(__wt_schema_backup_check(session, newfile)); /* Close any btree handles in the file. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all(session, uri, false)); WT_ERR(ret); diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index fb7f8cec074..e5f71b5d56f 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, * any open file handles, including checkpoints. 
*/ if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) { - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all( session, uri, false)); WT_ERR(ret); diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index f1251794b89..ee9bddbfc19 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -44,8 +44,7 @@ __session_discard_dhandle( TAILQ_REMOVE(&session->dhandles, dhandle_cache, q); TAILQ_REMOVE(&session->dhhash[bucket], dhandle_cache, hashq); - (void)__wt_atomic_sub32(&dhandle_cache->dhandle->session_ref, 1); - + WT_DHANDLE_RELEASE(dhandle_cache->dhandle); __wt_overwrite_and_free(session, dhandle_cache); } @@ -412,17 +411,27 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) /* * __session_find_shared_dhandle -- * Search for a data handle in the connection and add it to a session's - * cache. Since the data handle isn't locked, this must be called holding - * the handle list lock, and we must increment the handle's reference - * count before releasing it. + * cache. We must increment the handle's reference count while holding + * the handle list lock. */ static int __session_find_shared_dhandle( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { - WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint)); - (void)__wt_atomic_add32(&session->dhandle->session_ref, 1); - return (0); + WT_DECL_RET; + + WT_WITH_HANDLE_LIST_READ_LOCK(session, + if ((ret = __wt_conn_dhandle_find(session, uri, checkpoint)) == 0) + WT_DHANDLE_ACQUIRE(session->dhandle)); + + if (ret != WT_NOTFOUND) + return (ret); + + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + if ((ret = __wt_conn_dhandle_alloc(session, uri, checkpoint)) == 0) + WT_DHANDLE_ACQUIRE(session->dhandle)); + + return (ret); } /* @@ -450,16 +459,16 @@ __session_get_dhandle( * We didn't find a match in the session cache, search the shared * handle list and cache the handle we find. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __session_find_shared_dhandle(session, uri, checkpoint)); - WT_RET(ret); + WT_RET(__session_find_shared_dhandle(session, uri, checkpoint)); /* * Fixup the reference count on failure (we incremented the reference * count while holding the handle-list lock). */ - if ((ret = __session_add_dhandle(session)) != 0) - (void)__wt_atomic_sub32(&session->dhandle->session_ref, 1); + if ((ret = __session_add_dhandle(session)) != 0) { + WT_DHANDLE_RELEASE(session->dhandle); + session->dhandle = NULL; + } return (ret); } @@ -505,17 +514,15 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, * reopen handles in the meantime. A combination of the schema * and handle list locks are used to enforce this. */ - if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) || - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { + if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { dhandle->excl_session = NULL; dhandle->excl_ref = 0; F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); __wt_writeunlock(session, &dhandle->rwlock); WT_WITH_SCHEMA_LOCK(session, - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_session_get_btree( - session, uri, checkpoint, cfg, flags))); + ret = __wt_session_get_btree( + session, uri, checkpoint, cfg, flags)); return (ret); } diff --git a/src/support/stat.c b/src/support/stat.c index 167d17137ce..fd38e1b79ee 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -759,9 +759,7 @@ static const char * const __stats_connection_desc[] = { "lock: checkpoint lock acquisitions", "lock: checkpoint lock application thread wait time (usecs)", "lock: checkpoint lock internal thread wait time (usecs)", - "lock: handle-list lock acquisitions", - "lock: handle-list lock application thread wait time (usecs)", - "lock: handle-list lock internal thread wait time (usecs)", + "lock: handle-list lock eviction thread wait time (usecs)", "lock: metadata lock acquisitions", "lock: metadata lock application thread wait time (usecs)", "lock: metadata lock 
internal thread wait time (usecs)", @@ -1044,9 +1042,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->lock_checkpoint_count = 0; stats->lock_checkpoint_wait_application = 0; stats->lock_checkpoint_wait_internal = 0; - stats->lock_handle_list_count = 0; - stats->lock_handle_list_wait_application = 0; - stats->lock_handle_list_wait_internal = 0; + stats->lock_handle_list_wait_eviction = 0; stats->lock_metadata_count = 0; stats->lock_metadata_wait_application = 0; stats->lock_metadata_wait_internal = 0; @@ -1351,12 +1347,8 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, lock_checkpoint_wait_application); to->lock_checkpoint_wait_internal += WT_STAT_READ(from, lock_checkpoint_wait_internal); - to->lock_handle_list_count += - WT_STAT_READ(from, lock_handle_list_count); - to->lock_handle_list_wait_application += - WT_STAT_READ(from, lock_handle_list_wait_application); - to->lock_handle_list_wait_internal += - WT_STAT_READ(from, lock_handle_list_wait_internal); + to->lock_handle_list_wait_eviction += + WT_STAT_READ(from, lock_handle_list_wait_eviction); to->lock_metadata_count += WT_STAT_READ(from, lock_metadata_count); to->lock_metadata_wait_application += WT_STAT_READ(from, lock_metadata_wait_application); diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 3b19162fd3d..7b33b0c7788 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -640,9 +640,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, session->ckpt_handle_next == 0); WT_WITH_SCHEMA_LOCK(session, WT_WITH_TABLE_LOCK(session, - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __checkpoint_apply_all( - session, cfg, __wt_checkpoint_get_handles, NULL)))); + ret = __checkpoint_apply_all( + session, cfg, __wt_checkpoint_get_handles, NULL))); WT_ERR(ret); /* -- cgit v1.2.1 From 0562f92104f0b2d8ef218d9fe465ef718bc2d9cd Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Thu, 2 Feb 2017 16:40:30 +1100 Subject: WT-3150 Reduce impact of checkpoints 
on eviction. (#3265) In particular, don't have the eviction server give up all walks each time it is interrupted, and only wait for requesting threads to make progress: don't go to sleep. --- src/evict/evict_lru.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index de1cff85816..3cb513fd87b 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -281,7 +281,7 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - bool did_work; + bool did_work, was_intr; conn = S2C(session); cache = conn->cache; @@ -309,8 +309,21 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) ret = __evict_server(session, &did_work); F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS); F_CLR(session, WT_SESSION_LOCKED_PASS); + was_intr = cache->pass_intr != 0; __wt_spin_unlock(session, &cache->evict_pass_lock); WT_ERR(ret); + + /* + * If the eviction server was interrupted, wait until + * requests have been processed: the system may + * otherwise be busy so don't go to sleep. + */ + if (was_intr) { + while (cache->pass_intr != 0) + __wt_yield(); + continue; + } + __wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"); /* Don't rely on signals: check periodically. */ @@ -372,7 +385,8 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) /* Evict pages from the cache as needed. */ WT_RET(__evict_pass(session)); - if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) + if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) || + cache->pass_intr != 0) return (0); /* -- cgit v1.2.1 From 3e68fb2d7da35eeb122308971f02203c58caa538 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Fri, 3 Feb 2017 03:28:50 +1100 Subject: WT-3139 Enhance wtperf to support periodic table scans (#3268) * Enhance wtperf to support periodic table scans * Implement scans as read_range. * Use a random cursor to set key in table properly. 
* Don't allow insert workload with table specifier. * Reset the rand cursor so it isn't positioned. * Make wtperf pre_load_data an option. --- bench/wtperf/config.c | 42 ++++++- bench/wtperf/idle_table_cycle.c | 2 + bench/wtperf/stress/btree-split-stress.wtperf | 3 +- bench/wtperf/wtperf.c | 163 ++++++++++++++++++++------ bench/wtperf/wtperf.h | 5 + bench/wtperf/wtperf_opt.i | 10 +- src/docs/wtperf.dox | 6 +- 7 files changed, 183 insertions(+), 48 deletions(-) diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c index a15a3485dde..9eea99eeec4 100644 --- a/bench/wtperf/config.c +++ b/bench/wtperf/config.c @@ -215,6 +215,7 @@ config_threads(WTPERF *wtperf, const char *config, size_t len) return (EINVAL); } workp = &wtperf->workload[wtperf->workload_cnt++]; + workp->table_index = INT32_MAX; while ((ret = scan->next(scan, &k, &v)) == 0) { if (STRING_MATCH("count", k.str, k.len)) { @@ -233,12 +234,28 @@ config_threads(WTPERF *wtperf, const char *config, size_t len) goto err; continue; } + if (STRING_MATCH("pause", k.str, k.len)) { + if ((workp->pause = v.val) < 0) + goto err; + continue; + } if (STRING_MATCH("read", k.str, k.len) || STRING_MATCH("reads", k.str, k.len)) { if ((workp->read = v.val) < 0) goto err; continue; } + if (STRING_MATCH("read_range", k.str, k.len)) { + if ((workp->read_range = v.val) < 0) + goto err; + continue; + } + if (STRING_MATCH("table", k.str, k.len)) { + if (v.val <= 0) + goto err; + workp->table_index = (int32_t)v.val - 1; + continue; + } if (STRING_MATCH("throttle", k.str, k.len)) { workp->throttle = (uint64_t)v.val; continue; @@ -760,16 +777,33 @@ config_sanity(WTPERF *wtperf) opts->value_sz_min = opts->value_sz; } - if (opts->readonly && wtperf->workload != NULL) + if (wtperf->workload != NULL) for (i = 0, workp = wtperf->workload; - i < wtperf->workload_cnt; ++i, ++workp) - if (workp->insert != 0 || workp->update != 0 || - workp->truncate != 0) { + i < wtperf->workload_cnt; ++i, ++workp) { + if (opts->readonly && + (workp->insert 
!= 0 || workp->update != 0 || + workp->truncate != 0)) { fprintf(stderr, "Invalid workload: insert, update or " "truncate specified with readonly\n"); return (EINVAL); } + if (workp->insert != 0 && + workp->table_index != INT32_MAX) { + fprintf(stderr, + "Invalid workload: Cannot insert into " + "specific table only\n"); + return (EINVAL); + } + if (workp->table_index != INT32_MAX && + workp->table_index >= (int32_t)opts->table_count) { + fprintf(stderr, + "Workload table index %" PRId32 + " is larger than table count %" PRId32, + workp->table_index, opts->table_count); + return (EINVAL); + } + } return (0); } diff --git a/bench/wtperf/idle_table_cycle.c b/bench/wtperf/idle_table_cycle.c index 13fa55e86f5..bb44cfbde59 100644 --- a/bench/wtperf/idle_table_cycle.c +++ b/bench/wtperf/idle_table_cycle.c @@ -120,6 +120,7 @@ cycle_idle_tables(void *arg) return (NULL); start = stop; +#if 1 /* * Drop the table. Keep retrying on EBUSY failure - it is an * expected return when checkpoints are happening. @@ -136,6 +137,7 @@ cycle_idle_tables(void *arg) } if (check_timing(wtperf, "drop", start, &stop) != 0) return (NULL); +#endif } return (NULL); diff --git a/bench/wtperf/stress/btree-split-stress.wtperf b/bench/wtperf/stress/btree-split-stress.wtperf index 86bb288fc6d..eb6ca1cfddc 100644 --- a/bench/wtperf/stress/btree-split-stress.wtperf +++ b/bench/wtperf/stress/btree-split-stress.wtperf @@ -6,5 +6,4 @@ run_time=300 reopen_connection=false populate_threads=2 value_sz=256 -read_range=100 -threads=((count=4,inserts=1,throttle=100000),(count=8,reads=1)) +threads=((count=4,inserts=1,throttle=100000),(count=8,reads=1,read_range=100)) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index baa259f8817..044fd38dc06 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -432,19 +432,17 @@ err: wtperf->error = wtperf->stop = true; * search do them. Ensuring the keys we see are always in order. 
*/ static int -do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor) +do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor, int64_t read_range) { - CONFIG_OPTS *opts; - size_t range; uint64_t next_val, prev_val; + int64_t range; char *range_key_buf; char buf[512]; int ret; - opts = wtperf->opts; ret = 0; - if (opts->read_range == 0) + if (read_range == 0) return (0); memset(&buf[0], 0, 512 * sizeof(char)); @@ -454,7 +452,7 @@ do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor) testutil_check(cursor->get_key(cursor, &range_key_buf)); extract_key(range_key_buf, &next_val); - for (range = 0; range < opts->read_range; ++range) { + for (range = 0; range < read_range; ++range) { prev_val = next_val; ret = cursor->next(cursor); /* We are done if we reach the end. */ @@ -475,12 +473,56 @@ do_range_reads(WTPERF *wtperf, WT_CURSOR *cursor) return (0); } +/* pre_load_data -- + * Pull everything into cache before starting the workload phase. + */ +static int +pre_load_data(WTPERF *wtperf) +{ + CONFIG_OPTS *opts; + WT_CONNECTION *conn; + WT_CURSOR *cursor; + WT_SESSION *session; + char *key; + int ret; + size_t i; + + opts = wtperf->opts; + conn = wtperf->conn; + + if ((ret = conn->open_session( + conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, "worker: WT_CONNECTION.open_session"); + goto err; + } + for (i = 0; i < opts->table_count; i++) { + if ((ret = session->open_cursor(session, + wtperf->uris[i], NULL, NULL, &cursor)) != 0) { + lprintf(wtperf, ret, 0, + "worker: WT_SESSION.open_cursor: %s", + wtperf->uris[i]); + goto err; + } + while (cursor->next(cursor) == 0) + if ((ret = cursor->get_key(cursor, &key)) != 0) + goto err; + if ((ret = cursor->close(cursor)) != 0) + goto err; + } + if ((ret = session->close(session, NULL)) != 0) + goto err; + if (ret != 0) +err: lprintf(wtperf, ret, 0, "Pre-workload traverse error"); + return (ret); +} + static void * worker(void *arg) { struct timespec start, stop; CONFIG_OPTS *opts; TRACK *trk; + WORKLOAD *workload; 
WTPERF *wtperf; WTPERF_THREAD *thread; WT_CONNECTION *conn; @@ -495,13 +537,14 @@ worker(void *arg) char buf[512]; thread = (WTPERF_THREAD *)arg; + workload = thread->workload; wtperf = thread->wtperf; opts = wtperf->opts; conn = wtperf->conn; cursors = NULL; - log_table_cursor = NULL; /* -Wconditional-initialized */ + cursor = log_table_cursor = NULL; /* -Wconditional-initialized */ ops = 0; - ops_per_txn = thread->workload->ops_per_txn; + ops_per_txn = workload->ops_per_txn; session = NULL; trk = NULL; @@ -510,7 +553,6 @@ worker(void *arg) lprintf(wtperf, ret, 0, "worker: WT_CONNECTION.open_session"); goto err; } - cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *)); for (i = 0; i < opts->table_count_idle; i++) { snprintf(buf, 512, "%s_idle%05d", wtperf->uris[0], (int)i); if ((ret = session->open_cursor( @@ -525,14 +567,34 @@ worker(void *arg) goto err; } } - for (i = 0; i < opts->table_count; i++) { + if (workload->table_index != INT32_MAX) { if ((ret = session->open_cursor(session, - wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) { + wtperf->uris[workload->table_index], + NULL, NULL, &cursor)) != 0) { lprintf(wtperf, ret, 0, "worker: WT_SESSION.open_cursor: %s", - wtperf->uris[i]); + wtperf->uris[workload->table_index]); + goto err; + } + if ((ret = session->open_cursor(session, + wtperf->uris[workload->table_index], + NULL, "next_random=true", &thread->rand_cursor)) != 0) { + lprintf(wtperf, ret, 0, + "worker: WT_SESSION.open_cursor: random %s", + wtperf->uris[workload->table_index]); goto err; } + } else { + cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *)); + for (i = 0; i < opts->table_count; i++) { + if ((ret = session->open_cursor(session, + wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) { + lprintf(wtperf, ret, 0, + "worker: WT_SESSION.open_cursor: %s", + wtperf->uris[i]); + goto err; + } + } } if (opts->log_like_table && (ret = session->open_cursor(session, wtperf->log_table_uri, NULL, NULL, &log_table_cursor)) != 0) { @@ -543,19 
+605,19 @@ worker(void *arg) } /* Setup the timer for throttling. */ - if (thread->workload->throttle != 0) + if (workload->throttle != 0) setup_throttle(thread); /* Setup for truncate */ - if (thread->workload->truncate != 0) + if (workload->truncate != 0) if ((ret = setup_truncate(wtperf, thread, session)) != 0) goto err; key_buf = thread->key_buf; value_buf = thread->value_buf; - op = thread->workload->ops; - op_end = op + sizeof(thread->workload->ops); + op = workload->ops; + op_end = op + sizeof(workload->ops); if ((ops_per_txn != 0 || opts->log_like_table) && (ret = session->begin_transaction(session, NULL)) != 0) { @@ -564,6 +626,8 @@ worker(void *arg) } while (!wtperf->stop) { + if (workload->pause != 0) + (void)sleep((unsigned int)workload->pause); /* * Generate the next key and setup operation specific * statistics tracking objects. @@ -603,10 +667,12 @@ worker(void *arg) generate_key(opts, key_buf, next_val); - /* - * Spread the data out around the multiple databases. - */ - cursor = cursors[map_key_to_table(wtperf->opts, next_val)]; + if (workload->table_index == INT32_MAX) + /* + * Spread the data out around the multiple databases. + */ + cursor = cursors[ + map_key_to_table(wtperf->opts, next_val)]; /* * Skip the first time we do an operation, when trk->ops @@ -642,7 +708,8 @@ worker(void *arg) * for several operations, confirming that the * next key is in the correct order. 
*/ - ret = do_range_reads(wtperf, cursor); + ret = do_range_reads(wtperf, + cursor, workload->read_range); } if (ret == 0 || ret == WT_NOTFOUND) @@ -689,7 +756,7 @@ worker(void *arg) */ strncpy(value_buf, value, opts->value_sz_max - 1); - if (thread->workload->update_delta != 0) + if (workload->update_delta != 0) update_value_delta(thread); if (value_buf[0] == 'a') value_buf[0] = 'b'; @@ -806,7 +873,7 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { /* Schedule the next operation */ if (++op == op_end) - op = thread->workload->ops; + op = workload->ops; /* * Decrement throttle ops and check if we should sleep @@ -843,7 +910,7 @@ run_mix_schedule_op(WORKLOAD *workp, int op, int64_t op_cnt) uint8_t *p, *end; /* Jump around the array to roughly spread out the operations. */ - jump = 100 / op_cnt; + jump = (int)(100 / op_cnt); /* * Find a read operation and replace it with another operation. This @@ -884,17 +951,6 @@ run_mix_schedule(WTPERF *wtperf, WORKLOAD *workp) opts = wtperf->opts; - /* Confirm reads, inserts, truncates and updates cannot all be zero. */ - if (workp->insert == 0 && workp->read == 0 && - workp->truncate == 0 && workp->update == 0) { - lprintf(wtperf, EINVAL, 0, "no operations scheduled"); - return (EINVAL); - } - - /* - * Handle truncate first - it's a special case that can't be used in - * a mixed workload. - */ if (workp->truncate != 0) { if (workp->insert != 0 || workp->read != 0 || workp->update != 0) { @@ -906,6 +962,12 @@ run_mix_schedule(WTPERF *wtperf, WORKLOAD *workp) return (0); } + /* Confirm reads, inserts and updates cannot all be zero. 
*/ + if (workp->insert == 0 && workp->read == 0 && workp->update == 0) { + lprintf(wtperf, EINVAL, 0, "no operations scheduled"); + return (EINVAL); + } + /* * Check for a simple case where the thread is only doing insert or * update operations (because the default operation for a @@ -2244,6 +2306,8 @@ start_run(WTPERF *wtperf) opts->checkpoint_threads, checkpoint_worker) != 0) goto err; } + if (opts->pre_load_data && (ret = pre_load_data(wtperf)) != 0) + goto err; /* Execute the workload. */ if ((ret = execute_workload(wtperf)) != 0) goto err; @@ -2827,13 +2891,42 @@ static uint64_t wtperf_rand(WTPERF_THREAD *thread) { CONFIG_OPTS *opts; + WT_CURSOR *rnd_cursor; WTPERF *wtperf; double S1, S2, U; uint64_t rval; + int ret; + char *key_buf; wtperf = thread->wtperf; opts = wtperf->opts; + /* + * If we have a random cursor set up then use it. + */ + if ((rnd_cursor = thread->rand_cursor) != NULL) { + if ((ret = rnd_cursor->next(rnd_cursor))) { + lprintf(wtperf, ret, 0, "worker: rand next failed"); + /* 0 is outside the expected range. */ + return (0); + } + if ((ret = rnd_cursor->get_key(rnd_cursor, &key_buf)) != 0) { + lprintf(wtperf, ret, 0, + "worker: rand next key retrieval"); + return (0); + } + /* + * Resetting the cursor is not fatal. We still return the + * value we retrieved above. We do it so that we don't + * leave a cursor positioned. + */ + if ((ret = rnd_cursor->reset(rnd_cursor)) != 0) + lprintf(wtperf, ret, 0, + "worker: rand cursor reset failed"); + extract_key(key_buf, &rval); + return (rval); + } + /* * Use WiredTiger's random number routine: it's lock-free and fairly * good. diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h index 81d74e134f6..db88d0b0271 100644 --- a/bench/wtperf/wtperf.h +++ b/bench/wtperf/wtperf.h @@ -66,6 +66,9 @@ typedef struct { uint64_t throttle; /* Maximum operations/second */ /* Number of operations per transaction. 
Zero for autocommit */ int64_t ops_per_txn; + int64_t pause; /* Time between scans */ + int64_t read_range; /* Range of reads */ + int32_t table_index; /* Table to focus ops on */ int64_t truncate; /* Truncate ratio */ uint64_t truncate_pct; /* Truncate Percent */ uint64_t truncate_count; /* Truncate Count */ @@ -225,6 +228,7 @@ typedef struct { struct __wtperf_thread { /* Per-thread structure */ WTPERF *wtperf; /* Enclosing configuration */ + WT_CURSOR *rand_cursor; /* Random key cursor */ WT_RAND_STATE rnd; /* Random number generation state */ @@ -241,6 +245,7 @@ struct __wtperf_thread { /* Per-thread structure */ TRACK ckpt; /* Checkpoint operations */ TRACK insert; /* Insert operations */ TRACK read; /* Read operations */ + TRACK scan; /* Scan operations */ TRACK update; /* Update operations */ TRACK truncate; /* Truncate operations */ TRACK truncate_sleep; /* Truncate sleep operations */ diff --git a/bench/wtperf/wtperf_opt.i b/bench/wtperf/wtperf_opt.i index 680eb53a90e..63cef4c28fb 100644 --- a/bench/wtperf/wtperf_opt.i +++ b/bench/wtperf/wtperf_opt.i @@ -145,12 +145,13 @@ DEF_OPT_AS_UINT32(populate_ops_per_txn, 0, "phase, zero for auto-commit") DEF_OPT_AS_UINT32(populate_threads, 1, "number of populate threads, 1 for bulk load") +DEF_OPT_AS_BOOL(pre_load_data, 0, + "Scan all data prior to starting the workload phase to warm the cache") DEF_OPT_AS_UINT32(random_range, 0, "if non zero choose a value from within this range as the key for " "insert operations") DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value") DEF_OPT_AS_BOOL(range_partition, 0, "partition data by range (vs hash)") -DEF_OPT_AS_UINT32(read_range, 0, "scan a range of keys after each search") DEF_OPT_AS_BOOL(readonly, 0, "reopen the connection between populate and workload phases in readonly " "mode. Requires reopen_connection turned on (default). 
Requires that " @@ -192,9 +193,10 @@ DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' " "'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' " "which would create 2 threads doing nothing but reads and 8 threads " "each doing 50% inserts and 25% reads and updates. Allowed configuration " - "values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', " - "'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are " - "also behavior modifiers, supported modifiers are 'ops_per_txn'") + "values are 'count', 'throttle', 'update_delta', 'reads', 'read_range', " + "'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. " + "There are also behavior modifiers, supported modifiers are " + "'ops_per_txn'") DEF_OPT_AS_CONFIG_STRING(transaction_config, "", "WT_SESSION.begin_transaction configuration string, applied during the " "populate phase when populate_ops_per_txn is nonzero") diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox index 83aadf8a776..2eac0fef3f4 100644 --- a/src/docs/wtperf.dox +++ b/src/docs/wtperf.dox @@ -195,14 +195,14 @@ use pareto distribution for random numbers. Zero to disable, otherwise a percen number of operations to group into each transaction in the populate phase, zero for auto-commit @par populate_threads (unsigned int, default=1) number of populate threads, 1 for bulk load +@par pre_load_data (boolean, default=false) +Scan all data prior to starting the workload phase to warm the cache @par random_range (unsigned int, default=0) if non zero choose a value from within this range as the key for insert operations @par random_value (boolean, default=false) generate random content for the value @par range_partition (boolean, default=false) partition data by range (vs hash) -@par read_range (unsigned int, default=0) -scan a range of keys after each search @par readonly (boolean, default=false) reopen the connection between populate and workload phases in readonly mode. 
Requires reopen_connection turned on (default). Requires that read be the only workload specified @par reopen_connection (boolean, default=true) @@ -228,7 +228,7 @@ number of tables to run operations over. Keys are divided evenly over the table @par table_count_idle (unsigned int, default=0) number of tables to create, that won't be populated. Default 0. @par threads (string, default="") -workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn' +workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. 
Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'read_range', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn' @par transaction_config (string, default="") WT_SESSION.begin_transaction configuration string, applied during the populate phase when populate_ops_per_txn is nonzero @par table_name (string, default="test") -- cgit v1.2.1 From 17ec908453f8dae29d18cd8ba172360ef0473c8f Mon Sep 17 00:00:00 2001 From: sueloverso Date: Thu, 2 Feb 2017 14:01:31 -0500 Subject: WT-3157 Fix checkpoint error path (#3274) --- src/txn/txn_ckpt.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 7b33b0c7788..90804db3240 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -181,7 +181,7 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[])) { WT_DECL_RET; - u_int i; + u_int i, j; /* If we have already locked the handles, apply the operation. */ for (i = 0; i < session->ckpt_handle_next; ++i) { @@ -189,10 +189,22 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], continue; WT_WITH_DHANDLE(session, session->ckpt_handle[i], ret = (*op)(session, cfg)); - WT_RET(ret); + WT_ERR(ret); } - return (0); +err: + /* + * If we have an error somewhere in processing the handles, then + * we need to mark earlier trees dirty. + */ + if (ret != 0) + for (j = 0; j < i; ++j) { + if (session->ckpt_handle[j] == NULL) + continue; + WT_WITH_DHANDLE(session, session->ckpt_handle[j], + S2BT(session)->modified = true); + } + return (ret); } /* @@ -824,7 +836,7 @@ err: /* * overwritten the checkpoint, so what ends up on disk is not * consistent. 
*/ - if (ret != 0 && !conn->modified) + if (ret != 0) conn->modified = true; session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; @@ -1340,7 +1352,6 @@ __checkpoint_tree( WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_LSN ckptlsn; - int was_modified; bool fake_ckpt; WT_UNUSED(cfg); @@ -1351,7 +1362,6 @@ __checkpoint_tree( conn = S2C(session); dhandle = session->dhandle; fake_ckpt = false; - was_modified = btree->modified; /* * Set the checkpoint LSN to the maximum LSN so that if logging is @@ -1482,10 +1492,9 @@ err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. */ - if (ret != 0 && !btree->modified && was_modified) { + if (ret != 0) { btree->modified = true; - if (!S2C(session)->modified) - S2C(session)->modified = true; + S2C(session)->modified = true; } __wt_meta_ckptlist_free(session, ckptbase); -- cgit v1.2.1 From 009959863f181a07d6c5bb73bcd0e4f1fded7b78 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Fri, 3 Feb 2017 12:57:31 +1100 Subject: WT-3150 Fix: don't spin forever during eviction interrupts. (#3276) --- src/evict/evict_lru.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 3cb513fd87b..a071730d4bd 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -319,7 +319,9 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) * otherwise be busy so don't go to sleep. */ if (was_intr) { - while (cache->pass_intr != 0) + while (cache->pass_intr != 0 && + F_ISSET(conn, WT_CONN_EVICTION_RUN) && + F_ISSET(thread, WT_THREAD_RUN)) __wt_yield(); continue; } -- cgit v1.2.1 From 6df1a46875156202f560d6d173ba0be7afe8ca98 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Fri, 3 Feb 2017 15:45:32 +1100 Subject: WT-3148 Improve efficiency of eviction with many small trees. 
(#3264) --- src/evict/evict_lru.c | 95 ++++++++++++++++++++++++++++++++++++++------------- src/include/btree.i | 22 ++++++++++++ src/include/extern.h | 1 + src/support/rand.c | 12 +++++++ 4 files changed, 106 insertions(+), 24 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index a071730d4bd..2b7b46e19fa 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1559,6 +1559,19 @@ __evict_walk_file(WT_SESSION_IMPL *session, start = queue->evict_queue + *slotp; remaining_slots = max_entries - *slotp; total_slots = max_entries - queue->evict_entries; + btree_inuse = cache_inuse = 0; + target_pages_clean = target_pages_dirty = 0; + + /* + * The number of times we should fill the queue by the end of + * considering all trees. + */ +#define QUEUE_FILLS_PER_PASS 10 + + /* + * The minimum number of pages we should consider per tree. + */ +#define MIN_PAGES_PER_TREE 10 /* * The target number of pages for this tree is proportional to the @@ -1567,13 +1580,12 @@ __evict_walk_file(WT_SESSION_IMPL *session, * cache (and only have to walk it once). */ if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { - btree_inuse = __wt_btree_bytes_inuse(session); + btree_inuse = __wt_btree_bytes_evictable(session); cache_inuse = __wt_cache_bytes_inuse(cache); bytes_per_slot = 1 + cache_inuse / total_slots; target_pages_clean = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); - } else - target_pages_clean = 0; + } if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) { btree_inuse = __wt_btree_dirty_leaf_inuse(session); @@ -1581,35 +1593,58 @@ __evict_walk_file(WT_SESSION_IMPL *session, bytes_per_slot = 1 + cache_inuse / total_slots; target_pages_dirty = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); - } else - target_pages_dirty = 0; + } - target_pages = WT_MAX(target_pages_clean, target_pages_dirty); + /* + * Weight the number of target pages by the number of times we want to + * fill the cache per pass through all the trees. 
Note that we don't + * build this into the calculation above because we don't want to favor + * small trees, so round to a whole number of slots (zero for small + * trees) before multiplying. + */ + target_pages = WT_MAX(target_pages_clean, target_pages_dirty) * + QUEUE_FILLS_PER_PASS; + /* + * Randomly walk trees with a small fraction of the cache in case there + * are so many trees that none of them use enough of the cache to be + * allocated slots. + * + * The chance of walking a tree is equal to the chance that a random + * byte in cache belongs to the tree, weighted by how many times we + * want to fill queues during a pass through all the trees in cache. + */ if (target_pages == 0) { - /* - * Randomly walk trees with a tiny fraction of the cache in - * case there are so many trees that none of them use enough of - * the cache to be allocated slots. Walk small trees 1% of the - * time. - */ - if (__wt_random(&session->rnd) > UINT32_MAX / 100) + if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { + btree_inuse = __wt_btree_bytes_evictable(session); + cache_inuse = __wt_cache_bytes_inuse(cache); + } else { + btree_inuse = __wt_btree_dirty_leaf_inuse(session); + cache_inuse = __wt_cache_dirty_leaf_inuse(cache); + } + if (btree_inuse == 0 || cache_inuse == 0) + return (0); + if (__wt_random64(&session->rnd) % cache_inuse > + btree_inuse * QUEUE_FILLS_PER_PASS) return (0); - target_pages = 10; } + /* + * There is some cost associated with walking a tree. If we're going + * to visit this tree, always look for a minimum number of pages. + */ + if (target_pages < MIN_PAGES_PER_TREE) + target_pages = MIN_PAGES_PER_TREE; + + /* + * If the tree is dead or we're near the end of the queue, fill the + * remaining slots. 
+ */ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || target_pages > remaining_slots) target_pages = remaining_slots; end = start + target_pages; - walk_flags = - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; - - /* Randomize the walk direction. */ - if (btree->evict_walk_reverse) - FLD_SET(walk_flags, WT_READ_PREV); - /* * Examine at least a reasonable number of pages before deciding * whether to give up. When we are only looking for dirty pages, @@ -1620,6 +1655,13 @@ __evict_walk_file(WT_SESSION_IMPL *session, !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) min_pages *= 10; + walk_flags = + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; + + /* Randomize the walk direction. */ + if (btree->evict_walk_reverse) + FLD_SET(walk_flags, WT_READ_PREV); + /* * Get some more eviction candidate pages. * @@ -1752,12 +1794,17 @@ fast: /* If the page can't be evicted, give up. */ session, cache_eviction_pages_queued, (u_int)(evict - start)); /* - * If we didn't find any candidates in the file, reverse the direction - * of the walk and skip it next time. + * If gave up the walk, reverse the direction of the walk and skip it + * next time. */ if (give_up) btree->evict_walk_reverse = !btree->evict_walk_reverse; - if (pages_queued == 0 && !urgent_queued) + + /* + * If we couldn't find the number of pages we were looking for, skip + * the tree next time. + */ + if (pages_queued < target_pages / 2 && !urgent_queued) btree->evict_walk_period = WT_MIN( WT_MAX(1, 2 * btree->evict_walk_period), 100); else if (pages_queued == target_pages) diff --git a/src/include/btree.i b/src/include/btree.i index 09fa8df8c56..1e971fa81c9 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -70,6 +70,28 @@ __wt_btree_bytes_inuse(WT_SESSION_IMPL *session) return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_inmem)); } +/* + * __wt_btree_bytes_evictable -- + * Return the number of bytes that can be evicted (i.e. 
bytes apart from + * the pinned root page). + */ +static inline uint64_t +__wt_btree_bytes_evictable(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + uint64_t bytes_inmem, bytes_root; + + btree = S2BT(session); + cache = S2C(session)->cache; + + bytes_inmem = btree->bytes_inmem; + bytes_root = btree->root.page->memory_footprint; + + return (bytes_inmem <= bytes_root ? 0 : + __wt_cache_bytes_plus_overhead(cache, bytes_inmem - bytes_root)); +} + /* * __wt_btree_dirty_inuse -- * Return the number of dirty bytes in use. diff --git a/src/include/extern.h b/src/include/extern.h index d7d58c58048..863d2a02861 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -688,6 +688,7 @@ extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2) WT_GCC_FUNC_DECL_ATTRIBUT extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern uint64_t __wt_random64(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/support/rand.c b/src/support/rand.c index a5b229b9abc..4fae43edc8e 100644 --- a/src/support/rand.c +++ b/src/support/rand.c @@ -120,3 +120,15 @@ __wt_random(WT_RAND_STATE volatile * rnd_state) return ((z << 16) + (w & 65535)); } + +/* + * __wt_random64 -- + * Return a 64-bit pseudo-random number. + */ +uint64_t +__wt_random64(WT_RAND_STATE volatile * rnd_state) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + return (((uint64_t)__wt_random(rnd_state) << 32) + + __wt_random(rnd_state)); +} -- cgit v1.2.1 From de3424c0bca2d7660acaff17383e05849d164a16 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Fri, 3 Feb 2017 17:00:41 +1100 Subject: WT-3148 Check that we have a root page when calculating evictable size. --- src/include/btree.i | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/include/btree.i b/src/include/btree.i index 1e971fa81c9..378d93dd2ee 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -80,13 +80,15 @@ __wt_btree_bytes_evictable(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; + WT_PAGE *root_page; uint64_t bytes_inmem, bytes_root; btree = S2BT(session); cache = S2C(session)->cache; + root_page = btree->root.page; bytes_inmem = btree->bytes_inmem; - bytes_root = btree->root.page->memory_footprint; + bytes_root = root_page == NULL ? 0 : root_page->memory_footprint; return (bytes_inmem <= bytes_root ? 0 : __wt_cache_bytes_plus_overhead(cache, bytes_inmem - bytes_root)); -- cgit v1.2.1 From b2173f8f063b1528dcd086f00ca8cf072f0445d0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Sun, 5 Feb 2017 19:55:36 -0500 Subject: WT-3111 util_create() doesnt free memory assigned to "uri" (#3279) Always print an error message if a WT_SESSION method fails, we don't know if the WiredTiger library printed out a message or not. 
Free memory allocated by util_uri() in some cases where we either didn't have the necessary free call, or simply returned without freeing memory. Try and be more consistent with error messages, use the leading call as the first string, and any arguments to that call as the second. Replace some of the places we're writing to stderr explicitly with the utility error handlers. Initialize the return variable from util_uri() in all cases. Change error messages that referenced WT_SESSION.open to reference WT_SESSION.open_cursor. --- bench/wtperf/wtperf.c | 2 +- bench/wtperf/wtperf.h | 1 - src/evict/evict_lru.c | 5 ++--- src/utilities/util.h | 2 +- src/utilities/util_alter.c | 9 ++++++--- src/utilities/util_compact.c | 14 +++----------- src/utilities/util_create.c | 12 +++++++----- src/utilities/util_drop.c | 10 ++++++---- src/utilities/util_dump.c | 26 +++++++++++++------------- src/utilities/util_list.c | 21 ++++++++++----------- src/utilities/util_load.c | 2 +- src/utilities/util_load_json.c | 2 +- src/utilities/util_loadtext.c | 13 +++++++++---- src/utilities/util_main.c | 4 ++-- src/utilities/util_printlog.c | 14 +++----------- src/utilities/util_read.c | 19 +++++++++++++------ src/utilities/util_rebalance.c | 30 +++++++++++++----------------- src/utilities/util_rename.c | 15 ++++----------- src/utilities/util_salvage.c | 30 +++++++++++++----------------- src/utilities/util_stat.c | 6 +++--- src/utilities/util_truncate.c | 11 ++++++----- src/utilities/util_upgrade.c | 30 +++++++++++++----------------- src/utilities/util_verify.c | 34 +++++++++++++++------------------- src/utilities/util_write.c | 20 +++++++++++++------- 24 files changed, 158 insertions(+), 174 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 044fd38dc06..7f5e5ad3373 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -2905,7 +2905,7 @@ wtperf_rand(WTPERF_THREAD *thread) * If we have a random cursor set up then use it. 
*/ if ((rnd_cursor = thread->rand_cursor) != NULL) { - if ((ret = rnd_cursor->next(rnd_cursor))) { + if ((ret = rnd_cursor->next(rnd_cursor)) != 0) { lprintf(wtperf, ret, 0, "worker: rand next failed"); /* 0 is outside the expected range. */ return (0); diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h index db88d0b0271..3efb8ab700e 100644 --- a/bench/wtperf/wtperf.h +++ b/bench/wtperf/wtperf.h @@ -245,7 +245,6 @@ struct __wtperf_thread { /* Per-thread structure */ TRACK ckpt; /* Checkpoint operations */ TRACK insert; /* Insert operations */ TRACK read; /* Read operations */ - TRACK scan; /* Scan operations */ TRACK update; /* Update operations */ TRACK truncate; /* Truncate operations */ TRACK truncate_sleep; /* Truncate sleep operations */ diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 2b7b46e19fa..db39a5acdee 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -198,8 +198,7 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref) } __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock); } - WT_ASSERT(session, - !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU)); + WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU)); __wt_spin_unlock(session, &cache->evict_queue_lock); } @@ -1781,7 +1780,7 @@ fast: /* If the page can't be evicted, give up. 
*/ ++pages_queued; if (WT_PAGE_IS_INTERNAL(page)) - ++internal_pages; + ++internal_pages; __wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" WT_SIZET_FMT, diff --git a/src/utilities/util.h b/src/utilities/util.h index cf12d7d4aa6..93a96d44219 100644 --- a/src/utilities/util.h +++ b/src/utilities/util.h @@ -40,7 +40,6 @@ int util_flush(WT_SESSION *, const char *); int util_list(WT_SESSION *, int, char *[]); int util_load(WT_SESSION *, int, char *[]); int util_loadtext(WT_SESSION *, int, char *[]); -char *util_name(WT_SESSION *, const char *, const char *); int util_printlog(WT_SESSION *, int, char *[]); int util_read(WT_SESSION *, int, char *[]); int util_read_line(WT_SESSION *, ULINE *, bool, bool *); @@ -51,5 +50,6 @@ int util_stat(WT_SESSION *, int, char *[]); int util_str2recno(WT_SESSION *, const char *p, uint64_t *recnop); int util_truncate(WT_SESSION *, int, char *[]); int util_upgrade(WT_SESSION *, int, char *[]); +char *util_uri(WT_SESSION *, const char *, const char *); int util_verify(WT_SESSION *, int, char *[]); int util_write(WT_SESSION *, int, char *[]); diff --git a/src/utilities/util_alter.c b/src/utilities/util_alter.c index d228c15cd48..ef01a1ed826 100644 --- a/src/utilities/util_alter.c +++ b/src/utilities/util_alter.c @@ -34,9 +34,12 @@ util_alter(WT_SESSION *session, int argc, char *argv[]) for (configp = argv; configp != NULL && *configp != NULL; configp += 2) if ((ret = session->alter( - session, configp[0], configp[1])) != 0) - break; - return (ret); + session, configp[0], configp[1])) != 0) { + (void)util_err(session, ret, + "session.alter: %s, %s", configp[0], configp[1]); + return (1); + } + return (0); } static int diff --git a/src/utilities/util_compact.c b/src/utilities/util_compact.c index c114eb207fa..e469b4dce6e 100644 --- a/src/utilities/util_compact.c +++ b/src/utilities/util_compact.c @@ -30,21 +30,13 @@ util_compact(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. 
*/ if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->compact(session, uri, NULL)) != 0) { - fprintf(stderr, "%s: compact(%s): %s\n", - progname, uri, session->strerror(session, ret)); - goto err; - } - - if (0) { -err: ret = 1; - } + if ((ret = session->compact(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.compact: %s", uri); free(uri); - return (ret); } diff --git a/src/utilities/util_create.c b/src/utilities/util_create.c index 4e609736f2d..7c22a67792b 100644 --- a/src/utilities/util_create.c +++ b/src/utilities/util_create.c @@ -15,9 +15,9 @@ util_create(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - const char *config, *uri; + char *config, *uri; - config = NULL; + config = uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "c:")) != EOF) switch (ch) { case 'c': /* command-line configuration */ @@ -35,12 +35,14 @@ util_create(WT_SESSION *session, int argc, char *argv[]) if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); if ((ret = session->create(session, uri, config)) != 0) - return (util_err(session, ret, "%s: session.create", uri)); - return (0); + (void)util_err(session, ret, "session.create: %s", uri); + + free(uri); + return (ret); } static int diff --git a/src/utilities/util_drop.c b/src/utilities/util_drop.c index ba41445dfb6..456005d445d 100644 --- a/src/utilities/util_drop.c +++ b/src/utilities/util_drop.c @@ -15,8 +15,9 @@ util_drop(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,12 +31,13 @@ util_drop(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. 
*/ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - ret = session->drop(session, name, "force"); + if ((ret = session->drop(session, uri, "force")) != 0) + (void)util_err(session, ret, "session.drop: %s", uri); - free(name); + free(uri); return (ret); } diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index 3f8b4a49dfe..cded40a8b45 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -37,10 +37,10 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) size_t len; int ch, i; bool hex, json, reverse; - char *checkpoint, *config, *name, *p, *simplename; + char *checkpoint, *config, *p, *simpleuri, *uri; hex = json = reverse = false; - checkpoint = config = name = simplename = NULL; + checkpoint = config = simpleuri = uri = NULL; cursor = NULL; while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF) switch (ch) { @@ -89,11 +89,11 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) if (json && i > 0) if (dump_json_separator(session) != 0) goto err; - free(name); - free(simplename); - name = simplename = NULL; + free(uri); + free(simpleuri); + uri = simpleuri = NULL; - if ((name = util_name(session, argv[i], "table")) == NULL) + if ((uri = util_uri(session, argv[i], "table")) == NULL) goto err; len = @@ -113,19 +113,19 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) (void)strcat(config, json ? "dump=json" : (hex ? 
"dump=hex" : "dump=print")); if ((ret = session->open_cursor( - session, name, NULL, config, &cursor)) != 0) { + session, uri, NULL, config, &cursor)) != 0) { fprintf(stderr, "%s: cursor open(%s) failed: %s\n", - progname, name, session->strerror(session, ret)); + progname, uri, session->strerror(session, ret)); goto err; } - if ((simplename = strdup(name)) == NULL) { + if ((simpleuri = strdup(uri)) == NULL) { (void)util_err(session, errno, NULL); goto err; } - if ((p = strchr(simplename, '(')) != NULL) + if ((p = strchr(simpleuri, '(')) != NULL) *p = '\0'; - if (dump_config(session, simplename, cursor, hex, json) != 0) + if (dump_config(session, simpleuri, cursor, hex, json) != 0) goto err; if (dump_record(cursor, reverse, json) != 0) @@ -148,8 +148,8 @@ err: ret = 1; } free(config); - free(name); - free(simplename); + free(uri); + free(simpleuri); if (cursor != NULL && (ret = cursor->close(cursor)) != 0) { (void)util_err(session, ret, NULL); ret = 1; diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c index e91dbfce05b..f19ba4d1f97 100644 --- a/src/utilities/util_list.c +++ b/src/utilities/util_list.c @@ -19,10 +19,10 @@ util_list(WT_SESSION *session, int argc, char *argv[]) WT_DECL_RET; int ch; bool cflag, vflag; - char *name; + char *uri; cflag = vflag = false; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "cv")) != EOF) switch (ch) { case 'c': @@ -42,17 +42,16 @@ util_list(WT_SESSION *session, int argc, char *argv[]) case 0: break; case 1: - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); break; default: return (usage()); } - ret = list_print(session, name, cflag, vflag); - - free(name); + ret = list_print(session, uri, cflag, vflag); + free(uri); return (ret); } @@ -99,7 +98,7 @@ list_get_allocsize(WT_SESSION *session, const char *key, size_t *allocsize) * List the high-level objects in the database. 
*/ static int -list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) +list_print(WT_SESSION *session, const char *uri, bool cflag, bool vflag) { WT_CURSOR *cursor; WT_DECL_RET; @@ -120,7 +119,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) ret, "%s: WT_SESSION.open_cursor", WT_METADATA_URI)); } - found = name == NULL; + found = uri == NULL; while ((ret = cursor->next(cursor)) == 0) { /* Get the key. */ if ((ret = cursor->get_key(cursor, &key)) != 0) @@ -129,8 +128,8 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) /* * If a name is specified, only show objects that match. */ - if (name != NULL) { - if (!WT_PREFIX_MATCH(key, name)) + if (uri != NULL) { + if (!WT_PREFIX_MATCH(key, uri)) continue; found = true; } @@ -161,7 +160,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) if (ret != WT_NOTFOUND) return (util_cerr(cursor, "next", ret)); if (!found) { - fprintf(stderr, "%s: %s: not found\n", progname, name); + fprintf(stderr, "%s: %s: not found\n", progname, uri); return (1); } diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c index ac18df80851..ca77643eb49 100644 --- a/src/utilities/util_load.c +++ b/src/utilities/util_load.c @@ -126,7 +126,7 @@ load_dump(WT_SESSION *session) append ? ",append" : "", no_overwrite ? ",overwrite=false" : ""); if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { - ret = util_err(session, ret, "%s: session.open", uri); + ret = util_err(session, ret, "%s: session.open_cursor", uri); goto err; } diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c index 020a4ed9ba9..1189d49a483 100644 --- a/src/utilities/util_load_json.c +++ b/src/utilities/util_load_json.c @@ -242,7 +242,7 @@ json_data(WT_SESSION *session, LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? 
",overwrite=false" : ""); if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { - ret = util_err(session, ret, "%s: session.open", uri); + ret = util_err(session, ret, "%s: session.open_cursor", uri); goto err; } keyformat = cursor->key_format; diff --git a/src/utilities/util_loadtext.c b/src/utilities/util_loadtext.c index f9c5b6e9a1f..7602d43f8c9 100644 --- a/src/utilities/util_loadtext.c +++ b/src/utilities/util_loadtext.c @@ -15,9 +15,11 @@ static int usage(void); int util_loadtext(WT_SESSION *session, int argc, char *argv[]) { + WT_DECL_RET; int ch; - const char *uri; + char *uri; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "f:")) != EOF) switch (ch) { case 'f': /* input file */ @@ -35,10 +37,13 @@ util_loadtext(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. */ if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - return (text(session, uri)); + ret = text(session, uri); + + free(uri); + return (ret); } /* @@ -61,7 +66,7 @@ text(WT_SESSION *session, const char *uri) */ if ((ret = session->open_cursor( session, uri, NULL, "append,overwrite", &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + return (util_err(session, ret, "%s: session.open_cursor", uri)); /* * We're about to load strings, make sure the formats match. diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index 001a66d6d9e..7157f0d90fe 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -285,11 +285,11 @@ usage(void) } /* - * util_name -- + * util_uri -- * Build a name. 
*/ char * -util_name(WT_SESSION *session, const char *s, const char *type) +util_uri(WT_SESSION *session, const char *s, const char *type) { size_t len; char *name; diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c index e7fa2134934..5f3ed43905b 100644 --- a/src/utilities/util_printlog.c +++ b/src/utilities/util_printlog.c @@ -14,8 +14,8 @@ int util_printlog(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; - int ch; uint32_t flags; + int ch; flags = 0; while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF) @@ -41,17 +41,9 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) if (argc != 0) return (usage()); - ret = __wt_txn_printlog(session, flags); - - if (ret != 0) { - fprintf(stderr, "%s: printlog failed: %s\n", - progname, session->strerror(session, ret)); - goto err; - } + if ((ret = __wt_txn_printlog(session, flags)) != 0) + (void)util_err(session, ret, "printlog"); - if (0) { -err: ret = 1; - } return (ret); } diff --git a/src/utilities/util_read.c b/src/utilities/util_read.c index 2e766377aa9..393949b6a1c 100644 --- a/src/utilities/util_read.c +++ b/src/utilities/util_read.c @@ -18,8 +18,9 @@ util_read(WT_SESSION *session, int argc, char *argv[]) uint64_t recno; int ch; bool rkey, rval; - const char *uri, *value; + char *uri, *value; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -32,13 +33,19 @@ util_read(WT_SESSION *session, int argc, char *argv[]) /* The remaining arguments are a uri followed by a list of keys. */ if (argc < 2) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - /* Open the object. */ - if ((ret = session->open_cursor( - session, uri, NULL, NULL, &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + /* + * Open the object; free allocated memory immediately to simplify + * future error handling. 
+ */ + if ((ret = + session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0) + (void)util_err(session, ret, "%s: session.open_cursor", uri); + free(uri); + if (ret != 0) + return (ret); /* * A simple search only makes sense if the key format is a string or a diff --git a/src/utilities/util_rebalance.c b/src/utilities/util_rebalance.c index 45f161487e5..c188ea17d22 100644 --- a/src/utilities/util_rebalance.c +++ b/src/utilities/util_rebalance.c @@ -15,9 +15,9 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,25 +30,21 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->rebalance(session, name, NULL)) != 0) { - fprintf(stderr, "%s: rebalance(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->rebalance(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.rebalance: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. */ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c index aee299c6e63..bb2d40cd103 100644 --- a/src/utilities/util_rename.c +++ b/src/utilities/util_rename.c @@ -30,22 +30,15 @@ util_rename(WT_SESSION *session, int argc, char *argv[]) /* The remaining arguments are the object uri and new name. 
*/ if (argc != 2) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); newuri = argv[1]; - if ((ret = session->rename(session, uri, newuri, NULL)) != 0) { - fprintf(stderr, "%s: rename %s to %s: %s\n", - progname, uri, newuri, session->strerror(session, ret)); - goto err; - } - - if (0) { -err: ret = 1; - } + if ((ret = session->rename(session, uri, newuri, NULL)) != 0) + (void)util_err( + session, ret, "session.rename: %s, %s", uri, newuri); free(uri); - return (ret); } diff --git a/src/utilities/util_salvage.c b/src/utilities/util_salvage.c index 679d1074457..6cc2278b846 100644 --- a/src/utilities/util_salvage.c +++ b/src/utilities/util_salvage.c @@ -16,10 +16,10 @@ util_salvage(WT_SESSION *session, int argc, char *argv[]) WT_DECL_RET; int ch; const char *force; - char *name; + char *uri; force = NULL; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "F")) != EOF) switch (ch) { case 'F': @@ -35,25 +35,21 @@ util_salvage(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the file name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "file")) == NULL) + if ((uri = util_uri(session, *argv, "file")) == NULL) return (1); - if ((ret = session->salvage(session, name, force)) != 0) { - fprintf(stderr, "%s: salvage(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->salvage(session, uri, force)) != 0) + (void)util_err(session, ret, "session.salvage: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. 
*/ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/utilities/util_stat.c b/src/utilities/util_stat.c index 4376f559ceb..1b75d9ea8bf 100644 --- a/src/utilities/util_stat.c +++ b/src/utilities/util_stat.c @@ -55,7 +55,7 @@ util_stat(WT_SESSION *session, int argc, char *argv[]) objname = (char *)""; break; case 1: - if ((objname = util_name(session, *argv, "table")) == NULL) + if ((objname = util_uri(session, *argv, "table")) == NULL) return (1); objname_free = true; break; @@ -82,8 +82,8 @@ util_stat(WT_SESSION *session, int argc, char *argv[]) (ret = cursor->next(cursor)) == 0 && (ret = cursor->get_value(cursor, &desc, &pval, NULL)) == 0) if (printf("%s=%s\n", desc, pval) < 0) { - ret = errno; - break; + (void)util_err(session, errno, "printf"); + goto err; } if (ret == WT_NOTFOUND) ret = 0; diff --git a/src/utilities/util_truncate.c b/src/utilities/util_truncate.c index 9325c0d7e84..35de02345c8 100644 --- a/src/utilities/util_truncate.c +++ b/src/utilities/util_truncate.c @@ -15,8 +15,9 @@ util_truncate(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,13 +31,13 @@ util_truncate(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. 
*/ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->truncate(session, name, NULL, NULL, NULL)) != 0) - return (util_err(session, ret, "%s: session.truncate", name)); + if ((ret = session->truncate(session, uri, NULL, NULL, NULL)) != 0) + (void)util_err(session, ret, "session.truncate: %s", uri); - free(name); + free(uri); return (ret); } diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c index 63b23f28c16..f89bd46e133 100644 --- a/src/utilities/util_upgrade.c +++ b/src/utilities/util_upgrade.c @@ -15,9 +15,9 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,25 +30,21 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->upgrade(session, name, NULL)) != 0) { - fprintf(stderr, "%s: upgrade(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->upgrade(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.upgrade: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. 
*/ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c index 82bdd780cd3..d0587fcfc8c 100644 --- a/src/utilities/util_verify.c +++ b/src/utilities/util_verify.c @@ -17,10 +17,10 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) size_t size; int ch; bool dump_address, dump_blocks, dump_layout, dump_pages; - char *config, *dump_offsets, *name; + char *config, *dump_offsets, *uri; dump_address = dump_blocks = dump_layout = dump_pages = false; - config = dump_offsets = name = NULL; + config = dump_offsets = uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "d:")) != EOF) switch (ch) { case 'd': @@ -55,7 +55,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); /* Build the configuration string as necessary. */ @@ -69,7 +69,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) strlen("dump_offsets[],") + (dump_offsets == NULL ? 0 : strlen(dump_offsets)) + 20; if ((config = malloc(size)) == NULL) { - (void)util_err(session, errno, NULL); + ret = util_err(session, errno, NULL); goto err; } snprintf(config, size, @@ -82,23 +82,19 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) dump_offsets != NULL ? "]," : "", dump_pages ? "dump_pages," : ""); } - if ((ret = session->verify(session, name, config)) != 0) { - fprintf(stderr, "%s: verify(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->verify(session, uri, config)) != 0) + (void)util_err(session, ret, "session.verify: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. 
+ */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. */ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(config); - free(name); - +err: free(config); + free(uri); return (ret); } diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c index 7d9bce02b36..b931fad064d 100644 --- a/src/utilities/util_write.c +++ b/src/utilities/util_write.c @@ -18,10 +18,10 @@ util_write(WT_SESSION *session, int argc, char *argv[]) uint64_t recno; int ch; bool append, overwrite, rkey; - const char *uri; - char config[100]; + char *uri, config[100]; append = overwrite = false; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "ao")) != EOF) switch (ch) { case 'a': @@ -47,15 +47,21 @@ util_write(WT_SESSION *session, int argc, char *argv[]) } else if (argc < 3 || ((argc - 1) % 2 != 0)) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - /* Open the object. */ + /* + * Open the object; free allocated memory immediately to simplify + * future error handling. + */ (void)snprintf(config, sizeof(config), "%s,%s", append ? "append=true" : "", overwrite ? "overwrite=true" : ""); - if ((ret = session->open_cursor( - session, uri, NULL, config, &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + if ((ret = + session->open_cursor(session, uri, NULL, config, &cursor)) != 0) + (void)util_err(session, ret, "%s: session.open_cursor", uri); + free(uri); + if (ret != 0) + return (ret); /* * A simple search only makes sense if the key format is a string or a -- cgit v1.2.1 From 2185e4206c238389665fa024c3f891160942c04d Mon Sep 17 00:00:00 2001 From: sueloverso Date: Mon, 6 Feb 2017 11:29:25 -0500 Subject: WT-3157 More aggressive error handling. (#3275) * More aggressive error handling. * Alternative checkpoint cleanup. 
(#3281) --- src/txn/txn_ckpt.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 90804db3240..59dcc23acc5 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -181,7 +181,7 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[])) { WT_DECL_RET; - u_int i, j; + u_int i; /* If we have already locked the handles, apply the operation. */ for (i = 0; i < session->ckpt_handle_next; ++i) { @@ -189,22 +189,10 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], continue; WT_WITH_DHANDLE(session, session->ckpt_handle[i], ret = (*op)(session, cfg)); - WT_ERR(ret); + WT_RET(ret); } -err: - /* - * If we have an error somewhere in processing the handles, then - * we need to mark earlier trees dirty. - */ - if (ret != 0) - for (j = 0; j < i; ++j) { - if (session->ckpt_handle[j] == NULL) - continue; - WT_WITH_DHANDLE(session, session->ckpt_handle[j], - S2BT(session)->modified = true); - } - return (ret); + return (0); } /* @@ -555,7 +543,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) void *saved_meta_next; u_int i; uint64_t fsync_duration_usecs; - bool full, idle, logging, tracking; + bool failed, full, idle, logging, tracking; const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; @@ -836,12 +824,13 @@ err: /* * overwritten the checkpoint, so what ends up on disk is not * consistent. 
*/ - if (ret != 0) + failed = ret != 0; + if (failed) conn->modified = true; session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; if (tracking) - WT_TRET(__wt_meta_track_off(session, false, ret != 0)); + WT_TRET(__wt_meta_track_off(session, false, failed)); cache->eviction_scrub_limit = 0.0; WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); @@ -874,6 +863,13 @@ err: /* for (i = 0; i < session->ckpt_handle_next; ++i) { if (session->ckpt_handle[i] == NULL) continue; + /* + * If the operation failed, mark all trees dirty so they are + * included if a future checkpoint can succeed. + */ + if (failed) + WT_WITH_DHANDLE(session, session->ckpt_handle[i], + S2BT(session)->modified = true); WT_WITH_DHANDLE(session, session->ckpt_handle[i], WT_TRET(__wt_session_release_btree(session))); } -- cgit v1.2.1 From 2a59c1fd79ff98b89046404ccb756114d74fa5f4 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Wed, 8 Feb 2017 01:53:18 -0500 Subject: WT-3161 Panic on a write error in logging. (#3278) It is not possible to continue without risking data loss. 
--- src/log/log.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/log/log.c b/src/log/log.c index 1482cc0aca1..b07ef8c1bd5 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -62,6 +62,8 @@ static int __log_fs_write(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf) { + WT_DECL_RET; + /* * If we're writing into a new log file, we have to wait for all * writes to the previous log file to complete otherwise there could @@ -71,7 +73,10 @@ __log_fs_write(WT_SESSION_IMPL *session, __log_wait_for_earlier_slot(session, slot); WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn)); } - return (__wt_write(session, slot->slot_fh, offset, len, buf)); + if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0) + WT_PANIC_MSG(session, ret, + "%s: fatal log failure", slot->slot_fh->name); + return (ret); } /* -- cgit v1.2.1 From 15b7658a380e374e627b86e7629c8fad3ef349dc Mon Sep 17 00:00:00 2001 From: sueloverso Date: Wed, 8 Feb 2017 23:25:22 -0500 Subject: WT-3164 Ensure all relevant btree fields are reset on checkpoint error. (#3283) --- src/txn/txn_ckpt.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 59dcc23acc5..5932e058552 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -524,6 +524,17 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, #endif } +/* + * __checkpoint_fail_reset -- + * Reset fields when a failure occurs. + */ +static void +__checkpoint_fail_reset(WT_SESSION_IMPL *session) +{ + S2BT(session)->modified = true; + S2BT(session)->ckpt = NULL; +} + /* * __txn_checkpoint -- * Checkpoint a database or a list of objects in the database. 
@@ -869,7 +880,7 @@ err: /* */ if (failed) WT_WITH_DHANDLE(session, session->ckpt_handle[i], - S2BT(session)->modified = true); + __checkpoint_fail_reset(session)); WT_WITH_DHANDLE(session, session->ckpt_handle[i], WT_TRET(__wt_session_release_btree(session))); } -- cgit v1.2.1 From 0b9e4534b2e01a7bf3dec00c91d6f38dfbcc0dd0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 9 Feb 2017 09:15:15 -0500 Subject: WT-3088 bug: WiredTiger can evict the tree's current eviction walk point (#3280) WT-3088 bug: WiredTiger can evict the tree's current eviction walk point --- src/btree/bt_debug.c | 2 -- src/btree/bt_split.c | 74 +++++++++++++++++++++++++++------------------------- src/include/btmem.h | 8 +++--- src/include/btree.i | 4 +-- src/include/extern.h | 1 + 5 files changed, 47 insertions(+), 42 deletions(-) diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index a89eca230fd..d664da2ebd3 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -699,8 +699,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) WT_RET(ds->f(ds, ", evict-lru")); if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) WT_RET(ds->f(ds, ", overflow-keys")); - if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) - WT_RET(ds->f(ds, ", split-block")); if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) WT_RET(ds->f(ds, ", split-insert")); if (F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE)) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 7cfcd08f931..8122d242666 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -53,6 +53,16 @@ __split_oldest_gen(WT_SESSION_IMPL *session) return (oldest); } +/* + * __wt_split_obsolete -- + * Check if it is safe to free / evict based on split generation. + */ +bool +__wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) +{ + return (split_gen < __split_oldest_gen(session)); +} + /* * __split_stash_add -- * Add a new entry into the session's split stash list. 
@@ -394,8 +404,8 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, * Prepare a set of WT_REFs for a move. */ static void -__split_ref_step1( - WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) +__split_ref_step1(WT_SESSION_IMPL *session, + WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first) { WT_PAGE *child; WT_REF *child_ref, *ref; @@ -418,30 +428,25 @@ __split_ref_step1( child = ref->page; /* - * Block eviction and splits in newly created pages. + * Block eviction in newly created pages. * * Once the split is live, newly created internal pages might be * evicted and their WT_REF structures freed. If that happened * before all threads exit the index of the page that previously * "owned" the WT_REF, a thread might see a freed WT_REF. To - * ensure that doesn't happen, the newly created page's modify - * structure has a field with a transaction ID that's checked - * before any internal page is evicted. Unfortunately, we don't - * know the correct value until we update the original page's - * index (we need a transaction ID from after that update), but - * the act of updating the original page's index is what allows - * the eviction to happen. + * ensure that doesn't happen, the newly created page contains + * the current split generation and can't be evicted until + * all readers have left the old generation. * - * Split blocking was because historic versions of the split - * code didn't update the WT_REF.home field until after the - * split was live, so the WT_REF.home fields being updated could - * split again before the update, there's a race between splits - * as to which would update them first. The current code updates - * the WT_REF.home fields before going live (in this function), - * this shouldn't be an issue, but for now splits remain turned - * off. 
+ * Historic, we also blocked splits in newly created pages + * because we didn't update the WT_REF.home field until after + * the split was live, so the WT_REF.home fields being updated + * could split again before the update, there's a race between + * splits as to which would update them first. The current code + * updates the WT_REF.home fields before going live (in this + * function), this isn't an issue. */ - F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + child->pg_intl_split_gen = split_gen; /* * We use a page flag to prevent the child from splitting from @@ -473,7 +478,6 @@ __split_ref_step2( WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { WT_DECL_RET; - WT_PAGE *child; WT_REF *ref; uint32_t i; @@ -503,14 +507,9 @@ __split_ref_step2( continue; WT_ERR(ret); - child = ref->page; - - /* The child can now be evicted or split. */ - F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); - #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, child)); + __split_verify_intl_key_order(session, ref->page)); #endif WT_ERR(__wt_hazard_clear(session, ref)); @@ -653,8 +652,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; + /* Get a generation for this split, mark the root page. */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + root->pg_intl_split_gen = split_gen; + /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, false); + __split_ref_step1(session, alloc_index, split_gen, false); /* * Confirm the root page's index hasn't moved, then update it, which @@ -686,7 +689,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * fails, we don't roll back that change, because threads may already * be using the new index. 
*/ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); root_decr += size; @@ -838,6 +840,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; + /* Get a generation for this split, mark the parent page. */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + parent->pg_intl_split_gen = split_gen; + /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -908,7 +914,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * * Acquire a new split generation. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) { next_ref = pindex->index[deleted_refs[i]]; WT_ASSERT(session, next_ref->state == WT_REF_SPLIT); @@ -1160,8 +1165,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; + /* Get a generation for this split, mark the page. */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + page->pg_intl_split_gen = split_gen; + /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, true); + __split_ref_step1(session, alloc_index, split_gen, true); /* Split into the parent. */ WT_ERR(__split_parent(session, page_ref, alloc_index->index, @@ -1207,7 +1216,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) * back that change, because threads may already be using the new parent * page. 
*/ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); page_decr += size; @@ -1284,10 +1292,6 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, for (;;) { parent = ref->home; - /* Skip pages that aren't ready to split. */ - if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK)) - return (EBUSY); - if (trylock) WT_RET(__wt_try_writelock(session, &parent->page_lock)); else diff --git a/src/include/btmem.h b/src/include/btmem.h index 43c1a309d52..39ca223aebf 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -483,6 +483,7 @@ struct __wt_page { */ struct { WT_REF *parent_ref; /* Parent reference */ + uint64_t split_gen; /* Generation of last split */ struct __wt_page_index { uint32_t entries; @@ -492,6 +493,8 @@ struct __wt_page { } intl; #undef pg_intl_parent_ref #define pg_intl_parent_ref u.intl.parent_ref +#undef pg_intl_split_gen +#define pg_intl_split_gen u.intl.split_gen /* * Macros to copy/set the index because the name is obscured to ensure @@ -593,9 +596,8 @@ struct __wt_page { #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ #define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */ -#define WT_PAGE_SPLIT_BLOCK 0x20 /* Split blocking eviction and splits */ -#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ -#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */ +#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ +#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ uint8_t unused[2]; /* Unused padding */ diff --git a/src/include/btree.i b/src/include/btree.i index 378d93dd2ee..315efa86fa6 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1348,8 
+1348,8 @@ __wt_page_can_evict( * discards its WT_REF array, and a thread traversing the original * parent page index might see a freed WT_REF. */ - if (WT_PAGE_IS_INTERNAL(page) && - F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) + if (WT_PAGE_IS_INTERNAL(page) && !__wt_split_obsolete( + session, page->pg_intl_split_gen)) return (false); /* diff --git a/src/include/extern.h b/src/include/extern.h index 863d2a02861..836a7cb1ae6 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -160,6 +160,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_split_stash_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -- cgit v1.2.1 From 722b9d1b3da5dbfc4703f41855ae219df3fc6f57 Mon Sep 17 00:00:00 2001 From: Mark Benvenuto Date: Sat, 11 Feb 2017 08:14:52 -0500 Subject: WT-3173 Add runtime detection for s390x CRC32 hardware support (#3290) --- 
src/checksum/power8/crc32_wrapper.c | 4 ++-- src/checksum/zseries/crc32-s390x.c | 26 ++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/checksum/power8/crc32_wrapper.c b/src/checksum/power8/crc32_wrapper.c index ddfa2bdaeb8..a9be9ced1c6 100644 --- a/src/checksum/power8/crc32_wrapper.c +++ b/src/checksum/power8/crc32_wrapper.c @@ -1,4 +1,6 @@ #if defined(__powerpc64__) +#include "wt_internal.h" + #define CRC_TABLE #include "crc32_constants.h" @@ -68,8 +70,6 @@ out: } #endif -#include "wt_internal.h" - /* * __wt_checksum_hw -- * WiredTiger: return a checksum for a chunk of memory. diff --git a/src/checksum/zseries/crc32-s390x.c b/src/checksum/zseries/crc32-s390x.c index f77d6768d42..28b46594220 100644 --- a/src/checksum/zseries/crc32-s390x.c +++ b/src/checksum/zseries/crc32-s390x.c @@ -6,8 +6,20 @@ * Author(s): Hendrik Brueckner * */ +#include "wt_internal.h" + #include <sys/types.h> #include <endian.h> + +#if defined(HAVE_CRC32_HARDWARE) + +#include <sys/auxv.h> + +/* RHEL 7 has kernel support, but does not define this constant in the lib c headers. */ +#ifndef HWCAP_S390_VX +#define HWCAP_S390_VX 2048 +#endif + #include "crc32-s390x.h" #include "slicing-consts.h" @@ -69,8 +81,6 @@ unsigned int __wt_crc32c_le(unsigned int crc, const unsigned char *buf, size_t l /* Main CRC-32 functions */ DEFINE_CRC32_VX(__wt_crc32c_le_vx, __wt_crc32c_le_vgfm_16, __wt_crc32c_le) -#include "wt_internal.h" - /* * __wt_checksum_hw -- * WiredTiger: return a checksum for a chunk of memory. @@ -81,6 +91,8 @@ __wt_checksum_hw(const void *chunk, size_t len) return (~__wt_crc32c_le_vx(0xffffffff, chunk, len)); } +#endif + /* * __wt_checksum_init -- * WiredTiger: detect CRC hardware and set the checksum function.
@@ -89,8 +101,14 @@ void __wt_checksum_init(void) { #if defined(HAVE_CRC32_HARDWARE) - __wt_process.checksum = __wt_checksum_hw; -#else + unsigned long caps = getauxval(AT_HWCAP); + + if (caps & HWCAP_S390_VX) + __wt_process.checksum = __wt_checksum_hw; + else + __wt_process.checksum = __wt_checksum_sw; + +#else /* !HAVE_CRC32_HARDWARE */ __wt_process.checksum = __wt_checksum_sw; #endif } -- cgit v1.2.1 From 7f5d0f9981214c723f2ed90cf4533887ed406176 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 13 Feb 2017 10:49:24 +1100 Subject: WT-3170 Change when eviction walk point is saved, cleanup splits. (#3284) * Change how eviction walk point is saved during walk. * After 0b9e453, we no longer need to do any non-DIAGNOSTIC work after completing the split (previously, we had changes to make the newly created split pages evictable, but now they are initially given a generation number which will prevent their eviction until it's OK). Rename __split_ref_step2() to be __split_verify_intl(), and change it to verify all of the internal pages involved in the split. Previously, we only verified the pages we had to read and update anyway. Now we don't have to update any pages and we're only reading pages in DIAGNOSTIC mode, verify all of them. Don't release the hazard pointer explicitly, use the more standard __wt_page_release() call (it should make no difference, it's just a bit more consistent). Rename __split_ref_step1() to be __split_ref_prepare(), there's no longer a step #2. * We don't need to publish WT_BTREE.evict_ref, or use a barrier: in one we're guaranteed that only the writing thread will check the assertion in the discard code (that we're not discarding the eviction's reference), and in the other case we're doing hazard-pointer coupling, which implies there is a barrier in the code path before the page can possibly be discarded by any thread. * Review barriers use in splits. (#3288). 
In all cases, use the pattern "Update the page index, which includes a barrier to make the split live, switch to benign error mode, then verify the pages involved in the split are correct." --- src/btree/bt_split.c | 164 ++++++++++++++++++++-------------------------- src/btree/bt_walk.c | 4 +- src/evict/evict_lru.c | 49 +++++++++----- src/include/session.h | 2 - src/session/session_api.c | 5 -- 5 files changed, 103 insertions(+), 121 deletions(-) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 8122d242666..fcb14be7c76 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -197,7 +197,7 @@ __split_safe_free(WT_SESSION_IMPL *session, #ifdef HAVE_DIAGNOSTIC /* * __split_verify_intl_key_order -- - * Verify the key order on an internal page after a split, diagnostic only. + * Verify the key order on an internal page after a split. */ static void __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) @@ -249,6 +249,42 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) break; } } + +/* + * __split_verify_intl -- + * Verify a set of internal pages involved in a split. + */ +static int +__split_verify_intl(WT_SESSION_IMPL *session, + WT_PAGE *page1, WT_PAGE *page2, WT_PAGE *pindex_page, bool skip_first) +{ + WT_DECL_RET; + WT_REF *ref; + + /* The split is complete and live, verify all of the pages involved. */ + if (page1 != NULL) + __split_verify_intl_key_order(session, page1); + if (page2 != NULL) + __split_verify_intl_key_order(session, page2); + + /* Skip the first slot on non-root internal pages, it's not set. */ + WT_INTL_FOREACH_BEGIN(session, pindex_page, ref) { + if (skip_first) { + skip_first = false; + continue; + } + WT_ERR(__wt_page_in(session, ref, WT_READ_NO_EVICT)); + + __split_verify_intl_key_order(session, ref->page); + + WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT)); + } WT_INTL_FOREACH_END; + + return (0); + +err: /* Something really bad just happened. 
*/ + WT_PANIC_RET(session, ret, "fatal error during page split"); +} #endif /* @@ -400,11 +436,11 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, } /* - * __split_ref_step1 -- + * __split_ref_prepare -- * Prepare a set of WT_REFs for a move. */ static void -__split_ref_step1(WT_SESSION_IMPL *session, +__split_ref_prepare(WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first) { WT_PAGE *child; @@ -469,58 +505,6 @@ __split_ref_step1(WT_SESSION_IMPL *session, } } -/* - * __split_ref_step2 -- - * Allow the newly created children to be evicted or split. - */ -static int -__split_ref_step2( - WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) -{ - WT_DECL_RET; - WT_REF *ref; - uint32_t i; - - /* - * The split has gone live, enable eviction and splits on the newly - * created internal pages. - */ - WT_WRITE_BARRIER(); - - for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { - ref = pindex->index[i]; - - /* - * We don't hold hazard pointers on created pages, they cannot - * be evicted because the page-modify transaction value set as - * they were created prevents eviction. (See above, we reset - * that value as part of fixing up the page.) But, an eviction - * thread might be attempting to evict the page (the WT_REF may - * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF - * may be WT_REF_READING), or it may be in some other state. - * Acquire a hazard pointer for any in-memory pages so we know - * the state of the page. Ignore pages not in-memory (deleted, - * on-disk, being read), there's no in-memory structure to fix. - */ - if ((ret = __wt_page_in(session, - ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) - continue; - WT_ERR(ret); - -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, ref->page)); -#endif - - WT_ERR(__wt_hazard_clear(session, ref)); - } - - return (0); - -err: /* Something really bad just happened. 
*/ - WT_PANIC_RET(session, ret, "fatal error resolving a split"); -} - /* * __split_root -- * Split the root page in-memory, deepening the tree. @@ -657,7 +641,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) root->pg_intl_split_gen = split_gen; /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, split_gen, false); + __split_ref_prepare(session, alloc_index, split_gen, false); /* * Confirm the root page's index hasn't moved, then update it, which @@ -665,19 +649,16 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); WT_INTL_INDEX_SET(root, alloc_index); - -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, root)); -#endif - /* Finalize the WT_REFs we moved. */ - WT_ERR(__split_ref_step2(session, alloc_index, false)); + alloc_index = NULL; /* The split is complete and correct, ignore benign errors. */ complete = WT_ERR_IGNORE; - /* We've installed the allocated page-index, ensure error handling. */ - alloc_index = NULL; +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + ret = __split_verify_intl(session, root, NULL, root, false)); + WT_ERR(ret); +#endif /* * We can't free the previous root's index, there may be threads using @@ -852,11 +833,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_INTL_INDEX_SET(parent, alloc_index); alloc_index = NULL; -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, parent)); -#endif - /* * If discarding the page's original WT_REF field, reset it to split. * Threads cursoring through the tree were blocked because that WT_REF @@ -875,18 +851,27 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, __wt_free(session, ref->page_del); } + /* + * Set the discarded WT_REF state to split, ensuring we don't + * race with any discard of the WT_REF deleted fields. 
+ */ WT_PUBLISH(ref->state, WT_REF_SPLIT); - } - /* - * Push out the changes: not required for correctness, but don't let - * threads spin on incorrect page references longer than necessary. - */ - WT_FULL_BARRIER(); + /* + * Push out the change: not required for correctness, but stops + * threads spinning on incorrect page references. + */ + WT_FULL_BARRIER(); + } /* The split is complete and correct, ignore benign errors. */ complete = WT_ERR_IGNORE; +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, parent)); +#endif + /* * !!! * Swapping in the new page index released the page for eviction, we can @@ -1170,34 +1155,27 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) page->pg_intl_split_gen = split_gen; /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, split_gen, true); + __split_ref_prepare(session, alloc_index, split_gen, true); /* Split into the parent. */ WT_ERR(__split_parent(session, page_ref, alloc_index->index, alloc_index->entries, parent_incr, false, false)); - /* Confirm the page's index hasn't moved, then update it. */ + /* + * Confirm the page's index hasn't moved, then update it, which makes + * the split visible to threads descending the tree. + */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); WT_INTL_INDEX_SET(page, replace_index); -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, parent)); - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, page)); -#endif - - /* Finalize the WT_REFs we moved. */ - WT_ERR(__split_ref_step2(session, alloc_index, true)); - /* The split is complete and correct, ignore benign errors. */ complete = WT_ERR_IGNORE; - /* - * Push out the changes: not required for correctness, but no reason - * to wait. 
- */ - WT_FULL_BARRIER(); +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + ret = __split_verify_intl(session, parent, page, page, true)); + WT_ERR(ret); +#endif /* * We don't care about the page-index we allocated, all we needed was diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 049700952ee..ddaa2e5f70b 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -340,9 +340,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session, * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * - * We may be passed a pointer to btree->evict_page that we are clearing - * here. We check when discarding pages that we're not discarding that - * page, so this clear must be done before the page is released. + * Clear the returned value, it makes future error handling easier. */ couple = couple_orig = ref = *refp; *refp = NULL; diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index db39a5acdee..efe056aee02 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -756,7 +756,7 @@ __evict_pass(WT_SESSION_IMPL *session) * Clear a single walk point. */ static int -__evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat) +__evict_clear_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -773,14 +773,14 @@ __evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat) if ((ref = btree->evict_ref) == NULL) return (0); - if (count_stat) - WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); + WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); /* - * Clear evict_ref first, in case releasing it forces eviction (we - * assert we never try to evict the current eviction walk point). + * Clear evict_ref before releasing it in case that forces eviction (we + * assert that we never try to evict the current eviction walk point). 
*/ btree->evict_ref = NULL; + WT_WITH_DHANDLE(cache->walk_session, session->dhandle, (ret = __wt_page_release(cache->walk_session, ref, WT_READ_NO_EVICT))); @@ -803,7 +803,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session) TAILQ_FOREACH(dhandle, &conn->dhqh, q) if (WT_PREFIX_MATCH(dhandle->name, "file:")) WT_WITH_DHANDLE(session, dhandle, - WT_TRET(__evict_clear_walk(session, true))); + WT_TRET(__evict_clear_walk(session))); return (ret); } @@ -848,7 +848,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) /* Clear any existing LRU eviction walk for the file. */ WT_WITH_PASS_LOCK(session, - ret = __evict_clear_walk(session, true)); + ret = __evict_clear_walk(session)); (void)__wt_atomic_subv32(&cache->pass_intr, 1); WT_ERR(ret); @@ -1662,8 +1662,15 @@ __evict_walk_file(WT_SESSION_IMPL *session, FLD_SET(walk_flags, WT_READ_PREV); /* - * Get some more eviction candidate pages. - * + * Get some more eviction candidate pages, starting at the last saved + * point. Clear the saved point immediately, we assert when discarding + * pages we're not discarding an eviction point, so this clear must be + * complete before the page is released. + */ + ref = btree->evict_ref; + btree->evict_ref = NULL; + + /* * !!! Take care terminating this loop. * * Don't make an extra call to __wt_tree_walk after we hit the end of a @@ -1676,7 +1683,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, for (evict = start, pages_queued = pages_seen = refs_walked = 0; evict < end && (ret == 0 || ret == WT_NOTFOUND); ret = __wt_tree_walk_count( - session, &btree->evict_ref, &refs_walked, walk_flags)) { + session, &ref, &refs_walked, walk_flags)) { /* * Check whether we're finding a good ratio of candidates vs * pages seen. 
Some workloads create "deserts" in trees where @@ -1690,7 +1697,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, if (give_up) break; - if ((ref = btree->evict_ref) == NULL) { + if (ref == NULL) { if (++restarts == 2) break; WT_STAT_CONN_INCR( @@ -1812,6 +1819,8 @@ fast: /* If the page can't be evicted, give up. */ btree->evict_walk_period /= 2; /* + * Give up the walk occasionally. + * * If we happen to end up on the root page or a page requiring urgent * eviction, clear it. We have to track hazard pointers, and the root * page complicates that calculation. @@ -1823,16 +1832,20 @@ fast: /* If the page can't be evicted, give up. */ * If we land on a page requiring forced eviction, move on to the next * page: we want this page evicted as quickly as possible. */ - if ((ref = btree->evict_ref) != NULL) { - /* Give up the walk occasionally. */ + if (ref != NULL) { if (__wt_ref_is_root(ref) || evict == start || give_up || ref->page->read_gen == WT_READGEN_OLDEST || - ref->page->memory_footprint >= btree->splitmempage) - WT_RET(__evict_clear_walk(session, restarts == 0)); - else if (ref->page->read_gen == WT_READGEN_OLDEST) + ref->page->memory_footprint >= btree->splitmempage) { + if (restarts == 0) + WT_STAT_CONN_INCR( + session, cache_eviction_walks_abandoned); + WT_RET(__wt_page_release(cache->walk_session, + ref, WT_READ_NO_EVICT)); + ref = NULL; + } else if (ref->page->read_gen == WT_READGEN_OLDEST) WT_RET_NOTFOUND_OK(__wt_tree_walk_count( - session, &btree->evict_ref, - &refs_walked, walk_flags)); + session, &ref, &refs_walked, walk_flags)); + btree->evict_ref = ref; } WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked); diff --git a/src/include/session.h b/src/include/session.h index 7dd523aea26..085f871a34f 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -52,8 +52,6 @@ struct __wt_session_impl { const char *lastop; /* Last operation */ uint32_t id; /* UID, offset in session array */ - WT_CONDVAR *cond; /* Condition variable */ - 
WT_EVENT_HANDLER *event_handler;/* Application's event handlers */ WT_DATA_HANDLE *dhandle; /* Current data handle */ diff --git a/src/session/session_api.c b/src/session/session_api.c index 71626e098cb..3a5d06f1b61 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -234,9 +234,6 @@ __session_close(WT_SESSION *wt_session, const char *config) /* Release common session resources. */ WT_TRET(__wt_session_release_resources(session)); - /* Destroy the thread's mutex. */ - WT_TRET(__wt_cond_destroy(session, &session->cond)); - /* The API lock protects opening and closing of sessions. */ __wt_spin_lock(session, &conn->api_lock); @@ -1837,8 +1834,6 @@ __open_session(WT_CONNECTION_IMPL *conn, session_ret->name = NULL; session_ret->id = i; - WT_ERR(__wt_cond_alloc(session, "session", &session_ret->cond)); - if (WT_SESSION_FIRST_USE(session_ret)) __wt_random_init(&session_ret->rnd); -- cgit v1.2.1 From a8fe04026ef55b8f59df24ff75ae151c7c370e2a Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Sun, 12 Feb 2017 20:13:24 -0500 Subject: WT-3135 WT-3159 Fix search_near() with custom collators for index keys of variable length. (#3254) * For checkpoint logging, use a format that ends in 'u' to be compatible with previously created log files. In previous WT versions, these formats ended in 'U', and a final 'U' was packed without a size prefix. Now, a 'U' in any position has a prefixed size.
--- dist/s_string.ok | 1 + dist/s_void | 4 + src/cursor/cur_index.c | 25 +- src/include/packing.i | 7 +- src/txn/txn_log.c | 4 +- test/csuite/Makefile.am | 3 + test/csuite/wt3135_search_near_collator/main.c | 360 +++++++++++++++++++++++++ 7 files changed, 398 insertions(+), 6 deletions(-) create mode 100644 test/csuite/wt3135_search_near_collator/main.c diff --git a/dist/s_string.ok b/dist/s_string.ok index bb0cacd9d5d..d2e9dffaa48 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -1182,6 +1182,7 @@ txt typedef uB uS +ui uint uintmax unbare diff --git a/dist/s_void b/dist/s_void index 4a6b4ad91a2..947153e730b 100755 --- a/dist/s_void +++ b/dist/s_void @@ -87,6 +87,10 @@ func_ok() -e '/int handle_progress$/d' \ -e '/int helium_cursor_reset$/d' \ -e '/int helium_session_verify$/d' \ + -e '/int index_compare_primary$/d' \ + -e '/int index_compare_S$/d' \ + -e '/int index_compare_u$/d' \ + -e '/int index_extractor_u$/d' \ -e '/int log_print_err$/d' \ -e '/int lz4_error$/d' \ -e '/int lz4_pre_size$/d' \ diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 4786b0524bc..13180efdea4 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -240,7 +240,16 @@ __curindex_search(WT_CURSOR *cursor) found_key = child->key; if (found_key.size < cursor->key.size) WT_ERR(WT_NOTFOUND); - found_key.size = cursor->key.size; + + /* + * Custom collators expect to see complete keys, pass an item containing + * all the visible fields so it unpacks correctly. + */ + if (cindex->index->collator != NULL) + WT_ERR(__wt_struct_repack(session, child->key_format, + cindex->iface.key_format, &child->key, &found_key)); + else + found_key.size = cursor->key.size; WT_ERR(__wt_compare( session, cindex->index->collator, &cursor->key, &found_key, &cmp)); @@ -307,8 +316,18 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact) * so we flip the sign of the result to match what callers expect. 
*/ found_key = child->key; - if (found_key.size > cursor->key.size) - found_key.size = cursor->key.size; + if (found_key.size > cursor->key.size) { + /* + * Custom collators expect to see complete keys, pass an item + * containing all the visible fields so it unpacks correctly. + */ + if (cindex->index->collator != NULL) + WT_ERR(__wt_struct_repack(session, + cindex->child->key_format, cindex->iface.key_format, + &child->key, &found_key)); + else + found_key.size = cursor->key.size; + } WT_ERR(__wt_compare( session, cindex->index->collator, &cursor->key, &found_key, exact)); diff --git a/src/include/packing.i b/src/include/packing.i index 17ca261bcfc..8ba3dd536ac 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -168,10 +168,15 @@ next: if (pack->cur == pack->end) (int)(pack->end - pack->orig), pack->orig); return (0); case 'u': - case 'U': /* Special case for items with a size prefix. */ pv->type = (!pv->havesize && *pack->cur != '\0') ? 'U' : 'u'; return (0); + case 'U': + /* + * Don't change the type. 'U' is used internally, so this type + * was already changed to explicitly include the size. 
+ */ + return (0); case 'b': case 'h': case 'i': diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c index 7ad295f421b..2931dc1ce82 100644 --- a/src/txn/txn_log.c +++ b/src/txn/txn_log.c @@ -269,7 +269,7 @@ __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session, WT_ITEM ckpt_snapshot_unused; uint32_t ckpt_file, ckpt_offset; u_int ckpt_nsnapshot_unused; - const char *fmt = WT_UNCHECKED_STRING(IIIU); + const char *fmt = WT_UNCHECKED_STRING(IIIu); if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, &ckpt_file, &ckpt_offset, @@ -297,7 +297,7 @@ __wt_txn_checkpoint_log( uint8_t *end, *p; size_t recsize; uint32_t i, rectype = WT_LOGREC_CHECKPOINT; - const char *fmt = WT_UNCHECKED_STRING(IIIIU); + const char *fmt = WT_UNCHECKED_STRING(IIIIu); txn = &session->txn; ckpt_lsn = &txn->ckpt_lsn; diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index bcdbf120d67..5167b42b433 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -43,6 +43,9 @@ noinst_PROGRAMS += test_wt2999_join_extractor test_wt3120_filesys_SOURCES = wt3120_filesys/main.c noinst_PROGRAMS += test_wt3120_filesys +test_wt3135_search_near_collator_SOURCES = wt3135_search_near_collator/main.c +noinst_PROGRAMS += test_wt3135_search_near_collator + # Run this during a "make check" smoke test. TESTS = $(noinst_PROGRAMS) LOG_COMPILER = $(TEST_WRAPPER) diff --git a/test/csuite/wt3135_search_near_collator/main.c b/test/csuite/wt3135_search_near_collator/main.c new file mode 100644 index 00000000000..3113d29dfa9 --- /dev/null +++ b/test/csuite/wt3135_search_near_collator/main.c @@ -0,0 +1,360 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. 
+ * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +/* + * JIRA ticket reference: WT-3135 + * Test case description: Each set of data is ordered and contains + * five elements (0-4). We insert elements 1 and 3, and then do + * search_near and search for each element. For each set of data, we perform + * these tests first using a custom collator, and second using a custom collator + * and extractor. In each case there are index keys having variable length. + * Failure mode: In the reported test case, the custom compare routine is + * given a truncated key to compare, and the unpack functions return errors + * because the truncation appeared in the middle of a key. 
+ */ + +#define TEST_ENTRY_COUNT 5 +typedef const char *TEST_SET[TEST_ENTRY_COUNT]; +static TEST_SET test_sets[] = { + { "0", "01", "012", "0123", "01234" }, + { "A", "B", "C", "D", "E" }, + { "5", "54", "543", "5432", "54321" }, + { "54321", "5433", "544", "55", "6" } +}; +#define TEST_SET_COUNT (sizeof(test_sets) / sizeof(test_sets[0])) + +static bool +item_str_equal(WT_ITEM *item, const char *str) +{ + return (item->size == strlen(str) + 1 && strncmp((char *)item->data, + str, item->size) == 0); +} + +static int +compare_int(int a, int b) +{ + return (a < b ? -1 : (a > b ? 1 : 0)); +} + +static int +index_compare_primary(WT_PACK_STREAM *s1, WT_PACK_STREAM *s2, int *cmp) +{ + int64_t pkey1, pkey2; + int rc1, rc2; + + rc1 = wiredtiger_unpack_int(s1, &pkey1); + rc2 = wiredtiger_unpack_int(s2, &pkey2); + + if (rc1 == 0 && rc2 == 0) + *cmp = compare_int(pkey1, pkey2); + else if (rc1 != 0 && rc2 != 0) + *cmp = 0; + else if (rc1 != 0) + *cmp = -1; + else + *cmp = 1; + return (0); +} + +static int +index_compare_S(WT_COLLATOR *collator, WT_SESSION *session, + const WT_ITEM *key1, const WT_ITEM *key2, int *cmp) +{ + WT_PACK_STREAM *s1, *s2; + const char *skey1, *skey2; + + (void)collator; + + testutil_check(wiredtiger_unpack_start(session, "Si", key1->data, + key1->size, &s1)); + testutil_check(wiredtiger_unpack_start(session, "Si", key2->data, + key2->size, &s2)); + + testutil_check(wiredtiger_unpack_str(s1, &skey1)); + testutil_check(wiredtiger_unpack_str(s2, &skey2)); + + if ((*cmp = strcmp(skey1, skey2)) == 0) + testutil_check(index_compare_primary(s1, s2, cmp)); + + testutil_check(wiredtiger_pack_close(s1, NULL)); + testutil_check(wiredtiger_pack_close(s2, NULL)); + + return (0); +} + +static int +index_compare_u(WT_COLLATOR *collator, WT_SESSION *session, + const WT_ITEM *key1, const WT_ITEM *key2, int *cmp) +{ + WT_ITEM skey1, skey2; + WT_PACK_STREAM *s1, *s2; + + (void)collator; + + testutil_check(wiredtiger_unpack_start(session, "ui", key1->data, + key1->size, 
&s1)); + testutil_check(wiredtiger_unpack_start(session, "ui", key2->data, + key2->size, &s2)); + + testutil_check(wiredtiger_unpack_item(s1, &skey1)); + testutil_check(wiredtiger_unpack_item(s2, &skey2)); + + if ((*cmp = strcmp(skey1.data, skey2.data)) == 0) + testutil_check(index_compare_primary(s1, s2, cmp)); + + testutil_check(wiredtiger_pack_close(s1, NULL)); + testutil_check(wiredtiger_pack_close(s2, NULL)); + + return (0); +} + +static int +index_extractor_u(WT_EXTRACTOR *extractor, WT_SESSION *session, + const WT_ITEM *key, const WT_ITEM *value, WT_CURSOR *result_cursor) +{ + (void)extractor; + (void)session; + (void)key; + + result_cursor->set_key(result_cursor, value); + return result_cursor->insert(result_cursor); +} + +static WT_COLLATOR collator_S = { index_compare_S, NULL, NULL }; +static WT_COLLATOR collator_u = { index_compare_u, NULL, NULL }; +static WT_EXTRACTOR extractor_u = { index_extractor_u, NULL, NULL }; + +/* + * Check search() and search_near() using the test string indicated + * by test_index. 
+ */ +static void +search_using_str(WT_CURSOR *cursor, TEST_SET test_set, int test_index) +{ + int exact, ret; + const char *result; + const char *str_01, *str_0123, *test_str; + + testutil_assert(test_index >= 0 && test_index <= 4); + str_01 = test_set[1]; + str_0123 = test_set[3]; + test_str = test_set[test_index]; + + cursor->set_key(cursor, test_str); + testutil_check(cursor->search_near(cursor, &exact)); + testutil_check(cursor->get_key(cursor, &result)); + + if (test_index == 0) + testutil_assert(strcmp(result, str_01) == 0 && exact > 0); + else if (test_index == 1) + testutil_assert(strcmp(result, str_01) == 0 && exact == 0); + else if (test_index == 2) + testutil_assert((strcmp(result, str_0123) == 0 && exact > 0) || + (strcmp(result, str_01) == 0 && exact < 0)); + else if (test_index == 3) + testutil_assert(strcmp(result, str_0123) == 0 && exact == 0); + else if (test_index == 4) + testutil_assert(strcmp(result, str_0123) == 0 && exact < 0); + + cursor->set_key(cursor, test_str); + ret = cursor->search(cursor); + + if (test_index == 0 || test_index == 2 || test_index == 4) + testutil_assert(ret == WT_NOTFOUND); + else if (test_index == 1 || test_index == 3) + testutil_assert(ret == 0); +} + +/* + * Check search() and search_near() using the test string indicated + * by test_index against a table containing a variable sized item. 
+ */ +static void +search_using_item(WT_CURSOR *cursor, TEST_SET test_set, int test_index) +{ + WT_ITEM item; + size_t testlen; + int exact, ret; + const char *str_01, *str_0123, *test_str; + + testutil_assert(test_index >= 0 && test_index <= 4); + str_01 = test_set[1]; + str_0123 = test_set[3]; + test_str = test_set[test_index]; + + testlen = strlen(test_str) + 1; + item.data = test_str; + item.size = testlen; + cursor->set_key(cursor, &item); + testutil_check(cursor->search_near(cursor, &exact)); + testutil_check(cursor->get_key(cursor, &item)); + + if (test_index == 0) + testutil_assert(item_str_equal(&item, str_01) && exact > 0); + else if (test_index == 1) + testutil_assert(item_str_equal(&item, str_01) && exact == 0); + else if (test_index == 2) + testutil_assert((item_str_equal(&item, str_0123) && exact > 0) + || (item_str_equal(&item, str_01) && exact < 0)); + else if (test_index == 3) + testutil_assert(item_str_equal(&item, str_0123) && exact == 0); + else if (test_index == 4) + testutil_assert(item_str_equal(&item, str_0123) && exact < 0); + + item.data = test_str; + item.size = testlen; + cursor->set_key(cursor, &item); + ret = cursor->search(cursor); + + if (test_index == 0 || test_index == 2 || test_index == 4) + testutil_assert(ret == WT_NOTFOUND); + else if (test_index == 1 || test_index == 3) + testutil_assert(ret == 0); +} + +/* + * For each set of data, perform tests. + */ +static void +test_one_set(WT_SESSION *session, TEST_SET set) +{ + WT_CURSOR *cursor; + WT_ITEM item; + int32_t i; + + /* + * Part 1: Using a custom collator, insert some elements + * and verify results from search_near. + */ + + testutil_check(session->create(session, + "table:main", "key_format=i,value_format=S,columns=(k,v)")); + testutil_check(session->create(session, + "index:main:def_collator", "columns=(v)")); + testutil_check(session->create(session, + "index:main:custom_collator", + "columns=(v),collator=collator_S")); + + /* Insert only elements #1 and #3. 
*/ + testutil_check(session->open_cursor(session, + "table:main", NULL, NULL, &cursor)); + cursor->set_key(cursor, 0); + cursor->set_value(cursor, set[1]); + testutil_check(cursor->insert(cursor)); + cursor->set_key(cursor, 1); + cursor->set_value(cursor, set[3]); + testutil_check(cursor->insert(cursor)); + testutil_check(cursor->close(cursor)); + + /* Check all elements in def_collator index. */ + testutil_check(session->open_cursor(session, + "index:main:def_collator", NULL, NULL, &cursor)); + for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++) + search_using_str(cursor, set, i); + testutil_check(cursor->close(cursor)); + + /* Check all elements in custom_collator index */ + testutil_check(session->open_cursor(session, + "index:main:custom_collator", NULL, NULL, &cursor)); + for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++) + search_using_str(cursor, set, i); + testutil_check(cursor->close(cursor)); + + /* + * Part 2: perform the same checks using a custom collator and + * extractor. + */ + testutil_check(session->create(session, + "table:main2", "key_format=i,value_format=u,columns=(k,v)")); + + testutil_check(session->create(session, "index:main2:idx_w_coll", + "key_format=u,collator=collator_u,extractor=extractor_u")); + + testutil_check(session->open_cursor(session, + "table:main2", NULL, NULL, &cursor)); + + memset(&item, 0, sizeof(item)); + item.size = strlen(set[1]) + 1; + item.data = set[1]; + cursor->set_key(cursor, 1); + cursor->set_value(cursor, &item); + testutil_check(cursor->insert(cursor)); + + item.size = strlen(set[3]) + 1; + item.data = set[3]; + cursor->set_key(cursor, 3); + cursor->set_value(cursor, &item); + testutil_check(cursor->insert(cursor)); + + testutil_check(cursor->close(cursor)); + + testutil_check(session->open_cursor(session, + "index:main2:idx_w_coll", NULL, NULL, &cursor)); + for (i = 0; i < (int32_t)TEST_ENTRY_COUNT; i++) + search_using_item(cursor, set, i); + testutil_check(cursor->close(cursor)); + + 
testutil_check(session->drop(session, "table:main", NULL)); + testutil_check(session->drop(session, "table:main2", NULL)); +} + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_SESSION *session; + int32_t i; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + testutil_check(wiredtiger_open(opts->home, NULL, "create", + &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + /* Add any collators and extractors used by tests */ + testutil_check(opts->conn->add_collator(opts->conn, "collator_S", + &collator_S, NULL)); + testutil_check(opts->conn->add_collator(opts->conn, "collator_u", + &collator_u, NULL)); + testutil_check(opts->conn->add_extractor(opts->conn, "extractor_u", + &extractor_u, NULL)); + + for (i = 0; i < (int32_t)TEST_SET_COUNT; i++) { + printf("test set %d\n", i); + test_one_set(session, test_sets[i]); + } + + testutil_check(session->close(session, NULL)); + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} -- cgit v1.2.1 From 2258dac42020b486b78947d434fde72c236d1e48 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 13 Feb 2017 10:02:37 -0500 Subject: WT-3174 Coverity/lint cleanup (#3293) * WT-3174 Coverity/lint cleanup clang38 complaints: wt3135_search_near_collator/main.c:75:22: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] *cmp = compare_int(pkey1, pkey2); ~~~~~~~~~~~ ^~~~~ wt3135_search_near_collator/main.c:75:29: error: implicit conversion loses integer precision: 'int64_t' (aka 'long') to 'int' [-Werror,-Wshorten-64-to-32] *cmp = compare_int(pkey1, pkey2); ~~~~~~~~~~~ ^~~~~ * Coverity complains in __split_root(): dead_error_condition: The switch value complete cannot be WT_ERR_PANIC. CID 1371132 (#1 of 1): Logically dead code (DEADCODE) dead_error_begin: Execution cannot reach this statement: case WT_ERR_PANIC:. 
Revert a minor part of 7f5d0f9, don't switch to benign error mode (setting WT_ERR_IGNORE) until after the split has been verified in DIAGNOSTIC mode. That makes sense and should make Coverity happy. * Fix type-casting, sizeof()/sizeof() is a size_t. --- src/btree/bt_split.c | 18 +++++++++--------- test/csuite/wt3135_search_near_collator/main.c | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index fcb14be7c76..3142e52be0d 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -651,15 +651,15 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_INTL_INDEX_SET(root, alloc_index); alloc_index = NULL; - /* The split is complete and correct, ignore benign errors. */ - complete = WT_ERR_IGNORE; - #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, ret = __split_verify_intl(session, root, NULL, root, false)); WT_ERR(ret); #endif + /* The split is complete and verified, ignore benign errors. */ + complete = WT_ERR_IGNORE; + /* * We can't free the previous root's index, there may be threads using * it. Add to the session's discard list, to be freed once we know no @@ -864,14 +864,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_FULL_BARRIER(); } - /* The split is complete and correct, ignore benign errors. */ - complete = WT_ERR_IGNORE; - #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent)); #endif + /* The split is complete and verified, ignore benign errors. */ + complete = WT_ERR_IGNORE; + /* * !!! * Swapping in the new page index released the page for eviction, we can @@ -1168,15 +1168,15 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); WT_INTL_INDEX_SET(page, replace_index); - /* The split is complete and correct, ignore benign errors. 
*/ - complete = WT_ERR_IGNORE; - #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, ret = __split_verify_intl(session, parent, page, page, true)); WT_ERR(ret); #endif + /* The split is complete and verified, ignore benign errors. */ + complete = WT_ERR_IGNORE; + /* * We don't care about the page-index we allocated, all we needed was * the array of WT_REF structures, which has now been split into the diff --git a/test/csuite/wt3135_search_near_collator/main.c b/test/csuite/wt3135_search_near_collator/main.c index 3113d29dfa9..8783034a7d8 100644 --- a/test/csuite/wt3135_search_near_collator/main.c +++ b/test/csuite/wt3135_search_near_collator/main.c @@ -57,7 +57,7 @@ item_str_equal(WT_ITEM *item, const char *str) } static int -compare_int(int a, int b) +compare_int(int64_t a, int64_t b) { return (a < b ? -1 : (a > b ? 1 : 0)); } @@ -329,7 +329,7 @@ main(int argc, char *argv[]) { TEST_OPTS *opts, _opts; WT_SESSION *session; - int32_t i; + size_t i; opts = &_opts; memset(opts, 0, sizeof(*opts)); @@ -349,8 +349,8 @@ main(int argc, char *argv[]) testutil_check(opts->conn->add_extractor(opts->conn, "extractor_u", &extractor_u, NULL)); - for (i = 0; i < (int32_t)TEST_SET_COUNT; i++) { - printf("test set %d\n", i); + for (i = 0; i < TEST_SET_COUNT; i++) { + printf("test set %" WT_SIZET_FMT "\n", i); test_one_set(session, test_sets[i]); } -- cgit v1.2.1 From dc33b134ea0e231fd87924c6a50e6f8230a7c6bf Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Tue, 14 Feb 2017 14:22:10 +1100 Subject: WT-3175 Don't verify children during splits up the tree. (#3294) Reverts part of 7f5d0f9981214c723f2ed90cf4533887ed406176. Fixes a deadlock in diagnostic mode. Also revert a change that could cause diagnostic code to read pages into cache: we don't want diagnostic adding cache pressure and we already verify pages as they are evicted. 
--- src/btree/bt_split.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 3142e52be0d..45550ff627f 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -251,29 +251,33 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) } /* - * __split_verify_intl -- - * Verify a set of internal pages involved in a split. + * __split_verify_root -- + * Verify a root page involved in a split. */ static int -__split_verify_intl(WT_SESSION_IMPL *session, - WT_PAGE *page1, WT_PAGE *page2, WT_PAGE *pindex_page, bool skip_first) +__split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_DECL_RET; WT_REF *ref; /* The split is complete and live, verify all of the pages involved. */ - if (page1 != NULL) - __split_verify_intl_key_order(session, page1); - if (page2 != NULL) - __split_verify_intl_key_order(session, page2); - - /* Skip the first slot on non-root internal pages, it's not set. */ - WT_INTL_FOREACH_BEGIN(session, pindex_page, ref) { - if (skip_first) { - skip_first = false; + __split_verify_intl_key_order(session, page); + + WT_INTL_FOREACH_BEGIN(session, page, ref) { + /* + * An eviction thread might be attempting to evict the page + * (the WT_REF may be WT_REF_LOCKED), or it may be a disk based + * page (the WT_REF may be WT_REF_READING), or it may be in + * some other state. Acquire a hazard pointer for any + * in-memory pages so we know the state of the page. + * + * Ignore pages not in-memory (deleted, on-disk, being read), + * there's no in-memory structure to check. 
+ */ + if ((ret = __wt_page_in(session, + ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) continue; - } - WT_ERR(__wt_page_in(session, ref, WT_READ_NO_EVICT)); + WT_ERR(ret); __split_verify_intl_key_order(session, ref->page); @@ -653,7 +657,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, - ret = __split_verify_intl(session, root, NULL, root, false)); + ret = __split_verify_root(session, root)); WT_ERR(ret); #endif @@ -1170,8 +1174,9 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, - ret = __split_verify_intl(session, parent, page, page, true)); - WT_ERR(ret); + __split_verify_intl_key_order(session, parent)); + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, page)); #endif /* The split is complete and verified, ignore benign errors. */ -- cgit v1.2.1 From 5b16ddd3815fb043061ac35151e277b919d7e463 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Tue, 14 Feb 2017 15:32:43 +1100 Subject: WT-3152 Switch the table lock to a rwlock. 
(#3291) --- dist/flags.py | 3 +- dist/s_define.list | 2 + src/conn/conn_handle.c | 4 +- src/cursor/cur_table.c | 2 +- src/include/connection.h | 2 +- src/include/flags.h | 23 +++++----- src/include/schema.h | 104 +++++++++++++++++++++++++++++++++------------- src/schema/schema_list.c | 2 +- src/session/session_api.c | 15 +++---- src/txn/txn_ckpt.c | 2 +- 10 files changed, 106 insertions(+), 53 deletions(-) diff --git a/dist/flags.py b/dist/flags.py index 216f7c29e0a..b20a7181532 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -123,7 +123,8 @@ flags = { 'SESSION_LOCKED_PASS', 'SESSION_LOCKED_SCHEMA', 'SESSION_LOCKED_SLOT', - 'SESSION_LOCKED_TABLE', + 'SESSION_LOCKED_TABLE_READ', + 'SESSION_LOCKED_TABLE_WRITE', 'SESSION_LOCKED_TURTLE', 'SESSION_LOGGING_INMEM', 'SESSION_LOOKASIDE_CURSOR', diff --git a/dist/s_define.list b/dist/s_define.list index 53a3df87615..8911d888077 100644 --- a/dist/s_define.list +++ b/dist/s_define.list @@ -39,6 +39,8 @@ WT_PADDING_CHECK WT_READ_BARRIER WT_REF_SIZE WT_SESSION_LOCKED_CHECKPOINT +WT_SESSION_LOCKED_TABLE_READ +WT_SESSION_LOCKED_TABLE_WRITE WT_SESSION_LOCKED_TURTLE WT_SIZE_CHECK WT_STATS_FIELD_TO_OFFSET diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 4f8d89fa9d2..287e9ca7b99 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -59,12 +59,12 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema); - WT_SPIN_INIT_TRACKED(session, &conn->table_lock, table); WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file")); /* Read-write locks */ __wt_rwlock_init(session, &conn->dhandle_lock); __wt_rwlock_init(session, &conn->hot_backup_lock); + __wt_rwlock_init(session, &conn->table_lock); WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); for (i = 0; i < WT_PAGE_LOCKS; ++i) @@ -142,7 +142,7 @@ 
__wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->metadata_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); - __wt_spin_destroy(session, &conn->table_lock); + __wt_rwlock_destroy(session, &conn->table_lock); __wt_spin_destroy(session, &conn->turtle_lock); for (i = 0; i < WT_PAGE_LOCKS; ++i) __wt_spin_destroy(session, &conn->page_lock[i]); diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 76f7fc5865f..7e8cd153d2d 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -769,7 +769,7 @@ __curtable_complete(WT_SESSION_IMPL *session, WT_TABLE *table) return (0); /* If the table is incomplete, wait on the table lock and recheck. */ - WT_WITH_TABLE_LOCK(session, complete = table->cg_complete); + WT_WITH_TABLE_READ_LOCK(session, complete = table->cg_complete); if (!complete) WT_RET_MSG(session, EINVAL, "'%s' not available until all column groups are created", diff --git a/src/include/connection.h b/src/include/connection.h index 3a719e59608..ce483d3291a 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -171,7 +171,7 @@ struct __wt_connection_impl { WT_SPINLOCK metadata_lock; /* Metadata update spinlock */ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ - WT_SPINLOCK table_lock; /* Table creation spinlock */ + WT_RWLOCK table_lock; /* Table list lock */ WT_SPINLOCK turtle_lock; /* Turtle file spinlock */ WT_RWLOCK dhandle_lock; /* Data handle list lock */ diff --git a/src/include/flags.h b/src/include/flags.h index 5219bf33ed6..c1fff920e3b 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -59,17 +59,18 @@ #define WT_SESSION_LOCKED_PASS 0x00000040 #define WT_SESSION_LOCKED_SCHEMA 0x00000080 #define WT_SESSION_LOCKED_SLOT 0x00000100 -#define WT_SESSION_LOCKED_TABLE 0x00000200 -#define WT_SESSION_LOCKED_TURTLE 0x00000400 -#define WT_SESSION_LOGGING_INMEM 
0x00000800 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00001000 -#define WT_SESSION_NO_CACHE 0x00002000 -#define WT_SESSION_NO_DATA_HANDLES 0x00004000 -#define WT_SESSION_NO_EVICTION 0x00008000 -#define WT_SESSION_NO_LOGGING 0x00010000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00020000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00040000 -#define WT_SESSION_SERVER_ASYNC 0x00080000 +#define WT_SESSION_LOCKED_TABLE_READ 0x00000200 +#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000400 +#define WT_SESSION_LOCKED_TURTLE 0x00000800 +#define WT_SESSION_LOGGING_INMEM 0x00001000 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00002000 +#define WT_SESSION_NO_CACHE 0x00004000 +#define WT_SESSION_NO_DATA_HANDLES 0x00008000 +#define WT_SESSION_NO_EVICTION 0x00010000 +#define WT_SESSION_NO_LOGGING 0x00020000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00040000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00080000 +#define WT_SESSION_SERVER_ASYNC 0x00100000 #define WT_STAT_CLEAR 0x00000001 #define WT_STAT_JSON 0x00000002 #define WT_STAT_ON_CLOSE 0x00000004 diff --git a/src/include/schema.h b/src/include/schema.h index fff57951c0e..9a6e1e54e80 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -78,10 +78,13 @@ struct __wt_table { */ #define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1) -/* Make it simple to check a generic locked state on the handle list lock */ +/* Helpers for the locked state of the handle list and table locks. 
*/ #define WT_SESSION_LOCKED_HANDLE_LIST \ (WT_SESSION_LOCKED_HANDLE_LIST_READ | \ WT_SESSION_LOCKED_HANDLE_LIST_WRITE) +#define WT_SESSION_LOCKED_TABLE \ + (WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_TABLE_WRITE) /* * WT_WITH_LOCK_WAIT -- @@ -90,7 +93,7 @@ struct __wt_table { #define WT_WITH_LOCK_WAIT(session, lock, flag, op) do { \ if (F_ISSET(session, (flag))) { \ op; \ - } else { \ + } else { \ __wt_spin_lock_track(session, lock); \ F_SET(session, (flag)); \ op; \ @@ -139,7 +142,7 @@ struct __wt_table { #define WT_WITH_HANDLE_LIST_READ_LOCK(session, op) do { \ if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { \ op; \ - } else { \ + } else { \ __wt_readlock(session, &S2C(session)->dhandle_lock); \ F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ op; \ @@ -150,15 +153,14 @@ struct __wt_table { /* * WT_WITH_HANDLE_LIST_WRITE_LOCK -- - * Acquire the data handle list lock in shared mode, perform an operation, - * drop the lock. The handle list lock is a read-write lock so the - * implementation is different to the other lock macros. - * Automatically upgrade from a read lock if held. + * Acquire the data handle list lock in exclusive mode, perform an + * operation, drop the lock. The handle list lock is a read-write lock so + * the implementation is different to the other lock macros. */ #define WT_WITH_HANDLE_LIST_WRITE_LOCK(session, op) do { \ if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)) { \ op; \ - } else { \ + } else { \ WT_ASSERT(session, \ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ));\ __wt_writelock(session, &S2C(session)->dhandle_lock); \ @@ -201,22 +203,58 @@ struct __wt_table { } while (0) /* - * WT_WITH_TABLE_LOCK, WT_WITH_TABLE_LOCK_NOWAIT -- + * WT_WITH_TABLE_READ_LOCK, WT_WITH_TABLE_WRITE_LOCK, + * WT_WITH_TABLE_WRITE_LOCK_NOWAIT -- * Acquire the table lock, perform an operation, drop the lock. + * The table lock is a read-write lock so the implementation is different + * to most other lock macros. 
+ * + * Note: readlock always waits because some operations need the table lock + * to discard handles, and we only expect it to be held across short + * operations. */ -#define WT_WITH_TABLE_LOCK(session, op) do { \ - WT_ASSERT(session, \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \ - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ - WT_WITH_LOCK_WAIT(session, \ - &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \ +#define WT_WITH_TABLE_READ_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ + __wt_readlock(session, &S2C(session)->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \ + __wt_readunlock(session, &S2C(session)->table_lock); \ + } \ } while (0) -#define WT_WITH_TABLE_LOCK_NOWAIT(session, ret, op) do { \ + +#define WT_WITH_TABLE_WRITE_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST)); \ + __wt_writelock(session, &S2C(session)->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &S2C(session)->table_lock); \ + } \ +} while (0) +#define WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, op) do { \ WT_ASSERT(session, \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \ - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ - WT_WITH_LOCK_NOWAIT(session, ret, \ - &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE) || \ + !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST)); \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \ + op; \ + } else if ((ret = __wt_try_writelock(session, \ + 
&S2C(session)->table_lock)) == 0) { \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &S2C(session)->table_lock); \ + } \ } while (0) /* @@ -232,8 +270,10 @@ struct __wt_table { F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ bool __handle_write_locked = \ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ - bool __table_locked = \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE); \ + bool __table_read_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ); \ + bool __table_write_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ bool __schema_locked = \ F_ISSET(session, WT_SESSION_LOCKED_SCHEMA); \ if (__handle_read_locked) { \ @@ -244,9 +284,13 @@ struct __wt_table { F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ __wt_writeunlock(session, &__conn->dhandle_lock); \ } \ - if (__table_locked) { \ - F_CLR(session, WT_SESSION_LOCKED_TABLE); \ - __wt_spin_unlock(session, &__conn->table_lock); \ + if (__table_read_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \ + __wt_readunlock(session, &__conn->table_lock); \ + } \ + if (__table_write_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &__conn->table_lock); \ } \ if (__schema_locked) { \ F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \ @@ -265,9 +309,13 @@ struct __wt_table { __wt_spin_lock(session, &__conn->schema_lock); \ F_SET(session, WT_SESSION_LOCKED_SCHEMA); \ } \ - if (__table_locked) { \ - __wt_spin_lock(session, &__conn->table_lock); \ - F_SET(session, WT_SESSION_LOCKED_TABLE); \ + if (__table_read_locked) { \ + __wt_readlock(session, &__conn->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \ + } \ + if (__table_write_locked) { \ + __wt_writelock(session, &__conn->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ } \ if (__handle_read_locked) { \ __wt_readlock(session, &__conn->dhandle_lock); \ diff --git 
a/src/schema/schema_list.c b/src/schema/schema_list.c index ea7374b7554..74ef5135a4a 100644 --- a/src/schema/schema_list.c +++ b/src/schema/schema_list.c @@ -25,7 +25,7 @@ __schema_add_table(WT_SESSION_IMPL *session, /* Make sure the metadata is open before getting other locks. */ WT_RET(__wt_metadata_cursor(session, NULL)); - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_READ_LOCK(session, ret = __wt_schema_open_table( session, name, namelen, ok_incomplete, &table)); WT_RET(ret); diff --git a/src/session/session_api.c b/src/session/session_api.c index 3a5d06f1b61..d282c5d0c32 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -162,7 +162,7 @@ __session_alter(WT_SESSION *wt_session, const char *uri, const char *config) cfg[1] = NULL; WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_alter(session, uri, cfg)))); err: if (ret != 0) @@ -518,7 +518,7 @@ __wt_session_create( WT_DECL_RET; WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_create(session, uri, config))); return (ret); } @@ -766,7 +766,7 @@ __session_rename(WT_SESSION *wt_session, WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_rename(session, uri, newuri, cfg)))); err: if (ret != 0) @@ -855,21 +855,22 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config) if (lock_wait) WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, ret = + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_drop(session, uri, cfg)))); else WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret, WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret, - WT_WITH_TABLE_LOCK_NOWAIT(session, ret, ret = + WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, + ret = __wt_schema_drop(session, uri, cfg)))); } else { if (lock_wait) 
WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_drop(session, uri, cfg))); else WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret, - WT_WITH_TABLE_LOCK_NOWAIT(session, ret, + WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, ret = __wt_schema_drop(session, uri, cfg))); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 5932e058552..3261c8089f4 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -650,7 +650,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_ASSERT(session, session->ckpt_handle_next == 0); WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_READ_LOCK(session, ret = __checkpoint_apply_all( session, cfg, __wt_checkpoint_get_handles, NULL))); WT_ERR(ret); -- cgit v1.2.1 From 988c297f22bbce3a40f7eb9ed22cdb7d9bf0a9c8 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 13 Feb 2017 23:44:11 -0500 Subject: WT-3144 bug fix: random cursor returns not-found when descending to an empty page (#3289) * If random descent through the tree fails, fallback to skipping through the tree's pages; if skipping through the tree's pages fails, fallback to a random entry from the first page in the tree that contains anything at all. * Add tests that create a tree with enough data for multiple pages, reopens the connection so we have a real tree, then truncates most / all of the tree and makes sure random lookups find data / fail (respectively). That way we're testing WT_REF_DELETED, not just empty pages. * Fix a documentation error, we never implemented a next_random_sample_percent configuration. 
--- src/btree/bt_cursor.c | 134 ++++++++++++++++++++++++++------------- src/btree/row_srch.c | 14 ++-- src/docs/cursor-random.dox | 5 -- test/suite/test_cursor_random.py | 49 ++++++++++++++ 4 files changed, 144 insertions(+), 58 deletions(-) diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index d18b9b76992..c0b028725c7 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -846,7 +846,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_SESSION_IMPL *session; WT_UPDATE *upd; wt_off_t size; - uint64_t skip; + uint64_t n, skip; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = cbt->btree; @@ -862,60 +862,104 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_STAT_CONN_INCR(session, cursor_next); WT_STAT_DATA_INCR(session, cursor_next); +#ifdef HAVE_DIAGNOSTIC /* - * If retrieving random values without sampling, or we don't have a - * page reference, pick a roughly random leaf page in the tree. + * Under some conditions we end up using the underlying cursor.next to + * walk through the object. Since there are multiple calls, we can hit + * the cursor-order checks, turn them off. */ - if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { - /* - * Skip past the sample size of the leaf pages in the tree - * between each random key return to compensate for unbalanced - * trees. - * - * Use the underlying file size divided by its block allocation - * size as our guess of leaf pages in the file (this can be - * entirely wrong, as it depends on how many pages are in this - * particular checkpoint, how large the leaf and internal pages - * really are, and other factors). Then, divide that value by - * the configured sample size and increment the final result to - * make sure tiny files don't leave us with a skip value of 0. - * - * !!! - * Ideally, the number would be prime to avoid restart issues. 
- */ - if (cbt->next_random_sample_size != 0) { - WT_ERR(btree->bm->size(btree->bm, session, &size)); - cbt->next_random_leaf_skip = (uint64_t) - ((size / btree->allocsize) / - cbt->next_random_sample_size) + 1; - } + __wt_cursor_key_order_reset(cbt); +#endif - /* - * Choose a leaf page from the tree. - */ + /* + * If we don't have a current position in the tree, or if retrieving + * random values without sampling, pick a roughly random leaf page in + * the tree and return an entry from it. + */ + if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { WT_ERR(__cursor_func_init(cbt, true)); WT_WITH_PAGE_INDEX( session, ret = __wt_row_random_descent(session, cbt)); - WT_ERR(ret); - } else { + if (ret == 0) + goto random_page_entry; + /* - * Read through the tree, skipping leaf pages. Be cautious about - * the skip count: if the last leaf page skipped was also the - * last leaf page in the tree, it may be set to zero on return - * with the end-of-walk condition. - * - * Pages read for data sampling aren't "useful"; don't update - * the read generation of pages already in memory, and if a page - * is read, set its generation to a low value so it is evicted - * quickly. + * Random descent may return not-found: the tree might be empty + * or have so many deleted items we didn't find any valid pages. + * We can't return WT_NOTFOUND to the application unless a tree + * is really empty, fallback to skipping through tree pages. */ - for (skip = - cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) - WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, - WT_READ_NO_GEN | - WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + WT_ERR_NOTFOUND_OK(ret); + } + + /* + * Cursor through the tree, skipping past the sample size of the leaf + * pages in the tree between each random key return to compensate for + * unbalanced trees. + * + * If the random descent attempt failed, we don't have a configured + * sample size, use 100 for no particular reason. 
+ */ + if (cbt->next_random_sample_size == 0) + cbt->next_random_sample_size = 100; + + /* + * If the random descent attempt failed, or it's our first skip attempt, + * we haven't yet set the pages to skip, do it now. + * + * Use the underlying file size divided by its block allocation size as + * our guess of leaf pages in the file (this can be entirely wrong, as + * it depends on how many pages are in this particular checkpoint, how + * large the leaf and internal pages really are, and other factors). + * Then, divide that value by the configured sample size and increment + * the final result to make sure tiny files don't leave us with a skip + * value of 0. + * + * !!! + * Ideally, the number would be prime to avoid restart issues. + */ + if (cbt->next_random_leaf_skip == 0) { + WT_ERR(btree->bm->size(btree->bm, session, &size)); + cbt->next_random_leaf_skip = (uint64_t) + ((size / btree->allocsize) / + cbt->next_random_sample_size) + 1; + } + + /* + * Be paranoid about loop termination: first, if the last leaf page + * skipped was also the last leaf page in the tree, skip may be set to + * zero on return along with the NULL WT_REF end-of-walk condition. + * Second, if a tree has no valid pages at all (the condition after + * initial creation), we might make no progress at all, or finally, if + * a tree has only deleted pages, we'll make progress, but never get a + * useful WT_REF. And, of course, the tree can switch from one of these + * states to another without warning. Decrement skip regardless of what + * is happening in the search, guarantee we eventually quit. + * + * Pages read for data sampling aren't "useful"; don't update the read + * generation of pages already in memory, and if a page is read, set + * its generation to a low value so it is evicted quickly. 
+ */ + for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { + n = skip; + WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, + WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + if (n == skip) { + if (skip == 0) + break; + --skip; + } } + /* + * We can't return WT_NOTFOUND to the application unless a tree is + * really empty, fallback to a random entry from the first page in the + * tree that has anything at all. + */ + if (cbt->ref == NULL) + WT_ERR(__wt_btcur_next(cbt, false)); + +random_page_entry: /* * Select a random entry from the leaf page. If it's not valid, move to * the next entry, if that doesn't work, move to the previous entry. diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 1c3d5ad5daa..0858e42356b 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -798,14 +798,7 @@ __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) current = NULL; retry = 100; - if (0) { -restart: /* - * Discard the currently held page and restart the search from - * the root. - */ - WT_RET(__wt_page_release(session, current, 0)); - } - +restart: /* Search the internal pages of the tree. */ current = &btree->root; for (;;) { @@ -837,6 +830,11 @@ restart: /* break; } if (i == entries || descent == NULL) { + /* + * Discard the currently held page and restart from the + * root. + */ + WT_RET(__wt_page_release(session, current, 0)); if (--retry > 0) goto restart; return (WT_NOTFOUND); diff --git a/src/docs/cursor-random.dox b/src/docs/cursor-random.dox index a0a3212be6d..b6434e3d161 100644 --- a/src/docs/cursor-random.dox +++ b/src/docs/cursor-random.dox @@ -20,9 +20,4 @@ cursor configured using \c next_random_sample_size divides the object into \c next_random_sample_size pieces, and each subsequent retrieval returns a record from the next one of those pieces. 
-For example, setting \c next_random_sample_percent to \c 10 would cause -the cursor to sequentially return records from each tenth part of the -object. Setting \c next_random_sample_percent to \c 1000 would cause the -cursor to sequentially return records from each .1% of the object. - */ diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py index 3bda6dc9946..ee0f85a29ee 100644 --- a/test/suite/test_cursor_random.py +++ b/test/suite/test_cursor_random.py @@ -71,6 +71,15 @@ class test_cursor_random(wttest.WiredTigerTestCase): self.assertEquals(cursor.reset(), 0) cursor.close() + # Check that next_random fails with an empty tree, repeatedly. + def test_cursor_random_empty(self): + uri = self.type + self.session.create(uri, 'key_format=S,value_format=S') + cursor = self.session.open_cursor(uri, None, self.config) + for i in range(1,5): + self.assertTrue(cursor.next(), wiredtiger.WT_NOTFOUND) + cursor.close + # Check that next_random works with a single value, repeatedly. def test_cursor_random_single_record(self): uri = self.type @@ -127,6 +136,46 @@ class test_cursor_random(wttest.WiredTigerTestCase): def test_cursor_random_multiple_page_records(self): self.cursor_random_multiple_page_records(0) + # Check that next_random works in the presence of a set of values, some of + # which are deleted. + def test_cursor_random_deleted_partial(self): + uri = self.type + ds = self.dataset(self, uri, 10000, + config='allocation_size=512,leaf_page_max=512') + ds.populate() + + # Close the connection so everything is forced to disk.
+ self.reopen_conn() + + start = self.session.open_cursor(uri, None) + start.set_key(ds.key(10)) + end = self.session.open_cursor(uri, None) + end.set_key(ds.key(10000-10)) + self.session.truncate(None, start, end, None) + self.assertEqual(start.close(), 0) + self.assertEqual(end.close(), 0) + + cursor = self.session.open_cursor(uri, None, self.config) + for i in range(1,10): + self.assertEqual(cursor.next(), 0) + + # Check that next_random fails in the presence of a set of values, all of + # which are deleted. + def test_cursor_random_deleted_all(self): + uri = self.type + ds = self.dataset(self, uri, 10000, + config='allocation_size=512,leaf_page_max=512') + ds.populate() + + # Close the connection so everything is forced to disk. + self.reopen_conn() + + self.session.truncate(uri, None, None, None) + + cursor = self.session.open_cursor(uri, None, self.config) + for i in range(1,10): + self.assertTrue(cursor.next(), wiredtiger.WT_NOTFOUND) + # Check that opening a random cursor on column-store returns not-supported. class test_cursor_random_column(wttest.WiredTigerTestCase): scenarios = make_scenarios([ -- cgit v1.2.1 From df64d277ae99adf98824fbf2118626c77fd2f199 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Tue, 14 Feb 2017 16:39:24 +1100 Subject: WT-3149 Have eviction choose a random point when walking a tree. (#3285) Only choose a random point when there is no saved walk point. Fixes to random search as well - noticed search termination conditions when sampling the search page vs. walking it sequentially weren't the same. Changed that, which caused the test_compact02 test to fail. There's an underlying bug in this code, if we return WT_NOTFOUND, we can lose a hazard pointer on the page of the tree we unsuccessfully searched. Add a page-release in the case of returning not-found.
--- dist/filelist | 1 + src/btree/bt_cursor.c | 180 ++-------------------- src/btree/bt_random.c | 413 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/btree/row_srch.c | 237 ----------------------------- src/evict/evict_lru.c | 28 +++- src/include/extern.h | 7 +- 6 files changed, 454 insertions(+), 412 deletions(-) create mode 100644 src/btree/bt_random.c diff --git a/dist/filelist b/dist/filelist index 13d67ef961b..3886035eaa9 100644 --- a/dist/filelist +++ b/dist/filelist @@ -30,6 +30,7 @@ src/btree/bt_io.c src/btree/bt_misc.c src/btree/bt_ovfl.c src/btree/bt_page.c +src/btree/bt_random.c src/btree/bt_read.c src/btree/bt_rebalance.c src/btree/bt_ret.c diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index c0b028725c7..5fde2237538 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -76,11 +76,11 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) } /* - * __cursor_valid -- + * __wt_cursor_valid -- * Return if the cursor references an valid key/value pair. */ -static inline bool -__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) +bool +__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) { WT_BTREE *btree; WT_CELL *cell; @@ -330,7 +330,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, cbt->ref, false) : __cursor_col_search(session, cbt, cbt->ref)); - valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); + valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); @@ -338,7 +338,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? 
__cursor_row_search(session, cbt, NULL, false) : __cursor_col_search(session, cbt, NULL)); - valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); + valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); } if (valid) @@ -419,14 +419,14 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * Ignore those cases, it makes things too complicated. */ if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1) - valid = __cursor_valid(cbt, &upd); + valid = __wt_cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - valid = __cursor_valid(cbt, &upd); + valid = __wt_cursor_valid(cbt, &upd); } /* @@ -462,7 +462,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - if (__cursor_valid(cbt, &upd)) { + if (__wt_cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) @@ -537,7 +537,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); * Fail in that case, the record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) || + ((cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) || (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) WT_ERR(WT_DUPLICATE_KEY); @@ -552,7 +552,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); * key/value pair. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - cbt->compare == 0 && __cursor_valid(cbt, NULL)) + cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) WT_ERR(WT_DUPLICATE_KEY); ret = __cursor_row_modify(session, cbt, false); @@ -682,12 +682,12 @@ retry: WT_RET(__cursor_func_init(cbt, true)); /* * If we find a matching record, check whether an update would * conflict. 
Do this before checking if the update is visible - * in __cursor_valid, or we can miss conflict. + * in __wt_cursor_valid, or we can miss conflict. */ WT_ERR(__curfile_update_check(cbt)); /* Remove the record if it exists. */ - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) { + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) { if (!__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); /* @@ -711,7 +711,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); /* Check whether an update would conflict. */ WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); ret = __cursor_row_modify(session, cbt, true); @@ -786,7 +786,8 @@ retry: WT_RET(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) && + if ((cbt->compare != 0 || + !__wt_cursor_valid(cbt, NULL)) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } @@ -800,7 +801,7 @@ retry: WT_RET(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); } ret = __cursor_row_modify(session, cbt, false); @@ -829,155 +830,6 @@ err: if (ret == WT_RESTART) { return (ret); } -/* - * __wt_btcur_next_random -- - * Move to a random record in the tree. There are two algorithms, one - * where we select a record at random from the whole tree on each - * retrieval and one where we first select a record at random from the - * whole tree, and then subsequently sample forward from that location. - * The sampling approach allows us to select reasonably uniform random - * points from unbalanced trees. 
- */ -int -__wt_btcur_next_random(WT_CURSOR_BTREE *cbt) -{ - WT_BTREE *btree; - WT_DECL_RET; - WT_SESSION_IMPL *session; - WT_UPDATE *upd; - wt_off_t size; - uint64_t n, skip; - - session = (WT_SESSION_IMPL *)cbt->iface.session; - btree = cbt->btree; - - /* - * Only supports row-store: applications can trivially select a random - * value from a column-store, if there were any reason to do so. - */ - if (btree->type != BTREE_ROW) - WT_RET_MSG(session, ENOTSUP, - "WT_CURSOR.next_random only supported by row-store tables"); - - WT_STAT_CONN_INCR(session, cursor_next); - WT_STAT_DATA_INCR(session, cursor_next); - -#ifdef HAVE_DIAGNOSTIC - /* - * Under some conditions we end up using the underlying cursor.next to - * walk through the object. Since there are multiple calls, we can hit - * the cursor-order checks, turn them off. - */ - __wt_cursor_key_order_reset(cbt); -#endif - - /* - * If we don't have a current position in the tree, or if retrieving - * random values without sampling, pick a roughly random leaf page in - * the tree and return an entry from it. - */ - if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { - WT_ERR(__cursor_func_init(cbt, true)); - WT_WITH_PAGE_INDEX( - session, ret = __wt_row_random_descent(session, cbt)); - if (ret == 0) - goto random_page_entry; - - /* - * Random descent may return not-found: the tree might be empty - * or have so many deleted items we didn't find any valid pages. - * We can't return WT_NOTFOUND to the application unless a tree - * is really empty, fallback to skipping through tree pages. - */ - WT_ERR_NOTFOUND_OK(ret); - } - - /* - * Cursor through the tree, skipping past the sample size of the leaf - * pages in the tree between each random key return to compensate for - * unbalanced trees. - * - * If the random descent attempt failed, we don't have a configured - * sample size, use 100 for no particular reason. 
- */ - if (cbt->next_random_sample_size == 0) - cbt->next_random_sample_size = 100; - - /* - * If the random descent attempt failed, or it's our first skip attempt, - * we haven't yet set the pages to skip, do it now. - * - * Use the underlying file size divided by its block allocation size as - * our guess of leaf pages in the file (this can be entirely wrong, as - * it depends on how many pages are in this particular checkpoint, how - * large the leaf and internal pages really are, and other factors). - * Then, divide that value by the configured sample size and increment - * the final result to make sure tiny files don't leave us with a skip - * value of 0. - * - * !!! - * Ideally, the number would be prime to avoid restart issues. - */ - if (cbt->next_random_leaf_skip == 0) { - WT_ERR(btree->bm->size(btree->bm, session, &size)); - cbt->next_random_leaf_skip = (uint64_t) - ((size / btree->allocsize) / - cbt->next_random_sample_size) + 1; - } - - /* - * Be paranoid about loop termination: first, if the last leaf page - * skipped was also the last leaf page in the tree, skip may be set to - * zero on return along with the NULL WT_REF end-of-walk condition. - * Second, if a tree has no valid pages at all (the condition after - * initial creation), we might make no progress at all, or finally, if - * a tree has only deleted pages, we'll make progress, but never get a - * useful WT_REF. And, of course, the tree can switch from one of these - * states to another without warning. Decrement skip regardless of what - * is happening in the search, guarantee we eventually quit. - * - * Pages read for data sampling aren't "useful"; don't update the read - * generation of pages already in memory, and if a page is read, set - * its generation to a low value so it is evicted quickly. 
- */ - for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { - n = skip; - WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, - WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); - if (n == skip) { - if (skip == 0) - break; - --skip; - } - } - - /* - * We can't return WT_NOTFOUND to the application unless a tree is - * really empty, fallback to a random entry from the first page in the - * tree that has anything at all. - */ - if (cbt->ref == NULL) - WT_ERR(__wt_btcur_next(cbt, false)); - -random_page_entry: - /* - * Select a random entry from the leaf page. If it's not valid, move to - * the next entry, if that doesn't work, move to the previous entry. - */ - WT_ERR(__wt_row_random_leaf(session, cbt)); - if (__cursor_valid(cbt, &upd)) - WT_ERR(__wt_kv_return(session, cbt, upd)); - else { - if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) - ret = __wt_btcur_prev(cbt, false); - WT_ERR(ret); - } - return (0); - -err: WT_TRET(__cursor_reset(cbt)); - return (ret); -} - /* * __wt_btcur_compare -- * Return a comparison between two cursors. diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c new file mode 100644 index 00000000000..3cc6838c4c8 --- /dev/null +++ b/src/btree/bt_random.c @@ -0,0 +1,413 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_row_random_leaf -- + * Return a random key from a row-store leaf page. 
+ */ +int +__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_INSERT *ins, **start, **stop; + WT_INSERT_HEAD *ins_head; + WT_PAGE *page; + uint64_t samples; + uint32_t choice, entries, i; + int level; + + page = cbt->ref->page; + start = stop = NULL; /* [-Wconditional-uninitialized] */ + entries = 0; /* [-Wconditional-uninitialized] */ + + __cursor_pos_clear(cbt); + + /* If the page has disk-based entries, select from them. */ + if (page->entries != 0) { + cbt->compare = 0; + cbt->slot = __wt_random(&session->rnd) % page->entries; + + /* + * The real row-store search function builds the key, so we + * have to as well. + */ + return (__wt_row_leaf_key(session, + page, page->pg_row + cbt->slot, cbt->tmp, false)); + } + + /* + * If the tree is new (and not empty), it might have a large insert + * list. + * + * Walk down the list until we find a level with at least 50 entries, + * that's where we'll start rolling random numbers. The value 50 is + * used to ignore levels with only a few entries, that is, levels which + * are potentially badly skewed. + */ + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) + return (WT_NOTFOUND); + for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + start = &ins_head->head[level]; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + + if (entries > 50) + break; + } + + /* + * If it's a tiny list and we went all the way to level 0, correct the + * level; entries is correctly set. + */ + if (level < 0) + level = 0; + + /* + * Step down the skip list levels, selecting a random chunk of the name + * space at each level. + */ + for (samples = entries; level > 0; samples += entries) { + /* + * There are (entries) or (entries + 1) chunks of the name space + * considered at each level. 
They are: between start and the 1st + * element, between the 1st and 2nd elements, and so on to the + * last chunk which is the name space after the stop element on + * the current level. This last chunk of name space may or may + * not be there: as we descend the levels of the skip list, this + * chunk may appear, depending if the next level down has + * entries logically after the stop point in the current level. + * We can't ignore those entries: because of the algorithm used + * to determine the depth of a skiplist, there may be a large + * number of entries "revealed" by descending a level. + * + * If the next level down has more items after the current stop + * point, there are (entries + 1) chunks to consider, else there + * are (entries) chunks. + */ + if (*(stop - 1) == NULL) + choice = __wt_random(&session->rnd) % entries; + else + choice = __wt_random(&session->rnd) % (entries + 1); + + if (choice == entries) { + /* + * We selected the name space after the stop element on + * this level. Set the start point to the current stop + * point, descend a level and move the stop element to + * the end of the list, that is, the end of the newly + * discovered name space, counting entries as we go. + */ + start = stop; + --start; + --level; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + } else { + /* + * We selected another name space on the level. Move the + * start pointer the selected number of entries forward + * to the start of the selected chunk (if the selected + * number is 0, start won't move). Set the stop pointer + * to the next element in the list and drop both start + * and stop down a level. + */ + for (i = 0; i < choice; ++i) + start = &(*start)->next[level]; + stop = &(*start)->next[level]; + + --start; + --stop; + --level; + + /* Count the entries in the selected name space. 
*/ + for (entries = 0, + ins = *start; ins != *stop; ins = ins->next[level]) + ++entries; + } + } + + /* + * When we reach the bottom level, entries will already be set. Select + * a random entry from the name space and return it. + * + * It should be impossible for the entries count to be 0 at this point, + * but check for it out of paranoia and to quiet static testing tools. + */ + if (entries > 0) + entries = __wt_random(&session->rnd) % entries; + for (ins = *start; entries > 0; --entries) + ins = ins->next[0]; + + cbt->ins = ins; + cbt->ins_head = ins_head; + cbt->compare = 0; + + /* + * Random lookups in newly created collections can be slow if a page + * consists of a large skiplist. Schedule the page for eviction if we + * encounter a large skiplist. This is worthwhile because applications + * that take a sample often take many samples, so the overhead of + * traversing the skip list each time accumulates to real time. + */ + if (samples > 5000) + __wt_page_evict_soon(session, cbt->ref); + + return (0); +} + +/* + * __wt_random_descent -- + * Find a random leaf page in a tree. + */ +int +__wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF *current, *descent; + uint32_t flags, i, entries, retry; + + btree = S2BT(session); + current = NULL; + retry = 100; + + /* Eviction should not be tapped to do eviction. */ + flags = WT_READ_RESTART_OK; + if (eviction) + LF_SET(WT_READ_NO_EVICT); + + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, flags)); + } + + /* Search the internal pages of the tree. */ + current = &btree->root; + for (;;) { + page = current->page; + if (!WT_PAGE_IS_INTERNAL(page)) + break; + + WT_INTL_INDEX_GET(session, page, pindex); + entries = pindex->entries; + + /* + * There may be empty pages in the tree, and they're useless to + * us.
If we don't find a non-empty page in "entries" random + * guesses, take the first non-empty page in the tree. If the + * search page contains nothing other than empty pages, restart + * from the root some number of times before giving up. + * + * Eviction is only looking for a place in the cache and so only + * wants in-memory pages (but a deleted page is fine); currently + * our other caller is looking for a key/value pair on a random + * leaf page, and so will accept any page that contains a valid + * key/value pair, so on-disk is fine, but deleted is not. + */ + descent = NULL; + for (i = 0; i < entries; ++i) { + descent = + pindex->index[__wt_random(&session->rnd) % entries]; + if (descent->state == WT_REF_MEM || + (!eviction && descent->state == WT_REF_DISK)) + break; + } + if (i == entries) + for (i = 0; i < entries; ++i) { + descent = pindex->index[i]; + if (descent->state == WT_REF_MEM || + (!eviction && + descent->state == WT_REF_DISK)) + break; + } + if (i == entries || descent == NULL) { + if (--retry > 0) + goto restart; + + WT_RET(__wt_page_release(session, current, flags)); + return (WT_NOTFOUND); + } + + /* + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search at the root. + * + * On other error, simply return, the swap call ensures we're + * holding nothing on failure. + */ + if ((ret = + __wt_page_swap(session, current, descent, flags)) == 0) { + current = descent; + continue; + } + if (ret == WT_RESTART) + goto restart; + return (ret); + } + + *refp = current; + return (0); +} + +/* + * __wt_btcur_next_random -- + * Move to a random record in the tree. There are two algorithms, one + * where we select a record at random from the whole tree on each + * retrieval and one where we first select a record at random from the + * whole tree, and then subsequently sample forward from that location.
+ * The sampling approach allows us to select reasonably uniform random + * points from unbalanced trees. + */ +int +__wt_btcur_next_random(WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + wt_off_t size; + uint64_t n, skip; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + btree = cbt->btree; + + /* + * Only supports row-store: applications can trivially select a random + * value from a column-store, if there were any reason to do so. + */ + if (btree->type != BTREE_ROW) + WT_RET_MSG(session, ENOTSUP, + "WT_CURSOR.next_random only supported by row-store tables"); + + WT_STAT_CONN_INCR(session, cursor_next); + WT_STAT_DATA_INCR(session, cursor_next); + +#ifdef HAVE_DIAGNOSTIC + /* + * Under some conditions we end up using the underlying cursor.next to + * walk through the object. Since there are multiple calls, we can hit + * the cursor-order checks, turn them off. + */ + __wt_cursor_key_order_reset(cbt); +#endif + + /* + * If we don't have a current position in the tree, or if retrieving + * random values without sampling, pick a roughly random leaf page in + * the tree and return an entry from it. + */ + if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { + WT_ERR(__cursor_func_init(cbt, true)); + WT_WITH_PAGE_INDEX(session, + ret = __wt_random_descent(session, &cbt->ref, false)); + if (ret == 0) + goto random_page_entry; + + /* + * Random descent may return not-found: the tree might be empty + * or have so many deleted items we didn't find any valid pages. + * We can't return WT_NOTFOUND to the application unless a tree + * is really empty, fallback to skipping through tree pages. + */ + WT_ERR_NOTFOUND_OK(ret); + } + + /* + * Cursor through the tree, skipping past the sample size of the leaf + * pages in the tree between each random key return to compensate for + * unbalanced trees. 
+ * + * If the random descent attempt failed, we don't have a configured + * sample size, use 100 for no particular reason. + */ + if (cbt->next_random_sample_size == 0) + cbt->next_random_sample_size = 100; + + /* + * If the random descent attempt failed, or it's our first skip attempt, + * we haven't yet set the pages to skip, do it now. + * + * Use the underlying file size divided by its block allocation size as + * our guess of leaf pages in the file (this can be entirely wrong, as + * it depends on how many pages are in this particular checkpoint, how + * large the leaf and internal pages really are, and other factors). + * Then, divide that value by the configured sample size and increment + * the final result to make sure tiny files don't leave us with a skip + * value of 0. + * + * !!! + * Ideally, the number would be prime to avoid restart issues. + */ + if (cbt->next_random_leaf_skip == 0) { + WT_ERR(btree->bm->size(btree->bm, session, &size)); + cbt->next_random_leaf_skip = (uint64_t) + ((size / btree->allocsize) / + cbt->next_random_sample_size) + 1; + } + + /* + * Be paranoid about loop termination: first, if the last leaf page + * skipped was also the last leaf page in the tree, skip may be set to + * zero on return along with the NULL WT_REF end-of-walk condition. + * Second, if a tree has no valid pages at all (the condition after + * initial creation), we might make no progress at all, or finally, if + * a tree has only deleted pages, we'll make progress, but never get a + * useful WT_REF. And, of course, the tree can switch from one of these + * states to another without warning. Decrement skip regardless of what + * is happening in the search, guarantee we eventually quit. + * + * Pages read for data sampling aren't "useful"; don't update the read + * generation of pages already in memory, and if a page is read, set + * its generation to a low value so it is evicted quickly. 
+ */ + for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { + n = skip; + WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, + WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + if (n == skip) { + if (skip == 0) + break; + --skip; + } + } + + /* + * We can't return WT_NOTFOUND to the application unless a tree is + * really empty, fallback to a random entry from the first page in the + * tree that has anything at all. + */ + if (cbt->ref == NULL) + WT_ERR(__wt_btcur_next(cbt, false)); + +random_page_entry: + /* + * Select a random entry from the leaf page. If it's not valid, move to + * the next entry, if that doesn't work, move to the previous entry. + */ + WT_ERR(__wt_row_random_leaf(session, cbt)); + if (__wt_cursor_valid(cbt, &upd)) + WT_ERR(__wt_kv_return(session, cbt, upd)); + else { + if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) + ret = __wt_btcur_prev(cbt, false); + WT_ERR(ret); + } + return (0); + +err: WT_TRET(__cursor_reset(cbt)); + return (ret); +} diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 0858e42356b..9c3d467340e 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -623,240 +623,3 @@ leaf_match: cbt->compare = 0; err: WT_TRET(__wt_page_release(session, current, 0)); return (ret); } - -/* - * __wt_row_random_leaf -- - * Return a random key from a row-store leaf page. - */ -int -__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) -{ - WT_INSERT *ins, **start, **stop; - WT_INSERT_HEAD *ins_head; - WT_PAGE *page; - uint64_t samples; - uint32_t choice, entries, i; - int level; - - page = cbt->ref->page; - start = stop = NULL; /* [-Wconditional-uninitialized] */ - entries = 0; /* [-Wconditional-uninitialized] */ - - __cursor_pos_clear(cbt); - - /* If the page has disk-based entries, select from them. 
*/ - if (page->entries != 0) { - cbt->compare = 0; - cbt->slot = __wt_random(&session->rnd) % page->entries; - - /* - * The real row-store search function builds the key, so we - * have to as well. - */ - return (__wt_row_leaf_key(session, - page, page->pg_row + cbt->slot, cbt->tmp, false)); - } - - /* - * If the tree is new (and not empty), it might have a large insert - * list. - * - * Walk down the list until we find a level with at least 50 entries, - * that's where we'll start rolling random numbers. The value 50 is - * used to ignore levels with only a few entries, that is, levels which - * are potentially badly skewed. - */ - F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) - return (WT_NOTFOUND); - for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { - start = &ins_head->head[level]; - for (entries = 0, stop = start; - *stop != NULL; stop = &(*stop)->next[level]) - ++entries; - - if (entries > 50) - break; - } - - /* - * If it's a tiny list and we went all the way to level 0, correct the - * level; entries is correctly set. - */ - if (level < 0) - level = 0; - - /* - * Step down the skip list levels, selecting a random chunk of the name - * space at each level. - */ - for (samples = entries; level > 0; samples += entries) { - /* - * There are (entries) or (entries + 1) chunks of the name space - * considered at each level. They are: between start and the 1st - * element, between the 1st and 2nd elements, and so on to the - * last chunk which is the name space after the stop element on - * the current level. This last chunk of name space may or may - * not be there: as we descend the levels of the skip list, this - * chunk may appear, depending if the next level down has - * entries logically after the stop point in the current level. 
- * We can't ignore those entries: because of the algorithm used - * to determine the depth of a skiplist, there may be a large - * number of entries "revealed" by descending a level. - * - * If the next level down has more items after the current stop - * point, there are (entries + 1) chunks to consider, else there - * are (entries) chunks. - */ - if (*(stop - 1) == NULL) - choice = __wt_random(&session->rnd) % entries; - else - choice = __wt_random(&session->rnd) % (entries + 1); - - if (choice == entries) { - /* - * We selected the name space after the stop element on - * this level. Set the start point to the current stop - * point, descend a level and move the stop element to - * the end of the list, that is, the end of the newly - * discovered name space, counting entries as we go. - */ - start = stop; - --start; - --level; - for (entries = 0, stop = start; - *stop != NULL; stop = &(*stop)->next[level]) - ++entries; - } else { - /* - * We selected another name space on the level. Move the - * start pointer the selected number of entries forward - * to the start of the selected chunk (if the selected - * number is 0, start won't move). Set the stop pointer - * to the next element in the list and drop both start - * and stop down a level. - */ - for (i = 0; i < choice; ++i) - start = &(*start)->next[level]; - stop = &(*start)->next[level]; - - --start; - --stop; - --level; - - /* Count the entries in the selected name space. */ - for (entries = 0, - ins = *start; ins != *stop; ins = ins->next[level]) - ++entries; - } - } - - /* - * When we reach the bottom level, entries will already be set. Select - * a random entry from the name space and return it. - * - * It should be impossible for the entries count to be 0 at this point, - * but check for it out of paranoia and to quiet static testing tools. 
- */ - if (entries > 0) - entries = __wt_random(&session->rnd) % entries; - for (ins = *start; entries > 0; --entries) - ins = ins->next[0]; - - cbt->ins = ins; - cbt->ins_head = ins_head; - cbt->compare = 0; - - /* - * Random lookups in newly created collections can be slow if a page - * consists of a large skiplist. Schedule the page for eviction if we - * encounter a large skiplist. This worthwhile because applications - * that take a sample often take many samples, so the overhead of - * traversing the skip list each time accumulates to real time. - */ - if (samples > 5000) - __wt_page_evict_soon(session, cbt->ref); - - return (0); -} - -/* - * __wt_row_random_descent -- - * Find a random leaf page in a row-store tree. - */ -int -__wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) -{ - WT_BTREE *btree; - WT_DECL_RET; - WT_PAGE *page; - WT_PAGE_INDEX *pindex; - WT_REF *current, *descent; - uint32_t i, entries, retry; - - btree = S2BT(session); - current = NULL; - retry = 100; - -restart: - /* Search the internal pages of the tree. */ - current = &btree->root; - for (;;) { - page = current->page; - if (page->type != WT_PAGE_ROW_INT) - break; - - WT_INTL_INDEX_GET(session, page, pindex); - entries = pindex->entries; - - /* - * There may be empty pages in the tree, and they're useless to - * us. If we don't find a non-empty page in "entries" random - * guesses, take the first non-empty page in the tree. If the - * search page contains nothing other than empty pages, restart - * from the root some number of times before giving up. 
- */ - descent = NULL; - for (i = 0; i < entries; ++i) { - descent = - pindex->index[__wt_random(&session->rnd) % entries]; - if (descent->state != WT_REF_DELETED) - break; - } - if (i == entries) - for (i = 0; i < entries; ++i) { - descent = pindex->index[i]; - if (descent->state != WT_REF_DELETED) - break; - } - if (i == entries || descent == NULL) { - /* - * Discard the currently held page and restart from the - * root. - */ - WT_RET(__wt_page_release(session, current, 0)); - if (--retry > 0) - goto restart; - return (WT_NOTFOUND); - } - - /* - * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search at the root. - * - * On other error, simply return, the swap call ensures we're - * holding nothing on failure. - */ - if ((ret = __wt_page_swap( - session, current, descent, WT_READ_RESTART_OK)) == 0) { - current = descent; - continue; - } - if (ret == WT_RESTART) - goto restart; - return (ret); - } - - cbt->ref = current; - return (0); -} diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index efe056aee02..42fe4d4608e 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1654,10 +1654,29 @@ __evict_walk_file(WT_SESSION_IMPL *session, !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) min_pages *= 10; + /* + * Choose a random point in the tree if looking for candidates in a + * tree with no starting point set. This is mostly aimed at ensuring + * eviction fairly visits all pages in trees with a lot of in-cache + * content. + */ + if (btree->evict_ref == NULL) { + /* Ensure internal pages indexes remain valid for our walk */ + WT_WITH_PAGE_INDEX(session, ret = + __wt_random_descent(session, &btree->evict_ref, true)); + WT_RET_NOTFOUND_OK(ret); + + /* + * Reverse the direction of the walk each time we start at a + * random point so both ends of the tree are equally likely to + * be visited. 
+ */ + btree->evict_walk_reverse = !btree->evict_walk_reverse; + } + walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; - /* Randomize the walk direction. */ if (btree->evict_walk_reverse) FLD_SET(walk_flags, WT_READ_PREV); @@ -1799,13 +1818,6 @@ fast: /* If the page can't be evicted, give up. */ WT_STAT_CONN_INCRV( session, cache_eviction_pages_queued, (u_int)(evict - start)); - /* - * If gave up the walk, reverse the direction of the walk and skip it - * next time. - */ - if (give_up) - btree->evict_walk_reverse = !btree->evict_walk_reverse; - /* * If we couldn't find the number of pages we were looking for, skip * the tree next time. diff --git a/src/include/extern.h b/src/include/extern.h index 836a7cb1ae6..8e55077c2a9 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -98,6 +98,7 @@ extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_A extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -105,7 +106,6 @@ extern int 
__wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((w extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -150,6 +150,9 @@ extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern 
int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags @@ -193,8 +196,6 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_las_stats_update(WT_SESSION_IMPL *session) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -- cgit v1.2.1 From 92c48cfcd9c66ba66386fd48ca326ec750057d86 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Tue, 14 Feb 2017 00:56:29 -0500 Subject: WT-2909 Add a custom file system fault injection test to verify checkpoint integrity (#3272) Implement a custom file system, and use it via a test case to add validate checkpoint integrity in the face of file-system level errors. --- dist/s_string.ok | 2 + dist/s_void | 1 + ext/test/fail_fs/fail_fs.c | 197 ++++++-- test/csuite/Makefile.am | 3 + test/csuite/wt2909_checkpoint_integrity/main.c | 660 +++++++++++++++++++++++++ test/utility/misc.c | 2 +- test/utility/test_util.h | 2 +- 7 files changed, 827 insertions(+), 40 deletions(-) create mode 100644 test/csuite/wt2909_checkpoint_integrity/main.c diff --git a/dist/s_string.ok b/dist/s_string.ok index d2e9dffaa48..e033f77327f 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -770,6 +770,7 @@ idx ifdef ifdef's iiSii +iiiS iiii iiu ikey @@ -1138,6 +1139,7 @@ subgetraw subgets subinit sublicense +subtest subtree sunique superset diff --git a/dist/s_void b/dist/s_void index 947153e730b..90425d5a718 100755 --- a/dist/s_void +++ b/dist/s_void @@ -82,6 +82,7 @@ func_ok() -e '/int fail_file_sync$/d' \ -e '/int fail_fs_directory_list_free$/d' \ -e '/int fail_fs_exist$/d' \ + -e '/int fail_fs_simulate_fail$/d' \ -e '/int fail_fs_terminate$/d' \ -e '/int handle_message$/d' \ -e '/int handle_progress$/d' \ diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index 29d469768c5..a6376ce203b 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -35,16 +35,29 @@ #include #include 
#include +#include #include #include "queue.h" -#define FAIL_FS_GIGABYTE (1024 * 1024 * 1024) +#define FAIL_FS_GIGABYTE (1024 * 1024 * 1024) + +#define FAIL_FS_ENV_ENABLE "WT_FAIL_FS_ENABLE" +#define FAIL_FS_ENV_WRITE_ALLOW "WT_FAIL_FS_WRITE_ALLOW" +#define FAIL_FS_ENV_READ_ALLOW "WT_FAIL_FS_READ_ALLOW" /* * A "fail file system", that is, a file system extension that fails when we - * want it to. This is only used in test frameworks, this fact allows us - * to simplify some error paths. + * want it to. This is only used in test frameworks, this fact allows us to + * simplify some error paths. This code is not portable to Windows, as it has + * direct knowledge of file descriptors, environment variables and stack + * traces. + * + * When the filesystem extension is configured, parameters can set how many + * reads or writes can be allowed before failure. If this is not fine-grained + * enough, an 'environment' configuration parameter can be specified. If that + * is used, then on every file system read or write, environment variables are + * checked that control when reading or writing should fail. */ typedef struct { WT_FILE_SYSTEM iface; @@ -54,6 +67,9 @@ typedef struct { * uses a single, global file system lock. 
*/ pthread_rwlock_t lock; /* Lock */ + bool fail_enabled; + bool use_environment; + bool verbose; int64_t read_ops; int64_t write_ops; int64_t allow_reads; @@ -86,12 +102,12 @@ static int fail_file_truncate(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t); static int fail_file_write( WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, const void *); static bool fail_fs_arg( - const char *match, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, - int64_t *argp); + const char *, WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, int64_t *); static int fail_fs_directory_list(WT_FILE_SYSTEM *, WT_SESSION *, const char *, const char *, char ***, uint32_t *); static int fail_fs_directory_list_free( WT_FILE_SYSTEM *, WT_SESSION *, char **, uint32_t); +static void fail_fs_env(const char *, int64_t *); static int fail_fs_exist(WT_FILE_SYSTEM *, WT_SESSION *, const char *, bool *); static int fail_fs_open(WT_FILE_SYSTEM *, WT_SESSION *, const char *, WT_FS_OPEN_FILE_TYPE, uint32_t, WT_FILE_HANDLE **); @@ -99,6 +115,8 @@ static int fail_fs_remove( WT_FILE_SYSTEM *, WT_SESSION *, const char *, uint32_t); static int fail_fs_rename( WT_FILE_SYSTEM *, WT_SESSION *, const char *, const char *, uint32_t); +static int fail_fs_simulate_fail( + FAIL_FILE_HANDLE *, WT_SESSION *, int64_t, const char *); static int fail_fs_size( WT_FILE_SYSTEM *, WT_SESSION *, const char *, wt_off_t *); static int fail_fs_terminate(WT_FILE_SYSTEM *, WT_SESSION *); @@ -145,8 +163,12 @@ fail_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) fail_fh = (FAIL_FILE_HANDLE *)file_handle; + /* + * We don't actually open an fd when opening directories for flushing, + * so ignore that case here. 
+ */ if (fail_fh->fd < 0) - return (EINVAL); + return (0); ret = close(fail_fh->fd); fail_fh->fd = -1; fail_file_handle_remove(session, fail_fh); @@ -198,7 +220,7 @@ fail_file_read(WT_FILE_HANDLE *file_handle, FAIL_FILE_HANDLE *fail_fh; FAIL_FILE_SYSTEM *fail_fs; WT_EXTENSION_API *wtext; - int64_t read_ops; + int64_t envint, read_ops; int ret; size_t chunk; ssize_t nr; @@ -207,19 +229,34 @@ fail_file_read(WT_FILE_HANDLE *file_handle, fail_fh = (FAIL_FILE_HANDLE *)file_handle; fail_fs = fail_fh->fail_fs; wtext = fail_fs->wtext; + read_ops = 0; ret = 0; fail_fs_lock(&fail_fs->lock); - read_ops = ++fail_fs->read_ops; + + if (fail_fs->use_environment) { + fail_fs_env(FAIL_FS_ENV_ENABLE, &envint); + if (envint != 0) { + if (!fail_fs->fail_enabled) { + fail_fs->fail_enabled = true; + fail_fs_env(FAIL_FS_ENV_READ_ALLOW, + &fail_fs->allow_reads); + fail_fs->read_ops = 0; + } + read_ops = ++fail_fs->read_ops; + } else + fail_fs->fail_enabled = false; + } else + read_ops = ++fail_fs->read_ops; + fail_fs_unlock(&fail_fs->lock); - if (fail_fs->allow_reads != 0 && read_ops % fail_fs->allow_reads == 0) { - (void)wtext->msg_printf(wtext, session, - "fail_fs: %s: simulated failure after %" PRId64 - " reads\n", fail_fh->iface.name, read_ops); - return (EIO); - } + if (fail_fs->fail_enabled && fail_fs->allow_reads != 0 && + read_ops % fail_fs->allow_reads == 0) + return (fail_fs_simulate_fail( + fail_fh, session, read_ops, "read")); + /* Break reads larger than 1GB into 1GB chunks. */ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { chunk = (len < FAIL_FS_GIGABYTE) ? len : FAIL_FS_GIGABYTE; if ((nr = pread(fail_fh->fd, addr, chunk, offset)) <= 0) { @@ -262,7 +299,7 @@ fail_file_size( /* * fail_file_sync -- * Ensure the content of the file is stable. This is a no-op in our - * memory backed file system. + * file system. 
*/ static int fail_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *session) @@ -300,7 +337,7 @@ fail_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session, FAIL_FILE_HANDLE *fail_fh; FAIL_FILE_SYSTEM *fail_fs; WT_EXTENSION_API *wtext; - int64_t write_ops; + int64_t envint, write_ops; int ret; size_t chunk; ssize_t nr; @@ -309,19 +346,32 @@ fail_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session, fail_fh = (FAIL_FILE_HANDLE *)file_handle; fail_fs = fail_fh->fail_fs; wtext = fail_fs->wtext; + write_ops = 0; ret = 0; fail_fs_lock(&fail_fs->lock); - write_ops = ++fail_fs->write_ops; + + if (fail_fs->use_environment) { + fail_fs_env(FAIL_FS_ENV_ENABLE, &envint); + if (envint != 0) { + if (!fail_fs->fail_enabled) { + fail_fs->fail_enabled = true; + fail_fs_env(FAIL_FS_ENV_WRITE_ALLOW, + &fail_fs->allow_writes); + fail_fs->write_ops = 0; + } + write_ops = ++fail_fs->write_ops; + } else + fail_fs->fail_enabled = false; + } else + write_ops = ++fail_fs->write_ops; + fail_fs_unlock(&fail_fs->lock); - if (fail_fs->allow_writes != 0 && - write_ops % fail_fs->allow_writes == 0) { - (void)wtext->msg_printf(wtext, session, - "fail_fs: %s: simulated failure after %" PRId64 - " writes\n", fail_fh->iface.name, write_ops); - return (EIO); - } + if (fail_fs->fail_enabled && fail_fs->allow_writes != 0 && + write_ops % fail_fs->allow_writes == 0) + return (fail_fs_simulate_fail( + fail_fh, session, write_ops, "write")); /* Break writes larger than 1GB into 1GB chunks. 
*/ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { @@ -348,17 +398,12 @@ static bool fail_fs_arg(const char *match, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, int64_t *argp) { - char *s; - int64_t result; - if (strncmp(match, key->str, key->len) == 0 && - match[key->len] == '\0') { - s = (char *)value->str; - result = strtoll(s, &s, 10); - if ((size_t)(s - (char *)value->str) == value->len) { - *argp = result; - return (true); - } + match[key->len] == '\0' && + (value->type == WT_CONFIG_ITEM_BOOL || + value->type == WT_CONFIG_ITEM_NUM)) { + *argp = value->val; + return (true); } return (false); } @@ -453,6 +498,30 @@ fail_fs_directory_list_free(WT_FILE_SYSTEM *file_system, return (0); } +/* + * fail_fs_env -- + * If the name is in the environment, return its integral value. + */ +static void +fail_fs_env(const char *name, int64_t *valp) +{ + int64_t result; + char *s, *value; + + result = 0; + if ((value = getenv(name)) != NULL) { + s = value; + if (strcmp(value, "true") == 0) + result = 1; + else if (strcmp(value, "false") != 0) { + result = strtoll(value, &s, 10); + if (*s != '\0') + result = 0; + } + } + *valp = result; +} + /* * fail_fs_exist -- * Return if the file exists. @@ -482,7 +551,6 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, WT_FILE_HANDLE *file_handle; int fd, open_flags, ret; - (void)file_type; /* Unused */ (void)session; /* Unused */ *file_handlep = NULL; @@ -492,6 +560,9 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, fd = -1; ret = 0; + if (fail_fs->verbose) + fprintf(stderr, "fail_fs: open: %s\n", name); + fail_fs_lock(&fail_fs->lock); open_flags = 0; @@ -504,7 +575,14 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, else open_flags |= O_RDWR; - if ((fd = open(name, open_flags, 0666)) < 0) { + /* + * Opening a file handle on a directory is only to support filesystems + * that require a directory sync for durability. This is a no-op + * for this file system. 
+ */ + if (file_type == WT_FS_OPEN_FILE_TYPE_DIRECTORY) + fd = -1; + else if ((fd = open(name, open_flags, 0666)) < 0) { ret = errno; goto err; } @@ -587,6 +665,38 @@ fail_fs_rename(WT_FILE_SYSTEM *file_system, return (rename(from, to)); } +/* + * fail_fs_simulate_fail -- + * Simulate a failure from this file system by reporting it + * and returning a non-zero return code. + */ +static int +fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, + int64_t nops, const char *opkind) +{ + FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; + int btret, i; + void *bt[100]; + char **btstr; + + fail_fs = fail_fh->fail_fs; + if (fail_fs->verbose) { + wtext = fail_fs->wtext; + (void)wtext->msg_printf(wtext, session, + "fail_fs: %s: simulated failure after %" PRId64 + " %s operations\n", fail_fh->iface.name, nops, opkind); + btret = backtrace(bt, sizeof(bt)/sizeof(bt[0])); + if ((btstr = backtrace_symbols(bt, btret)) != NULL) { + for (i = 0; i < btret; i++) + (void)wtext->msg_printf(wtext, session, " %s", + btstr[i]); + free(btstr); + } + } + return (EIO); +} + /* * fail_fs_size -- * Get the size of a file in bytes, by file name. 
@@ -641,6 +751,7 @@ wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config) WT_CONFIG_PARSER *config_parser; WT_EXTENSION_API *wtext; WT_FILE_SYSTEM *file_system; + int64_t argval; int ret; ret = 0; @@ -663,9 +774,17 @@ wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config) goto err; } while ((ret = config_parser->next(config_parser, &k, &v)) == 0) { - if (fail_fs_arg("allow_writes", &k, &v, &fail_fs->allow_writes)) + if (fail_fs_arg("environment", &k, &v, &argval)) { + fail_fs->use_environment = (argval != 0); + continue; + } else if (fail_fs_arg("verbose", &k, &v, &argval)) { + fail_fs->verbose = (argval != 0); + continue; + } else if (fail_fs_arg("allow_writes", &k, &v, + &fail_fs->allow_writes)) continue; - if (fail_fs_arg("allow_reads", &k, &v, &fail_fs->allow_reads)) + else if (fail_fs_arg("allow_reads", &k, &v, + &fail_fs->allow_reads)) continue; (void)wtext->err_printf(wtext, NULL, @@ -687,6 +806,8 @@ wiredtiger_extension_init(WT_CONNECTION *conn, WT_CONFIG_ARG *config) wtext->strerror(wtext, NULL, ret)); goto err; } + if (fail_fs->allow_writes != 0 || fail_fs->allow_reads != 0) + fail_fs->fail_enabled = true; fail_fs_allocate_lock(&fail_fs->lock); /* Initialize the in-memory jump table. 
*/ diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index 5167b42b433..0158d0c96d1 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -37,6 +37,9 @@ noinst_PROGRAMS += test_wt2834_join_bloom_fix test_wt2853_perf_SOURCES = wt2853_perf/main.c noinst_PROGRAMS += test_wt2853_perf +test_wt2909_checkpoint_integrity_SOURCES = wt2909_checkpoint_integrity/main.c +noinst_PROGRAMS += test_wt2909_checkpoint_integrity + test_wt2999_join_extractor_SOURCES = wt2999_join_extractor/main.c noinst_PROGRAMS += test_wt2999_join_extractor diff --git a/test/csuite/wt2909_checkpoint_integrity/main.c b/test/csuite/wt2909_checkpoint_integrity/main.c new file mode 100644 index 00000000000..efc459ff271 --- /dev/null +++ b/test/csuite/wt2909_checkpoint_integrity/main.c @@ -0,0 +1,660 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +#include +#include +#include + +/* + * JIRA ticket reference: WT-2909 + * Test case description: + * + * This test attempts to check the integrity of checkpoints by injecting + * failures (by means of a custom file system) and then trying to recover. To + * insulate the top level program from various crashes that may occur when + * injecting failures, the "populate" code runs in another process, and is + * expected to sometimes fail. Then the top level program runs recovery (with + * the normal file system) and checks the results. Any failure at the top level + * indicates a checkpoint integrity problem. + * + * Each subtest uses the same kind of schema and data, the only variance is + * when the faults are injected. At the moment, this test only injects during + * checkpoints, and only injects write failures. It varies in the number of + * successful writes that occur before an injected failure (during a checkpoint + * operation), this can be indicated with "-o N". When N is not specified, the + * test attempts to find the optimal range of N for testing. Clearly when N is + * large, then the checkpoint may be successfully written, and the data + * represented by the checkpoint will be fully present. When N is small, + * nothing of interest is written and no data is present. To find the sweet + * spot where interesting failures occur, the test does a binary search to find + * the approximate N that divides the "small" and "large" cases. This is not + * strictly deterministic, a given N may give different results on different + * runs. But approximate optimal N can be determined, allowing a series of + * additional tests clustered around this N. 
+ * + * The data is stored in two tables, one having indices. Both tables have + * the same keys and are updated with the same key in a single transaction. + * + * Failure mode: + * If one table is out of step with the other, that is detected as a failure at + * the top level. If an index is missing values (or has extra values), that is + * likewise a failure at the top level. If the tables or the home directory + * cannot be opened, that is a top level error. The tables must be present + * as an initial checkpoint is done without any injected fault. + */ + +/* + * This program does not run on Windows. The non-portable aspects at minimum + * are fork/exec the use of environment variables (used by fail_fs), and file + * name and build locations of dynamically loaded libraries. + */ +#define BIG_SIZE (1024 * 10) +#define BIG_CONTENTS "" +#define MAX_ARGS 20 +#define MAX_OP_RANGE 1000 +#define STDERR_FILE "stderr.txt" +#define STDOUT_FILE "stdout.txt" +#define TESTS_PER_OP_VALUE 3 +#define VERBOSE_PRINT 10000 + +static int check_results(TEST_OPTS *, uint64_t *); +static void check_values(WT_CURSOR *, int, int, int, char *); +static int create_big_string(char **); +static void cursor_count_items(WT_CURSOR *, uint64_t *); +static void disable_failures(void); +static void enable_failures(uint64_t, uint64_t); +static void generate_key(uint32_t, int *); +static void generate_value(uint32_t, uint32_t, char *, int *, int *, int *, + char **); +static void run_check_subtest(TEST_OPTS *, const char *, uint64_t, bool, + uint64_t *); +static void run_check_subtest_range(TEST_OPTS *, const char *, bool); +static int run_process(TEST_OPTS *, const char *, char *[], int *); +static int subtest_main(int, char *[], bool); +static void subtest_populate(TEST_OPTS *, bool); +int main(int, char *[]); + +extern int __wt_optind; + +#define WT_FAIL_FS_LIB "../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so" + +/* + * check_results -- + * Check all the tables and verify the results. 
+ */ +static int +check_results(TEST_OPTS *opts, uint64_t *foundp) +{ + WT_CURSOR *maincur, *maincur2, *v0cur, *v1cur, *v2cur; + WT_SESSION *session; + uint64_t count, idxcount, nrecords; + uint32_t rndint; + int key, key_got, ret, v0, v1, v2; + char *bigref, *big; + + testutil_check(create_big_string(&bigref)); + nrecords = opts->nrecords; + testutil_check(wiredtiger_open(opts->home, NULL, + "create,log=(enabled)", &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + testutil_check(session->open_cursor(session, "table:subtest", NULL, + NULL, &maincur)); + testutil_check(session->open_cursor(session, "table:subtest2", NULL, + NULL, &maincur2)); + testutil_check(session->open_cursor(session, "index:subtest:v0", NULL, + NULL, &v0cur)); + testutil_check(session->open_cursor(session, "index:subtest:v1", NULL, + NULL, &v1cur)); + testutil_check(session->open_cursor(session, "index:subtest:v2", NULL, + NULL, &v2cur)); + + count = 0; + while ((ret = maincur->next(maincur)) == 0) { + testutil_check(maincur2->next(maincur2)); + testutil_check(maincur2->get_key(maincur2, &key_got)); + testutil_check(maincur2->get_value(maincur2, &rndint)); + + generate_key((uint32_t)count, &key); + generate_value(rndint, (uint32_t)count, + bigref, &v0, &v1, &v2, &big); + testutil_assert(key == key_got); + + /* Check the key/values in main table. */ + testutil_check(maincur->get_key(maincur, &key_got)); + testutil_assert(key == key_got); + check_values(maincur, v0, v1, v2, big); + + /* Check the values in the indices. 
*/ + v0cur->set_key(v0cur, v0); + testutil_check(v0cur->search(v0cur)); + check_values(v0cur, v0, v1, v2, big); + v1cur->set_key(v1cur, v1); + testutil_check(v1cur->search(v1cur)); + check_values(v1cur, v0, v1, v2, big); + v2cur->set_key(v2cur, v2); + testutil_check(v2cur->search(v2cur)); + check_values(v2cur, v0, v1, v2, big); + + count++; + if (count % VERBOSE_PRINT == 0 && opts->verbose) + printf("checked %" PRIu64 "/%" PRIu64 "\n", count, + nrecords); + } + if (count % VERBOSE_PRINT != 0 && opts->verbose) + printf("checked %" PRIu64 "/%" PRIu64 "\n", count, nrecords); + + /* + * Always expect at least one entry, as populate does a + * checkpoint after the first insert. + */ + testutil_assert(count > 0); + testutil_assert(ret == WT_NOTFOUND); + testutil_assert(maincur2->next(maincur2) == WT_NOTFOUND); + cursor_count_items(v0cur, &idxcount); + testutil_assert(count == idxcount); + cursor_count_items(v1cur, &idxcount); + testutil_assert(count == idxcount); + cursor_count_items(v2cur, &idxcount); + testutil_assert(count == idxcount); + + testutil_check(opts->conn->close(opts->conn, NULL)); + opts->conn = NULL; + + free(bigref); + *foundp = count; + return (0); +} + +/* + * check_values -- + * Check that the values in the cursor match the given values. + */ +static void +check_values(WT_CURSOR *cursor, int v0, int v1, int v2, char *big) +{ + int v0_got, v1_got, v2_got; + char *big_got; + + testutil_check(cursor->get_value(cursor, &v0_got, &v1_got, &v2_got, + &big_got)); + testutil_assert(v0 == v0_got); + testutil_assert(v1 == v1_got); + testutil_assert(v2 == v2_got); + testutil_assert(strcmp(big, big_got) == 0); +} + +/* + * create_big_string -- + * Create and fill the "reference" big array. 
+ */ +static int create_big_string(char **bigp) +{ + size_t i, mod; + char *big; + + if ((big = malloc(BIG_SIZE + 1)) == NULL) + return (ENOMEM); + mod = strlen(BIG_CONTENTS); + for (i = 0; i < BIG_SIZE; i++) { + big[i] = BIG_CONTENTS[i % mod]; + } + big[BIG_SIZE] = '\0'; + *bigp = big; + return (0); +} + +/* + * cursor_count_items -- + * Count the number of items in the table by traversing + * through the cursor. + */ +static void +cursor_count_items(WT_CURSOR *cursor, uint64_t *countp) +{ + int ret; + + *countp = 0; + + cursor->reset(cursor); + while ((ret = cursor->next(cursor)) == 0) + (*countp)++; + testutil_assert(ret == WT_NOTFOUND); +} + +/* + * disable_failures -- + * Disable failures in the fail file system. + */ +static void +disable_failures(void) +{ + setenv("WT_FAIL_FS_ENABLE", "0", 1); +} + +/* + * enable_failures -- + * Enable failures in the fail file system. + */ +static void +enable_failures(uint64_t allow_writes, uint64_t allow_reads) +{ + char value[100]; + + setenv("WT_FAIL_FS_ENABLE", "1", 1); + snprintf(value, sizeof(value), "%" PRIu64, allow_writes); + setenv("WT_FAIL_FS_WRITE_ALLOW", value, 1); + snprintf(value, sizeof(value), "%" PRIu64, allow_reads); + setenv("WT_FAIL_FS_READ_ALLOW", value, 1); +} + +/* + * generate_key -- + * Generate a key used by the "subtest" and "subtest2" tables. + */ +static void +generate_key(uint32_t i, int *keyp) +{ + *keyp = (int)i; +} + +/* + * generate_value -- + * Generate values for the "subtest" table. + */ +static void +generate_value(uint32_t rndint, uint32_t i, char *bigref, + int *v0p, int *v1p, int *v2p, char **bigp) +{ + *v0p = (int)(i * 7); + *v1p = (int)(i * 10007); + *v2p = (int)(i * 100000007); + *bigp = &bigref[rndint % BIG_SIZE]; +} + +/* + * run_check_subtest -- + * Run the subtest with the given parameters and check the results. 
+ */ +static void +run_check_subtest(TEST_OPTS *opts, const char *debugger, uint64_t nops, + bool close_test, uint64_t *nresultsp) +{ + int narg; + int estatus; + char rarg[20], sarg[20]; + char *subtest_args[MAX_ARGS]; + + narg = 0; + if (debugger != NULL) { + subtest_args[narg++] = (char *)debugger; + subtest_args[narg++] = (char *)"--"; + } + + subtest_args[narg++] = (char *)opts->progname; + /* "subtest" must appear before arguments */ + if (close_test) + subtest_args[narg++] = (char *)"subtest_close"; + else + subtest_args[narg++] = (char *)"subtest"; + subtest_args[narg++] = (char *)"-h"; + subtest_args[narg++] = opts->home; + subtest_args[narg++] = (char *)"-v"; /* subtest is always verbose */ + subtest_args[narg++] = (char *)"-p"; + subtest_args[narg++] = (char *)"-o"; + snprintf(sarg, sizeof(sarg), "%" PRIu64, nops); + subtest_args[narg++] = sarg; /* number of operations */ + subtest_args[narg++] = (char *)"-n"; + snprintf(rarg, sizeof(rarg), "%" PRIu64, opts->nrecords); + subtest_args[narg++] = rarg; /* number of records */ + subtest_args[narg++] = NULL; + testutil_assert(narg <= MAX_ARGS); + if (opts->verbose) + printf("running a separate process with %" PRIu64 + " operations until fail...\n", nops); + testutil_clean_work_dir(opts->home); + testutil_check(run_process( + opts, debugger != NULL ? debugger : opts->progname, + subtest_args, &estatus)); + if (opts->verbose) + printf("process exited %d\n", estatus); + + /* + * Verify results in parent process. + */ + testutil_check(check_results(opts, nresultsp)); +} + +/* + * run_check_subtest_range -- + * + * Run successive tests via binary search that determines the approximate + * crossover point between when data is recoverable or not. Once that is + * determined, run the subtest in a range near that crossover point. + * + * The theory is that running at the crossover point will tend to trigger + * "interesting" failures at the borderline when the checkpoint is about to, + * or has, succeeded. 
If any of those failures creates a WT home directory + * that cannot be recovered, the top level test will fail. + */ +static void +run_check_subtest_range(TEST_OPTS *opts, const char *debugger, bool close_test) +{ + uint64_t cutoff, high, low, mid, nops, nresults; + int i; + bool got_failure, got_success; + + if (opts->verbose) + printf("Determining best range of operations until failure, " + "with close_test %s.\n", + (close_test ? "enabled" : "disabled")); + + run_check_subtest(opts, debugger, 1, close_test, &cutoff); + low = 0; + high = MAX_OP_RANGE; + mid = (low + high) / 2; + while (mid != low) { + run_check_subtest(opts, debugger, mid, close_test, + &nresults); + if (nresults > cutoff) + high = mid; + else + low = mid; + mid = (low + high) / 2; + } + /* + * mid is the number of ops that is the crossover point. + * Run some tests near that point to try to trigger weird + * failures. If mid is too low or too high, it indicates + * there is a fundamental problem with the test. + */ + testutil_assert(mid > 1 && mid < MAX_OP_RANGE - 1); + if (opts->verbose) + printf("Retesting around %" PRIu64 " operations.\n", + mid); + + got_failure = false; + got_success = false; + for (nops = mid - 10; nops < mid + 10; nops++) { + for (i = 0; i < TESTS_PER_OP_VALUE; i++) { + run_check_subtest(opts, debugger, nops, + close_test, &nresults); + if (nresults > cutoff) + got_failure = true; + else + got_success = true; + } + } + /* + * Check that it really ran with a crossover point. + */ + testutil_assert(got_failure); + testutil_assert(got_success); +} + +/* + * run_process -- + * Run a program with arguments, wait until it completes. 
+ */ +static int +run_process(TEST_OPTS *opts, const char *prog, char *argv[], int *status) +{ + int pid; + + if (opts->verbose) { + printf("running: "); + for (char **arg = argv; *arg != NULL; arg++) + printf("%s ", *arg); + printf("\n"); + } + if ((pid = fork()) == 0) { + execv(prog, argv); + } else if (pid < 0) + return (errno); + + waitpid(pid, status, 0); + return (0); +} + +/* + * subtest_main -- + * The main program for the subtest + */ +static int +subtest_main(int argc, char *argv[], bool close_test) +{ + TEST_OPTS *opts, _opts; + WT_SESSION *session; + char config[1024], filename[1024]; + + opts = &_opts; + if (testutil_disable_long_tests()) + return (0); + memset(opts, 0, sizeof(*opts)); + + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + /* Redirect stderr, stdout. */ + sprintf(filename, "%s/%s", opts->home, STDERR_FILE); + freopen(filename, "a", stderr); + sprintf(filename, "%s/%s", opts->home, STDOUT_FILE); + freopen(filename, "a", stdout); + snprintf(config, sizeof(config), + "create,cache_size=250M,log=(enabled)," + "transaction_sync=(enabled,method=none),extensions=(" + WT_FAIL_FS_LIB + "=(early_load,config={environment=true,verbose=true})]"); + + testutil_check(wiredtiger_open(opts->home, NULL, config, &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + testutil_check(session->create(session, "table:subtest", + "key_format=i,value_format=iiiS," + "columns=(id,v0,v1,v2,big)")); + + testutil_check(session->create(session, "table:subtest2", + "key_format=i,value_format=i")); + + testutil_check(session->create(session, "index:subtest:v0", + "columns=(v0)")); + testutil_check(session->create(session, "index:subtest:v1", + "columns=(v1)")); + testutil_check(session->create(session, "index:subtest:v2", + "columns=(v2)")); + + testutil_check(session->close(session, NULL)); + + subtest_populate(opts, close_test); + + testutil_cleanup(opts); + + return (0); +} + 
+/* + * This macro is used as a substitute for testutil_check, except that it is + * aware of when a failure may be expected due to the effects of the fail_fs. + * This macro is used only in subtest_populate(), it uses local variables. + */ +#define CHECK(expr) { \ + int _ret; \ + _ret = expr; \ + if (_ret != 0) { \ + if (!failmode || \ + (_ret != WT_RUN_RECOVERY && _ret != EIO)) { \ + fprintf(stderr, " BAD RETURN %d for \"%s\"\n", \ + _ret, #expr); \ + testutil_check(_ret); \ + } else \ + failed = true; \ + } \ +} + +/* + * subtest_populate -- + * Populate the tables. + */ +static void +subtest_populate(TEST_OPTS *opts, bool close_test) +{ + WT_CURSOR *maincur, *maincur2; + WT_RAND_STATE rnd; + WT_SESSION *session; + uint64_t nrecords; + uint32_t i, rndint; + int key, v0, v1, v2; + char *big, *bigref; + bool failed, failmode; + + failmode = failed = false; + __wt_random_init_seed(NULL, &rnd); + CHECK(create_big_string(&bigref)); + nrecords = opts->nrecords; + + CHECK(opts->conn->open_session( + opts->conn, NULL, NULL, &session)); + + CHECK(session->open_cursor(session, "table:subtest", NULL, + NULL, &maincur)); + + CHECK(session->open_cursor(session, "table:subtest2", NULL, + NULL, &maincur2)); + + for (i = 0; i < nrecords && !failed; i++) { + rndint = __wt_random(&rnd); + generate_key(i, &key); + generate_value(rndint, i, bigref, &v0, &v1, &v2, &big); + CHECK(session->begin_transaction(session, NULL)); + maincur->set_key(maincur, key); + maincur->set_value(maincur, v0, v1, v2, big); + CHECK(maincur->insert(maincur)); + + maincur2->set_key(maincur2, key); + maincur2->set_value(maincur2, rndint); + CHECK(maincur2->insert(maincur2)); + CHECK(session->commit_transaction(session, NULL)); + + if (i == 0) + /* + * Force an initial checkpoint, that helps to + * distinguish a clear failure from just not running + * long enough. 
+ */ + CHECK(session->checkpoint(session, NULL)); + + if ((i + 1) % VERBOSE_PRINT == 0 && opts->verbose) + printf(" %d/%" PRIu64 "\n", (i + 1), nrecords); + /* Attempt to isolate the failures to checkpointing. */ + if (i == (nrecords/100)) { + enable_failures(opts->nops, 1000000); + failmode = true; /* CHECK should expect failures. */ + CHECK(session->checkpoint(session, NULL)); + failmode = false; + disable_failures(); + if (failed && opts->verbose) + printf("checkpoint failed (expected).\n"); + } + } + + /* + * Closing handles after an extreme fail is likely to cause + * cascading failures (or crashes), so recommended practice is + * to immediately exit. We're interested in testing both with + * and without the recommended practice. + */ + if (failed) { + if (!close_test) { + fprintf(stderr, "exit early.\n"); + exit(0); + } else + fprintf(stderr, "closing after failure.\n"); + } + + free(bigref); + CHECK(maincur->close(maincur)); + CHECK(maincur2->close(maincur2)); + CHECK(session->close(session, NULL)); +} + +/* + * main -- + * The main program for the test. When invoked with "subtest" + * argument, run the subtest. Otherwise, run a separate process + * for each needed subtest, and check the results. 
+ */ +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + uint64_t nresults; + const char *debugger; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + debugger = NULL; + + testutil_check(testutil_parse_opts(argc, argv, opts)); + argc -= __wt_optind; + argv += __wt_optind; + if (opts->nrecords == 0) + opts->nrecords = 50000; + + while (argc > 0) { + if (strcmp(argv[0], "subtest") == 0) + return (subtest_main(argc, argv, false)); + else if (strcmp(argv[0], "subtest_close") == 0) + return (subtest_main(argc, argv, true)); + else if (strcmp(argv[0], "gdb") == 0) + debugger = "/usr/bin/gdb"; + else + testutil_assert(false); + argc--; + argv++; + } + if (opts->verbose) { + printf("Number of operations until failure: %" PRIu64 + " (change with -o N)\n", opts->nops); + printf("Number of records: %" PRIu64 + " (change with -n N)\n", opts->nrecords); + } + if (opts->nops == 0) { + run_check_subtest_range(opts, debugger, false); + run_check_subtest_range(opts, debugger, true); + } else + run_check_subtest(opts, debugger, opts->nops, + opts->nrecords, &nresults); + + testutil_clean_work_dir(opts->home); + testutil_cleanup(opts); + + return (0); +} diff --git a/test/utility/misc.c b/test/utility/misc.c index 1491c9a6938..1ba08ddd77f 100644 --- a/test/utility/misc.c +++ b/test/utility/misc.c @@ -78,7 +78,7 @@ testutil_work_dir_from_path(char *buffer, size_t len, const char *dir) * Remove the work directory. 
*/ void -testutil_clean_work_dir(char *dir) +testutil_clean_work_dir(const char *dir) { size_t len; int ret; diff --git a/test/utility/test_util.h b/test/utility/test_util.h index f6a9cd68e02..489bbe18d87 100644 --- a/test/utility/test_util.h +++ b/test/utility/test_util.h @@ -183,7 +183,7 @@ void *dmalloc(size_t); void *drealloc(void *, size_t); void *dstrdup(const void *); void *dstrndup(const char *, size_t); -void testutil_clean_work_dir(char *); +void testutil_clean_work_dir(const char *); void testutil_cleanup(TEST_OPTS *); bool testutil_disable_long_tests(void); void testutil_make_work_dir(char *); -- cgit v1.2.1 From e66634960eeaf60d1b13c26308053e0baf51030b Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 14 Feb 2017 08:36:14 -0500 Subject: WT-2909 Create automatable test verifying checkpoint integrity after errors (#3295) * WT-2909 Create automatable test verifying checkpoint integrity after errors Make gcc 4.7 work again. * Linux (Red Hat 5.3.1-6) declares backtrace(3) to return an int, FreeBSD (10.3-RELEASE-p11) declares it to return a size_t. * Remove repeated #include files, check for error returns from a few functions. * The Linux/FreeBSD backtrace() calls are fundamentally incompatible, add an #ifdef. 
--- ext/test/fail_fs/fail_fs.c | 13 +++++++--- test/csuite/wt2909_checkpoint_integrity/main.c | 33 +++++++++++++------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index a6376ce203b..0ea4a7d5e00 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -95,8 +95,7 @@ static void fail_file_handle_remove(WT_SESSION *, FAIL_FILE_HANDLE *); static int fail_file_lock(WT_FILE_HANDLE *, WT_SESSION *, bool); static int fail_file_read( WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, void *); -static int fail_file_size( - WT_FILE_HANDLE *, WT_SESSION *, wt_off_t *); +static int fail_file_size(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t *); static int fail_file_sync(WT_FILE_HANDLE *, WT_SESSION *); static int fail_file_truncate(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t); static int fail_file_write( @@ -676,7 +675,11 @@ fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, { FAIL_FILE_SYSTEM *fail_fs; WT_EXTENSION_API *wtext; +#ifdef __linux__ int btret, i; +#else + size_t btret, i; +#endif void *bt[100]; char **btstr; @@ -686,7 +689,11 @@ fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, (void)wtext->msg_printf(wtext, session, "fail_fs: %s: simulated failure after %" PRId64 " %s operations\n", fail_fh->iface.name, nops, opkind); - btret = backtrace(bt, sizeof(bt)/sizeof(bt[0])); +#ifdef __linux__ + btret = backtrace(bt, (int)(sizeof(bt) / sizeof(bt[0]))); +#else + btret = backtrace(bt, sizeof(bt) / sizeof(bt[0])); +#endif if ((btstr = backtrace_symbols(bt, btret)) != NULL) { for (i = 0; i < btret; i++) (void)wtext->msg_printf(wtext, session, " %s", diff --git a/test/csuite/wt2909_checkpoint_integrity/main.c b/test/csuite/wt2909_checkpoint_integrity/main.c index efc459ff271..bf7f86cfd07 100644 --- a/test/csuite/wt2909_checkpoint_integrity/main.c +++ b/test/csuite/wt2909_checkpoint_integrity/main.c @@ -27,8 +27,6 @@ */ #include "test_util.h" 
-#include -#include #include /* @@ -243,7 +241,7 @@ cursor_count_items(WT_CURSOR *cursor, uint64_t *countp) *countp = 0; - cursor->reset(cursor); + testutil_check(cursor->reset(cursor)); while ((ret = cursor->next(cursor)) == 0) (*countp)++; testutil_assert(ret == WT_NOTFOUND); @@ -256,7 +254,7 @@ cursor_count_items(WT_CURSOR *cursor, uint64_t *countp) static void disable_failures(void) { - setenv("WT_FAIL_FS_ENABLE", "0", 1); + testutil_check(setenv("WT_FAIL_FS_ENABLE", "0", 1)); } /* @@ -268,11 +266,11 @@ enable_failures(uint64_t allow_writes, uint64_t allow_reads) { char value[100]; - setenv("WT_FAIL_FS_ENABLE", "1", 1); + testutil_check(setenv("WT_FAIL_FS_ENABLE", "1", 1)); snprintf(value, sizeof(value), "%" PRIu64, allow_writes); - setenv("WT_FAIL_FS_WRITE_ALLOW", value, 1); + testutil_check(setenv("WT_FAIL_FS_WRITE_ALLOW", value, 1)); snprintf(value, sizeof(value), "%" PRIu64, allow_reads); - setenv("WT_FAIL_FS_READ_ALLOW", value, 1); + testutil_check(setenv("WT_FAIL_FS_READ_ALLOW", value, 1)); } /* @@ -307,10 +305,8 @@ static void run_check_subtest(TEST_OPTS *opts, const char *debugger, uint64_t nops, bool close_test, uint64_t *nresultsp) { - int narg; - int estatus; - char rarg[20], sarg[20]; - char *subtest_args[MAX_ARGS]; + int estatus, narg; + char rarg[20], sarg[20], *subtest_args[MAX_ARGS]; narg = 0; if (debugger != NULL) { @@ -427,19 +423,21 @@ static int run_process(TEST_OPTS *opts, const char *prog, char *argv[], int *status) { int pid; + char **arg; if (opts->verbose) { printf("running: "); - for (char **arg = argv; *arg != NULL; arg++) + for (arg = argv; *arg != NULL; arg++) printf("%s ", *arg); printf("\n"); } if ((pid = fork()) == 0) { - execv(prog, argv); + (void)execv(prog, argv); + testutil_die(errno, "%s", prog); } else if (pid < 0) return (errno); - waitpid(pid, status, 0); + (void)waitpid(pid, status, 0); return (0); } @@ -464,9 +462,9 @@ subtest_main(int argc, char *argv[], bool close_test) /* Redirect stderr, stdout. 
*/ sprintf(filename, "%s/%s", opts->home, STDERR_FILE); - freopen(filename, "a", stderr); + testutil_assert(freopen(filename, "a", stderr) != NULL); sprintf(filename, "%s/%s", opts->home, STDOUT_FILE); - freopen(filename, "a", stdout); + testutil_assert(freopen(filename, "a", stdout) != NULL); snprintf(config, sizeof(config), "create,cache_size=250M,log=(enabled)," "transaction_sync=(enabled,method=none),extensions=(" @@ -572,7 +570,8 @@ subtest_populate(TEST_OPTS *opts, bool close_test) CHECK(session->checkpoint(session, NULL)); if ((i + 1) % VERBOSE_PRINT == 0 && opts->verbose) - printf(" %d/%" PRIu64 "\n", (i + 1), nrecords); + printf(" %" PRIu32 "/%" PRIu64 "\n", + (i + 1), nrecords); /* Attempt to isolate the failures to checkpointing. */ if (i == (nrecords/100)) { enable_failures(opts->nops, 1000000); -- cgit v1.2.1 From 152d4778f58fe8d9448c530c7cda07801499e8d7 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 14 Feb 2017 10:57:35 -0500 Subject: WT-2909 Create automatable test verifying checkpoint integrity after errors (#3296) FreeBSD's backtrace is the outlier, everybody else (OS X, Solaris, Linux) is using int types, not size_t. 
--- ext/test/fail_fs/fail_fs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index 0ea4a7d5e00..9445dbf9aca 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -675,10 +675,10 @@ fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, { FAIL_FILE_SYSTEM *fail_fs; WT_EXTENSION_API *wtext; -#ifdef __linux__ - int btret, i; -#else +#ifdef __FreeBSD__ size_t btret, i; +#else + int btret, i; #endif void *bt[100]; char **btstr; @@ -689,10 +689,10 @@ fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, (void)wtext->msg_printf(wtext, session, "fail_fs: %s: simulated failure after %" PRId64 " %s operations\n", fail_fh->iface.name, nops, opkind); -#ifdef __linux__ - btret = backtrace(bt, (int)(sizeof(bt) / sizeof(bt[0]))); -#else +#ifdef __FreeBSD__ btret = backtrace(bt, sizeof(bt) / sizeof(bt[0])); +#else + btret = backtrace(bt, (int)(sizeof(bt) / sizeof(bt[0]))); #endif if ((btstr = backtrace_symbols(bt, btret)) != NULL) { for (i = 0; i < btret; i++) -- cgit v1.2.1 From a6a0483f2b4f1617bc1aa1179685b74bad990290 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Tue, 14 Feb 2017 14:30:51 -0500 Subject: WT-3180 bug fix: disable long tests in the top-level main program, (#3298) rather than the subtest. Disable core files for the subtest, as they are rarely interesting. Fix some uint64 values/parameters that were declared as uint32. 
--- test/csuite/wt2909_checkpoint_integrity/main.c | 29 ++++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/test/csuite/wt2909_checkpoint_integrity/main.c b/test/csuite/wt2909_checkpoint_integrity/main.c index bf7f86cfd07..ddf249fb406 100644 --- a/test/csuite/wt2909_checkpoint_integrity/main.c +++ b/test/csuite/wt2909_checkpoint_integrity/main.c @@ -27,6 +27,8 @@ */ #include "test_util.h" +#include +#include #include /* @@ -87,8 +89,8 @@ static int create_big_string(char **); static void cursor_count_items(WT_CURSOR *, uint64_t *); static void disable_failures(void); static void enable_failures(uint64_t, uint64_t); -static void generate_key(uint32_t, int *); -static void generate_value(uint32_t, uint32_t, char *, int *, int *, int *, +static void generate_key(uint64_t, int *); +static void generate_value(uint32_t, uint64_t, char *, int *, int *, int *, char **); static void run_check_subtest(TEST_OPTS *, const char *, uint64_t, bool, uint64_t *); @@ -140,9 +142,8 @@ check_results(TEST_OPTS *opts, uint64_t *foundp) testutil_check(maincur2->get_key(maincur2, &key_got)); testutil_check(maincur2->get_value(maincur2, &rndint)); - generate_key((uint32_t)count, &key); - generate_value(rndint, (uint32_t)count, - bigref, &v0, &v1, &v2, &big); + generate_key(count, &key); + generate_value(rndint, count, bigref, &v0, &v1, &v2, &big); testutil_assert(key == key_got); /* Check the key/values in main table. */ @@ -278,7 +279,7 @@ enable_failures(uint64_t allow_writes, uint64_t allow_reads) * Generate a key used by the "subtest" and "subtest2" tables. */ static void -generate_key(uint32_t i, int *keyp) +generate_key(uint64_t i, int *keyp) { *keyp = (int)i; } @@ -288,7 +289,7 @@ generate_key(uint32_t i, int *keyp) * Generate values for the "subtest" table. 
*/ static void -generate_value(uint32_t rndint, uint32_t i, char *bigref, +generate_value(uint32_t rndint, uint64_t i, char *bigref, int *v0p, int *v1p, int *v2p, char **bigp) { *v0p = (int)(i * 7); @@ -451,12 +452,16 @@ subtest_main(int argc, char *argv[], bool close_test) TEST_OPTS *opts, _opts; WT_SESSION *session; char config[1024], filename[1024]; + struct rlimit rlim; - opts = &_opts; if (testutil_disable_long_tests()) return (0); + opts = &_opts; memset(opts, 0, sizeof(*opts)); + memset(&rlim, 0, sizeof(rlim)); + /* No core files during fault injection tests. */ + testutil_check(setrlimit(RLIMIT_CORE, &rlim)); testutil_check(testutil_parse_opts(argc, argv, opts)); testutil_make_work_dir(opts->home); @@ -527,8 +532,8 @@ subtest_populate(TEST_OPTS *opts, bool close_test) WT_CURSOR *maincur, *maincur2; WT_RAND_STATE rnd; WT_SESSION *session; - uint64_t nrecords; - uint32_t i, rndint; + uint64_t i, nrecords; + uint32_t rndint; int key, v0, v1, v2; char *big, *bigref; bool failed, failmode; @@ -570,7 +575,7 @@ subtest_populate(TEST_OPTS *opts, bool close_test) CHECK(session->checkpoint(session, NULL)); if ((i + 1) % VERBOSE_PRINT == 0 && opts->verbose) - printf(" %" PRIu32 "/%" PRIu64 "\n", + printf(" %" PRIu64 "/%" PRIu64 "\n", (i + 1), nrecords); /* Attempt to isolate the failures to checkpointing. */ if (i == (nrecords/100)) { @@ -617,6 +622,8 @@ main(int argc, char *argv[]) uint64_t nresults; const char *debugger; + if (testutil_disable_long_tests()) + return (0); opts = &_opts; memset(opts, 0, sizeof(*opts)); debugger = NULL; -- cgit v1.2.1 From a53bb9683b7f8e4fda3c6272ec8224857e756ba8 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Tue, 14 Feb 2017 16:30:53 -0500 Subject: WT-3179 test bug: clang sanitizer failure in fail_fs #3300 hold the fs lock while manipulating the list of file handles. 
--- ext/test/fail_fs/fail_fs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index 9445dbf9aca..cb87b43bfd9 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -156,11 +156,13 @@ static int fail_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) { FAIL_FILE_HANDLE *fail_fh; + FAIL_FILE_SYSTEM *fail_fs; int ret; (void)session; /* Unused */ fail_fh = (FAIL_FILE_HANDLE *)file_handle; + fail_fs = fail_fh->fail_fs; /* * We don't actually open an fd when opening directories for flushing, @@ -170,14 +172,16 @@ fail_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session) return (0); ret = close(fail_fh->fd); fail_fh->fd = -1; + fail_fs_lock(&fail_fs->lock); fail_file_handle_remove(session, fail_fh); + fail_fs_unlock(&fail_fs->lock); return (ret); } /* * fail_file_handle_remove -- * Destroy an in-memory file handle. Should only happen on remove or - * shutdown. + * shutdown. The file system lock must be held during this call. */ static void fail_file_handle_remove(WT_SESSION *session, FAIL_FILE_HANDLE *fail_fh) -- cgit v1.2.1 From 7a725a97d281095280515b0609f0e61747fd1b58 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Tue, 14 Feb 2017 17:21:07 -0500 Subject: WT-3179 test bug: clang sanitizer failure in fail_fs Replaced a fprintf call, and cleaned up a call to access system call. 
--- ext/test/fail_fs/fail_fs.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ext/test/fail_fs/fail_fs.c b/ext/test/fail_fs/fail_fs.c index cb87b43bfd9..d0d8a14c8c2 100644 --- a/ext/test/fail_fs/fail_fs.c +++ b/ext/test/fail_fs/fail_fs.c @@ -536,7 +536,7 @@ fail_fs_exist(WT_FILE_SYSTEM *file_system, (void)file_system; /* Unused */ (void)session; /* Unused */ - *existp = (access(name, 0) == 0); + *existp = (access(name, F_OK) == 0); return (0); } @@ -551,6 +551,7 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, { FAIL_FILE_HANDLE *fail_fh; FAIL_FILE_SYSTEM *fail_fs; + WT_EXTENSION_API *wtext; WT_FILE_HANDLE *file_handle; int fd, open_flags, ret; @@ -563,8 +564,11 @@ fail_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session, fd = -1; ret = 0; - if (fail_fs->verbose) - fprintf(stderr, "fail_fs: open: %s\n", name); + if (fail_fs->verbose) { + wtext = fail_fs->wtext; + (void)wtext->msg_printf(wtext, session, "fail_fs: open: %s", + name); + } fail_fs_lock(&fail_fs->lock); @@ -692,7 +696,7 @@ fail_fs_simulate_fail(FAIL_FILE_HANDLE *fail_fh, WT_SESSION *session, wtext = fail_fs->wtext; (void)wtext->msg_printf(wtext, session, "fail_fs: %s: simulated failure after %" PRId64 - " %s operations\n", fail_fh->iface.name, nops, opkind); + " %s operations", fail_fh->iface.name, nops, opkind); #ifdef __FreeBSD__ btret = backtrace(bt, sizeof(bt) / sizeof(bt[0])); #else -- cgit v1.2.1 From 70b5ab64d84cb8a22553def853ddb1a11393ff73 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Wed, 15 Feb 2017 18:08:10 +1100 Subject: WT-3149 Make random lookups for eviction more lightweight. (#3302) Eviction walks don't need to start on leaf pages: just try to descend through the tree and as soon as we can't swap to a child page, start the walk from the parent. 
--- src/btree/bt_random.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c index 3cc6838c4c8..44de511f787 100644 --- a/src/btree/bt_random.c +++ b/src/btree/bt_random.c @@ -166,7 +166,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) /* * __wt_random_descent -- - * Find a random leaf page in a tree. + * Find a random page in a tree for either sampling or eviction. */ int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) @@ -183,9 +183,11 @@ __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) retry = 100; /* Eviction should not be tapped to do eviction. */ - flags = WT_READ_RESTART_OK; if (eviction) - LF_SET(WT_READ_NO_EVICT); + flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | + WT_READ_NO_WAIT | WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK; + else + flags = WT_READ_RESTART_OK; if (0) { restart: /* @@ -205,6 +207,13 @@ restart: /* WT_INTL_INDEX_GET(session, page, pindex); entries = pindex->entries; + /* Eviction just wants any random child. */ + if (eviction) { + descent = pindex->index[ + __wt_random(&session->rnd) % entries]; + goto descend; + } + /* * There may be empty pages in the tree, and they're useless to * us. If we don't find a non-empty page in "entries" random @@ -212,10 +221,8 @@ restart: /* * search page contains nothing other than empty pages, restart * from the root some number of times before giving up. * - * Eviction is only looking for a place in the cache and so only - * wants in-memory pages (but a deleted page is fine); currently - * our other caller is looking for a key/value pair on a random - * leave page, and so will accept any page that contains a valid + * Random sampling is looking for a key/value pair on a random + * leaf page, and so will accept any page that contains a valid * key/value pair, so on-disk is fine, but deleted is not. 
*/ descent = NULL; @@ -223,15 +230,14 @@ restart: /* descent = pindex->index[__wt_random(&session->rnd) % entries]; if (descent->state == WT_REF_MEM || - (!eviction && descent->state == WT_REF_DISK)) + descent->state == WT_REF_DISK) break; } if (i == entries) for (i = 0; i < entries; ++i) { descent = pindex->index[i]; if (descent->state == WT_REF_MEM || - (!eviction && - descent->state == WT_REF_DISK)) + descent->state == WT_REF_DISK) break; } if (i == entries || descent == NULL) { @@ -249,17 +255,25 @@ restart: /* * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ - if ((ret = +descend: if ((ret = __wt_page_swap(session, current, descent, flags)) == 0) { current = descent; continue; } + if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART)) + break; if (ret == WT_RESTART) goto restart; return (ret); } - *refp = current; + /* + * There is no point starting with the root page: the walk will exit + * immediately. In that case we aren't holding a hazard pointer so + * there is nothing to release. + */ + if (!eviction || !__wt_ref_is_root(current)) + *refp = current; return (0); } -- cgit v1.2.1 From 83ce29217f0bebad1c0a86e4eb827a70216b4641 Mon Sep 17 00:00:00 2001 From: sueloverso Date: Wed, 15 Feb 2017 16:38:07 -0500 Subject: WT-3186 Fix error path and panic detection in logging loops. 
(#3304) --- src/include/extern.h | 2 +- src/log/log.c | 6 +++++- src/log/log_slot.c | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/include/extern.h b/src/include/extern.h index 8e55077c2a9..19ad9a880df 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -406,7 +406,7 @@ extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bo extern int __wt_log_slot_new(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/log/log.c b/src/log/log.c index b07ef8c1bd5..d6caa55f8c7 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -2132,7 +2132,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_STAT_CONN_INCR(session, log_writes); - __wt_log_slot_join(session, rdup_len, flags, &myslot); + /* + * 
The only time joining a slot should ever return an error is if it + * detects a panic. + */ + WT_ERR(__wt_log_slot_join(session, rdup_len, flags, &myslot)); /* * If the addition of this record crosses the buffer boundary, * switch in a new slot. diff --git a/src/log/log_slot.c b/src/log/log_slot.c index d6e692f8c51..542f010ea53 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -160,6 +160,7 @@ retry: #endif if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) { while (slot->slot_unbuffered == 0) { + WT_RET(WT_SESSION_CHECK_PANIC(session)); __wt_yield(); #ifdef HAVE_DIAGNOSTIC ++count; @@ -464,7 +465,7 @@ __wt_log_slot_destroy(WT_SESSION_IMPL *session) * __wt_log_slot_join -- * Join a consolidated logging slot. */ -void +int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) { @@ -498,6 +499,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, } for (;;) { WT_BARRIER(); + WT_RET(WT_SESSION_CHECK_PANIC(session)); slot = log->active_slot; old_state = slot->slot_state; if (WT_LOG_SLOT_OPEN(old_state)) { @@ -555,6 +557,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, myslot->slot = slot; myslot->offset = join_offset; myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize); + return (0); } /* -- cgit v1.2.1 From 8a1adcc4a1c4c25e1270290a8eb21173f41e83a9 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Thu, 16 Feb 2017 00:21:26 -0500 Subject: WT-3184 bug fix: special case searching an index that has a custom collator. (#3303) In this case, we must use the entire (raw) key to duplicate the position, instead of truncating to the visible part. 
--- src/cursor/cur_index.c | 3 +- src/cursor/cur_std.c | 7 +- src/include/wiredtiger.in | 5 +- test/csuite/Makefile.am | 3 + test/csuite/wt3184_dup_index_collator/main.c | 168 +++++++++++++++++++++++++++ 5 files changed, 181 insertions(+), 5 deletions(-) create mode 100644 test/csuite/wt3184_dup_index_collator/main.c diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 13180efdea4..6fc01c0421f 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -245,7 +245,8 @@ __curindex_search(WT_CURSOR *cursor) * Custom collators expect to see complete keys, pass an item containing * all the visible fields so it unpacks correctly. */ - if (cindex->index->collator != NULL) + if (cindex->index->collator != NULL && + !F_ISSET(cursor, WT_CURSTD_RAW_SEARCH)) WT_ERR(__wt_struct_repack(session, child->key_format, cindex->iface.key_format, &child->key, &found_key)); else diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index 7ace6d49cf0..99a9e373354 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -633,6 +633,7 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) int __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) { + WT_DECL_RET; WT_ITEM key; /* @@ -662,9 +663,11 @@ __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) * cursors cannot reference application memory after cursor operations * and that requirement will save the day. */ - WT_RET(cursor->search(cursor)); + F_SET(cursor, WT_CURSTD_RAW_SEARCH); + ret = cursor->search(cursor); + F_CLR(cursor, WT_CURSTD_RAW_SEARCH); - return (0); + return (ret); } /* diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index d1e3d383396..c148e759299 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -576,8 +576,9 @@ struct __wt_cursor { #define WT_CURSTD_OPEN 0x00200 #define WT_CURSTD_OVERWRITE 0x00400 #define WT_CURSTD_RAW 0x00800 -#define WT_CURSTD_VALUE_EXT 0x01000 /* Value points out of the tree. 
*/ -#define WT_CURSTD_VALUE_INT 0x02000 /* Value points into the tree. */ +#define WT_CURSTD_RAW_SEARCH 0x01000 +#define WT_CURSTD_VALUE_EXT 0x02000 /* Value points out of the tree. */ +#define WT_CURSTD_VALUE_INT 0x04000 /* Value points into the tree. */ #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) uint32_t flags; #endif diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index 0158d0c96d1..e2b72532703 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -49,6 +49,9 @@ noinst_PROGRAMS += test_wt3120_filesys test_wt3135_search_near_collator_SOURCES = wt3135_search_near_collator/main.c noinst_PROGRAMS += test_wt3135_search_near_collator +test_wt3184_dup_index_collator_SOURCES = wt3184_dup_index_collator/main.c +noinst_PROGRAMS += test_wt3184_dup_index_collator + # Run this during a "make check" smoke test. TESTS = $(noinst_PROGRAMS) LOG_COMPILER = $(TEST_WRAPPER) diff --git a/test/csuite/wt3184_dup_index_collator/main.c b/test/csuite/wt3184_dup_index_collator/main.c new file mode 100644 index 00000000000..bcefd2f1a3b --- /dev/null +++ b/test/csuite/wt3184_dup_index_collator/main.c @@ -0,0 +1,168 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. 
We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +/* + * JIRA ticket reference: WT-3184 + * Test case description: Each set of data is ordered and contains + * five elements (0-4). We insert elements 1 and 3, and then do + * search_near and search for each element. For each set of data, we perform + * these tests first using a custom collator, and second using a custom collator + * and extractor. In each case there are index keys having variable length. + * Failure mode: In the reported test case, the custom compare routine is + * given a truncated key to compare, and the unpack functions return errors + * because the truncation appeared in the middle of a key. + */ + +static int +compare_int(int32_t a, int32_t b) +{ + return (a < b ? -1 : (a > b ? 
1 : 0)); +} + +static int32_t +item_to_int(WT_ITEM *item) +{ + testutil_assert(item->size == sizeof(int32_t)); + return (*(int32_t *)item->data); +} + +static int +compare_int_items(WT_ITEM *itema, WT_ITEM *itemb) +{ + testutil_assert(itema->size == sizeof(int32_t)); + testutil_assert(itemb->size == sizeof(int32_t)); + return (compare_int(item_to_int(itema), item_to_int(itemb))); +} + +static void +print_int_item(const char *str, const WT_ITEM *item) +{ + if (item->size > 0) { + testutil_assert(item->size == sizeof(int32_t)); + printf("%s%" PRId32, str, *(int32_t *)item->data); + } else + printf("%s", str); +} + +static int +index_compare(WT_COLLATOR *collator, WT_SESSION *session, + const WT_ITEM *key1, const WT_ITEM *key2, int *cmp) +{ + WT_ITEM ikey1, pkey1, ikey2, pkey2; + + (void)collator; + testutil_check(wiredtiger_struct_unpack(session, + key1->data, key1->size, "uu", &ikey1, &pkey1)); + testutil_check(wiredtiger_struct_unpack(session, + key2->data, key2->size, "uu", &ikey2, &pkey2)); + + print_int_item("index_compare: index key1 = ", &ikey1); + print_int_item(", primary key1 = ", &pkey1); + print_int_item(", index key2 = ", &ikey2); + print_int_item(", primary key2 = ", &pkey2); + printf("\n"); + + if ((*cmp = compare_int_items(&ikey1, &ikey2)) != 0) + return (0); + + if (pkey1.size != 0 && pkey2.size != 0) + *cmp = compare_int_items(&pkey1, &pkey2); + else if (pkey1.size != 0) + *cmp = 1; + else if (pkey2.size != 0) + *cmp = -1; + else + *cmp = 0; + + return (0); +} + +static WT_COLLATOR index_coll = { index_compare, NULL, NULL }; + +int +main(int argc, char *argv[]) +{ + TEST_OPTS *opts, _opts; + WT_CURSOR *cursor, *cursor1; + WT_ITEM got, k, v; + WT_SESSION *session; + int32_t ki, vi; + + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + testutil_check(wiredtiger_open(opts->home, NULL, "create", + &opts->conn)); + testutil_check( + 
opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + testutil_check(opts->conn->add_collator(opts->conn, "index_coll", + &index_coll, NULL)); + + testutil_check(session->create(session, + "table:main", "key_format=u,value_format=u,columns=(k,v)")); + testutil_check(session->create(session, + "index:main:index", "columns=(v),collator=index_coll")); + + printf("adding new record\n"); + testutil_check(session->open_cursor(session, "table:main", NULL, NULL, + &cursor)); + + ki = 13; + vi = 17; + + k.data = &ki; k.size = sizeof(ki); + v.data = &vi; v.size = sizeof(vi); + + cursor->set_key(cursor, &k); + cursor->set_value(cursor, &v); + testutil_check(cursor->insert(cursor)); + testutil_check(cursor->close(cursor)); + + printf("positioning index cursor\n"); + + testutil_check(session->open_cursor(session, "index:main:index", NULL, + NULL, &cursor)); + cursor->set_key(cursor, &v); + testutil_check(cursor->search(cursor)); + + printf("duplicating cursor\n"); + testutil_check(session->open_cursor(session, NULL, cursor, NULL, + &cursor1)); + cursor->get_value(cursor, &got); + testutil_assert(item_to_int(&got) == 17); + cursor1->get_value(cursor1, &got); + testutil_assert(item_to_int(&got) == 17); + + testutil_check(session->close(session, NULL)); + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} -- cgit v1.2.1