From fb6ebe75207c3221314ed318595489a838ef1db0 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Mon, 2 Nov 2015 11:55:14 +1100 Subject: Import wiredtiger-wiredtiger-mongodb-3.2.0-rc1-194-g0dc3f20.tar.gz from wiredtiger branch mongodb-3.2 --- src/third_party/wiredtiger/bench/wtperf/wtperf.c | 2 + .../wiredtiger/bench/wtperf/wtperf_opt.i | 2 + src/third_party/wiredtiger/dist/api_data.py | 80 +++--- src/third_party/wiredtiger/dist/api_err.py | 5 + src/third_party/wiredtiger/dist/flags.py | 6 +- src/third_party/wiredtiger/dist/s_define.list | 1 + src/third_party/wiredtiger/dist/s_string.ok | 1 + src/third_party/wiredtiger/src/block/block_ckpt.c | 20 +- src/third_party/wiredtiger/src/btree/bt_cursor.c | 10 +- src/third_party/wiredtiger/src/btree/bt_debug.c | 51 +++- src/third_party/wiredtiger/src/btree/bt_delete.c | 10 +- src/third_party/wiredtiger/src/btree/bt_discard.c | 54 +++-- src/third_party/wiredtiger/src/btree/bt_io.c | 3 + src/third_party/wiredtiger/src/btree/bt_read.c | 6 +- src/third_party/wiredtiger/src/btree/bt_split.c | 239 ++++++++++++------ src/third_party/wiredtiger/src/btree/bt_walk.c | 14 +- src/third_party/wiredtiger/src/btree/row_srch.c | 49 +++- src/third_party/wiredtiger/src/config/config_def.c | 14 +- src/third_party/wiredtiger/src/conn/api_strerror.c | 2 + src/third_party/wiredtiger/src/conn/conn_api.c | 18 +- src/third_party/wiredtiger/src/conn/conn_ckpt.c | 11 + src/third_party/wiredtiger/src/conn/conn_dhandle.c | 7 +- src/third_party/wiredtiger/src/conn/conn_handle.c | 2 + src/third_party/wiredtiger/src/conn/conn_log.c | 32 ++- src/third_party/wiredtiger/src/conn/conn_sweep.c | 9 + src/third_party/wiredtiger/src/cursor/cur_ds.c | 2 +- src/third_party/wiredtiger/src/cursor/cur_file.c | 44 ++-- .../wiredtiger/src/cursor/cur_metadata.c | 21 +- src/third_party/wiredtiger/src/cursor/cur_table.c | 2 +- src/third_party/wiredtiger/src/evict/evict_lru.c | 114 ++++++--- src/third_party/wiredtiger/src/evict/evict_page.c | 124 +++++++--- 
src/third_party/wiredtiger/src/include/api.h | 9 +- src/third_party/wiredtiger/src/include/block.h | 3 + src/third_party/wiredtiger/src/include/btmem.h | 23 +- src/third_party/wiredtiger/src/include/btree.i | 12 +- src/third_party/wiredtiger/src/include/btree_cmp.i | 55 +++++ src/third_party/wiredtiger/src/include/cache.i | 20 ++ .../wiredtiger/src/include/connection.h | 1 + src/third_party/wiredtiger/src/include/extern.h | 11 +- src/third_party/wiredtiger/src/include/flags.h | 66 ++--- src/third_party/wiredtiger/src/include/log.h | 5 +- src/third_party/wiredtiger/src/include/lsm.h | 1 + src/third_party/wiredtiger/src/include/meta.h | 10 + src/third_party/wiredtiger/src/include/misc.h | 4 +- src/third_party/wiredtiger/src/include/txn.h | 32 ++- src/third_party/wiredtiger/src/include/txn.i | 61 ++--- .../wiredtiger/src/include/wiredtiger.in | 9 + src/third_party/wiredtiger/src/log/log.c | 119 ++++----- src/third_party/wiredtiger/src/log/log_slot.c | 2 + src/third_party/wiredtiger/src/lsm/lsm_cursor.c | 2 +- src/third_party/wiredtiger/src/lsm/lsm_work_unit.c | 64 +++-- src/third_party/wiredtiger/src/meta/meta_table.c | 20 +- src/third_party/wiredtiger/src/meta/meta_track.c | 35 +-- src/third_party/wiredtiger/src/meta/meta_turtle.c | 4 +- .../wiredtiger/src/os_posix/os_fallocate.c | 6 +- .../wiredtiger/src/reconcile/rec_write.c | 270 +++++++++++---------- .../wiredtiger/src/schema/schema_open.c | 47 +++- .../wiredtiger/src/schema/schema_stat.c | 68 ++++++ .../wiredtiger/src/session/session_api.c | 125 ++++------ .../wiredtiger/src/session/session_compact.c | 61 +++-- .../wiredtiger/src/session/session_dhandle.c | 8 - src/third_party/wiredtiger/src/txn/txn_ckpt.c | 74 +++++- src/third_party/wiredtiger/src/txn/txn_recover.c | 4 +- 63 files changed, 1417 insertions(+), 769 deletions(-) (limited to 'src/third_party') diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c index 20c30e10482..44aff59963c 100644 --- 
a/src/third_party/wiredtiger/bench/wtperf/wtperf.c +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c @@ -1455,6 +1455,8 @@ close_reopen(CONFIG *cfg) { int ret; + if (!cfg->reopen_connection) + return (0); /* * Reopen the connection. We do this so that the workload phase always * starts with the on-disk files, and so that read-only workloads can diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i index 7e29aa0f3c2..be3ba462e0c 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i @@ -134,6 +134,8 @@ DEF_OPT_AS_UINT32(random_range, 0, "if non zero choose a value from within this range as the key for " "insert operations") DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value") +DEF_OPT_AS_BOOL(reopen_connection, 1, + "close and reopen the connection between populate and workload phases") DEF_OPT_AS_UINT32(report_interval, 2, "output throughput information every interval seconds, 0 to disable") DEF_OPT_AS_UINT32(run_ops, 0, diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 6fd7dcd0093..99e08282e49 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -582,7 +582,7 @@ session_config = [ choices=['read-uncommitted', 'read-committed', 'snapshot']), ] -common_wiredtiger_open = [ +wiredtiger_open_common = connection_runtime_config + [ Config('buffer_alignment', '-1', r''' in-memory alignment (in bytes) for buffers used for I/O. The default value of -1 indicates a platform-specific alignment value @@ -676,6 +676,30 @@ common_wiredtiger_open = [ ]), ] +wiredtiger_open = wiredtiger_open_common + [ + Config('config_base', 'true', r''' + write the base configuration file if creating the database. 
If + \c false in the config passed directly to ::wiredtiger_open, will + ignore any existing base configuration file in addition to not creating + one. See @ref config_base for more information''', + type='boolean'), + Config('create', 'false', r''' + create the database if it does not exist''', + type='boolean'), + Config('exclusive', 'false', r''' + fail if the database already exists, generally used with the + \c create option''', + type='boolean'), + Config('in_memory', 'false', r''' + keep data in-memory only, minimize disk I/O''', + type='boolean', undoc=True), + Config('use_environment_priv', 'false', r''' + use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment + variables regardless of whether or not the process is running + with special privileges. See @ref home for more information''', + type='boolean'), +] + cursor_runtime_config = [ Config('append', 'false', r''' append the value as a new record, creating a new record @@ -1003,59 +1027,13 @@ methods = { # creation-specific configuration strings). # wiredtiger_open_all: # All of the above configuration values combined -'wiredtiger_open' : Method( - connection_runtime_config + - common_wiredtiger_open + [ - Config('config_base', 'true', r''' - write the base configuration file if creating the database. If - \c false in the config passed directly to ::wiredtiger_open, will - ignore any existing base configuration file in addition to not creating - one. See @ref config_base for more information''', - type='boolean'), - Config('create', 'false', r''' - create the database if it does not exist''', - type='boolean'), - Config('exclusive', 'false', r''' - fail if the database already exists, generally used with the - \c create option''', - type='boolean'), - Config('use_environment_priv', 'false', r''' - use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment - variables regardless of whether or not the process is running - with special privileges. 
See @ref home for more information''', - type='boolean'), -]), -'wiredtiger_open_basecfg' : Method( - connection_runtime_config + - common_wiredtiger_open + [ +'wiredtiger_open' : Method(wiredtiger_open), +'wiredtiger_open_basecfg' : Method(wiredtiger_open_common + [ Config('version', '(major=0,minor=0)', r''' the file version'''), ]), -'wiredtiger_open_usercfg' : Method( - connection_runtime_config + - common_wiredtiger_open -), -'wiredtiger_open_all' : Method( - connection_runtime_config + - common_wiredtiger_open + [ - Config('config_base', 'true', r''' - write the base configuration file if creating the database. If - \c false in the config passed directly to ::wiredtiger_open, will - ignore any existing base configuration file in addition to not creating - one. See @ref config_base for more information''', - type='boolean'), - Config('create', 'false', r''' - create the database if it does not exist''', - type='boolean'), - Config('exclusive', 'false', r''' - fail if the database already exists, generally used with the - \c create option''', - type='boolean'), - Config('use_environment_priv', 'false', r''' - use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment - variables regardless of whether or not the process is running - with special privileges. 
See @ref home for more information''', - type='boolean'), +'wiredtiger_open_usercfg' : Method(wiredtiger_open_common), +'wiredtiger_open_all' : Method(wiredtiger_open + [ Config('version', '(major=0,minor=0)', r''' the file version'''), ]), diff --git a/src/third_party/wiredtiger/dist/api_err.py b/src/third_party/wiredtiger/dist/api_err.py index 936c7bb11a7..09332d508a2 100644 --- a/src/third_party/wiredtiger/dist/api_err.py +++ b/src/third_party/wiredtiger/dist/api_err.py @@ -51,6 +51,11 @@ errors = [ 'recovery must be run to continue', ''' This error is generated when wiredtiger_open is configured to return an error if recovery is required to use the database.'''), + Error('WT_CACHE_FULL', -31807, + 'operation would overflow cache', ''' + This error is generated when wiredtiger_open is configured + to run in-memory, and an insert or update operation requires more + than the configured cache size to complete.''', undoc=True), ] # Update the #defines in the wiredtiger.in file. diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index 65b68cf4277..da677c17389 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -36,6 +36,7 @@ flags = { 'page_read' : [ 'READ_CACHE', 'READ_COMPACT', + 'READ_NO_EMPTY', 'READ_NO_EVICT', 'READ_NO_GEN', 'READ_NO_WAIT', @@ -45,9 +46,10 @@ flags = { 'READ_WONT_NEED', ], 'rec_write' : [ + 'EVICT_IN_MEMORY', 'EVICT_LOOKASIDE', - 'EVICTING', 'EVICT_UPDATE_RESTORE', + 'EVICTING', 'VISIBILITY_ERR', ], 'txn_log_checkpoint' : [ @@ -92,6 +94,7 @@ flags = { 'CONN_CKPT_SYNC', 'CONN_CLOSING', 'CONN_EVICTION_RUN', + 'CONN_IN_MEMORY', 'CONN_LAS_OPEN', 'CONN_LEAK_MEMORY', 'CONN_LOG_SERVER_RUN', @@ -114,6 +117,7 @@ flags = { 'SESSION_LOCKED_SCHEMA', 'SESSION_LOCKED_SLOT', 'SESSION_LOCKED_TABLE', + 'SESSION_LOCKED_TURTLE', 'SESSION_LOGGING_INMEM', 'SESSION_LOOKASIDE_CURSOR', 'SESSION_NO_CACHE', diff --git a/src/third_party/wiredtiger/dist/s_define.list 
b/src/third_party/wiredtiger/dist/s_define.list index dce284dae44..d204a11835b 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -35,6 +35,7 @@ WT_PACKED_STRUCT_END WT_READ_BARRIER WT_REF_SIZE WT_SESSION_LOCKED_CHECKPOINT +WT_SESSION_LOCKED_TURTLE WT_STAT_DECR WT_STAT_DECRV WT_STAT_FAST_CONN_DECRV diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index d234a3c101f..26c0a905b82 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -59,6 +59,7 @@ CSV CURSORs CURSTD CallsCustDate +Checkpointing Checksum Checksums CityHash diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c index 9b42a072d73..2c8ff89a5cf 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt.c @@ -83,11 +83,16 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint")); } else { /* - * We depend on the btree level for locking: things will go - * bad fast should we open the live system in two handles, or - * if we create, salvage, truncate or verify the live/running - * file, for that matter. + * We depend on the btree level for locking: things will go bad + * fast if we open the live system in two handles, or salvage, + * truncate or verify the live/running file. */ +#ifdef HAVE_DIAGNOSTIC + __wt_spin_lock(session, &block->live_lock); + WT_ASSERT(session, block->live_open == false); + block->live_open = true; + __wt_spin_unlock(session, &block->live_lock); +#endif ci = &block->live; WT_ERR(__wt_block_ckpt_init(session, ci, "live")); } @@ -178,8 +183,8 @@ __wt_block_checkpoint_unload( /* * If it's the live system, truncate to discard any extended blocks and * discard the active extent lists. 
Hold the lock even though we're - * unloading the live checkpoint, there could be readers active in - * other checkpoints. + * unloading the live checkpoint, there could be readers active in other + * checkpoints. */ if (!checkpoint) { /* @@ -191,6 +196,9 @@ __wt_block_checkpoint_unload( __wt_spin_lock(session, &block->live_lock); __wt_block_ckpt_destroy(session, &block->live); +#ifdef HAVE_DIAGNOSTIC + block->live_open = false; +#endif __wt_spin_unlock(session, &block->live_lock); } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 037648696b3..3290fd6374c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -306,9 +306,6 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_STAT_FAST_CONN_INCR(session, cursor_search); WT_STAT_FAST_DATA_INCR(session, cursor_search); - if (btree->type == BTREE_ROW) - WT_RET(__cursor_size_chk(session, &cursor->key)); - /* * If we have a page pinned, search it; if we don't have a page pinned, * or the search of the pinned page doesn't find an exact match, search @@ -376,9 +373,6 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_STAT_FAST_CONN_INCR(session, cursor_search_near); WT_STAT_FAST_DATA_INCR(session, cursor_search_near); - if (btree->type == BTREE_ROW) - WT_RET(__cursor_size_chk(session, &cursor->key)); - /* * If we have a row-store page pinned, search it; if we don't have a * page pinned, or the search of the pinned page doesn't find an exact @@ -449,6 +443,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) } else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) exact = 1; else { + WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? 
__cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); @@ -659,9 +654,6 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_STAT_FAST_DATA_INCR(session, cursor_remove); WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size); - if (btree->type == BTREE_ROW) - WT_RET(__cursor_size_chk(session, &cursor->key)); - retry: WT_RET(__cursor_func_init(cbt, true)); switch (btree->type) { diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 15ae93522a7..8edc40794e2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -45,7 +45,8 @@ static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t); static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *); static int __debug_ref(WT_DBG *, WT_REF *); static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *); -static int __debug_tree(WT_SESSION_IMPL *, WT_PAGE *, const char *, uint32_t); +static int __debug_tree( + WT_SESSION_IMPL *, WT_BTREE *, WT_PAGE *, const char *, uint32_t); static void __debug_update(WT_DBG *, WT_UPDATE *, bool); static void __dmsg(WT_DBG *, const char *, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))); @@ -224,6 +225,8 @@ __wt_debug_addr(WT_SESSION_IMPL *session, WT_DECL_ITEM(buf); WT_DECL_RET; + WT_ASSERT(session, S2BT_SAFE(session) != NULL); + bm = S2BT(session)->bm; WT_RET(__wt_scr_alloc(session, 1024, &buf)); @@ -245,6 +248,8 @@ __wt_debug_offset_blind( WT_DECL_ITEM(buf); WT_DECL_RET; + WT_ASSERT(session, S2BT_SAFE(session) != NULL); + /* * This routine depends on the default block manager's view of files, * where an address consists of a file offset, length, and checksum. 
@@ -274,6 +279,8 @@ __wt_debug_offset(WT_SESSION_IMPL *session, WT_DECL_RET; uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE], *endp; + WT_ASSERT(session, S2BT_SAFE(session) != NULL); + /* * This routine depends on the default block manager's view of files, * where an address consists of a file offset, length, and checksum. @@ -377,6 +384,8 @@ __debug_dsk_col_fix(WT_DBG *ds, const WT_PAGE_HEADER *dsk) uint32_t i; uint8_t v; + WT_ASSERT(ds->session, S2BT_SAFE(ds->session) != NULL); + btree = S2BT(ds->session); WT_FIX_FOREACH(btree, dsk, v, i) { @@ -398,6 +407,8 @@ __debug_dsk_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk) WT_CELL_UNPACK *unpack, _unpack; uint32_t i; + WT_ASSERT(ds->session, S2BT_SAFE(ds->session) != NULL); + btree = S2BT(ds->session); unpack = &_unpack; @@ -465,6 +476,8 @@ __wt_debug_tree_shape( { WT_DBG *ds, _ds; + WT_ASSERT(session, S2BT_SAFE(session) != NULL); + ds = &_ds; WT_RET(__debug_config(session, ds, ofile)); @@ -484,22 +497,30 @@ __wt_debug_tree_shape( /* * __wt_debug_tree_all -- * Dump the in-memory information for a tree, including leaf pages. + * Takes an explicit btree as an argument, as one may not yet be set on + * the session. This is often the case as this function will be called + * from within a debugger, which makes setting a btree complicated. */ int -__wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) +__wt_debug_tree_all( + WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile) { - return (__debug_tree( - session, page, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK)); + return (__debug_tree(session, + btree, page, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK)); } /* * __wt_debug_tree -- * Dump the in-memory information for a tree, not including leaf pages. + * Takes an explicit btree as an argument, as one may not yet be set on + * the session. This is often the case as this function will be called + * from within a debugger, which makes setting a btree complicated. 
*/ int -__wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) +__wt_debug_tree( + WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile) { - return (__debug_tree(session, page, ofile, WT_DEBUG_TREE_WALK)); + return (__debug_tree(session, btree, page, ofile, WT_DEBUG_TREE_WALK)); } /* @@ -512,6 +533,8 @@ __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) WT_DBG *ds, _ds; WT_DECL_RET; + WT_ASSERT(session, S2BT_SAFE(session) != NULL); + ds = &_ds; WT_RET(__debug_config(session, ds, ofile)); @@ -524,11 +547,16 @@ __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) /* * __debug_tree -- - * Dump the in-memory information for a tree. + * Dump the in-memory information for a tree. Takes an explicit btree + * as an argument, as one may not be set on the session. This is often + * the case as this function will be called from within a debugger, which + * makes setting a btree complicated. We mark the session to the btree + * in this function */ static int __debug_tree( - WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile, uint32_t flags) + WT_SESSION_IMPL *session, WT_BTREE *btree, + WT_PAGE *page, const char *ofile, uint32_t flags) { WT_DBG *ds, _ds; WT_DECL_RET; @@ -540,7 +568,7 @@ __debug_tree( if (page == NULL) page = S2BT(session)->root.page; - ret = __debug_page(ds, page, flags); + WT_WITH_BTREE(session, btree, ret = __debug_page(ds, page, flags)); __dmsg_wrapup(ds); @@ -664,9 +692,6 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) case WT_PM_REC_REPLACE: __dmsg(ds, ", replaced"); break; - case WT_PM_REC_REWRITE: - __dmsg(ds, ", rewrite"); - break; case 0: break; WT_ILLEGAL_VALUE(session); @@ -693,6 +718,8 @@ __debug_page_col_fix(WT_DBG *ds, WT_PAGE *page) uint32_t i; uint8_t v; + WT_ASSERT(ds->session, S2BT_SAFE(ds->session) != NULL); + session = ds->session; btree = S2BT(session); dsk = page->dsk; diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c 
b/src/third_party/wiredtiger/src/btree/bt_delete.c index 23429121e98..757b7b51cdd 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -214,10 +214,11 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) /* * __wt_delete_page_skip -- - * If iterating a cursor, skip deleted pages that are visible to us. + * If iterating a cursor, skip deleted pages that are either visible to + * us or globally visible. */ bool -__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) { bool skip; @@ -245,8 +246,9 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) return (false); - skip = ref->page_del == NULL || - __wt_txn_visible(session, ref->page_del->txnid); + skip = ref->page_del == NULL || (visible_all ? + __wt_txn_visible_all(session, ref->page_del->txnid) : + __wt_txn_visible(session, ref->page_del->txnid)); WT_PUBLISH(ref->state, WT_REF_DELETED); return (skip); diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index 998667e3e1f..67e70d0cdb9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -12,9 +12,10 @@ static void __free_page_modify(WT_SESSION_IMPL *, WT_PAGE *); static void __free_page_col_var(WT_SESSION_IMPL *, WT_PAGE *); static void __free_page_int(WT_SESSION_IMPL *, WT_PAGE *); static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *); -static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t); -static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *); -static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t); +static void __free_skip_array( + WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t, bool); +static void __free_skip_list(WT_SESSION_IMPL *, 
WT_INSERT *, bool); +static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t, bool); /* * __wt_ref_out -- @@ -144,12 +145,15 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) WT_MULTI *multi; WT_PAGE_MODIFY *mod; uint32_t i; + bool update_ignore; mod = page->modify; + /* In some failed-split cases, we can't discard updates. */ + update_ignore = F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE); + switch (mod->rec_result) { case WT_PM_REC_MULTIBLOCK: - case WT_PM_REC_REWRITE: /* Free list of replacement blocks. */ for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { @@ -160,7 +164,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) break; } __wt_free(session, multi->supd); - __wt_free(session, multi->supd_dsk); + __wt_free(session, multi->disk_image); __wt_free(session, multi->addr.addr); } __wt_free(session, mod->mod_multi); @@ -179,7 +183,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) case WT_PAGE_COL_VAR: /* Free the append array. */ if ((append = WT_COL_APPEND(page)) != NULL) { - __free_skip_list(session, WT_SKIP_FIRST(append)); + __free_skip_list( + session, WT_SKIP_FIRST(append), update_ignore); __wt_free(session, append); __wt_free(session, mod->mod_append); } @@ -188,7 +193,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) if (mod->mod_update != NULL) __free_skip_array(session, mod->mod_update, page->type == - WT_PAGE_COL_FIX ? 1 : page->pg_var_entries); + WT_PAGE_COL_FIX ? 1 : page->pg_var_entries, + update_ignore); break; } @@ -302,6 +308,10 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) WT_ROW *rip; uint32_t i; void *copy; + bool update_ignore; + + /* In some failed-split cases, we can't discard updates. */ + update_ignore = F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE); /* * Free the in-memory index array. @@ -326,12 +336,13 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) * found on the original page). 
*/ if (page->pg_row_ins != NULL) - __free_skip_array( - session, page->pg_row_ins, page->pg_row_entries + 1); + __free_skip_array(session, + page->pg_row_ins, page->pg_row_entries + 1, update_ignore); /* Free the update array. */ if (page->pg_row_upd != NULL) - __free_update(session, page->pg_row_upd, page->pg_row_entries); + __free_update(session, + page->pg_row_upd, page->pg_row_entries, update_ignore); } /* @@ -339,8 +350,8 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) * Discard an array of skip list headers. */ static void -__free_skip_array( - WT_SESSION_IMPL *session, WT_INSERT_HEAD **head_arg, uint32_t entries) +__free_skip_array(WT_SESSION_IMPL *session, + WT_INSERT_HEAD **head_arg, uint32_t entries, bool update_ignore) { WT_INSERT_HEAD **head; @@ -350,7 +361,8 @@ __free_skip_array( */ for (head = head_arg; entries > 0; --entries, ++head) if (*head != NULL) { - __free_skip_list(session, WT_SKIP_FIRST(*head)); + __free_skip_list( + session, WT_SKIP_FIRST(*head), update_ignore); __wt_free(session, *head); } @@ -364,12 +376,13 @@ __free_skip_array( * of a WT_INSERT structure and its associated chain of WT_UPDATE structures. */ static void -__free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins) +__free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins, bool update_ignore) { WT_INSERT *next; for (; ins != NULL; ins = next) { - __wt_free_update_list(session, ins->upd); + if (!update_ignore) + __wt_free_update_list(session, ins->upd); next = WT_SKIP_NEXT(ins); __wt_free(session, ins); } @@ -380,8 +393,8 @@ __free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins) * Discard the update array. 
*/ static void -__free_update( - WT_SESSION_IMPL *session, WT_UPDATE **update_head, uint32_t entries) +__free_update(WT_SESSION_IMPL *session, + WT_UPDATE **update_head, uint32_t entries, bool update_ignore) { WT_UPDATE **updp; @@ -389,9 +402,10 @@ __free_update( * For each non-NULL slot in the page's array of updates, free the * linked list anchored in that slot. */ - for (updp = update_head; entries > 0; --entries, ++updp) - if (*updp != NULL) - __wt_free_update_list(session, *updp); + if (!update_ignore) + for (updp = update_head; entries > 0; --entries, ++updp) + if (*updp != NULL) + __wt_free_update_list(session, *updp); /* Free the update array. */ __wt_free(session, update_head); diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c index e46e4a55696..6481f514323 100644 --- a/src/third_party/wiredtiger/src/btree/bt_io.c +++ b/src/third_party/wiredtiger/src/btree/bt_io.c @@ -192,6 +192,9 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, (!checkpoint && addr != NULL && addr_sizep != NULL) || (checkpoint && addr == NULL && addr_sizep == NULL)); + /* In-memory databases shouldn't write pages. */ + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)); + #ifdef HAVE_DIAGNOSTIC /* * We're passed a table's disk image. 
Decompress if necessary and diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index d9cdfc78c75..e60f7b3fb02 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -448,8 +448,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags for (oldgen = stalled = false, force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { - case WT_REF_DISK: case WT_REF_DELETED: + if (LF_ISSET(WT_READ_NO_EMPTY) && + __wt_delete_page_skip(session, ref, false)) + return (WT_NOTFOUND); + /* FALLTHROUGH */ + case WT_REF_DISK: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index adda9145ee4..2145d6ac014 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -211,7 +211,8 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) * splitting into parent pages can become large enough to result * in slow operations. 
*/ - if (pindex->entries > btree->split_deepen_min_child) + if (!__wt_ref_is_root(ref) && + pindex->entries > btree->split_deepen_min_child) return (true); return (false); @@ -405,7 +406,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) uint64_t split_gen; uint32_t children, chunk, i, j, moved_entries, new_entries, remain; uint32_t skip_leading, slots; - bool panic; + bool complete; void *p; WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); @@ -414,7 +415,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) btree = S2BT(session); alloc_index = NULL; parent_incr = parent_decr = 0; - panic = false; + complete = false; /* * Our caller is holding the parent page locked to single-thread splits, @@ -552,28 +553,28 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) } WT_ASSERT(session, alloc_refp - alloc_index->index == - alloc_index->entries - skip_trailing); - WT_ASSERT(session, - parent_refp - pindex->index == pindex->entries - skip_trailing); + (ptrdiff_t)(alloc_index->entries - skip_trailing)); + WT_ASSERT(session, parent_refp - pindex->index == + (ptrdiff_t)(pindex->entries - skip_trailing)); /* * Confirm the parent page's index hasn't moved, then update it, which * makes the split visible to threads descending the tree. From this - * point on, we're committed to the split. If subsequent work fails, - * we have to panic because we may have threads of control using the - * new page index we swap in. + * point on, we're committed to the split. * * A note on error handling: until this point, there's no problem with * unwinding on error. We allocated a new page index, a new set of * WT_REFs and a new set of child pages -- if an error occurred, the * parent remained unchanged, although it may have an incorrect memory * footprint. From now on we've modified the parent page, attention - * needs to be paid. + * needs to be paid. However, subsequent failures are relatively benign, + * the split is OK and complete. 
For that reason, we ignore errors past + * this point unless there's a panic. */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); WT_INTL_INDEX_SET(parent, alloc_index); split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - panic = true; + complete = true; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, @@ -657,7 +658,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) * be using the new index. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_ERR(__split_safe_free(session, split_gen, 0, pindex, size)); + WT_TRET(__split_safe_free(session, split_gen, 0, pindex, size)); parent_decr += size; /* @@ -666,25 +667,29 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) __wt_cache_page_inmem_incr(session, parent, parent_incr); __wt_cache_page_inmem_decr(session, parent, parent_decr); - if (0) { -err: __wt_free_ref_index(session, parent, alloc_index, true); +err: /* + * If complete is true, we saw an error after opening up the tree to + * descend through the parent page's new index. There is nothing we + * can do, there are threads potentially active in both versions of + * the tree. + * + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened, and our caller has to proceed + * with the split. + */ + if (!complete) + __wt_free_ref_index(session, parent, alloc_index, true); - /* - * If panic is set, we saw an error after opening up the tree - * to descent through the parent page's new index. There is - * nothing we can do, the tree is inconsistent and there are - * threads potentially active in both versions of the tree. - */ - if (panic) - ret = __wt_panic(session); - } - return (ret); + if (ret != 0 && ret != WT_PANIC) + __wt_err(session, ret, + "ignoring not-fatal error during parent page split to " + "deepen the tree"); + return (ret == WT_PANIC || !complete ? 
ret : 0); } /* * __split_multi_inmem -- - * Instantiate a page in a multi-block set, when an update couldn't be - * written. + * Instantiate a page in a multi-block set. */ static int __split_multi_inmem( @@ -699,13 +704,12 @@ __split_multi_inmem( uint64_t recno; uint32_t i, slot; - __wt_btcur_init(session, &cbt); - __wt_btcur_open(&cbt); - /* - * We can find unresolved updates when attempting to evict a page, which - * can't be written. This code re-creates the in-memory page and applies - * the unresolved updates to that page. + * This code re-creates an in-memory page that is part of a set created + * while evicting a large page, and adds references to any unresolved + * update chains to the new page. We get here due to choosing to keep + * the results of a split in memory or because an update could not be + * written when attempting to evict a page. * * Clear the disk image and link the page into the passed-in WT_REF to * simplify error handling: our caller will not discard the disk image @@ -713,13 +717,16 @@ __split_multi_inmem( * allocated page on error, when discarding the allocated WT_REF. */ WT_RET(__wt_page_inmem(session, ref, - multi->supd_dsk, ((WT_PAGE_HEADER *)multi->supd_dsk)->mem_size, + multi->disk_image, ((WT_PAGE_HEADER *)multi->disk_image)->mem_size, WT_PAGE_DISK_ALLOC, &page)); - multi->supd_dsk = NULL; + multi->disk_image = NULL; if (orig->type == WT_PAGE_ROW_LEAF) WT_RET(__wt_scr_alloc(session, 0, &key)); + __wt_btcur_init(session, &cbt); + __wt_btcur_open(&cbt); + /* Re-create each modification we couldn't write. */ for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) switch (orig->type) { @@ -727,7 +734,6 @@ __split_multi_inmem( case WT_PAGE_COL_VAR: /* Build a key. */ upd = supd->ins->upd; - supd->ins->upd = NULL; recno = WT_INSERT_RECNO(supd->ins); /* Search the page.
*/ @@ -742,13 +748,11 @@ __split_multi_inmem( if (supd->ins == NULL) { slot = WT_ROW_SLOT(orig, supd->rip); upd = orig->pg_row_upd[slot]; - orig->pg_row_upd[slot] = NULL; WT_ERR(__wt_row_leaf_key( session, orig, supd->rip, key, false)); } else { upd = supd->ins->upd; - supd->ins->upd = NULL; key->data = WT_INSERT_KEY(supd->ins); key->size = WT_INSERT_KEY_SIZE(supd->ins); @@ -765,13 +769,14 @@ __split_multi_inmem( } /* - * We modified the page above, which will have set the first dirty + * If we modified the page above, it will have set the first dirty * transaction to the last transaction currently running. However, the * updates we installed may be older than that. Set the first dirty * transaction to an impossibly old value so this page is never skipped * in a checkpoint. */ - page->modify->first_dirty_txn = WT_TXN_FIRST; + if (page->modify != NULL) + page->modify->first_dirty_txn = WT_TXN_FIRST; err: /* Free any resources that may have been cached in the cursor. */ WT_TRET(__wt_btcur_close(&cbt, true)); @@ -780,6 +785,38 @@ err: /* Free any resources that may have been cached in the cursor. */ return (ret); } +/* + * __split_multi_inmem_final -- + * Discard moved update lists from the original page. + */ +static void +__split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi) +{ + WT_SAVE_UPD *supd; + uint32_t i, slot; + + /* + * We've successfully created new in-memory pages. For error-handling + * reasons, we've left the update chains referenced by both the original + * and new pages. We're ready to discard the original page, terminate + * the original page's reference to any update list we moved. 
+ */ + for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) + switch (orig->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + supd->ins->upd = NULL; + break; + case WT_PAGE_ROW_LEAF: + if (supd->ins == NULL) { + slot = WT_ROW_SLOT(orig, supd->rip); + orig->pg_row_upd[slot] = NULL; + } else + supd->ins->upd = NULL; + break; + } +} + /* * __wt_multi_to_ref -- * Move a multi-block list into an array of WT_REF structures. @@ -801,16 +838,10 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, ref = *refp; incr += sizeof(WT_REF); - /* - * Any parent reference must be filled in by our caller; the primary - * use of this function is when splitting into a parent page, and we - * aren't holding any locks here that would allow us to know which - * parent we'll eventually split into, if the tree is simultaneously - * being deepened. - */ + /* Any parent reference is filled in by our caller. */ ref->home = NULL; - if (multi->supd == NULL) { + if (multi->disk_image == NULL) { /* * Copy the address: we could simply take the buffer, but that * would complicate error handling, freeing the reference array @@ -839,7 +870,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, break; } - ref->state = multi->supd == NULL ? WT_REF_DISK : WT_REF_MEM; + ref->state = addr != NULL ? WT_REF_DISK : WT_REF_MEM; /* * If our caller wants to track the memory allocations, we have a return @@ -982,11 +1013,11 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * reading thread will restart. Include the ref we are splitting in * the count to be deleted. 
*/ - for (i = 0, deleted_entries = 1; i < parent_entries; ++i) { + for (deleted_entries = 1, i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); if (next_ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, next_ref) && + __wt_delete_page_skip(session, next_ref, true) && __wt_atomic_casv32( &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT)) deleted_entries++; @@ -998,6 +1029,13 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, */ result_entries = (parent_entries + new_entries) - deleted_entries; + /* + * If the entire (sub)tree is empty, give up: we can't leave an empty + * internal page. + */ + if (result_entries == 0) + return (0); + /* * Allocate and initialize a new page index array for the parent, then * copy references from the original index array, plus references from @@ -1042,6 +1080,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, *alloc_refp++ = next_ref; } + /* Check that we filled in all the entries. */ + WT_ASSERT(session, + alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); + /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -1078,9 +1120,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, "%s split into parent %" PRIu32 " -> %" PRIu32 - " (%" PRIu32 ")", - __wt_page_type_string(ref->page->type), parent_entries, - result_entries, result_entries - parent_entries)); + " (%" PRIu32 ")", ref->page == NULL ? 
+ "reverse" : __wt_page_type_string(ref->page->type), + parent_entries, result_entries, result_entries - parent_entries)); /* * The new page index is in place, free the WT_REF we were splitting @@ -1172,20 +1214,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, __split_should_deepen(session, parent_ref)) ret = __split_deepen(session, parent); -err: if (!complete) +err: /* + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened, and our caller has to proceed + * with the split. + */ + if (!complete) { for (i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } - __wt_free_ref_index(session, NULL, alloc_index, false); + __wt_free_ref_index(session, NULL, alloc_index, false); + } - /* - * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened, and our caller has to proceed - * with the split. - */ if (ret != 0 && ret != WT_PANIC) __wt_err(session, ret, "ignoring not-fatal error during parent page split"); @@ -1478,6 +1521,24 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) return (ret); } +/* + * __wt_split_reverse -- + * We have a locked ref that is empty and we want to rewrite the index in + * its parent. + */ +int +__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + WT_PAGE *parent; + bool hazard; + + WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); + ret = __split_parent(session, ref, NULL, 0, 0, 0); + WT_TRET(__split_parent_unlock(session, parent, hazard)); + return (ret); +} + /* * __wt_split_rewrite -- * Rewrite an in-memory page with a new version. @@ -1505,6 +1566,14 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) memset(&new, 0, sizeof(new)); WT_RET(__split_multi_inmem(session, page, &new, &mod->mod_multi[0])); + /* + * The rewrite succeeded, we can no longer fail. 
+ * + * Finalize the move, discarding moved update lists from the original + * page. + */ + __split_multi_inmem_final(page, &mod->mod_multi[0]); + /* * Discard the original page. * @@ -1560,33 +1629,43 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_STAT_FAST_CONN_INCR(session, cache_eviction_split); WT_STAT_FAST_DATA_INCR(session, cache_eviction_split); - __wt_free(session, ref_new); - /* - * The split succeeded, discard the page. + * The split succeeded, we can no longer fail. * - * Pages with unresolved changes are not marked clean during - * reconciliation, do it now. + * Finalize the move, discarding moved update lists from the original + * page. + */ + for (i = 0; i < new_entries; ++i) + __split_multi_inmem_final(page, &mod->mod_multi[i]); + + /* + * Pages with unresolved changes are not marked clean in reconciliation, + * do it now, then discard the page. */ __wt_page_modify_clear(session, page); __wt_page_out(session, &page); - return (0); + if (0) { +err: /* + * A note on error handling: when handling unresolved changes, + * we create new in-memory pages with those unresolved changes. + * The problem is the new pages are given references to the + * original page's update lists, and once all of the pages are + * created, there's a second pass to remove the updates from the + * original page. If an error occurs, we can't simply free the + * newly created pages, that would discard the original page's + * updates. Set a flag so the discard function doesn't discard + * the updates on the page. + */ + for (i = 0; i < new_entries; ++i) + if (ref_new[i]->page != NULL) { + F_SET_ATOMIC( + ref_new[i]->page, WT_PAGE_UPDATE_IGNORE); + __wt_free_ref(session, + ref_new[i]->page, ref_new[i], true); + } + } -err: /* - * A note on error handling: in the case of evicting a page that has - * unresolved changes, we just instantiated some in-memory pages that - * reflect those unresolved changes. 
The problem is those pages - * reference the same WT_UPDATE chains as the page we're splitting, - * that is, we simply copied references into the new pages. If the - * split fails, the original page is fine, but discarding the created - * page would free those update chains, and that's wrong. There isn't - * an easy solution, there's a lot of small memory allocations in some - * common code paths, and unwinding those changes will be difficult. - * For now, leak the memory by not discarding the instantiated pages. - */ - for (i = 0; i < new_entries; ++i) - __wt_free_ref(session, page, ref_new[i], false); __wt_free(session, ref_new); return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index 6e1d182ed0b..8e0f4036b79 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -94,6 +94,9 @@ __wt_tree_walk(WT_SESSION_IMPL *session, */ WT_ENTER_PAGE_INDEX(session); + /* Walk should never instantiate deleted pages. */ + LF_SET(WT_READ_NO_EMPTY); + /* * !!! * Fast-truncate currently only works on row-store trees. @@ -174,9 +177,10 @@ ascend: /* /* * If we got all the way through an internal page and - * all of the child pages were deleted, evict it. + * all of the child pages were deleted, mark it for + * eviction. */ - if (empty_internal) { + if (empty_internal && pindex->entries > 1) { __wt_page_evict_soon(ref->page); empty_internal = false; } @@ -257,7 +261,7 @@ ascend: /* * to delete it again. */ if (ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, ref)) + __wt_delete_page_skip(session, ref, false)) break; /* * If deleting a range, try to delete the page @@ -294,7 +298,7 @@ ascend: /* * Try to skip deleted pages visible to us. 
*/ if (ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, ref)) + __wt_delete_page_skip(session, ref, false)) break; } @@ -302,7 +306,7 @@ ascend: /* /* * Not-found is an expected return when only walking - * in-cache pages. + * in-cache pages, or if we see a deleted page. */ if (ret == WT_NOTFOUND) { ret = 0; diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index b99c93d319a..87929d8a457 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -218,10 +218,11 @@ restart: page = current->page; } /* - * Binary search of the internal page. There are two versions - * (a default loop and an application-specified collation loop), - * because moving the collation test and error handling inside - * the loop costs about 5%. + * Binary search of an internal page. There are three versions + * (keys with no application-specified collation order, in long + * and short versions, and keys with an application-specified + * collation order), because doing the tests and error handling + * inside the loop costs about 5%. * * The 0th key on an internal page is a problem for a couple of * reasons. First, we have to force the 0th key to sort less @@ -236,7 +237,22 @@ restart: page = current->page; */ base = 1; limit = pindex->entries - 1; - if (collator == NULL) + if (collator == NULL && + srch_key->size <= WT_COMPARE_SHORT_MAXLEN) + for (; limit != 0; limit >>= 1) { + indx = base + (limit >> 1); + descent = pindex->index[indx]; + __wt_ref_key( + page, descent, &item->data, &item->size); + + cmp = __wt_lex_compare_short(srch_key, item); + if (cmp > 0) { + base = indx + 1; + --limit; + } else if (cmp == 0) + goto descend; + } + else if (collator == NULL) for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); descent = pindex->index[indx]; @@ -356,13 +372,28 @@ leaf_only: } /* - * Binary search of the leaf page. 
There are two versions (a default - * loop and an application-specified collation loop), because moving - * the collation test and error handling inside the loop costs about 5%. + * Binary search of a leaf page. There are three versions (keys with + * no application-specified collation order, in long and short versions, + * and keys with an application-specified collation order), because + * doing the tests and error handling inside the loop costs about 5%. */ base = 0; limit = page->pg_row_entries; - if (collator == NULL) + if (collator == NULL && srch_key->size <= WT_COMPARE_SHORT_MAXLEN) + for (; limit != 0; limit >>= 1) { + indx = base + (limit >> 1); + rip = page->pg_row_d + indx; + WT_ERR( + __wt_row_leaf_key(session, page, rip, item, true)); + + cmp = __wt_lex_compare_short(srch_key, item); + if (cmp > 0) { + base = indx + 1; + --limit; + } else if (cmp == 0) + goto leaf_match; + } + else if (collator == NULL) for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); rip = page->pg_row_d + indx; diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 419f4124133..c8aca15d103 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -519,6 +519,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { NULL, NULL, confchk_wiredtiger_open_file_manager_subconfigs, 3 }, { "hazard_max", "int", NULL, "min=15", NULL, 0 }, + { "in_memory", "boolean", NULL, NULL, NULL, 0 }, { "log", "category", NULL, NULL, confchk_wiredtiger_open_log_subconfigs, 8 }, @@ -594,6 +595,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { NULL, NULL, confchk_wiredtiger_open_file_manager_subconfigs, 3 }, { "hazard_max", "int", NULL, "min=15", NULL, 0 }, + { "in_memory", "boolean", NULL, NULL, NULL, 0 }, { "log", "category", NULL, NULL, confchk_wiredtiger_open_log_subconfigs, 8 }, @@ -974,8 +976,8 @@ static const WT_CONFIG_ENTRY
config_entries[] = { "eviction_target=80,eviction_trigger=95,exclusive=0,extensions=," "file_extend=,file_manager=(close_handle_minimum=250," "close_idle_time=30,close_scan_interval=10),hazard_max=1000," - "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," - "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," + "in_memory=0,log=(archive=,compressor=,enabled=0,file_max=100MB," + "path=,prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" ",name=,quota=0,reserve=0,size=500MB),statistics=none," @@ -983,7 +985,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "transaction_sync=(enabled=0,method=fsync),use_environment_priv=0" ",verbose=", - confchk_wiredtiger_open, 34 + confchk_wiredtiger_open, 35 }, { "wiredtiger_open_all", "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," @@ -995,8 +997,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { "eviction_target=80,eviction_trigger=95,exclusive=0,extensions=," "file_extend=,file_manager=(close_handle_minimum=250," "close_idle_time=30,close_scan_interval=10),hazard_max=1000," - "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," - "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," + "in_memory=0,log=(archive=,compressor=,enabled=0,file_max=100MB," + "path=,prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" ",name=,quota=0,reserve=0,size=500MB),statistics=none," @@ -1004,7 +1006,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "transaction_sync=(enabled=0,method=fsync),use_environment_priv=0" ",verbose=,version=(major=0,minor=0)", - confchk_wiredtiger_open_all, 35 + confchk_wiredtiger_open_all, 36 }, { 
"wiredtiger_open_basecfg", "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c b/src/third_party/wiredtiger/src/conn/api_strerror.c index 92f12402537..edb11957556 100644 --- a/src/third_party/wiredtiger/src/conn/api_strerror.c +++ b/src/third_party/wiredtiger/src/conn/api_strerror.c @@ -38,6 +38,8 @@ __wt_wiredtiger_error(int error) return ("WT_RESTART: restart the operation (internal)"); case WT_RUN_RECOVERY: return ("WT_RUN_RECOVERY: recovery must be run to continue"); + case WT_CACHE_FULL: + return ("WT_CACHE_FULL: operation would overflow cache"); } /* diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index b50ad750158..d86b02287f0 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1726,6 +1726,7 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[]) "create=," "encryption=(secretkey=)," "exclusive=," + "in_memory=," "log=(recover=)," "use_environment_priv=," "verbose=,", &base_config)); @@ -1798,7 +1799,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_DECL_RET; const WT_NAME_FLAG *ft; WT_SESSION_IMPL *session; - int64_t config_base_set; + bool config_base_set; const char *enc_cfg[] = { NULL, NULL }; char version[64]; @@ -1842,7 +1843,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, /* Capture the config_base setting file for later use. */ WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval)); - config_base_set = cval.val; + config_base_set = cval.val != 0; /* Configure error messages so we get them right early. 
*/ WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval)); @@ -1850,6 +1851,13 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_strndup( session, cval.str, cval.len, &conn->error_prefix)); + /* + * XXX ideally, we would check "in_memory" here, so we could completely + * avoid having a database directory. However, it can be convenient to + * pass "in_memory" via the WIREDTIGER_CONFIG environment variable, and + * we haven't read it yet. + */ + /* Get the database home. */ WT_ERR(__conn_home(session, home, cfg)); @@ -1883,7 +1891,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, __conn_config_append(cfg, version); /* Ignore the base_config file if we config_base set to false. */ - if (config_base_set != 0) + if (config_base_set) WT_ERR( __conn_config_file(session, WT_BASECONFIG, false, cfg, i1)); __conn_config_append(cfg, config); @@ -1921,6 +1929,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_config_gets(session, cfg, "session_scratch_max", &cval)); conn->session_scratch_max = (size_t)cval.val; + WT_ERR(__wt_config_gets(session, cfg, "in_memory", &cval)); + if (cval.val != 0) + F_SET(conn, WT_CONN_IN_MEMORY); + WT_ERR(__wt_config_gets(session, cfg, "checkpoint_sync", &cval)); if (cval.val) F_SET(conn, WT_CONN_CKPT_SYNC); diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c index caf0c3b68f0..8f039e61654 100644 --- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c +++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c @@ -32,8 +32,19 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp) */ WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval)); conn->ckpt_usecs = (uint64_t)cval.val * 1000000; + WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval)); conn->ckpt_logsize = (wt_off_t)cval.val; + + /* Checkpoints are incompatible with in-memory configuration */ + if 
(conn->ckpt_usecs != 0 || conn->ckpt_logsize != 0) { + WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval)); + if (cval.val != 0) + WT_RET_MSG(session, EINVAL, + "In memory configuration incompatible with " + "checkpoints"); + } + __wt_log_written_reset(session); if ((conn->ckpt_usecs == 0 && conn->ckpt_logsize == 0) || (conn->ckpt_logsize && conn->ckpt_usecs == 0 && diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 0b364b5fd4b..c6d5b535b86 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -194,7 +194,8 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) if (force && (btree->bm == NULL || btree->bm->map == NULL)) { WT_ERR(__conn_dhandle_mark_dead(session)); marked_dead = true; - } else + } + if (!marked_dead || final) WT_ERR(__wt_checkpoint_close(session, final)); } @@ -695,7 +696,7 @@ restart: WT_WITH_DHANDLE(session, dhandle, WT_TRET(__wt_conn_dhandle_discard_single( - session, true, false))); + session, true, F_ISSET(conn, WT_CONN_IN_MEMORY)))); goto restart; } @@ -712,7 +713,7 @@ restart: while ((dhandle = TAILQ_FIRST(&conn->dhqh)) != NULL) WT_WITH_DHANDLE(session, dhandle, WT_TRET(__wt_conn_dhandle_discard_single( - session, true, false))); + session, true, F_ISSET(conn, WT_CONN_IN_MEMORY)))); return (ret); } diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c index e10f2a8c968..cc4e3ae2681 100644 --- a/src/third_party/wiredtiger/src/conn/conn_handle.c +++ b/src/third_party/wiredtiger/src/conn/conn_handle.c @@ -59,6 +59,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation")); + WT_RET(__wt_spin_init(session, 
&conn->turtle_lock, "turtle file")); WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->page_lock); @@ -145,6 +146,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); __wt_spin_destroy(session, &conn->table_lock); + __wt_spin_destroy(session, &conn->turtle_lock); for (i = 0; i < WT_PAGE_LOCKS; ++i) __wt_spin_destroy(session, &conn->page_lock[i]); __wt_free(session, conn->page_lock); diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 9068e7e85a2..527b756ee1a 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -47,9 +47,13 @@ __logmgr_config( { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; + bool enabled; conn = S2C(session); + WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); + enabled = cval.val != 0; + /* * If we're reconfiguring, enabled must match the already * existing setting. @@ -57,14 +61,21 @@ __logmgr_config( * If it is off and the user it turning it on, or it is on * and the user is turning it off, return an error. 
*/ - WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); if (reconfig && - ((cval.val != 0 && - !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) || - (cval.val == 0 && - FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)))) + ((enabled && !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) || + (!enabled && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)))) return (EINVAL); - *runp = cval.val != 0; + + /* Logging is incompatible with in-memory */ + if (enabled) { + WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval)); + if (cval.val != 0) + WT_RET_MSG(session, EINVAL, + "In memory configuration incompatible with " + "log=(enabled=true)"); + } + + *runp = enabled; /* * Setup a log path and compression even if logging is disabled in case @@ -379,9 +390,16 @@ __log_file_server(void *arg) * to move the sync_lsn into the next file for * later syncs. */ + WT_ERR(__wt_fsync(session, close_fh)); + /* + * We want to make sure the file size reflects + * actual data and has minimal pre-allocated + * zeroed space. 
+ */ + WT_ERR(__wt_ftruncate( + session, close_fh, close_end_lsn.offset)); close_end_lsn.file++; close_end_lsn.offset = 0; - WT_ERR(__wt_fsync(session, close_fh)); __wt_spin_lock(session, &log->log_sync_lock); locked = true; WT_ERR(__wt_close(session, &close_fh)); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index 23846f978fe..a8620ebaa99 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -334,6 +334,15 @@ __wt_sweep_config(WT_SESSION_IMPL *session, const char *cfg[]) cfg, "file_manager.close_idle_time", &cval)); conn->sweep_idle_time = (time_t)cval.val; + /* Non-zero sweep idle time is incompatible with in-memory */ + if (conn->sweep_idle_time != 0) { + WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval)); + if (cval.val != 0) + WT_RET_MSG(session, EINVAL, + "In memory configuration incompatible with " + "non zero file_manager=(close_idle_time)"); + } + WT_RET(__wt_config_gets(session, cfg, "file_manager.close_scan_interval", &cval)); conn->sweep_interval = (time_t)cval.val; diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c index 8ee57d24413..ccc19717612 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_ds.c +++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c @@ -384,7 +384,7 @@ __curds_remove(WT_CURSOR *cursor) source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; - CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL); + CURSOR_REMOVE_API_CALL(cursor, session, NULL); WT_STAT_FAST_CONN_INCR(session, cursor_remove); WT_STAT_FAST_DATA_INCR(session, cursor_remove); diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index 65f5dafc344..1db819b8b40 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -323,7 +323,7 @@ 
__curfile_remove(WT_CURSOR *cursor) WT_SESSION_IMPL *session; cbt = (WT_CURSOR_BTREE *)cursor; - CURSOR_UPDATE_API_CALL(cursor, session, remove, cbt->btree); + CURSOR_REMOVE_API_CALL(cursor, session, cbt->btree); WT_CURSOR_NEEDKEY(cursor); WT_CURSOR_NOVALUE(cursor); @@ -495,24 +495,30 @@ __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, bitmap = bulk = false; flags = 0; - WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval)); - if (cval.type == WT_CONFIG_ITEM_BOOL || - (cval.type == WT_CONFIG_ITEM_NUM && - (cval.val == 0 || cval.val == 1))) { - bitmap = false; - bulk = cval.val != 0; - } else if (WT_STRING_MATCH("bitmap", cval.str, cval.len)) - bitmap = bulk = true; - /* - * Unordered bulk insert is a special case used internally by - * index creation on existing tables. It doesn't enforce - * any special semantics at the file level. It primarily - * exists to avoid some locking problems with LSM trees and - * index creation. - */ - else if (!WT_STRING_MATCH("unordered", cval.str, cval.len)) - WT_RET_MSG(session, EINVAL, - "Value for 'bulk' must be a boolean or 'bitmap'"); + /* + * Decode the bulk configuration settings. In memory databases + * ignore bulk load. + */ + if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) { + WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval)); + if (cval.type == WT_CONFIG_ITEM_BOOL || + (cval.type == WT_CONFIG_ITEM_NUM && + (cval.val == 0 || cval.val == 1))) { + bitmap = false; + bulk = cval.val != 0; + } else if (WT_STRING_MATCH("bitmap", cval.str, cval.len)) + bitmap = bulk = true; + /* + * Unordered bulk insert is a special case used + * internally by index creation on existing tables. It + * doesn't enforce any special semantics at the file + * level. It primarily exists to avoid some locking + * problems between LSM and index creation. 
+ */ + else if (!WT_STRING_MATCH("unordered", cval.str, cval.len)) + WT_RET_MSG(session, EINVAL, + "Value for 'bulk' must be a boolean or 'bitmap'"); + } /* Bulk handles require exclusive access. */ if (bulk) diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c index 460c46c0d29..55da93859a6 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_metadata.c +++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c @@ -171,7 +171,15 @@ __curmetadata_next(WT_CURSOR *cursor) if (!F_ISSET(mdc, WT_MDC_POSITIONED)) WT_ERR(__curmetadata_metadata_search(session, cursor)); else { - WT_ERR(file_cursor->next(mdc->file_cursor)); + /* + * When applications open metadata cursors, they expect to see + * all schema-level operations reflected in the results. Query + * at read-uncommitted to avoid confusion caused by the current + * transaction state. + */ + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = file_cursor->next(mdc->file_cursor)); + WT_ERR(ret); WT_ERR(__curmetadata_setkv(mdc, file_cursor)); } @@ -204,7 +212,8 @@ __curmetadata_prev(WT_CURSOR *cursor) goto err; } - ret = file_cursor->prev(file_cursor); + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = file_cursor->prev(file_cursor)); if (ret == 0) WT_ERR(__curmetadata_setkv(mdc, file_cursor)); else if (ret == WT_NOTFOUND) @@ -264,7 +273,9 @@ __curmetadata_search(WT_CURSOR *cursor) if (WT_KEY_IS_METADATA(&cursor->key)) WT_ERR(__curmetadata_metadata_search(session, cursor)); else { - WT_ERR(file_cursor->search(file_cursor)); + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = file_cursor->search(file_cursor)); + WT_ERR(ret); WT_ERR(__curmetadata_setkv(mdc, file_cursor)); } @@ -298,7 +309,9 @@ __curmetadata_search_near(WT_CURSOR *cursor, int *exact) WT_ERR(__curmetadata_metadata_search(session, cursor)); *exact = 1; } else { - WT_ERR(file_cursor->search_near(file_cursor, exact)); + WT_WITH_TXN_ISOLATION(session, 
WT_ISO_READ_UNCOMMITTED, + ret = file_cursor->search_near(file_cursor, exact)); + WT_ERR(ret); WT_ERR(__curmetadata_setkv(mdc, file_cursor)); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index 01d1fdd1886..38359236b27 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -610,7 +610,7 @@ __curtable_remove(WT_CURSOR *cursor) WT_SESSION_IMPL *session; ctable = (WT_CURSOR_TABLE *)cursor; - CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL); + CURSOR_REMOVE_API_CALL(cursor, session, NULL); WT_ERR(__curtable_open_indices(ctable)); /* Find the old record so it can be removed from indices */ diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index f9171900ca4..c28b89b81ce 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -158,6 +158,9 @@ __evict_server(void *arg) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; +#ifdef HAVE_DIAGNOSTIC + struct timespec now, stuck_ts; +#endif u_int spins; session = arg; @@ -200,6 +203,20 @@ __evict_server(void *arg) /* Next time we wake up, reverse the sweep direction. */ cache->flags ^= WT_CACHE_WALK_REVERSE; +#ifdef HAVE_DIAGNOSTIC + stuck_ts.tv_sec = 0; + } else if (stuck_ts.tv_sec == 0) + WT_ERR(__wt_epoch(session, &stuck_ts)); + else { + /* After being stuck for 5 minutes, give up. */ + WT_ERR(__wt_epoch(session, &now)); + if (WT_TIMEDIFF(now, stuck_ts) / WT_BILLION > 300) { + __wt_errx(session, + "Cache stuck for too long, giving up"); + (void)__wt_cache_dump(session, NULL); + WT_ERR(ETIMEDOUT); + } +#endif } WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping")); @@ -1210,8 +1227,9 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) if (__wt_page_is_empty(page)) goto fast; - /* Optionally ignore clean pages. 
*/ - if (!modified && FLD_ISSET(cache->state, WT_EVICT_PASS_DIRTY)) + /* Skip clean pages if appropriate. */ + if (!modified && (F_ISSET(conn, WT_CONN_IN_MEMORY) || + FLD_ISSET(cache->state, WT_EVICT_PASS_DIRTY))) continue; /* @@ -1560,11 +1578,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) #ifdef HAVE_DIAGNOSTIC /* * __wt_cache_dump -- - * Dump debugging information to stdout about the size of the files in the - * cache. - * - * NOTE: this function is not called anywhere, it is intended to be called - * from a debugger. + * Dump debugging information to a file (default stderr) about the size of + * the files in the cache. */ int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) @@ -1574,60 +1589,95 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) WT_DATA_HANDLE *dhandle, *saved_dhandle; WT_PAGE *page; WT_REF *next_walk; - uint64_t file_intl_pages, file_leaf_pages; - uint64_t file_bytes, file_dirty, total_bytes; + uint64_t dirty_bytes, dirty_pages, intl_bytes, intl_pages; + uint64_t leaf_bytes, leaf_pages; + uint64_t max_dirty_bytes, max_intl_bytes, max_leaf_bytes, total_bytes; + size_t size; conn = S2C(session); total_bytes = 0; if (ofile == NULL) - fp = stdout; + fp = stderr; else WT_RET(__wt_fopen(session, ofile, WT_FHANDLE_WRITE, 0, &fp)); + /* Note: odd string concatenation avoids spelling errors. 
*/ + (void)__wt_fprintf(fp, "==========\n" "cache dump\n"); + saved_dhandle = session->dhandle; TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; - file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0; + dirty_bytes = dirty_pages = intl_bytes = intl_pages = 0; + leaf_bytes = leaf_pages = 0; + max_dirty_bytes = max_intl_bytes = max_leaf_bytes = 0; + next_walk = NULL; session->dhandle = dhandle; while (__wt_tree_walk(session, &next_walk, NULL, WT_READ_CACHE | WT_READ_NO_WAIT) == 0 && next_walk != NULL) { page = next_walk->page; - if (WT_PAGE_IS_INTERNAL(page)) - ++file_intl_pages; - else - ++file_leaf_pages; - file_bytes += page->memory_footprint; - if (__wt_page_is_modified(page)) - file_dirty += page->memory_footprint; - (void)__wt_fprintf(fp, - "%" WT_SIZET_FMT ", ", page->memory_footprint); + size = page->memory_footprint; + + if (WT_PAGE_IS_INTERNAL(page)) { + ++intl_pages; + intl_bytes += size; + max_intl_bytes = WT_MAX(max_intl_bytes, size); + } else { + ++leaf_pages; + leaf_bytes += size; + max_leaf_bytes = WT_MAX(max_leaf_bytes, size); + } + if (__wt_page_is_modified(page)) { + ++dirty_pages; + dirty_bytes += size; + max_dirty_bytes = + WT_MAX(max_dirty_bytes, size); + } } session->dhandle = NULL; - (void)__wt_fprintf(fp, "\n" "cache dump: %s%s%s%s\n\t" - " %" PRIu64 " internal pages, %" PRIu64 " leaf pages," - " %" PRIu64 "MB, %" PRIu64 "MB dirty\n==============\n", - dhandle->name, - dhandle->checkpoint == NULL ? "" : " [", - dhandle->checkpoint == NULL ? "" : dhandle->checkpoint, - dhandle->checkpoint == NULL ? 
"" : "]", - file_intl_pages, file_leaf_pages, - file_bytes >> 20, file_dirty >> 20); - - total_bytes += file_bytes; + if (dhandle->checkpoint == NULL) + (void)__wt_fprintf(fp, "%s(): \n", dhandle->name); + else + (void)__wt_fprintf(fp, "%s(checkpoint=%s): \n", + dhandle->name, dhandle->checkpoint); + if (intl_pages != 0) + (void)__wt_fprintf(fp, "\t" "internal pages: " + "%" PRIu64 " pages, %" PRIu64 + " max, %" PRIu64 "MB total\n", + intl_pages, max_intl_bytes, intl_bytes >> 20); + if (leaf_pages != 0) + (void)__wt_fprintf(fp, "\t" "leaf pages: " + "%" PRIu64 " pages, %" PRIu64 + " max, %" PRIu64 "MB total\n", + leaf_pages, max_leaf_bytes, leaf_bytes >> 20); + if (dirty_pages != 0) + (void)__wt_fprintf(fp, "\t" "dirty pages: " + "%" PRIu64 " pages, %" PRIu64 + " max, %" PRIu64 "MB total\n", + dirty_pages, max_dirty_bytes, dirty_bytes >> 20); + + total_bytes += intl_bytes + leaf_bytes; } session->dhandle = saved_dhandle; + /* + * Apply the overhead percentage so our total bytes are comparable with + * the tracked value. + */ + if (conn->cache->overhead_pct != 0) + total_bytes += + (total_bytes * (uint64_t)conn->cache->overhead_pct) / 100; (void)__wt_fprintf(fp, "cache dump: total found = %" PRIu64 "MB" " vs tracked inuse %" PRIu64 "MB\n", total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20); - if (fp != stdout) + (void)__wt_fprintf(fp, "==========\n"); + if (fp != stderr) WT_RET(__wt_fclose(&fp, WT_FHANDLE_WRITE)); return (0); } diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index fb42b928f28..7202da7927c 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -106,7 +106,8 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) conn->cache->evict_max_page_size = page->memory_footprint; /* Update the reference and discard the page. 
*/ - if (mod == NULL || mod->rec_result == 0) { + if ((mod == NULL || mod->rec_result == 0) && + !F_ISSET(conn, WT_CONN_IN_MEMORY)) { if (__wt_ref_is_root(ref)) __wt_ref_out(session, ref); else @@ -142,6 +143,50 @@ done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) && return (ret); } +/* + * __evict_delete_ref -- + * Mark a page reference deleted and check if the parent can reverse + * split. + */ +static int +__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) +{ + WT_DECL_RET; + WT_PAGE *parent; + WT_PAGE_INDEX *pindex; + uint32_t ndeleted; + + if (__wt_ref_is_root(ref)) + return (0); + + /* + * Avoid doing reverse splits when closing the file, it is + * wasted work and some structure may already have been freed. + */ + if (!closing) { + parent = ref->home; + WT_INTL_INDEX_GET(session, parent, pindex); + ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1); + + /* + * If more than 10% of the parent references are deleted, try a + * reverse split. Don't bother if there is a single deleted + * reference: the internal page is empty and we have to wait + * for eviction to notice. + * + * This will consume the deleted ref (and eventually free it). + * If the reverse split can't get the access it needs because + * something is busy, be sure that the page still ends up + * marked deleted. + */ + if (ndeleted > pindex->entries / 10 && pindex->entries > 1 && + (ret = __wt_split_reverse(session, ref)) != EBUSY) + return (ret); + } + + WT_PUBLISH(ref->state, WT_REF_DELETED); + return (0); +} /* * __wt_evict_page_clean_update -- @@ -151,6 +196,8 @@ int __wt_evict_page_clean_update( WT_SESSION_IMPL *session, WT_REF *ref, bool closing) { + WT_DECL_RET; + /* * If doing normal system eviction, but only in the service of reducing * the number of dirty pages, leave the clean page in cache. @@ -164,8 +211,12 @@ __wt_evict_page_clean_update( * page re-instantiated (for example, by searching) and never written. 
*/ __wt_ref_out(session, ref); - WT_PUBLISH(ref->state, - ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK); + if (ref->addr == NULL) { + WT_WITH_PAGE_INDEX(session, + ret = __evict_delete_ref(session, ref, closing)); + WT_RET_BUSY_OK(ret); + } else + WT_PUBLISH(ref->state, WT_REF_DISK); return (0); } @@ -178,6 +229,7 @@ static int __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) { WT_ADDR *addr; + WT_DECL_RET; WT_PAGE *parent; WT_PAGE_MODIFY *mod; @@ -206,14 +258,31 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) */ __wt_ref_out(session, ref); ref->addr = NULL; - WT_PUBLISH(ref->state, WT_REF_DELETED); + WT_WITH_PAGE_INDEX(session, + ret = __evict_delete_ref(session, ref, closing)); + WT_RET_BUSY_OK(ret); break; case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ /* - * A real split where we reconciled a page and it turned into a - * lot of pages. + * Either a split where we reconciled a page and it turned into + * a lot of pages or an in-memory page that got too large, we + * forcibly evicted it, and there wasn't anything to write. + * + * The latter is a special case of forced eviction. Imagine a + * thread updating a small set keys on a leaf page. The page + * is too large or has too many deleted items, so we try and + * evict it, but after reconciliation there's only a small + * amount of live data (so it's a single page we can't split), + * and if there's an older reader somewhere, there's data on + * the page we can't write (so the page can't be evicted). In + * that case, we end up here with a single block that we can't + * write. Take advantage of the fact we have exclusive access + * to the page and rewrite it in memory. 
*/ - WT_RET(__wt_split_multi(session, ref, closing)); + if (mod->mod_multi_entries == 1) + WT_RET(__wt_split_rewrite(session, ref)); + else + WT_RET(__wt_split_multi(session, ref, closing)); break; case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ /* @@ -248,20 +317,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) ref->addr = addr; WT_PUBLISH(ref->state, WT_REF_DISK); break; - case WT_PM_REC_REWRITE: - /* - * An in-memory page that got too large, we forcibly evicted - * it, and there wasn't anything to write. (Imagine two threads - * updating a small set keys on a leaf page. The page is too - * large so we try to evict it, but after reconciliation - * there's only a small amount of data (so it's a single page - * we can't split), and because there are two threads, there's - * some data we can't write (so we can't evict it). In that - * case, we take advantage of the fact we have exclusive access - * to the page and rewrite it in memory.) - */ - WT_RET(__wt_split_rewrite(session, ref)); - break; WT_ILLEGAL_VALUE(session); } @@ -302,6 +357,7 @@ __evict_review( WT_DECL_RET; WT_PAGE *page; uint32_t flags; + bool modified; /* * Get exclusive access to the page if our caller doesn't have the tree @@ -322,6 +378,14 @@ __evict_review( /* Now that we have exclusive access, review the page. */ page = ref->page; + modified = __wt_page_is_modified(page); + + /* + * Clean pages can't be evicted when running in memory only. This + * should be uncommon - we don't add clean pages to the queue. + */ + if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && !modified && !closing) + return (EBUSY); /* * Fail if an internal has active children, the children must be evicted @@ -341,7 +405,7 @@ __evict_review( * Update the oldest ID to avoid wasted effort should it have * fallen behind current. 
*/ - if (__wt_page_is_modified(page)) + if (modified) __wt_txn_update_oldest(session, true); if (!__wt_page_can_evict(session, ref, false, inmem_splitp)) @@ -359,7 +423,7 @@ __evict_review( } /* If the page is clean, we're done and we can evict. */ - if (!__wt_page_is_modified(page)) + if (!modified) return (0); /* @@ -389,7 +453,9 @@ __evict_review( if (closing) LF_SET(WT_VISIBILITY_ERR); else if (!WT_PAGE_IS_INTERNAL(page)) { - if (page->read_gen == WT_READGEN_OLDEST) + if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + LF_SET(WT_EVICT_IN_MEMORY | WT_EVICT_UPDATE_RESTORE); + else if (page->read_gen == WT_READGEN_OLDEST) LF_SET(WT_EVICT_UPDATE_RESTORE); else if (__wt_eviction_aggressive(session)) LF_SET(WT_EVICT_LOOKASIDE); @@ -399,15 +465,17 @@ __evict_review( /* * Success: assert the page is clean or reconciliation was configured - * for an update/restore split, and if the page is clean, reconciliation - * was configured for a lookaside table or all updates on the page are - * globally visible. + * for an update/restore split. If the page is clean, assert that + * reconciliation was configured for a lookaside table, or it's not a + * durable object (currently the lookaside table), or all page updates + * were globally visible. */ WT_ASSERT(session, LF_ISSET(WT_EVICT_UPDATE_RESTORE) || !__wt_page_is_modified(page)); WT_ASSERT(session, - LF_SET(WT_EVICT_LOOKASIDE) || __wt_page_is_modified(page) || + LF_ISSET(WT_EVICT_LOOKASIDE) || + F_ISSET(S2BT(session), WT_BTREE_LOOKASIDE) || __wt_txn_visible_all(session, page->modify->rec_max_txn)); return (0); diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index 8679b9510a8..74c58845c43 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -116,10 +116,17 @@ API_CALL_NOCONF(s, WT_CURSOR, n, cur, \ ((bt) == NULL) ? 
NULL : ((WT_BTREE *)(bt))->dhandle) +#define CURSOR_REMOVE_API_CALL(cur, s, bt) \ + (s) = (WT_SESSION_IMPL *)(cur)->session; \ + TXN_API_CALL_NOCONF(s, WT_CURSOR, remove, cur, \ + ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); + #define CURSOR_UPDATE_API_CALL(cur, s, n, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ TXN_API_CALL_NOCONF(s, WT_CURSOR, n, cur, \ - ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle) + ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); \ + if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && __wt_cache_full(s)) \ + WT_ERR(WT_CACHE_FULL); #define CURSOR_UPDATE_API_END(s, ret) \ TXN_API_END(s, ret) diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h index aa141e1df71..4bff6c82783 100644 --- a/src/third_party/wiredtiger/src/include/block.h +++ b/src/third_party/wiredtiger/src/include/block.h @@ -238,6 +238,9 @@ struct __wt_block { */ WT_SPINLOCK live_lock; /* Live checkpoint lock */ WT_BLOCK_CKPT live; /* Live checkpoint */ +#ifdef HAVE_DIAGNOSTIC + bool live_open; /* Live system is open */ +#endif bool ckpt_inprogress;/* Live checkpoint in progress */ /* Compaction support */ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 41b2c98f9e8..02819237c13 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -261,7 +261,9 @@ struct __wt_page_modify { } key; /* - * Eviction, but block wasn't written: unresolved updates and + * Eviction, but the block wasn't written: either an in-memory + * configuration or unresolved updates prevented the write. + * There may be a list of unresolved updates, there's always an * associated disk image. 
* * Saved updates are either a WT_INSERT, or a row-store leaf @@ -274,7 +276,7 @@ struct __wt_page_modify { uint64_t onpage_txn; } *supd; uint32_t supd_entries; - void *supd_dsk; + void *disk_image; /* * Block was written: address, size and checksum. @@ -386,7 +388,6 @@ struct __wt_page_modify { #define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */ #define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */ #define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */ -#define WT_PM_REC_REWRITE 4 /* Reconciliation: rewrite in place */ uint8_t rec_result; /* Reconciliation state */ }; @@ -433,6 +434,7 @@ struct __wt_page { struct __wt_page_index { uint32_t entries; + uint32_t deleted_entries; WT_REF **index; } * volatile __index; /* Collated children */ @@ -579,8 +581,17 @@ struct __wt_page { #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ #define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */ #define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ +#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ + uint8_t unused[2]; /* Unused padding */ + + /* + * Used to protect and co-ordinate splits for internal pages and + * reconciliation for all pages. + */ + WT_FAIR_LOCK page_lock; + /* * The page's read generation acts as an LRU value for each page in the * tree; it is used by the eviction server thread to select pages to be @@ -602,12 +613,6 @@ struct __wt_page { #define WT_READGEN_STEP 100 uint64_t read_gen; - /* - * Used to protect and co-ordinate splits for internal pages and - * reconciliation for all pages. - */ - WT_FAIR_LOCK page_lock; - size_t memory_footprint; /* Memory attached to the page */ /* Page's on-disk representation: NULL for pages created in memory. 
*/ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 14b5303cca9..23e212eb772 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1105,7 +1105,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, * internal pages not be evicted until all threads are known to have * exited the original page index array, because evicting an internal * page discards its WT_REF array, and a thread traversing the original - * page index array might see an freed WT_REF. During the split we set + * page index array might see a freed WT_REF. During the split we set * a transaction value, once that's globally visible, we know we can * evict the created page. */ @@ -1263,13 +1263,9 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held, #endif ); - /* An expected failure: WT_NOTFOUND when doing a cache-only read. */ - if (LF_ISSET(WT_READ_CACHE) && ret == WT_NOTFOUND) - return (WT_NOTFOUND); - - /* An expected failure: WT_RESTART */ - if (ret == WT_RESTART) - return (WT_RESTART); + /* Expected failures: page not found or restart. */ + if (ret == WT_NOTFOUND || ret == WT_RESTART) + return (ret); /* Discard the original held page. */ acquired = ret == 0; diff --git a/src/third_party/wiredtiger/src/include/btree_cmp.i b/src/third_party/wiredtiger/src/include/btree_cmp.i index 76f1ad4317a..8a7fe19a22f 100644 --- a/src/third_party/wiredtiger/src/include/btree_cmp.i +++ b/src/third_party/wiredtiger/src/include/btree_cmp.i @@ -188,3 +188,58 @@ __wt_compare_skip(WT_SESSION_IMPL *session, WT_COLLATOR *collator, return (collator->compare( collator, &session->iface, user_item, tree_item, cmpp)); } + +/* + * __wt_lex_compare_short -- + * Lexicographic comparison routine for short keys. 
+ * + * Returns: + * < 0 if user_item is lexicographically < tree_item + * = 0 if user_item is lexicographically = tree_item + * > 0 if user_item is lexicographically > tree_item + * + * We use the names "user" and "tree" so it's clear in the btree code which + * the application is looking at when we call its comparison function. + */ +static inline int +__wt_lex_compare_short(const WT_ITEM *user_item, const WT_ITEM *tree_item) +{ + size_t len, usz, tsz; + const uint8_t *userp, *treep; + + usz = user_item->size; + tsz = tree_item->size; + len = WT_MIN(usz, tsz); + + userp = user_item->data; + treep = tree_item->data; + + /* + * The maximum packed uint64_t is 9B, catch row-store objects using + * packed record numbers as keys. + */ +#define WT_COMPARE_SHORT_MAXLEN 9 +#undef WT_COMPARE_SHORT +#define WT_COMPARE_SHORT(n) \ + case n: \ + if (*userp != *treep) \ + break; \ + ++userp, ++treep + switch (len) { + WT_COMPARE_SHORT(9); + WT_COMPARE_SHORT(8); + WT_COMPARE_SHORT(7); + WT_COMPARE_SHORT(6); + WT_COMPARE_SHORT(5); + WT_COMPARE_SHORT(4); + WT_COMPARE_SHORT(3); + WT_COMPARE_SHORT(2); + case 1: + if (*userp != *treep) + break; + + /* Contents are equal up to the smallest length. */ + return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1); + } + return (*userp < *treep ? -1 : 1); +} diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index 475f4a86654..a95138c3f0f 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -192,6 +192,22 @@ __wt_eviction_needed(WT_SESSION_IMPL *session, u_int *pct_fullp) return (false); } +/* + * __wt_cache_full -- + * Return if the cache is at (or over) capacity. 
+ */ +static inline bool +__wt_cache_full(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_CACHE *cache; + + conn = S2C(session); + cache = conn->cache; + + return (__wt_cache_bytes_inuse(cache) >= conn->cache_size); +} + /* * __wt_cache_eviction_check -- * Evict pages if the cache crosses its boundaries. @@ -214,6 +230,10 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA)) return (0); + /* In memory configurations don't block when the cache is full. */ + if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + return (0); + /* * Threads operating on trees that cannot be evicted are ignored, * mostly because they're not contributing to the problem. diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 2dfb24a83da..03b8174b7e1 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -178,6 +178,7 @@ struct __wt_connection_impl { WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ WT_SPINLOCK table_lock; /* Table creation spinlock */ + WT_SPINLOCK turtle_lock; /* Turtle file spinlock */ /* * We distribute the btree page locks across a set of spin locks. 
Don't diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 1f63f07646e..3dd479acc0a 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -111,12 +111,12 @@ extern int __wt_debug_offset_blind( WT_SESSION_IMPL *session, wt_off_t offset, c extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile); extern int __wt_debug_disk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile); extern int __wt_debug_tree_shape( WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); -extern int __wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); -extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); +extern int __wt_debug_tree_all( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile); +extern int __wt_debug_tree( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile); extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp); extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref); -extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref); +extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all); extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); @@ -155,6 +155,7 @@ extern void __wt_split_stash_discard(WT_SESSION_IMPL *session); extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF 
**refp, size_t *incrp); extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst); @@ -446,7 +447,6 @@ extern int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key); extern int __wt_metadata_search( WT_SESSION_IMPL *session, const char *key, char **valuep); extern void __wt_meta_track_discard(WT_SESSION_IMPL *session); extern int __wt_meta_track_on(WT_SESSION_IMPL *session); -extern int __wt_meta_track_find_handle( WT_SESSION_IMPL *session, const char *name, const char *checkpoint); extern int __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll); extern int __wt_meta_track_sub_on(WT_SESSION_IMPL *session); extern int __wt_meta_track_sub_off(WT_SESSION_IMPL *session); @@ -566,9 +566,9 @@ extern int __wt_schema_colgroup_name(WT_SESSION_IMPL *session, WT_TABLE *table, extern int __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table); extern int __wt_schema_open_index(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp); extern int __wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table); -extern int __wt_schema_open_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, bool ok_incomplete, WT_TABLE **tablep); extern int __wt_schema_get_colgroup(WT_SESSION_IMPL *session, const char *uri, bool quiet, WT_TABLE **tablep, WT_COLGROUP **colgroupp); extern int __wt_schema_get_index(WT_SESSION_IMPL *session, const char *uri, bool quiet, WT_TABLE **tablep, WT_INDEX **indexp); +extern int __wt_schema_open_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, bool ok_incomplete, WT_TABLE **tablep); extern int __wt_schema_colcheck(WT_SESSION_IMPL 
*session, const char *key_format, const char *value_format, WT_CONFIG_ITEM *colconf, u_int *kcolsp, u_int *vcolsp); extern int __wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table); extern int __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, bool value_only, WT_ITEM *plan); @@ -591,6 +591,7 @@ extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[], uint32_t open_flags); extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers); extern int __wt_session_copy_values(WT_SESSION_IMPL *session); +extern int __wt_session_release_resources(WT_SESSION_IMPL *session); extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config); extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 24dccd30913..99b6f1c483f 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -6,21 +6,23 @@ #define WT_CONN_CKPT_SYNC 0x00000002 #define WT_CONN_CLOSING 0x00000004 #define WT_CONN_EVICTION_RUN 0x00000008 -#define WT_CONN_LAS_OPEN 0x00000010 -#define WT_CONN_LEAK_MEMORY 0x00000020 -#define WT_CONN_LOG_SERVER_RUN 0x00000040 -#define WT_CONN_LSM_MERGE 0x00000080 -#define WT_CONN_PANIC 0x00000100 -#define WT_CONN_SERVER_ASYNC 0x00000200 -#define WT_CONN_SERVER_CHECKPOINT 0x00000400 -#define WT_CONN_SERVER_LSM 0x00000800 -#define WT_CONN_SERVER_RUN 0x00001000 -#define WT_CONN_SERVER_STATISTICS 0x00002000 -#define 
WT_CONN_SERVER_SWEEP 0x00004000 -#define WT_CONN_WAS_BACKUP 0x00008000 +#define WT_CONN_IN_MEMORY 0x00000010 +#define WT_CONN_LAS_OPEN 0x00000020 +#define WT_CONN_LEAK_MEMORY 0x00000040 +#define WT_CONN_LOG_SERVER_RUN 0x00000080 +#define WT_CONN_LSM_MERGE 0x00000100 +#define WT_CONN_PANIC 0x00000200 +#define WT_CONN_SERVER_ASYNC 0x00000400 +#define WT_CONN_SERVER_CHECKPOINT 0x00000800 +#define WT_CONN_SERVER_LSM 0x00001000 +#define WT_CONN_SERVER_RUN 0x00002000 +#define WT_CONN_SERVER_STATISTICS 0x00004000 +#define WT_CONN_SERVER_SWEEP 0x00008000 +#define WT_CONN_WAS_BACKUP 0x00010000 #define WT_EVICTING 0x00000001 -#define WT_EVICT_LOOKASIDE 0x00000002 -#define WT_EVICT_UPDATE_RESTORE 0x00000004 +#define WT_EVICT_IN_MEMORY 0x00000002 +#define WT_EVICT_LOOKASIDE 0x00000004 +#define WT_EVICT_UPDATE_RESTORE 0x00000008 #define WT_FILE_TYPE_CHECKPOINT 0x00000001 #define WT_FILE_TYPE_DATA 0x00000002 #define WT_FILE_TYPE_DIRECTORY 0x00000004 @@ -36,13 +38,14 @@ #define WT_LOG_FSYNC 0x00000008 #define WT_READ_CACHE 0x00000001 #define WT_READ_COMPACT 0x00000002 -#define WT_READ_NO_EVICT 0x00000004 -#define WT_READ_NO_GEN 0x00000008 -#define WT_READ_NO_WAIT 0x00000010 -#define WT_READ_PREV 0x00000020 -#define WT_READ_SKIP_INTL 0x00000040 -#define WT_READ_TRUNCATE 0x00000080 -#define WT_READ_WONT_NEED 0x00000100 +#define WT_READ_NO_EMPTY 0x00000004 +#define WT_READ_NO_EVICT 0x00000008 +#define WT_READ_NO_GEN 0x00000010 +#define WT_READ_NO_WAIT 0x00000020 +#define WT_READ_PREV 0x00000040 +#define WT_READ_SKIP_INTL 0x00000080 +#define WT_READ_TRUNCATE 0x00000100 +#define WT_READ_WONT_NEED 0x00000200 #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_CLEAR_EVICT_WALK 0x00000002 #define WT_SESSION_INTERNAL 0x00000004 @@ -51,15 +54,16 @@ #define WT_SESSION_LOCKED_SCHEMA 0x00000020 #define WT_SESSION_LOCKED_SLOT 0x00000040 #define WT_SESSION_LOCKED_TABLE 0x00000080 -#define WT_SESSION_LOGGING_INMEM 0x00000100 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00000200 -#define 
WT_SESSION_NO_CACHE 0x00000400 -#define WT_SESSION_NO_DATA_HANDLES 0x00000800 -#define WT_SESSION_NO_EVICTION 0x00001000 -#define WT_SESSION_NO_LOGGING 0x00002000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00004000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00008000 -#define WT_SESSION_SERVER_ASYNC 0x00010000 +#define WT_SESSION_LOCKED_TURTLE 0x00000100 +#define WT_SESSION_LOGGING_INMEM 0x00000200 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00000400 +#define WT_SESSION_NO_CACHE 0x00000800 +#define WT_SESSION_NO_DATA_HANDLES 0x00001000 +#define WT_SESSION_NO_EVICTION 0x00002000 +#define WT_SESSION_NO_LOGGING 0x00004000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00008000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00010000 +#define WT_SESSION_SERVER_ASYNC 0x00020000 #define WT_SYNC_CHECKPOINT 0x00000001 #define WT_SYNC_CLOSE 0x00000002 #define WT_SYNC_DISCARD 0x00000004 @@ -93,7 +97,7 @@ #define WT_VERB_VERIFY 0x00200000 #define WT_VERB_VERSION 0x00400000 #define WT_VERB_WRITE 0x00800000 -#define WT_VISIBILITY_ERR 0x00000008 +#define WT_VISIBILITY_ERR 0x00000010 /* * flags section: END * DO NOT EDIT: automatically built by dist/flags.py. 
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index 06be95697c7..521de567fc0 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -152,8 +152,9 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_logslot { WT_ITEM slot_buf; /* Buffer for grouped writes */ #define WT_SLOT_CLOSEFH 0x01 /* Close old fh on release */ -#define WT_SLOT_SYNC 0x02 /* Needs sync on release */ -#define WT_SLOT_SYNC_DIR 0x04 /* Directory sync on release */ +#define WT_SLOT_FLUSH 0x02 /* Wait for write */ +#define WT_SLOT_SYNC 0x04 /* Needs sync on release */ +#define WT_SLOT_SYNC_DIR 0x08 /* Directory sync on release */ uint32_t flags; /* Flags */ }; diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h index 11cf8204aec..d15dab3aa45 100644 --- a/src/third_party/wiredtiger/src/include/lsm.h +++ b/src/third_party/wiredtiger/src/include/lsm.h @@ -96,6 +96,7 @@ struct __wt_lsm_chunk { int8_t empty; /* 1/0: checkpoint missing */ int8_t evicted; /* 1/0: in-memory chunk was evicted */ + uint8_t flushing; /* 1/0: chunk flush in progress */ #define WT_LSM_CHUNK_BLOOM 0x01 #define WT_LSM_CHUNK_MERGING 0x02 diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h index a5a303f1630..938101e9caa 100644 --- a/src/third_party/wiredtiger/src/include/meta.h +++ b/src/third_party/wiredtiger/src/include/meta.h @@ -39,6 +39,16 @@ #define WT_METADATA_VERSION "WiredTiger version" /* Version keys */ #define WT_METADATA_VERSION_STR "WiredTiger version string" +/* + * WT_WITH_TURTLE_LOCK -- + * Acquire the turtle file lock, perform an operation, drop the lock. 
+ */ +#define WT_WITH_TURTLE_LOCK(session, op) do { \ + WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_TURTLE));\ + WT_WITH_LOCK(session, \ + &S2C(session)->turtle_lock, WT_SESSION_LOCKED_TURTLE, op); \ +} while (0) + /* * WT_CKPT -- * Encapsulation of checkpoint information, shared by the metadata, the diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index ff2f6645e5a..eca77214b47 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -120,11 +120,11 @@ * hex constant might be a negative integer), and to ensure the hex constant is * the correct size before applying the bitwise not operator. */ -#define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask))) +#define FLD_CLR(field, mask) ((void)((field) &= ~(uint32_t)(mask))) #define FLD_MASK(field, mask) ((field) & (uint32_t)(mask)) #define FLD_ISSET(field, mask) (FLD_MASK(field, mask) != 0) #define FLD64_ISSET(field, mask) (((field) & (uint64_t)(mask)) != 0) -#define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask))) +#define FLD_SET(field, mask) ((void)((field) |= (uint32_t)(mask))) #define F_CLR(p, mask) FLD_CLR((p)->flags, mask) #define F_ISSET(p, mask) FLD_ISSET((p)->flags, mask) diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 7a07d16045d..f5a2c1c7dda 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -28,6 +28,32 @@ #define WT_SESSION_IS_CHECKPOINT(s) \ ((s)->id != 0 && (s)->id == S2C(s)->txn_global.checkpoint_id) +/* + * Perform an operation at the specified isolation level. + * + * This is fiddly: we can't cope with operations that begin transactions + * (leaving an ID allocated), and operations must not move our published + * snap_min forwards (or updates we need could be freed while this operation is + * in progress). 
Check for those cases: the bugs they cause are hard to debug. + */ +#define WT_WITH_TXN_ISOLATION(s, iso, op) do { \ + WT_TXN_ISOLATION saved_iso = (s)->isolation; \ + WT_TXN_ISOLATION saved_txn_iso = (s)->txn.isolation; \ + WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(s); \ + WT_TXN_STATE saved_state = *txn_state; \ + (s)->txn.forced_iso++; \ + (s)->isolation = (s)->txn.isolation = (iso); \ + op; \ + (s)->isolation = saved_iso; \ + (s)->txn.isolation = saved_txn_iso; \ + WT_ASSERT((s), (s)->txn.forced_iso > 0); \ + (s)->txn.forced_iso--; \ + WT_ASSERT((s), txn_state->id == saved_state.id && \ + (txn_state->snap_min == saved_state.snap_min || \ + saved_state.snap_min == WT_TXN_NONE)); \ + txn_state->snap_min = saved_state.snap_min; \ +} while (0) + struct __wt_named_snapshot { const char *name; @@ -129,6 +155,8 @@ struct __wt_txn { WT_TXN_ISOLATION isolation; + uint32_t forced_iso; /* Isolation is currently forced. */ + /* * Snapshot data: * ids < snap_min are visible, @@ -153,13 +181,13 @@ struct __wt_txn { /* Checkpoint status. */ WT_LSN ckpt_lsn; - bool full_ckpt; uint32_t ckpt_nsnapshot; WT_ITEM *ckpt_snapshot; + bool full_ckpt; #define WT_TXN_AUTOCOMMIT 0x01 #define WT_TXN_ERROR 0x02 -#define WT_TXN_HAS_ID 0x04 +#define WT_TXN_HAS_ID 0x04 #define WT_TXN_HAS_SNAPSHOT 0x08 #define WT_TXN_NAMED_SNAPSHOT 0x10 #define WT_TXN_READONLY 0x20 diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 54c30adae76..e49e3d1257b 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -187,18 +187,17 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) /* * Read-uncommitted transactions see all other changes. - * - * All metadata reads are at read-uncommitted isolation. 
That's - * because once a schema-level operation completes, subsequent - * operations must see the current version of checkpoint metadata, or - * they may try to read blocks that may have been freed from a file. - * Metadata updates use non-transactional techniques (such as the - * schema and metadata locks) to protect access to in-flight updates. */ - if (txn->isolation == WT_ISO_READ_UNCOMMITTED || - session->dhandle == session->meta_dhandle) + if (txn->isolation == WT_ISO_READ_UNCOMMITTED) return (true); + /* + * If we don't have a transactional snapshot, only make stable updates + * visible. + */ + if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) + return (__wt_txn_visible_all(session, id)); + /* Transactions see their own changes. */ if (id == txn->id) return (true); @@ -429,9 +428,15 @@ __wt_txn_read_last(WT_SESSION_IMPL *session) txn = &session->txn; - /* Release the snap_min ID we put in the global table. */ - if (!F_ISSET(txn, WT_TXN_RUNNING) || - txn->isolation != WT_ISO_SNAPSHOT) + /* + * Release the snap_min ID we put in the global table. + * + * If the isolation has been temporarily forced, don't touch the + * snapshot here: it will be restored by WT_WITH_TXN_ISOLATION. + */ + if ((!F_ISSET(txn, WT_TXN_RUNNING) || + txn->isolation != WT_ISO_SNAPSHOT) && + txn->forced_iso == 0) __wt_txn_release_snapshot(session); } @@ -451,28 +456,26 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) txn_state = WT_SESSION_TXN_STATE(session); /* - * If there is no transaction running (so we don't have an ID), and no - * snapshot allocated, put an ID in the global table to prevent any - * update that we are reading from being trimmed to save memory. Do a - * read before the write because this shared data is accessed a lot. + * We are about to read data, which means we need to protect against + * updates being freed from underneath this cursor. 
Read-uncommitted + * isolation protects values by putting a transaction ID in the global + * table to prevent any update that we are reading from being freed. + * Other isolation levels get a snapshot to protect their reads. * * !!! - * Note: We are updating the global table unprotected, so the - * oldest_id may move past this ID if a scan races with this - * value being published. That said, read-uncommitted operations - * always take the most recent version of a value, so for that version - * to be freed, two newer versions would have to be committed. Putting - * this snap_min ID in the table prevents the oldest ID from moving + * Note: We are updating the global table unprotected, so the global + * oldest_id may move past our snap_min if a scan races with this value + * being published. That said, read-uncommitted operations always see + * the most recent update for each record that has not been aborted + * regardless of the snap_min value published here. Even if there is a + * race while publishing this ID, it prevents the oldest ID from moving * further forward, so that once a read-uncommitted cursor is * positioned on a value, it can't be freed. 
*/ - if (txn->isolation == WT_ISO_READ_UNCOMMITTED && - !F_ISSET(txn, WT_TXN_HAS_ID) && - WT_TXNID_LT(txn_state->snap_min, txn_global->last_running)) - txn_state->snap_min = txn_global->last_running; - - if (txn->isolation != WT_ISO_READ_UNCOMMITTED && - !F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) + if (txn->isolation == WT_ISO_READ_UNCOMMITTED) { + if (txn_state->snap_min == WT_TXN_NONE) + txn_state->snap_min = txn_global->last_running; + } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) __wt_txn_get_snapshot(session); } diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index b7ebb8fbc14..037399625ea 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -2910,6 +2910,15 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); * if recovery is required to use the database. */ #define WT_RUN_RECOVERY -31806 +/*! @cond internal */ +/*! + * Operation would overflow cache. + * This error is generated when wiredtiger_open is configured to run in-memory, + * and an insert or update operation requires more than the configured cache + * size to complete. + */ +#define WT_CACHE_FULL -31807 +/*! @endcond */ /* * Error return section: END * DO NOT EDIT: automatically built by dist/api_err.py. diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index efe4d22eeca..44dc7dc30a7 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -551,7 +551,6 @@ __log_fill(WT_SESSION_IMPL *session, else /* * If this is a force or unbuffered write, write it now. - * A forced write sends in a temporary, local slot. 
*/ WT_ERR(__wt_write(session, myslot->slot->slot_fh, myslot->offset + myslot->slot->slot_start_offset, @@ -1173,87 +1172,60 @@ __wt_log_close(WT_SESSION_IMPL *session) } /* - * __log_filesize -- - * Returns an estimate of the real end of log file. + * __log_has_hole -- + * Determine if the current offset represents a hole in the log + * file (i.e. there is valid data somewhere after the hole), or + * if this is the end of this log file and the remainder of the + * file is zeroes. */ static int -__log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *eof) +__log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, bool *hole) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; - wt_off_t log_size, off, off1; - uint32_t allocsize, bufsz; + wt_off_t log_size, off, remainder; + size_t bufsz, rdlen; char *buf, *zerobuf; conn = S2C(session); log = conn->log; - if (eof == NULL) - return (0); - *eof = 0; - WT_RET(__wt_filesize(session, fh, &log_size)); - if (log == NULL) - allocsize = WT_LOG_ALIGN; - else - allocsize = log->allocsize; + log_size = fh->size; + remainder = log_size - offset; + *hole = false; /* * It can be very slow looking for the last real record in the log - * in very small chunks. Walk backward by a megabyte at a time. When - * we find a part of the log that is not just zeroes, walk to find - * the last record. + * in very small chunks. Walk a megabyte at a time. If we find a + * part of the log that is not just zeroes we know this log file + * has a hole in it. */ buf = zerobuf = NULL; - if (allocsize < WT_MEGABYTE && log_size > WT_MEGABYTE) + if (log == NULL || log->allocsize < WT_MEGABYTE) bufsz = WT_MEGABYTE; else - bufsz = allocsize; + bufsz = log->allocsize; + + if ((size_t)remainder < bufsz) + bufsz = (size_t)remainder; WT_RET(__wt_calloc_def(session, bufsz, &buf)); WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf)); /* - * Read in a chunk starting at the end of the file. 
Keep going until - * we reach the beginning or we find a chunk that contains any non-zero - * bytes. Compare against a known zero byte chunk. + * Read in a chunk starting at the given offset. + * Compare against a known zero byte chunk. */ - for (off = log_size - (wt_off_t)bufsz; - off >= 0; - off -= (wt_off_t)bufsz) { - WT_ERR(__wt_read(session, fh, off, bufsz, buf)); - if (memcmp(buf, zerobuf, bufsz) != 0) + for (off = offset; remainder > 0; + remainder -= (wt_off_t)rdlen, off += (wt_off_t)rdlen) { + rdlen = WT_MIN(bufsz, (size_t)remainder); + WT_ERR(__wt_read(session, fh, off, rdlen, buf)); + if (memcmp(buf, zerobuf, rdlen) != 0) { + *hole = true; break; + } } - /* - * If we're walking by large amounts, now walk by the real allocsize - * to find the real end, if we found something. Otherwise we reached - * the beginning of the file. Offset can go negative if the log file - * size is not a multiple of a megabyte. The first chunk of the log - * file will always be non-zero. - */ - if (off < 0) - off = 0; - - /* - * We know all log records are aligned at log->allocsize. The first - * item in a log record is always a 32-bit length. Look for any - * non-zero length at the allocsize boundary. This may not be a true - * log record since it could be the middle of a large record. But we - * know no log record starts after it. Return an estimate of the log - * file size. - */ - for (off1 = bufsz - allocsize; - off1 > 0; off1 -= (wt_off_t)allocsize) - if (memcmp(buf + off1, zerobuf, sizeof(uint32_t)) != 0) - break; - off = off + off1; - - /* - * Set EOF to the last zero-filled record we saw. - */ - *eof = off + (wt_off_t)allocsize; -err: - if (buf != NULL) +err: if (buf != NULL) __wt_free(session, buf); if (zerobuf != NULL) __wt_free(session, zerobuf); @@ -1310,7 +1282,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) * responsible for freeing the slot in that case. Otherwise the * worker thread will free it. 
*/ - if (!F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { + if (!F_ISSET(slot, WT_SLOT_FLUSH | WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { if (freep != NULL) *freep = 0; slot->slot_state = WT_LOG_SLOT_WRITTEN; @@ -1340,6 +1312,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) */ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); + WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond)); if (++yield_count < 1000) __wt_yield(); else @@ -1354,6 +1327,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) WT_ASSERT(session, slot != log->active_slot); WT_ERR(__wt_cond_signal(session, log->log_write_cond)); + F_CLR(slot, WT_SLOT_FLUSH); /* * Signal the close thread if needed. @@ -1543,7 +1517,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, } WT_ERR(__log_openfile( session, false, &log_fh, WT_LOG_FILENAME, start_lsn.file)); - WT_ERR(__log_filesize(session, log_fh, &log_size)); + WT_ERR(__wt_filesize(session, log_fh, &log_size)); rd_lsn = start_lsn; WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf)); @@ -1574,7 +1548,7 @@ advance: break; WT_ERR(__log_openfile(session, false, &log_fh, WT_LOG_FILENAME, rd_lsn.file)); - WT_ERR(__log_filesize(session, log_fh, &log_size)); + WT_ERR(__wt_filesize(session, log_fh, &log_size)); eol = false; continue; } @@ -1592,16 +1566,25 @@ advance: */ reclen = *(uint32_t *)buf->mem; /* - * Log files are pre-allocated. We never expect a zero length - * unless we've reached the end of the log. The log can be - * written out of order, so when recovery finds the end of - * the log, truncate the file and remove any later log files - * that may exist. + * Log files are pre-allocated. We need to detect the + * difference between a hole in the file (where this location + * would be considered the end of log) and the last record + * in the log and we're at the zeroed part of the file. 
+ * If we find a zeroed record, scan forward in the log looking + * for any data. If we detect any we have a hole and stop. + * Otherwise if the rest is all zeroes advance to the next file. + * When recovery finds the end of the log, truncate the file + * and remove any later log files that may exist. */ if (reclen == 0) { - /* This LSN is the end. */ - eol = true; - break; + WT_ERR(__log_has_hole( + session, log_fh, rd_lsn.offset, &eol)); + if (eol) + /* Found a hole. This LSN is the end. */ + break; + else + /* Last record in log. Look for more. */ + goto advance; } rdup_len = __wt_rduppo2(reclen, allocsize); if (reclen > allocsize) { diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index 7c541eb7bec..b3790412536 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -429,6 +429,8 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, WT_STAT_FAST_CONN_INCR(session, log_slot_joins); if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) F_SET(slot, WT_SLOT_SYNC_DIR); + if (LF_ISSET(WT_LOG_FLUSH)) + F_SET(slot, WT_SLOT_FLUSH); if (LF_ISSET(WT_LOG_FSYNC)) F_SET(slot, WT_SLOT_SYNC); if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) { diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index cca417a31fc..f988bfc97fd 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -1434,7 +1434,7 @@ __clsm_remove(WT_CURSOR *cursor) clsm = (WT_CURSOR_LSM *)cursor; - CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL); + CURSOR_REMOVE_API_CALL(cursor, session, NULL); WT_CURSOR_NEEDKEY(cursor); WT_CURSOR_NOVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, true)); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 7056c907f8e..4741cf52608 100644 --- 
a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -261,6 +261,9 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, { WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; + bool flush_set; + + flush_set = false; /* * If the chunk is already checkpointed, make sure it is also evicted. @@ -269,8 +272,9 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { - if ((ret = __lsm_discard_handle( - session, chunk->uri, NULL)) == 0) + WT_WITH_HANDLE_LIST_LOCK(session, + ret = __lsm_discard_handle(session, chunk->uri, NULL)); + if (ret == 0) chunk->evicted = 1; else if (ret == EBUSY) ret = 0; @@ -294,7 +298,11 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, return (0); } - WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s", + if (!__wt_atomic_cas8(&chunk->flushing, 0, 1)) + return (0); + flush_set = true; + + WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s", chunk->uri)); /* @@ -318,27 +326,31 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); } - WT_RET(ret); + WT_ERR(ret); - WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", + WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri)); - WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_worker(session, chunk->uri, - __wt_checkpoint, NULL, NULL, 0)); - + /* + * Turn on metadata tracking to ensure the checkpoint gets the + * necessary handle locks. 
+ */ + WT_ERR(__wt_meta_track_on(session)); + WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker( + session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)); + WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) - WT_RET_MSG(session, ret, "LSM checkpoint"); + WT_ERR_MSG(session, ret, "LSM checkpoint"); /* Now the file is written, get the chunk size. */ - WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk)); + WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* Update the flush timestamp to help track ongoing progress. */ - WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts)); + WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts)); ++lsm_tree->chunks_flushed; /* Lock the tree, mark the chunk as on disk and update the metadata. */ - WT_RET(__wt_lsm_tree_writelock(session, lsm_tree)); + WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); F_SET(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; @@ -346,9 +358,11 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree, true); WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); - if (ret != 0) - WT_RET_MSG(session, ret, "LSM metadata write"); + WT_ERR_MSG(session, ret, "LSM metadata write"); + + WT_PUBLISH(chunk->flushing, 0); + flush_set = false; /* * Clear the no-eviction flag so the primary can be evicted and @@ -356,24 +370,28 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, * otherwise, accessing the leaf page during the checkpoint can trigger * forced eviction. */ - WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); + WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, true); - WT_RET(__wt_session_release_btree(session)); + WT_ERR(__wt_session_release_btree(session)); /* Make sure we aren't pinning a transaction ID. 
*/ __wt_txn_release_snapshot(session); - WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s", + WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s", chunk->uri)); /* Schedule a bloom filter create for our newly flushed chunk. */ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF)) - WT_RET(__wt_lsm_manager_push_entry( + WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); else - WT_RET(__wt_lsm_manager_push_entry( + WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_MERGE, 0, lsm_tree)); - return (0); + +err: if (flush_set) + WT_PUBLISH(chunk->flushing, 0); + + return (ret); } /* @@ -487,7 +505,9 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) * * This will fail with EBUSY if the file is still in use. */ - WT_RET(__lsm_discard_handle(session, uri, WT_CHECKPOINT)); + WT_WITH_HANDLE_LIST_LOCK(session, + ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT)); + WT_RET(ret); /* * Take the schema lock for the drop operation. Since __wt_schema_drop diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c index 0bab52b9d9c..e7074a9c1b5 100644 --- a/src/third_party/wiredtiger/src/meta/meta_table.c +++ b/src/third_party/wiredtiger/src/meta/meta_table.c @@ -151,8 +151,11 @@ __wt_metadata_update( key, value, WT_META_TRACKING(session) ? "true" : "false", __metadata_turtle(key) ? "" : "not ")); - if (__metadata_turtle(key)) - return (__wt_turtle_update(session, key, value)); + if (__metadata_turtle(key)) { + WT_WITH_TURTLE_LOCK(session, + ret = __wt_turtle_update(session, key, value)); + return (ret); + } if (WT_META_TRACKING(session)) WT_RET(__wt_meta_track_update(session, key)); @@ -219,9 +222,20 @@ __wt_metadata_search( if (__metadata_turtle(key)) return (__wt_turtle_read(session, key, valuep)); + /* + * All metadata reads are at read-uncommitted isolation. 
That's + * because once a schema-level operation completes, subsequent + * operations must see the current version of checkpoint metadata, or + * they may try to read blocks that may have been freed from a file. + * Metadata updates use non-transactional techniques (such as the + * schema and metadata locks) to protect access to in-flight updates. + */ WT_RET(__wt_metadata_cursor(session, NULL, &cursor)); cursor->set_key(cursor, key); - WT_ERR(cursor->search(cursor)); + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = cursor->search(cursor)); + WT_ERR(ret); + WT_ERR(cursor->get_value(cursor, &value)); WT_ERR(__wt_strdup(session, value, valuep)); diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index b223c2fb8fc..bc96a35efc7 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -222,35 +222,6 @@ __meta_track_unroll(WT_SESSION_IMPL *session, WT_META_TRACK *trk) return (ret); } -/* - * __wt_meta_track_find_handle -- - * Check if we have already seen a handle. - */ -int -__wt_meta_track_find_handle( - WT_SESSION_IMPL *session, const char *name, const char *checkpoint) -{ - WT_META_TRACK *trk, *trk_orig; - - WT_ASSERT(session, - WT_META_TRACKING(session) && session->meta_track_nest > 0); - - trk_orig = session->meta_track; - trk = session->meta_track_next; - - while (--trk >= trk_orig) { - if (trk->op != WT_ST_LOCK) - continue; - if (strcmp(trk->dhandle->name, name) == 0 && - ((trk->dhandle->checkpoint == NULL && checkpoint == NULL) || - (trk->dhandle->checkpoint != NULL && - strcmp(trk->dhandle->checkpoint, checkpoint) == 0))) - return (0); - } - - return (WT_NOTFOUND); -} - /* * __wt_meta_track_off -- * Turn off metadata operation tracking, unrolling on error. 
@@ -293,7 +264,8 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) * If we don't have the metadata handle (e.g, we're in the process of * creating the metadata), we can't sync it. */ - if (!need_sync || session->meta_dhandle == NULL) + if (!need_sync || session->meta_dhandle == NULL || + F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) goto done; /* If we're logging, make sure the metadata update was flushed. */ @@ -304,7 +276,8 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) WT_RET(ret); } else { WT_WITH_DHANDLE(session, session->meta_dhandle, - ret = __wt_checkpoint(session, NULL)); + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_COMMITTED, + ret = __wt_checkpoint(session, NULL))); WT_RET(ret); WT_WITH_DHANDLE(session, session->meta_dhandle, ret = __wt_checkpoint_sync(session, NULL)); diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c index 1aa9c953689..13e8b31916f 100644 --- a/src/third_party/wiredtiger/src/meta/meta_turtle.c +++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c @@ -202,7 +202,9 @@ __wt_turtle_init(WT_SESSION_IMPL *session) /* Create the turtle file. */ WT_RET(__metadata_config(session, &metaconf)); - WT_ERR(__wt_turtle_update(session, WT_METAFILE_URI, metaconf)); + WT_WITH_TURTLE_LOCK(session, ret = __wt_turtle_update( + session, WT_METAFILE_URI, metaconf)); + WT_ERR(ret); } /* Remove the backup files, we'll never read them again. 
*/ diff --git a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c index 20a9e8236ac..6280e334afb 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c @@ -49,8 +49,7 @@ __wt_std_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len) #if defined(HAVE_FALLOCATE) WT_DECL_RET; - WT_SYSCALL_RETRY( - fallocate(fh->fd, FALLOC_FL_KEEP_SIZE, offset, len), ret); + WT_SYSCALL_RETRY(fallocate(fh->fd, 0, offset, len), ret); return (ret); #else WT_UNUSED(fh); @@ -76,8 +75,7 @@ __wt_sys_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len) * Linux versions (RHEL 5.5), but not in the version of the C library. * This allows it to work everywhere the kernel supports it. */ - WT_SYSCALL_RETRY(syscall( - SYS_fallocate, fh->fd, FALLOC_FL_KEEP_SIZE, offset, len), ret); + WT_SYSCALL_RETRY(syscall(SYS_fallocate, fh->fd, 0, offset, len), ret); return (ret); #else WT_UNUSED(fh); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 40917bebf56..965f798e820 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -25,7 +25,7 @@ typedef struct { WT_PAGE *page; uint32_t flags; /* Caller's configuration */ - WT_ITEM dsk; /* Temporary disk-image buffer */ + WT_ITEM disk_image; /* Temporary disk-image buffer */ /* * Track start/stop write generation to decide if all changes to the @@ -40,9 +40,7 @@ typedef struct { uint64_t orig_btree_checkpoint_gen; uint64_t orig_txn_checkpoint_gen; - /* - * Track maximum transaction ID seen and first unwritten transaction ID. - */ + /* Track the page's maximum transaction ID. 
*/ uint64_t max_txn; /* @@ -161,7 +159,7 @@ typedef struct { WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t cksum; /* Split's checksum */ - void *dsk; /* Split's disk image */ + void *disk_image; /* Split's disk image */ /* * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and @@ -450,12 +448,16 @@ __wt_reconcile(WT_SESSION_IMPL *session, } /* - * Clean up the boundary structures: some workloads result in millions - * of these structures, and if associated with some random session that - * got roped into doing forced eviction, they won't be discarded for the - * life of the session. + * Clean up reconciliation resources: some workloads have millions of + * boundary structures, and if associated with an application session + * pulled into doing forced eviction, they won't be discarded for the + * life of the session (or until session.reset is called). Discard all + * of the reconciliation resources if an application thread, not doing + * a checkpoint. */ - __rec_bnd_cleanup(session, r, false); + __rec_bnd_cleanup(session, r, + F_ISSET(session, WT_SESSION_INTERNAL) || + WT_SESSION_IS_CHECKPOINT(session) ? false : true); WT_RET(ret); @@ -619,7 +621,6 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) switch (mod->rec_result) { case WT_PM_REC_EMPTY: /* Page is empty */ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ - case WT_PM_REC_REWRITE: /* Rewrite */ return (0); case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ break; @@ -647,6 +648,12 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) WT_INTL_INDEX_GET(session, next, pindex); for (i = 0; i < mod->mod_multi_entries; ++i) { + /* + * There's special error handling required when re-instantiating + * pages in memory; it's not needed here, asserted for safety. 
+ */ + WT_ASSERT(session, mod->mod_multi[i].supd == NULL); + WT_ERR(__wt_multi_to_ref(session, next, &mod->mod_multi[i], &pindex->index[i], NULL)); pindex->index[i]->home = next; @@ -751,7 +758,7 @@ __rec_write_init(WT_SESSION_IMPL *session, r->last = &r->_last; /* Disk buffers need to be aligned for writing. */ - F_SET(&r->dsk, WT_ITEM_ALIGNED); + F_SET(&r->disk_image, WT_ITEM_ALIGNED); } /* Reconciliation is not re-entrant, make sure that doesn't happen. */ @@ -809,6 +816,9 @@ __rec_write_init(WT_SESSION_IMPL *session, } r->flags = flags; + /* Track the page's maximum transaction ID. */ + r->max_txn = WT_TXN_NONE; + /* Track if the page can be marked clean. */ r->leave_dirty = false; @@ -890,7 +900,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) return; *(WT_RECONCILE **)reconcilep = NULL; - __wt_buf_free(session, &r->dsk); + __wt_buf_free(session, &r->disk_image); __wt_free(session, r->raw_entries); __wt_free(session, r->raw_offsets); @@ -945,14 +955,15 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy) * * During some big-page evictions we have seen boundary arrays that have * millions of elements. That should not be a normal event, but if the - * memory is associated with a random session, it won't be discarded - * until the session is closed. If there are more than 10,000 boundary - * structure elements, destroy the boundary array and we'll start over. + * memory is associated with a random application session, it won't be + * discarded until the session is closed or reset. If there are more + * than 10,000 boundary structure elements, discard the boundary array + * entirely and start over next time. 
*/ if (destroy || r->bnd_entries > 10 * 1000) { for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) { __wt_free(session, bnd->addr.addr); - __wt_free(session, bnd->dsk); + __wt_free(session, bnd->disk_image); __wt_free(session, bnd->supd); __wt_buf_free(session, &bnd->key); } @@ -973,7 +984,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy) ++last_used; for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) { __wt_free(session, bnd->addr.addr); - __wt_free(session, bnd->dsk); + __wt_free(session, bnd->disk_image); __wt_free(session, bnd->supd); } } @@ -1436,7 +1447,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, * If there are deleted child pages we can't discard immediately, keep * the page dirty so they are eventually freed. */ - r->leave_dirty = 1; + r->leave_dirty = true; /* * If the original page cannot be freed, we need to keep a slot on the @@ -1631,8 +1642,8 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) * for overflow in diagnostic mode. 
*/ WT_ASSERT(session, r->space_avail >= size); - WT_ASSERT(session, - WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->dsk.memsize)); + WT_ASSERT(session, WT_BLOCK_FITS( + r->first_free, size, r->disk_image.mem, r->disk_image.memsize)); r->entries += v; r->space_avail -= size; @@ -1854,7 +1865,7 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) WT_CLEAR(bnd->addr); bnd->size = 0; bnd->cksum = 0; - __wt_free(session, bnd->dsk); + __wt_free(session, bnd->disk_image); __wt_free(session, bnd->supd); bnd->supd_next = 0; @@ -1967,14 +1978,14 @@ __rec_split_init(WT_SESSION_IMPL *session, */ corrected_page_size = r->page_size; WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_init(session, &r->dsk, corrected_page_size)); + WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size)); /* * Clear the disk page's header and block-manager space, set the page * type (the type doesn't change, and setting it later would require * additional code in a few different places). 
*/ - dsk = r->dsk.mem; + dsk = r->disk_image.mem; memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree)); dsk->type = page->type; @@ -2253,11 +2264,11 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) btree = S2BT(session); bm = btree->bm; - len = WT_PTRDIFF(r->first_free, r->dsk.mem); + len = WT_PTRDIFF(r->first_free, r->disk_image.mem); corrected_page_size = len + add_len; WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size)); - r->first_free = (uint8_t *)r->dsk.mem + len; + WT_RET(__wt_buf_grow(session, &r->disk_image, corrected_page_size)); + r->first_free = (uint8_t *)r->disk_image.mem + len; WT_ASSERT(session, corrected_page_size >= len); r->space_avail = corrected_page_size - len; WT_ASSERT(session, r->space_avail >= add_len); @@ -2278,7 +2289,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) size_t inuse; btree = S2BT(session); - dsk = r->dsk.mem; + dsk = r->disk_image.mem; /* * We should never split during salvage, and we're about to drop core @@ -2410,8 +2421,10 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) /* Finalize the header information and write the page. 
*/ dsk->recno = last->recno; dsk->u.entries = r->entries; - dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); - WT_RET(__rec_split_write(session, r, last, &r->dsk, false)); + dsk->mem_size = + r->disk_image.size = WT_PTRDIFF32(r->first_free, dsk); + WT_RET( + __rec_split_write(session, r, last, &r->disk_image, false)); /* * Set the caller's entry count and buffer information for the @@ -2475,7 +2488,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, unpack = &_unpack; compressor = btree->compressor; dst = &r->raw_destination; - dsk = r->dsk.mem; + dsk = r->disk_image.mem; WT_RET(__rec_split_bnd_grow(session, r)); last = &r->bnd[r->bnd_next]; @@ -2751,7 +2764,7 @@ no_slots: r->first_free = dsk_start + len; r->space_avail += r->raw_offsets[result_slots]; WT_ASSERT(session, r->first_free + r->space_avail <= - (uint8_t *)r->dsk.mem + r->dsk.memsize); + (uint8_t *)r->disk_image.mem + r->disk_image.memsize); /* * Set the key for the next block (before writing the block, a @@ -2788,14 +2801,15 @@ no_slots: WT_STAT_FAST_DATA_INCR(session, compress_raw_fail); dsk->recno = last->recno; - dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); + dsk->mem_size = + r->disk_image.size = WT_PTRDIFF32(r->first_free, dsk); dsk->u.entries = r->entries; r->entries = 0; r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - write_ref = &r->dsk; + write_ref = &r->disk_image; last->already_compressed = false; } else { /* @@ -2823,7 +2837,7 @@ no_slots: last_block && __rec_is_checkpoint(session, r, last)) { if (write_ref == dst) WT_RET(__wt_buf_set( - session, &r->dsk, dst->mem, dst->size)); + session, &r->disk_image, dst->mem, dst->size)); } else WT_RET( __rec_split_write(session, r, last, write_ref, last_block)); @@ -2966,14 +2980,14 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) bnd->entries = r->entries; /* Finalize the header information. 
*/ - dsk = r->dsk.mem; + dsk = r->disk_image.mem; dsk->recno = bnd->recno; dsk->u.entries = r->entries; - dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); + dsk->mem_size = r->disk_image.size = WT_PTRDIFF32(r->first_free, dsk); /* If this is a checkpoint, we're done, otherwise write the page. */ return (__rec_is_checkpoint(session, r, bnd) ? - 0 : __rec_split_write(session, r, bnd, &r->dsk, true)); + 0 : __rec_split_write(session, r, bnd, &r->disk_image, true)); } /* @@ -3023,9 +3037,9 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) * WT_PAGE_HEADER header onto the scratch buffer, most of the header * information remains unchanged between the pages. */ - WT_RET(__wt_scr_alloc(session, r->dsk.memsize, &tmp)); + WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp)); dsk = tmp->mem; - memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE); + memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_SIZE); /* * For each split chunk we've created, update the disk image and copy @@ -3035,7 +3049,8 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) { /* Copy the page contents to the temporary buffer. */ len = (bnd + 1)->offset - bnd->offset; - memcpy(dsk_start, (uint8_t *)r->dsk.mem + bnd->offset, len); + memcpy(dsk_start, + (uint8_t *)r->disk_image.mem + bnd->offset, len); /* Finalize the header information and write the page. */ dsk->recno = bnd->recno; @@ -3060,12 +3075,12 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) * chunk, including header, because if there was room for that large a * remnant, we wouldn't have switched from accumulating to a page end. 
*/ - p = (uint8_t *)r->dsk.mem + bnd->offset; + p = (uint8_t *)r->disk_image.mem + bnd->offset; len = WT_PTRDIFF(r->first_free, p); if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) WT_PANIC_ERR(session, EINVAL, "Reconciliation remnant too large for the split buffer"); - dsk = r->dsk.mem; + dsk = r->disk_image.mem; dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); (void)memmove(dsk_start, p, len); @@ -3208,13 +3223,17 @@ supd_check_complete: } /* - * If using the save/restore eviction path and we had to skip updates in - * order to build this disk image, we can't actually write it. Instead, - * we will re-instantiate the page using the disk image and the list of - * updates we skipped. + * If configured for an in-memory database, or using the save/restore + * eviction path and we had to skip updates in order to build this disk + * image, we can't actually write it. Instead, we will re-instantiate + * the page using the disk image and any list of updates we skipped. */ - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { - r->cache_write_restore = true; + if (F_ISSET(r, WT_EVICT_IN_MEMORY) || + (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL)) { + + /* Statistics tracking that we used update/restore. 
*/ + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) + r->cache_write_restore = true; /* * If the buffer is compressed (raw compression was configured), @@ -3228,10 +3247,10 @@ supd_check_complete: */ if (bnd->already_compressed) WT_ERR(__rec_raw_decompress( - session, buf->data, buf->size, &bnd->dsk)); + session, buf->data, buf->size, &bnd->disk_image)); else { WT_ERR(__wt_strndup( - session, buf->data, buf->size, &bnd->dsk)); + session, buf->data, buf->size, &bnd->disk_image)); WT_ASSERT(session, __wt_verify_dsk_image(session, "[evict split]", buf->data, buf->size, true) == 0); } @@ -3784,8 +3803,6 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) case WT_PM_REC_REPLACE: addr = &child->modify->mod_replace; break; - case WT_PM_REC_REWRITE: - break; WT_ILLEGAL_VALUE_ERR(session); } break; @@ -5281,7 +5298,7 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_free(session, multi->key.ikey); break; } - if (multi->supd == NULL) { + if (multi->disk_image == NULL) { if (multi->addr.reuse) multi->addr.addr = NULL; else { @@ -5291,7 +5308,7 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) } } else { __wt_free(session, multi->supd); - __wt_free(session, multi->supd_dsk); + __wt_free(session, multi->disk_image); } } __wt_free(session, mod->mod_multi); @@ -5318,6 +5335,44 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) return (ret); } +/* + * __rec_split_dump_keys -- + * Dump out the split keys in verbose mode. 
+ */ +static int +__rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r) +{ + WT_BOUNDARY *bnd; + WT_DECL_ITEM(tkey); + WT_DECL_RET; + uint32_t i; + + if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_ROW_LEAF) + WT_RET(__wt_scr_alloc(session, 0, &tkey)); + WT_ERR(__wt_verbose( + session, WT_VERB_SPLIT, "split: %" PRIu32 " pages", r->bnd_next)); + for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i) + switch (page->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_ERR(__wt_buf_set_printable( + session, tkey, bnd->key.data, bnd->key.size)); + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "starting key %.*s", + (int)tkey->size, (const char *)tkey->data)); + break; + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_VAR: + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "starting recno %" PRIu64, bnd->recno)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } +err: __wt_scr_free(session, &tkey); + return (ret); +} + /* * __rec_write_wrapup -- * Finish the reconciliation. @@ -5328,7 +5383,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_BM *bm; WT_BOUNDARY *bnd; WT_BTREE *btree; - WT_MULTI *multi; WT_PAGE_MODIFY *mod; WT_REF *ref; size_t addr_size; @@ -5376,7 +5430,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) case WT_PM_REC_EMPTY: /* Page deleted */ break; case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ - case WT_PM_REC_REWRITE: /* Rewrite */ /* * Discard the multiple replacement blocks. */ @@ -5442,24 +5495,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) bnd = &r->bnd[0]; /* - * If we're saving/restoring changes for this page, there's - * nothing to write. Allocate, then initialize the array of - * replacement blocks. + * If saving/restoring changes for this page and there's only + * one block, there's nothing to write. 
This is an in-memory + * configuration or a special case of forced eviction: set up + * a single block as if to split, then use that disk image to + * rewrite the page in memory. */ - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { - WT_RET(__wt_calloc_def( - session, r->bnd_next, &mod->mod_multi)); - multi = mod->mod_multi; - multi->supd = bnd->supd; - multi->supd_entries = bnd->supd_next; - bnd->supd = NULL; - multi->supd_dsk = bnd->dsk; - bnd->dsk = NULL; - mod->mod_multi_entries = 1; - - mod->rec_result = WT_PM_REC_REWRITE; - break; - } + if (bnd->disk_image != NULL) + goto split; /* * If this is a root page, then we don't have an address and we @@ -5467,7 +5510,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * we were about to write the buffer so we know what to do here. */ if (bnd->addr.addr == NULL) - WT_RET(__wt_bt_write(session, &r->dsk, + WT_RET(__wt_bt_write(session, &r->disk_image, NULL, NULL, true, bnd->already_compressed)); else { mod->mod_replace = bnd->addr; @@ -5495,49 +5538,18 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_ILLEGAL_VALUE(session); } - /* Display the actual split keys. 
*/ - if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT)) { - WT_DECL_ITEM(tkey); - WT_DECL_RET; - uint32_t i; - - if (page->type == WT_PAGE_ROW_INT || - page->type == WT_PAGE_ROW_LEAF) - WT_RET(__wt_scr_alloc(session, 0, &tkey)); - for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i) - switch (page->type) { - case WT_PAGE_ROW_INT: - case WT_PAGE_ROW_LEAF: - WT_ERR(__wt_buf_set_printable( - session, tkey, - bnd->key.data, bnd->key.size)); - WT_ERR(__wt_verbose( - session, WT_VERB_SPLIT, - "split: starting key " - "%.*s", - (int)tkey->size, - (const char *)tkey->data)); - break; - case WT_PAGE_COL_FIX: - case WT_PAGE_COL_INT: - case WT_PAGE_COL_VAR: - WT_ERR(__wt_verbose( - session, WT_VERB_SPLIT, - "split: starting recno %" PRIu64, - bnd->recno)); - break; - WT_ILLEGAL_VALUE_ERR(session); - } -err: __wt_scr_free(session, &tkey); - WT_RET(ret); - } + /* Optionally display the actual split keys in verbose mode. */ + if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT)) + WT_RET(__rec_split_dump_keys(session, page, r)); + + /* Track the largest set of page-splits. */ if (r->bnd_next > r->bnd_next_max) { r->bnd_next_max = r->bnd_next; WT_STAT_FAST_DATA_SET( session, rec_multiblock_max, r->bnd_next_max); } - switch (page->type) { +split: switch (page->type) { case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: WT_RET(__rec_split_row(session, r, page)); @@ -5575,14 +5587,10 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * information (otherwise we might think the backing block is being * reused on a subsequent reconciliation where we want to free it). 
*/ - switch (mod->rec_result) { - case WT_PM_REC_MULTIBLOCK: - case WT_PM_REC_REWRITE: + if (mod->rec_result == WT_PM_REC_MULTIBLOCK) for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) multi->addr.reuse = 0; - break; - } /* * On error, discard blocks we've written, they're unreferenced by the @@ -5641,18 +5649,18 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_RET(__wt_row_ikey_alloc(session, 0, bnd->key.data, bnd->key.size, &multi->key.ikey)); - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { - multi->supd = bnd->supd; - multi->supd_entries = bnd->supd_next; - bnd->supd = NULL; - multi->supd_dsk = bnd->dsk; - bnd->dsk = NULL; - } else { + if (bnd->disk_image == NULL) { multi->addr = bnd->addr; multi->addr.reuse = 0; multi->size = bnd->size; multi->cksum = bnd->cksum; bnd->addr.addr = NULL; + } else { + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->disk_image = bnd->disk_image; + bnd->disk_image = NULL; } } mod->mod_multi_entries = r->bnd_next; @@ -5681,18 +5689,18 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { multi->key.recno = bnd->recno; - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { - multi->supd = bnd->supd; - multi->supd_entries = bnd->supd_next; - bnd->supd = NULL; - multi->supd_dsk = bnd->dsk; - bnd->dsk = NULL; - } else { + if (bnd->disk_image == NULL) { multi->addr = bnd->addr; multi->addr.reuse = 0; multi->size = bnd->size; multi->cksum = bnd->cksum; bnd->addr.addr = NULL; + } else { + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->disk_image = bnd->disk_image; + bnd->disk_image = NULL; } } mod->mod_multi_entries = r->bnd_next; diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c index a86cff4d723..ba8664f2e39 
100644 --- a/src/third_party/wiredtiger/src/schema/schema_open.c +++ b/src/third_party/wiredtiger/src/schema/schema_open.c @@ -260,11 +260,11 @@ err: __wt_scr_free(session, &buf); } /* - * __wt_schema_open_index -- - * Open one or more indices for a table. + * __schema_open_index -- + * Open one or more indices for a table (internal version). */ -int -__wt_schema_open_index(WT_SESSION_IMPL *session, +static int +__schema_open_index(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp) { WT_CURSOR *cursor; @@ -386,6 +386,21 @@ err: __wt_scr_free(session, &tmp); return (ret); } +/* + * __wt_schema_open_index -- + * Open one or more indices for a table. + */ +int +__wt_schema_open_index(WT_SESSION_IMPL *session, + WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp) +{ + WT_DECL_RET; + + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = __schema_open_index(session, table, idxname, len, indexp)); + return (ret); +} + /* * __wt_schema_open_indices -- * Open the indices for a table. @@ -397,11 +412,11 @@ __wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table) } /* - * __wt_schema_open_table -- - * Open a named table. + * __schema_open_table -- + * Open a named table (internal version). */ -int -__wt_schema_open_table(WT_SESSION_IMPL *session, +static int +__schema_open_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, bool ok_incomplete, WT_TABLE **tablep) { WT_CONFIG cparser; @@ -597,3 +612,19 @@ err: __wt_schema_release_table(session, table); WT_RET(ENOENT); WT_RET_MSG(session, ENOENT, "%s not found in table", uri); } + +/* + * __wt_schema_open_table -- + * Open a named table. 
+ */ +int +__wt_schema_open_table(WT_SESSION_IMPL *session, + const char *name, size_t namelen, bool ok_incomplete, WT_TABLE **tablep) +{ + WT_DECL_RET; + + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = __schema_open_table( + session, name, namelen, ok_incomplete, tablep)); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c index 0c2c5fe78c0..d14b81d389f 100644 --- a/src/third_party/wiredtiger/src/schema/schema_stat.c +++ b/src/third_party/wiredtiger/src/schema/schema_stat.c @@ -52,6 +52,63 @@ err: __wt_scr_free(session, &buf); return (ret); } +/* + * __curstat_size_only -- + * For very simple tables we can avoid getting table handles if + * configured to only retrieve the size. It's worthwhile because + * workloads that create and drop a lot of tables can put a lot of + * pressure on the table list lock. + */ +static int +__curstat_size_only(WT_SESSION_IMPL *session, + const char *uri, bool *was_fast,WT_CURSOR_STAT *cst) +{ + WT_CONFIG cparser; + WT_CONFIG_ITEM ckey, colconf, cval; + WT_DECL_RET; + WT_ITEM namebuf; + wt_off_t filesize; + char *tableconf; + + WT_CLEAR(namebuf); + *was_fast = false; + + /* Retrieve the metadata for this table. */ + WT_RET(__wt_metadata_search(session, uri, &tableconf)); + + /* + * The fast path only works if the table consists of a single file + * and does not have any indexes. The absence of named columns is how + * we determine that neither of those conditions can be satisfied. + */ + WT_ERR(__wt_config_getones(session, tableconf, "columns", &colconf)); + WT_ERR(__wt_config_subinit(session, &cparser, &colconf)); + if ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0) + goto err; + + /* Build up the file name from the table URI. */ + WT_ERR(__wt_buf_fmt( + session, &namebuf, "%s.wt", uri + strlen("table:"))); + /* + * Get the size of the underlying file. 
There is nothing stopping a + * race with schema level table operations (for example drop) if there + * is a race there will be an error message generated. + */ + WT_ERR(__wt_filesize_name(session, namebuf.data, &filesize)); + + /* Setup and populate the statistics structure */ + __wt_stat_dsrc_init_single(&cst->u.dsrc_stats); + cst->u.dsrc_stats.block_size = filesize; + __wt_curstat_dsrc_final(cst); + + *was_fast = true; + +err: __wt_free(session, tableconf); + __wt_buf_free(session, &namebuf); + + return (ret); +} + /* * __wt_curstat_table_init -- * Initialize the statistics for a table. @@ -67,6 +124,17 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session, WT_TABLE *table; u_int i; const char *name; + bool was_fast; + + /* + * If only gathering table size statistics, try a fast path that + * avoids the schema and table list locks. + */ + if (F_ISSET(cst, WT_CONN_STAT_SIZE)) { + WT_RET(__curstat_size_only(session, uri, &was_fast, cst)); + if (was_fast) + return (0); + } name = uri + strlen("table:"); WT_RET(__wt_schema_get_table( diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index a766829afad..2045329b8ff 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -33,6 +33,8 @@ __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers) __wt_buf_free(session, &cursor->value); } } + + WT_ASSERT(session, session->ncursors == 0); return (ret); } @@ -58,6 +60,33 @@ __wt_session_copy_values(WT_SESSION_IMPL *session) return (ret); } +/* + * __wt_session_release_resources -- + * Release common session resources. 
+ */ +int +__wt_session_release_resources(WT_SESSION_IMPL *session) +{ + WT_DECL_RET; + + /* Block manager cleanup */ + if (session->block_manager_cleanup != NULL) + WT_TRET(session->block_manager_cleanup(session)); + + /* Reconciliation cleanup */ + if (session->reconcile_cleanup != NULL) + WT_TRET(session->reconcile_cleanup(session)); + + /* + * Discard scratch buffers, error memory; last, just in case a cleanup + * routine uses scratch buffers. + */ + __wt_scr_discard(session); + __wt_buf_free(session, &session->err); + + return (ret); +} + /* * __session_clear -- * Clear a session structure. @@ -132,24 +161,17 @@ __session_close(WT_SESSION *wt_session, const char *config) /* Close all tables. */ WT_TRET(__wt_schema_close_tables(session)); + /* Confirm we're not holding any hazard pointers. */ + __wt_hazard_close(session); + /* Discard metadata tracking. */ __wt_meta_track_discard(session); - /* Discard scratch buffers, error memory. */ - __wt_scr_discard(session); - __wt_buf_free(session, &session->err); - /* Free transaction information. */ __wt_txn_destroy(session); - /* Confirm we're not holding any hazard pointers. */ - __wt_hazard_close(session); - - /* Cleanup */ - if (session->block_manager_cleanup != NULL) - WT_TRET(session->block_manager_cleanup(session)); - if (session->reconcile_cleanup != NULL) - WT_TRET(session->reconcile_cleanup(session)); + /* Release common session resources. */ + WT_TRET(__wt_session_release_resources(session)); /* Destroy the thread's mutex. */ WT_TRET(__wt_cond_destroy(session, &session->cond)); @@ -547,38 +569,12 @@ __session_reset(WT_SESSION *wt_session) WT_TRET(__wt_session_reset_cursors(session, true)); - WT_ASSERT(session, session->ncursors == 0); - - __wt_scr_discard(session); - __wt_buf_free(session, &session->err); + /* Release common session resources. */ + WT_TRET(__wt_session_release_resources(session)); err: API_END_RET_NOTFOUND_MAP(session, ret); } -/* - * __session_compact -- - * WT_SESSION->compact method. 
- */ -static int -__session_compact(WT_SESSION *wt_session, const char *uri, const char *config) -{ - WT_SESSION_IMPL *session; - - session = (WT_SESSION_IMPL *)wt_session; - - /* Disallow objects in the WiredTiger name space. */ - WT_RET(__wt_str_name_check(session, uri)); - - if (!WT_PREFIX_MATCH(uri, "colgroup:") && - !WT_PREFIX_MATCH(uri, "file:") && - !WT_PREFIX_MATCH(uri, "index:") && - !WT_PREFIX_MATCH(uri, "lsm:") && - !WT_PREFIX_MATCH(uri, "table:")) - return (__wt_bad_object_type(session, uri)); - - return (__wt_session_compact(wt_session, uri, config)); -} - /* * __wt_session_drop -- * Internal version of WT_SESSION::drop. @@ -630,6 +626,9 @@ __session_salvage(WT_SESSION *wt_session, const char *uri, const char *config) SESSION_API_CALL(session, salvage, config, cfg); + if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + WT_ERR(ENOTSUP); + /* Block out checkpoints to avoid spurious EBUSY errors. */ WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, ret = @@ -818,6 +817,10 @@ __session_verify(WT_SESSION *wt_session, const char *uri, const char *config) session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, verify, config, cfg); + + if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + WT_ERR(ENOTSUP); + /* Block out checkpoints to avoid spurious EBUSY errors. */ WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, @@ -1036,11 +1039,12 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) session = (WT_SESSION_IMPL *)wt_session; - txn = &session->txn; - WT_STAT_FAST_CONN_INCR(session, txn_checkpoint); SESSION_API_CALL(session, checkpoint, config, cfg); + if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + WT_ERR(ENOTSUP); + /* * Checkpoints require a snapshot to write a transactionally consistent * snapshot of the data. 
@@ -1054,43 +1058,20 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) * from evicting anything newer than this because we track the oldest * transaction ID in the system that is not visible to all readers. */ + txn = &session->txn; if (F_ISSET(txn, WT_TXN_RUNNING)) WT_ERR_MSG(session, EINVAL, "Checkpoint not permitted in a transaction"); - /* - * Reset open cursors. Do this explicitly, even though it will happen - * implicitly in the call to begin_transaction for the checkpoint, the - * checkpoint code will acquire the schema lock before we do that, and - * some implementation of WT_CURSOR::reset might need the schema lock. - */ - WT_ERR(__wt_session_reset_cursors(session, false)); - - /* - * Don't highjack the session checkpoint thread for eviction. - * - * Application threads are not generally available for potentially slow - * operations, but checkpoint does enough I/O it may be called upon to - * perform slow operations for the block manager. - */ - F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); + ret = __wt_txn_checkpoint(session, cfg); /* - * Only one checkpoint can be active at a time, and checkpoints must run - * in the same order as they update the metadata. It's probably a bad - * idea to run checkpoints out of multiple threads, but serialize them - * here to ensure we don't get into trouble. + * Release common session resources (for example, checkpoint may acquire + * significant reconciliation structures/memory). 
*/ - WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1); - - WT_WITH_CHECKPOINT_LOCK(session, - ret = __wt_txn_checkpoint(session, cfg)); + WT_TRET(__wt_session_release_resources(session)); - WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0); - -err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); - - API_END_RET_NOTFOUND_MAP(session, ret); +err: API_END_RET_NOTFOUND_MAP(session, ret); } /* @@ -1160,7 +1141,7 @@ __open_session(WT_CONNECTION_IMPL *conn, __session_strerror, __session_open_cursor, __session_create, - __session_compact, + __wt_session_compact, __session_drop, __session_log_flush, __session_log_printf, diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c index bbd4bbc536c..bd503cd7826 100644 --- a/src/third_party/wiredtiger/src/session/session_compact.c +++ b/src/third_party/wiredtiger/src/session/session_compact.c @@ -146,24 +146,12 @@ __session_compact_check_timeout( static int __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) { - WT_DECL_RET; + struct timespec start_time; WT_DECL_ITEM(t); - WT_SESSION *wt_session; - WT_TXN *txn; + WT_DECL_RET; int i; - struct timespec start_time; - - txn = &session->txn; - wt_session = &session->iface; - - /* - * File compaction requires checkpoints, which will fail in a - * transactional context. Check now so the error message isn't - * confusing. 
- */ - if (session->compact->file_count != 0 && F_ISSET(txn, WT_TXN_RUNNING)) - WT_ERR_MSG(session, EINVAL, - " File compaction not permitted in a transaction"); + const char *checkpoint_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_checkpoint), NULL, NULL }; /* * Force the checkpoint: we don't want to skip it because the work we @@ -171,6 +159,7 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) */ WT_ERR(__wt_scr_alloc(session, 128, &t)); WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri)); + checkpoint_cfg[1] = t->data; WT_ERR(__wt_epoch(session, &start_time)); @@ -182,7 +171,7 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) * time through the loop. */ for (i = 0; i < 100; ++i) { - WT_ERR(wt_session->checkpoint(wt_session, t->data)); + WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); session->compaction = false; WT_WITH_SCHEMA_LOCK(session, @@ -192,8 +181,8 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) if (!session->compaction) break; - WT_ERR(wt_session->checkpoint(wt_session, t->data)); - WT_ERR(wt_session->checkpoint(wt_session, t->data)); + WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); + WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); WT_ERR(__session_compact_check_timeout(session, start_time)); } @@ -212,10 +201,24 @@ __wt_session_compact( WT_CONFIG_ITEM cval; WT_DECL_RET; WT_SESSION_IMPL *session; + WT_TXN *txn; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, compact, config, cfg); + if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + WT_ERR(ENOTSUP); + + /* Disallow objects in the WiredTiger name space. 
*/ + WT_ERR(__wt_str_name_check(session, uri)); + + if (!WT_PREFIX_MATCH(uri, "colgroup:") && + !WT_PREFIX_MATCH(uri, "file:") && + !WT_PREFIX_MATCH(uri, "index:") && + !WT_PREFIX_MATCH(uri, "lsm:") && + !WT_PREFIX_MATCH(uri, "table:")) + WT_ERR(__wt_bad_object_type(session, uri)); + /* Setup the structure in the session handle */ memset(&compact, 0, sizeof(WT_COMPACT)); session->compact = &compact; @@ -231,9 +234,27 @@ __wt_session_compact( if (session->compact->lsm_count != 0) WT_ERR(__wt_schema_worker( session, uri, NULL, __wt_lsm_compact, cfg, 0)); - if (session->compact->file_count != 0) + if (session->compact->file_count != 0) { + /* + * File compaction requires checkpoints, which will fail in a + * transactional context. Check now so the error message isn't + * confusing. + */ + txn = &session->txn; + if (F_ISSET(txn, WT_TXN_RUNNING)) + WT_ERR_MSG(session, EINVAL, + " File compaction not permitted in a transaction"); + WT_ERR(__compact_file(session, uri, cfg)); + } err: session->compact = NULL; + + /* + * Release common session resources (for example, checkpoint may acquire + * significant reconciliation structures/memory). + */ + WT_TRET(__wt_session_release_resources(session)); + API_END_RET_NOTFOUND_MAP(session, ret); } diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c index e506b6848a1..346e9c0ab38 100644 --- a/src/third_party/wiredtiger/src/session/session_dhandle.c +++ b/src/third_party/wiredtiger/src/session/session_dhandle.c @@ -540,14 +540,6 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) WT_ASSERT(session, WT_META_TRACKING(session)); saved_dhandle = session->dhandle; - /* - * If we already have the checkpoint locked, don't attempt to lock - * it again. 
- */ - if ((ret = __wt_meta_track_find_handle( - session, saved_dhandle->name, checkpoint)) != WT_NOTFOUND) - return (ret); - /* * Get the checkpoint handle exclusive, so no one else can access it * while we are creating the new checkpoint. diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index ccd6ce23560..066abc9ed0f 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -344,11 +344,11 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, } /* - * __wt_txn_checkpoint -- + * __txn_checkpoint -- * Checkpoint a database or a list of objects in the database. */ -int -__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) +static int +__txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) { struct timespec start, stop, verb_timer; WT_CONNECTION_IMPL *conn; @@ -630,6 +630,50 @@ err: /* return (ret); } +/* + * __wt_txn_checkpoint -- + * Checkpoint a database or a list of objects in the database. + */ +int +__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_DECL_RET; + + /* + * Reset open cursors. Do this explicitly, even though it will happen + * implicitly in the call to begin_transaction for the checkpoint, the + * checkpoint code will acquire the schema lock before we do that, and + * some implementation of WT_CURSOR::reset might need the schema lock. + */ + WT_RET(__wt_session_reset_cursors(session, false)); + + /* + * Don't highjack the session checkpoint thread for eviction. + * + * Application threads are not generally available for potentially slow + * operations, but checkpoint does enough I/O it may be called upon to + * perform slow operations for the block manager. + */ + F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); + + /* + * Only one checkpoint can be active at a time, and checkpoints must run + * in the same order as they update the metadata. 
It's probably a bad + * idea to run checkpoints out of multiple threads, but as compaction + * calls checkpoint directly, it can be tough to avoid. Serialize here + * to ensure we don't get into trouble. + */ + WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1); + + WT_WITH_CHECKPOINT_LOCK(session, ret = __txn_checkpoint(session, cfg)); + + WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0); + + F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); + + return (ret); +} + /* * __drop -- * Drop all checkpoints with a specific name. @@ -726,8 +770,8 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len) * Checkpoint a tree. */ static int -__checkpoint_worker( - WT_SESSION_IMPL *session, const char *cfg[], bool is_checkpoint) +__checkpoint_worker(WT_SESSION_IMPL *session, + const char *cfg[], bool is_checkpoint, bool need_tracking) { WT_BM *bm; WT_BTREE *btree; @@ -752,6 +796,22 @@ __checkpoint_worker( fake_ckpt = hot_backup_locked = false; name_alloc = NULL; + /* + * Only referenced in diagnostic builds and gcc 5.1 isn't satisfied + * with wrapping the entire assert condition in the unused macro. + */ + WT_UNUSED(need_tracking); + + /* + * Most callers need meta tracking to be on here, otherwise it is + * possible for this checkpoint to cleanup handles that are still in + * use. The exceptions are: + * - Checkpointing the metadata handle itself. + * - On connection close when we know there can't be any races. + */ + WT_ASSERT(session, !need_tracking || + WT_IS_METADATA(dhandle) || WT_META_TRACKING(session)); + /* * Set the checkpoint LSN to the maximum LSN so that if logging is * disabled, recovery will never roll old changes forward over the @@ -1128,7 +1188,7 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) /* Should be holding the schema lock. 
*/ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); - return (__checkpoint_worker(session, cfg, true)); + return (__checkpoint_worker(session, cfg, true, true)); } /* @@ -1208,7 +1268,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) if (need_tracking) WT_RET(__wt_meta_track_on(session)); - WT_TRET(__checkpoint_worker(session, NULL, false)); + WT_TRET(__checkpoint_worker(session, NULL, false, need_tracking)); if (need_tracking) WT_RET(__wt_meta_track_off(session, true, ret != 0)); diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 63d86969311..d0b3b909f09 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -421,8 +421,8 @@ __wt_txn_recover(WT_SESSION_IMPL *session) was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP); /* We need a real session for recovery. */ - WT_RET(__wt_open_session(conn, NULL, NULL, true, &session)); - F_SET(session, WT_SESSION_NO_LOGGING); + WT_RET(__wt_open_internal_session(conn, "txn-recover", + false, WT_SESSION_NO_LOGGING, &session)); r.session = session; WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); -- cgit v1.2.1