summary | refs | log | tree | commit | diff
path: root/src/third_party
diff options
context:
space:
mode:
author: Michael Cahill <michael.cahill@mongodb.com> 2015-11-02 11:55:14 +1100
committer: Michael Cahill <michael.cahill@mongodb.com> 2015-11-02 11:55:14 +1100
commit: fb6ebe75207c3221314ed318595489a838ef1db0 (patch)
tree: 6b9b210b15f9b9685b9a5dd707001297127ee1d3 /src/third_party
parent: 4fbfa13ec0f819080a35ed8b528a030797e483a6 (diff)
download: mongo-fb6ebe75207c3221314ed318595489a838ef1db0.tar.gz
Import wiredtiger-wiredtiger-mongodb-3.2.0-rc1-194-g0dc3f20.tar.gz from wiredtiger branch mongodb-3.2
Diffstat (limited to 'src/third_party')
-rw-r--r-- src/third_party/wiredtiger/bench/wtperf/wtperf.c | 2
-rw-r--r-- src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i | 2
-rw-r--r-- src/third_party/wiredtiger/dist/api_data.py | 80
-rw-r--r-- src/third_party/wiredtiger/dist/api_err.py | 5
-rw-r--r-- src/third_party/wiredtiger/dist/flags.py | 6
-rw-r--r-- src/third_party/wiredtiger/dist/s_define.list | 1
-rw-r--r-- src/third_party/wiredtiger/dist/s_string.ok | 1
-rw-r--r-- src/third_party/wiredtiger/src/block/block_ckpt.c | 20
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_cursor.c | 10
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_debug.c | 51
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_delete.c | 10
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_discard.c | 54
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_io.c | 3
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_read.c | 6
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c | 239
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_walk.c | 14
-rw-r--r-- src/third_party/wiredtiger/src/btree/row_srch.c | 49
-rw-r--r-- src/third_party/wiredtiger/src/config/config_def.c | 14
-rw-r--r-- src/third_party/wiredtiger/src/conn/api_strerror.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_api.c | 18
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_ckpt.c | 11
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_dhandle.c | 7
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_handle.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_log.c | 32
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_sweep.c | 9
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_ds.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_file.c | 44
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_metadata.c | 21
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_table.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_lru.c | 114
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_page.c | 124
-rw-r--r-- src/third_party/wiredtiger/src/include/api.h | 9
-rw-r--r-- src/third_party/wiredtiger/src/include/block.h | 3
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h | 23
-rw-r--r-- src/third_party/wiredtiger/src/include/btree.i | 12
-rw-r--r-- src/third_party/wiredtiger/src/include/btree_cmp.i | 55
-rw-r--r-- src/third_party/wiredtiger/src/include/cache.i | 20
-rw-r--r-- src/third_party/wiredtiger/src/include/connection.h | 1
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h | 11
-rw-r--r-- src/third_party/wiredtiger/src/include/flags.h | 66
-rw-r--r-- src/third_party/wiredtiger/src/include/log.h | 5
-rw-r--r-- src/third_party/wiredtiger/src/include/lsm.h | 1
-rw-r--r-- src/third_party/wiredtiger/src/include/meta.h | 10
-rw-r--r-- src/third_party/wiredtiger/src/include/misc.h | 4
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.h | 32
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.i | 61
-rw-r--r-- src/third_party/wiredtiger/src/include/wiredtiger.in | 9
-rw-r--r-- src/third_party/wiredtiger/src/log/log.c | 119
-rw-r--r-- src/third_party/wiredtiger/src/log/log_slot.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_cursor.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_work_unit.c | 64
-rw-r--r-- src/third_party/wiredtiger/src/meta/meta_table.c | 20
-rw-r--r-- src/third_party/wiredtiger/src/meta/meta_track.c | 35
-rw-r--r-- src/third_party/wiredtiger/src/meta/meta_turtle.c | 4
-rw-r--r-- src/third_party/wiredtiger/src/os_posix/os_fallocate.c | 6
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c | 270
-rw-r--r-- src/third_party/wiredtiger/src/schema/schema_open.c | 47
-rw-r--r-- src/third_party/wiredtiger/src/schema/schema_stat.c | 68
-rw-r--r-- src/third_party/wiredtiger/src/session/session_api.c | 125
-rw-r--r-- src/third_party/wiredtiger/src/session/session_compact.c | 61
-rw-r--r-- src/third_party/wiredtiger/src/session/session_dhandle.c | 8
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_ckpt.c | 74
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_recover.c | 4
63 files changed, 1417 insertions, 769 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
index 20c30e10482..44aff59963c 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
@@ -1455,6 +1455,8 @@ close_reopen(CONFIG *cfg)
{
int ret;
+ if (!cfg->reopen_connection)
+ return (0);
/*
* Reopen the connection. We do this so that the workload phase always
* starts with the on-disk files, and so that read-only workloads can
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
index 7e29aa0f3c2..be3ba462e0c 100644
--- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
@@ -134,6 +134,8 @@ DEF_OPT_AS_UINT32(random_range, 0,
"if non zero choose a value from within this range as the key for "
"insert operations")
DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value")
+DEF_OPT_AS_BOOL(reopen_connection, 1,
+ "close and reopen the connection between populate and workload phases")
DEF_OPT_AS_UINT32(report_interval, 2,
"output throughput information every interval seconds, 0 to disable")
DEF_OPT_AS_UINT32(run_ops, 0,
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 6fd7dcd0093..99e08282e49 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -582,7 +582,7 @@ session_config = [
choices=['read-uncommitted', 'read-committed', 'snapshot']),
]
-common_wiredtiger_open = [
+wiredtiger_open_common = connection_runtime_config + [
Config('buffer_alignment', '-1', r'''
in-memory alignment (in bytes) for buffers used for I/O. The
default value of -1 indicates a platform-specific alignment value
@@ -676,6 +676,30 @@ common_wiredtiger_open = [
]),
]
+wiredtiger_open = wiredtiger_open_common + [
+ Config('config_base', 'true', r'''
+ write the base configuration file if creating the database. If
+ \c false in the config passed directly to ::wiredtiger_open, will
+ ignore any existing base configuration file in addition to not creating
+ one. See @ref config_base for more information''',
+ type='boolean'),
+ Config('create', 'false', r'''
+ create the database if it does not exist''',
+ type='boolean'),
+ Config('exclusive', 'false', r'''
+ fail if the database already exists, generally used with the
+ \c create option''',
+ type='boolean'),
+ Config('in_memory', 'false', r'''
+ keep data in-memory only, minimize disk I/O''',
+ type='boolean', undoc=True),
+ Config('use_environment_priv', 'false', r'''
+ use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment
+ variables regardless of whether or not the process is running
+ with special privileges. See @ref home for more information''',
+ type='boolean'),
+]
+
cursor_runtime_config = [
Config('append', 'false', r'''
append the value as a new record, creating a new record
@@ -1003,59 +1027,13 @@ methods = {
# creation-specific configuration strings).
# wiredtiger_open_all:
# All of the above configuration values combined
-'wiredtiger_open' : Method(
- connection_runtime_config +
- common_wiredtiger_open + [
- Config('config_base', 'true', r'''
- write the base configuration file if creating the database. If
- \c false in the config passed directly to ::wiredtiger_open, will
- ignore any existing base configuration file in addition to not creating
- one. See @ref config_base for more information''',
- type='boolean'),
- Config('create', 'false', r'''
- create the database if it does not exist''',
- type='boolean'),
- Config('exclusive', 'false', r'''
- fail if the database already exists, generally used with the
- \c create option''',
- type='boolean'),
- Config('use_environment_priv', 'false', r'''
- use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment
- variables regardless of whether or not the process is running
- with special privileges. See @ref home for more information''',
- type='boolean'),
-]),
-'wiredtiger_open_basecfg' : Method(
- connection_runtime_config +
- common_wiredtiger_open + [
+'wiredtiger_open' : Method(wiredtiger_open),
+'wiredtiger_open_basecfg' : Method(wiredtiger_open_common + [
Config('version', '(major=0,minor=0)', r'''
the file version'''),
]),
-'wiredtiger_open_usercfg' : Method(
- connection_runtime_config +
- common_wiredtiger_open
-),
-'wiredtiger_open_all' : Method(
- connection_runtime_config +
- common_wiredtiger_open + [
- Config('config_base', 'true', r'''
- write the base configuration file if creating the database. If
- \c false in the config passed directly to ::wiredtiger_open, will
- ignore any existing base configuration file in addition to not creating
- one. See @ref config_base for more information''',
- type='boolean'),
- Config('create', 'false', r'''
- create the database if it does not exist''',
- type='boolean'),
- Config('exclusive', 'false', r'''
- fail if the database already exists, generally used with the
- \c create option''',
- type='boolean'),
- Config('use_environment_priv', 'false', r'''
- use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment
- variables regardless of whether or not the process is running
- with special privileges. See @ref home for more information''',
- type='boolean'),
+'wiredtiger_open_usercfg' : Method(wiredtiger_open_common),
+'wiredtiger_open_all' : Method(wiredtiger_open + [
Config('version', '(major=0,minor=0)', r'''
the file version'''),
]),
diff --git a/src/third_party/wiredtiger/dist/api_err.py b/src/third_party/wiredtiger/dist/api_err.py
index 936c7bb11a7..09332d508a2 100644
--- a/src/third_party/wiredtiger/dist/api_err.py
+++ b/src/third_party/wiredtiger/dist/api_err.py
@@ -51,6 +51,11 @@ errors = [
'recovery must be run to continue', '''
This error is generated when wiredtiger_open is configured
to return an error if recovery is required to use the database.'''),
+ Error('WT_CACHE_FULL', -31807,
+ 'operation would overflow cache', '''
+ This error is generated when wiredtiger_open is configured
+ to run in-memory, and an insert or update operation requires more
+ than the configured cache size to complete.''', undoc=True),
]
# Update the #defines in the wiredtiger.in file.
diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py
index 65b68cf4277..da677c17389 100644
--- a/src/third_party/wiredtiger/dist/flags.py
+++ b/src/third_party/wiredtiger/dist/flags.py
@@ -36,6 +36,7 @@ flags = {
'page_read' : [
'READ_CACHE',
'READ_COMPACT',
+ 'READ_NO_EMPTY',
'READ_NO_EVICT',
'READ_NO_GEN',
'READ_NO_WAIT',
@@ -45,9 +46,10 @@ flags = {
'READ_WONT_NEED',
],
'rec_write' : [
+ 'EVICT_IN_MEMORY',
'EVICT_LOOKASIDE',
- 'EVICTING',
'EVICT_UPDATE_RESTORE',
+ 'EVICTING',
'VISIBILITY_ERR',
],
'txn_log_checkpoint' : [
@@ -92,6 +94,7 @@ flags = {
'CONN_CKPT_SYNC',
'CONN_CLOSING',
'CONN_EVICTION_RUN',
+ 'CONN_IN_MEMORY',
'CONN_LAS_OPEN',
'CONN_LEAK_MEMORY',
'CONN_LOG_SERVER_RUN',
@@ -114,6 +117,7 @@ flags = {
'SESSION_LOCKED_SCHEMA',
'SESSION_LOCKED_SLOT',
'SESSION_LOCKED_TABLE',
+ 'SESSION_LOCKED_TURTLE',
'SESSION_LOGGING_INMEM',
'SESSION_LOOKASIDE_CURSOR',
'SESSION_NO_CACHE',
diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list
index dce284dae44..d204a11835b 100644
--- a/src/third_party/wiredtiger/dist/s_define.list
+++ b/src/third_party/wiredtiger/dist/s_define.list
@@ -35,6 +35,7 @@ WT_PACKED_STRUCT_END
WT_READ_BARRIER
WT_REF_SIZE
WT_SESSION_LOCKED_CHECKPOINT
+WT_SESSION_LOCKED_TURTLE
WT_STAT_DECR
WT_STAT_DECRV
WT_STAT_FAST_CONN_DECRV
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index d234a3c101f..26c0a905b82 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -59,6 +59,7 @@ CSV
CURSORs
CURSTD
CallsCustDate
+Checkpointing
Checksum
Checksums
CityHash
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
index 9b42a072d73..2c8ff89a5cf 100644
--- a/src/third_party/wiredtiger/src/block/block_ckpt.c
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -83,11 +83,16 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
} else {
/*
- * We depend on the btree level for locking: things will go
- * bad fast should we open the live system in two handles, or
- * if we create, salvage, truncate or verify the live/running
- * file, for that matter.
+ * We depend on the btree level for locking: things will go bad
+ * fast if we open the live system in two handles, or salvage,
+ * truncate or verify the live/running file.
*/
+#ifdef HAVE_DIAGNOSTIC
+ __wt_spin_lock(session, &block->live_lock);
+ WT_ASSERT(session, block->live_open == false);
+ block->live_open = true;
+ __wt_spin_unlock(session, &block->live_lock);
+#endif
ci = &block->live;
WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
}
@@ -178,8 +183,8 @@ __wt_block_checkpoint_unload(
/*
* If it's the live system, truncate to discard any extended blocks and
* discard the active extent lists. Hold the lock even though we're
- * unloading the live checkpoint, there could be readers active in
- * other checkpoints.
+ * unloading the live checkpoint, there could be readers active in other
+ * checkpoints.
*/
if (!checkpoint) {
/*
@@ -191,6 +196,9 @@ __wt_block_checkpoint_unload(
__wt_spin_lock(session, &block->live_lock);
__wt_block_ckpt_destroy(session, &block->live);
+#ifdef HAVE_DIAGNOSTIC
+ block->live_open = false;
+#endif
__wt_spin_unlock(session, &block->live_lock);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 037648696b3..3290fd6374c 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -306,9 +306,6 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
WT_STAT_FAST_CONN_INCR(session, cursor_search);
WT_STAT_FAST_DATA_INCR(session, cursor_search);
- if (btree->type == BTREE_ROW)
- WT_RET(__cursor_size_chk(session, &cursor->key));
-
/*
* If we have a page pinned, search it; if we don't have a page pinned,
* or the search of the pinned page doesn't find an exact match, search
@@ -376,9 +373,6 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
WT_STAT_FAST_DATA_INCR(session, cursor_search_near);
- if (btree->type == BTREE_ROW)
- WT_RET(__cursor_size_chk(session, &cursor->key));
-
/*
* If we have a row-store page pinned, search it; if we don't have a
* page pinned, or the search of the pinned page doesn't find an exact
@@ -449,6 +443,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
} else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND)
exact = 1;
else {
+ WT_ERR(__cursor_func_init(cbt, true));
WT_ERR(btree->type == BTREE_ROW ?
__cursor_row_search(session, cbt, NULL, true) :
__cursor_col_search(session, cbt, NULL));
@@ -659,9 +654,6 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt)
WT_STAT_FAST_DATA_INCR(session, cursor_remove);
WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
- if (btree->type == BTREE_ROW)
- WT_RET(__cursor_size_chk(session, &cursor->key));
-
retry: WT_RET(__cursor_func_init(cbt, true));
switch (btree->type) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 15ae93522a7..8edc40794e2 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -45,7 +45,8 @@ static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
static int __debug_ref(WT_DBG *, WT_REF *);
static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
-static int __debug_tree(WT_SESSION_IMPL *, WT_PAGE *, const char *, uint32_t);
+static int __debug_tree(
+ WT_SESSION_IMPL *, WT_BTREE *, WT_PAGE *, const char *, uint32_t);
static void __debug_update(WT_DBG *, WT_UPDATE *, bool);
static void __dmsg(WT_DBG *, const char *, ...)
WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3)));
@@ -224,6 +225,8 @@ __wt_debug_addr(WT_SESSION_IMPL *session,
WT_DECL_ITEM(buf);
WT_DECL_RET;
+ WT_ASSERT(session, S2BT_SAFE(session) != NULL);
+
bm = S2BT(session)->bm;
WT_RET(__wt_scr_alloc(session, 1024, &buf));
@@ -245,6 +248,8 @@ __wt_debug_offset_blind(
WT_DECL_ITEM(buf);
WT_DECL_RET;
+ WT_ASSERT(session, S2BT_SAFE(session) != NULL);
+
/*
* This routine depends on the default block manager's view of files,
* where an address consists of a file offset, length, and checksum.
@@ -274,6 +279,8 @@ __wt_debug_offset(WT_SESSION_IMPL *session,
WT_DECL_RET;
uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE], *endp;
+ WT_ASSERT(session, S2BT_SAFE(session) != NULL);
+
/*
* This routine depends on the default block manager's view of files,
* where an address consists of a file offset, length, and checksum.
@@ -377,6 +384,8 @@ __debug_dsk_col_fix(WT_DBG *ds, const WT_PAGE_HEADER *dsk)
uint32_t i;
uint8_t v;
+ WT_ASSERT(ds->session, S2BT_SAFE(ds->session) != NULL);
+
btree = S2BT(ds->session);
WT_FIX_FOREACH(btree, dsk, v, i) {
@@ -398,6 +407,8 @@ __debug_dsk_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk)
WT_CELL_UNPACK *unpack, _unpack;
uint32_t i;
+ WT_ASSERT(ds->session, S2BT_SAFE(ds->session) != NULL);
+
btree = S2BT(ds->session);
unpack = &_unpack;
@@ -465,6 +476,8 @@ __wt_debug_tree_shape(
{
WT_DBG *ds, _ds;
+ WT_ASSERT(session, S2BT_SAFE(session) != NULL);
+
ds = &_ds;
WT_RET(__debug_config(session, ds, ofile));
@@ -484,22 +497,30 @@ __wt_debug_tree_shape(
/*
* __wt_debug_tree_all --
* Dump the in-memory information for a tree, including leaf pages.
+ * Takes an explicit btree as an argument, as one may not yet be set on
+ * the session. This is often the case as this function will be called
+ * from within a debugger, which makes setting a btree complicated.
*/
int
-__wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+__wt_debug_tree_all(
+ WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile)
{
- return (__debug_tree(
- session, page, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK));
+ return (__debug_tree(session,
+ btree, page, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK));
}
/*
* __wt_debug_tree --
* Dump the in-memory information for a tree, not including leaf pages.
+ * Takes an explicit btree as an argument, as one may not yet be set on
+ * the session. This is often the case as this function will be called
+ * from within a debugger, which makes setting a btree complicated.
*/
int
-__wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+__wt_debug_tree(
+ WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile)
{
- return (__debug_tree(session, page, ofile, WT_DEBUG_TREE_WALK));
+ return (__debug_tree(session, btree, page, ofile, WT_DEBUG_TREE_WALK));
}
/*
@@ -512,6 +533,8 @@ __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
WT_DBG *ds, _ds;
WT_DECL_RET;
+ WT_ASSERT(session, S2BT_SAFE(session) != NULL);
+
ds = &_ds;
WT_RET(__debug_config(session, ds, ofile));
@@ -524,11 +547,16 @@ __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
/*
* __debug_tree --
- * Dump the in-memory information for a tree.
+ * Dump the in-memory information for a tree. Takes an explicit btree
+ * as an argument, as one may not be set on the session. This is often
+ * the case as this function will be called from within a debugger, which
+ * makes setting a btree complicated. We set the btree on the session
+ * in this function.
*/
static int
__debug_tree(
- WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile, uint32_t flags)
+ WT_SESSION_IMPL *session, WT_BTREE *btree,
+ WT_PAGE *page, const char *ofile, uint32_t flags)
{
WT_DBG *ds, _ds;
WT_DECL_RET;
@@ -540,7 +568,7 @@ __debug_tree(
if (page == NULL)
page = S2BT(session)->root.page;
- ret = __debug_page(ds, page, flags);
+ WT_WITH_BTREE(session, btree, ret = __debug_page(ds, page, flags));
__dmsg_wrapup(ds);
@@ -664,9 +692,6 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
case WT_PM_REC_REPLACE:
__dmsg(ds, ", replaced");
break;
- case WT_PM_REC_REWRITE:
- __dmsg(ds, ", rewrite");
- break;
case 0:
break;
WT_ILLEGAL_VALUE(session);
@@ -693,6 +718,8 @@ __debug_page_col_fix(WT_DBG *ds, WT_PAGE *page)
uint32_t i;
uint8_t v;
+ WT_ASSERT(ds->session, S2BT_SAFE(ds->session) != NULL);
+
session = ds->session;
btree = S2BT(session);
dsk = page->dsk;
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index 23429121e98..757b7b51cdd 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -214,10 +214,11 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* __wt_delete_page_skip --
- * If iterating a cursor, skip deleted pages that are visible to us.
+ * If iterating a cursor, skip deleted pages that are either visible to
+ * us or globally visible.
*/
bool
-__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
+__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
{
bool skip;
@@ -245,8 +246,9 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
return (false);
- skip = ref->page_del == NULL ||
- __wt_txn_visible(session, ref->page_del->txnid);
+ skip = ref->page_del == NULL || (visible_all ?
+ __wt_txn_visible_all(session, ref->page_del->txnid) :
+ __wt_txn_visible(session, ref->page_del->txnid));
WT_PUBLISH(ref->state, WT_REF_DELETED);
return (skip);
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index 998667e3e1f..67e70d0cdb9 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -12,9 +12,10 @@ static void __free_page_modify(WT_SESSION_IMPL *, WT_PAGE *);
static void __free_page_col_var(WT_SESSION_IMPL *, WT_PAGE *);
static void __free_page_int(WT_SESSION_IMPL *, WT_PAGE *);
static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
-static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t);
-static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *);
-static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t);
+static void __free_skip_array(
+ WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t, bool);
+static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *, bool);
+static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t, bool);
/*
* __wt_ref_out --
@@ -144,12 +145,15 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
uint32_t i;
+ bool update_ignore;
mod = page->modify;
+ /* In some failed-split cases, we can't discard updates. */
+ update_ignore = F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE);
+
switch (mod->rec_result) {
case WT_PM_REC_MULTIBLOCK:
- case WT_PM_REC_REWRITE:
/* Free list of replacement blocks. */
for (multi = mod->mod_multi,
i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
@@ -160,7 +164,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
break;
}
__wt_free(session, multi->supd);
- __wt_free(session, multi->supd_dsk);
+ __wt_free(session, multi->disk_image);
__wt_free(session, multi->addr.addr);
}
__wt_free(session, mod->mod_multi);
@@ -179,7 +183,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
case WT_PAGE_COL_VAR:
/* Free the append array. */
if ((append = WT_COL_APPEND(page)) != NULL) {
- __free_skip_list(session, WT_SKIP_FIRST(append));
+ __free_skip_list(
+ session, WT_SKIP_FIRST(append), update_ignore);
__wt_free(session, append);
__wt_free(session, mod->mod_append);
}
@@ -188,7 +193,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
if (mod->mod_update != NULL)
__free_skip_array(session, mod->mod_update,
page->type ==
- WT_PAGE_COL_FIX ? 1 : page->pg_var_entries);
+ WT_PAGE_COL_FIX ? 1 : page->pg_var_entries,
+ update_ignore);
break;
}
@@ -302,6 +308,10 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_ROW *rip;
uint32_t i;
void *copy;
+ bool update_ignore;
+
+ /* In some failed-split cases, we can't discard updates. */
+ update_ignore = F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE);
/*
* Free the in-memory index array.
@@ -326,12 +336,13 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
* found on the original page).
*/
if (page->pg_row_ins != NULL)
- __free_skip_array(
- session, page->pg_row_ins, page->pg_row_entries + 1);
+ __free_skip_array(session,
+ page->pg_row_ins, page->pg_row_entries + 1, update_ignore);
/* Free the update array. */
if (page->pg_row_upd != NULL)
- __free_update(session, page->pg_row_upd, page->pg_row_entries);
+ __free_update(session,
+ page->pg_row_upd, page->pg_row_entries, update_ignore);
}
/*
@@ -339,8 +350,8 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
* Discard an array of skip list headers.
*/
static void
-__free_skip_array(
- WT_SESSION_IMPL *session, WT_INSERT_HEAD **head_arg, uint32_t entries)
+__free_skip_array(WT_SESSION_IMPL *session,
+ WT_INSERT_HEAD **head_arg, uint32_t entries, bool update_ignore)
{
WT_INSERT_HEAD **head;
@@ -350,7 +361,8 @@ __free_skip_array(
*/
for (head = head_arg; entries > 0; --entries, ++head)
if (*head != NULL) {
- __free_skip_list(session, WT_SKIP_FIRST(*head));
+ __free_skip_list(
+ session, WT_SKIP_FIRST(*head), update_ignore);
__wt_free(session, *head);
}
@@ -364,12 +376,13 @@ __free_skip_array(
* of a WT_INSERT structure and its associated chain of WT_UPDATE structures.
*/
static void
-__free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins)
+__free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins, bool update_ignore)
{
WT_INSERT *next;
for (; ins != NULL; ins = next) {
- __wt_free_update_list(session, ins->upd);
+ if (!update_ignore)
+ __wt_free_update_list(session, ins->upd);
next = WT_SKIP_NEXT(ins);
__wt_free(session, ins);
}
@@ -380,8 +393,8 @@ __free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins)
* Discard the update array.
*/
static void
-__free_update(
- WT_SESSION_IMPL *session, WT_UPDATE **update_head, uint32_t entries)
+__free_update(WT_SESSION_IMPL *session,
+ WT_UPDATE **update_head, uint32_t entries, bool update_ignore)
{
WT_UPDATE **updp;
@@ -389,9 +402,10 @@ __free_update(
* For each non-NULL slot in the page's array of updates, free the
* linked list anchored in that slot.
*/
- for (updp = update_head; entries > 0; --entries, ++updp)
- if (*updp != NULL)
- __wt_free_update_list(session, *updp);
+ if (!update_ignore)
+ for (updp = update_head; entries > 0; --entries, ++updp)
+ if (*updp != NULL)
+ __wt_free_update_list(session, *updp);
/* Free the update array. */
__wt_free(session, update_head);
diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c
index e46e4a55696..6481f514323 100644
--- a/src/third_party/wiredtiger/src/btree/bt_io.c
+++ b/src/third_party/wiredtiger/src/btree/bt_io.c
@@ -192,6 +192,9 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
(!checkpoint && addr != NULL && addr_sizep != NULL) ||
(checkpoint && addr == NULL && addr_sizep == NULL));
+ /* In-memory databases shouldn't write pages. */
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_IN_MEMORY));
+
#ifdef HAVE_DIAGNOSTIC
/*
* We're passed a table's disk image. Decompress if necessary and
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index d9cdfc78c75..e60f7b3fb02 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -448,8 +448,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
for (oldgen = stalled = false,
force_attempts = 0, sleep_cnt = wait_cnt = 0;;) {
switch (ref->state) {
- case WT_REF_DISK:
case WT_REF_DELETED:
+ if (LF_ISSET(WT_READ_NO_EMPTY) &&
+ __wt_delete_page_skip(session, ref, false))
+ return (WT_NOTFOUND);
+ /* FALLTHROUGH */
+ case WT_REF_DISK:
if (LF_ISSET(WT_READ_CACHE))
return (WT_NOTFOUND);
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index adda9145ee4..2145d6ac014 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -211,7 +211,8 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
* splitting into parent pages can become large enough to result
* in slow operations.
*/
- if (pindex->entries > btree->split_deepen_min_child)
+ if (!__wt_ref_is_root(ref) &&
+ pindex->entries > btree->split_deepen_min_child)
return (true);
return (false);
@@ -405,7 +406,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
uint64_t split_gen;
uint32_t children, chunk, i, j, moved_entries, new_entries, remain;
uint32_t skip_leading, slots;
- bool panic;
+ bool complete;
void *p;
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
@@ -414,7 +415,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
btree = S2BT(session);
alloc_index = NULL;
parent_incr = parent_decr = 0;
- panic = false;
+ complete = false;
/*
* Our caller is holding the parent page locked to single-thread splits,
@@ -552,28 +553,28 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
}
WT_ASSERT(session,
alloc_refp - alloc_index->index ==
- alloc_index->entries - skip_trailing);
- WT_ASSERT(session,
- parent_refp - pindex->index == pindex->entries - skip_trailing);
+ (ptrdiff_t)(alloc_index->entries - skip_trailing));
+ WT_ASSERT(session, parent_refp - pindex->index ==
+ (ptrdiff_t)(pindex->entries - skip_trailing));
/*
* Confirm the parent page's index hasn't moved, then update it, which
* makes the split visible to threads descending the tree. From this
- * point on, we're committed to the split. If subsequent work fails,
- * we have to panic because we may have threads of control using the
- * new page index we swap in.
+ * point on, we're committed to the split.
*
* A note on error handling: until this point, there's no problem with
* unwinding on error. We allocated a new page index, a new set of
* WT_REFs and a new set of child pages -- if an error occurred, the
* parent remained unchanged, although it may have an incorrect memory
* footprint. From now on we've modified the parent page, attention
- * needs to be paid.
+ * needs to be paid. However, subsequent failures are relatively benign,
+ * the split is OK and complete. For that reason, we ignore errors past
+ * this point unless there's a panic.
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
WT_INTL_INDEX_SET(parent, alloc_index);
split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- panic = true;
+ complete = true;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
@@ -657,7 +658,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
* be using the new index.
*/
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_ERR(__split_safe_free(session, split_gen, 0, pindex, size));
+ WT_TRET(__split_safe_free(session, split_gen, 0, pindex, size));
parent_decr += size;
/*
@@ -666,25 +667,29 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
__wt_cache_page_inmem_incr(session, parent, parent_incr);
__wt_cache_page_inmem_decr(session, parent, parent_decr);
- if (0) {
-err: __wt_free_ref_index(session, parent, alloc_index, true);
+err: /*
+ * If complete is true, we saw an error after opening up the tree to
+ * descent through the parent page's new index. There is nothing we
+ * can do, there are threads potentially active in both versions of
+ * the tree.
+ *
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened, and our caller has to proceed
+ * with the split.
+ */
+ if (!complete)
+ __wt_free_ref_index(session, parent, alloc_index, true);
- /*
- * If panic is set, we saw an error after opening up the tree
- * to descent through the parent page's new index. There is
- * nothing we can do, the tree is inconsistent and there are
- * threads potentially active in both versions of the tree.
- */
- if (panic)
- ret = __wt_panic(session);
- }
- return (ret);
+ if (ret != 0 && ret != WT_PANIC)
+ __wt_err(session, ret,
+ "ignoring not-fatal error during parent page split to "
+ "deepen the tree");
+ return (ret == WT_PANIC || !complete ? ret : 0);
}
/*
* __split_multi_inmem --
- * Instantiate a page in a multi-block set, when an update couldn't be
- * written.
+ * Instantiate a page in a multi-block set.
*/
static int
__split_multi_inmem(
@@ -699,13 +704,12 @@ __split_multi_inmem(
uint64_t recno;
uint32_t i, slot;
- __wt_btcur_init(session, &cbt);
- __wt_btcur_open(&cbt);
-
/*
- * We can find unresolved updates when attempting to evict a page, which
- * can't be written. This code re-creates the in-memory page and applies
- * the unresolved updates to that page.
+ * This code re-creates an in-memory page that is part of a set created
+ * while evicting a large page, and adds references to any unresolved
+ * update chains to the new page. We get here due to choosing to keep
+ * the results of a split in memory or because an update could not be
+ * written when attempting to evict a page.
*
* Clear the disk image and link the page into the passed-in WT_REF to
* simplify error handling: our caller will not discard the disk image
@@ -713,13 +717,16 @@ __split_multi_inmem(
* allocated page on error, when discarding the allocated WT_REF.
*/
WT_RET(__wt_page_inmem(session, ref,
- multi->supd_dsk, ((WT_PAGE_HEADER *)multi->supd_dsk)->mem_size,
+ multi->disk_image, ((WT_PAGE_HEADER *)multi->disk_image)->mem_size,
WT_PAGE_DISK_ALLOC, &page));
- multi->supd_dsk = NULL;
+ multi->disk_image = NULL;
if (orig->type == WT_PAGE_ROW_LEAF)
WT_RET(__wt_scr_alloc(session, 0, &key));
+ __wt_btcur_init(session, &cbt);
+ __wt_btcur_open(&cbt);
+
/* Re-create each modification we couldn't write. */
for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd)
switch (orig->type) {
@@ -727,7 +734,6 @@ __split_multi_inmem(
case WT_PAGE_COL_VAR:
/* Build a key. */
upd = supd->ins->upd;
- supd->ins->upd = NULL;
recno = WT_INSERT_RECNO(supd->ins);
/* Search the page. */
@@ -742,13 +748,11 @@ __split_multi_inmem(
if (supd->ins == NULL) {
slot = WT_ROW_SLOT(orig, supd->rip);
upd = orig->pg_row_upd[slot];
- orig->pg_row_upd[slot] = NULL;
WT_ERR(__wt_row_leaf_key(
session, orig, supd->rip, key, false));
} else {
upd = supd->ins->upd;
- supd->ins->upd = NULL;
key->data = WT_INSERT_KEY(supd->ins);
key->size = WT_INSERT_KEY_SIZE(supd->ins);
@@ -765,13 +769,14 @@ __split_multi_inmem(
}
/*
- * We modified the page above, which will have set the first dirty
+ * If we modified the page above, it will have set the first dirty
* transaction to the last transaction currently running. However, the
* updates we installed may be older than that. Set the first dirty
* transaction to an impossibly old value so this page is never skipped
* in a checkpoint.
*/
- page->modify->first_dirty_txn = WT_TXN_FIRST;
+ if (page->modify != NULL)
+ page->modify->first_dirty_txn = WT_TXN_FIRST;
err: /* Free any resources that may have been cached in the cursor. */
WT_TRET(__wt_btcur_close(&cbt, true));
@@ -781,6 +786,38 @@ err: /* Free any resources that may have been cached in the cursor. */
}
/*
+ * __split_multi_inmem_final --
+ * Discard moved update lists from the original page.
+ *
+ * Walks the multi->supd_entries saved-update entries in multi->supd and
+ * sets the original page's pointer to each update list to NULL. Nothing
+ * is freed here and there is no return value.
+ */
+static void
+__split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi)
+{
+ WT_SAVE_UPD *supd;
+ uint32_t i, slot;
+
+ /*
+ * We've successfully created new in-memory pages. For error-handling
+ * reasons, we've left the update chains referenced by both the original
+ * and new pages. We're ready to discard the original page, terminate
+ * the original page's reference to any update list we moved.
+ *
+ * For WT_PAGE_COL_FIX and WT_PAGE_COL_VAR pages the update list is
+ * always reached through the saved entry's insert structure
+ * (supd->ins->upd). For WT_PAGE_ROW_LEAF pages it is reached through
+ * the insert structure when supd->ins is set, otherwise through the
+ * page's pg_row_upd array at the slot computed from supd->rip. Any
+ * other page type falls through the switch with no action.
+ */
+ for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd)
+ switch (orig->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ supd->ins->upd = NULL;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (supd->ins == NULL) {
+ slot = WT_ROW_SLOT(orig, supd->rip);
+ orig->pg_row_upd[slot] = NULL;
+ } else
+ supd->ins->upd = NULL;
+ break;
+ }
+}
+
+/*
* __wt_multi_to_ref --
* Move a multi-block list into an array of WT_REF structures.
*/
@@ -801,16 +838,10 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
ref = *refp;
incr += sizeof(WT_REF);
- /*
- * Any parent reference must be filled in by our caller; the primary
- * use of this function is when splitting into a parent page, and we
- * aren't holding any locks here that would allow us to know which
- * parent we'll eventually split into, if the tree is simultaneously
- * being deepened.
- */
+ /* Any parent reference is filled in by our caller. */
ref->home = NULL;
- if (multi->supd == NULL) {
+ if (multi->disk_image == NULL) {
/*
* Copy the address: we could simply take the buffer, but that
* would complicate error handling, freeing the reference array
@@ -839,7 +870,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
break;
}
- ref->state = multi->supd == NULL ? WT_REF_DISK : WT_REF_MEM;
+ ref->state = addr != NULL ? WT_REF_DISK : WT_REF_MEM;
/*
* If our caller wants to track the memory allocations, we have a return
@@ -982,11 +1013,11 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* reading thread will restart. Include the ref we are splitting in
* the count to be deleted.
*/
- for (i = 0, deleted_entries = 1; i < parent_entries; ++i) {
+ for (deleted_entries = 1, i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
if (next_ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, next_ref) &&
+ __wt_delete_page_skip(session, next_ref, true) &&
__wt_atomic_casv32(
&next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))
deleted_entries++;
@@ -999,6 +1030,13 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
result_entries = (parent_entries + new_entries) - deleted_entries;
/*
+ * If the entire (sub)tree is empty, give up: we can't leave an empty
+ * internal page.
+ */
+ if (result_entries == 0)
+ return (0);
+
+ /*
* Allocate and initialize a new page index array for the parent, then
* copy references from the original index array, plus references from
* the newly created split array, into place.
@@ -1042,6 +1080,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
*alloc_refp++ = next_ref;
}
+ /* Check that we filled in all the entries. */
+ WT_ASSERT(session,
+ alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
+
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -1078,9 +1120,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
"%s split into parent %" PRIu32 " -> %" PRIu32
- " (%" PRIu32 ")",
- __wt_page_type_string(ref->page->type), parent_entries,
- result_entries, result_entries - parent_entries));
+ " (%" PRIu32 ")", ref->page == NULL ?
+ "reverse" : __wt_page_type_string(ref->page->type),
+ parent_entries, result_entries, result_entries - parent_entries));
/*
* The new page index is in place, free the WT_REF we were splitting
@@ -1172,20 +1214,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
__split_should_deepen(session, parent_ref))
ret = __split_deepen(session, parent);
-err: if (!complete)
+err: /*
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened, and our caller has to proceed
+ * with the split.
+ */
+ if (!complete) {
for (i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref->state == WT_REF_SPLIT)
next_ref->state = WT_REF_DELETED;
}
- __wt_free_ref_index(session, NULL, alloc_index, false);
+ __wt_free_ref_index(session, NULL, alloc_index, false);
+ }
- /*
- * A note on error handling: if we completed the split, return success,
- * nothing really bad can have happened, and our caller has to proceed
- * with the split.
- */
if (ret != 0 && ret != WT_PANIC)
__wt_err(session, ret,
"ignoring not-fatal error during parent page split");
@@ -1479,6 +1522,24 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
}
/*
+ * __wt_split_reverse --
+ * We have a locked ref that is empty and we want to rewrite the index in
+ * its parent.
+ *
+ * Acquires the parent with __split_parent_lock, calls __split_parent
+ * passing NULL and zeros for every argument after ref, then releases the
+ * parent with __split_parent_unlock. The unlock call is made whether or
+ * not __split_parent reported an error, and its result is passed through
+ * WT_TRET before ret is returned. NOTE(review): WT_RET presumably
+ * returns immediately if the lock call fails, in which case neither
+ * __split_parent nor the unlock runs -- confirm against the macro.
+ */
+int
+__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ bool hazard;
+
+ WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
+ ret = __split_parent(session, ref, NULL, 0, 0, 0);
+ WT_TRET(__split_parent_unlock(session, parent, hazard));
+ return (ret);
+}
+
+/*
* __wt_split_rewrite --
* Rewrite an in-memory page with a new version.
*/
@@ -1506,6 +1567,14 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
WT_RET(__split_multi_inmem(session, page, &new, &mod->mod_multi[0]));
/*
+ * The rewrite succeeded, we can no longer fail.
+ *
+ * Finalize the move, discarding moved update lists from the original
+ * page.
+ */
+ __split_multi_inmem_final(page, &mod->mod_multi[0]);
+
+ /*
* Discard the original page.
*
* Pages with unresolved changes are not marked clean during
@@ -1560,33 +1629,43 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_split);
- __wt_free(session, ref_new);
-
/*
- * The split succeeded, discard the page.
+ * The split succeeded, we can no longer fail.
*
- * Pages with unresolved changes are not marked clean during
- * reconciliation, do it now.
+ * Finalize the move, discarding moved update lists from the original
+ * page.
+ */
+ for (i = 0; i < new_entries; ++i)
+ __split_multi_inmem_final(page, &mod->mod_multi[i]);
+
+ /*
+ * Pages with unresolved changes are not marked clean in reconciliation,
+ * do it now, then discard the page.
*/
__wt_page_modify_clear(session, page);
__wt_page_out(session, &page);
- return (0);
+ if (0) {
+err: /*
+ * A note on error handling: when handling unresolved changes,
+ * we create new in-memory pages with those unresolved changes.
+ * The problem is the new pages are given references to the
+ * original page's update lists, and once all of the pages are
+ * created, there's a second pass to remove the updates from the
+ * original page. If an error occurs, we can't simply free the
+ * newly created pages, that would discard the original page's
+ * updates. Set a flag so the discard function doesn't discard
+ * the updates on the page.
+ */
+ for (i = 0; i < new_entries; ++i)
+ if (ref_new[i]->page != NULL) {
+ F_SET_ATOMIC(
+ ref_new[i]->page, WT_PAGE_UPDATE_IGNORE);
+ __wt_free_ref(session,
+ ref_new[i]->page, ref_new[i], true);
+ }
+ }
-err: /*
- * A note on error handling: in the case of evicting a page that has
- * unresolved changes, we just instantiated some in-memory pages that
- * reflect those unresolved changes. The problem is those pages
- * reference the same WT_UPDATE chains as the page we're splitting,
- * that is, we simply copied references into the new pages. If the
- * split fails, the original page is fine, but discarding the created
- * page would free those update chains, and that's wrong. There isn't
- * an easy solution, there's a lot of small memory allocations in some
- * common code paths, and unwinding those changes will be difficult.
- * For now, leak the memory by not discarding the instantiated pages.
- */
- for (i = 0; i < new_entries; ++i)
- __wt_free_ref(session, page, ref_new[i], false);
__wt_free(session, ref_new);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index 6e1d182ed0b..8e0f4036b79 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -94,6 +94,9 @@ __wt_tree_walk(WT_SESSION_IMPL *session,
*/
WT_ENTER_PAGE_INDEX(session);
+ /* Walk should never instantiate deleted pages. */
+ LF_SET(WT_READ_NO_EMPTY);
+
/*
* !!!
* Fast-truncate currently only works on row-store trees.
@@ -174,9 +177,10 @@ ascend: /*
/*
* If we got all the way through an internal page and
- * all of the child pages were deleted, evict it.
+ * all of the child pages were deleted, mark it for
+ * eviction.
*/
- if (empty_internal) {
+ if (empty_internal && pindex->entries > 1) {
__wt_page_evict_soon(ref->page);
empty_internal = false;
}
@@ -257,7 +261,7 @@ ascend: /*
* to delete it again.
*/
if (ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, ref))
+ __wt_delete_page_skip(session, ref, false))
break;
/*
* If deleting a range, try to delete the page
@@ -294,7 +298,7 @@ ascend: /*
* Try to skip deleted pages visible to us.
*/
if (ref->state == WT_REF_DELETED &&
- __wt_delete_page_skip(session, ref))
+ __wt_delete_page_skip(session, ref, false))
break;
}
@@ -302,7 +306,7 @@ ascend: /*
/*
* Not-found is an expected return when only walking
- * in-cache pages.
+ * in-cache pages, or if we see a deleted page.
*/
if (ret == WT_NOTFOUND) {
ret = 0;
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index b99c93d319a..87929d8a457 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -218,10 +218,11 @@ restart: page = current->page;
}
/*
- * Binary search of the internal page. There are two versions
- * (a default loop and an application-specified collation loop),
- * because moving the collation test and error handling inside
- * the loop costs about 5%.
+ * Binary search of an internal page. There are three versions
+ * (keys with no application-specified collation order, in long
+ * and short versions, and keys with an application-specified
+ * collation order), because doing the tests and error handling
+ * inside the loop costs about 5%.
*
* The 0th key on an internal page is a problem for a couple of
* reasons. First, we have to force the 0th key to sort less
@@ -236,7 +237,22 @@ restart: page = current->page;
*/
base = 1;
limit = pindex->entries - 1;
- if (collator == NULL)
+ if (collator == NULL &&
+ srch_key->size <= WT_COMPARE_SHORT_MAXLEN)
+ for (; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ descent = pindex->index[indx];
+ __wt_ref_key(
+ page, descent, &item->data, &item->size);
+
+ cmp = __wt_lex_compare_short(srch_key, item);
+ if (cmp > 0) {
+ base = indx + 1;
+ --limit;
+ } else if (cmp == 0)
+ goto descend;
+ }
+ else if (collator == NULL)
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
descent = pindex->index[indx];
@@ -356,13 +372,28 @@ leaf_only:
}
/*
- * Binary search of the leaf page. There are two versions (a default
- * loop and an application-specified collation loop), because moving
- * the collation test and error handling inside the loop costs about 5%.
+ * Binary search of a leaf page. There are three versions (keys with
+ * no application-specified collation order, in long and short versions,
+ * and keys with an application-specified collation order), because
+ * doing the tests and error handling inside the loop costs about 5%.
*/
base = 0;
limit = page->pg_row_entries;
- if (collator == NULL)
+ if (collator == NULL && srch_key->size <= WT_COMPARE_SHORT_MAXLEN)
+ for (; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ rip = page->pg_row_d + indx;
+ WT_ERR(
+ __wt_row_leaf_key(session, page, rip, item, true));
+
+ cmp = __wt_lex_compare_short(srch_key, item);
+ if (cmp > 0) {
+ base = indx + 1;
+ --limit;
+ } else if (cmp == 0)
+ goto leaf_match;
+ }
+ else if (collator == NULL)
for (; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
rip = page->pg_row_d + indx;
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index 419f4124133..c8aca15d103 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -519,6 +519,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
NULL, NULL,
confchk_wiredtiger_open_file_manager_subconfigs, 3 },
{ "hazard_max", "int", NULL, "min=15", NULL, 0 },
+ { "in_memory", "boolean", NULL, NULL, NULL, 0 },
{ "log", "category",
NULL, NULL,
confchk_wiredtiger_open_log_subconfigs, 8 },
@@ -594,6 +595,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
NULL, NULL,
confchk_wiredtiger_open_file_manager_subconfigs, 3 },
{ "hazard_max", "int", NULL, "min=15", NULL, 0 },
+ { "in_memory", "boolean", NULL, NULL, NULL, 0 },
{ "log", "category",
NULL, NULL,
confchk_wiredtiger_open_log_subconfigs, 8 },
@@ -974,8 +976,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"eviction_target=80,eviction_trigger=95,exclusive=0,extensions=,"
"file_extend=,file_manager=(close_handle_minimum=250,"
"close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
- "log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
- "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=,"
+ "in_memory=0,log=(archive=,compressor=,enabled=0,file_max=100MB,"
+ "path=,prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=,"
"worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
"session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB"
",name=,quota=0,reserve=0,size=500MB),statistics=none,"
@@ -983,7 +985,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"transaction_sync=(enabled=0,method=fsync),use_environment_priv=0"
",verbose=",
- confchk_wiredtiger_open, 34
+ confchk_wiredtiger_open, 35
},
{ "wiredtiger_open_all",
"async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
@@ -995,8 +997,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"eviction_target=80,eviction_trigger=95,exclusive=0,extensions=,"
"file_extend=,file_manager=(close_handle_minimum=250,"
"close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
- "log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
- "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=,"
+ "in_memory=0,log=(archive=,compressor=,enabled=0,file_max=100MB,"
+ "path=,prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=,"
"worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
"session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB"
",name=,quota=0,reserve=0,size=500MB),statistics=none,"
@@ -1004,7 +1006,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"transaction_sync=(enabled=0,method=fsync),use_environment_priv=0"
",verbose=,version=(major=0,minor=0)",
- confchk_wiredtiger_open_all, 35
+ confchk_wiredtiger_open_all, 36
},
{ "wiredtiger_open_basecfg",
"async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c b/src/third_party/wiredtiger/src/conn/api_strerror.c
index 92f12402537..edb11957556 100644
--- a/src/third_party/wiredtiger/src/conn/api_strerror.c
+++ b/src/third_party/wiredtiger/src/conn/api_strerror.c
@@ -38,6 +38,8 @@ __wt_wiredtiger_error(int error)
return ("WT_RESTART: restart the operation (internal)");
case WT_RUN_RECOVERY:
return ("WT_RUN_RECOVERY: recovery must be run to continue");
+ case WT_CACHE_FULL:
+ return ("WT_CACHE_FULL: operation would overflow cache");
}
/*
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index b50ad750158..d86b02287f0 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1726,6 +1726,7 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
"create=,"
"encryption=(secretkey=),"
"exclusive=,"
+ "in_memory=,"
"log=(recover=),"
"use_environment_priv=,"
"verbose=,", &base_config));
@@ -1798,7 +1799,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_DECL_RET;
const WT_NAME_FLAG *ft;
WT_SESSION_IMPL *session;
- int64_t config_base_set;
+ bool config_base_set;
const char *enc_cfg[] = { NULL, NULL };
char version[64];
@@ -1842,7 +1843,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
/* Capture the config_base setting file for later use. */
WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval));
- config_base_set = cval.val;
+ config_base_set = cval.val != 0;
/* Configure error messages so we get them right early. */
WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
@@ -1850,6 +1851,13 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR(__wt_strndup(
session, cval.str, cval.len, &conn->error_prefix));
+ /*
+ * XXX ideally, we would check "in_memory" here, so we could completely
+ * avoid having a database directory. However, it can be convenient to
+ * pass "in_memory" via the WIREDTIGER_CONFIG environment variable, and
+ * we haven't read it yet.
+ */
+
/* Get the database home. */
WT_ERR(__conn_home(session, home, cfg));
@@ -1883,7 +1891,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
__conn_config_append(cfg, version);
/* Ignore the base_config file if we config_base set to false. */
- if (config_base_set != 0)
+ if (config_base_set)
WT_ERR(
__conn_config_file(session, WT_BASECONFIG, false, cfg, i1));
__conn_config_append(cfg, config);
@@ -1921,6 +1929,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR(__wt_config_gets(session, cfg, "session_scratch_max", &cval));
conn->session_scratch_max = (size_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "in_memory", &cval));
+ if (cval.val != 0)
+ F_SET(conn, WT_CONN_IN_MEMORY);
+
WT_ERR(__wt_config_gets(session, cfg, "checkpoint_sync", &cval));
if (cval.val)
F_SET(conn, WT_CONN_CKPT_SYNC);
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
index caf0c3b68f0..8f039e61654 100644
--- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -32,8 +32,19 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp)
*/
WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval));
conn->ckpt_usecs = (uint64_t)cval.val * 1000000;
+
WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval));
conn->ckpt_logsize = (wt_off_t)cval.val;
+
+ /* Checkpoints are incompatible with in-memory configuration */
+ if (conn->ckpt_usecs != 0 || conn->ckpt_logsize != 0) {
+ WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
+ if (cval.val != 0)
+ WT_RET_MSG(session, EINVAL,
+ "In memory configuration incompatible with "
+ "checkpoints");
+ }
+
__wt_log_written_reset(session);
if ((conn->ckpt_usecs == 0 && conn->ckpt_logsize == 0) ||
(conn->ckpt_logsize && conn->ckpt_usecs == 0 &&
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 0b364b5fd4b..c6d5b535b86 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -194,7 +194,8 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
if (force && (btree->bm == NULL || btree->bm->map == NULL)) {
WT_ERR(__conn_dhandle_mark_dead(session));
marked_dead = true;
- } else
+ }
+ if (!marked_dead || final)
WT_ERR(__wt_checkpoint_close(session, final));
}
@@ -695,7 +696,7 @@ restart:
WT_WITH_DHANDLE(session, dhandle,
WT_TRET(__wt_conn_dhandle_discard_single(
- session, true, false)));
+ session, true, F_ISSET(conn, WT_CONN_IN_MEMORY))));
goto restart;
}
@@ -712,7 +713,7 @@ restart:
while ((dhandle = TAILQ_FIRST(&conn->dhqh)) != NULL)
WT_WITH_DHANDLE(session, dhandle,
WT_TRET(__wt_conn_dhandle_discard_single(
- session, true, false)));
+ session, true, F_ISSET(conn, WT_CONN_IN_MEMORY))));
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c
index e10f2a8c968..cc4e3ae2681 100644
--- a/src/third_party/wiredtiger/src/conn/conn_handle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_handle.c
@@ -59,6 +59,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure"));
WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema"));
WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation"));
+ WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file"));
WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock));
WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->page_lock);
@@ -145,6 +146,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->reconfig_lock);
__wt_spin_destroy(session, &conn->schema_lock);
__wt_spin_destroy(session, &conn->table_lock);
+ __wt_spin_destroy(session, &conn->turtle_lock);
for (i = 0; i < WT_PAGE_LOCKS; ++i)
__wt_spin_destroy(session, &conn->page_lock[i]);
__wt_free(session, conn->page_lock);
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index 9068e7e85a2..527b756ee1a 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -47,9 +47,13 @@ __logmgr_config(
{
WT_CONFIG_ITEM cval;
WT_CONNECTION_IMPL *conn;
+ bool enabled;
conn = S2C(session);
+ WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
+ enabled = cval.val != 0;
+
/*
* If we're reconfiguring, enabled must match the already
* existing setting.
@@ -57,14 +61,21 @@ __logmgr_config(
* If it is off and the user it turning it on, or it is on
* and the user is turning it off, return an error.
*/
- WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
if (reconfig &&
- ((cval.val != 0 &&
- !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) ||
- (cval.val == 0 &&
- FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))))
+ ((enabled && !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) ||
+ (!enabled && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))))
return (EINVAL);
- *runp = cval.val != 0;
+
+ /* Logging is incompatible with in-memory */
+ if (enabled) {
+ WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
+ if (cval.val != 0)
+ WT_RET_MSG(session, EINVAL,
+ "In memory configuration incompatible with "
+ "log=(enabled=true)");
+ }
+
+ *runp = enabled;
/*
* Setup a log path and compression even if logging is disabled in case
@@ -379,9 +390,16 @@ __log_file_server(void *arg)
* to move the sync_lsn into the next file for
* later syncs.
*/
+ WT_ERR(__wt_fsync(session, close_fh));
+ /*
+ * We want to make sure the file size reflects
+ * actual data and has minimal pre-allocated
+ * zeroed space.
+ */
+ WT_ERR(__wt_ftruncate(
+ session, close_fh, close_end_lsn.offset));
close_end_lsn.file++;
close_end_lsn.offset = 0;
- WT_ERR(__wt_fsync(session, close_fh));
__wt_spin_lock(session, &log->log_sync_lock);
locked = true;
WT_ERR(__wt_close(session, &close_fh));
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index 23846f978fe..a8620ebaa99 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -334,6 +334,15 @@ __wt_sweep_config(WT_SESSION_IMPL *session, const char *cfg[])
cfg, "file_manager.close_idle_time", &cval));
conn->sweep_idle_time = (time_t)cval.val;
+ /* Non-zero sweep idle time is incompatible with in-memory */
+ if (conn->sweep_idle_time != 0) {
+ WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
+ if (cval.val != 0)
+ WT_RET_MSG(session, EINVAL,
+ "In memory configuration incompatible with "
+ "non zero file_manager=(close_idle_time)");
+ }
+
WT_RET(__wt_config_gets(session,
cfg, "file_manager.close_scan_interval", &cval));
conn->sweep_interval = (time_t)cval.val;
diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c
index 8ee57d24413..ccc19717612 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_ds.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c
@@ -384,7 +384,7 @@ __curds_remove(WT_CURSOR *cursor)
source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
- CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+ CURSOR_REMOVE_API_CALL(cursor, session, NULL);
WT_STAT_FAST_CONN_INCR(session, cursor_remove);
WT_STAT_FAST_DATA_INCR(session, cursor_remove);
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index 65f5dafc344..1db819b8b40 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -323,7 +323,7 @@ __curfile_remove(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
cbt = (WT_CURSOR_BTREE *)cursor;
- CURSOR_UPDATE_API_CALL(cursor, session, remove, cbt->btree);
+ CURSOR_REMOVE_API_CALL(cursor, session, cbt->btree);
WT_CURSOR_NEEDKEY(cursor);
WT_CURSOR_NOVALUE(cursor);
@@ -495,24 +495,30 @@ __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
bitmap = bulk = false;
flags = 0;
- WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
- if (cval.type == WT_CONFIG_ITEM_BOOL ||
- (cval.type == WT_CONFIG_ITEM_NUM &&
- (cval.val == 0 || cval.val == 1))) {
- bitmap = false;
- bulk = cval.val != 0;
- } else if (WT_STRING_MATCH("bitmap", cval.str, cval.len))
- bitmap = bulk = true;
- /*
- * Unordered bulk insert is a special case used internally by
- * index creation on existing tables. It doesn't enforce
- * any special semantics at the file level. It primarily
- * exists to avoid some locking problems with LSM trees and
- * index creation.
- */
- else if (!WT_STRING_MATCH("unordered", cval.str, cval.len))
- WT_RET_MSG(session, EINVAL,
- "Value for 'bulk' must be a boolean or 'bitmap'");
+ /*
+ * Decode the bulk configuration settings. In memory databases
+ * ignore bulk load.
+ */
+ if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) {
+ WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
+ if (cval.type == WT_CONFIG_ITEM_BOOL ||
+ (cval.type == WT_CONFIG_ITEM_NUM &&
+ (cval.val == 0 || cval.val == 1))) {
+ bitmap = false;
+ bulk = cval.val != 0;
+ } else if (WT_STRING_MATCH("bitmap", cval.str, cval.len))
+ bitmap = bulk = true;
+ /*
+ * Unordered bulk insert is a special case used
+ * internally by index creation on existing tables. It
+ * doesn't enforce any special semantics at the file
+ * level. It primarily exists to avoid some locking
+ * problems between LSM and index creation.
+ */
+ else if (!WT_STRING_MATCH("unordered", cval.str, cval.len))
+ WT_RET_MSG(session, EINVAL,
+ "Value for 'bulk' must be a boolean or 'bitmap'");
+ }
/* Bulk handles require exclusive access. */
if (bulk)
diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
index 460c46c0d29..55da93859a6 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_metadata.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
@@ -171,7 +171,15 @@ __curmetadata_next(WT_CURSOR *cursor)
if (!F_ISSET(mdc, WT_MDC_POSITIONED))
WT_ERR(__curmetadata_metadata_search(session, cursor));
else {
- WT_ERR(file_cursor->next(mdc->file_cursor));
+ /*
+ * When applications open metadata cursors, they expect to see
+ * all schema-level operations reflected in the results. Query
+ * at read-uncommitted to avoid confusion caused by the current
+ * transaction state.
+ */
+ WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED,
+ ret = file_cursor->next(mdc->file_cursor));
+ WT_ERR(ret);
WT_ERR(__curmetadata_setkv(mdc, file_cursor));
}
@@ -204,7 +212,8 @@ __curmetadata_prev(WT_CURSOR *cursor)
goto err;
}
- ret = file_cursor->prev(file_cursor);
+ WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED,
+ ret = file_cursor->prev(file_cursor));
if (ret == 0)
WT_ERR(__curmetadata_setkv(mdc, file_cursor));
else if (ret == WT_NOTFOUND)
@@ -264,7 +273,9 @@ __curmetadata_search(WT_CURSOR *cursor)
if (WT_KEY_IS_METADATA(&cursor->key))
WT_ERR(__curmetadata_metadata_search(session, cursor));
else {
- WT_ERR(file_cursor->search(file_cursor));
+ WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED,
+ ret = file_cursor->search(file_cursor));
+ WT_ERR(ret);
WT_ERR(__curmetadata_setkv(mdc, file_cursor));
}
@@ -298,7 +309,9 @@ __curmetadata_search_near(WT_CURSOR *cursor, int *exact)
WT_ERR(__curmetadata_metadata_search(session, cursor));
*exact = 1;
} else {
- WT_ERR(file_cursor->search_near(file_cursor, exact));
+ WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED,
+ ret = file_cursor->search_near(file_cursor, exact));
+ WT_ERR(ret);
WT_ERR(__curmetadata_setkv(mdc, file_cursor));
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c
index 01d1fdd1886..38359236b27 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_table.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_table.c
@@ -610,7 +610,7 @@ __curtable_remove(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
ctable = (WT_CURSOR_TABLE *)cursor;
- CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+ CURSOR_REMOVE_API_CALL(cursor, session, NULL);
WT_ERR(__curtable_open_indices(ctable));
/* Find the old record so it can be removed from indices */
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index f9171900ca4..c28b89b81ce 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -158,6 +158,9 @@ __evict_server(void *arg)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+#ifdef HAVE_DIAGNOSTIC
+ struct timespec now, stuck_ts;
+#endif
u_int spins;
session = arg;
@@ -200,6 +203,20 @@ __evict_server(void *arg)
/* Next time we wake up, reverse the sweep direction. */
cache->flags ^= WT_CACHE_WALK_REVERSE;
+#ifdef HAVE_DIAGNOSTIC
+ stuck_ts.tv_sec = 0;
+ } else if (stuck_ts.tv_sec == 0)
+ WT_ERR(__wt_epoch(session, &stuck_ts));
+ else {
+ /* After being stuck for 5 minutes, give up. */
+ WT_ERR(__wt_epoch(session, &now));
+ if (WT_TIMEDIFF(now, stuck_ts) / WT_BILLION > 300) {
+ __wt_errx(session,
+ "Cache stuck for too long, giving up");
+ (void)__wt_cache_dump(session, NULL);
+ WT_ERR(ETIMEDOUT);
+ }
+#endif
}
WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"));
@@ -1210,8 +1227,9 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
if (__wt_page_is_empty(page))
goto fast;
- /* Optionally ignore clean pages. */
- if (!modified && FLD_ISSET(cache->state, WT_EVICT_PASS_DIRTY))
+ /* Skip clean pages if appropriate. */
+ if (!modified && (F_ISSET(conn, WT_CONN_IN_MEMORY) ||
+ FLD_ISSET(cache->state, WT_EVICT_PASS_DIRTY)))
continue;
/*
@@ -1560,11 +1578,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
#ifdef HAVE_DIAGNOSTIC
/*
* __wt_cache_dump --
- * Dump debugging information to stdout about the size of the files in the
- * cache.
- *
- * NOTE: this function is not called anywhere, it is intended to be called
- * from a debugger.
+ * Dump debugging information to a file (default stderr) about the size of
+ * the files in the cache.
*/
int
__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
@@ -1574,60 +1589,95 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
WT_DATA_HANDLE *dhandle, *saved_dhandle;
WT_PAGE *page;
WT_REF *next_walk;
- uint64_t file_intl_pages, file_leaf_pages;
- uint64_t file_bytes, file_dirty, total_bytes;
+ uint64_t dirty_bytes, dirty_pages, intl_bytes, intl_pages;
+ uint64_t leaf_bytes, leaf_pages;
+ uint64_t max_dirty_bytes, max_intl_bytes, max_leaf_bytes, total_bytes;
+ size_t size;
conn = S2C(session);
total_bytes = 0;
if (ofile == NULL)
- fp = stdout;
+ fp = stderr;
else
WT_RET(__wt_fopen(session, ofile, WT_FHANDLE_WRITE, 0, &fp));
+ /* Note: odd string concatenation avoids spelling errors. */
+ (void)__wt_fprintf(fp, "==========\n" "cache dump\n");
+
saved_dhandle = session->dhandle;
TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
!F_ISSET(dhandle, WT_DHANDLE_OPEN))
continue;
- file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0;
+ dirty_bytes = dirty_pages = intl_bytes = intl_pages = 0;
+ leaf_bytes = leaf_pages = 0;
+ max_dirty_bytes = max_intl_bytes = max_leaf_bytes = 0;
+
next_walk = NULL;
session->dhandle = dhandle;
while (__wt_tree_walk(session,
&next_walk, NULL, WT_READ_CACHE | WT_READ_NO_WAIT) == 0 &&
next_walk != NULL) {
page = next_walk->page;
- if (WT_PAGE_IS_INTERNAL(page))
- ++file_intl_pages;
- else
- ++file_leaf_pages;
- file_bytes += page->memory_footprint;
- if (__wt_page_is_modified(page))
- file_dirty += page->memory_footprint;
- (void)__wt_fprintf(fp,
- "%" WT_SIZET_FMT ", ", page->memory_footprint);
+ size = page->memory_footprint;
+
+ if (WT_PAGE_IS_INTERNAL(page)) {
+ ++intl_pages;
+ intl_bytes += size;
+ max_intl_bytes = WT_MAX(max_intl_bytes, size);
+ } else {
+ ++leaf_pages;
+ leaf_bytes += size;
+ max_leaf_bytes = WT_MAX(max_leaf_bytes, size);
+ }
+ if (__wt_page_is_modified(page)) {
+ ++dirty_pages;
+ dirty_bytes += size;
+ max_dirty_bytes =
+ WT_MAX(max_dirty_bytes, size);
+ }
}
session->dhandle = NULL;
- (void)__wt_fprintf(fp, "\n" "cache dump: %s%s%s%s\n\t"
- " %" PRIu64 " internal pages, %" PRIu64 " leaf pages,"
- " %" PRIu64 "MB, %" PRIu64 "MB dirty\n==============\n",
- dhandle->name,
- dhandle->checkpoint == NULL ? "" : " [",
- dhandle->checkpoint == NULL ? "" : dhandle->checkpoint,
- dhandle->checkpoint == NULL ? "" : "]",
- file_intl_pages, file_leaf_pages,
- file_bytes >> 20, file_dirty >> 20);
-
- total_bytes += file_bytes;
+ if (dhandle->checkpoint == NULL)
+ (void)__wt_fprintf(fp, "%s(<live>): \n", dhandle->name);
+ else
+ (void)__wt_fprintf(fp, "%s(checkpoint=%s): \n",
+ dhandle->name, dhandle->checkpoint);
+ if (intl_pages != 0)
+ (void)__wt_fprintf(fp, "\t" "internal pages: "
+ "%" PRIu64 " pages, %" PRIu64
+ " max, %" PRIu64 "MB total\n",
+ intl_pages, max_intl_bytes, intl_bytes >> 20);
+ if (leaf_pages != 0)
+ (void)__wt_fprintf(fp, "\t" "leaf pages: "
+ "%" PRIu64 " pages, %" PRIu64
+ " max, %" PRIu64 "MB total\n",
+ leaf_pages, max_leaf_bytes, leaf_bytes >> 20);
+ if (dirty_pages != 0)
+ (void)__wt_fprintf(fp, "\t" "dirty pages: "
+ "%" PRIu64 " pages, %" PRIu64
+ " max, %" PRIu64 "MB total\n",
+ dirty_pages, max_dirty_bytes, dirty_bytes >> 20);
+
+ total_bytes += intl_bytes + leaf_bytes;
}
session->dhandle = saved_dhandle;
+ /*
+ * Apply the overhead percentage so our total bytes are comparable with
+ * the tracked value.
+ */
+ if (conn->cache->overhead_pct != 0)
+ total_bytes +=
+ (total_bytes * (uint64_t)conn->cache->overhead_pct) / 100;
(void)__wt_fprintf(fp, "cache dump: total found = %" PRIu64 "MB"
" vs tracked inuse %" PRIu64 "MB\n",
total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20);
- if (fp != stdout)
+ (void)__wt_fprintf(fp, "==========\n");
+ if (fp != stderr)
WT_RET(__wt_fclose(&fp, WT_FHANDLE_WRITE));
return (0);
}
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index fb42b928f28..7202da7927c 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -106,7 +106,8 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
conn->cache->evict_max_page_size = page->memory_footprint;
/* Update the reference and discard the page. */
- if (mod == NULL || mod->rec_result == 0) {
+ if ((mod == NULL || mod->rec_result == 0) &&
+ !F_ISSET(conn, WT_CONN_IN_MEMORY)) {
if (__wt_ref_is_root(ref))
__wt_ref_out(session, ref);
else
@@ -142,6 +143,50 @@ done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) &&
return (ret);
}
+/*
+ * __evict_delete_ref --
+ * Mark a page reference deleted and check if the parent can reverse
+ * split.
+ *
+ * The root reference is left untouched and 0 is returned. Otherwise,
+ * unless the file is closing, the deleted-entry count in the parent's
+ * page index is atomically incremented and a reverse split may be
+ * attempted. When a reverse split is attempted and returns anything
+ * other than EBUSY, that value (success or error) is returned directly
+ * and this function does not publish a new reference state itself. In
+ * all remaining cases the reference state is published as
+ * WT_REF_DELETED and 0 is returned.
+ */
+static int
+__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+ WT_PAGE_INDEX *pindex;
+ uint32_t ndeleted;
+
+ if (__wt_ref_is_root(ref))
+ return (0);
+
+ /*
+ * Avoid doing reverse splits when closing the file, it is
+ * wasted work and some structure may already have been freed.
+ * Note the deleted-entry counter is only maintained on this
+ * (not closing) path.
+ */
+ if (!closing) {
+ parent = ref->home;
+ WT_INTL_INDEX_GET(session, parent, pindex);
+ ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1);
+
+ /*
+ * If more than 10% of the parent references are deleted, try a
+ * reverse split. Don't bother if there is a single deleted
+ * reference: the internal page is empty and we have to wait
+ * for eviction to notice. (The check below is on the parent
+ * index holding more than one entry in total; the 10% threshold
+ * uses integer division of the entry count.)
+ *
+ * This will consume the deleted ref (and eventually free it).
+ * If the reverse split can't get the access it needs because
+ * something is busy, be sure that the page still ends up
+ * marked deleted: an EBUSY return falls through to the publish
+ * at the end of the function.
+ */
+ if (ndeleted > pindex->entries / 10 && pindex->entries > 1 &&
+ (ret = __wt_split_reverse(session, ref)) != EBUSY)
+ return (ret);
+ }
+
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ return (0);
+}
/*
* __wt_evict_page_clean_update --
@@ -151,6 +196,8 @@ int
__wt_evict_page_clean_update(
WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
+ WT_DECL_RET;
+
/*
* If doing normal system eviction, but only in the service of reducing
* the number of dirty pages, leave the clean page in cache.
@@ -164,8 +211,12 @@ __wt_evict_page_clean_update(
* page re-instantiated (for example, by searching) and never written.
*/
__wt_ref_out(session, ref);
- WT_PUBLISH(ref->state,
- ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);
+ if (ref->addr == NULL) {
+ WT_WITH_PAGE_INDEX(session,
+ ret = __evict_delete_ref(session, ref, closing));
+ WT_RET_BUSY_OK(ret);
+ } else
+ WT_PUBLISH(ref->state, WT_REF_DISK);
return (0);
}
@@ -178,6 +229,7 @@ static int
__evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
WT_ADDR *addr;
+ WT_DECL_RET;
WT_PAGE *parent;
WT_PAGE_MODIFY *mod;
@@ -206,14 +258,31 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
*/
__wt_ref_out(session, ref);
ref->addr = NULL;
- WT_PUBLISH(ref->state, WT_REF_DELETED);
+ WT_WITH_PAGE_INDEX(session,
+ ret = __evict_delete_ref(session, ref, closing));
+ WT_RET_BUSY_OK(ret);
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
/*
- * A real split where we reconciled a page and it turned into a
- * lot of pages.
+ * Either a split where we reconciled a page and it turned into
+ * a lot of pages or an in-memory page that got too large, we
+ * forcibly evicted it, and there wasn't anything to write.
+ *
+ * The latter is a special case of forced eviction. Imagine a
+ * thread updating a small set of keys on a leaf page. The page
+ * is too large or has too many deleted items, so we try and
+ * evict it, but after reconciliation there's only a small
+ * amount of live data (so it's a single page we can't split),
+ * and if there's an older reader somewhere, there's data on
+ * the page we can't write (so the page can't be evicted). In
+ * that case, we end up here with a single block that we can't
+ * write. Take advantage of the fact we have exclusive access
+ * to the page and rewrite it in memory.
*/
- WT_RET(__wt_split_multi(session, ref, closing));
+ if (mod->mod_multi_entries == 1)
+ WT_RET(__wt_split_rewrite(session, ref));
+ else
+ WT_RET(__wt_split_multi(session, ref, closing));
break;
case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
/*
@@ -248,20 +317,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
ref->addr = addr;
WT_PUBLISH(ref->state, WT_REF_DISK);
break;
- case WT_PM_REC_REWRITE:
- /*
- * An in-memory page that got too large, we forcibly evicted
- * it, and there wasn't anything to write. (Imagine two threads
- * updating a small set keys on a leaf page. The page is too
- * large so we try to evict it, but after reconciliation
- * there's only a small amount of data (so it's a single page
- * we can't split), and because there are two threads, there's
- * some data we can't write (so we can't evict it). In that
- * case, we take advantage of the fact we have exclusive access
- * to the page and rewrite it in memory.)
- */
- WT_RET(__wt_split_rewrite(session, ref));
- break;
WT_ILLEGAL_VALUE(session);
}
@@ -302,6 +357,7 @@ __evict_review(
WT_DECL_RET;
WT_PAGE *page;
uint32_t flags;
+ bool modified;
/*
* Get exclusive access to the page if our caller doesn't have the tree
@@ -322,6 +378,14 @@ __evict_review(
/* Now that we have exclusive access, review the page. */
page = ref->page;
+ modified = __wt_page_is_modified(page);
+
+ /*
+ * Clean pages can't be evicted when running in memory only. This
+ * should be uncommon - we don't add clean pages to the queue.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && !modified && !closing)
+ return (EBUSY);
/*
* Fail if an internal has active children, the children must be evicted
@@ -341,7 +405,7 @@ __evict_review(
* Update the oldest ID to avoid wasted effort should it have
* fallen behind current.
*/
- if (__wt_page_is_modified(page))
+ if (modified)
__wt_txn_update_oldest(session, true);
if (!__wt_page_can_evict(session, ref, false, inmem_splitp))
@@ -359,7 +423,7 @@ __evict_review(
}
/* If the page is clean, we're done and we can evict. */
- if (!__wt_page_is_modified(page))
+ if (!modified)
return (0);
/*
@@ -389,7 +453,9 @@ __evict_review(
if (closing)
LF_SET(WT_VISIBILITY_ERR);
else if (!WT_PAGE_IS_INTERNAL(page)) {
- if (page->read_gen == WT_READGEN_OLDEST)
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ LF_SET(WT_EVICT_IN_MEMORY | WT_EVICT_UPDATE_RESTORE);
+ else if (page->read_gen == WT_READGEN_OLDEST)
LF_SET(WT_EVICT_UPDATE_RESTORE);
else if (__wt_eviction_aggressive(session))
LF_SET(WT_EVICT_LOOKASIDE);
@@ -399,15 +465,17 @@ __evict_review(
/*
* Success: assert the page is clean or reconciliation was configured
- * for an update/restore split, and if the page is clean, reconciliation
- * was configured for a lookaside table or all updates on the page are
- * globally visible.
+ * for an update/restore split. If the page is clean, assert that
+ * reconciliation was configured for a lookaside table, or it's not a
+ * durable object (currently the lookaside table), or all page updates
+ * were globally visible.
*/
WT_ASSERT(session,
LF_ISSET(WT_EVICT_UPDATE_RESTORE) || !__wt_page_is_modified(page));
WT_ASSERT(session,
- LF_SET(WT_EVICT_LOOKASIDE) ||
__wt_page_is_modified(page) ||
+ LF_ISSET(WT_EVICT_LOOKASIDE) ||
+ F_ISSET(S2BT(session), WT_BTREE_LOOKASIDE) ||
__wt_txn_visible_all(session, page->modify->rec_max_txn));
return (0);
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
index 8679b9510a8..74c58845c43 100644
--- a/src/third_party/wiredtiger/src/include/api.h
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -116,10 +116,17 @@
API_CALL_NOCONF(s, WT_CURSOR, n, cur, \
((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+#define CURSOR_REMOVE_API_CALL(cur, s, bt) \
+ (s) = (WT_SESSION_IMPL *)(cur)->session; \
+ TXN_API_CALL_NOCONF(s, WT_CURSOR, remove, cur, \
+ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle);
+
#define CURSOR_UPDATE_API_CALL(cur, s, n, bt) \
(s) = (WT_SESSION_IMPL *)(cur)->session; \
TXN_API_CALL_NOCONF(s, WT_CURSOR, n, cur, \
- ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); \
+ if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && __wt_cache_full(s)) \
+ WT_ERR(WT_CACHE_FULL);
#define CURSOR_UPDATE_API_END(s, ret) \
TXN_API_END(s, ret)
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h
index aa141e1df71..4bff6c82783 100644
--- a/src/third_party/wiredtiger/src/include/block.h
+++ b/src/third_party/wiredtiger/src/include/block.h
@@ -238,6 +238,9 @@ struct __wt_block {
*/
WT_SPINLOCK live_lock; /* Live checkpoint lock */
WT_BLOCK_CKPT live; /* Live checkpoint */
+#ifdef HAVE_DIAGNOSTIC
+ bool live_open; /* Live system is open */
+#endif
bool ckpt_inprogress;/* Live checkpoint in progress */
/* Compaction support */
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 41b2c98f9e8..02819237c13 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -261,7 +261,9 @@ struct __wt_page_modify {
} key;
/*
- * Eviction, but block wasn't written: unresolved updates and
+ * Eviction, but the block wasn't written: either an in-memory
+ * configuration or unresolved updates prevented the write.
+ * There may be a list of unresolved updates, there's always an
* associated disk image.
*
* Saved updates are either a WT_INSERT, or a row-store leaf
@@ -274,7 +276,7 @@ struct __wt_page_modify {
uint64_t onpage_txn;
} *supd;
uint32_t supd_entries;
- void *supd_dsk;
+ void *disk_image;
/*
* Block was written: address, size and checksum.
@@ -386,7 +388,6 @@ struct __wt_page_modify {
#define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */
#define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */
#define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */
-#define WT_PM_REC_REWRITE 4 /* Reconciliation: rewrite in place */
uint8_t rec_result; /* Reconciliation state */
};
@@ -433,6 +434,7 @@ struct __wt_page {
struct __wt_page_index {
uint32_t entries;
+ uint32_t deleted_entries;
WT_REF **index;
} * volatile __index; /* Collated children */
@@ -579,8 +581,17 @@ struct __wt_page {
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
#define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */
#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
+#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
+ uint8_t unused[2]; /* Unused padding */
+
+ /*
+ * Used to protect and co-ordinate splits for internal pages and
+ * reconciliation for all pages.
+ */
+ WT_FAIR_LOCK page_lock;
+
/*
* The page's read generation acts as an LRU value for each page in the
* tree; it is used by the eviction server thread to select pages to be
@@ -602,12 +613,6 @@ struct __wt_page {
#define WT_READGEN_STEP 100
uint64_t read_gen;
- /*
- * Used to protect and co-ordinate splits for internal pages and
- * reconciliation for all pages.
- */
- WT_FAIR_LOCK page_lock;
-
size_t memory_footprint; /* Memory attached to the page */
/* Page's on-disk representation: NULL for pages created in memory. */
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 14b5303cca9..23e212eb772 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -1105,7 +1105,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
* internal pages not be evicted until all threads are known to have
* exited the original page index array, because evicting an internal
* page discards its WT_REF array, and a thread traversing the original
- * page index array might see an freed WT_REF. During the split we set
+ * page index array might see a freed WT_REF. During the split we set
* a transaction value, once that's globally visible, we know we can
* evict the created page.
*/
@@ -1263,13 +1263,9 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
#endif
);
- /* An expected failure: WT_NOTFOUND when doing a cache-only read. */
- if (LF_ISSET(WT_READ_CACHE) && ret == WT_NOTFOUND)
- return (WT_NOTFOUND);
-
- /* An expected failure: WT_RESTART */
- if (ret == WT_RESTART)
- return (WT_RESTART);
+ /* Expected failures: page not found or restart. */
+ if (ret == WT_NOTFOUND || ret == WT_RESTART)
+ return (ret);
/* Discard the original held page. */
acquired = ret == 0;
diff --git a/src/third_party/wiredtiger/src/include/btree_cmp.i b/src/third_party/wiredtiger/src/include/btree_cmp.i
index 76f1ad4317a..8a7fe19a22f 100644
--- a/src/third_party/wiredtiger/src/include/btree_cmp.i
+++ b/src/third_party/wiredtiger/src/include/btree_cmp.i
@@ -188,3 +188,58 @@ __wt_compare_skip(WT_SESSION_IMPL *session, WT_COLLATOR *collator,
return (collator->compare(
collator, &session->iface, user_item, tree_item, cmpp));
}
+
+/*
+ * __wt_lex_compare_short --
+ * Lexicographic comparison routine for short keys.
+ *
+ * Returns:
+ * < 0 if user_item is lexicographically < tree_item
+ * = 0 if user_item is lexicographically = tree_item
+ * > 0 if user_item is lexicographically > tree_item
+ *
+ * We use the names "user" and "tree" so it's clear in the btree code which
+ * the application is looking at when we call its comparison function.
+ *
+ * The comparison is an unrolled switch on the shorter of the two item
+ * lengths: entry is at the case matching that length, each case compares
+ * one byte and then falls through to the next lower case. A mismatch
+ * breaks out of the switch and the differing bytes decide the result
+ * (-1 or 1). If every byte up to the shorter length matches, the item
+ * sizes decide the result (0 when equal, otherwise the shorter sorts
+ * first).
+ *
+ * NOTE(review): the switch only has cases for lengths 1 through 9 and no
+ * default. If the shorter length is 0 or greater than 9, no case matches
+ * and the final return compares the first byte of each item without any
+ * length check. Presumably callers only use this routine when that
+ * cannot happen (see WT_COMPARE_SHORT_MAXLEN) -- confirm against callers.
+ */
+static inline int
+__wt_lex_compare_short(const WT_ITEM *user_item, const WT_ITEM *tree_item)
+{
+ size_t len, usz, tsz;
+ const uint8_t *userp, *treep;
+
+ usz = user_item->size;
+ tsz = tree_item->size;
+ len = WT_MIN(usz, tsz);
+
+ userp = user_item->data;
+ treep = tree_item->data;
+
+ /*
+ * The maximum packed uint64_t is 9B, catch row-store objects using
+ * packed record numbers as keys.
+ *
+ * WT_COMPARE_SHORT(n) expands to one switch case with no terminating
+ * break on the matching path, so control deliberately falls through
+ * from case n to case n - 1 after advancing both pointers.
+ */
+#define WT_COMPARE_SHORT_MAXLEN 9
+#undef WT_COMPARE_SHORT
+#define WT_COMPARE_SHORT(n) \
+ case n: \
+ if (*userp != *treep) \
+ break; \
+ ++userp, ++treep
+ switch (len) {
+ WT_COMPARE_SHORT(9);
+ WT_COMPARE_SHORT(8);
+ WT_COMPARE_SHORT(7);
+ WT_COMPARE_SHORT(6);
+ WT_COMPARE_SHORT(5);
+ WT_COMPARE_SHORT(4);
+ WT_COMPARE_SHORT(3);
+ WT_COMPARE_SHORT(2);
+ case 1:
+ if (*userp != *treep)
+ break;
+
+ /* Contents are equal up to the smallest length. */
+ return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1);
+ }
+ return (*userp < *treep ? -1 : 1);
+}
diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i
index 475f4a86654..a95138c3f0f 100644
--- a/src/third_party/wiredtiger/src/include/cache.i
+++ b/src/third_party/wiredtiger/src/include/cache.i
@@ -193,6 +193,22 @@ __wt_eviction_needed(WT_SESSION_IMPL *session, u_int *pct_fullp)
}
/*
+ * __wt_cache_full --
+ * Return whether the cache is at (or over) capacity: true when the
+ * value reported by __wt_cache_bytes_inuse for the connection's cache is
+ * greater than or equal to the connection's cache_size, false
+ * otherwise. The check reads the two values once and takes no locks
+ * itself.
+ */
+static inline bool
+__wt_cache_full(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CACHE *cache;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ return (__wt_cache_bytes_inuse(cache) >= conn->cache_size);
+}
+
+/*
* __wt_cache_eviction_check --
* Evict pages if the cache crosses its boundaries.
*/
@@ -214,6 +230,10 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp)
WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA))
return (0);
+ /* In memory configurations don't block when the cache is full. */
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ return (0);
+
/*
* Threads operating on trees that cannot be evicted are ignored,
* mostly because they're not contributing to the problem.
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 2dfb24a83da..03b8174b7e1 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -178,6 +178,7 @@ struct __wt_connection_impl {
WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */
WT_SPINLOCK schema_lock; /* Schema operation spinlock */
WT_SPINLOCK table_lock; /* Table creation spinlock */
+ WT_SPINLOCK turtle_lock; /* Turtle file spinlock */
/*
* We distribute the btree page locks across a set of spin locks. Don't
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 1f63f07646e..3dd479acc0a 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -111,12 +111,12 @@ extern int __wt_debug_offset_blind( WT_SESSION_IMPL *session, wt_off_t offset, c
extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile);
extern int __wt_debug_disk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile);
extern int __wt_debug_tree_shape( WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
-extern int __wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
-extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_tree_all( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_tree( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile);
extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp);
extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
-extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
+extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all);
extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
@@ -155,6 +155,7 @@ extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session);
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp);
extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
@@ -446,7 +447,6 @@ extern int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key);
extern int __wt_metadata_search( WT_SESSION_IMPL *session, const char *key, char **valuep);
extern void __wt_meta_track_discard(WT_SESSION_IMPL *session);
extern int __wt_meta_track_on(WT_SESSION_IMPL *session);
-extern int __wt_meta_track_find_handle( WT_SESSION_IMPL *session, const char *name, const char *checkpoint);
extern int __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll);
extern int __wt_meta_track_sub_on(WT_SESSION_IMPL *session);
extern int __wt_meta_track_sub_off(WT_SESSION_IMPL *session);
@@ -566,9 +566,9 @@ extern int __wt_schema_colgroup_name(WT_SESSION_IMPL *session, WT_TABLE *table,
extern int __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table);
extern int __wt_schema_open_index(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp);
extern int __wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table);
-extern int __wt_schema_open_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, bool ok_incomplete, WT_TABLE **tablep);
extern int __wt_schema_get_colgroup(WT_SESSION_IMPL *session, const char *uri, bool quiet, WT_TABLE **tablep, WT_COLGROUP **colgroupp);
extern int __wt_schema_get_index(WT_SESSION_IMPL *session, const char *uri, bool quiet, WT_TABLE **tablep, WT_INDEX **indexp);
+extern int __wt_schema_open_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, bool ok_incomplete, WT_TABLE **tablep);
extern int __wt_schema_colcheck(WT_SESSION_IMPL *session, const char *key_format, const char *value_format, WT_CONFIG_ITEM *colconf, u_int *kcolsp, u_int *vcolsp);
extern int __wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table);
extern int __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, bool value_only, WT_ITEM *plan);
@@ -591,6 +591,7 @@ extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len
extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[], uint32_t open_flags);
extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers);
extern int __wt_session_copy_values(WT_SESSION_IMPL *session);
+extern int __wt_session_release_resources(WT_SESSION_IMPL *session);
extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config);
extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
index 24dccd30913..99b6f1c483f 100644
--- a/src/third_party/wiredtiger/src/include/flags.h
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -6,21 +6,23 @@
#define WT_CONN_CKPT_SYNC 0x00000002
#define WT_CONN_CLOSING 0x00000004
#define WT_CONN_EVICTION_RUN 0x00000008
-#define WT_CONN_LAS_OPEN 0x00000010
-#define WT_CONN_LEAK_MEMORY 0x00000020
-#define WT_CONN_LOG_SERVER_RUN 0x00000040
-#define WT_CONN_LSM_MERGE 0x00000080
-#define WT_CONN_PANIC 0x00000100
-#define WT_CONN_SERVER_ASYNC 0x00000200
-#define WT_CONN_SERVER_CHECKPOINT 0x00000400
-#define WT_CONN_SERVER_LSM 0x00000800
-#define WT_CONN_SERVER_RUN 0x00001000
-#define WT_CONN_SERVER_STATISTICS 0x00002000
-#define WT_CONN_SERVER_SWEEP 0x00004000
-#define WT_CONN_WAS_BACKUP 0x00008000
+#define WT_CONN_IN_MEMORY 0x00000010
+#define WT_CONN_LAS_OPEN 0x00000020
+#define WT_CONN_LEAK_MEMORY 0x00000040
+#define WT_CONN_LOG_SERVER_RUN 0x00000080
+#define WT_CONN_LSM_MERGE 0x00000100
+#define WT_CONN_PANIC 0x00000200
+#define WT_CONN_SERVER_ASYNC 0x00000400
+#define WT_CONN_SERVER_CHECKPOINT 0x00000800
+#define WT_CONN_SERVER_LSM 0x00001000
+#define WT_CONN_SERVER_RUN 0x00002000
+#define WT_CONN_SERVER_STATISTICS 0x00004000
+#define WT_CONN_SERVER_SWEEP 0x00008000
+#define WT_CONN_WAS_BACKUP 0x00010000
#define WT_EVICTING 0x00000001
-#define WT_EVICT_LOOKASIDE 0x00000002
-#define WT_EVICT_UPDATE_RESTORE 0x00000004
+#define WT_EVICT_IN_MEMORY 0x00000002
+#define WT_EVICT_LOOKASIDE 0x00000004
+#define WT_EVICT_UPDATE_RESTORE 0x00000008
#define WT_FILE_TYPE_CHECKPOINT 0x00000001
#define WT_FILE_TYPE_DATA 0x00000002
#define WT_FILE_TYPE_DIRECTORY 0x00000004
@@ -36,13 +38,14 @@
#define WT_LOG_FSYNC 0x00000008
#define WT_READ_CACHE 0x00000001
#define WT_READ_COMPACT 0x00000002
-#define WT_READ_NO_EVICT 0x00000004
-#define WT_READ_NO_GEN 0x00000008
-#define WT_READ_NO_WAIT 0x00000010
-#define WT_READ_PREV 0x00000020
-#define WT_READ_SKIP_INTL 0x00000040
-#define WT_READ_TRUNCATE 0x00000080
-#define WT_READ_WONT_NEED 0x00000100
+#define WT_READ_NO_EMPTY 0x00000004
+#define WT_READ_NO_EVICT 0x00000008
+#define WT_READ_NO_GEN 0x00000010
+#define WT_READ_NO_WAIT 0x00000020
+#define WT_READ_PREV 0x00000040
+#define WT_READ_SKIP_INTL 0x00000080
+#define WT_READ_TRUNCATE 0x00000100
+#define WT_READ_WONT_NEED 0x00000200
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
#define WT_SESSION_INTERNAL 0x00000004
@@ -51,15 +54,16 @@
#define WT_SESSION_LOCKED_SCHEMA 0x00000020
#define WT_SESSION_LOCKED_SLOT 0x00000040
#define WT_SESSION_LOCKED_TABLE 0x00000080
-#define WT_SESSION_LOGGING_INMEM 0x00000100
-#define WT_SESSION_LOOKASIDE_CURSOR 0x00000200
-#define WT_SESSION_NO_CACHE 0x00000400
-#define WT_SESSION_NO_DATA_HANDLES 0x00000800
-#define WT_SESSION_NO_EVICTION 0x00001000
-#define WT_SESSION_NO_LOGGING 0x00002000
-#define WT_SESSION_NO_SCHEMA_LOCK 0x00004000
-#define WT_SESSION_QUIET_CORRUPT_FILE 0x00008000
-#define WT_SESSION_SERVER_ASYNC 0x00010000
+#define WT_SESSION_LOCKED_TURTLE 0x00000100
+#define WT_SESSION_LOGGING_INMEM 0x00000200
+#define WT_SESSION_LOOKASIDE_CURSOR 0x00000400
+#define WT_SESSION_NO_CACHE 0x00000800
+#define WT_SESSION_NO_DATA_HANDLES 0x00001000
+#define WT_SESSION_NO_EVICTION 0x00002000
+#define WT_SESSION_NO_LOGGING 0x00004000
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00008000
+#define WT_SESSION_QUIET_CORRUPT_FILE 0x00010000
+#define WT_SESSION_SERVER_ASYNC 0x00020000
#define WT_SYNC_CHECKPOINT 0x00000001
#define WT_SYNC_CLOSE 0x00000002
#define WT_SYNC_DISCARD 0x00000004
@@ -93,7 +97,7 @@
#define WT_VERB_VERIFY 0x00200000
#define WT_VERB_VERSION 0x00400000
#define WT_VERB_WRITE 0x00800000
-#define WT_VISIBILITY_ERR 0x00000008
+#define WT_VISIBILITY_ERR 0x00000010
/*
* flags section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
index 06be95697c7..521de567fc0 100644
--- a/src/third_party/wiredtiger/src/include/log.h
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -152,8 +152,9 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_logslot {
WT_ITEM slot_buf; /* Buffer for grouped writes */
#define WT_SLOT_CLOSEFH 0x01 /* Close old fh on release */
-#define WT_SLOT_SYNC 0x02 /* Needs sync on release */
-#define WT_SLOT_SYNC_DIR 0x04 /* Directory sync on release */
+#define WT_SLOT_FLUSH 0x02 /* Wait for write */
+#define WT_SLOT_SYNC 0x04 /* Needs sync on release */
+#define WT_SLOT_SYNC_DIR 0x08 /* Directory sync on release */
uint32_t flags; /* Flags */
};
diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h
index 11cf8204aec..d15dab3aa45 100644
--- a/src/third_party/wiredtiger/src/include/lsm.h
+++ b/src/third_party/wiredtiger/src/include/lsm.h
@@ -96,6 +96,7 @@ struct __wt_lsm_chunk {
int8_t empty; /* 1/0: checkpoint missing */
int8_t evicted; /* 1/0: in-memory chunk was evicted */
+ uint8_t flushing; /* 1/0: chunk flush in progress */
#define WT_LSM_CHUNK_BLOOM 0x01
#define WT_LSM_CHUNK_MERGING 0x02
diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h
index a5a303f1630..938101e9caa 100644
--- a/src/third_party/wiredtiger/src/include/meta.h
+++ b/src/third_party/wiredtiger/src/include/meta.h
@@ -40,6 +40,16 @@
#define WT_METADATA_VERSION_STR "WiredTiger version string"
/*
+ * WT_WITH_TURTLE_LOCK --
+ * Acquire the turtle file lock, perform an operation, drop the lock.
+ *
+ * Asserts that the session does not already have WT_SESSION_LOCKED_TURTLE
+ * set, then hands the connection's turtle_lock, that flag and the operation
+ * to WT_WITH_LOCK.
+ */
+#define WT_WITH_TURTLE_LOCK(session, op) do { \
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_TURTLE));\
+ WT_WITH_LOCK(session, \
+ &S2C(session)->turtle_lock, WT_SESSION_LOCKED_TURTLE, op); \
+} while (0)
+
+/*
* WT_CKPT --
* Encapsulation of checkpoint information, shared by the metadata, the
* btree engine, and the block manager.
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index ff2f6645e5a..eca77214b47 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -120,11 +120,11 @@
* hex constant might be a negative integer), and to ensure the hex constant is
* the correct size before applying the bitwise not operator.
*/
-#define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask)))
+#define FLD_CLR(field, mask) ((void)((field) &= ~(uint32_t)(mask)))
#define FLD_MASK(field, mask) ((field) & (uint32_t)(mask))
#define FLD_ISSET(field, mask) (FLD_MASK(field, mask) != 0)
#define FLD64_ISSET(field, mask) (((field) & (uint64_t)(mask)) != 0)
-#define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask)))
+#define FLD_SET(field, mask) ((void)((field) |= (uint32_t)(mask)))
#define F_CLR(p, mask) FLD_CLR((p)->flags, mask)
#define F_ISSET(p, mask) FLD_ISSET((p)->flags, mask)
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 7a07d16045d..f5a2c1c7dda 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -28,6 +28,32 @@
#define WT_SESSION_IS_CHECKPOINT(s) \
((s)->id != 0 && (s)->id == S2C(s)->txn_global.checkpoint_id)
+/*
+ * Perform an operation at the specified isolation level.
+ *
+ * This is fiddly: we can't cope with operations that begin transactions
+ * (leaving an ID allocated), and operations must not move our published
+ * snap_min forwards (or updates we need could be freed while this operation is
+ * in progress). Check for those cases: the bugs they cause are hard to debug.
+ *
+ * The macro saves the session and transaction isolation levels and a copy of
+ * the session's transaction state, increments txn.forced_iso, sets both
+ * isolation fields to iso, runs op, then restores the isolation levels,
+ * decrements txn.forced_iso and restores the saved snap_min.
+ *
+ * Note for callers: s is evaluated more than once, and the macro declares
+ * locals named saved_iso, saved_txn_iso, txn_state and saved_state in its own
+ * block, so op must not refer to caller variables with those names.
+ */
+#define WT_WITH_TXN_ISOLATION(s, iso, op) do { \
+ WT_TXN_ISOLATION saved_iso = (s)->isolation; \
+ WT_TXN_ISOLATION saved_txn_iso = (s)->txn.isolation; \
+ WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(s); \
+ WT_TXN_STATE saved_state = *txn_state; \
+ (s)->txn.forced_iso++; \
+ (s)->isolation = (s)->txn.isolation = (iso); \
+ op; \
+ (s)->isolation = saved_iso; \
+ (s)->txn.isolation = saved_txn_iso; \
+ WT_ASSERT((s), (s)->txn.forced_iso > 0); \
+ (s)->txn.forced_iso--; \
+ WT_ASSERT((s), txn_state->id == saved_state.id && \
+ (txn_state->snap_min == saved_state.snap_min || \
+ saved_state.snap_min == WT_TXN_NONE)); \
+ txn_state->snap_min = saved_state.snap_min; \
+} while (0)
+
struct __wt_named_snapshot {
const char *name;
@@ -129,6 +155,8 @@ struct __wt_txn {
WT_TXN_ISOLATION isolation;
+ uint32_t forced_iso; /* Isolation is currently forced. */
+
/*
* Snapshot data:
* ids < snap_min are visible,
@@ -153,13 +181,13 @@ struct __wt_txn {
/* Checkpoint status. */
WT_LSN ckpt_lsn;
- bool full_ckpt;
uint32_t ckpt_nsnapshot;
WT_ITEM *ckpt_snapshot;
+ bool full_ckpt;
#define WT_TXN_AUTOCOMMIT 0x01
#define WT_TXN_ERROR 0x02
-#define WT_TXN_HAS_ID 0x04
+#define WT_TXN_HAS_ID 0x04
#define WT_TXN_HAS_SNAPSHOT 0x08
#define WT_TXN_NAMED_SNAPSHOT 0x10
#define WT_TXN_READONLY 0x20
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 54c30adae76..e49e3d1257b 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -187,18 +187,17 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
/*
* Read-uncommitted transactions see all other changes.
- *
- * All metadata reads are at read-uncommitted isolation. That's
- * because once a schema-level operation completes, subsequent
- * operations must see the current version of checkpoint metadata, or
- * they may try to read blocks that may have been freed from a file.
- * Metadata updates use non-transactional techniques (such as the
- * schema and metadata locks) to protect access to in-flight updates.
*/
- if (txn->isolation == WT_ISO_READ_UNCOMMITTED ||
- session->dhandle == session->meta_dhandle)
+ if (txn->isolation == WT_ISO_READ_UNCOMMITTED)
return (true);
+ /*
+ * If we don't have a transactional snapshot, only make stable updates
+ * visible.
+ */
+ if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
+ return (__wt_txn_visible_all(session, id));
+
/* Transactions see their own changes. */
if (id == txn->id)
return (true);
@@ -429,9 +428,15 @@ __wt_txn_read_last(WT_SESSION_IMPL *session)
txn = &session->txn;
- /* Release the snap_min ID we put in the global table. */
- if (!F_ISSET(txn, WT_TXN_RUNNING) ||
- txn->isolation != WT_ISO_SNAPSHOT)
+ /*
+ * Release the snap_min ID we put in the global table.
+ *
+ * If the isolation has been temporarily forced, don't touch the
+ * snapshot here: it will be restored by WT_WITH_TXN_ISOLATION.
+ */
+ if ((!F_ISSET(txn, WT_TXN_RUNNING) ||
+ txn->isolation != WT_ISO_SNAPSHOT) &&
+ txn->forced_iso == 0)
__wt_txn_release_snapshot(session);
}
@@ -451,28 +456,26 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
txn_state = WT_SESSION_TXN_STATE(session);
/*
- * If there is no transaction running (so we don't have an ID), and no
- * snapshot allocated, put an ID in the global table to prevent any
- * update that we are reading from being trimmed to save memory. Do a
- * read before the write because this shared data is accessed a lot.
+ * We are about to read data, which means we need to protect against
+ * updates being freed from underneath this cursor. Read-uncommitted
+ * isolation protects values by putting a transaction ID in the global
+ * table to prevent any update that we are reading from being freed.
+ * Other isolation levels get a snapshot to protect their reads.
*
* !!!
- * Note: We are updating the global table unprotected, so the
- * oldest_id may move past this ID if a scan races with this
- * value being published. That said, read-uncommitted operations
- * always take the most recent version of a value, so for that version
- * to be freed, two newer versions would have to be committed. Putting
- * this snap_min ID in the table prevents the oldest ID from moving
+ * Note: We are updating the global table unprotected, so the global
+ * oldest_id may move past our snap_min if a scan races with this value
+ * being published. That said, read-uncommitted operations always see
+ * the most recent update for each record that has not been aborted
+ * regardless of the snap_min value published here. Even if there is a
+ * race while publishing this ID, it prevents the oldest ID from moving
* further forward, so that once a read-uncommitted cursor is
* positioned on a value, it can't be freed.
*/
- if (txn->isolation == WT_ISO_READ_UNCOMMITTED &&
- !F_ISSET(txn, WT_TXN_HAS_ID) &&
- WT_TXNID_LT(txn_state->snap_min, txn_global->last_running))
- txn_state->snap_min = txn_global->last_running;
-
- if (txn->isolation != WT_ISO_READ_UNCOMMITTED &&
- !F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
+ if (txn->isolation == WT_ISO_READ_UNCOMMITTED) {
+ if (txn_state->snap_min == WT_TXN_NONE)
+ txn_state->snap_min = txn_global->last_running;
+ } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
__wt_txn_get_snapshot(session);
}
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index b7ebb8fbc14..037399625ea 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -2910,6 +2910,15 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp);
* if recovery is required to use the database.
*/
#define WT_RUN_RECOVERY -31806
+/*! @cond internal */
+/*!
+ * Operation would overflow cache.
+ * This error is generated when wiredtiger_open is configured to run in-memory,
+ * and an insert or update operation requires more than the configured cache
+ * size to complete.
+ */
+#define WT_CACHE_FULL -31807
+/*! @endcond */
/*
* Error return section: END
* DO NOT EDIT: automatically built by dist/api_err.py.
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index efe4d22eeca..44dc7dc30a7 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -551,7 +551,6 @@ __log_fill(WT_SESSION_IMPL *session,
else
/*
* If this is a force or unbuffered write, write it now.
- * A forced write sends in a temporary, local slot.
*/
WT_ERR(__wt_write(session, myslot->slot->slot_fh,
myslot->offset + myslot->slot->slot_start_offset,
@@ -1173,87 +1172,60 @@ __wt_log_close(WT_SESSION_IMPL *session)
}
/*
- * __log_filesize --
- * Returns an estimate of the real end of log file.
+ * __log_has_hole --
+ * Determine if the current offset represents a hole in the log
+ * file (i.e. there is valid data somewhere after the hole), or
+ * if this is the end of this log file and the remainder of the
+ * file is zeroes.
*/
static int
-__log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *eof)
+__log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, bool *hole)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_LOG *log;
- wt_off_t log_size, off, off1;
- uint32_t allocsize, bufsz;
+ wt_off_t log_size, off, remainder;
+ size_t bufsz, rdlen;
char *buf, *zerobuf;
conn = S2C(session);
log = conn->log;
- if (eof == NULL)
- return (0);
- *eof = 0;
- WT_RET(__wt_filesize(session, fh, &log_size));
- if (log == NULL)
- allocsize = WT_LOG_ALIGN;
- else
- allocsize = log->allocsize;
+ log_size = fh->size;
+ remainder = log_size - offset;
+ *hole = false;
/*
* It can be very slow looking for the last real record in the log
- * in very small chunks. Walk backward by a megabyte at a time. When
- * we find a part of the log that is not just zeroes, walk to find
- * the last record.
+ * in very small chunks. Walk a megabyte at a time. If we find a
+ * part of the log that is not just zeroes we know this log file
+ * has a hole in it.
*/
buf = zerobuf = NULL;
- if (allocsize < WT_MEGABYTE && log_size > WT_MEGABYTE)
+ if (log == NULL || log->allocsize < WT_MEGABYTE)
bufsz = WT_MEGABYTE;
else
- bufsz = allocsize;
+ bufsz = log->allocsize;
+
+ if ((size_t)remainder < bufsz)
+ bufsz = (size_t)remainder;
WT_RET(__wt_calloc_def(session, bufsz, &buf));
WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf));
/*
- * Read in a chunk starting at the end of the file. Keep going until
- * we reach the beginning or we find a chunk that contains any non-zero
- * bytes. Compare against a known zero byte chunk.
+ * Read in a chunk starting at the given offset.
+ * Compare against a known zero byte chunk.
*/
- for (off = log_size - (wt_off_t)bufsz;
- off >= 0;
- off -= (wt_off_t)bufsz) {
- WT_ERR(__wt_read(session, fh, off, bufsz, buf));
- if (memcmp(buf, zerobuf, bufsz) != 0)
+ for (off = offset; remainder > 0;
+ remainder -= (wt_off_t)rdlen, off += (wt_off_t)rdlen) {
+ rdlen = WT_MIN(bufsz, (size_t)remainder);
+ WT_ERR(__wt_read(session, fh, off, rdlen, buf));
+ if (memcmp(buf, zerobuf, rdlen) != 0) {
+ *hole = true;
break;
+ }
}
- /*
- * If we're walking by large amounts, now walk by the real allocsize
- * to find the real end, if we found something. Otherwise we reached
- * the beginning of the file. Offset can go negative if the log file
- * size is not a multiple of a megabyte. The first chunk of the log
- * file will always be non-zero.
- */
- if (off < 0)
- off = 0;
-
- /*
- * We know all log records are aligned at log->allocsize. The first
- * item in a log record is always a 32-bit length. Look for any
- * non-zero length at the allocsize boundary. This may not be a true
- * log record since it could be the middle of a large record. But we
- * know no log record starts after it. Return an estimate of the log
- * file size.
- */
- for (off1 = bufsz - allocsize;
- off1 > 0; off1 -= (wt_off_t)allocsize)
- if (memcmp(buf + off1, zerobuf, sizeof(uint32_t)) != 0)
- break;
- off = off + off1;
-
- /*
- * Set EOF to the last zero-filled record we saw.
- */
- *eof = off + (wt_off_t)allocsize;
-err:
- if (buf != NULL)
+err: if (buf != NULL)
__wt_free(session, buf);
if (zerobuf != NULL)
__wt_free(session, zerobuf);
@@ -1310,7 +1282,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
* responsible for freeing the slot in that case. Otherwise the
* worker thread will free it.
*/
- if (!F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
+ if (!F_ISSET(slot, WT_SLOT_FLUSH | WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
if (freep != NULL)
*freep = 0;
slot->slot_state = WT_LOG_SLOT_WRITTEN;
@@ -1340,6 +1312,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
*/
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_unlock(session, &log->log_slot_lock);
+ WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond));
if (++yield_count < 1000)
__wt_yield();
else
@@ -1354,6 +1327,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
WT_ASSERT(session, slot != log->active_slot);
WT_ERR(__wt_cond_signal(session, log->log_write_cond));
+ F_CLR(slot, WT_SLOT_FLUSH);
/*
* Signal the close thread if needed.
@@ -1543,7 +1517,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
}
WT_ERR(__log_openfile(
session, false, &log_fh, WT_LOG_FILENAME, start_lsn.file));
- WT_ERR(__log_filesize(session, log_fh, &log_size));
+ WT_ERR(__wt_filesize(session, log_fh, &log_size));
rd_lsn = start_lsn;
WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf));
@@ -1574,7 +1548,7 @@ advance:
break;
WT_ERR(__log_openfile(session,
false, &log_fh, WT_LOG_FILENAME, rd_lsn.file));
- WT_ERR(__log_filesize(session, log_fh, &log_size));
+ WT_ERR(__wt_filesize(session, log_fh, &log_size));
eol = false;
continue;
}
@@ -1592,16 +1566,25 @@ advance:
*/
reclen = *(uint32_t *)buf->mem;
/*
- * Log files are pre-allocated. We never expect a zero length
- * unless we've reached the end of the log. The log can be
- * written out of order, so when recovery finds the end of
- * the log, truncate the file and remove any later log files
- * that may exist.
+ * Log files are pre-allocated. We need to distinguish between
+ * a hole in the file (in which case this location is treated
+ * as the end of the log) and having read past the last record
+ * into the zeroed remainder of the file.
+ * If we find a zeroed record, scan forward in the log looking
+ * for any data. If we find any, we have a hole and stop.
+ * Otherwise, if the rest is all zeroes, advance to the next file.
+ * When recovery finds the end of the log, truncate the file
+ * and remove any later log files that may exist.
*/
if (reclen == 0) {
- /* This LSN is the end. */
- eol = true;
- break;
+ WT_ERR(__log_has_hole(
+ session, log_fh, rd_lsn.offset, &eol));
+ if (eol)
+ /* Found a hole. This LSN is the end. */
+ break;
+ else
+ /* Last record in log. Look for more. */
+ goto advance;
}
rdup_len = __wt_rduppo2(reclen, allocsize);
if (reclen > allocsize) {
diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c
index 7c541eb7bec..b3790412536 100644
--- a/src/third_party/wiredtiger/src/log/log_slot.c
+++ b/src/third_party/wiredtiger/src/log/log_slot.c
@@ -429,6 +429,8 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
F_SET(slot, WT_SLOT_SYNC_DIR);
+ if (LF_ISSET(WT_LOG_FLUSH))
+ F_SET(slot, WT_SLOT_FLUSH);
if (LF_ISSET(WT_LOG_FSYNC))
F_SET(slot, WT_SLOT_SYNC);
if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) {
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index cca417a31fc..f988bfc97fd 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -1434,7 +1434,7 @@ __clsm_remove(WT_CURSOR *cursor)
clsm = (WT_CURSOR_LSM *)cursor;
- CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+ CURSOR_REMOVE_API_CALL(cursor, session, NULL);
WT_CURSOR_NEEDKEY(cursor);
WT_CURSOR_NOVALUE(cursor);
WT_ERR(__clsm_enter(clsm, false, true));
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
index 7056c907f8e..4741cf52608 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -261,6 +261,9 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
{
WT_DECL_RET;
WT_TXN_ISOLATION saved_isolation;
+ bool flush_set;
+
+ flush_set = false;
/*
* If the chunk is already checkpointed, make sure it is also evicted.
@@ -269,8 +272,9 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
!F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
!chunk->evicted) {
- if ((ret = __lsm_discard_handle(
- session, chunk->uri, NULL)) == 0)
+ WT_WITH_HANDLE_LIST_LOCK(session,
+ ret = __lsm_discard_handle(session, chunk->uri, NULL));
+ if (ret == 0)
chunk->evicted = 1;
else if (ret == EBUSY)
ret = 0;
@@ -294,7 +298,11 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
return (0);
}
- WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
+ if (!__wt_atomic_cas8(&chunk->flushing, 0, 1))
+ return (0);
+ flush_set = true;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
chunk->uri));
/*
@@ -318,27 +326,31 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
session->txn.isolation = saved_isolation;
WT_TRET(__wt_session_release_btree(session));
}
- WT_RET(ret);
+ WT_ERR(ret);
- WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
chunk->uri));
- WT_WITH_SCHEMA_LOCK(session,
- ret = __wt_schema_worker(session, chunk->uri,
- __wt_checkpoint, NULL, NULL, 0));
-
+ /*
+ * Turn on metadata tracking to ensure the checkpoint gets the
+ * necessary handle locks.
+ */
+ WT_ERR(__wt_meta_track_on(session));
+ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(
+ session, chunk->uri, __wt_checkpoint, NULL, NULL, 0));
+ WT_TRET(__wt_meta_track_off(session, false, ret != 0));
if (ret != 0)
- WT_RET_MSG(session, ret, "LSM checkpoint");
+ WT_ERR_MSG(session, ret, "LSM checkpoint");
/* Now the file is written, get the chunk size. */
- WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk));
+ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
/* Update the flush timestamp to help track ongoing progress. */
- WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts));
+ WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));
++lsm_tree->chunks_flushed;
/* Lock the tree, mark the chunk as on disk and update the metadata. */
- WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
F_SET(chunk, WT_LSM_CHUNK_ONDISK);
ret = __wt_lsm_meta_write(session, lsm_tree);
++lsm_tree->dsk_gen;
@@ -346,9 +358,11 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
/* Update the throttle time. */
__wt_lsm_tree_throttle(session, lsm_tree, true);
WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
-
if (ret != 0)
- WT_RET_MSG(session, ret, "LSM metadata write");
+ WT_ERR_MSG(session, ret, "LSM metadata write");
+
+ WT_PUBLISH(chunk->flushing, 0);
+ flush_set = false;
/*
* Clear the no-eviction flag so the primary can be evicted and
@@ -356,24 +370,28 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
* otherwise, accessing the leaf page during the checkpoint can trigger
* forced eviction.
*/
- WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
+ WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
__wt_btree_evictable(session, true);
- WT_RET(__wt_session_release_btree(session));
+ WT_ERR(__wt_session_release_btree(session));
/* Make sure we aren't pinning a transaction ID. */
__wt_txn_release_snapshot(session);
- WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
chunk->uri));
/* Schedule a bloom filter create for our newly flushed chunk. */
if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
- WT_RET(__wt_lsm_manager_push_entry(
+ WT_ERR(__wt_lsm_manager_push_entry(
session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
else
- WT_RET(__wt_lsm_manager_push_entry(
+ WT_ERR(__wt_lsm_manager_push_entry(
session, WT_LSM_WORK_MERGE, 0, lsm_tree));
- return (0);
+
+err: if (flush_set)
+ WT_PUBLISH(chunk->flushing, 0);
+
+ return (ret);
}
/*
@@ -487,7 +505,9 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
*
* This will fail with EBUSY if the file is still in use.
*/
- WT_RET(__lsm_discard_handle(session, uri, WT_CHECKPOINT));
+ WT_WITH_HANDLE_LIST_LOCK(session,
+ ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT));
+ WT_RET(ret);
/*
* Take the schema lock for the drop operation. Since __wt_schema_drop
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
index 0bab52b9d9c..e7074a9c1b5 100644
--- a/src/third_party/wiredtiger/src/meta/meta_table.c
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -151,8 +151,11 @@ __wt_metadata_update(
key, value, WT_META_TRACKING(session) ? "true" : "false",
__metadata_turtle(key) ? "" : "not "));
- if (__metadata_turtle(key))
- return (__wt_turtle_update(session, key, value));
+ if (__metadata_turtle(key)) {
+ WT_WITH_TURTLE_LOCK(session,
+ ret = __wt_turtle_update(session, key, value));
+ return (ret);
+ }
if (WT_META_TRACKING(session))
WT_RET(__wt_meta_track_update(session, key));
@@ -219,9 +222,20 @@ __wt_metadata_search(
if (__metadata_turtle(key))
return (__wt_turtle_read(session, key, valuep));
+ /*
+ * All metadata reads are at read-uncommitted isolation. That's
+ * because once a schema-level operation completes, subsequent
+ * operations must see the current version of checkpoint metadata, or
+ * they may try to read blocks that may have been freed from a file.
+ * Metadata updates use non-transactional techniques (such as the
+ * schema and metadata locks) to protect access to in-flight updates.
+ */
WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
cursor->set_key(cursor, key);
- WT_ERR(cursor->search(cursor));
+ WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED,
+ ret = cursor->search(cursor));
+ WT_ERR(ret);
+
WT_ERR(cursor->get_value(cursor, &value));
WT_ERR(__wt_strdup(session, value, valuep));
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
index b223c2fb8fc..bc96a35efc7 100644
--- a/src/third_party/wiredtiger/src/meta/meta_track.c
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -223,35 +223,6 @@ __meta_track_unroll(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
}
/*
- * __wt_meta_track_find_handle --
- * Check if we have already seen a handle.
- */
-int
-__wt_meta_track_find_handle(
- WT_SESSION_IMPL *session, const char *name, const char *checkpoint)
-{
- WT_META_TRACK *trk, *trk_orig;
-
- WT_ASSERT(session,
- WT_META_TRACKING(session) && session->meta_track_nest > 0);
-
- trk_orig = session->meta_track;
- trk = session->meta_track_next;
-
- while (--trk >= trk_orig) {
- if (trk->op != WT_ST_LOCK)
- continue;
- if (strcmp(trk->dhandle->name, name) == 0 &&
- ((trk->dhandle->checkpoint == NULL && checkpoint == NULL) ||
- (trk->dhandle->checkpoint != NULL &&
- strcmp(trk->dhandle->checkpoint, checkpoint) == 0)))
- return (0);
- }
-
- return (WT_NOTFOUND);
-}
-
-/*
* __wt_meta_track_off --
* Turn off metadata operation tracking, unrolling on error.
*/
@@ -293,7 +264,8 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
* If we don't have the metadata handle (e.g, we're in the process of
* creating the metadata), we can't sync it.
*/
- if (!need_sync || session->meta_dhandle == NULL)
+ if (!need_sync || session->meta_dhandle == NULL ||
+ F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
goto done;
/* If we're logging, make sure the metadata update was flushed. */
@@ -304,7 +276,8 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
WT_RET(ret);
} else {
WT_WITH_DHANDLE(session, session->meta_dhandle,
- ret = __wt_checkpoint(session, NULL));
+ WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_COMMITTED,
+ ret = __wt_checkpoint(session, NULL)));
WT_RET(ret);
WT_WITH_DHANDLE(session, session->meta_dhandle,
ret = __wt_checkpoint_sync(session, NULL));
diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c
index 1aa9c953689..13e8b31916f 100644
--- a/src/third_party/wiredtiger/src/meta/meta_turtle.c
+++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c
@@ -202,7 +202,9 @@ __wt_turtle_init(WT_SESSION_IMPL *session)
/* Create the turtle file. */
WT_RET(__metadata_config(session, &metaconf));
- WT_ERR(__wt_turtle_update(session, WT_METAFILE_URI, metaconf));
+ WT_WITH_TURTLE_LOCK(session, ret = __wt_turtle_update(
+ session, WT_METAFILE_URI, metaconf));
+ WT_ERR(ret);
}
/* Remove the backup files, we'll never read them again. */
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
index 20a9e8236ac..6280e334afb 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
@@ -49,8 +49,7 @@ __wt_std_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len)
#if defined(HAVE_FALLOCATE)
WT_DECL_RET;
- WT_SYSCALL_RETRY(
- fallocate(fh->fd, FALLOC_FL_KEEP_SIZE, offset, len), ret);
+ WT_SYSCALL_RETRY(fallocate(fh->fd, 0, offset, len), ret);
return (ret);
#else
WT_UNUSED(fh);
@@ -76,8 +75,7 @@ __wt_sys_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len)
* Linux versions (RHEL 5.5), but not in the version of the C library.
* This allows it to work everywhere the kernel supports it.
*/
- WT_SYSCALL_RETRY(syscall(
- SYS_fallocate, fh->fd, FALLOC_FL_KEEP_SIZE, offset, len), ret);
+ WT_SYSCALL_RETRY(syscall(SYS_fallocate, fh->fd, 0, offset, len), ret);
return (ret);
#else
WT_UNUSED(fh);
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 40917bebf56..965f798e820 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -25,7 +25,7 @@ typedef struct {
WT_PAGE *page;
uint32_t flags; /* Caller's configuration */
- WT_ITEM dsk; /* Temporary disk-image buffer */
+ WT_ITEM disk_image; /* Temporary disk-image buffer */
/*
* Track start/stop write generation to decide if all changes to the
@@ -40,9 +40,7 @@ typedef struct {
uint64_t orig_btree_checkpoint_gen;
uint64_t orig_txn_checkpoint_gen;
- /*
- * Track maximum transaction ID seen and first unwritten transaction ID.
- */
+ /* Track the page's maximum transaction ID. */
uint64_t max_txn;
/*
@@ -161,7 +159,7 @@ typedef struct {
WT_ADDR addr; /* Split's written location */
uint32_t size; /* Split's size */
uint32_t cksum; /* Split's checksum */
- void *dsk; /* Split's disk image */
+ void *disk_image; /* Split's disk image */
/*
* Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and
@@ -450,12 +448,16 @@ __wt_reconcile(WT_SESSION_IMPL *session,
}
/*
- * Clean up the boundary structures: some workloads result in millions
- * of these structures, and if associated with some random session that
- * got roped into doing forced eviction, they won't be discarded for the
- * life of the session.
+ * Clean up reconciliation resources: some workloads have millions of
+ * boundary structures, and if associated with an application session
+ * pulled into doing forced eviction, they won't be discarded for the
+ * life of the session (or until session.reset is called). Discard all
+ * of the reconciliation resources if this is an application thread that
+ * is not doing a checkpoint.
*/
- __rec_bnd_cleanup(session, r, false);
+ __rec_bnd_cleanup(session, r,
+ F_ISSET(session, WT_SESSION_INTERNAL) ||
+ WT_SESSION_IS_CHECKPOINT(session) ? false : true);
WT_RET(ret);
@@ -619,7 +621,6 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
switch (mod->rec_result) {
case WT_PM_REC_EMPTY: /* Page is empty */
case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
- case WT_PM_REC_REWRITE: /* Rewrite */
return (0);
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
break;
@@ -647,6 +648,12 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
WT_INTL_INDEX_GET(session, next, pindex);
for (i = 0; i < mod->mod_multi_entries; ++i) {
+ /*
+ * There's special error handling required when re-instantiating
+ * pages in memory; it's not needed here, asserted for safety.
+ */
+ WT_ASSERT(session, mod->mod_multi[i].supd == NULL);
+
WT_ERR(__wt_multi_to_ref(session,
next, &mod->mod_multi[i], &pindex->index[i], NULL));
pindex->index[i]->home = next;
@@ -751,7 +758,7 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->last = &r->_last;
/* Disk buffers need to be aligned for writing. */
- F_SET(&r->dsk, WT_ITEM_ALIGNED);
+ F_SET(&r->disk_image, WT_ITEM_ALIGNED);
}
/* Reconciliation is not re-entrant, make sure that doesn't happen. */
@@ -809,6 +816,9 @@ __rec_write_init(WT_SESSION_IMPL *session,
}
r->flags = flags;
+ /* Track the page's maximum transaction ID. */
+ r->max_txn = WT_TXN_NONE;
+
/* Track if the page can be marked clean. */
r->leave_dirty = false;
@@ -890,7 +900,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
return;
*(WT_RECONCILE **)reconcilep = NULL;
- __wt_buf_free(session, &r->dsk);
+ __wt_buf_free(session, &r->disk_image);
__wt_free(session, r->raw_entries);
__wt_free(session, r->raw_offsets);
@@ -945,14 +955,15 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy)
*
* During some big-page evictions we have seen boundary arrays that have
* millions of elements. That should not be a normal event, but if the
- * memory is associated with a random session, it won't be discarded
- * until the session is closed. If there are more than 10,000 boundary
- * structure elements, destroy the boundary array and we'll start over.
+ * memory is associated with a random application session, it won't be
+ * discarded until the session is closed or reset. If there are more
+ * than 10,000 boundary structure elements, discard the boundary array
+ * entirely and start over next time.
*/
if (destroy || r->bnd_entries > 10 * 1000) {
for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
- __wt_free(session, bnd->dsk);
+ __wt_free(session, bnd->disk_image);
__wt_free(session, bnd->supd);
__wt_buf_free(session, &bnd->key);
}
@@ -973,7 +984,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy)
++last_used;
for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
- __wt_free(session, bnd->dsk);
+ __wt_free(session, bnd->disk_image);
__wt_free(session, bnd->supd);
}
}
@@ -1436,7 +1447,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
* If there are deleted child pages we can't discard immediately, keep
* the page dirty so they are eventually freed.
*/
- r->leave_dirty = 1;
+ r->leave_dirty = true;
/*
* If the original page cannot be freed, we need to keep a slot on the
@@ -1631,8 +1642,8 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size)
* for overflow in diagnostic mode.
*/
WT_ASSERT(session, r->space_avail >= size);
- WT_ASSERT(session,
- WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->dsk.memsize));
+ WT_ASSERT(session, WT_BLOCK_FITS(
+ r->first_free, size, r->disk_image.mem, r->disk_image.memsize));
r->entries += v;
r->space_avail -= size;
@@ -1854,7 +1865,7 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
WT_CLEAR(bnd->addr);
bnd->size = 0;
bnd->cksum = 0;
- __wt_free(session, bnd->dsk);
+ __wt_free(session, bnd->disk_image);
__wt_free(session, bnd->supd);
bnd->supd_next = 0;
@@ -1967,14 +1978,14 @@ __rec_split_init(WT_SESSION_IMPL *session,
*/
corrected_page_size = r->page_size;
WT_RET(bm->write_size(bm, session, &corrected_page_size));
- WT_RET(__wt_buf_init(session, &r->dsk, corrected_page_size));
+ WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size));
/*
* Clear the disk page's header and block-manager space, set the page
* type (the type doesn't change, and setting it later would require
* additional code in a few different places).
*/
- dsk = r->dsk.mem;
+ dsk = r->disk_image.mem;
memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree));
dsk->type = page->type;
@@ -2253,11 +2264,11 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
btree = S2BT(session);
bm = btree->bm;
- len = WT_PTRDIFF(r->first_free, r->dsk.mem);
+ len = WT_PTRDIFF(r->first_free, r->disk_image.mem);
corrected_page_size = len + add_len;
WT_RET(bm->write_size(bm, session, &corrected_page_size));
- WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size));
- r->first_free = (uint8_t *)r->dsk.mem + len;
+ WT_RET(__wt_buf_grow(session, &r->disk_image, corrected_page_size));
+ r->first_free = (uint8_t *)r->disk_image.mem + len;
WT_ASSERT(session, corrected_page_size >= len);
r->space_avail = corrected_page_size - len;
WT_ASSERT(session, r->space_avail >= add_len);
@@ -2278,7 +2289,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
size_t inuse;
btree = S2BT(session);
- dsk = r->dsk.mem;
+ dsk = r->disk_image.mem;
/*
* We should never split during salvage, and we're about to drop core
@@ -2410,8 +2421,10 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
/* Finalize the header information and write the page. */
dsk->recno = last->recno;
dsk->u.entries = r->entries;
- dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
- WT_RET(__rec_split_write(session, r, last, &r->dsk, false));
+ dsk->mem_size =
+ r->disk_image.size = WT_PTRDIFF32(r->first_free, dsk);
+ WT_RET(
+ __rec_split_write(session, r, last, &r->disk_image, false));
/*
* Set the caller's entry count and buffer information for the
@@ -2475,7 +2488,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
unpack = &_unpack;
compressor = btree->compressor;
dst = &r->raw_destination;
- dsk = r->dsk.mem;
+ dsk = r->disk_image.mem;
WT_RET(__rec_split_bnd_grow(session, r));
last = &r->bnd[r->bnd_next];
@@ -2751,7 +2764,7 @@ no_slots:
r->first_free = dsk_start + len;
r->space_avail += r->raw_offsets[result_slots];
WT_ASSERT(session, r->first_free + r->space_avail <=
- (uint8_t *)r->dsk.mem + r->dsk.memsize);
+ (uint8_t *)r->disk_image.mem + r->disk_image.memsize);
/*
* Set the key for the next block (before writing the block, a
@@ -2788,14 +2801,15 @@ no_slots:
WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
dsk->recno = last->recno;
- dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ dsk->mem_size =
+ r->disk_image.size = WT_PTRDIFF32(r->first_free, dsk);
dsk->u.entries = r->entries;
r->entries = 0;
r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
- write_ref = &r->dsk;
+ write_ref = &r->disk_image;
last->already_compressed = false;
} else {
/*
@@ -2823,7 +2837,7 @@ no_slots:
last_block && __rec_is_checkpoint(session, r, last)) {
if (write_ref == dst)
WT_RET(__wt_buf_set(
- session, &r->dsk, dst->mem, dst->size));
+ session, &r->disk_image, dst->mem, dst->size));
} else
WT_RET(
__rec_split_write(session, r, last, write_ref, last_block));
@@ -2966,14 +2980,14 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
bnd->entries = r->entries;
/* Finalize the header information. */
- dsk = r->dsk.mem;
+ dsk = r->disk_image.mem;
dsk->recno = bnd->recno;
dsk->u.entries = r->entries;
- dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ dsk->mem_size = r->disk_image.size = WT_PTRDIFF32(r->first_free, dsk);
/* If this is a checkpoint, we're done, otherwise write the page. */
return (__rec_is_checkpoint(session, r, bnd) ?
- 0 : __rec_split_write(session, r, bnd, &r->dsk, true));
+ 0 : __rec_split_write(session, r, bnd, &r->disk_image, true));
}
/*
@@ -3023,9 +3037,9 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* WT_PAGE_HEADER header onto the scratch buffer, most of the header
* information remains unchanged between the pages.
*/
- WT_RET(__wt_scr_alloc(session, r->dsk.memsize, &tmp));
+ WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp));
dsk = tmp->mem;
- memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE);
+ memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_SIZE);
/*
* For each split chunk we've created, update the disk image and copy
@@ -3035,7 +3049,8 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) {
/* Copy the page contents to the temporary buffer. */
len = (bnd + 1)->offset - bnd->offset;
- memcpy(dsk_start, (uint8_t *)r->dsk.mem + bnd->offset, len);
+ memcpy(dsk_start,
+ (uint8_t *)r->disk_image.mem + bnd->offset, len);
/* Finalize the header information and write the page. */
dsk->recno = bnd->recno;
@@ -3060,12 +3075,12 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* chunk, including header, because if there was room for that large a
* remnant, we wouldn't have switched from accumulating to a page end.
*/
- p = (uint8_t *)r->dsk.mem + bnd->offset;
+ p = (uint8_t *)r->disk_image.mem + bnd->offset;
len = WT_PTRDIFF(r->first_free, p);
if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree))
WT_PANIC_ERR(session, EINVAL,
"Reconciliation remnant too large for the split buffer");
- dsk = r->dsk.mem;
+ dsk = r->disk_image.mem;
dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
(void)memmove(dsk_start, p, len);
@@ -3208,13 +3223,17 @@ supd_check_complete:
}
/*
- * If using the save/restore eviction path and we had to skip updates in
- * order to build this disk image, we can't actually write it. Instead,
- * we will re-instantiate the page using the disk image and the list of
- * updates we skipped.
+ * If configured for an in-memory database, or using the save/restore
+ * eviction path and we had to skip updates in order to build this disk
+ * image, we can't actually write it. Instead, we will re-instantiate
+ * the page using the disk image and any list of updates we skipped.
*/
- if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
- r->cache_write_restore = true;
+ if (F_ISSET(r, WT_EVICT_IN_MEMORY) ||
+ (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL)) {
+
+ /* Statistics tracking that we used update/restore. */
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL)
+ r->cache_write_restore = true;
/*
* If the buffer is compressed (raw compression was configured),
@@ -3228,10 +3247,10 @@ supd_check_complete:
*/
if (bnd->already_compressed)
WT_ERR(__rec_raw_decompress(
- session, buf->data, buf->size, &bnd->dsk));
+ session, buf->data, buf->size, &bnd->disk_image));
else {
WT_ERR(__wt_strndup(
- session, buf->data, buf->size, &bnd->dsk));
+ session, buf->data, buf->size, &bnd->disk_image));
WT_ASSERT(session, __wt_verify_dsk_image(session,
"[evict split]", buf->data, buf->size, true) == 0);
}
@@ -3784,8 +3803,6 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
case WT_PM_REC_REPLACE:
addr = &child->modify->mod_replace;
break;
- case WT_PM_REC_REWRITE:
- break;
WT_ILLEGAL_VALUE_ERR(session);
}
break;
@@ -5281,7 +5298,7 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_free(session, multi->key.ikey);
break;
}
- if (multi->supd == NULL) {
+ if (multi->disk_image == NULL) {
if (multi->addr.reuse)
multi->addr.addr = NULL;
else {
@@ -5291,7 +5308,7 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
}
} else {
__wt_free(session, multi->supd);
- __wt_free(session, multi->supd_dsk);
+ __wt_free(session, multi->disk_image);
}
}
__wt_free(session, mod->mod_multi);
@@ -5319,6 +5336,44 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
+ * __rec_split_dump_keys --
+ * Dump out the split keys in verbose mode.
+ *
+ * Emits one WT_VERB_SPLIT message with the number of boundaries
+ * (r->bnd_next), then one message per boundary in r->bnd. For row
+ * pages (WT_PAGE_ROW_INT, WT_PAGE_ROW_LEAF) the boundary key is first
+ * converted with __wt_buf_set_printable and logged as the starting
+ * key; for column pages (WT_PAGE_COL_FIX, WT_PAGE_COL_INT,
+ * WT_PAGE_COL_VAR) the boundary's starting record number is logged.
+ *
+ * The scratch buffer is only allocated for row pages. The err label
+ * frees it and returns ret on every path that reaches it.
+ */
+static int
+__rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r)
+{
+ WT_BOUNDARY *bnd;
+ WT_DECL_ITEM(tkey);
+ WT_DECL_RET;
+ uint32_t i;
+
+ if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__wt_scr_alloc(session, 0, &tkey));
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_SPLIT, "split: %" PRIu32 " pages", r->bnd_next));
+ for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__wt_buf_set_printable(
+ session, tkey, bnd->key.data, bnd->key.size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "starting key %.*s",
+ (int)tkey->size, (const char *)tkey->data));
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "starting recno %" PRIu64, bnd->recno));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+err: __wt_scr_free(session, &tkey);
+ return (ret);
+}
+
+/*
* __rec_write_wrapup --
* Finish the reconciliation.
*/
@@ -5328,7 +5383,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_BM *bm;
WT_BOUNDARY *bnd;
WT_BTREE *btree;
- WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
WT_REF *ref;
size_t addr_size;
@@ -5376,7 +5430,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
case WT_PM_REC_EMPTY: /* Page deleted */
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
- case WT_PM_REC_REWRITE: /* Rewrite */
/*
* Discard the multiple replacement blocks.
*/
@@ -5442,24 +5495,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
bnd = &r->bnd[0];
/*
- * If we're saving/restoring changes for this page, there's
- * nothing to write. Allocate, then initialize the array of
- * replacement blocks.
+ * If saving/restoring changes for this page and there's only
+ * one block, there's nothing to write. This is an in-memory
+ * configuration or a special case of forced eviction: set up
+ * a single block as if to split, then use that disk image to
+ * rewrite the page in memory.
*/
- if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
- WT_RET(__wt_calloc_def(
- session, r->bnd_next, &mod->mod_multi));
- multi = mod->mod_multi;
- multi->supd = bnd->supd;
- multi->supd_entries = bnd->supd_next;
- bnd->supd = NULL;
- multi->supd_dsk = bnd->dsk;
- bnd->dsk = NULL;
- mod->mod_multi_entries = 1;
-
- mod->rec_result = WT_PM_REC_REWRITE;
- break;
- }
+ if (bnd->disk_image != NULL)
+ goto split;
/*
* If this is a root page, then we don't have an address and we
@@ -5467,7 +5510,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* we were about to write the buffer so we know what to do here.
*/
if (bnd->addr.addr == NULL)
- WT_RET(__wt_bt_write(session, &r->dsk,
+ WT_RET(__wt_bt_write(session, &r->disk_image,
NULL, NULL, true, bnd->already_compressed));
else {
mod->mod_replace = bnd->addr;
@@ -5495,49 +5538,18 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_ILLEGAL_VALUE(session);
}
- /* Display the actual split keys. */
- if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT)) {
- WT_DECL_ITEM(tkey);
- WT_DECL_RET;
- uint32_t i;
-
- if (page->type == WT_PAGE_ROW_INT ||
- page->type == WT_PAGE_ROW_LEAF)
- WT_RET(__wt_scr_alloc(session, 0, &tkey));
- for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
- switch (page->type) {
- case WT_PAGE_ROW_INT:
- case WT_PAGE_ROW_LEAF:
- WT_ERR(__wt_buf_set_printable(
- session, tkey,
- bnd->key.data, bnd->key.size));
- WT_ERR(__wt_verbose(
- session, WT_VERB_SPLIT,
- "split: starting key "
- "%.*s",
- (int)tkey->size,
- (const char *)tkey->data));
- break;
- case WT_PAGE_COL_FIX:
- case WT_PAGE_COL_INT:
- case WT_PAGE_COL_VAR:
- WT_ERR(__wt_verbose(
- session, WT_VERB_SPLIT,
- "split: starting recno %" PRIu64,
- bnd->recno));
- break;
- WT_ILLEGAL_VALUE_ERR(session);
- }
-err: __wt_scr_free(session, &tkey);
- WT_RET(ret);
- }
+ /* Optionally display the actual split keys in verbose mode. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT))
+ WT_RET(__rec_split_dump_keys(session, page, r));
+
+ /* Track the largest set of page-splits. */
if (r->bnd_next > r->bnd_next_max) {
r->bnd_next_max = r->bnd_next;
WT_STAT_FAST_DATA_SET(
session, rec_multiblock_max, r->bnd_next_max);
}
- switch (page->type) {
+split: switch (page->type) {
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
WT_RET(__rec_split_row(session, r, page));
@@ -5575,14 +5587,10 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* information (otherwise we might think the backing block is being
* reused on a subsequent reconciliation where we want to free it).
*/
- switch (mod->rec_result) {
- case WT_PM_REC_MULTIBLOCK:
- case WT_PM_REC_REWRITE:
+ if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
for (multi = mod->mod_multi,
i = 0; i < mod->mod_multi_entries; ++multi, ++i)
multi->addr.reuse = 0;
- break;
- }
/*
* On error, discard blocks we've written, they're unreferenced by the
@@ -5641,18 +5649,18 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_RET(__wt_row_ikey_alloc(session, 0,
bnd->key.data, bnd->key.size, &multi->key.ikey));
- if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
- multi->supd = bnd->supd;
- multi->supd_entries = bnd->supd_next;
- bnd->supd = NULL;
- multi->supd_dsk = bnd->dsk;
- bnd->dsk = NULL;
- } else {
+ if (bnd->disk_image == NULL) {
multi->addr = bnd->addr;
multi->addr.reuse = 0;
multi->size = bnd->size;
multi->cksum = bnd->cksum;
bnd->addr.addr = NULL;
+ } else {
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->disk_image = bnd->disk_image;
+ bnd->disk_image = NULL;
}
}
mod->mod_multi_entries = r->bnd_next;
@@ -5681,18 +5689,18 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
multi->key.recno = bnd->recno;
- if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
- multi->supd = bnd->supd;
- multi->supd_entries = bnd->supd_next;
- bnd->supd = NULL;
- multi->supd_dsk = bnd->dsk;
- bnd->dsk = NULL;
- } else {
+ if (bnd->disk_image == NULL) {
multi->addr = bnd->addr;
multi->addr.reuse = 0;
multi->size = bnd->size;
multi->cksum = bnd->cksum;
bnd->addr.addr = NULL;
+ } else {
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->disk_image = bnd->disk_image;
+ bnd->disk_image = NULL;
}
}
mod->mod_multi_entries = r->bnd_next;
diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c
index a86cff4d723..ba8664f2e39 100644
--- a/src/third_party/wiredtiger/src/schema/schema_open.c
+++ b/src/third_party/wiredtiger/src/schema/schema_open.c
@@ -260,11 +260,11 @@ err: __wt_scr_free(session, &buf);
}
/*
- * __wt_schema_open_index --
- * Open one or more indices for a table.
+ * __schema_open_index --
+ * Open one or more indices for a table (internal version).
*/
-int
-__wt_schema_open_index(WT_SESSION_IMPL *session,
+static int
+__schema_open_index(WT_SESSION_IMPL *session,
WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
{
WT_CURSOR *cursor;
@@ -387,6 +387,21 @@ err: __wt_scr_free(session, &tmp);
}
/*
+ * __wt_schema_open_index --
+ * Open one or more indices for a table.
+ *
+ * Thin wrapper: forwards every argument unchanged to
+ * __schema_open_index, invoked through WT_WITH_TXN_ISOLATION with
+ * WT_ISO_READ_UNCOMMITTED, and returns that call's result.
+ *
+ * NOTE(review): presumably the macro runs the call with the session
+ * isolation level temporarily switched and restores it afterwards;
+ * confirm against the macro definition.
+ */
+int
+__wt_schema_open_index(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
+{
+ WT_DECL_RET;
+
+ WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED,
+ ret = __schema_open_index(session, table, idxname, len, indexp));
+ return (ret);
+}
+
+/*
* __wt_schema_open_indices --
* Open the indices for a table.
*/
@@ -397,11 +412,11 @@ __wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table)
}
/*
- * __wt_schema_open_table --
- * Open a named table.
+ * __schema_open_table --
+ * Open a named table (internal version).
*/
-int
-__wt_schema_open_table(WT_SESSION_IMPL *session,
+static int
+__schema_open_table(WT_SESSION_IMPL *session,
const char *name, size_t namelen, bool ok_incomplete, WT_TABLE **tablep)
{
WT_CONFIG cparser;
@@ -597,3 +612,19 @@ err: __wt_schema_release_table(session, table);
WT_RET(ENOENT);
WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
}
+
+/*
+ * __wt_schema_open_table --
+ * Open a named table.
+ *
+ * Thin wrapper: forwards every argument unchanged to
+ * __schema_open_table, invoked through WT_WITH_TXN_ISOLATION with
+ * WT_ISO_READ_UNCOMMITTED, and returns that call's result.
+ *
+ * NOTE(review): presumably the macro runs the call with the session
+ * isolation level temporarily switched and restores it afterwards;
+ * confirm against the macro definition.
+ */
+int
+__wt_schema_open_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, bool ok_incomplete, WT_TABLE **tablep)
+{
+ WT_DECL_RET;
+
+ WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED,
+ ret = __schema_open_table(
+ session, name, namelen, ok_incomplete, tablep));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c
index 0c2c5fe78c0..d14b81d389f 100644
--- a/src/third_party/wiredtiger/src/schema/schema_stat.c
+++ b/src/third_party/wiredtiger/src/schema/schema_stat.c
@@ -53,6 +53,63 @@ err: __wt_scr_free(session, &buf);
}
/*
+ * __curstat_size_only --
+ * For very simple tables we can avoid getting table handles if
+ * configured to only retrieve the size. It's worthwhile because
+ * workloads that create and drop a lot of tables can put a lot of
+ * pressure on the table list lock.
+ *
+ * *was_fast is cleared on entry and set only after the statistics in
+ * cst have been populated. The return value can be zero either way, so
+ * callers must test *was_fast to learn whether the fast path was taken.
+ */
+static int
+__curstat_size_only(WT_SESSION_IMPL *session,
+ const char *uri, bool *was_fast,WT_CURSOR_STAT *cst)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM ckey, colconf, cval;
+ WT_DECL_RET;
+ WT_ITEM namebuf;
+ wt_off_t filesize;
+ char *tableconf;
+
+ WT_CLEAR(namebuf);
+ *was_fast = false;
+
+ /* Retrieve the metadata for this table. */
+ WT_RET(__wt_metadata_search(session, uri, &tableconf));
+
+ /*
+ * The fast path only works if the table consists of a single file
+ * and does not have any indexes. The absence of named columns is how
+ * we determine that the table can have neither multiple files nor
+ * indexes: if the columns configuration has any entry, leave with a
+ * zero return and *was_fast still false.
+ */
+ WT_ERR(__wt_config_getones(session, tableconf, "columns", &colconf));
+ WT_ERR(__wt_config_subinit(session, &cparser, &colconf));
+ if ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
+ goto err;
+
+ /*
+ * Build up the file name from the table URI: skip the length of the
+ * table: prefix (not checked here) and append the .wt suffix.
+ */
+ WT_ERR(__wt_buf_fmt(
+ session, &namebuf, "%s.wt", uri + strlen("table:")));
+ /*
+ * Get the size of the underlying file. Nothing stops this from racing
+ * with schema-level table operations (for example, drop); if there is
+ * a race, an error message will be generated.
+ */
+ WT_ERR(__wt_filesize_name(session, namebuf.data, &filesize));
+
+ /* Set up and populate the statistics structure. */
+ __wt_stat_dsrc_init_single(&cst->u.dsrc_stats);
+ cst->u.dsrc_stats.block_size = filesize;
+ __wt_curstat_dsrc_final(cst);
+
+ *was_fast = true;
+
+err: __wt_free(session, tableconf);
+ __wt_buf_free(session, &namebuf);
+
+ return (ret);
+}
+
+/*
* __wt_curstat_table_init --
* Initialize the statistics for a table.
*/
@@ -67,6 +124,17 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session,
WT_TABLE *table;
u_int i;
const char *name;
+ bool was_fast;
+
+ /*
+ * If only gathering table size statistics, try a fast path that
+ * avoids the schema and table list locks.
+ */
+ if (F_ISSET(cst, WT_CONN_STAT_SIZE)) {
+ WT_RET(__curstat_size_only(session, uri, &was_fast, cst));
+ if (was_fast)
+ return (0);
+ }
name = uri + strlen("table:");
WT_RET(__wt_schema_get_table(
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index a766829afad..2045329b8ff 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -33,6 +33,8 @@ __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers)
__wt_buf_free(session, &cursor->value);
}
}
+
+ WT_ASSERT(session, session->ncursors == 0);
return (ret);
}
@@ -59,6 +61,33 @@ __wt_session_copy_values(WT_SESSION_IMPL *session)
}
/*
+ * __wt_session_release_resources --
+ * Release common session resources.
+ *
+ * Runs the session's block manager and reconciliation cleanup
+ * handlers when they are set, then discards the session's scratch
+ * buffers and error buffer. Results of the handlers are collected in
+ * ret through WT_TRET and returned at the end.
+ *
+ * NOTE(review): WT_TRET presumably records a failure without returning
+ * early, so the later cleanup steps still run; confirm against the
+ * macro definition.
+ */
+int
+__wt_session_release_resources(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ /* Block manager cleanup, if the session has a handler set. */
+ if (session->block_manager_cleanup != NULL)
+ WT_TRET(session->block_manager_cleanup(session));
+
+ /* Reconciliation cleanup, if the session has a handler set. */
+ if (session->reconcile_cleanup != NULL)
+ WT_TRET(session->reconcile_cleanup(session));
+
+ /*
+ * Discard scratch buffers, error memory; last, just in case a cleanup
+ * routine uses scratch buffers.
+ */
+ __wt_scr_discard(session);
+ __wt_buf_free(session, &session->err);
+
+ return (ret);
+}
+
+/*
* __session_clear --
* Clear a session structure.
*/
@@ -132,24 +161,17 @@ __session_close(WT_SESSION *wt_session, const char *config)
/* Close all tables. */
WT_TRET(__wt_schema_close_tables(session));
+ /* Confirm we're not holding any hazard pointers. */
+ __wt_hazard_close(session);
+
/* Discard metadata tracking. */
__wt_meta_track_discard(session);
- /* Discard scratch buffers, error memory. */
- __wt_scr_discard(session);
- __wt_buf_free(session, &session->err);
-
/* Free transaction information. */
__wt_txn_destroy(session);
- /* Confirm we're not holding any hazard pointers. */
- __wt_hazard_close(session);
-
- /* Cleanup */
- if (session->block_manager_cleanup != NULL)
- WT_TRET(session->block_manager_cleanup(session));
- if (session->reconcile_cleanup != NULL)
- WT_TRET(session->reconcile_cleanup(session));
+ /* Release common session resources. */
+ WT_TRET(__wt_session_release_resources(session));
/* Destroy the thread's mutex. */
WT_TRET(__wt_cond_destroy(session, &session->cond));
@@ -547,39 +569,13 @@ __session_reset(WT_SESSION *wt_session)
WT_TRET(__wt_session_reset_cursors(session, true));
- WT_ASSERT(session, session->ncursors == 0);
-
- __wt_scr_discard(session);
- __wt_buf_free(session, &session->err);
+ /* Release common session resources. */
+ WT_TRET(__wt_session_release_resources(session));
err: API_END_RET_NOTFOUND_MAP(session, ret);
}
/*
- * __session_compact --
- * WT_SESSION->compact method.
- */
-static int
-__session_compact(WT_SESSION *wt_session, const char *uri, const char *config)
-{
- WT_SESSION_IMPL *session;
-
- session = (WT_SESSION_IMPL *)wt_session;
-
- /* Disallow objects in the WiredTiger name space. */
- WT_RET(__wt_str_name_check(session, uri));
-
- if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
- !WT_PREFIX_MATCH(uri, "file:") &&
- !WT_PREFIX_MATCH(uri, "index:") &&
- !WT_PREFIX_MATCH(uri, "lsm:") &&
- !WT_PREFIX_MATCH(uri, "table:"))
- return (__wt_bad_object_type(session, uri));
-
- return (__wt_session_compact(wt_session, uri, config));
-}
-
-/*
* __wt_session_drop --
* Internal version of WT_SESSION::drop.
*/
@@ -630,6 +626,9 @@ __session_salvage(WT_SESSION *wt_session, const char *uri, const char *config)
SESSION_API_CALL(session, salvage, config, cfg);
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ WT_ERR(ENOTSUP);
+
/* Block out checkpoints to avoid spurious EBUSY errors. */
WT_WITH_CHECKPOINT_LOCK(session,
WT_WITH_SCHEMA_LOCK(session, ret =
@@ -818,6 +817,10 @@ __session_verify(WT_SESSION *wt_session, const char *uri, const char *config)
session = (WT_SESSION_IMPL *)wt_session;
SESSION_API_CALL(session, verify, config, cfg);
+
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ WT_ERR(ENOTSUP);
+
/* Block out checkpoints to avoid spurious EBUSY errors. */
WT_WITH_CHECKPOINT_LOCK(session,
WT_WITH_SCHEMA_LOCK(session,
@@ -1036,11 +1039,12 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
session = (WT_SESSION_IMPL *)wt_session;
- txn = &session->txn;
-
WT_STAT_FAST_CONN_INCR(session, txn_checkpoint);
SESSION_API_CALL(session, checkpoint, config, cfg);
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ WT_ERR(ENOTSUP);
+
/*
* Checkpoints require a snapshot to write a transactionally consistent
* snapshot of the data.
@@ -1054,43 +1058,20 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
* from evicting anything newer than this because we track the oldest
* transaction ID in the system that is not visible to all readers.
*/
+ txn = &session->txn;
if (F_ISSET(txn, WT_TXN_RUNNING))
WT_ERR_MSG(session, EINVAL,
"Checkpoint not permitted in a transaction");
- /*
- * Reset open cursors. Do this explicitly, even though it will happen
- * implicitly in the call to begin_transaction for the checkpoint, the
- * checkpoint code will acquire the schema lock before we do that, and
- * some implementation of WT_CURSOR::reset might need the schema lock.
- */
- WT_ERR(__wt_session_reset_cursors(session, false));
-
- /*
- * Don't highjack the session checkpoint thread for eviction.
- *
- * Application threads are not generally available for potentially slow
- * operations, but checkpoint does enough I/O it may be called upon to
- * perform slow operations for the block manager.
- */
- F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION);
+ ret = __wt_txn_checkpoint(session, cfg);
/*
- * Only one checkpoint can be active at a time, and checkpoints must run
- * in the same order as they update the metadata. It's probably a bad
- * idea to run checkpoints out of multiple threads, but serialize them
- * here to ensure we don't get into trouble.
+ * Release common session resources (for example, checkpoint may acquire
+ * significant reconciliation structures/memory).
*/
- WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1);
-
- WT_WITH_CHECKPOINT_LOCK(session,
- ret = __wt_txn_checkpoint(session, cfg));
+ WT_TRET(__wt_session_release_resources(session));
- WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
-
-err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION);
-
- API_END_RET_NOTFOUND_MAP(session, ret);
+err: API_END_RET_NOTFOUND_MAP(session, ret);
}
/*
@@ -1160,7 +1141,7 @@ __open_session(WT_CONNECTION_IMPL *conn,
__session_strerror,
__session_open_cursor,
__session_create,
- __session_compact,
+ __wt_session_compact,
__session_drop,
__session_log_flush,
__session_log_printf,
diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c
index bbd4bbc536c..bd503cd7826 100644
--- a/src/third_party/wiredtiger/src/session/session_compact.c
+++ b/src/third_party/wiredtiger/src/session/session_compact.c
@@ -146,24 +146,12 @@ __session_compact_check_timeout(
static int
__compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
- WT_DECL_RET;
+ struct timespec start_time;
WT_DECL_ITEM(t);
- WT_SESSION *wt_session;
- WT_TXN *txn;
+ WT_DECL_RET;
int i;
- struct timespec start_time;
-
- txn = &session->txn;
- wt_session = &session->iface;
-
- /*
- * File compaction requires checkpoints, which will fail in a
- * transactional context. Check now so the error message isn't
- * confusing.
- */
- if (session->compact->file_count != 0 && F_ISSET(txn, WT_TXN_RUNNING))
- WT_ERR_MSG(session, EINVAL,
- " File compaction not permitted in a transaction");
+ const char *checkpoint_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_checkpoint), NULL, NULL };
/*
* Force the checkpoint: we don't want to skip it because the work we
@@ -171,6 +159,7 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
*/
WT_ERR(__wt_scr_alloc(session, 128, &t));
WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri));
+ checkpoint_cfg[1] = t->data;
WT_ERR(__wt_epoch(session, &start_time));
@@ -182,7 +171,7 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
* time through the loop.
*/
for (i = 0; i < 100; ++i) {
- WT_ERR(wt_session->checkpoint(wt_session, t->data));
+ WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg));
session->compaction = false;
WT_WITH_SCHEMA_LOCK(session,
@@ -192,8 +181,8 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
if (!session->compaction)
break;
- WT_ERR(wt_session->checkpoint(wt_session, t->data));
- WT_ERR(wt_session->checkpoint(wt_session, t->data));
+ WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg));
+ WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg));
WT_ERR(__session_compact_check_timeout(session, start_time));
}
@@ -212,10 +201,24 @@ __wt_session_compact(
WT_CONFIG_ITEM cval;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ WT_TXN *txn;
session = (WT_SESSION_IMPL *)wt_session;
SESSION_API_CALL(session, compact, config, cfg);
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ WT_ERR(ENOTSUP);
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "lsm:") &&
+ !WT_PREFIX_MATCH(uri, "table:"))
+ WT_ERR(__wt_bad_object_type(session, uri));
+
/* Setup the structure in the session handle */
memset(&compact, 0, sizeof(WT_COMPACT));
session->compact = &compact;
@@ -231,9 +234,27 @@ __wt_session_compact(
if (session->compact->lsm_count != 0)
WT_ERR(__wt_schema_worker(
session, uri, NULL, __wt_lsm_compact, cfg, 0));
- if (session->compact->file_count != 0)
+ if (session->compact->file_count != 0) {
+ /*
+ * File compaction requires checkpoints, which will fail in a
+ * transactional context. Check now so the error message isn't
+ * confusing.
+ */
+ txn = &session->txn;
+ if (F_ISSET(txn, WT_TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL,
+ " File compaction not permitted in a transaction");
+
WT_ERR(__compact_file(session, uri, cfg));
+ }
err: session->compact = NULL;
+
+ /*
+ * Release common session resources (for example, checkpoint may acquire
+ * significant reconciliation structures/memory).
+ */
+ WT_TRET(__wt_session_release_resources(session));
+
API_END_RET_NOTFOUND_MAP(session, ret);
}
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
index e506b6848a1..346e9c0ab38 100644
--- a/src/third_party/wiredtiger/src/session/session_dhandle.c
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -541,14 +541,6 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint)
saved_dhandle = session->dhandle;
/*
- * If we already have the checkpoint locked, don't attempt to lock
- * it again.
- */
- if ((ret = __wt_meta_track_find_handle(
- session, saved_dhandle->name, checkpoint)) != WT_NOTFOUND)
- return (ret);
-
- /*
* Get the checkpoint handle exclusive, so no one else can access it
* while we are creating the new checkpoint.
*/
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index ccd6ce23560..066abc9ed0f 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -344,11 +344,11 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session,
}
/*
- * __wt_txn_checkpoint --
+ * __txn_checkpoint --
* Checkpoint a database or a list of objects in the database.
*/
-int
-__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+static int
+__txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
struct timespec start, stop, verb_timer;
WT_CONNECTION_IMPL *conn;
@@ -631,6 +631,50 @@ err: /*
}
/*
+ * __wt_txn_checkpoint --
+ * Serialized entry point for checkpointing a database or list of objects.
+ */
+int
+__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DECL_RET;
+
+ /*
+ * Reset open cursors. Do this explicitly, even though it will happen
+ * implicitly in the call to begin_transaction for the checkpoint: the
+ * checkpoint code will acquire the schema lock before we do that, and
+ * some implementations of WT_CURSOR::reset might need the schema lock.
+ */
+ WT_RET(__wt_session_reset_cursors(session, false));
+
+ /*
+ * Don't hijack the session checkpoint thread for eviction. Application
+ * threads are not generally available for potentially slow operations,
+ * but checkpoint does enough I/O it may be called upon to perform slow
+ * operations for the block manager.
+ * NOTE(review): the F_CLR below clears both flags even if already set.
+ */
+ F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION);
+
+ /*
+ * Only one checkpoint can be active at a time, and checkpoints must run
+ * in the same order as they update the metadata. It's probably a bad
+ * idea to run checkpoints out of multiple threads, but as compaction
+ * calls checkpoint directly, it can be tough to avoid. Serialize here
+ * to ensure we don't get into trouble.
+ */
+ WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1);
+
+ WT_WITH_CHECKPOINT_LOCK(session, ret = __txn_checkpoint(session, cfg));
+
+ WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
+
+ F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION);
+
+ return (ret);
+}
+
+/*
* __drop --
* Drop all checkpoints with a specific name.
*/
@@ -726,8 +770,8 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
* Checkpoint a tree.
*/
static int
-__checkpoint_worker(
- WT_SESSION_IMPL *session, const char *cfg[], bool is_checkpoint)
+__checkpoint_worker(WT_SESSION_IMPL *session,
+ const char *cfg[], bool is_checkpoint, bool need_tracking)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -753,6 +797,22 @@ __checkpoint_worker(
name_alloc = NULL;
/*
+ * Only referenced in diagnostic builds and gcc 5.1 isn't satisfied
+ * with wrapping the entire assert condition in the unused macro.
+ */
+ WT_UNUSED(need_tracking);
+
+ /*
+ * Most callers need meta tracking to be on here, otherwise it is
+ * possible for this checkpoint to cleanup handles that are still in
+ * use. The exceptions are:
+ * - Checkpointing the metadata handle itself.
+ * - On connection close when we know there can't be any races.
+ */
+ WT_ASSERT(session, !need_tracking ||
+ WT_IS_METADATA(dhandle) || WT_META_TRACKING(session));
+
+ /*
* Set the checkpoint LSN to the maximum LSN so that if logging is
* disabled, recovery will never roll old changes forward over the
* non-logged changes in this checkpoint. If logging is enabled, a
@@ -1128,7 +1188,7 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
/* Should be holding the schema lock. */
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
- return (__checkpoint_worker(session, cfg, true));
+ return (__checkpoint_worker(session, cfg, true, true));
}
/*
@@ -1208,7 +1268,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
if (need_tracking)
WT_RET(__wt_meta_track_on(session));
- WT_TRET(__checkpoint_worker(session, NULL, false));
+ WT_TRET(__checkpoint_worker(session, NULL, false, need_tracking));
if (need_tracking)
WT_RET(__wt_meta_track_off(session, true, ret != 0));
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 63d86969311..d0b3b909f09 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -421,8 +421,8 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);
/* We need a real session for recovery. */
- WT_RET(__wt_open_session(conn, NULL, NULL, true, &session));
- F_SET(session, WT_SESSION_NO_LOGGING);
+ WT_RET(__wt_open_internal_session(conn, "txn-recover",
+ false, WT_SESSION_NO_LOGGING, &session));
r.session = session;
WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));