summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@mongodb.com>2015-09-11 16:23:54 +1000
committerMichael Cahill <michael.cahill@mongodb.com>2015-09-11 16:23:54 +1000
commit58c7ad85c90619d4fa0e7e4df3b9f4d643b9b73b (patch)
tree63cfbe95d22f14a3d3366d68976df0d739318e9c /src/third_party/wiredtiger/src
parent8b205afd0ae74fd7351bc183e39b8931044f3987 (diff)
downloadmongo-58c7ad85c90619d4fa0e7e4df3b9f4d643b9b73b.tar.gz
Import wiredtiger-wiredtiger-2.6.1-1056-g5205bb1.tar.gz from wiredtiger branch mongodb-3.2
Diffstat (limited to 'src/third_party/wiredtiger/src')
-rw-r--r--src/third_party/wiredtiger/src/async/async_op.c2
-rw-r--r--src/third_party/wiredtiger/src/block/block_ext.c14
-rw-r--r--src/third_party/wiredtiger/src/block/block_slvg.c12
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_compact.c35
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c59
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c8
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_delete.c10
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_discard.c28
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_handle.c38
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_ovfl.c12
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_page.c213
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_read.c557
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_slvg.c12
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c119
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_stat.c10
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_vrfy.c8
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c17
-rw-r--r--src/third_party/wiredtiger/src/btree/col_modify.c18
-rw-r--r--src/third_party/wiredtiger/src/btree/row_modify.c1
-rw-r--r--src/third_party/wiredtiger/src/cache/cache_las.c391
-rw-r--r--src/third_party/wiredtiger/src/config/config_def.c39
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_api.c7
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_cache_pool.c219
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_handle.c2
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_log.c322
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_open.c9
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_stat.c1
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_sweep.c19
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_backup.c8
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_ds.c2
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_file.c15
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_index.c11
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_log.c8
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_stat.c2
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_std.c4
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_file.c9
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c290
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_page.c81
-rw-r--r--src/third_party/wiredtiger/src/include/bitstring.i4
-rw-r--r--src/third_party/wiredtiger/src/include/btmem.h48
-rw-r--r--src/third_party/wiredtiger/src/include/btree.h14
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i216
-rw-r--r--src/third_party/wiredtiger/src/include/cache.h16
-rw-r--r--src/third_party/wiredtiger/src/include/cache.i100
-rw-r--r--src/third_party/wiredtiger/src/include/cell.i2
-rw-r--r--src/third_party/wiredtiger/src/include/connection.h20
-rw-r--r--src/third_party/wiredtiger/src/include/cursor.h1
-rw-r--r--src/third_party/wiredtiger/src/include/cursor.i2
-rw-r--r--src/third_party/wiredtiger/src/include/error.h3
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h57
-rw-r--r--src/third_party/wiredtiger/src/include/flags.h25
-rw-r--r--src/third_party/wiredtiger/src/include/gcc.h4
-rw-r--r--src/third_party/wiredtiger/src/include/hardware.h10
-rw-r--r--src/third_party/wiredtiger/src/include/lint.h12
-rw-r--r--src/third_party/wiredtiger/src/include/log.h163
-rw-r--r--src/third_party/wiredtiger/src/include/log.i40
-rw-r--r--src/third_party/wiredtiger/src/include/meta.h4
-rw-r--r--src/third_party/wiredtiger/src/include/misc.h1
-rw-r--r--src/third_party/wiredtiger/src/include/misc.i12
-rw-r--r--src/third_party/wiredtiger/src/include/msvc.h4
-rw-r--r--src/third_party/wiredtiger/src/include/serial.i44
-rw-r--r--src/third_party/wiredtiger/src/include/session.h5
-rw-r--r--src/third_party/wiredtiger/src/include/stat.h15
-rw-r--r--src/third_party/wiredtiger/src/include/txn.h3
-rw-r--r--src/third_party/wiredtiger/src/include/txn.i45
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in371
-rw-r--r--src/third_party/wiredtiger/src/include/wt_internal.h6
-rw-r--r--src/third_party/wiredtiger/src/log/log.c615
-rw-r--r--src/third_party/wiredtiger/src/log/log_slot.c584
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_cursor.c2
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_merge.c4
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_tree.c8
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_work_unit.c20
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_apply.c2
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_table.c33
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c13
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_open.c2
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_path.c4
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_errno.c4
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_mtx_cond.c29
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_path.c4
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c1181
-rw-r--r--src/third_party/wiredtiger/src/session/session_api.c47
-rw-r--r--src/third_party/wiredtiger/src/support/pow.c2
-rw-r--r--src/third_party/wiredtiger/src/support/stat.c56
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ckpt.c14
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_log.c31
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c6
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_list.c13
89 files changed, 4227 insertions, 2306 deletions
diff --git a/src/third_party/wiredtiger/src/async/async_op.c b/src/third_party/wiredtiger/src/async/async_op.c
index 7e1920933c2..469dbc8e615 100644
--- a/src/third_party/wiredtiger/src/async/async_op.c
+++ b/src/third_party/wiredtiger/src/async/async_op.c
@@ -237,7 +237,7 @@ __async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id)
asyncop->c.set_key = __wt_cursor_set_key;
asyncop->c.get_value = __wt_cursor_get_value;
asyncop->c.set_value = __wt_cursor_set_value;
- asyncop->c.recno = 0;
+ asyncop->c.recno = WT_RECNO_OOB;
memset(asyncop->c.raw_recno_buf, 0, sizeof(asyncop->c.raw_recno_buf));
memset(&asyncop->c.key, 0, sizeof(asyncop->c.key));
memset(&asyncop->c.value, 0, sizeof(asyncop->c.value));
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
index cdef1682faf..018f6a20164 100644
--- a/src/third_party/wiredtiger/src/block/block_ext.c
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -86,7 +86,7 @@ __block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, int skip_off)
* __block_first_srch --
* Search the skiplist for the first available slot.
*/
-static inline int
+static inline bool
__block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
{
WT_EXT *ext;
@@ -99,11 +99,11 @@ __block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
if (ext->size >= size)
break;
if (ext == NULL)
- return (0);
+ return (false);
/* Build a stack for the offset we want. */
__block_off_srch(head, ext->off, stack, 0);
- return (1);
+ return (true);
}
/*
@@ -251,7 +251,7 @@ __block_off_insert(
* Return if any part of a specified range appears on a specified extent
* list.
*/
-static int
+static bool
__block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
WT_EXT *before, *after;
@@ -261,10 +261,10 @@ __block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size)
/* If "before" or "after" overlaps, we have a winner. */
if (before != NULL && before->off + before->size > off)
- return (1);
+ return (true);
if (after != NULL && off + size > after->off)
- return (1);
- return (0);
+ return (true);
+ return (false);
}
/*
diff --git a/src/third_party/wiredtiger/src/block/block_slvg.c b/src/third_party/wiredtiger/src/block/block_slvg.c
index c78a6c39942..641bb8a42f7 100644
--- a/src/third_party/wiredtiger/src/block/block_slvg.c
+++ b/src/third_party/wiredtiger/src/block/block_slvg.c
@@ -73,19 +73,19 @@ __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
* __wt_block_offset_invalid --
* Return if the block offset is insane.
*/
-int
+bool
__wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size)
{
if (size == 0) /* < minimum page size */
- return (1);
+ return (true);
if (size % block->allocsize != 0) /* not allocation-size units */
- return (1);
+ return (true);
if (size > WT_BTREE_PAGE_SIZE_MAX) /* > maximum page size */
- return (1);
+ return (true);
/* past end-of-file */
if (offset + (wt_off_t)size > block->fh->size)
- return (1);
- return (0);
+ return (true);
+ return (false);
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
index 18f8ca54601..79a52dbcaa3 100644
--- a/src/third_party/wiredtiger/src/btree/bt_compact.c
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -53,12 +53,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
} else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) {
/*
* The page's modification information can change underfoot if
- * the page is being reconciled, lock the page down.
+ * the page is being reconciled, serialize with reconciliation.
*/
- WT_PAGE_LOCK(session, page);
+ F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
ret = bm->compact_page_skip(bm, session,
mod->mod_replace.addr, mod->mod_replace.size, skipp);
- WT_PAGE_UNLOCK(session, page);
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
WT_RET(ret);
}
return (0);
@@ -73,14 +73,12 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_BM *bm;
WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_REF *ref;
- int block_manager_begin, evict_reset, skip;
+ int block_manager_begin, skip;
WT_UNUSED(cfg);
- conn = S2C(session);
btree = S2BT(session);
bm = btree->bm;
ref = NULL;
@@ -118,25 +116,6 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
*/
__wt_spin_lock(session, &btree->flush_lock);
- /*
- * That leaves eviction, we don't want to block eviction. Set a flag
- * so reconciliation knows compaction is running. If reconciliation
- * sees the flag it locks the page it's writing, we acquire the same
- * lock when reading the page's modify information, serializing access.
- * The same page lock blocks work on the page, but compaction is an
- * uncommon, heavy-weight operation. If it's ever a problem, there's
- * no reason we couldn't use an entirely separate lock than the page
- * lock.
- *
- * We also need to ensure we don't race with an on-going reconciliation.
- * After we set the flag, wait for eviction of this file to drain, and
- * then let eviction continue;
- */
- conn->compact_in_memory_pass = 1;
- WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
- if (evict_reset)
- __wt_evict_file_exclusive_off(session);
-
/* Start compaction. */
WT_ERR(bm->compact_start(bm, session));
block_manager_begin = 1;
@@ -172,11 +151,7 @@ err: if (ref != NULL)
if (block_manager_begin)
WT_TRET(bm->compact_end(bm, session));
- /*
- * Unlock will be a release barrier, use it to update the compaction
- * status for reconciliation.
- */
- conn->compact_in_memory_pass = 0;
+ /* Unblock threads writing leaf pages. */
__wt_spin_unlock(session, &btree->flush_lock);
return (ret);
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 9f41e3ae684..458a1985e28 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -70,7 +70,7 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
* __cursor_valid --
 * Return if the cursor references a valid key/value pair.
*/
-static inline int
+static inline bool
__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
{
WT_BTREE *btree;
@@ -133,10 +133,10 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
if (cbt->ins != NULL &&
(upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
if (WT_UPDATE_DELETED_ISSET(upd))
- return (0);
+ return (false);
if (updp != NULL)
*updp = upd;
- return (1);
+ return (true);
}
/*
@@ -155,7 +155,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* keys, check for retrieval past the end of the page.
*/
if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries)
- return (0);
+ return (false);
/*
* Updates aren't stored on the page, an update would have
@@ -170,7 +170,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* "slots", check if search returned a valid slot.
*/
if (cbt->slot >= page->pg_var_entries)
- return (0);
+ return (false);
/*
* Updates aren't stored on the page, an update would have
@@ -181,7 +181,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
cip = &page->pg_var_d[cbt->slot];
if ((cell = WT_COL_PTR(page, cip)) == NULL ||
__wt_cell_type(cell) == WT_CELL_DEL)
- return (0);
+ return (false);
break;
case BTREE_ROW:
/*
@@ -189,7 +189,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* key as an on-page object, we're done.
*/
if (cbt->ins != NULL)
- return (0);
+ return (false);
/*
 * Check if search returned a valid slot (the failure mode is
@@ -198,19 +198,19 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* mirrors the column-store test).
*/
if (cbt->slot >= page->pg_row_entries)
- return (0);
+ return (false);
/* Updates are stored on the page, check for a delete. */
if (page->pg_row_upd != NULL && (upd = __wt_txn_read(
session, page->pg_row_upd[cbt->slot])) != NULL) {
if (WT_UPDATE_DELETED_ISSET(upd))
- return (0);
+ return (false);
if (updp != NULL)
*updp = upd;
}
break;
}
- return (1);
+ return (true);
}
/*
@@ -517,7 +517,7 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ERR(__cursor_col_search(session, cbt, NULL));
if (F_ISSET(cursor, WT_CURSTD_APPEND))
- cbt->iface.recno = 0;
+ cbt->iface.recno = WT_RECNO_OOB;
/*
* If not overwriting, fail if the key exists. Creating a
@@ -911,7 +911,7 @@ __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
* __cursor_equals --
* Return if two cursors reference the same row.
*/
-static inline int
+static inline bool
__cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
{
switch (a->btree->type) {
@@ -923,21 +923,21 @@ __cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
* one being returned to the application.
*/
if (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno)
- return (1);
+ return (true);
break;
case BTREE_ROW:
if (a->ref != b->ref)
- return (0);
+ return (false);
if (a->ins != NULL || b->ins != NULL) {
if (a->ins == b->ins)
- return (1);
+ return (true);
break;
}
if (a->slot == b->slot)
- return (1);
+ return (true);
break;
}
- return (0);
+ return (false);
}
/*
@@ -1153,6 +1153,19 @@ err: if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
}
/*
+ * __wt_btcur_init --
+ * Initialize a cursor used for internal purposes.
+ */
+void
+__wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ memset(cbt, 0, sizeof(WT_CURSOR_BTREE));
+
+ cbt->iface.session = &session->iface;
+ cbt->btree = S2BT(session);
+}
+
+/*
* __wt_btcur_open --
* Open a btree cursor.
*/
@@ -1168,14 +1181,22 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt)
* Close a btree cursor.
*/
int
-__wt_btcur_close(WT_CURSOR_BTREE *cbt)
+__wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- ret = __curfile_leave(cbt);
+ /*
+ * The in-memory split and lookaside table code creates low-level btree
+ * cursors to search/modify leaf pages. Those cursors don't hold hazard
+ * pointers, nor are they counted in the session handle's cursor count.
+ * Skip the usual cursor tear-down in that case.
+ */
+ if (!lowlevel)
+ ret = __curfile_leave(cbt);
+
__wt_buf_free(session, &cbt->_row_key);
__wt_buf_free(session, &cbt->_tmp);
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 77d80cdb3a2..38ef407e160 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -340,6 +340,8 @@ __wt_debug_disk(
__dmsg(ds, ", empty-all");
if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
__dmsg(ds, ", empty-none");
+ if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE))
+ __dmsg(ds, ", LAS-update");
__dmsg(ds, ", generation %" PRIu64 "\n", dsk->write_gen);
@@ -643,12 +645,10 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, ", disk-mapped");
if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
__dmsg(ds, ", evict-lru");
- if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING))
- __dmsg(ds, ", scanning");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION))
+ __dmsg(ds, ", reconciliation");
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
__dmsg(ds, ", split-insert");
- if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED))
- __dmsg(ds, ", split-locked");
if (mod != NULL)
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index cddfa0ef801..0d512b13c5e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -77,7 +77,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
}
(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
- ret = __wt_evict_page(session, ref);
+ ret = __wt_evict(session, ref, 0);
(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
WT_RET_BUSY_OK(ret);
}
@@ -216,10 +216,10 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
* __wt_delete_page_skip --
* If iterating a cursor, skip deleted pages that are visible to us.
*/
-int
+bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
- int skip;
+ bool skip;
/*
* Deleted pages come from two sources: either it's a fast-delete as
@@ -240,10 +240,10 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
* the structure, just to be safe.
*/
if (ref->page_del == NULL)
- return (1);
+ return (true);
if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
- return (0);
+ return (false);
skip = (ref->page_del == NULL ||
__wt_txn_visible(session, ref->page_del->txnid));
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index 060a93f543f..73e6affccd3 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -15,7 +15,6 @@ static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t);
static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *);
static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t);
-static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *);
/*
* __wt_ref_out --
@@ -56,7 +55,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
*/
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
- WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION));
#ifdef HAVE_DIAGNOSTIC
{
@@ -160,8 +159,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_free(session, multi->key.ikey);
break;
}
- __wt_free(session, multi->skip);
- __wt_free(session, multi->skip_dsk);
+ __wt_free(session, multi->supd);
+ __wt_free(session, multi->supd_dsk);
__wt_free(session, multi->addr.addr);
}
__wt_free(session, mod->mod_multi);
@@ -235,10 +234,7 @@ __wt_free_ref(
* it clean explicitly.)
*/
if (free_pages && ref->page != NULL) {
- if (ref->page->modify != NULL) {
- ref->page->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, ref->page);
- }
+ __wt_page_modify_clear(session, ref->page);
__wt_page_out(session, &ref->page);
}
@@ -373,7 +369,7 @@ __free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins)
WT_INSERT *next;
for (; ins != NULL; ins = next) {
- __free_update_list(session, ins->upd);
+ __wt_free_update_list(session, ins->upd);
next = WT_SKIP_NEXT(ins);
__wt_free(session, ins);
}
@@ -395,29 +391,23 @@ __free_update(
*/
for (updp = update_head; entries > 0; --entries, ++updp)
if (*updp != NULL)
- __free_update_list(session, *updp);
+ __wt_free_update_list(session, *updp);
/* Free the update array. */
__wt_free(session, update_head);
}
/*
- * __free_update_list --
+ * __wt_free_update_list --
* Walk a WT_UPDATE forward-linked list and free the per-thread combination
* of a WT_UPDATE structure and its associated data.
*/
-static void
-__free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+void
+__wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
WT_UPDATE *next;
for (; upd != NULL; upd = next) {
- /* Everything we free should be visible to everyone. */
- WT_ASSERT(session,
- F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
- upd->txnid == WT_TXN_ABORTED ||
- __wt_txn_visible_all(session, upd->txnid));
-
next = upd->next;
__wt_free(session, upd);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index 0cc6b6eb25f..6a4243a0fc7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -255,27 +255,17 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
/* Page sizes */
WT_RET(__btree_page_sizes(session));
- /*
- * Set special flags for the metadata file.
- * Eviction; the metadata file is never evicted.
- * Logging; the metadata file is always logged if possible.
- */
- if (WT_IS_METADATA(btree->dhandle)) {
+ WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
+ if (cval.val)
F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
+ else
+ F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
+
+ WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
+ if (cval.val)
F_CLR(btree, WT_BTREE_NO_LOGGING);
- } else {
- WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
- if (cval.val)
- F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
- else
- F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
-
- WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
- if (cval.val)
- F_CLR(btree, WT_BTREE_NO_LOGGING);
- else
- F_SET(btree, WT_BTREE_NO_LOGGING);
- }
+ else
+ F_SET(btree, WT_BTREE_NO_LOGGING);
/* Checksums */
WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
@@ -370,7 +360,7 @@ __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno)
root_ref->page = root;
root_ref->state = WT_REF_MEM;
- root_ref->key.recno = is_recno ? 1 : 0;
+ root_ref->key.recno = is_recno ? 1 : WT_RECNO_OOB;
root->pg_intl_parent_ref = root_ref;
}
@@ -697,9 +687,11 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
btree->maxmempage =
WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage);
- cache_size = S2C(session)->cache_size;
- if (cache_size > 0)
- btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 4);
+ if (!F_ISSET(S2C(session), WT_CONN_CACHE_POOL)) {
+ if ((cache_size = S2C(session)->cache_size) > 0)
+ btree->maxmempage =
+ WT_MIN(btree->maxmempage, cache_size / 4);
+ }
/*
* Get the split percentage (reconciliation splits pages into smaller
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
index d8456c5b61f..7104e702418 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -79,7 +79,7 @@ __wt_ovfl_read(WT_SESSION_IMPL *session,
* __ovfl_cache_col_visible --
* column-store: check for a globally visible update.
*/
-static int
+static bool
__ovfl_cache_col_visible(
WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
{
@@ -99,15 +99,15 @@ __ovfl_cache_col_visible(
if (__wt_cell_rle(unpack) == 1 &&
upd != NULL && /* Sanity: upd should always be set. */
__wt_txn_visible_all(session, upd->txnid))
- return (1);
- return (0);
+ return (true);
+ return (false);
}
/*
* __ovfl_cache_row_visible --
* row-store: check for a globally visible update.
*/
-static int
+static bool
__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
{
WT_UPDATE *upd;
@@ -115,9 +115,9 @@ __ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
/* Check to see if there's a globally visible update. */
for (upd = WT_ROW_UPDATE(page, rip); upd != NULL; upd = upd->next)
if (__wt_txn_visible_all(session, upd->txnid))
- return (1);
+ return (true);
- return (0);
+ return (false);
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 922dc2892b8..ba218fc332c 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -17,219 +17,6 @@ static int __inmem_row_leaf_entries(
WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *);
/*
- * __evict_force_check --
- * Check if a page matches the criteria for forced eviction.
- */
-static int
-__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- WT_BTREE *btree;
-
- btree = S2BT(session);
-
- /* Pages are usually small enough, check that first. */
- if (page->memory_footprint < btree->maxmempage)
- return (0);
-
- /* Leaf pages only. */
- if (WT_PAGE_IS_INTERNAL(page))
- return (0);
-
- /*
- * It's hard to imagine a page with a huge memory footprint that has
- * never been modified, but check to be sure.
- */
- if (page->modify == NULL)
- return (0);
-
- /* Trigger eviction on the next page release. */
- __wt_page_evict_soon(page);
-
- /* Bump the oldest ID, we're about to do some visibility checks. */
- __wt_txn_update_oldest(session, 0);
-
- /* If eviction cannot succeed, don't try. */
- return (__wt_page_can_evict(session, page, 1, NULL));
-}
-
-/*
- * __wt_page_in_func --
- * Acquire a hazard pointer to a page; if the page is not in-memory,
- * read it from the disk and build an in-memory version.
- */
-int
-__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
-#ifdef HAVE_DIAGNOSTIC
- , const char *file, int line
-#endif
- )
-{
- WT_BTREE *btree;
- WT_DECL_RET;
- WT_PAGE *page;
- u_int sleep_cnt, wait_cnt;
- int busy, cache_work, force_attempts, oldgen;
-
- btree = S2BT(session);
-
- for (force_attempts = oldgen = 0, wait_cnt = 0;;) {
- switch (ref->state) {
- case WT_REF_DISK:
- case WT_REF_DELETED:
- if (LF_ISSET(WT_READ_CACHE))
- return (WT_NOTFOUND);
-
- /*
- * The page isn't in memory, read it. If this thread is
- * allowed to do eviction work, check for space in the
- * cache.
- */
- if (!LF_ISSET(WT_READ_NO_EVICT))
- WT_RET(__wt_cache_eviction_check(
- session, 1, NULL));
- WT_RET(__wt_cache_read(session, ref));
- oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
- F_ISSET(session, WT_SESSION_NO_CACHE);
- continue;
- case WT_REF_READING:
- if (LF_ISSET(WT_READ_CACHE))
- return (WT_NOTFOUND);
- if (LF_ISSET(WT_READ_NO_WAIT))
- return (WT_NOTFOUND);
-
- /* Waiting on another thread's read, stall. */
- WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
- goto stall;
- case WT_REF_LOCKED:
- if (LF_ISSET(WT_READ_NO_WAIT))
- return (WT_NOTFOUND);
-
- /* Waiting on eviction, stall. */
- WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
- goto stall;
- case WT_REF_SPLIT:
- return (WT_RESTART);
- case WT_REF_MEM:
- /*
- * The page is in memory.
- *
- * Get a hazard pointer if one is required. We cannot
- * be evicting if no hazard pointer is required, we're
- * done.
- */
- if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
- goto skip_evict;
-
- /*
- * The expected reason we can't get a hazard pointer is
- * because the page is being evicted, yield, try again.
- */
-#ifdef HAVE_DIAGNOSTIC
- WT_RET(
- __wt_hazard_set(session, ref, &busy, file, line));
-#else
- WT_RET(__wt_hazard_set(session, ref, &busy));
-#endif
- if (busy) {
- WT_STAT_FAST_CONN_INCR(
- session, page_busy_blocked);
- break;
- }
-
- /*
- * If eviction is configured for this file, check to see
- * if the page qualifies for forced eviction and update
- * the page's generation number. If eviction isn't being
- * done on this file, we're done.
- */
- if (LF_ISSET(WT_READ_NO_EVICT) ||
- F_ISSET(btree, WT_BTREE_NO_EVICTION))
- goto skip_evict;
-
- /*
- * Forcibly evict pages that are too big.
- */
- page = ref->page;
- if (force_attempts < 10 &&
- __evict_force_check(session, page)) {
- ++force_attempts;
- ret = __wt_page_release_evict(session, ref);
- /* If forced eviction fails, stall. */
- if (ret == EBUSY) {
- ret = 0;
- WT_STAT_FAST_CONN_INCR(session,
- page_forcible_evict_blocked);
- goto stall;
- }
- WT_RET(ret);
-
- /*
- * The result of a successful forced eviction
- * is a page-state transition (potentially to
- * an in-memory page we can use, or a restart
- * return for our caller), continue the outer
- * page-acquisition loop.
- */
- continue;
- }
-
- /*
- * If we read the page and we are configured to not
- * trash the cache, set the oldest read generation so
- * the page is forcibly evicted as soon as possible.
- *
- * Otherwise, update the page's read generation.
- */
- if (oldgen && page->read_gen == WT_READGEN_NOTSET)
- __wt_page_evict_soon(page);
- else if (!LF_ISSET(WT_READ_NO_GEN) &&
- page->read_gen != WT_READGEN_OLDEST &&
- page->read_gen < __wt_cache_read_gen(session))
- page->read_gen =
- __wt_cache_read_gen_bump(session);
-skip_evict:
- /*
- * Check if we need an autocommit transaction.
- * Starting a transaction can trigger eviction, so skip
- * it if eviction isn't permitted.
- */
- return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
- __wt_txn_autocommit_check(session));
- WT_ILLEGAL_VALUE(session);
- }
-
- /*
- * We failed to get the page -- yield before retrying, and if
- * we've yielded enough times, start sleeping so we don't burn
- * CPU to no purpose.
- */
- if (++wait_cnt < 1000)
- __wt_yield();
- else {
- if (0) {
-stall: wait_cnt += 1000;
- }
-
- /*
- * If stalling and this thread is allowed to do eviction
- * work, check if the cache needs help. If we do work
- * for the cache, substitute that for a sleep.
- */
- if (!LF_ISSET(WT_READ_NO_EVICT)) {
- WT_RET(__wt_cache_eviction_check(
- session, 1, &cache_work));
- if (cache_work)
- continue;
- }
- sleep_cnt = WT_MIN(wait_cnt, 10000);
- wait_cnt *= 2;
- WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
- __wt_sleep(0, sleep_cnt);
- }
- }
-}
-
-/*
* __wt_page_alloc --
* Create or read a page into the cache.
*/
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index a3ce39b7758..d26b44e04c0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -9,12 +9,320 @@
#include "wt_internal.h"
/*
- * __wt_cache_read --
- * Read a page from the file.
+ * __wt_las_remove_block --
+ * Remove all records matching a key prefix from the lookaside store.
*/
int
-__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
+__wt_las_remove_block(WT_SESSION_IMPL *session,
+ WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size)
{
+ WT_DECL_ITEM(las_addr);
+ WT_DECL_ITEM(las_key);
+ WT_DECL_RET;
+ uint64_t las_counter, las_txnid;
+ uint32_t las_id;
+ int exact;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_key));
+
+ /*
+ * Search for the block's unique prefix and step through all matching
+ * records, removing them.
+ */
+ las_addr->data = addr;
+ las_addr->size = addr_size;
+ las_key->size = 0;
+ cursor->set_key(
+ cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
+ if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
+ ret = cursor->next(cursor);
+ for (; ret == 0; ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, las_addr, &las_counter, &las_txnid, las_key));
+
+ /*
+ * Confirm the search using the unique prefix; if not a match,
+ * we're done searching for records for this page.
+ */
+ if (las_id != btree_id ||
+ las_addr->size != addr_size ||
+ memcmp(las_addr->data, addr, addr_size) != 0)
+ break;
+
+ /*
+ * Cursor opened overwrite=true: won't return WT_NOTFOUND should
+ * another thread remove the record before we do, and the cursor
+ * remains positioned in that case.
+ */
+ WT_ERR(cursor->remove(cursor));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_scr_free(session, &las_addr);
+ __wt_scr_free(session, &las_key);
+ return (ret);
+}
+
+/*
+ * __col_instantiate --
+ * Update a column-store page entry based on a lookaside table update list.
+ */
+static int
+__col_instantiate(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ /* Search the page and add updates. */
+ WT_RET(__wt_col_search(session, recno, ref, cbt));
+ WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, 0));
+ return (0);
+}
+
+/*
+ * __row_instantiate --
+ * Update a row-store page entry based on a lookaside table update list.
+ */
+static int
+__row_instantiate(WT_SESSION_IMPL *session,
+ WT_ITEM *key, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ /* Search the page and add updates. */
+ WT_RET(__wt_row_search(session, key, ref, cbt, 1));
+ WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, 0));
+ return (0);
+}
+
+/*
+ * __las_page_instantiate --
+ * Instantiate lookaside update records in a recently read page.
+ */
+static int
+__las_page_instantiate(WT_SESSION_IMPL *session,
+ WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
+{
+ WT_CURSOR *cursor;
+ WT_CURSOR_BTREE cbt;
+ WT_DECL_ITEM(current_key);
+ WT_DECL_ITEM(las_addr);
+ WT_DECL_ITEM(las_key);
+ WT_DECL_ITEM(las_value);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_UPDATE *first_upd, *last_upd, *upd;
+ size_t incr, total_incr;
+ uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
+ uint32_t las_id, upd_size, session_flags;
+ int exact;
+ const uint8_t *p;
+
+ cursor = NULL;
+ page = ref->page;
+ first_upd = last_upd = upd = NULL;
+ total_incr = 0;
+ current_recno = recno = WT_RECNO_OOB;
+ session_flags = 0; /* [-Werror=maybe-uninitialized] */
+
+ __wt_btcur_init(session, &cbt);
+ __wt_btcur_open(&cbt);
+
+ WT_ERR(__wt_scr_alloc(session, 0, &current_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_value));
+
+ /* Open a lookaside table cursor. */
+ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+
+ /*
+ * The lookaside records are in key and update order, that is, there
+ * will be a set of in-order updates for a key, then another set of
+ * in-order updates for a subsequent key. We process all of the updates
+ * for a key and then insert those updates into the page, then all the
+ * updates for the next key, and so on.
+ *
+ * Search for the block's unique prefix, stepping through any matching
+ * records.
+ */
+ las_addr->data = addr;
+ las_addr->size = addr_size;
+ las_key->size = 0;
+ cursor->set_key(
+ cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
+ if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
+ ret = cursor->next(cursor);
+ for (; ret == 0; ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, las_addr, &las_counter, &las_txnid, las_key));
+
+ /*
+ * Confirm the search using the unique prefix; if not a match,
+ * we're done searching for records for this page.
+ */
+ if (las_id != read_id ||
+ las_addr->size != addr_size ||
+ memcmp(las_addr->data, addr, addr_size) != 0)
+ break;
+
+ /*
+ * If the on-page value has become globally visible, this record
+ * is no longer needed.
+ */
+ if (__wt_txn_visible_all(session, las_txnid))
+ continue;
+
+ /* Allocate the WT_UPDATE structure. */
+ WT_ERR(cursor->get_value(
+ cursor, &upd_txnid, &upd_size, las_value));
+ WT_ERR(__wt_update_alloc(session,
+ (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
+ &upd, &incr));
+ total_incr += incr;
+ upd->txnid = upd_txnid;
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ p = las_key->data;
+ WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
+ if (current_recno == recno)
+ break;
+ WT_ASSERT(session, current_recno < recno);
+
+ if (first_upd != NULL) {
+ WT_ERR(__col_instantiate(session,
+ current_recno, ref, &cbt, first_upd));
+ first_upd = NULL;
+ }
+ current_recno = recno;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (current_key->size == las_key->size &&
+ memcmp(current_key->data,
+ las_key->data, las_key->size) == 0)
+ break;
+
+ if (first_upd != NULL) {
+ WT_ERR(__row_instantiate(session,
+ current_key, ref, &cbt, first_upd));
+ first_upd = NULL;
+ }
+ WT_ERR(__wt_buf_set(session,
+ current_key, las_key->data, las_key->size));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Append the latest update to the list. */
+ if (first_upd == NULL)
+ first_upd = last_upd = upd;
+ else {
+ last_upd->next = upd;
+ last_upd = upd;
+ }
+ upd = NULL;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* Insert the last set of updates, if any. */
+ if (first_upd != NULL)
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__col_instantiate(session,
+ current_recno, ref, &cbt, first_upd));
+ first_upd = NULL;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__row_instantiate(session,
+ current_key, ref, &cbt, first_upd));
+ first_upd = NULL;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Discard the cursor. */
+ WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ if (total_incr != 0) {
+ __wt_cache_page_inmem_incr(session, page, total_incr);
+
+ /*
+ * We've modified/dirtied the page, but that's not necessary and
+ * if we keep the page clean, it's easier to evict. We leave the
+ * lookaside table updates in place, so if we evict this page
+ * without dirtying it, any future instantiation of it will find
+ * the records it needs. If the page is dirtied before eviction,
+ * then we'll write any needed lookaside table records for the
+ * new location of the page.
+ */
+ __wt_page_modify_clear(session, page);
+ }
+
+err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+ WT_TRET(__wt_btcur_close(&cbt, 1));
+
+ /*
+ * On error, upd points to a single unlinked WT_UPDATE structure,
+ * first_upd points to a list.
+ */
+ if (upd != NULL)
+ __wt_free(session, upd);
+ if (first_upd != NULL)
+ __wt_free_update_list(session, first_upd);
+
+ __wt_scr_free(session, &current_key);
+ __wt_scr_free(session, &las_addr);
+ __wt_scr_free(session, &las_key);
+ __wt_scr_free(session, &las_value);
+
+ return (ret);
+}
+
+/*
+ * __evict_force_check --
+ * Check if a page matches the criteria for forced eviction.
+ */
+static int
+__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* Pages are usually small enough, check that first. */
+ if (page->memory_footprint < btree->maxmempage)
+ return (0);
+
+ /* Leaf pages only. */
+ if (WT_PAGE_IS_INTERNAL(page))
+ return (0);
+
+ /*
+ * It's hard to imagine a page with a huge memory footprint that has
+ * never been modified, but check to be sure.
+ */
+ if (page->modify == NULL)
+ return (0);
+
+ /* Trigger eviction on the next page release. */
+ __wt_page_evict_soon(page);
+
+ /* Bump the oldest ID, we're about to do some visibility checks. */
+ __wt_txn_update_oldest(session, 0);
+
+ /* If eviction cannot succeed, don't try. */
+ return (__wt_page_can_evict(session, page, 1, NULL));
+}
+
+/*
+ * __page_read --
+ * Read a page from the file.
+ */
+static int
+__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ const WT_PAGE_HEADER *dsk;
+ WT_BTREE *btree;
WT_DECL_RET;
WT_ITEM tmp;
WT_PAGE *page;
@@ -22,6 +330,7 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
uint32_t previous_state;
const uint8_t *addr;
+ btree = S2BT(session);
page = NULL;
/*
@@ -45,8 +354,6 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* Get the address: if there is no address, the page was deleted, but a
* subsequent search or insert is forcing re-creation of the name space.
- * Otherwise, there's an address, read the backing disk page and build
- * an in-memory version of the page.
*/
WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
if (addr == NULL) {
@@ -54,27 +361,51 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
WT_ERR(__wt_btree_new_leaf_page(session, &page));
ref->page = page;
- } else {
- /*
- * Read the page, then build the in-memory version of the page.
- * Clear any local reference to an allocated copy of the disk
- * image on return, the page steals it.
- */
- WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
- WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
- WT_DATA_IN_ITEM(&tmp) ?
- WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
- tmp.mem = NULL;
-
- /* If the page was deleted, instantiate that information. */
- if (previous_state == WT_REF_DELETED)
- WT_ERR(__wt_delete_page_instantiate(session, ref));
+ goto done;
}
- WT_ERR(__wt_verbose(session, WT_VERB_READ,
- "page %p: %s", page, __wt_page_type_string(page->type)));
+ /*
+ * There's an address, read or map the backing disk page and build an
+ * in-memory version of the page.
+ */
+ WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
+ WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
+ WT_DATA_IN_ITEM(&tmp) ?
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
+
+ /*
+ * Clear the local reference to an allocated copy of the disk image on
+ * return; the page steals it, errors in this code should not free it.
+ */
+ tmp.mem = NULL;
- WT_PUBLISH(ref->state, WT_REF_MEM);
+ /*
+ * If reading for a checkpoint, there's no additional work to do, the
+ * page on disk is correct as written.
+ */
+ if (session->dhandle->checkpoint != NULL)
+ goto done;
+
+ /* If the page was deleted, instantiate that information. */
+ if (previous_state == WT_REF_DELETED)
+ WT_ERR(__wt_delete_page_instantiate(session, ref));
+
+ /*
+ * Instantiate updates from the database's lookaside table. The page
+ * flag was set when the page was written, potentially a long time ago.
+ * We only care if the lookaside table is currently active, check that
+ * before doing any work.
+ */
+ dsk = tmp.data;
+ if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
+ WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
+ WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);
+
+ WT_ERR(__las_page_instantiate(
+ session, ref, btree->id, addr, addr_size));
+ }
+
+done: WT_PUBLISH(ref->state, WT_REF_MEM);
return (0);
err: /*
@@ -90,3 +421,183 @@ err: /*
return (ret);
}
+
+/*
+ * __wt_page_in_func --
+ * Acquire a hazard pointer to a page; if the page is not in-memory,
+ * read it from the disk and build an in-memory version.
+ */
+int
+__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ u_int sleep_cnt, wait_cnt;
+ int busy, cache_work, force_attempts, oldgen, stalled;
+
+ btree = S2BT(session);
+ stalled = 0;
+
+ for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) {
+ switch (ref->state) {
+ case WT_REF_DISK:
+ case WT_REF_DELETED:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+
+ /*
+ * The page isn't in memory, read it. If this thread is
+ * allowed to do eviction work, check for space in the
+ * cache.
+ */
+ if (!LF_ISSET(WT_READ_NO_EVICT))
+ WT_RET(__wt_cache_eviction_check(
+ session, 1, NULL));
+ WT_RET(__page_read(session, ref));
+ oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
+ F_ISSET(session, WT_SESSION_NO_CACHE);
+ continue;
+ case WT_REF_READING:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+ if (LF_ISSET(WT_READ_NO_WAIT))
+ return (WT_NOTFOUND);
+
+ /* Waiting on another thread's read, stall. */
+ WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
+ stalled = 1;
+ break;
+ case WT_REF_LOCKED:
+ if (LF_ISSET(WT_READ_NO_WAIT))
+ return (WT_NOTFOUND);
+
+ /* Waiting on eviction, stall. */
+ WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
+ stalled = 1;
+ break;
+ case WT_REF_SPLIT:
+ return (WT_RESTART);
+ case WT_REF_MEM:
+ /*
+ * The page is in memory.
+ *
+ * Get a hazard pointer if one is required. We cannot
+ * be evicting if no hazard pointer is required, we're
+ * done.
+ */
+ if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
+ goto skip_evict;
+
+ /*
+ * The expected reason we can't get a hazard pointer is
+ * because the page is being evicted, yield, try again.
+ */
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(
+ __wt_hazard_set(session, ref, &busy, file, line));
+#else
+ WT_RET(__wt_hazard_set(session, ref, &busy));
+#endif
+ if (busy) {
+ WT_STAT_FAST_CONN_INCR(
+ session, page_busy_blocked);
+ break;
+ }
+
+ /*
+ * If eviction is configured for this file, check to see
+ * if the page qualifies for forced eviction and update
+ * the page's generation number. If eviction isn't being
+ * done on this file, we're done.
+ */
+ if (LF_ISSET(WT_READ_NO_EVICT) ||
+ F_ISSET(session, WT_SESSION_NO_EVICTION) ||
+ F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ goto skip_evict;
+
+ /*
+ * Forcibly evict pages that are too big.
+ */
+ page = ref->page;
+ if (force_attempts < 10 &&
+ __evict_force_check(session, page)) {
+ ++force_attempts;
+ ret = __wt_page_release_evict(session, ref);
+ /* If forced eviction fails, stall. */
+ if (ret == EBUSY) {
+ ret = 0;
+ WT_STAT_FAST_CONN_INCR(session,
+ page_forcible_evict_blocked);
+ stalled = 1;
+ break;
+ }
+ WT_RET(ret);
+
+ /*
+ * The result of a successful forced eviction
+ * is a page-state transition (potentially to
+ * an in-memory page we can use, or a restart
+ * return for our caller), continue the outer
+ * page-acquisition loop.
+ */
+ continue;
+ }
+
+ /*
+ * If we read the page and we are configured to not
+ * trash the cache, set the oldest read generation so
+ * the page is forcibly evicted as soon as possible.
+ *
+ * Otherwise, update the page's read generation.
+ */
+ if (oldgen && page->read_gen == WT_READGEN_NOTSET)
+ __wt_page_evict_soon(page);
+ else if (!LF_ISSET(WT_READ_NO_GEN) &&
+ page->read_gen != WT_READGEN_OLDEST &&
+ page->read_gen < __wt_cache_read_gen(session))
+ page->read_gen =
+ __wt_cache_read_gen_bump(session);
+skip_evict:
+ /*
+ * Check if we need an autocommit transaction.
+ * Starting a transaction can trigger eviction, so skip
+ * it if eviction isn't permitted.
+ */
+ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
+ __wt_txn_autocommit_check(session));
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We failed to get the page -- yield before retrying, and if
+ * we've yielded enough times, start sleeping so we don't burn
+ * CPU to no purpose.
+ */
+ if (stalled)
+ wait_cnt += 1000;
+ else if (++wait_cnt < 1000) {
+ __wt_yield();
+ continue;
+ }
+
+ /*
+ * If stalling and this thread is allowed to do eviction work,
+ * check if the cache needs help. If we do work for the cache,
+ * substitute that for a sleep.
+ */
+ if (!LF_ISSET(WT_READ_NO_EVICT)) {
+ WT_RET(
+ __wt_cache_eviction_check(session, 1, &cache_work));
+ if (cache_work)
+ continue;
+ }
+ sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
+ WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
+ __wt_sleep(0, sleep_cnt);
+ }
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
index 22d4948e07d..c2a211bdd2d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_slvg.c
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -349,9 +349,6 @@ err: WT_TRET(bm->salvage_end(bm, session));
__wt_scr_free(session, &ss->tmp1);
__wt_scr_free(session, &ss->tmp2);
- /* Wrap up reporting. */
- WT_TRET(__wt_progress(session, NULL, ss->fcnt));
-
return (ret);
}
@@ -381,8 +378,9 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
if (eof)
break;
- /* Report progress every 10 chunks. */
- if (++ss->fcnt % 10 == 0)
+ /* Report progress occasionally. */
+#define WT_SALVAGE_PROGRESS_INTERVAL 100
+ if (++ss->fcnt % WT_SALVAGE_PROGRESS_INTERVAL == 0)
WT_ERR(__wt_progress(session, NULL, ss->fcnt));
/*
@@ -1305,7 +1303,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+ WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR));
/* Reset the page. */
page->pg_var_d = save_col_var;
@@ -2011,7 +2009,7 @@ __slvg_row_build_leaf(
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+ WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR));
/* Reset the page. */
page->pg_row_entries += skip_stop;
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index a63eadcaeab..4b9ab45c678 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -173,7 +173,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
* __split_should_deepen --
* Return if we should deepen the tree.
*/
-static int
+static bool
__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_BTREE *btree;
@@ -196,7 +196,7 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
* pressure on the cache).
*/
if (page->memory_footprint < btree->maxmempage)
- return (0);
+ return (false);
/*
* Ensure the page has enough entries to make it worth splitting and
@@ -204,7 +204,7 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
* splitting won't help).
*/
if (pindex->entries > btree->split_deepen_min_child)
- return (1);
+ return (true);
/*
* Don't allow a single page to put pressure on cache usage. The root
@@ -216,9 +216,9 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
if (pindex->entries >= 100 &&
(__wt_ref_is_root(ref) ||
page->memory_footprint >= S2C(session)->cache_size / 4))
- return (1);
+ return (true);
- return (0);
+ return (false);
}
/*
@@ -343,7 +343,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
switch (page->type) {
case WT_PAGE_COL_INT:
- recno = 0;
+ recno = 0; /* Less than any valid record number. */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
WT_ASSERT(session, ref->key.recno > recno);
recno = ref->key.recno;
@@ -684,13 +684,11 @@ __split_multi_inmem(
WT_DECL_RET;
WT_PAGE *page;
WT_UPDATE *upd;
- WT_UPD_SKIPPED *skip;
+ WT_SAVE_UPD *supd;
uint64_t recno;
uint32_t i, slot;
- WT_CLEAR(cbt);
- cbt.iface.session = &session->iface;
- cbt.btree = S2BT(session);
+ __wt_btcur_init(session, &cbt);
__wt_btcur_open(&cbt);
/*
@@ -704,22 +702,22 @@ __split_multi_inmem(
* allocated page on error, when discarding the allocated WT_REF.
*/
WT_RET(__wt_page_inmem(session, ref,
- multi->skip_dsk, ((WT_PAGE_HEADER *)multi->skip_dsk)->mem_size,
+ multi->supd_dsk, ((WT_PAGE_HEADER *)multi->supd_dsk)->mem_size,
WT_PAGE_DISK_ALLOC, &page));
- multi->skip_dsk = NULL;
+ multi->supd_dsk = NULL;
if (orig->type == WT_PAGE_ROW_LEAF)
WT_RET(__wt_scr_alloc(session, 0, &key));
/* Re-create each modification we couldn't write. */
- for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip)
+ for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd)
switch (orig->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
/* Build a key. */
- upd = skip->ins->upd;
- skip->ins->upd = NULL;
- recno = WT_INSERT_RECNO(skip->ins);
+ upd = supd->ins->upd;
+ supd->ins->upd = NULL;
+ recno = WT_INSERT_RECNO(supd->ins);
/* Search the page. */
WT_ERR(__wt_col_search(session, recno, ref, &cbt));
@@ -730,19 +728,19 @@ __split_multi_inmem(
break;
case WT_PAGE_ROW_LEAF:
/* Build a key. */
- if (skip->ins == NULL) {
- slot = WT_ROW_SLOT(orig, skip->rip);
+ if (supd->ins == NULL) {
+ slot = WT_ROW_SLOT(orig, supd->rip);
upd = orig->pg_row_upd[slot];
orig->pg_row_upd[slot] = NULL;
WT_ERR(__wt_row_leaf_key(
- session, orig, skip->rip, key, 0));
+ session, orig, supd->rip, key, 0));
} else {
- upd = skip->ins->upd;
- skip->ins->upd = NULL;
+ upd = supd->ins->upd;
+ supd->ins->upd = NULL;
- key->data = WT_INSERT_KEY(skip->ins);
- key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ key->data = WT_INSERT_KEY(supd->ins);
+ key->size = WT_INSERT_KEY_SIZE(supd->ins);
}
/* Search the page. */
@@ -765,7 +763,7 @@ __split_multi_inmem(
page->modify->first_dirty_txn = WT_TXN_FIRST;
err: /* Free any resources that may have been cached in the cursor. */
- WT_TRET(__wt_btcur_close(&cbt));
+ WT_TRET(__wt_btcur_close(&cbt, 1));
__wt_scr_free(session, &key);
return (ret);
@@ -801,7 +799,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
*/
ref->home = NULL;
- if (multi->skip == NULL) {
+ if (multi->supd == NULL) {
/*
* Copy the address: we could simply take the buffer, but that
* would complicate error handling, freeing the reference array
@@ -830,7 +828,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
break;
}
- ref->state = multi->skip == NULL ? WT_REF_DISK : WT_REF_MEM;
+ ref->state = multi->supd == NULL ? WT_REF_DISK : WT_REF_MEM;
/*
* If our caller wants to track the memory allocations, we have a return
@@ -841,16 +839,13 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
return (0);
}
-#define WT_SPLIT_EXCLUSIVE 0x01 /* Page held exclusively */
-#define WT_SPLIT_INMEM 0x02 /* In-memory split */
-
/*
* __split_parent --
* Resolve a multi-page split, inserting new information into the parent.
*/
static int
__split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
- WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, uint32_t flags)
+ WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive)
{
WT_DECL_RET;
WT_IKEY *ikey;
@@ -878,27 +873,39 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* memory inside of the lock and may want to invest effort in making the
* locked period shorter.
*
- * We could race with another thread deepening our parent. To deal
- * with that, read the parent pointer each time we try to lock it, and
- * check that it's still correct after it is locked.
+ * We use the reconciliation lock here because not only do we have to
+ * single-thread the split, we have to lock out reconciliation of the
+ * parent because reconciliation of the parent can't deal with finding
+ * a split child during internal page traversal. Basically, there's no
+ * reason to use a different lock if we have to block reconciliation
+ * anyway.
*/
for (;;) {
parent = ref->home;
- F_CAS_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED, ret);
+ F_CAS_ATOMIC(parent, WT_PAGE_RECONCILIATION, ret);
if (ret == 0) {
+ /*
+ * We can race with another thread deepening our parent.
+ * To deal with that, read the parent pointer each time
+ * we try to lock it, and check it's still correct after
+ * it's locked.
+ */
if (parent == ref->home)
break;
- F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED);
+ F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
continue;
}
/*
- * If we're attempting an in-memory split and we can't lock the
- * parent while there is a checkpoint in progress, give up.
- * This avoids an infinite loop where we are trying to split a
- * page while its parent is being checkpointed.
+ * A checkpoint reconciling this parent page can deadlock with
+ * our split. We have an exclusive page lock on the child before
+ * we acquire the page's reconciliation lock, and reconciliation
+ * acquires the page's reconciliation lock before it encounters
+ * the child's exclusive lock (which causes reconciliation to
+ * loop until the exclusive lock is resolved). If we can't lock
+ * the parent, give up to avoid that deadlock.
*/
- if (LF_ISSET(WT_SPLIT_INMEM) && S2BT(session)->checkpointing)
+ if (S2BT(session)->checkpointing)
return (EBUSY);
__wt_yield();
}
@@ -1095,8 +1102,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* Add it to the session discard list, to be freed when it's safe.
*/
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_TRET(__split_safe_free(session,
- split_gen, LF_ISSET(WT_SPLIT_EXCLUSIVE) ? 1 : 0, pindex, size));
+ WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
parent_decr += size;
/*
@@ -1121,7 +1127,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* Do the check here because we've just grown the parent page and
* are holding it locked.
*/
- if (ret == 0 && !LF_ISSET(WT_SPLIT_EXCLUSIVE) &&
+ if (ret == 0 && !exclusive &&
__split_should_deepen(session, parent_ref))
ret = __split_deepen(session, parent);
@@ -1131,7 +1137,7 @@ err: if (!complete)
if (next_ref->state == WT_REF_SPLIT)
next_ref->state = WT_REF_DELETED;
}
- F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED);
+ F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
if (hazard)
WT_TRET(__wt_hazard_clear(session, parent));
@@ -1170,7 +1176,13 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
right = NULL;
page_decr = parent_incr = right_incr = 0;
+ /*
+ * Assert splitting makes sense; specifically assert the page is dirty,
+ * we depend on that, otherwise the page might be evicted based on its
+ * last reconciliation which no longer matches reality after the split.
+ */
WT_ASSERT(session, __wt_page_can_split(session, page));
+ WT_ASSERT(session, __wt_page_is_modified(page));
/* Find the last item on the page. */
ins_head = page->pg_row_entries == 0 ?
@@ -1198,7 +1210,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* The key-instantiation code checks for races, clear the key fields so
* we don't trigger them.
*/
- child->key.recno = 0;
+ child->key.recno = WT_RECNO_OOB;
child->key.ikey = NULL;
child->state = WT_REF_MEM;
@@ -1373,7 +1385,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*/
page = NULL;
if ((ret = __split_parent(
- session, ref, split_ref, 2, parent_incr, WT_SPLIT_INMEM)) != 0) {
+ session, ref, split_ref, 2, parent_incr, 0)) != 0) {
/*
* Move the insert list element back to the original page list.
* For simplicity, the previous skip list pointers originally
@@ -1390,8 +1402,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* We marked the new page dirty; we're going to discard it, but
* first mark it clean and fix up the cache statistics.
*/
- right->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, right);
+ __wt_page_modify_clear(session, right);
WT_ERR(ret);
}
@@ -1448,8 +1459,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
* Pages with unresolved changes are not marked clean during
* reconciliation, do it now.
*/
- mod->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
+ __wt_page_modify_clear(session, page);
__wt_ref_out(session, ref);
/* Swap the new page into place. */
@@ -1492,8 +1502,8 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* Split into the parent; if we're closing the file, we hold it
* exclusively.
*/
- WT_ERR(__split_parent( session, ref, ref_new,
- new_entries, parent_incr, closing ? WT_SPLIT_EXCLUSIVE : 0));
+ WT_ERR(__split_parent(
+ session, ref, ref_new, new_entries, parent_incr, closing));
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_split);
@@ -1506,10 +1516,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* Pages with unresolved changes are not marked clean during
* reconciliation, do it now.
*/
- if (__wt_page_is_modified(page)) {
- mod->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
- }
+ __wt_page_modify_clear(session, page);
__wt_page_out(session, &page);
return (0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
index 9a0584d3217..b379712f6e7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_stat.c
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -51,6 +51,7 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
WT_STAT_SET(session, stats, btree_column_deleted, 0);
WT_STAT_SET(session, stats, btree_column_fix, 0);
WT_STAT_SET(session, stats, btree_column_internal, 0);
+ WT_STAT_SET(session, stats, btree_column_rle, 0);
WT_STAT_SET(session, stats, btree_column_variable, 0);
WT_STAT_SET(session, stats, btree_entries, 0);
WT_STAT_SET(session, stats, btree_overflow, 0);
@@ -114,12 +115,12 @@ __stat_page_col_var(
WT_COL *cip;
WT_INSERT *ins;
WT_UPDATE *upd;
- uint64_t deleted_cnt, entry_cnt, ovfl_cnt;
+ uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt;
uint32_t i;
int orig_deleted;
unpack = &_unpack;
- deleted_cnt = entry_cnt = ovfl_cnt = 0;
+ deleted_cnt = entry_cnt = ovfl_cnt = rle_cnt = 0;
WT_STAT_INCR(session, stats, btree_column_variable);
@@ -140,8 +141,10 @@ __stat_page_col_var(
__wt_cell_unpack(cell, unpack);
if (unpack->type == WT_CELL_ADDR_DEL)
orig_deleted = 1;
- else
+ else {
entry_cnt += __wt_cell_rle(unpack);
+ rle_cnt += __wt_cell_rle(unpack) - 1;
+ }
if (unpack->ovfl)
++ovfl_cnt;
}
@@ -173,6 +176,7 @@ __stat_page_col_var(
++entry_cnt;
WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt);
+ WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt);
WT_STAT_INCRV(session, stats, btree_entries, entry_cnt);
WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
index 3f615babb07..1fd660d4cd4 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -245,9 +245,6 @@ err: /* Inform the underlying block manager we're done. */
if (ckptbase != NULL)
__wt_meta_ckptlist_free(session, ckptbase);
- /* Wrap up reporting. */
- WT_TRET(__wt_progress(session, NULL, vs->fcnt));
-
/* Free allocated memory. */
__wt_scr_free(session, &vs->max_key);
__wt_scr_free(session, &vs->max_addr);
@@ -343,9 +340,10 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
* of the page to be built, and then a subsequent logical verification
* which happens here.
*
- * Report progress every 10 pages.
+ * Report progress occasionally.
*/
- if (++vs->fcnt % 10 == 0)
+#define WT_VERIFY_PROGRESS_INTERVAL 100
+ if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0)
WT_RET(__wt_progress(session, NULL, vs->fcnt));
#ifdef HAVE_DIAGNOSTIC
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
index 095e439786c..38396facc3d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -71,19 +71,20 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_INT:
case WT_PAGE_COL_VAR:
- if (dsk->recno != 0)
+ if (dsk->recno != WT_RECNO_OOB)
break;
WT_RET_VRFY(session,
- "%s page at %s has a record number of zero",
- __wt_page_type_string(dsk->type), tag);
+ "%s page at %s has an invalid record number of %d",
+ __wt_page_type_string(dsk->type), tag, WT_RECNO_OOB);
case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_OVFL:
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- if (dsk->recno == 0)
+ if (dsk->recno == WT_RECNO_OOB)
break;
WT_RET_VRFY(session,
- "%s page at %s has a non-zero record number",
+ "%s page at %s has a record number, which is illegal for "
+ "this page type",
__wt_page_type_string(dsk->type), tag);
}
@@ -91,8 +92,6 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
flags = dsk->flags;
if (LF_ISSET(WT_PAGE_COMPRESSED))
LF_CLR(WT_PAGE_COMPRESSED);
- if (LF_ISSET(WT_PAGE_ENCRYPTED))
- LF_CLR(WT_PAGE_ENCRYPTED);
if (dsk->type == WT_PAGE_ROW_LEAF) {
if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) &&
LF_ISSET(WT_PAGE_EMPTY_V_NONE))
@@ -105,6 +104,10 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
if (LF_ISSET(WT_PAGE_EMPTY_V_NONE))
LF_CLR(WT_PAGE_EMPTY_V_NONE);
}
+ if (LF_ISSET(WT_PAGE_ENCRYPTED))
+ LF_CLR(WT_PAGE_ENCRYPTED);
+ if (LF_ISSET(WT_PAGE_LAS_UPDATE))
+ LF_CLR(WT_PAGE_LAS_UPDATE);
if (flags != 0)
WT_RET_VRFY(session,
"page at %s has invalid flags set: 0x%" PRIx8,
diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c
index fb7c9a1ce90..cbc5143698b 100644
--- a/src/third_party/wiredtiger/src/btree/col_modify.c
+++ b/src/third_party/wiredtiger/src/btree/col_modify.c
@@ -17,7 +17,7 @@ static int __col_insert_alloc(
*/
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
- uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
+ uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove)
{
WT_BTREE *btree;
WT_DECL_RET;
@@ -25,7 +25,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
WT_INSERT_HEAD *ins_head, **ins_headp;
WT_ITEM _value;
WT_PAGE *page;
- WT_UPDATE *old_upd;
+ WT_UPDATE *old_upd, *upd;
size_t ins_size, upd_size;
u_int i, skipdepth;
int append, logged;
@@ -33,6 +33,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
btree = cbt->btree;
ins = NULL;
page = cbt->ref->page;
+ upd = upd_arg;
append = logged = 0;
/* This code expects a remove to have a NULL value. */
@@ -48,10 +49,10 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* There's some chance the application specified a record past
* the last record on the page. If that's the case, and we're
* inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
- * append list, not the update list. In addition, a recno of 0
+ * append list, not the update list. Also, an out-of-band recno
* implies an append operation, we're allocating a new row.
*/
- if (recno == 0 ||
+ if (recno == WT_RECNO_OOB ||
recno > (btree->type == BTREE_COL_VAR ?
__col_var_last_recno(page) : __col_fix_last_recno(page)))
append = 1;
@@ -76,7 +77,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* If we are restoring updates that couldn't be evicted, the
* key must not exist on the new page.
*/
- WT_ASSERT(session, upd == NULL);
+ WT_ASSERT(session, upd_arg == NULL);
/* Make sure the update can proceed. */
WT_ERR(__wt_txn_update_check(
@@ -134,7 +135,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
cbt->ins_head = ins_head;
cbt->ins = ins;
- if (upd == NULL) {
+ if (upd_arg == NULL) {
WT_ERR(
__wt_update_alloc(session, value, &upd, &upd_size));
WT_ERR(__wt_txn_modify(session, upd));
@@ -160,7 +161,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* The serial mutex acts as our memory barrier to flush these
* writes before inserting them into the list.
*/
- if (cbt->ins_stack[0] == NULL || recno == 0)
+ if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB)
for (i = 0; i < skipdepth; i++) {
cbt->ins_stack[i] = &ins_head->head[i];
ins->next[i] = cbt->next_stack[i] = NULL;
@@ -192,7 +193,8 @@ err: /*
if (logged)
__wt_txn_unmodify(session);
__wt_free(session, ins);
- __wt_free(session, upd);
+ if (upd_arg == NULL)
+ __wt_free(session, upd);
}
return (ret);
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
index 49a749b8a02..888c54d1ec9 100644
--- a/src/third_party/wiredtiger/src/btree/row_modify.c
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -112,6 +112,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* there should only be one update list per key.
*/
WT_ASSERT(session, *upd_entry == NULL);
+
/*
* Set the "old" entry to the second update in the list
* so that the serialization function succeeds in
diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c
new file mode 100644
index 00000000000..e269e8702e1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cache/cache_las.c
@@ -0,0 +1,391 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_las_stats_update --
+ * Update the lookaside table statistics for return to the application.
+ */
+void
+__wt_las_stats_update(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS **cstats;
+ WT_DSRC_STATS **dstats;
+
+ conn = S2C(session);
+
+ /*
+ * Lookaside table statistics are copied from the underlying lookaside
+ * table data-source statistics. If there's no lookaside table, values
+ * remain 0. In the current system, there's always a lookaside table,
+ * but there's no reason not to be cautious.
+ */
+ if (conn->las_cursor == NULL)
+ return;
+
+ /*
+ * We have a cursor, and we need the underlying data handle; we can get
+ * to it by way of the underlying btree handle, but it's a little ugly.
+ */
+ cstats = conn->stats;
+ dstats = ((WT_CURSOR_BTREE *)conn->las_cursor)->btree->dhandle->stats;
+
+ WT_STAT_SET(session, cstats,
+ cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert));
+ WT_STAT_SET(session, cstats,
+ cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove));
+}
+
+/*
+ * __las_cursor_create --
+ * Open a new lookaside table cursor.
+ */
+static int
+__las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
+{
+ WT_BTREE *btree;
+ const char *open_cursor_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
+
+ WT_RET(__wt_open_cursor(
+ session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp));
+
+ /*
+ * Set special flags for the lookaside table: the lookaside flag (used,
+ * for example, to avoid writing records during reconciliation), also
+ * turn off checkpoints and logging.
+ *
+ * Test flags before setting them so updates can't race in subsequent
+ * opens (the first update is safe because it's single-threaded from
+ * wiredtiger_open).
+ */
+ btree = S2BT(session);
+ if (!F_ISSET(btree, WT_BTREE_LOOKASIDE))
+ F_SET(btree, WT_BTREE_LOOKASIDE);
+ if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ F_SET(btree, WT_BTREE_NO_CHECKPOINT);
+ if (!F_ISSET(btree, WT_BTREE_NO_LOGGING))
+ F_SET(btree, WT_BTREE_NO_LOGGING);
+
+ return (0);
+}
+
+/*
+ * __wt_las_create --
+ * Initialize the database's lookaside store.
+ */
+int
+__wt_las_create(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ const char *drop_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL };
+
+ conn = S2C(session);
+
+ /*
+ * Done at startup: we cannot do it on demand because we require the
+ * schema lock to create and drop the file, and it may not always be
+ * available.
+ *
+ * Open an internal session, used for the shared lookaside cursor.
+ *
+ * Sessions associated with a lookaside cursor should never be tapped
+ * for eviction.
+ */
+ WT_RET(__wt_open_internal_session(
+ conn, "lookaside table", 1, 1, &conn->las_session));
+ session = conn->las_session;
+ F_SET(session, WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION);
+
+ /* Discard any previous incarnation of the file. */
+ WT_RET(__wt_session_drop(session, WT_LAS_URI, drop_cfg));
+
+ /* Re-create the file. */
+ WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT));
+
+ /* Open the shared cursor. */
+ WT_WITHOUT_DHANDLE(session,
+ ret = __las_cursor_create(session, &conn->las_cursor));
+
+ return (ret);
+}
+
+/*
+ * __wt_las_destroy --
+ * Destroy the database's lookaside store.
+ */
+int
+__wt_las_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ if (conn->las_session == NULL)
+ return (0);
+
+ wt_session = &conn->las_session->iface;
+ ret = wt_session->close(wt_session, NULL);
+
+ conn->las_cursor = NULL;
+ conn->las_session = NULL;
+
+ return (ret);
+}
+
+/*
+ * __wt_las_set_written --
+ * Flag that the lookaside table has been written.
+ */
+void
+__wt_las_set_written(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ if (!conn->las_written) {
+ conn->las_written = true;
+
+ /*
+ * Push the flag: unnecessary, but from now page reads must deal
+ * with lookaside table records, and we only do the write once.
+ */
+ WT_FULL_BARRIER();
+ }
+}
+
+/*
+ * __wt_las_is_written --
+ * Return if the lookaside table has been written.
+ */
+bool
+__wt_las_is_written(WT_SESSION_IMPL *session)
+{
+ return (S2C(session)->las_written);
+}
+
+/*
+ * __wt_las_cursor --
+ * Return a lookaside cursor.
+ */
+int
+__wt_las_cursor(
+ WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ *cursorp = NULL;
+
+ /*
+ * We don't want to get tapped for eviction after we start using the
+ * lookaside cursor; save a copy of the current eviction state, we'll
+ * turn eviction off before we return.
+ *
+ * Don't cache lookaside table pages, we're here because of eviction
+ * problems and there's no reason to believe lookaside pages will be
+ * useful more than once.
+ */
+ *session_flags =
+ F_ISSET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+
+ conn = S2C(session);
+
+ /* Eviction and sweep threads have their own lookaside table cursors. */
+ if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
+ if (session->las_cursor == NULL) {
+ WT_WITHOUT_DHANDLE(session, ret =
+ __las_cursor_create(session, &session->las_cursor));
+ WT_RET(ret);
+ }
+
+ *cursorp = session->las_cursor;
+ } else {
+ /* Lock the shared lookaside cursor. */
+ __wt_spin_lock(session, &conn->las_lock);
+
+ *cursorp = conn->las_cursor;
+ }
+
+ /* Turn caching and eviction off. */
+ F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+
+ return (0);
+}
+
+/*
+ * __wt_las_cursor_close --
+ * Discard a lookaside cursor.
+ */
+int
+__wt_las_cursor_close(
+ WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ if ((cursor = *cursorp) == NULL)
+ return (0);
+ *cursorp = NULL;
+
+ /* Reset the cursor. */
+ ret = cursor->reset(cursor);
+
+ /*
+ * We turned off caching and eviction while the lookaside cursor was in
+ * use, restore the session's flags.
+ */
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ F_SET(session, session_flags);
+
+ /*
+ * Eviction and sweep threads have their own lookaside table cursors;
+ * else, unlock the shared lookaside cursor.
+ */
+ if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
+ __wt_spin_unlock(session, &conn->las_lock);
+
+ return (ret);
+}
+
+/*
+ * __wt_las_sweep --
+ * Sweep the lookaside table.
+ */
+int
+__wt_las_sweep(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(las_addr);
+ WT_DECL_ITEM(las_key);
+ WT_DECL_RET;
+ WT_ITEM *key;
+ uint64_t cnt, las_counter, las_txnid;
+ uint32_t las_id, session_flags;
+ int notused;
+
+ conn = S2C(session);
+ cursor = NULL;
+ key = &conn->las_sweep_key;
+ session_flags = 0; /* [-Werror=maybe-uninitialized] */
+
+ WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_key));
+
+ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+
+ /*
+ * If we're not starting a new sweep, position the cursor using the key
+ * from the last call (we don't care if we're before or after the key,
+ * just roughly in the same spot is fine).
+ */
+ if (conn->las_sweep_call != 0 && key->data != NULL) {
+ __wt_cursor_set_raw_key(cursor, key);
+ if ((ret = cursor->search_near(cursor, &notused)) != 0)
+ goto srch_notfound;
+ }
+
+ /*
+ * The sweep server wakes up every 10 seconds (by default), it's a slow
+ * moving thread. Try to review the entire lookaside table once every 5
+ * minutes, or every 30 calls.
+ *
+ * The reason is because the lookaside table exists because we're seeing
+ * cache/eviction pressure (it allows us to trade performance and disk
+ * space for cache space), and it's likely lookaside blocks are being
+ * evicted, and reading them back in doesn't help things. A trickier,
+ * but possibly better, alternative might be to review all lookaside
+ * blocks in the cache in order to get rid of them, and slowly review
+ * lookaside blocks that have already been evicted.
+ *
+ * We can't know for sure how many records are in the lookaside table,
+ * the cursor insert and remove statistics aren't updated atomically.
+ * Start with reviewing 100 rows, and if it takes more than the target
+ * number of calls to finish, increase the number of rows checked on
+ * each call; if it takes less than the target calls to finish, then
+ * decrease the number of rows reviewed on each call (but never less
+ * than 100).
+ */
+#define WT_SWEEP_LOOKASIDE_MIN_CNT 100
+#define WT_SWEEP_LOOKASIDE_PASS_TARGET 30
+ ++conn->las_sweep_call;
+ if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT)
+ cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT;
+
+ /* Walk the file. */
+ for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
+ /*
+ * If the loop terminates after completing a work unit, we will
+ * continue the table sweep next time. Get a local copy of the
+ * sweep key, we're going to reset the cursor; do so before
+ * calling cursor.remove, cursor.remove can discard our hazard
+ * pointer and the page could be evicted from underneath us.
+ */
+ if (cnt == 1) {
+ WT_ERR(__wt_cursor_get_raw_key(cursor, key));
+ if (!WT_DATA_IN_ITEM(key))
+ WT_ERR(__wt_buf_set(
+ session, key, key->data, key->size));
+ }
+
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, las_addr, &las_counter, &las_txnid, las_key));
+
+ /*
+ * If the on-page record transaction ID associated with the
+ * record is globally visible, the record can be discarded.
+ *
+ * Cursor opened overwrite=true: won't return WT_NOTFOUND should
+ * another thread remove the record before we do, and the cursor
+ * remains positioned in that case.
+ */
+ if (__wt_txn_visible_all(session, las_txnid))
+ WT_ERR(cursor->remove(cursor));
+ }
+
+ /*
+ * When reaching the lookaside table end or the target number of calls,
+ * adjust the row count. Decrease/increase the row count depending on
+ * if the number of calls is less/more than the target.
+ */
+ if (ret == WT_NOTFOUND ||
+ conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) {
+ if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET &&
+ conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT)
+ conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT;
+ if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET)
+ conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT;
+ }
+
+srch_notfound:
+ if (ret == WT_NOTFOUND)
+ conn->las_sweep_call = 0;
+
+ WT_ERR_NOTFOUND_OK(ret);
+
+ if (0) {
+err: __wt_buf_free(session, key);
+ }
+
+ WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ __wt_scr_free(session, &las_addr);
+ __wt_scr_free(session, &las_key);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index 73837c46ee8..91cfcedfcaf 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -76,6 +76,7 @@ static const WT_CONFIG_CHECK
confchk_wiredtiger_open_shared_cache_subconfigs[] = {
{ "chunk", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "name", "string", NULL, NULL, NULL, 0 },
+ { "quota", "int", NULL, NULL, NULL, 0 },
{ "reserve", "int", NULL, NULL, NULL, 0 },
{ "size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
@@ -121,7 +122,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
{ "lsm_merge", "boolean", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -520,7 +521,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -595,7 +596,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -668,7 +669,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -740,7 +741,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -807,8 +808,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"eviction_dirty_trigger=95,eviction_target=80,eviction_trigger=95"
",file_manager=(close_handle_minimum=250,close_idle_time=30,"
"close_scan_interval=10),lsm_manager=(merge=,worker_thread_max=4)"
- ",lsm_merge=,shared_cache=(chunk=10MB,name=,reserve=0,size=500MB)"
- ",statistics=none,statistics_log=(on_close=0,"
+ ",lsm_merge=,shared_cache=(chunk=10MB,name=,quota=0,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
"path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=",
confchk_WT_CONNECTION_reconfigure, 17
@@ -959,9 +960,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),use_environment_priv=0,verbose=",
confchk_wiredtiger_open, 34
@@ -979,9 +980,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),use_environment_priv=0,verbose=,version=(major=0,"
"minor=0)",
@@ -999,9 +1000,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),verbose=,version=(major=0,minor=0)",
confchk_wiredtiger_open_basecfg, 31
@@ -1018,9 +1019,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),verbose=",
confchk_wiredtiger_open_usercfg, 30
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index b28fca3a71b..b1155d06826 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -2031,11 +2031,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR(__wt_turtle_init(session));
WT_ERR(__wt_metadata_open(session));
- /*
- * Start the worker threads last.
- */
+ /* Start the worker threads and run recovery. */
WT_ERR(__wt_connection_workers(session, cfg));
+ /* Create the lookaside table. */
+ WT_ERR(__wt_las_create(session));
+
WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
*wt_connp = &conn->iface;
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
index fdc95a32387..aaae58ef168 100644
--- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
+++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
@@ -22,21 +22,22 @@
*/
#define WT_CACHE_POOL_REDUCE_THRESHOLD 20
/* Balancing passes after a bump before a connection is a candidate. */
-#define WT_CACHE_POOL_BUMP_SKIPS 10
+#define WT_CACHE_POOL_BUMP_SKIPS 5
/* Balancing passes after a reduction before a connection is a candidate. */
-#define WT_CACHE_POOL_REDUCE_SKIPS 5
+#define WT_CACHE_POOL_REDUCE_SKIPS 10
/*
* Constants that control how much influence different metrics have on
* the pressure calculation.
*/
-#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 10
-#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 50
+#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3
+#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6
#define WT_CACHE_POOL_READ_MULTIPLIER 1
-static int __cache_pool_adjust(WT_SESSION_IMPL *, uint64_t, uint64_t, int *);
+static int __cache_pool_adjust(
+ WT_SESSION_IMPL *, uint64_t, uint64_t, int, int *);
static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *);
-static int __cache_pool_balance(WT_SESSION_IMPL *);
+static int __cache_pool_balance(WT_SESSION_IMPL *, int);
/*
* __wt_cache_pool_config --
@@ -51,7 +52,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
WT_DECL_RET;
char *pool_name;
int created, updating;
- uint64_t chunk, reserve, size, used_cache;
+ uint64_t chunk, quota, reserve, size, used_cache;
conn = S2C(session);
created = updating = 0;
@@ -142,6 +143,11 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
chunk = (uint64_t)cval.val;
else
chunk = cp->chunk;
+ if (__wt_config_gets(session, &cfg[1],
+ "shared_cache.quota", &cval) == 0 && cval.val != 0)
+ quota = (uint64_t)cval.val;
+ else
+ quota = cp->quota;
} else {
/*
* The only time shared cache configuration uses default
@@ -155,6 +161,9 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
session, cfg, "shared_cache.chunk", &cval));
WT_ASSERT(session, cval.val != 0);
chunk = (uint64_t)cval.val;
+ WT_ERR(__wt_config_gets(
+ session, cfg, "shared_cache.quota", &cval));
+ quota = (uint64_t)cval.val;
}
/*
@@ -197,8 +206,10 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
/* The configuration is verified - it's safe to update the pool. */
cp->size = size;
cp->chunk = chunk;
+ cp->quota = quota;
conn->cache->cp_reserved = reserve;
+ conn->cache->cp_quota = quota;
/* Wake up the cache pool server so any changes are noticed. */
if (updating)
@@ -402,7 +413,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
* effectively used.
*/
static int
-__cache_pool_balance(WT_SESSION_IMPL *session)
+__cache_pool_balance(WT_SESSION_IMPL *session, int forward)
{
WT_CACHE_POOL *cp;
WT_DECL_RET;
@@ -421,16 +432,16 @@ __cache_pool_balance(WT_SESSION_IMPL *session)
WT_ERR(__cache_pool_assess(session, &highest));
bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
+
/*
* Actively attempt to:
* - Reduce the amount allocated, if we are over the budget
* - Increase the amount used if there is capacity and any pressure.
*/
- for (bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
- F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
- F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN);) {
+ while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) {
WT_ERR(__cache_pool_adjust(
- session, highest, bump_threshold, &adjusted));
+ session, highest, bump_threshold, forward, &adjusted));
/*
* Stop if the amount of cache being used is stable, and we
* aren't over capacity.
@@ -456,30 +467,39 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
WT_CACHE *cache;
WT_CONNECTION_IMPL *entry;
uint64_t app_evicts, app_waits, reads;
- uint64_t entries, highest, tmp;
+ uint64_t balanced_size, entries, highest, tmp;
cp = __wt_process.cache_pool;
- entries = 0;
+ balanced_size = entries = 0;
highest = 1; /* Avoid divide by zero */
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+ if (entry->cache_size == 0 || entry->cache == NULL)
+ continue;
+ ++entries;
+ }
+
+ if (entries > 0)
+ balanced_size = cp->currently_used / entries;
+
/* Generate read pressure information. */
TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
- if (entry->cache_size == 0 ||
- entry->cache == NULL)
+ if (entry->cache_size == 0 || entry->cache == NULL)
continue;
cache = entry->cache;
- ++entries;
/*
* Figure out a delta since the last time we did an assessment
* for each metric we are tracking. Watch out for wrapping
* of values.
+ *
+ * Count pages read, assuming pages are 4KB.
*/
- tmp = cache->bytes_read;
+ tmp = cache->bytes_read >> 12;
if (tmp >= cache->cp_saved_read)
reads = tmp - cache->cp_saved_read;
else
- reads = (UINT64_MAX - cache->cp_saved_read) + tmp;
+ reads = tmp;
cache->cp_saved_read = tmp;
/* Update the application eviction count information */
@@ -500,12 +520,19 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
(UINT64_MAX - cache->cp_saved_app_waits) + tmp;
cache->cp_saved_app_waits = tmp;
- /* Calculate the weighted pressure for this member */
- cache->cp_pass_pressure =
- (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) +
+ /* Calculate the weighted pressure for this member. */
+ tmp = (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) +
(app_waits * WT_CACHE_POOL_APP_WAIT_MULTIPLIER) +
(reads * WT_CACHE_POOL_READ_MULTIPLIER);
+ /* Weight smaller caches higher. */
+ tmp = (uint64_t)(tmp *
+ ((double)balanced_size / entry->cache_size));
+
+ /* Smooth over history. */
+ cache->cp_pass_pressure =
+ (9 * cache->cp_pass_pressure + tmp) / 10;
+
if (cache->cp_pass_pressure > highest)
highest = cache->cp_pass_pressure;
@@ -524,24 +551,25 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
/*
* __cache_pool_adjust --
- * Adjust the allocation of cache to each connection. If force is set
+ * Adjust the allocation of cache to each connection. If full is set
* ignore cache load information, and reduce the allocation for every
* connection allocated more than their reserved size.
*/
static int
__cache_pool_adjust(WT_SESSION_IMPL *session,
- uint64_t highest, uint64_t bump_threshold, int *adjustedp)
+ uint64_t highest, uint64_t bump_threshold, int forward, int *adjustedp)
{
WT_CACHE_POOL *cp;
WT_CACHE *cache;
WT_CONNECTION_IMPL *entry;
- uint64_t adjusted, highest_percentile, pressure, reserved;
- int force, grew;
+ uint64_t adjustment, highest_percentile, pressure, reserved, smallest;
+ int busy, pool_full, grow;
+ u_int pct_full;
*adjustedp = 0;
cp = __wt_process.cache_pool;
- force = (cp->currently_used > cp->size);
- grew = 0;
+ grow = 0;
+ pool_full = (cp->currently_used >= cp->size);
/* Highest as a percentage, avoid 0 */
highest_percentile = (highest / 100) + 1;
@@ -549,13 +577,17 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
WT_RET(__wt_verbose(session,
WT_VERB_SHARED_CACHE, "Cache pool distribution: "));
WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
- "\t" "cache_size, pressure, skips: "));
+ "\t" "cache (MB), pressure, skips, busy, %% full:"));
}
- TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+ for (entry = forward ? TAILQ_FIRST(&cp->cache_pool_qh) :
+ TAILQ_LAST(&cp->cache_pool_qh, __wt_cache_pool_qh);
+ entry != NULL;
+ entry = forward ? TAILQ_NEXT(entry, cpq) :
+ TAILQ_PREV(entry, __wt_cache_pool_qh, cpq)) {
cache = entry->cache;
reserved = cache->cp_reserved;
- adjusted = 0;
+ adjustment = 0;
/*
* The read pressure is calculated as a percentage of how
@@ -565,84 +597,109 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
* assigned.
*/
pressure = cache->cp_pass_pressure / highest_percentile;
+ busy = __wt_eviction_needed(entry->default_session, &pct_full);
+
WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
- "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32,
- entry->cache_size, pressure, cache->cp_skip_count));
+ "\t%5" PRIu64 ", %3" PRIu64 ", %2" PRIu32 ", %d, %2u",
+ entry->cache_size >> 20, pressure, cache->cp_skip_count,
+ busy, pct_full));
/* Allow to stabilize after changes. */
if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0)
continue;
+
/*
* If the entry is currently allocated less than the reserved
- * size, increase it's allocation. This should only happen if:
- * - It's the first time we've seen this member
- * - The reserved size has been adjusted
+ * size, increase its allocation. This should only happen if:
+ * - it's the first time we've seen this member, or
+ * - the reserved size has been adjusted
*/
if (entry->cache_size < reserved) {
- grew = 1;
- adjusted = reserved - entry->cache_size;
-
+ grow = 1;
+ adjustment = reserved - entry->cache_size;
/*
* Conditions for reducing the amount of resources for an
* entry:
- * - If we are forcing and this entry has more than the
- * minimum amount of space in use.
- * - If the read pressure in this entry is below the
- * threshold, other entries need more cache, the entry has
- * more than the minimum space and there is no available
- * space in the pool.
+ * - the pool is full,
+ * - application threads are not busy doing eviction already,
+ * - this entry has more than the minimum amount of space in
+ * use,
+ * - the read pressure in this entry is below the threshold,
+ * other entries need more cache, the entry has more than
+ * the minimum space and there is no available space in the
+ * pool.
*/
- } else if ((force && entry->cache_size > reserved) ||
- (pressure < WT_CACHE_POOL_REDUCE_THRESHOLD &&
- highest > 1 && entry->cache_size > reserved &&
- cp->currently_used >= cp->size)) {
- grew = 0;
+ } else if (pool_full && !busy &&
+ entry->cache_size > reserved &&
+ pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) {
+ grow = 0;
/*
- * Shrink by a chunk size if that doesn't drop us
- * below the reserved size.
+ * Don't drop the size down too much - or it can
+ * trigger aggressive eviction in the connection,
+ * which is likely to lead to lower throughput and
+ * potentially a negative feedback loop in the
+ * balance algorithm.
*/
- if (entry->cache_size > cp->chunk + reserved)
- adjusted = cp->chunk;
- else
- adjusted = entry->cache_size - reserved;
+ smallest = (100 * __wt_cache_bytes_inuse(cache)) /
+ cache->eviction_trigger;
+ if (entry->cache_size > smallest)
+ adjustment = WT_MIN(cp->chunk,
+ (entry->cache_size - smallest) / 2);
+ adjustment =
+ WT_MIN(adjustment, entry->cache_size - reserved);
/*
* Conditions for increasing the amount of resources for an
* entry:
- * - There was some activity across the pool
- * - This entry is using less than the entire cache pool
- * - The connection is using enough cache to require eviction
- * - There is space available in the pool
- * - Additional cache would benefit the connection OR
- * - The pool is less than half distributed
+ * - there is space available in the pool
+ * - the connection isn't over quota
+ * - the connection is using enough cache to require eviction
+ * - there was some activity across the pool
+ * - this entry is using less than the entire cache pool
+ * - additional cache would benefit the connection OR
+ * - the pool is less than half distributed
*/
- } else if (entry->cache_size < cp->size &&
+ } else if (!pool_full &&
+ (cache->cp_quota == 0 ||
+ entry->cache_size < cache->cp_quota) &&
__wt_cache_bytes_inuse(cache) >=
(entry->cache_size * cache->eviction_target) / 100 &&
- ((cp->currently_used < cp->size &&
- pressure > bump_threshold) ||
+ (pressure > bump_threshold ||
cp->currently_used < cp->size * 0.5)) {
- grew = 1;
- adjusted = WT_MIN(cp->chunk,
- cp->size - cp->currently_used);
+ grow = 1;
+ adjustment = WT_MIN(WT_MIN(cp->chunk,
+ cp->size - cp->currently_used),
+ cache->cp_quota - entry->cache_size);
}
- if (adjusted > 0) {
+ /*
+ * Bounds checking: don't go over the pool size or under the
+ * reserved size for this cache.
+ *
+ * Shrink by a chunk size if that doesn't drop us
+ * below the reserved size.
+ *
+ * Limit the reduction to half of the free space in the
+ * connection's cache. This should reduce cache sizes
+ * gradually without stalling application threads.
+ */
+ if (adjustment > 0) {
*adjustedp = 1;
- if (grew > 0) {
+ if (grow) {
cache->cp_skip_count = WT_CACHE_POOL_BUMP_SKIPS;
- entry->cache_size += adjusted;
- cp->currently_used += adjusted;
+ entry->cache_size += adjustment;
+ cp->currently_used += adjustment;
} else {
cache->cp_skip_count =
WT_CACHE_POOL_REDUCE_SKIPS;
WT_ASSERT(session,
- entry->cache_size >= adjusted &&
- cp->currently_used >= adjusted);
- entry->cache_size -= adjusted;
- cp->currently_used -= adjusted;
+ entry->cache_size >= adjustment &&
+ cp->currently_used >= adjustment);
+ entry->cache_size -= adjustment;
+ cp->currently_used -= adjustment;
}
WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
"Allocated %s%" PRId64 " to %s",
- grew ? "" : "-", adjusted, entry->home));
+ grow ? "" : "-", adjustment, entry->home));
+
/*
* TODO: Add a loop waiting for connection to give up
* cache.
@@ -663,11 +720,13 @@ __wt_cache_pool_server(void *arg)
WT_CACHE_POOL *cp;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ int forward;
session = (WT_SESSION_IMPL *)arg;
cp = __wt_process.cache_pool;
cache = S2C(session)->cache;
+ forward = 1;
while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
F_ISSET(cache, WT_CACHE_POOL_RUN)) {
@@ -695,8 +754,10 @@ __wt_cache_pool_server(void *arg)
* Continue even if there was an error. Details of errors are
* reported in the balance function.
*/
- if (F_ISSET(cache, WT_CACHE_POOL_MANAGER))
- (void)__cache_pool_balance(session);
+ if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) {
+ (void)__cache_pool_balance(session, forward);
+ forward = !forward;
+ }
}
if (0) {
diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c
index 1c4a631cc59..7a8a6cba838 100644
--- a/src/third_party/wiredtiger/src/conn/conn_handle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_handle.c
@@ -55,6 +55,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list"));
WT_RET(__wt_rwlock_alloc(session,
&conn->hot_backup_lock, "hot backup"));
+ WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table"));
WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure"));
WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema"));
WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation"));
@@ -140,6 +141,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->encryptor_lock);
__wt_spin_destroy(session, &conn->fh_lock);
WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock));
+ __wt_spin_destroy(session, &conn->las_lock);
__wt_spin_destroy(session, &conn->reconfig_lock);
__wt_spin_destroy(session, &conn->schema_lock);
__wt_spin_destroy(session, &conn->table_lock);
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index dae0293d790..2b115190b06 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -287,8 +287,9 @@ __log_file_server(void *arg)
WT_DECL_RET;
WT_FH *close_fh;
WT_LOG *log;
- WT_LSN close_end_lsn, close_lsn, min_lsn;
+ WT_LSN close_end_lsn, min_lsn;
WT_SESSION_IMPL *session;
+ uint32_t filenum;
int locked;
session = arg;
@@ -300,66 +301,97 @@ __log_file_server(void *arg)
* If there is a log file to close, make sure any outstanding
* write operations have completed, then fsync and close it.
*/
- if ((close_fh = log->log_close_fh) != NULL &&
- (ret = __wt_log_extract_lognum(session, close_fh->name,
- &close_lsn.file)) == 0 &&
- close_lsn.file < log->write_lsn.file) {
+ if ((close_fh = log->log_close_fh) != NULL) {
+ WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
+ &filenum));
/*
- * We've copied the file handle, clear out the one in
- * log structure to allow it to be set again.
+ * We update the close file handle before updating the
+ * close LSN when changing files. It is possible we
+ * could see mismatched settings. If we do, yield
+ * until it is set. This should rarely happen.
*/
- log->log_close_fh = NULL;
- /*
- * Set the close_end_lsn to the LSN immediately after
- * ours. That is, the beginning of the next log file.
- * We need to know the LSN file number of our own close
- * in case earlier calls are still in progress and the
- * next one to move the sync_lsn into the next file for
- * later syncs.
- */
- close_lsn.offset = 0;
- close_end_lsn = close_lsn;
- close_end_lsn.file++;
- WT_ERR(__wt_fsync(session, close_fh));
- __wt_spin_lock(session, &log->log_sync_lock);
- locked = 1;
- WT_ERR(__wt_close(session, &close_fh));
- WT_ASSERT(session,
- WT_LOG_CMP(&close_end_lsn, &log->sync_lsn) >= 0);
- log->sync_lsn = close_end_lsn;
- WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
- locked = 0;
- __wt_spin_unlock(session, &log->log_sync_lock);
+ while (log->log_close_lsn.file < filenum)
+ __wt_yield();
+
+ if (__wt_log_cmp(
+ &log->write_lsn, &log->log_close_lsn) >= 0) {
+ /*
+ * We've copied the file handle, clear out the
+ * one in the log structure to allow it to be
+ * set again. Copy the LSN before clearing
+ * the file handle.
+ * Use a barrier to make sure the compiler does
+ * not reorder the following two statements.
+ */
+ close_end_lsn = log->log_close_lsn;
+ WT_FULL_BARRIER();
+ log->log_close_fh = NULL;
+ /*
+ * Set the close_end_lsn to the LSN immediately
+ * after ours. That is, the beginning of the
+ * next log file. We need to know the LSN
+ * file number of our own close in case earlier
+ * calls are still in progress and the next one
+ * to move the sync_lsn into the next file for
+ * later syncs.
+ */
+ close_end_lsn.file++;
+ close_end_lsn.offset = 0;
+ WT_ERR(__wt_fsync(session, close_fh));
+ __wt_spin_lock(session, &log->log_sync_lock);
+ locked = 1;
+ WT_ERR(__wt_close(session, &close_fh));
+ WT_ASSERT(session, __wt_log_cmp(
+ &close_end_lsn, &log->sync_lsn) >= 0);
+ log->sync_lsn = close_end_lsn;
+ WT_ERR(__wt_cond_signal(
+ session, log->log_sync_cond));
+ locked = 0;
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ }
}
/*
* If a later thread asked for a background sync, do it now.
*/
- if (WT_LOG_CMP(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
+ if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
/*
* Save the latest write LSN which is the minimum
* we will have written to disk.
*/
min_lsn = log->write_lsn;
/*
- * The sync LSN we asked for better be smaller than
- * the current written LSN.
+ * We have to wait until the LSN we asked for is
+ * written. If it isn't signal the wrlsn thread
+ * to get it written.
*/
- WT_ASSERT(session,
- WT_LOG_CMP(&log->bg_sync_lsn, &min_lsn) <= 0);
- WT_ERR(__wt_fsync(session, log->log_fh));
- __wt_spin_lock(session, &log->log_sync_lock);
- locked = 1;
- /*
- * The sync LSN could have advanced while we were
- * writing to disk.
- */
- if (WT_LOG_CMP(&log->sync_lsn, &min_lsn) <= 0) {
- log->sync_lsn = min_lsn;
+ if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
+ WT_ERR(__wt_fsync(session, log->log_fh));
+ __wt_spin_lock(session, &log->log_sync_lock);
+ locked = 1;
+ /*
+ * The sync LSN could have advanced while we
+ * were writing to disk.
+ */
+ if (__wt_log_cmp(
+ &log->sync_lsn, &min_lsn) <= 0) {
+ log->sync_lsn = min_lsn;
+ WT_ERR(__wt_cond_signal(
+ session, log->log_sync_cond));
+ }
+ locked = 0;
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ } else {
WT_ERR(__wt_cond_signal(
- session, log->log_sync_cond));
+ session, conn->log_wrlsn_cond));
+ /*
+ * We do not want to wait potentially a second
+ * to process this. Yield to give the wrlsn
+ * thread a chance to run and try again in
+ * this case.
+ */
+ __wt_yield();
+ continue;
}
- locked = 0;
- __wt_spin_unlock(session, &log->log_sync_lock);
}
/* Wait until the next event. */
WT_ERR(__wt_cond_wait(
@@ -394,26 +426,29 @@ typedef struct {
/*
* __wt_log_wrlsn --
* Process written log slots and attempt to coalesce them if the LSNs
- * are contiguous. Returns 1 if slots were freed, 0 if no slots were
- * freed in the progress arg. Must be called with the log slot lock held.
+ * are contiguous. The purpose of this function is to advance the
+ * write_lsn in LSN order after the buffer is written to the log file.
*/
int
-__wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
+__wt_log_wrlsn(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_LOG *log;
WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
WT_LOGSLOT *coalescing, *slot;
+ WT_LSN save_lsn;
size_t written_i;
uint32_t i, save_i;
conn = S2C(session);
log = conn->log;
+ __wt_spin_lock(session, &log->log_writelsn_lock);
+restart:
coalescing = NULL;
+ WT_INIT_LSN(&save_lsn);
written_i = 0;
i = 0;
- if (free_i != NULL)
- *free_i = WT_SLOT_POOL;
/*
* Walk the array once saving any slots that are in the
@@ -422,9 +457,14 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
while (i < WT_SLOT_POOL) {
save_i = i;
slot = &log->slot_pool[i++];
- if (free_i != NULL && *free_i == WT_SLOT_POOL &&
- slot->slot_state == WT_LOG_SLOT_FREE)
- *free_i = save_i;
+ /*
+ * XXX - During debugging I saw slot 0 become orphaned.
+ * I believe it is fixed, but check for now.
+ * This assertion should catch that.
+ */
+ if (slot->slot_state == 0)
+ WT_ASSERT(session,
+ slot->slot_release_lsn.file >= log->write_lsn.file);
if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
continue;
written[written_i].slot_index = save_i;
@@ -435,15 +475,8 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
* based on the release LSN, and then look for them in order.
*/
if (written_i > 0) {
- /*
- * If wanted, reset the yield variable to indicate that we
- * have found written slots.
- */
- if (yield != NULL)
- *yield = 0;
WT_INSERTION_SORT(written, written_i,
WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);
-
/*
* We know the written array is sorted by LSN. Go
* through them either advancing write_lsn or coalesce
@@ -451,8 +484,28 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
*/
for (i = 0; i < written_i; i++) {
slot = &log->slot_pool[written[i].slot_index];
+ /*
+ * The log server thread pushes out slots periodically.
+ * Sometimes they are empty slots. If we find an
+ * empty slot, where empty means the start and end LSN
+ * are the same, free it and continue.
+ */
+ if (__wt_log_cmp(&slot->slot_start_lsn,
+ &slot->slot_release_lsn) == 0 &&
+ __wt_log_cmp(&slot->slot_start_lsn,
+ &slot->slot_end_lsn) == 0) {
+ __wt_log_slot_free(session, slot);
+ continue;
+ }
if (coalescing != NULL) {
- if (WT_LOG_CMP(&coalescing->slot_end_lsn,
+ /*
+ * If the write_lsn changed, we may be able to
+ * process slots. Try again.
+ */
+ if (__wt_log_cmp(
+ &log->write_lsn, &save_lsn) != 0)
+ goto restart;
+ if (__wt_log_cmp(&coalescing->slot_end_lsn,
&written[i].lsn) != 0) {
coalescing = slot;
continue;
@@ -461,6 +514,8 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
* If we get here we have a slot to coalesce
* and free.
*/
+ coalescing->slot_last_offset =
+ slot->slot_last_offset;
coalescing->slot_end_lsn = slot->slot_end_lsn;
WT_STAT_FAST_CONN_INCR(
session, log_slot_coalesced);
@@ -473,8 +528,12 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
/*
* If this written slot is not the next LSN,
* try to start coalescing with later slots.
+ * A synchronous write may update write_lsn
+ * so save the last one we saw to check when
+ * coalescing slots.
*/
- if (WT_LOG_CMP(
+ save_lsn = log->write_lsn;
+ if (__wt_log_cmp(
&log->write_lsn, &written[i].lsn) != 0) {
coalescing = slot;
continue;
@@ -483,27 +542,29 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
* If we get here we have a slot to process.
* Advance the LSN and process the slot.
*/
- WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn,
+ WT_ASSERT(session, __wt_log_cmp(&written[i].lsn,
&slot->slot_release_lsn) == 0);
+ if (slot->slot_start_lsn.offset !=
+ slot->slot_last_offset)
+ slot->slot_start_lsn.offset =
+ slot->slot_last_offset;
log->write_start_lsn = slot->slot_start_lsn;
log->write_lsn = slot->slot_end_lsn;
- WT_RET(__wt_cond_signal(
+ WT_ERR(__wt_cond_signal(
session, log->log_write_cond));
WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
/*
* Signal the close thread if needed.
*/
if (F_ISSET(slot, WT_SLOT_CLOSEFH))
- WT_RET(__wt_cond_signal(
+ WT_ERR(__wt_cond_signal(
session, conn->log_file_cond));
}
- WT_RET(__wt_log_slot_free(session, slot));
- if (free_i != NULL && *free_i == WT_SLOT_POOL &&
- slot->slot_state == WT_LOG_SLOT_FREE)
- *free_i = written[i].slot_index;
+ __wt_log_slot_free(session, slot);
}
}
- return (0);
+err: __wt_spin_unlock(session, &log->log_writelsn_lock);
+ return (ret);
}
/*
@@ -515,31 +576,26 @@ __log_wrlsn_server(void *arg)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- WT_LOG *log;
WT_SESSION_IMPL *session;
- int locked, yield;
session = arg;
conn = S2C(session);
- log = conn->log;
- locked = yield = 0;
while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
- __wt_spin_lock(session, &log->log_slot_lock);
- locked = 1;
- WT_ERR(__wt_log_wrlsn(session, NULL, &yield));
- locked = 0;
- __wt_spin_unlock(session, &log->log_slot_lock);
- if (++yield < 1000)
- __wt_yield();
- else
- WT_ERR(__wt_cond_wait(session,
- conn->log_wrlsn_cond, 100000));
+ /*
+ * Write out any log record buffers.
+ */
+ WT_ERR(__wt_log_wrlsn(session));
+ WT_ERR(__wt_cond_wait(session, conn->log_wrlsn_cond, 10000));
}
+ /*
+ * On close we need to do this one more time because there could
+ * be straggling log writes that need to be written.
+ */
+ WT_ERR(__wt_log_force_write(session, 1));
+ WT_ERR(__wt_log_wrlsn(session));
if (0) {
err: __wt_err(session, ret, "log wrlsn server error");
}
- if (locked)
- __wt_spin_unlock(session, &log->log_slot_lock);
return (WT_THREAD_RET_VALUE);
}
@@ -554,44 +610,81 @@ __log_server(void *arg)
WT_DECL_RET;
WT_LOG *log;
WT_SESSION_IMPL *session;
- u_int locked;
+ int freq_per_sec, signalled;
session = arg;
conn = S2C(session);
log = conn->log;
- locked = 0;
+ signalled = 0;
+
+ /*
+ * Set this to the number of times per second we want to force out the
+ * log slot buffer.
+ */
+#define WT_FORCE_PER_SECOND 20
+ freq_per_sec = WT_FORCE_PER_SECOND;
+
+ /*
+ * The log server thread does a variety of work. It forces out any
+ * buffered log writes. It pre-allocates log files and it performs
+ * log archiving. The reason the wrlsn thread does not force out
+ * the buffered writes is because we want to process and move the
+ * write_lsn forward as quickly as possible. The same reason applies
+ * to why the log file server thread does not force out the writes.
+ * That thread does fsync calls which can take a long time and we
+ * don't want log records sitting in the buffer over the time it
+ * takes to sync out an earlier file.
+ */
while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
/*
- * Perform log pre-allocation.
+ * Slots depend on future activity. Force out buffered
+ * writes in case we are idle. This cannot be part of the
+ * wrlsn thread because of interaction advancing the write_lsn
+ * and a buffer may need to wait for the write_lsn to advance
+ * in the case of a synchronous buffer. We end up with a hang.
*/
- if (conn->log_prealloc > 0)
- WT_ERR(__log_prealloc_once(session));
+ WT_ERR_BUSY_OK(__wt_log_force_write(session, 0));
/*
- * Perform the archive.
+ * We don't want to archive or pre-allocate files as often as
+ * we want to force out log buffers. Only do it once per second
+ * or if the condition was signalled.
*/
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
- if (__wt_try_writelock(
- session, log->log_archive_lock) == 0) {
- locked = 1;
- WT_ERR(__log_archive_once(session, 0));
- WT_ERR( __wt_writeunlock(
- session, log->log_archive_lock));
- locked = 0;
- } else
- WT_ERR(__wt_verbose(session, WT_VERB_LOG,
- "log_archive: Blocked due to open log "
- "cursor holding archive lock"));
+ if (--freq_per_sec <= 0 || signalled != 0) {
+ freq_per_sec = WT_FORCE_PER_SECOND;
+
+ /*
+ * Perform log pre-allocation.
+ */
+ if (conn->log_prealloc > 0)
+ WT_ERR(__log_prealloc_once(session));
+
+ /*
+ * Perform the archive.
+ */
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
+ if (__wt_try_writelock(
+ session, log->log_archive_lock) == 0) {
+ ret = __log_archive_once(session, 0);
+ WT_TRET(__wt_writeunlock(
+ session, log->log_archive_lock));
+ WT_ERR(ret);
+ } else
+ WT_ERR(
+ __wt_verbose(session, WT_VERB_LOG,
+ "log_archive: Blocked due to open "
+ "log cursor holding archive lock"));
+ }
}
+
/* Wait until the next event. */
- WT_ERR(__wt_cond_wait(session, conn->log_cond, WT_MILLION));
+ WT_ERR(__wt_cond_wait_signal(session, conn->log_cond,
+ WT_MILLION / WT_FORCE_PER_SECOND, &signalled));
}
if (0) {
err: __wt_err(session, ret, "log server error");
}
- if (locked)
- (void)__wt_writeunlock(session, log->log_archive_lock);
return (WT_THREAD_RET_VALUE);
}
@@ -624,6 +717,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
+ WT_RET(__wt_spin_init(session, &log->log_writelsn_lock,
+ "log write LSN"));
WT_RET(__wt_rwlock_alloc(session,
&log->log_archive_lock, "log archive lock"));
if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG))
@@ -755,13 +850,11 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_thread_join(session, conn->log_tid));
conn->log_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
if (conn->log_file_tid_set) {
WT_TRET(__wt_cond_signal(session, conn->log_file_cond));
WT_TRET(__wt_thread_join(session, conn->log_file_tid));
conn->log_file_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
if (conn->log_file_session != NULL) {
wt_session = &conn->log_file_session->iface;
WT_TRET(wt_session->close(wt_session, NULL));
@@ -772,13 +865,13 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
conn->log_wrlsn_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
if (conn->log_wrlsn_session != NULL) {
wt_session = &conn->log_wrlsn_session->iface;
WT_TRET(wt_session->close(wt_session, NULL));
conn->log_wrlsn_session = NULL;
}
+ WT_TRET(__wt_log_slot_destroy(session));
WT_TRET(__wt_log_close(session));
/* Close the server thread's session. */
@@ -788,13 +881,18 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn->log_session = NULL;
}
- WT_TRET(__wt_log_slot_destroy(session));
+ /* Destroy the condition variables now that all threads are stopped */
+ WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
+ WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
+ WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
+
WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock));
__wt_spin_destroy(session, &conn->log->log_lock);
__wt_spin_destroy(session, &conn->log->log_slot_lock);
__wt_spin_destroy(session, &conn->log->log_sync_lock);
+ __wt_spin_destroy(session, &conn->log->log_writelsn_lock);
__wt_free(session, conn->log_path);
__wt_free(session, conn->log);
return (ret);
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index 397f3ff8c38..8bc69bb3e80 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -111,14 +111,17 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
F_CLR(conn, WT_CONN_SERVER_RUN);
WT_TRET(__wt_async_destroy(session));
WT_TRET(__wt_lsm_manager_destroy(session));
+ WT_TRET(__wt_sweep_destroy(session));
F_SET(conn, WT_CONN_CLOSING);
WT_TRET(__wt_checkpoint_server_destroy(session));
WT_TRET(__wt_statlog_destroy(session, 1));
- WT_TRET(__wt_sweep_destroy(session));
WT_TRET(__wt_evict_destroy(session));
+ /* Shut down the lookaside table, after all eviction is complete. */
+ WT_TRET(__wt_las_destroy(session));
+
/* Close open data handles. */
WT_TRET(__wt_conn_dhandle_discard(session));
@@ -238,9 +241,7 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[])
/* Run recovery. */
WT_RET(__wt_txn_recover(session));
- /*
- * Start the handle sweep thread.
- */
+ /* Start the handle sweep thread. */
WT_RET(__wt_sweep_create(session));
/* Start the optional async threads. */
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
index 80698c536cd..3b188bfd22a 100644
--- a/src/third_party/wiredtiger/src/conn/conn_stat.c
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -50,6 +50,7 @@ __wt_conn_stat_init(WT_SESSION_IMPL *session)
__wt_async_stats_update(session);
__wt_cache_stats_update(session);
+ __wt_las_stats_update(session);
__wt_txn_stats_update(session);
WT_STAT_SET(session, stats, file_open, conn->open_file_count);
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index 3de9347f38f..8da32416242 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -283,6 +283,13 @@ __sweep_server(void *arg)
WT_STAT_FAST_CONN_INCR(session, dh_sweeps);
/*
+ * Sweep the lookaside table. If the lookaside table hasn't yet
+ * been written, there's no work to do.
+ */
+ if (__wt_las_is_written(session))
+ WT_ERR(__wt_las_sweep(session));
+
+ /*
* Mark handles with a time of death, and report whether any
* handles are marked dead. If sweep_idle_time is 0, handles
* never become idle.
@@ -359,8 +366,14 @@ __wt_sweep_create(WT_SESSION_IMPL *session)
/*
* Handle sweep does enough I/O it may be called upon to perform slow
* operations for the block manager.
+ *
+ * The sweep thread sweeps the lookaside table for outdated records,
+ * it gets its own cursor for that purpose.
+ *
+ * Don't tap the sweep thread for eviction.
*/
- F_SET(session, WT_SESSION_CAN_WAIT);
+ F_SET(session, WT_SESSION_CAN_WAIT |
+ WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION);
WT_RET(__wt_cond_alloc(
session, "handle sweep server", 0, &conn->sweep_cond));
@@ -399,5 +412,9 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session)
conn->sweep_session = NULL;
}
+
+ /* Discard any saved lookaside key. */
+ __wt_buf_free(session, &conn->las_sweep_key);
+
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
index 60d94697189..3d9e5e405e8 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_backup.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -514,17 +514,23 @@ static int
__backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_CURSOR_BACKUP *cb;
+ const char *name;
WT_UNUSED(cfg);
cb = session->bkp_cursor;
+ name = session->dhandle->name;
/* Ignore files in the process of being bulk-loaded. */
if (F_ISSET(S2BT(session), WT_BTREE_BULK))
return (0);
+ /* Ignore the lookaside table. */
+ if (strcmp(name, WT_LAS_URI) == 0)
+ return (0);
+
/* Add the file to the list of files to be copied. */
- return (__backup_list_append(session, cb, session->dhandle->name));
+ return (__backup_list_append(session, cb, name));
}
/*
diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c
index c58d6899150..8ee57d24413 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_ds.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c
@@ -510,7 +510,7 @@ __wt_curds_open(
source = data_source->source;
source->session = (WT_SESSION *)session;
memset(&source->q, 0, sizeof(source->q));
- source->recno = 0;
+ source->recno = WT_RECNO_OOB;
memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf));
memset(&source->key, 0, sizeof(source->key));
memset(&source->value, 0, sizeof(source->value));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index a9f3124149e..c998565eb75 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -369,15 +369,20 @@ __curfile_close(WT_CURSOR *cursor)
__wt_buf_free(session, &cbulk->last);
}
- WT_TRET(__wt_btcur_close(cbt));
- if (cbt->btree != NULL) {
+ WT_TRET(__wt_btcur_close(cbt, 0));
+ /* The URI is owned by the btree handle. */
+ cursor->internal_uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+
+ /*
+ * Note: release the data handle last so that cursor statistics are
+ * updated correctly.
+ */
+ if (session->dhandle != NULL) {
/* Increment the data-source's in-use counter. */
__wt_cursor_dhandle_decr_use(session);
WT_TRET(__wt_session_release_btree(session));
}
- /* The URI is owned by the btree handle. */
- cursor->internal_uri = NULL;
- WT_TRET(__wt_cursor_close(cursor));
err: API_END_RET(session, ret);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c
index 7dad85e9d38..045663b3614 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_index.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_index.c
@@ -130,7 +130,8 @@ __curindex_move(WT_CURSOR_INDEX *cindex)
(*cp)->recno = first->recno;
}
F_SET(*cp, WT_CURSTD_KEY_EXT);
- WT_RET((*cp)->search(*cp));
+ if (cindex->cg_needvalue[i])
+ WT_RET((*cp)->search(*cp));
}
F_SET(&cindex->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
@@ -320,6 +321,7 @@ __curindex_close(WT_CURSOR *cursor)
*cp = NULL;
}
+ __wt_free(session, cindex->cg_needvalue);
__wt_free(session, cindex->cg_cursors);
if (cindex->key_plan != idx->key_plan)
__wt_free(session, cindex->key_plan);
@@ -353,14 +355,19 @@ __curindex_open_colgroups(
/* Child cursors are opened with dump disabled. */
const char *cfg[] = { cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL };
char *proj;
+ size_t cgcnt;
table = cindex->table;
- WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &cp));
+ cgcnt = WT_COLGROUPS(table);
+ WT_RET(__wt_calloc_def(session, cgcnt, &cindex->cg_needvalue));
+ WT_RET(__wt_calloc_def(session, cgcnt, &cp));
cindex->cg_cursors = cp;
/* Work out which column groups we need. */
for (proj = (char *)cindex->value_plan; *proj != '\0'; proj++) {
arg = strtoul(proj, &proj, 10);
+ if (*proj == WT_PROJ_VALUE)
+ cindex->cg_needvalue[arg] = 1;
if ((*proj != WT_PROJ_KEY && *proj != WT_PROJ_VALUE) ||
cp[arg] != NULL)
continue;
diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c
index 3376f2a3166..ade9fd18962 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_log.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_log.c
@@ -74,7 +74,7 @@ __curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
acl = (WT_CURSOR_LOG *)a;
bcl = (WT_CURSOR_LOG *)b;
WT_ASSERT(session, cmpp != NULL);
- *cmpp = WT_LOG_CMP(acl->cur_lsn, bcl->cur_lsn);
+ *cmpp = __wt_log_cmp(acl->cur_lsn, bcl->cur_lsn);
/*
* If both are on the same LSN, compare step counter.
*/
@@ -392,6 +392,12 @@ __wt_curlog_open(WT_SESSION_IMPL *session,
WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+ /*
+ * The user may be trying to read a log record they just wrote.
+ * Log records may be buffered, so force out any now.
+ */
+ WT_ERR(__wt_log_force_write(session, 1));
+
/* Log cursors block archiving. */
WT_ERR(__wt_readlock(session, log->log_archive_lock));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
index 2f844baaa00..2216a1d969d 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_stat.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -497,7 +497,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
conn = S2C(session);
- WT_ERR(__wt_calloc_one(session, &cst));
+ WT_RET(__wt_calloc_one(session, &cst));
cursor = &cst->iface;
*cursor = iface;
cursor->session = &session->iface;
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
index b7d8be14e5c..701bd845ae9 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_std.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -258,9 +258,9 @@ __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
item->data, item->size, "q", &cursor->recno));
} else
cursor->recno = va_arg(ap, uint64_t);
- if (cursor->recno == 0)
+ if (cursor->recno == WT_RECNO_OOB)
WT_ERR_MSG(session, EINVAL,
- "Record numbers must be greater than zero");
+ "%d is an invalid record number", WT_RECNO_OOB);
buf->data = &cursor->recno;
sz = sizeof(cursor->recno);
} else {
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index 35ff0e4329e..66fabe48fb2 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -80,16 +80,13 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
break;
case WT_SYNC_DISCARD:
/*
- * If we see a dirty page in a dead handle, clean the
+ * Dead handles may reference dirty pages; clean the
* page, both to keep statistics correct, and to let
* the page-discard function assert no dirty page is
* ever discarded.
*/
- if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) &&
- __wt_page_is_modified(page)) {
- page->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
- }
+ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
+ __wt_page_modify_clear(session, page);
WT_ASSERT(session,
F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index d442a34de71..b16621d1e6f 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -10,14 +10,13 @@
static int __evict_clear_all_walks(WT_SESSION_IMPL *);
static int __evict_clear_walks(WT_SESSION_IMPL *);
-static int __evict_has_work(WT_SESSION_IMPL *, uint32_t *);
static int WT_CDECL __evict_lru_cmp(const void *, const void *);
static int __evict_lru_pages(WT_SESSION_IMPL *, int);
-static int __evict_lru_walk(WT_SESSION_IMPL *, uint32_t);
+static int __evict_lru_walk(WT_SESSION_IMPL *);
static int __evict_page(WT_SESSION_IMPL *, int);
static int __evict_pass(WT_SESSION_IMPL *);
-static int __evict_walk(WT_SESSION_IMPL *, uint32_t);
-static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, uint32_t);
+static int __evict_walk(WT_SESSION_IMPL *);
+static int __evict_walk_file(WT_SESSION_IMPL *, u_int *);
static WT_THREAD_RET __evict_worker(void *);
static int __evict_server_work(WT_SESSION_IMPL *);
@@ -248,9 +247,16 @@ __evict_workers_resize(WT_SESSION_IMPL *session)
for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) {
WT_ERR(__wt_open_internal_session(conn,
- "eviction-worker", 0, 0, &workers[i].session));
+ "eviction-worker", 1, 0, &workers[i].session));
workers[i].id = i;
- F_SET(workers[i].session, WT_SESSION_CAN_WAIT);
+
+ /*
+ * Eviction worker threads get their own lookaside table cursor.
+ * Eviction worker threads may be called upon to perform slow
+ * operations for the block manager.
+ */
+ F_SET(workers[i].session,
+ WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_CAN_WAIT);
if (i < conn->evict_workers_min) {
++conn->evict_workers;
@@ -280,7 +286,7 @@ __wt_evict_create(WT_SESSION_IMPL *session)
/* We need a session handle because we're reading/writing pages. */
WT_RET(__wt_open_internal_session(
- conn, "eviction-server", 0, 0, &conn->evict_session));
+ conn, "eviction-server", 1, 0, &conn->evict_session));
session = conn->evict_session;
/*
@@ -297,6 +303,9 @@ __wt_evict_create(WT_SESSION_IMPL *session)
else
F_SET(session, WT_SESSION_CAN_WAIT);
+ /* The eviction server gets its own lookaside table cursor. */
+ F_SET(session, WT_SESSION_LOOKASIDE_CURSOR);
+
/*
* Start the primary eviction server thread after the worker threads
* have started to avoid it starting additional worker threads before
@@ -406,47 +415,62 @@ err: WT_PANIC_MSG(session, ret, "cache eviction worker error");
}
/*
- * __evict_has_work --
- * Find out if there is eviction work to be done.
+ * __evict_update_work --
+ * Configure eviction work state.
*/
-static int
-__evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp)
+static bool
+__evict_update_work(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
- uint32_t flags;
- int evict, dirty;
+ uint64_t bytes_inuse, bytes_max, dirty_inuse;
conn = S2C(session);
cache = conn->cache;
- *flagsp = flags = 0;
+
+ /* Clear previous state. */
+ cache->state = 0;
if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
- return (0);
+ return (false);
- /* Check to see if the eviction server should run. */
- __wt_cache_status(session, &evict, &dirty);
- if (evict)
- /* The cache is too small. */
- LF_SET(WT_EVICT_PASS_ALL);
- else if (dirty)
- /* Too many dirty pages, ignore clean pages. */
- LF_SET(WT_EVICT_PASS_DIRTY);
- else if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) {
- /*
- * Evict pages with oldest generation (which would otherwise
- * block application threads) set regardless of whether we have
- * reached the eviction trigger.
- */
- LF_SET(WT_EVICT_PASS_WOULD_BLOCK);
- F_CLR(cache, WT_CACHE_WOULD_BLOCK);
+ /*
+ * Page eviction overrides the dirty target and other types of eviction,
+ * that is, we don't care where we are with respect to the dirty target
+ * if page eviction is configured.
+ *
+ * Avoid division by zero if the cache size has not yet been set in a
+ * shared cache.
+ */
+ bytes_max = conn->cache_size + 1;
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) {
+ FLD_SET(cache->state, WT_EVICT_PASS_ALL);
+ goto done;
}
- if (F_ISSET(cache, WT_CACHE_STUCK))
- LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+ dirty_inuse = __wt_cache_dirty_inuse(cache);
+ if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) {
+ FLD_SET(cache->state, WT_EVICT_PASS_DIRTY);
+ goto done;
+ }
- *flagsp = flags;
- return (0);
+ /*
+ * Evict pages with oldest generation (which would otherwise block
+ * application threads), set regardless of whether we have reached
+ * the eviction trigger.
+ */
+ if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) {
+ FLD_SET(cache->state, WT_EVICT_PASS_WOULD_BLOCK);
+
+ F_CLR(cache, WT_CACHE_WOULD_BLOCK);
+ goto done;
+ }
+ return (false);
+
+done: if (F_ISSET(cache, WT_CACHE_STUCK))
+ FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE);
+ return (true);
}
/*
@@ -460,7 +484,6 @@ __evict_pass(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_EVICT_WORKER *worker;
uint64_t pages_evicted;
- uint32_t flags;
int loop;
conn = S2C(session);
@@ -483,10 +506,10 @@ __evict_pass(WT_SESSION_IMPL *session)
}
/*
- * Increment the shared read generation. We do this
- * occasionally even if eviction is not currently required, so
- * that pages have some relative read generation when the
- * eviction server does need to do some work.
+ * Increment the shared read generation. Do this occasionally
+ * even if eviction is not currently required, so that pages
+ * have some relative read generation when the eviction server
+ * does need to do some work.
*/
__wt_cache_read_gen_incr(session);
@@ -502,18 +525,17 @@ __evict_pass(WT_SESSION_IMPL *session)
*/
__wt_txn_update_oldest(session, 1);
- WT_RET(__evict_has_work(session, &flags));
- if (flags == 0)
+ if (!__evict_update_work(session))
break;
if (loop > 10)
- LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+ FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE);
/*
* Start a worker if we have capacity and we haven't reached
* the eviction targets.
*/
- if (LF_ISSET(WT_EVICT_PASS_ALL |
+ if (FLD_ISSET(cache->state, WT_EVICT_PASS_ALL |
WT_EVICT_PASS_DIRTY | WT_EVICT_PASS_WOULD_BLOCK) &&
conn->evict_workers < conn->evict_workers_max) {
WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
@@ -532,7 +554,7 @@ __evict_pass(WT_SESSION_IMPL *session)
" In use: %" PRIu64 " Dirty: %" PRIu64,
conn->cache_size, cache->bytes_inmem, cache->bytes_dirty));
- WT_RET(__evict_lru_walk(session, flags));
+ WT_RET(__evict_lru_walk(session));
WT_RET(__evict_server_work(session));
/*
@@ -553,7 +575,8 @@ __evict_pass(WT_SESSION_IMPL *session)
* Mark the cache as stuck if we need space
* and aren't evicting any pages.
*/
- if (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK)) {
+ if (!FLD_ISSET(cache->state,
+ WT_EVICT_PASS_WOULD_BLOCK)) {
F_SET(cache, WT_CACHE_STUCK);
WT_STAT_FAST_CONN_INCR(
session, cache_eviction_slow);
@@ -673,44 +696,6 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session)
}
/*
- * __wt_evict_page --
- * Evict a given page.
- */
-int
-__wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- WT_DECL_RET;
- WT_TXN *txn;
- WT_TXN_ISOLATION saved_iso;
-
- /*
- * We have to take care when evicting pages not to write a change that:
- * (a) is not yet committed; or
- * (b) is committed more recently than an in-progress checkpoint.
- *
- * We handle both of these cases by setting up the transaction context
- * before evicting, using a special "eviction" isolation level, where
- * only globally visible updates can be evicted.
- */
- __wt_txn_update_oldest(session, 1);
- txn = &session->txn;
- saved_iso = txn->isolation;
- txn->isolation = WT_ISO_EVICTION;
-
- /*
- * Sanity check: if a transaction has updates, its updates should not
- * be visible to eviction.
- */
- WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_ID) ||
- !__wt_txn_visible(session, txn->id));
-
- ret = __wt_evict(session, ref, 0);
- txn->isolation = saved_iso;
-
- return (ret);
-}
-
-/*
* __wt_evict_file_exclusive_on --
* Get exclusive eviction access to a file and discard any of the file's
* blocks queued for eviction.
@@ -808,7 +793,7 @@ __evict_lru_pages(WT_SESSION_IMPL *session, int is_server)
* Add pages to the LRU queue to be evicted from cache.
*/
static int
-__evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
+__evict_lru_walk(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_DECL_RET;
@@ -819,7 +804,7 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
cache = S2C(session)->cache;
/* Get some more pages to consider for eviction. */
- if ((ret = __evict_walk(session, flags)) != 0)
+ if ((ret = __evict_walk(session)) != 0)
return (ret == EBUSY ? 0 : ret);
/* Sort the list into LRU order and restart. */
@@ -851,7 +836,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
/* Track the oldest read generation we have in the queue. */
cache->read_gen_oldest = cache->evict[0].ref->page->read_gen;
- if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK))
+ if (FLD_ISSET(cache->state,
+ WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK))
/*
* Take all candidates if we only gathered pages with an oldest
* read generation set.
@@ -929,7 +915,7 @@ __evict_server_work(WT_SESSION_IMPL *session)
* Fill in the array by walking the next set of pages.
*/
static int
-__evict_walk(WT_SESSION_IMPL *session, uint32_t flags)
+__evict_walk(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CACHE *cache;
@@ -1023,7 +1009,7 @@ retry: while (slot < max_entries && ret == 0) {
* stick in cache until we get aggressive.
*/
if ((btree->checkpointing || btree->evict_priority != 0) &&
- !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE))
continue;
/* Skip files if we have used all available hazard pointers. */
@@ -1055,7 +1041,7 @@ retry: while (slot < max_entries && ret == 0) {
*/
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
WT_WITH_DHANDLE(session, dhandle,
- ret = __evict_walk_file(session, &slot, flags));
+ ret = __evict_walk_file(session, &slot));
WT_ASSERT(session, session->split_gen == 0);
}
@@ -1093,7 +1079,8 @@ retry: while (slot < max_entries && ret == 0) {
*/
if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 &&
slot < max_entries && (retries < 2 ||
- (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 &&
+ (retries < 10 &&
+ !FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) &&
(slot == cache->evict_entries || slot > start_slot)))) {
start_slot = slot;
++retries;
@@ -1136,10 +1123,11 @@ __evict_init_candidate(
* Get a few page eviction candidates from a single underlying file.
*/
static int
-__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
+__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
{
WT_BTREE *btree;
WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_EVICT_ENTRY *end, *evict, *start;
WT_PAGE *page;
@@ -1149,8 +1137,9 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
uint32_t walk_flags;
int enough, internal_pages, modified, restarts;
+ conn = S2C(session);
btree = S2BT(session);
- cache = S2C(session)->cache;
+ cache = conn->cache;
start = cache->evict + *slotp;
end = WT_MIN(start + WT_EVICT_WALK_PER_FILE,
cache->evict + cache->evict_slots);
@@ -1204,21 +1193,21 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
goto fast;
/* Optionally ignore clean pages. */
- if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY))
+ if (!modified && FLD_ISSET(cache->state, WT_EVICT_PASS_DIRTY))
continue;
/*
* If we are only trickling out pages marked for definite
* eviction, skip anything that isn't marked.
*/
- if (LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) &&
+ if (FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) &&
page->read_gen != WT_READGEN_OLDEST)
continue;
/* Limit internal pages to 50% unless we get aggressive. */
if (WT_PAGE_IS_INTERNAL(page) &&
++internal_pages > WT_EVICT_WALK_PER_FILE / 2 &&
- !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE))
continue;
/*
@@ -1233,36 +1222,44 @@ fast: /* If the page can't be evicted, give up. */
continue;
/*
- * If the page is clean but has modifications that appear too
- * new to evict, skip it.
+ * Additional tests if eviction is likely to succeed.
*
- * Note: take care with ordering: if we detected that the page
- * is modified above, we expect mod != NULL.
+ * If eviction is stuck or we are helping with forced eviction,
+ * try anyway: maybe a transaction that was running last time
+ * we wrote the page has since rolled back, or we can help the
+ * checkpoint complete sooner. Additionally, being stuck will
+ * configure lookaside table writes in reconciliation, allowing
+ * us to evict pages we can't usually evict.
*/
- mod = page->modify;
- if (!modified && mod != NULL && !LF_ISSET(
- WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) &&
- !__wt_txn_visible_all(session, mod->rec_max_txn))
- continue;
+ if (!FLD_ISSET(cache->state,
+ WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) {
+ /*
+ * Note: take care with ordering: if we detected that
+ * the page is modified above, we expect mod != NULL.
+ */
+ mod = page->modify;
- /*
- * If the oldest transaction hasn't changed since the last time
- * this page was written, it's unlikely that we can make
- * progress. Similarly, if the most recent update on the page
- * is not yet globally visible, eviction will fail. These
- * heuristics attempt to avoid repeated attempts to evict the
- * same page.
- *
- * That said, if eviction is stuck, or we are helping with
- * forced eviction, try anyway: maybe a transaction that was
- * running last time we wrote the page has since rolled back,
- * or we can help get the checkpoint completed sooner.
- */
- if (modified && !LF_ISSET(
- WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) &&
- (mod->disk_snap_min == S2C(session)->txn_global.oldest_id ||
- !__wt_txn_visible_all(session, mod->update_txn)))
- continue;
+ /*
+ * If the page is clean but has modifications that
+ * appear too new to evict, skip it.
+ */
+ if (!modified && mod != NULL &&
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
+ continue;
+
+ /*
+ * If the oldest transaction hasn't changed since the
+ * last time this page was written, it's unlikely we
+ * can make progress. Similarly, if the most recent
+ * update on the page is not yet globally visible,
+ * eviction will fail. These heuristics attempt to
+ * avoid repeated attempts to evict the same page.
+ */
+ if (modified &&
+ (mod->disk_snap_min == conn->txn_global.oldest_id ||
+ !__wt_txn_visible_all(session, mod->update_txn)))
+ continue;
+ }
WT_ASSERT(session, evict->ref == NULL);
__evict_init_candidate(session, evict, ref);
@@ -1428,13 +1425,10 @@ __evict_page(WT_SESSION_IMPL *session, int is_server)
* page-discard function assert that no dirty pages are ever
* discarded.
*/
- if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD) &&
- __wt_page_is_modified(page)) {
- page->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
- }
+ if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD))
+ __wt_page_modify_clear(session, page);
- WT_WITH_BTREE(session, btree, ret = __wt_evict_page(session, ref));
+ WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, 0));
(void)__wt_atomic_subv32(&btree->evict_busy, 1);
@@ -1453,7 +1447,7 @@ __evict_page(WT_SESSION_IMPL *session, int is_server)
* crosses its boundaries.
*/
int
-__wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full)
+__wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
@@ -1570,29 +1564,31 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full)
* NOTE: this function is not called anywhere, it is intended to be called
* from a debugger.
*/
-void
-__wt_cache_dump(WT_SESSION_IMPL *session)
+int
+__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
{
- WT_BTREE *btree;
+ FILE *fp;
WT_CONNECTION_IMPL *conn;
- WT_DATA_HANDLE *dhandle;
- WT_REF *next_walk;
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
WT_PAGE *page;
+ WT_REF *next_walk;
uint64_t file_intl_pages, file_leaf_pages;
uint64_t file_bytes, file_dirty, total_bytes;
conn = S2C(session);
total_bytes = 0;
+ if (ofile == NULL)
+ fp = stdout;
+ else
+ WT_RET(__wt_fopen(session, ofile, WT_FHANDLE_WRITE, 0, &fp));
+
+ saved_dhandle = session->dhandle;
TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
!F_ISSET(dhandle, WT_DHANDLE_OPEN))
continue;
- btree = dhandle->handle;
- if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
- continue;
-
file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0;
next_walk = NULL;
session->dhandle = dhandle;
@@ -1607,12 +1603,14 @@ __wt_cache_dump(WT_SESSION_IMPL *session)
file_bytes += page->memory_footprint;
if (__wt_page_is_modified(page))
file_dirty += page->memory_footprint;
+ (void)__wt_fprintf(fp,
+ "%" WT_SIZET_FMT ", ", page->memory_footprint);
}
session->dhandle = NULL;
- printf("cache dump: %s%s%s%s:"
- " %" PRIu64 " intl pages, %" PRIu64 " leaf pages,"
- " %" PRIu64 "MB, %" PRIu64 "MB dirty\n",
+ (void)__wt_fprintf(fp, "\n" "cache dump: %s%s%s%s\n\t"
+ " %" PRIu64 " internal pages, %" PRIu64 " leaf pages,"
+ " %" PRIu64 "MB, %" PRIu64 "MB dirty\n==============\n",
dhandle->name,
dhandle->checkpoint == NULL ? "" : " [",
dhandle->checkpoint == NULL ? "" : dhandle->checkpoint,
@@ -1622,9 +1620,13 @@ __wt_cache_dump(WT_SESSION_IMPL *session)
total_bytes += file_bytes;
}
- printf("cache dump: total found = %" PRIu64 "MB"
+ session->dhandle = saved_dhandle;
+
+ (void)__wt_fprintf(fp, "cache dump: total found = %" PRIu64 "MB"
" vs tracked inuse %" PRIu64 "MB\n",
total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20);
- fflush(stdout);
+ if (fp != stdout)
+ WT_RET(__wt_fclose(&fp, WT_FHANDLE_WRITE));
+ return (0);
}
#endif
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 1e5faf45de2..11284ce7b21 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -150,17 +150,12 @@ done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) &&
int
__wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
{
- int evict;
-
/*
* If doing normal system eviction, but only in the service of reducing
* the number of dirty pages, leave the clean page in cache.
*/
- if (!closing) {
- __wt_cache_status(session, &evict, NULL);
- if (!evict)
- return (EBUSY);
- }
+ if (!closing && __wt_eviction_dirty_target(session))
+ return (EBUSY);
/*
* Discard the page and update the reference structure; if the page has
@@ -184,7 +179,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
WT_ADDR *addr;
WT_PAGE *parent;
WT_PAGE_MODIFY *mod;
- int evict;
parent = ref->home;
mod = ref->page->modify;
@@ -229,11 +223,8 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* push it out of cache (and read it back in, when needed), we
* would rather have more, smaller pages than fewer large pages.
*/
- if (!closing) {
- __wt_cache_status(session, &evict, NULL);
- if (!evict)
- return (EBUSY);
- }
+ if (!closing && __wt_eviction_dirty_target(session))
+ return (EBUSY);
/* Discard the parent's address. */
if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
@@ -309,8 +300,7 @@ __evict_review(
{
WT_DECL_RET;
WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
- uint32_t reconcile_flags;
+ uint32_t flags;
/*
* Get exclusive access to the page if our caller doesn't have the tree
@@ -331,7 +321,6 @@ __evict_review(
/* Now that we have exclusive access, review the page. */
page = ref->page;
- mod = page->modify;
/*
* Fail if an internal has active children, the children must be evicted
@@ -347,6 +336,13 @@ __evict_review(
/* Check if the page can be evicted. */
if (!closing) {
+ /*
+ * Update the oldest ID to avoid wasted effort should it have
+ * fallen behind current.
+ */
+ if (__wt_page_is_modified(page))
+ __wt_txn_update_oldest(session, 1);
+
if (!__wt_page_can_evict(session, page, 0, inmem_splitp))
return (EBUSY);
@@ -361,9 +357,12 @@ __evict_review(
return (__wt_split_insert(session, ref));
}
+ /* If the page is clean, we're done and we can evict. */
+ if (!__wt_page_is_modified(page))
+ return (0);
+
/*
- * If the page is dirty and can possibly change state, reconcile it to
- * determine the final state.
+ * If the page is dirty, reconcile it to decide if we can evict it.
*
* If we have an exclusive lock (we're discarding the tree), assert
* there are no updates we cannot read.
@@ -377,30 +376,38 @@ __evict_review(
* in-memory pages, (restoring the updates that stopped us from writing
* the block), and inserting the whole mess into the page's parent.
*
- * Don't set the update-restore flag for internal pages, they don't have
- * updates that can be saved and restored.
+ * Otherwise, if eviction is getting pressed, configure reconciliation
+ * to write not-yet-globally-visible updates to the lookaside table,
+ * allowing the eviction of pages we'd otherwise have to retain in cache
+ * to support older readers.
+ *
+ * Don't set the update-restore or lookaside table flags for internal
+ * pages, they don't have update lists that can be saved and restored.
*/
- reconcile_flags = WT_EVICTING;
- if (__wt_page_is_modified(page)) {
- if (closing)
- FLD_SET(reconcile_flags, WT_SKIP_UPDATE_ERR);
- else if (!WT_PAGE_IS_INTERNAL(page) &&
- page->read_gen == WT_READGEN_OLDEST)
- FLD_SET(reconcile_flags, WT_SKIP_UPDATE_RESTORE);
- WT_RET(__wt_reconcile(session, ref, NULL, reconcile_flags));
- WT_ASSERT(session,
- !__wt_page_is_modified(page) ||
- FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE));
+ flags = WT_EVICTING;
+ if (closing)
+ LF_SET(WT_VISIBILITY_ERR);
+ else if (!WT_PAGE_IS_INTERNAL(page)) {
+ if (page->read_gen == WT_READGEN_OLDEST)
+ LF_SET(WT_EVICT_UPDATE_RESTORE);
+ else if (__wt_eviction_aggressive(session))
+ LF_SET(WT_EVICT_LOOKASIDE);
}
+ WT_RET(__wt_reconcile(session, ref, NULL, flags));
+
/*
- * If the page was ever modified, make sure all of the updates
- * on the page are old enough they can be discarded from cache.
+ * Success: assert the page is clean or reconciliation was configured
+ * for an update/restore split, and if the page is clean, reconciliation
+ * was configured for a lookaside table or all updates on the page are
+ * globally visible.
*/
- if (!closing && mod != NULL &&
- !__wt_txn_visible_all(session, mod->rec_max_txn) &&
- !FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE))
- return (EBUSY);
+ WT_ASSERT(session,
+ LF_ISSET(WT_EVICT_UPDATE_RESTORE) || !__wt_page_is_modified(page));
+ WT_ASSERT(session,
+	    LF_ISSET(WT_EVICT_LOOKASIDE) ||
+ __wt_page_is_modified(page) ||
+ __wt_txn_visible_all(session, page->modify->rec_max_txn));
return (0);
}
diff --git a/src/third_party/wiredtiger/src/include/bitstring.i b/src/third_party/wiredtiger/src/include/bitstring.i
index c548c12761d..5449ffe6209 100644
--- a/src/third_party/wiredtiger/src/include/bitstring.i
+++ b/src/third_party/wiredtiger/src/include/bitstring.i
@@ -84,10 +84,10 @@ __bit_alloc(WT_SESSION_IMPL *session, uint64_t nbits, void *retp)
* __bit_test --
* Test one bit in name.
*/
-static inline int
+static inline bool
__bit_test(uint8_t *bitf, uint64_t bit)
{
- return (bitf[__bit_byte(bit)] & __bit_mask(bit) ? 1 : 0);
+ return ((bitf[__bit_byte(bit)] & __bit_mask(bit)) != 0);
}
/*
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 4aa2b1c7a7d..f214ddb1dc3 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -6,6 +6,8 @@
* See the file LICENSE for redistribution information.
*/
+#define WT_RECNO_OOB 0 /* Illegal record number */
+
/*
* WT_PAGE_HEADER --
* Blocks have a common header, a WT_PAGE_HEADER structure followed by a
@@ -43,6 +45,7 @@ struct __wt_page_header {
#define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */
#define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */
#define WT_PAGE_ENCRYPTED 0x08 /* Page is encrypted on disk */
+#define WT_PAGE_LAS_UPDATE 0x10 /* Page updates in lookaside store */
uint8_t flags; /* 25: flags */
/*
@@ -168,6 +171,29 @@ struct __wt_ovfl_txnc {
};
/*
+ * Lookaside table support: when a page is being reconciled for eviction and has
+ * updates that might be required by earlier readers in the system, the updates
+ * are written into a lookaside table, and restored as necessary if the page is
+ * read. The key is a unique marker for the page (a file ID plus an address),
+ * a counter (used to ensure the update records remain in the original order),
+ * the on-page item's transaction ID (so we can discard any update records from
+ * the lookaside table once the on-page item's transaction is globally visible),
+ * and the page key (byte-string for row-store, record number for column-store).
+ * The value is the WT_UPDATE structure's transaction ID, update size and value.
+ *
+ * As the key for the lookaside table is different for row- and column-store, we
+ * store both key types in a WT_ITEM, building/parsing them in the code, because
+ * otherwise we'd need two lookaside files with different key formats. We could
+ * make the lookaside table's key standard by moving the source key into the
+ * lookaside table value, but that doesn't make the coding any simpler, and it
+ * makes the lookaside table's value more likely to overflow the page size when
+ * the row-store key is relatively large.
+ */
+#define WT_LAS_FORMAT \
+ "key_format=" WT_UNCHECKED_STRING(IuQQu) \
+ ",value_format=" WT_UNCHECKED_STRING(QIu)
+
+/*
* WT_PAGE_MODIFY --
* When a page is modified, there's additional information to maintain.
*/
@@ -238,15 +264,17 @@ struct __wt_page_modify {
* Eviction, but block wasn't written: unresolved updates and
* associated disk image.
*
- * Skipped updates are either a WT_INSERT, or a row-store leaf
- * page entry.
+ * Saved updates are either a WT_INSERT, or a row-store leaf
+ * page entry; in the case of creating lookaside records, there
+ * is an additional value, the committed item's transaction ID.
*/
- struct __wt_upd_skipped {
+ struct __wt_save_upd {
WT_INSERT *ins;
WT_ROW *rip;
- } *skip;
- uint32_t skip_entries;
- void *skip_dsk;
+ uint64_t onpage_txn;
+ } *supd;
+ uint32_t supd_entries;
+ void *supd_dsk;
/*
* Block was written: address, size and checksum.
@@ -556,9 +584,8 @@ struct __wt_page {
#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
-#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
+#define WT_PAGE_RECONCILIATION 0x10 /* Page reconciliation lock */
#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
-#define WT_PAGE_SPLIT_LOCKED 0x40 /* An internal page is growing */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
/*
@@ -869,8 +896,9 @@ WT_PACKED_STRUCT_BEGIN(__wt_update)
* store 4GB objects; I'd rather do that than increase the size of this
* structure for a flag bit.
*/
-#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX)
-#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX)
+#define WT_UPDATE_DELETED_VALUE UINT32_MAX
+#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = WT_UPDATE_DELETED_VALUE)
+#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == WT_UPDATE_DELETED_VALUE)
uint32_t size; /* update length */
/* The untyped value immediately follows the WT_UPDATE structure. */
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index deecd8f6d88..98ce4c22c10 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -146,12 +146,14 @@ struct __wt_btree {
/* Flags values up to 0xff are reserved for WT_DHANDLE_* */
#define WT_BTREE_BULK 0x00100 /* Bulk-load handle */
#define WT_BTREE_IN_MEMORY 0x00200 /* Cache-resident object */
-#define WT_BTREE_NO_EVICTION 0x00400 /* Disable eviction */
-#define WT_BTREE_NO_LOGGING 0x00800 /* Disable logging */
-#define WT_BTREE_SALVAGE 0x01000 /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x02000 /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x04000 /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x08000 /* Handle is for verify */
+#define WT_BTREE_LOOKASIDE 0x00400 /* Look-aside table */
+#define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */
+#define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */
+#define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */
+#define WT_BTREE_SALVAGE 0x04000 /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x08000 /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x10000 /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x20000 /* Handle is for verify */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 058a00d5a78..b54cecb6ce0 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -10,17 +10,17 @@
* __wt_ref_is_root --
* Return if the page reference is for the root page.
*/
-static inline int
+static inline bool
__wt_ref_is_root(WT_REF *ref)
{
- return (ref->home == NULL ? 1 : 0);
+ return (ref->home == NULL);
}
/*
* __wt_page_is_empty --
* Return if the page is empty.
*/
-static inline int
+static inline bool
__wt_page_is_empty(WT_PAGE *page)
{
return (page->modify != NULL &&
@@ -31,10 +31,10 @@ __wt_page_is_empty(WT_PAGE *page)
* __wt_page_is_modified --
* Return if the page is dirty.
*/
-static inline int
+static inline bool
__wt_page_is_modified(WT_PAGE *page)
{
- return (page->modify != NULL && page->modify->write_gen != 0 ? 1 : 0);
+ return (page->modify != NULL && page->modify->write_gen != 0);
}
/*
@@ -84,6 +84,9 @@ __wt_cache_decr_check_size(
__wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v);
first = 0;
}
+#else
+ WT_UNUSED(fld);
+ WT_UNUSED(session);
#endif
}
@@ -109,6 +112,9 @@ __wt_cache_decr_check_uint64(
__wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v);
first = 0;
}
+#else
+ WT_UNUSED(fld);
+ WT_UNUSED(session);
#endif
}
@@ -352,9 +358,13 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
* have committed in the meantime, and the last_running field
* been updated past it. That is all very unlikely, but not
* impossible, so we take care to read the global state before
- * the atomic increment. If we raced with reconciliation, just
- * leave the previous value here: at worst, we will write a
- * page in a checkpoint when not absolutely necessary.
+ * the atomic increment.
+ *
+ * If the page was dirty on entry, then last_running == 0. The
+ * page could have become clean since then, if reconciliation
+ * completed. In that case, we leave the previous value for
+ * first_dirty_txn rather than potentially racing to update it,
+ * at worst, we'll unnecessarily write a page in a checkpoint.
*/
if (last_running != 0)
page->modify->first_dirty_txn = last_running;
@@ -366,6 +376,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
+ * __wt_page_modify_clear --
+ * Clean a modified page.
+ */
+static inline void
+__wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ /*
+ * The page must be held exclusive when this call is made, this call
+ * can only be used when the page is owned by a single thread.
+ *
+ * Allow the call to be made on clean pages.
+ */
+ if (__wt_page_is_modified(page)) {
+ page->modify->write_gen = 0;
+ __wt_cache_dirty_decr(session, page);
+ }
+}
+
+/*
* __wt_page_modify_set --
* Mark the page and tree dirty.
*/
@@ -385,6 +414,9 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
* shouldn't cause problems; regardless, let's play it safe.)
*/
if (S2BT(session)->modified == 0) {
+ /* Assert we never dirty a checkpoint handle. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+
S2BT(session)->modified = 1;
WT_FULL_BARRIER();
}
@@ -426,7 +458,7 @@ __wt_page_parent_modify_set(
* __wt_off_page --
* Return if a pointer references off-page data.
*/
-static inline int
+static inline bool
__wt_off_page(WT_PAGE *page, const void *p)
{
/*
@@ -527,7 +559,12 @@ __wt_ref_key_instantiated(WT_REF *ref)
static inline void
__wt_ref_key_clear(WT_REF *ref)
{
- /* The key union has 2 fields, both of which are 8B. */
+ /*
+ * The key union has 2 8B fields; this is equivalent to:
+ *
+ * ref->key.recno = WT_RECNO_OOB;
+ * ref->key.ikey = NULL;
+ */
ref->key.recno = 0;
}
@@ -537,7 +574,7 @@ __wt_ref_key_clear(WT_REF *ref)
* had without unpacking a cell, and information about the cell, if the key
* isn't cheaply available.
*/
-static inline int
+static inline bool
__wt_row_leaf_key_info(WT_PAGE *page, void *copy,
WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep)
{
@@ -628,7 +665,7 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
if (cellp != NULL)
*cellp =
WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
- return (0);
+ return (false);
case WT_K_FLAG:
/* Encoded key: no instantiated key, no cell. */
if (cellp != NULL)
@@ -639,9 +676,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
*(void **)datap =
WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v));
*sizep = WT_K_DECODE_KEY_LEN(v);
- return (1);
+ return (true);
}
- return (0);
+ return (false);
case WT_KV_FLAG:
/* Encoded key/value pair: no instantiated key, no cell. */
if (cellp != NULL)
@@ -652,9 +689,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
*(void **)datap = WT_PAGE_REF_OFFSET(
page, WT_KV_DECODE_KEY_OFFSET(v));
*sizep = WT_KV_DECODE_KEY_LEN(v);
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
@@ -667,9 +704,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
if (datap != NULL) {
*(void **)datap = WT_IKEY_DATA(ikey);
*sizep = ikey->size;
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
/*
@@ -857,7 +894,7 @@ __wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack)
* __wt_row_leaf_value --
* Return the value for a row-store leaf page encoded key/value pair.
*/
-static inline int
+static inline bool
__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
{
uintptr_t v;
@@ -873,9 +910,9 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
value->data =
WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v));
value->size = WT_KV_DECODE_VALUE_LEN(v);
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
/*
@@ -934,11 +971,13 @@ __wt_ref_info(WT_SESSION_IMPL *session,
* __wt_page_can_split --
* Check whether a page can be split in memory.
*/
-static inline int
+static inline bool
__wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
WT_INSERT_HEAD *ins_head;
+ WT_INSERT *ins;
+ int i;
btree = S2BT(session);
@@ -947,58 +986,54 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
* of the page could continually split without benefit.
*/
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
- return (0);
+ return (false);
/*
* Check for pages with append-only workloads. A common application
* pattern is to have multiple threads frantically appending to the
* tree. We want to reconcile and evict this page, but we'd like to
- * do it without making the appending threads wait. If we're not
- * discarding the tree, check and see if it's worth doing a split to
- * let the threads continue before doing eviction.
- *
- * Ignore anything other than large, dirty row-store leaf pages.
+ * do it without making the appending threads wait. See if it's worth
+ * doing a split to let the threads continue before doing eviction.
*
- * XXX KEITH
- * Need a better test for append-only workloads.
+ * Ignore anything other than large, dirty row-store leaf pages. The
+ * split code only supports row-store pages, and we depend on the page
+ * being dirty for correctness (the page must be reconciled again
+ * before being evicted after the split, information from a previous
+ * reconciliation will be wrong, so we can't evict immediately).
*/
if (page->type != WT_PAGE_ROW_LEAF ||
page->memory_footprint < btree->maxmempage ||
!__wt_page_is_modified(page))
- return (0);
-
- /* Don't split a page that is pending a multi-block split. */
- if (F_ISSET(page->modify, WT_PM_REC_MULTIBLOCK))
- return (0);
+ return (false);
/*
* There is no point splitting if the list is small, no deep items is
- * our heuristic for that. (A 1/4 probability of adding a new skiplist
- * level means there will be a new 6th level for roughly each 4KB of
- * entries in the list. If we have at least two 6th level entries, the
- * list is at least large enough to work with.)
- *
- * The following code requires at least two items on the insert list,
- * this test serves the additional purpose of confirming that.
+ * our heuristic for that. A 1/4 probability of adding a new skiplist
+ * level, with level-0 always created, means there will be a 5th level
+ * entry for roughly every 1024 entries in the list. If there are at
+ * least 4 5th level entries (4K items), the list is large enough.
*/
-#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1)
+#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(5, WT_SKIP_MAXDEPTH - 1)
ins_head = page->pg_row_entries == 0 ?
WT_ROW_INSERT_SMALLEST(page) :
WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
- if (ins_head == NULL ||
- ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL ||
- ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] ==
- ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH])
- return (0);
-
- return (1);
+ if (ins_head == NULL)
+ return (false);
+ for (i = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH];
+ ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH])
+ if (++i == 4) {
+ WT_STAT_FAST_CONN_INCR(session, cache_inmem_splittable);
+ WT_STAT_FAST_DATA_INCR(session, cache_inmem_splittable);
+ return (true);
+ }
+ return (false);
}
/*
* __wt_page_can_evict --
* Check whether a page can be evicted.
*/
-static inline int
+static inline bool
__wt_page_can_evict(WT_SESSION_IMPL *session,
WT_PAGE *page, int check_splits, int *inmem_splitp)
{
@@ -1011,11 +1046,22 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
btree = S2BT(session);
mod = page->modify;
- txn_global = &S2C(session)->txn_global;
/* Pages that have never been modified can always be evicted. */
if (mod == NULL)
- return (1);
+ return (true);
+
+ /*
+ * Check for in-memory splits before other eviction tests. If the page
+ * should split in-memory, return success immediately and skip more
+ * detailed eviction tests. We don't need further tests since the page
+ * won't be written or discarded from the cache.
+ */
+ if (__wt_page_can_split(session, page)) {
+ if (inmem_splitp != NULL)
+ *inmem_splitp = 1;
+ return (true);
+ }
/*
* If the tree was deepened, there's a requirement that newly created
@@ -1028,20 +1074,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
*/
if (check_splits && WT_PAGE_IS_INTERNAL(page) &&
!__wt_txn_visible_all(session, mod->mod_split_txn))
- return (0);
-
- /*
- * Allow for the splitting of pages when a checkpoint is underway only
- * if the allow_splits flag has been passed, we know we are performing
- * a checkpoint, the page is larger than the stated maximum and there
- * has not already been a split on this page as the WT_PM_REC_MULTIBLOCK
- * flag is unset.
- */
- if (__wt_page_can_split(session, page)) {
- if (inmem_splitp != NULL)
- *inmem_splitp = 1;
- return (1);
- }
+ return (false);
/*
* If the file is being checkpointed, we can't evict dirty pages:
@@ -1049,25 +1082,27 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
* previous version might be referenced by an internal page already
* been written in the checkpoint, leaving the checkpoint inconsistent.
*/
- if (btree->checkpointing &&
- (__wt_page_is_modified(page) ||
- F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) {
+ if (btree->checkpointing && __wt_page_is_modified(page)) {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
- return (0);
+ return (false);
}
/*
- * If the page was recently split in-memory, don't force it out: we
- * hope an eviction thread will find it first. The check here is
- * similar to __wt_txn_visible_all, but ignores the checkpoint's
- * transaction.
+ * If the page was recently split in-memory, don't evict it immediately:
+ * we want to give application threads that are appending a chance to
+ * move to the new leaf page created by the split.
+ *
+ * Note the check here is similar to __wt_txn_visible_all, but ignores
+ * the checkpoint's transaction.
*/
- if (check_splits &&
- WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn))
- return (0);
+ if (check_splits) {
+ txn_global = &S2C(session)->txn_global;
+ if (WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn))
+ return (false);
+ }
- return (1);
+ return (true);
}
/*
@@ -1100,7 +1135,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
(void)__wt_atomic_addv32(&btree->evict_busy, 1);
too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0;
- if ((ret = __wt_evict_page(session, ref)) == 0) {
+ if ((ret = __wt_evict(session, ref, 0)) == 0) {
if (too_big)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
else
@@ -1151,12 +1186,13 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
* memory_page_max setting, when we see many deleted items, and when we
* are attempting to scan without trashing the cache.
*
- * Fast checks if eviction is disabled for this operation or this tree,
- * then perform a general check if eviction will be possible.
+ * Fast checks if eviction is disabled for this handle, operation or
+ * tree, then perform a general check if eviction will be possible.
*/
page = ref->page;
if (page->read_gen != WT_READGEN_OLDEST ||
LF_ISSET(WT_READ_NO_EVICT) ||
+ F_ISSET(session, WT_SESSION_NO_EVICTION) ||
F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
!__wt_page_can_evict(session, page, 1, NULL))
return (__wt_hazard_clear(session, page));
@@ -1272,13 +1308,13 @@ __wt_skip_choose_depth(WT_SESSION_IMPL *session)
}
/*
- * __wt_btree_lsm_size --
+ * __wt_btree_lsm_over_size --
* Return if the size of an in-memory tree with a single leaf page is over
* a specified maximum. If called on anything other than a simple tree with a
* single leaf page, returns true so our LSM caller will switch to a new tree.
*/
-static inline int
-__wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize)
+static inline bool
+__wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize)
{
WT_BTREE *btree;
WT_PAGE *child, *root;
@@ -1290,20 +1326,20 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize)
/* Check for a non-existent tree. */
if (root == NULL)
- return (0);
+ return (false);
/* A tree that can be evicted always requires a switch. */
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
- return (1);
+ return (true);
/* Check for a tree with a single leaf page. */
WT_INTL_INDEX_GET(session, root, pindex);
if (pindex->entries != 1) /* > 1 child page, switch */
- return (1);
+ return (true);
first = pindex->index[0];
if (first->state != WT_REF_MEM) /* no child page, ignore */
- return (0);
+ return (false);
/*
* We're reaching down into the page without a hazard pointer, but
@@ -1312,7 +1348,7 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize)
*/
child = first->page;
if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */
- return (1);
+ return (true);
return (child->memory_footprint > maxsize);
}
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
index ed93f82538c..f98483a215f 100644
--- a/src/third_party/wiredtiger/src/include/cache.h
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -18,11 +18,6 @@
#define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */
#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */
-#define WT_EVICT_PASS_AGGRESSIVE 0x01
-#define WT_EVICT_PASS_ALL 0x02
-#define WT_EVICT_PASS_DIRTY 0x04
-#define WT_EVICT_PASS_WOULD_BLOCK 0x08
-
/*
* WT_EVICT_ENTRY --
* Encapsulation of an eviction candidate.
@@ -109,6 +104,7 @@ struct __wt_cache {
* Cache pool information.
*/
uint64_t cp_pass_pressure; /* Calculated pressure from this pass */
+ uint64_t cp_quota; /* Maximum size for this cache */
uint64_t cp_reserved; /* Base size for this cache */
WT_SESSION_IMPL *cp_session; /* May be used for cache management */
uint32_t cp_skip_count; /* Post change stabilization */
@@ -119,6 +115,15 @@ struct __wt_cache {
uint64_t cp_saved_read; /* Read count at last review */
/*
+ * Work state.
+ */
+#define WT_EVICT_PASS_AGGRESSIVE 0x01
+#define WT_EVICT_PASS_ALL 0x02
+#define WT_EVICT_PASS_DIRTY 0x04
+#define WT_EVICT_PASS_WOULD_BLOCK 0x08
+ uint32_t state;
+
+ /*
* Flags.
*/
#define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */
@@ -140,6 +145,7 @@ struct __wt_cache_pool {
const char *name;
uint64_t size;
uint64_t chunk;
+ uint64_t quota;
uint64_t currently_used;
uint32_t refs; /* Reference count for structure. */
/* Locked: List of connections participating in the cache pool. */
diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i
index 87f8c5543d1..bc33f82d927 100644
--- a/src/third_party/wiredtiger/src/include/cache.i
+++ b/src/third_party/wiredtiger/src/include/cache.i
@@ -104,48 +104,6 @@ __wt_cache_dirty_inuse(WT_CACHE *cache)
}
/*
- * __wt_cache_status --
- * Return if the cache usage exceeds the eviction or dirty targets.
- */
-static inline void
-__wt_cache_status(WT_SESSION_IMPL *session, int *evictp, int *dirtyp)
-{
- WT_CONNECTION_IMPL *conn;
- WT_CACHE *cache;
- uint64_t bytes_inuse, bytes_max, dirty_inuse;
-
- conn = S2C(session);
- cache = conn->cache;
-
- /*
- * There's an assumption "evict" overrides "dirty", that is, if eviction
- * is required, we no longer care where we are with respect to the dirty
- * target.
- *
- * Avoid division by zero if the cache size has not yet been set in a
- * shared cache.
- */
- bytes_max = conn->cache_size + 1;
- if (evictp != NULL) {
- bytes_inuse = __wt_cache_bytes_inuse(cache);
- if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) {
- *evictp = 1;
- return;
- }
- *evictp = 0;
- }
- if (dirtyp != NULL) {
- dirty_inuse = __wt_cache_dirty_inuse(cache);
- if (dirty_inuse >
- (cache->eviction_dirty_target * bytes_max) / 100) {
- *dirtyp = 1;
- return;
- }
- *dirtyp = 0;
- }
-}
-
-/*
* __wt_session_can_wait --
* Return if a session available for a potentially slow operation.
*/
@@ -161,29 +119,52 @@ __wt_session_can_wait(WT_SESSION_IMPL *session)
return (0);
/*
- * LSM sets the no-cache-check flag when holding the LSM tree lock,
+ * LSM sets the no-eviction flag when holding the LSM tree lock,
* in that case, or when holding the schema lock, we don't want to
* highjack the thread for eviction.
*/
if (F_ISSET(session,
- WT_SESSION_NO_CACHE_CHECK | WT_SESSION_LOCKED_SCHEMA))
+ WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA))
return (0);
return (1);
}
/*
+ * __wt_eviction_aggressive --
+ * Return if the eviction server is running in aggressive mode.
+ */
+static inline int
+__wt_eviction_aggressive(WT_SESSION_IMPL *session)
+{
+ return (FLD_ISSET(
+ S2C(session)->cache->state, WT_EVICT_PASS_AGGRESSIVE) ? 1 : 0);
+}
+
+/*
+ * __wt_eviction_dirty_target --
+ * Return if the eviction server is running to reduce the number of dirty
+ * pages (versus running to discard pages from the cache).
+ */
+static inline int
+__wt_eviction_dirty_target(WT_SESSION_IMPL *session)
+{
+ return (FLD_ISSET(
+ S2C(session)->cache->state, WT_EVICT_PASS_DIRTY) ? 1 : 0);
+}
+
+/*
* __wt_eviction_needed --
* Return if an application thread should do eviction, and the cache full
* percentage as a side-effect.
*/
-static inline int
-__wt_eviction_needed(WT_SESSION_IMPL *session, int *pct_fullp)
+static inline bool
+__wt_eviction_needed(WT_SESSION_IMPL *session, u_int *pct_fullp)
{
WT_CONNECTION_IMPL *conn;
WT_CACHE *cache;
uint64_t bytes_inuse, bytes_max;
- int pct_full;
+ u_int pct_full;
conn = S2C(session);
cache = conn->cache;
@@ -196,25 +177,20 @@ __wt_eviction_needed(WT_SESSION_IMPL *session, int *pct_fullp)
bytes_max = conn->cache_size + 1;
/*
- * Return the cache full percentage; anything over 95% means we involve
- * the application thread.
+ * Calculate the cache full percentage; anything over the trigger means
+ * we involve the application thread.
*/
- pct_full = (int)((100 * bytes_inuse) / bytes_max);
+ pct_full = (u_int)((100 * bytes_inuse) / bytes_max);
if (pct_fullp != NULL)
*pct_fullp = pct_full;
- if (pct_full >= 95)
- return (1);
+ if (pct_full > cache->eviction_trigger)
+ return (true);
- /*
- * Return if we're over the trigger cache size or there are too many
- * dirty pages.
- */
- if (bytes_inuse > (cache->eviction_trigger * bytes_max) / 100)
- return (1);
+ /* Return if there are too many dirty bytes in cache. */
if (__wt_cache_dirty_inuse(cache) >
(cache->eviction_dirty_trigger * bytes_max) / 100)
- return (1);
- return (0);
+ return (true);
+ return (false);
}
/*
@@ -225,7 +201,7 @@ static inline int
__wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp)
{
WT_BTREE *btree;
- int pct_full;
+ u_int pct_full;
if (didworkp != NULL)
*didworkp = 0;
@@ -235,7 +211,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp)
* that case, or when holding the schema or handle list locks (which
* block eviction), we don't want to highjack the thread for eviction.
*/
- if (F_ISSET(session, WT_SESSION_NO_CACHE_CHECK |
+ if (F_ISSET(session, WT_SESSION_NO_EVICTION |
WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA))
return (0);
diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i
index 20a4d214015..d7ecfd3bda4 100644
--- a/src/third_party/wiredtiger/src/include/cell.i
+++ b/src/third_party/wiredtiger/src/include/cell.i
@@ -182,7 +182,7 @@ __wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size)
p = cell->__chunk + 1;
- if (recno == 0)
+ if (recno == WT_RECNO_OOB)
cell->__chunk[0] = cell_type; /* Type */
else {
cell->__chunk[0] = cell_type | WT_CELL_64V;
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 64043035e76..d8ff261cd82 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -270,7 +270,9 @@ struct __wt_connection_impl {
uint32_t hazard_max; /* Hazard array size */
WT_CACHE *cache; /* Page cache */
- uint64_t cache_size; /* Configured cache size */
+ volatile uint64_t cache_size; /* Cache size (either statically
+ configured or the current size
+ within a cache pool). */
WT_TXN_GLOBAL txn_global; /* Global transaction state */
@@ -292,8 +294,6 @@ struct __wt_connection_impl {
uint64_t ckpt_time_recent; /* Checkpoint time recent/total */
uint64_t ckpt_time_total;
- int compact_in_memory_pass; /* Compaction serialization */
-
#define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */
#define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */
#define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */
@@ -370,6 +370,20 @@ struct __wt_connection_impl {
time_t sweep_interval;/* Handle sweep interval */
u_int sweep_handles_min;/* Handle sweep minimum open */
+ /*
+ * Shared lookaside lock, session and cursor, used by threads accessing
+ * the lookaside table (other than eviction server and worker threads
+ * and the sweep thread, all of which have their own lookaside cursors).
+ */
+ WT_SPINLOCK las_lock; /* Lookaside table spinlock */
+ WT_SESSION_IMPL *las_session; /* Lookaside table session */
+ WT_CURSOR *las_cursor; /* Lookaside table cursor */
+ bool las_written; /* Lookaside table has been written */
+
+ WT_ITEM las_sweep_key; /* Sweep server's saved key */
+ int las_sweep_call;/* Sweep server's call count */
+ uint64_t las_sweep_cnt; /* Sweep server's per-call row count */
+
/* Locked: collator list */
TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh;
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
index 2b3a3221004..2f55dfc8186 100644
--- a/src/third_party/wiredtiger/src/include/cursor.h
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -261,6 +261,7 @@ struct __wt_cursor_index {
WT_CURSOR *child;
WT_CURSOR **cg_cursors;
+ uint8_t *cg_needvalue;
};
struct __wt_cursor_json {
diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i
index 484af0b4a58..e7fed250251 100644
--- a/src/third_party/wiredtiger/src/include/cursor.i
+++ b/src/third_party/wiredtiger/src/include/cursor.i
@@ -32,7 +32,7 @@ __cursor_pos_clear(WT_CURSOR_BTREE *cbt)
* and it's a minimal set of things we need to clear. It would be a
* lot simpler to clear everything, but we call this function a lot.
*/
- cbt->recno = 0;
+ cbt->recno = WT_RECNO_OOB;
cbt->ins = NULL;
cbt->ins_head = NULL;
diff --git a/src/third_party/wiredtiger/src/include/error.h b/src/third_party/wiredtiger/src/include/error.h
index fcb96b16361..abffc02945e 100644
--- a/src/third_party/wiredtiger/src/include/error.h
+++ b/src/third_party/wiredtiger/src/include/error.h
@@ -92,7 +92,8 @@
return (__wt_illegal_value(session, NULL))
#define WT_ILLEGAL_VALUE_ERR(session) \
default: \
- WT_ERR(__wt_illegal_value(session, NULL))
+ ret = __wt_illegal_value(session, NULL); \
+ goto err
#define WT_ILLEGAL_VALUE_SET(session) \
default: \
ret = __wt_illegal_value(session, NULL); \
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index a7b02ec4a75..e5c5a72fe02 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -63,7 +63,7 @@ extern int __wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max);
extern int __wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max);
extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size);
+extern bool __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size);
extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp);
extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid);
extern int __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, const char *cfg[]);
@@ -101,8 +101,9 @@ extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt);
extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp);
extern int __wt_btcur_equals( WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp);
extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
+extern void __wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
extern void __wt_btcur_open(WT_CURSOR_BTREE *cbt);
-extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel);
extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v);
extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile);
@@ -115,12 +116,13 @@ extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *
extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp);
extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
-extern int __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
+extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages);
extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages);
+extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd);
extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]);
extern int __wt_btree_close(WT_SESSION_IMPL *session);
extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno);
@@ -138,15 +140,15 @@ extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *add
extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store);
extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack);
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell);
+extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep);
+extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep);
+extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size);
extern int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
, const char *file, int line
#endif
);
-extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep);
-extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep);
-extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd);
extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]);
extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
@@ -162,7 +164,7 @@ extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok);
extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf);
extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags);
-extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
+extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove);
extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key);
@@ -179,6 +181,14 @@ extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page,
extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key);
extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert);
extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern void __wt_las_stats_update(WT_SESSION_IMPL *session);
+extern int __wt_las_create(WT_SESSION_IMPL *session);
+extern int __wt_las_destroy(WT_SESSION_IMPL *session);
+extern void __wt_las_set_written(WT_SESSION_IMPL *session);
+extern bool __wt_las_is_written(WT_SESSION_IMPL *session);
+extern int __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags);
+extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags);
+extern int __wt_las_sweep(WT_SESSION_IMPL *session);
extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len);
extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str);
extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item);
@@ -237,7 +247,7 @@ extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session);
extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]);
-extern int __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield);
+extern int __wt_log_wrlsn(WT_SESSION_IMPL *session);
extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_logmgr_open(WT_SESSION_IMPL *session);
extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session);
@@ -308,14 +318,14 @@ extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_evict_server_wake(WT_SESSION_IMPL *session);
extern int __wt_evict_create(WT_SESSION_IMPL *session);
extern int __wt_evict_destroy(WT_SESSION_IMPL *session);
-extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, int *evict_resetp);
extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session);
-extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full);
-extern void __wt_cache_dump(WT_SESSION_IMPL *session);
+extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full);
+extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile);
extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
+extern int __wt_log_ckpt_lsn(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
extern int __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn);
extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn);
extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec);
@@ -323,12 +333,13 @@ extern void __wt_log_written_reset(WT_SESSION_IMPL *session);
extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, int active_only);
extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count);
extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id);
+extern int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot);
extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest, int prealloc);
extern int __wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum);
extern int __wt_log_open(WT_SESSION_IMPL *session);
extern int __wt_log_close(WT_SESSION_IMPL *session);
-extern int __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created);
extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp, void *cookie, int firstrecord), void *cookie);
+extern int __wt_log_force_write(WT_SESSION_IMPL *session, int retry);
extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags);
extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap);
extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp);
@@ -354,14 +365,16 @@ extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logr
extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep);
extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int __wt_log_slot_close( WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *releasep, int forced);
+extern int __wt_log_slot_switch_internal(WT_SESSION_IMPL *session, WT_MYSLOT *myslot);
+extern int __wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot);
+extern int __wt_log_slot_new(WT_SESSION_IMPL *session);
extern int __wt_log_slot_init(WT_SESSION_IMPL *session);
extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session);
-extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslotp);
-extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
-extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
-extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
-extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size);
-extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot);
+extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size);
+extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm);
extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm);
extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks);
@@ -474,7 +487,7 @@ extern int __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t siz
extern int __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size);
extern int __wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie);
extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, int is_signalled, WT_CONDVAR **condp);
-extern int __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs);
+extern int __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled);
extern int __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name);
@@ -488,7 +501,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp);
extern int __wt_once(void (*init_routine)(void));
extern int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp);
extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp);
-extern int __wt_absolute_path(const char *path);
+extern bool __wt_absolute_path(const char *path);
extern const char *__wt_path_separator(void);
extern int __wt_has_priv(void);
extern int __wt_remove(WT_SESSION_IMPL *session, const char *name);
@@ -576,6 +589,8 @@ extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*f
extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, int free_buffers);
extern int __wt_session_copy_values(WT_SESSION_IMPL *session);
extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config);
+extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp);
extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata, WT_SESSION_IMPL **sessionp);
extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip);
@@ -638,7 +653,7 @@ extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, cons
extern uint32_t __wt_nlpo2_round(uint32_t v);
extern uint32_t __wt_nlpo2(uint32_t v);
extern uint32_t __wt_log2_int(uint32_t n);
-extern int __wt_ispo2(uint32_t v);
+extern bool __wt_ispo2(uint32_t v);
extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state);
extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
index 031be7e7c59..ca3c3c38245 100644
--- a/src/third_party/wiredtiger/src/include/flags.h
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -18,6 +18,8 @@
#define WT_CONN_SERVER_SWEEP 0x00002000
#define WT_CONN_WAS_BACKUP 0x00004000
#define WT_EVICTING 0x00000001
+#define WT_EVICT_LOOKASIDE 0x00000002
+#define WT_EVICT_UPDATE_RESTORE 0x00000004
#define WT_FILE_TYPE_CHECKPOINT 0x00000001
#define WT_FILE_TYPE_DATA 0x00000002
#define WT_FILE_TYPE_DIRECTORY 0x00000004
@@ -46,17 +48,17 @@
#define WT_SESSION_LOCKED_CHECKPOINT 0x00000008
#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000010
#define WT_SESSION_LOCKED_SCHEMA 0x00000020
-#define WT_SESSION_LOCKED_TABLE 0x00000040
-#define WT_SESSION_LOGGING_INMEM 0x00000080
-#define WT_SESSION_NO_CACHE 0x00000100
-#define WT_SESSION_NO_CACHE_CHECK 0x00000200
-#define WT_SESSION_NO_DATA_HANDLES 0x00000400
-#define WT_SESSION_NO_LOGGING 0x00000800
-#define WT_SESSION_NO_SCHEMA_LOCK 0x00001000
-#define WT_SESSION_QUIET_CORRUPT_FILE 0x00002000
-#define WT_SESSION_SERVER_ASYNC 0x00004000
-#define WT_SKIP_UPDATE_ERR 0x00000002
-#define WT_SKIP_UPDATE_RESTORE 0x00000004
+#define WT_SESSION_LOCKED_SLOT 0x00000040
+#define WT_SESSION_LOCKED_TABLE 0x00000080
+#define WT_SESSION_LOGGING_INMEM 0x00000100
+#define WT_SESSION_LOOKASIDE_CURSOR 0x00000200
+#define WT_SESSION_NO_CACHE 0x00000400
+#define WT_SESSION_NO_DATA_HANDLES 0x00000800
+#define WT_SESSION_NO_EVICTION 0x00001000
+#define WT_SESSION_NO_LOGGING 0x00002000
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00004000
+#define WT_SESSION_QUIET_CORRUPT_FILE 0x00008000
+#define WT_SESSION_SERVER_ASYNC 0x00010000
#define WT_SYNC_CHECKPOINT 0x00000001
#define WT_SYNC_CLOSE 0x00000002
#define WT_SYNC_DISCARD 0x00000004
@@ -90,6 +92,7 @@
#define WT_VERB_VERIFY 0x00200000
#define WT_VERB_VERSION 0x00400000
#define WT_VERB_WRITE 0x00800000
+#define WT_VISIBILITY_ERR 0x00000008
/*
* flags section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h
index 3472985745e..01e33792d73 100644
--- a/src/third_party/wiredtiger/src/include/gcc.h
+++ b/src/third_party/wiredtiger/src/include/gcc.h
@@ -123,7 +123,7 @@ __wt_atomic_sub##name(type *vp, type v) \
{ \
return (__sync_sub_and_fetch(vp, v)); \
} \
-static inline int \
+static inline bool \
__wt_atomic_cas##name(type *vp, type old, type new) \
{ \
return (WT_ATOMIC_CAS(vp, old, new)); \
@@ -145,7 +145,7 @@ WT_ATOMIC_FUNC(size, size_t, size_t)
* __wt_atomic_cas_ptr --
* Pointer compare and swap.
*/
-static inline int
+static inline bool
__wt_atomic_cas_ptr(void *vp, void *old, void *new)
{
return (WT_ATOMIC_CAS((void **)vp, old, new));
diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h
index c9b72f8a609..32353072c5b 100644
--- a/src/third_party/wiredtiger/src/include/hardware.h
+++ b/src/third_party/wiredtiger/src/include/hardware.h
@@ -50,6 +50,16 @@
&(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \
} while (0)
+#define F_CAS_ATOMIC_WAIT(p, mask) do { \
+ int __ret; \
+ for (;;) { \
+ F_CAS_ATOMIC(p, mask, __ret); \
+ if (__ret == 0) \
+ break; \
+ __wt_yield(); \
+ } \
+} while (0)
+
#define F_CLR_ATOMIC(p, mask) do { \
uint8_t __orig; \
do { \
diff --git a/src/third_party/wiredtiger/src/include/lint.h b/src/third_party/wiredtiger/src/include/lint.h
index eba4a1c3b3f..f288fb98683 100644
--- a/src/third_party/wiredtiger/src/include/lint.h
+++ b/src/third_party/wiredtiger/src/include/lint.h
@@ -49,14 +49,14 @@ __wt_atomic_sub##name(type *vp, type v) \
*vp -= v; \
return (*vp); \
} \
-static inline int \
+static inline bool \
__wt_atomic_cas##name(type *vp, type old, type new) \
{ \
if (*vp == old) { \
*vp = new; \
- return (1); \
+ return (true); \
} \
- return (0); \
+ return (false); \
}
WT_ATOMIC_FUNC(8, uint8_t, uint8_t)
@@ -75,13 +75,13 @@ WT_ATOMIC_FUNC(size, size_t, size_t)
* __wt_atomic_cas_ptr --
* Pointer compare and swap.
*/
-static inline int
+static inline bool
__wt_atomic_cas_ptr(void *vp, void *old, void *new) {
if (*(void **)vp == old) {
*(void **)vp = new;
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
static inline void WT_BARRIER(void) { return; }
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
index 949eb09ca30..06be95697c7 100644
--- a/src/third_party/wiredtiger/src/include/log.h
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -12,7 +12,6 @@
/* Logging subsystem declarations. */
#define WT_LOG_ALIGN 128
-#define WT_LOG_SLOT_BUF_SIZE 256 * 1024
#define WT_INIT_LSN(l) do { \
(l)->file = 1; \
@@ -48,63 +47,133 @@
((size) - offsetof(WT_LOG_RECORD, record))
/*
- * Compare 2 LSNs, return -1 if lsn0 < lsn1, 0 if lsn0 == lsn1
- * and 1 if lsn0 > lsn1.
- */
-#define WT_LOG_CMP(lsn1, lsn2) \
- ((lsn1)->file != (lsn2)->file ? \
- ((lsn1)->file < (lsn2)->file ? -1 : 1) : \
- ((lsn1)->offset != (lsn2)->offset ? \
- ((lsn1)->offset < (lsn2)->offset ? -1 : 1) : 0))
-
-/*
* Possible values for the consolidation array slot states:
- * (NOTE: Any new states must be > WT_LOG_SLOT_DONE and < WT_LOG_SLOT_READY.)
*
- * < WT_LOG_SLOT_DONE - threads are actively writing to the log.
- * WT_LOG_SLOT_DONE - all activity on this slot is complete.
+ * WT_LOG_SLOT_CLOSE - slot is in use but closed to new joins.
* WT_LOG_SLOT_FREE - slot is available for allocation.
- * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active.
* WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker.
- * WT_LOG_SLOT_READY - slot is ready for threads to join.
- * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot.
*
* The slot state must be volatile: threads loop checking the state and can't
* cache the first value they see.
+ *
+ * The slot state is divided into two 32 bit sizes. One half is the
+ * amount joined and the other is the amount released. Since we use
+ * a few special states, reserve the top few bits for state. That makes
+ * the maximum size less than 32 bits for both joined and released.
+ */
+
+/*
+ * The high bit is reserved for the special states. If the high bit is
+ * set (WT_LOG_SLOT_RESERVED) then we are guaranteed to be in a special state.
+ */
+#define WT_LOG_SLOT_FREE -1 /* Not in use */
+#define WT_LOG_SLOT_WRITTEN -2 /* Slot data written, not processed */
+
+/*
+ * We allocate the buffer size, but trigger a slot switch when we cross
+ * the maximum size of half the buffer. If a record is more than the buffer
+ * maximum then we trigger a slot switch and write that record unbuffered.
+ * We use a larger buffer to provide overflow space so that we can switch
+ * once we cross the threshold.
+ */
+#define WT_LOG_SLOT_BUF_SIZE (256 * 1024) /* Must be power of 2 */
+#define WT_LOG_SLOT_BUF_MAX ((uint32_t)log->slot_buf_size / 2)
+#define WT_LOG_SLOT_UNBUFFERED (WT_LOG_SLOT_BUF_SIZE << 1)
+
+/*
+ * If new slot states are added, adjust WT_LOG_SLOT_BITS and
+ * WT_LOG_SLOT_MASK_OFF accordingly for how much of the top 32
+ * bits we are using. More slot states here will reduce the maximum
+ * size that a slot can hold unbuffered by half. If a record is
+ * larger than the maximum we can account for in the slot state we fall
+ * back to direct writes.
+ */
+#define WT_LOG_SLOT_BITS 2
+#define WT_LOG_SLOT_MAXBITS (32 - WT_LOG_SLOT_BITS)
+#define WT_LOG_SLOT_CLOSE 0x4000000000000000LL /* Force slot close */
+#define WT_LOG_SLOT_RESERVED 0x8000000000000000LL /* Reserved states */
+
+/*
+ * Check if the unbuffered flag is set in the joined portion of
+ * the slot state.
*/
-#define WT_LOG_SLOT_DONE 0
-#define WT_LOG_SLOT_FREE 1
-#define WT_LOG_SLOT_PENDING 2
-#define WT_LOG_SLOT_WRITTEN 3
-#define WT_LOG_SLOT_READY 4
+#define WT_LOG_SLOT_UNBUFFERED_ISSET(state) \
+ ((state) & ((int64_t)WT_LOG_SLOT_UNBUFFERED << 32))
+
+#define WT_LOG_SLOT_MASK_OFF 0x3fffffffffffffffLL
+#define WT_LOG_SLOT_MASK_ON ~(WT_LOG_SLOT_MASK_OFF)
+#define WT_LOG_SLOT_JOIN_MASK (WT_LOG_SLOT_MASK_OFF >> 32)
+
+/*
+ * These macros manipulate the slot state and its component parts.
+ */
+#define WT_LOG_SLOT_FLAGS(state) ((state) & WT_LOG_SLOT_MASK_ON)
+#define WT_LOG_SLOT_JOINED(state) (((state) & WT_LOG_SLOT_MASK_OFF) >> 32)
+#define WT_LOG_SLOT_JOINED_BUFFERED(state) \
+ (WT_LOG_SLOT_JOINED(state) & \
+ (WT_LOG_SLOT_UNBUFFERED - 1))
+#define WT_LOG_SLOT_JOIN_REL(j, r, s) (((j) << 32) + (r) + (s))
+#define WT_LOG_SLOT_RELEASED(state) ((int64_t)(int32_t)(state))
+#define WT_LOG_SLOT_RELEASED_BUFFERED(state) \
+ ((int64_t)((int32_t)WT_LOG_SLOT_RELEASED(state) & \
+ (WT_LOG_SLOT_UNBUFFERED - 1)))
+
+/* Slot is in use */
+#define WT_LOG_SLOT_ACTIVE(state) \
+ (WT_LOG_SLOT_JOINED(state) != WT_LOG_SLOT_JOIN_MASK)
+/* Slot is in use, but closed to new joins */
+#define WT_LOG_SLOT_CLOSED(state) \
+ (WT_LOG_SLOT_ACTIVE(state) && \
+ (FLD64_ISSET((uint64_t)state, WT_LOG_SLOT_CLOSE) && \
+ !FLD64_ISSET((uint64_t)state, WT_LOG_SLOT_RESERVED)))
+/* Slot is in use, all data copied into buffer */
+#define WT_LOG_SLOT_INPROGRESS(state) \
+ (WT_LOG_SLOT_RELEASED(state) != WT_LOG_SLOT_JOINED(state))
+#define WT_LOG_SLOT_DONE(state) \
+ (WT_LOG_SLOT_CLOSED(state) && \
+ !WT_LOG_SLOT_INPROGRESS(state))
+/* Slot is in use, more threads may join this slot */
+#define WT_LOG_SLOT_OPEN(state) \
+ (WT_LOG_SLOT_ACTIVE(state) && \
+ !WT_LOG_SLOT_UNBUFFERED_ISSET(state) && \
+ !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) && \
+ WT_LOG_SLOT_JOINED(state) < WT_LOG_SLOT_BUF_MAX)
+
struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_logslot {
volatile int64_t slot_state; /* Slot state */
- uint64_t slot_group_size; /* Group size */
+ int64_t slot_unbuffered; /* Unbuffered data in this slot */
int32_t slot_error; /* Error value */
-#define WT_SLOT_INVALID_INDEX 0xffffffff
- uint32_t slot_index; /* Active slot index */
wt_off_t slot_start_offset; /* Starting file offset */
- WT_LSN slot_release_lsn; /* Slot release LSN */
- WT_LSN slot_start_lsn; /* Slot starting LSN */
- WT_LSN slot_end_lsn; /* Slot ending LSN */
+ wt_off_t slot_last_offset; /* Last record offset */
+ WT_LSN slot_release_lsn; /* Slot release LSN */
+ WT_LSN slot_start_lsn; /* Slot starting LSN */
+ WT_LSN slot_end_lsn; /* Slot ending LSN */
WT_FH *slot_fh; /* File handle for this group */
- WT_ITEM slot_buf; /* Buffer for grouped writes */
- int32_t slot_churn; /* Active slots are scarce. */
+ WT_ITEM slot_buf; /* Buffer for grouped writes */
-#define WT_SLOT_BUFFERED 0x01 /* Buffer writes */
-#define WT_SLOT_CLOSEFH 0x02 /* Close old fh on release */
-#define WT_SLOT_SYNC 0x04 /* Needs sync on release */
-#define WT_SLOT_SYNC_DIR 0x08 /* Directory sync on release */
+#define WT_SLOT_CLOSEFH 0x01 /* Close old fh on release */
+#define WT_SLOT_SYNC 0x02 /* Needs sync on release */
+#define WT_SLOT_SYNC_DIR 0x04 /* Directory sync on release */
uint32_t flags; /* Flags */
};
-#define WT_SLOT_INIT_FLAGS (WT_SLOT_BUFFERED)
+#define WT_SLOT_INIT_FLAGS 0
+
+#define WT_WITH_SLOT_LOCK(session, log, op) do { \
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); \
+ WT_WITH_LOCK(session, \
+ &log->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \
+} while (0)
struct __wt_myslot {
- WT_LOGSLOT *slot;
- wt_off_t offset;
+ WT_LOGSLOT *slot; /* Slot I'm using */
+ wt_off_t end_offset; /* My end offset in buffer */
+ wt_off_t offset; /* Slot buffer offset */
+#define WT_MYSLOT_CLOSE 0x01 /* This thread is closing the slot */
+#define WT_MYSLOT_UNBUFFERED 0x02 /* Write directly */
+ uint32_t flags; /* Flags */
};
- /* Offset of first record */
+
#define WT_LOG_FIRST_RECORD log->allocsize
struct __wt_log {
@@ -118,8 +187,9 @@ struct __wt_log {
uint32_t tmp_fileid; /* Temporary file number */
uint32_t prep_missed; /* Pre-allocated file misses */
WT_FH *log_fh; /* Logging file handle */
- WT_FH *log_close_fh; /* Logging file handle to close */
WT_FH *log_dir_fh; /* Log directory file handle */
+ WT_FH *log_close_fh; /* Logging file handle to close */
+ WT_LSN log_close_lsn; /* LSN needed to close */
/*
* System LSNs
@@ -140,8 +210,9 @@ struct __wt_log {
WT_SPINLOCK log_lock; /* Locked: Logging fields */
WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */
WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */
+ WT_SPINLOCK log_writelsn_lock; /* Locked: write LSN */
- WT_RWLOCK *log_archive_lock; /* Archive and log cursors */
+ WT_RWLOCK *log_archive_lock; /* Archive and log cursors */
/* Notify any waiting threads when sync_lsn is updated. */
WT_CONDVAR *log_sync_cond;
@@ -150,7 +221,6 @@ struct __wt_log {
/*
* Consolidation array information
- * WT_SLOT_ACTIVE must be less than WT_SLOT_POOL.
* Our testing shows that the more consolidation we generate the
* better the performance we see which equates to an active slot
* slot count of one.
@@ -158,13 +228,14 @@ struct __wt_log {
* Note: this can't be an array, we impose cache-line alignment and
* gcc doesn't support that for arrays.
*/
-#define WT_SLOT_ACTIVE 1
#define WT_SLOT_POOL 128
- WT_LOGSLOT *slot_array[WT_SLOT_ACTIVE]; /* Active slots */
- WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */
- size_t slot_buf_size; /* Buffer size for slots */
+ WT_LOGSLOT *active_slot; /* Active slot */
+ WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */
+ size_t slot_buf_size; /* Buffer size for slots */
+#ifdef HAVE_DIAGNOSTIC
+ uint64_t write_calls; /* Calls to log_write */
+#endif
-#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/log.i b/src/third_party/wiredtiger/src/include/log.i
new file mode 100644
index 00000000000..ff309c31265
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/log.i
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+static inline int __wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2);
+
+/*
+ * __wt_log_cmp --
+ * Compare 2 LSNs, return -1 if lsn1 < lsn2, 0 if lsn1 == lsn2
+ * and 1 if lsn1 > lsn2.
+ */
+static inline int
+__wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2)
+{
+ WT_LSN l1, l2;
+
+ /*
+ * Read LSNs into local variables so that we only read each field
+ * once and all comparisons are on the same values.
+ */
+ l1 = *(volatile WT_LSN *)lsn1;
+ l2 = *(volatile WT_LSN *)lsn2;
+
+ /*
+ * If the file numbers are different we don't need to compare the
+ * offset.
+ */
+ if (l1.file != l2.file)
+ return (l1.file < l2.file ? -1 : 1);
+ /*
+ * If the file numbers are the same, compare the offset.
+ */
+ if (l1.offset != l2.offset)
+ return (l1.offset < l2.offset ? -1 : 1);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h
index 66547262417..a5a303f1630 100644
--- a/src/third_party/wiredtiger/src/include/meta.h
+++ b/src/third_party/wiredtiger/src/include/meta.h
@@ -21,7 +21,9 @@
#define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */
#define WT_METADATA_URI "metadata:" /* Metadata alias */
-#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata file URI */
+#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */
+
+#define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI */
/*
* Pre computed hash for the metadata file. Used to optimize comparisons
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index 7fb6ae13d38..1b2cbf11fc2 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -130,6 +130,7 @@
#define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask)))
#define FLD_ISSET(field, mask) ((field) & ((uint32_t)(mask)))
+#define FLD64_ISSET(field, mask) ((field) & ((uint64_t)(mask)))
#define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask)))
/*
diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i
index 98facff02b9..6b502c4c1d1 100644
--- a/src/third_party/wiredtiger/src/include/misc.i
+++ b/src/third_party/wiredtiger/src/include/misc.i
@@ -7,6 +7,18 @@
*/
/*
+ * __wt_cond_wait --
+ * Wait on a mutex, optionally timing out.
+ */
+static inline int
+__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+{
+ int notused;
+
+ return (__wt_cond_wait_signal(session, cond, usecs, &notused));
+}
+
+/*
* __wt_strdup --
* ANSI strdup function.
*/
diff --git a/src/third_party/wiredtiger/src/include/msvc.h b/src/third_party/wiredtiger/src/include/msvc.h
index f4d8ba52fc1..8f5aa9abde8 100644
--- a/src/third_party/wiredtiger/src/include/msvc.h
+++ b/src/third_party/wiredtiger/src/include/msvc.h
@@ -52,7 +52,7 @@ __wt_atomic_sub##name(type *vp, type v) \
{ \
return (_InterlockedExchangeAdd ## s((t *)(vp), - (t)v) - (v)); \
} \
-static inline int \
+static inline bool \
__wt_atomic_cas##name(type *vp, type old, type new) \
{ \
return (_InterlockedCompareExchange ## s \
@@ -75,7 +75,7 @@ WT_ATOMIC_FUNC(size, size_t, size_t, 64, __int64)
* __wt_atomic_cas_ptr --
* Pointer compare and swap.
*/
-static inline int
+static inline bool
__wt_atomic_cas_ptr(void *vp, void *old, void *new)
{
return (_InterlockedCompareExchange64(
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
index 7b62e66eccb..d90b29c2133 100644
--- a/src/third_party/wiredtiger/src/include/serial.i
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -123,7 +123,7 @@ __col_append_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
* If the application didn't specify a record number, allocate a new one
* and set up for an append.
*/
- if ((recno = WT_INSERT_RECNO(new_ins)) == 0) {
+ if ((recno = WT_INSERT_RECNO(new_ins)) == WT_RECNO_OOB) {
recno = WT_INSERT_RECNO(new_ins) = btree->last_recno + 1;
WT_ASSERT(session, WT_SKIP_LAST(ins_head) == NULL ||
recno > WT_INSERT_RECNO(WT_SKIP_LAST(ins_head)));
@@ -292,25 +292,37 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
__wt_page_modify_set(session, page);
/*
- * If there are subsequent WT_UPDATE structures, we're evicting pages
- * and the page-scanning mutex isn't held, discard obsolete WT_UPDATE
- * structures. Serialization is needed so only one thread does the
- * obsolete check at a time, and to protect updates from disappearing
- * under reconciliation.
+ * If there are no subsequent WT_UPDATE structures we are done here.
*/
- if (upd->next != NULL &&
- __wt_txn_visible_all(session, page->modify->obsolete_check_txn)) {
- F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
- /* If we can't lock it, don't scan, that's okay. */
- if (ret != 0)
- return (0);
- obsolete = __wt_update_obsolete_check(session, page, upd->next);
- F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
- if (obsolete != NULL) {
+ if (upd->next == NULL)
+ return (0);
+ /*
+ * We would like to call __wt_txn_update_oldest only in the event that
+ * there are further updates to this page, the check against WT_TXN_NONE
+ * is used as an indicator of there being further updates on this page.
+ */
+ if (page->modify->obsolete_check_txn != WT_TXN_NONE) {
+ if (!__wt_txn_visible_all(session,
+ page->modify->obsolete_check_txn)) {
+ /* Try to move the oldest ID forward and re-check */
+ __wt_txn_update_oldest(session,0);
+ }
+ if (!__wt_txn_visible_all(session,
+ page->modify->obsolete_check_txn)) {
page->modify->obsolete_check_txn = WT_TXN_NONE;
- __wt_update_obsolete_free(session, page, obsolete);
+ return (0);
}
}
+ F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret);
+
+ /* If we can't lock it, don't scan, that's okay. */
+ if (ret != 0)
+ return (0);
+ obsolete = __wt_update_obsolete_check(session, page, upd->next);
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+ if (obsolete != NULL) {
+ __wt_update_obsolete_free(session, page, obsolete);
+ }
return (0);
}
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index c6c246954f7..a691794fd46 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -76,6 +76,11 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */
WT_COMPACT *compact; /* Compact state */
+ /*
+ * Lookaside table cursor, sweep and eviction worker threads only.
+ */
+ WT_CURSOR *las_cursor; /* Lookaside table cursor */
+
WT_DATA_HANDLE *meta_dhandle; /* Metadata file */
void *meta_track; /* Metadata operation tracking */
void *meta_track_next; /* Current position */
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index 6ecb6b3a3c7..cd2c149bc94 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -276,11 +276,17 @@ struct __wt_connection_stats {
int64_t cache_eviction_walk;
int64_t cache_eviction_worker_evicting;
int64_t cache_inmem_split;
+ int64_t cache_inmem_splittable;
+ int64_t cache_lookaside_insert;
+ int64_t cache_lookaside_remove;
int64_t cache_overhead;
int64_t cache_pages_dirty;
int64_t cache_pages_inuse;
int64_t cache_read;
+ int64_t cache_read_lookaside;
int64_t cache_write;
+ int64_t cache_write_lookaside;
+ int64_t cache_write_restore;
int64_t cond_wait;
int64_t cursor_create;
int64_t cursor_insert;
@@ -323,9 +329,9 @@ struct __wt_connection_stats {
int64_t log_slot_consolidated;
int64_t log_slot_joins;
int64_t log_slot_races;
- int64_t log_slot_toobig;
- int64_t log_slot_toosmall;
+ int64_t log_slot_switch_busy;
int64_t log_slot_transitions;
+ int64_t log_slot_unbuffered;
int64_t log_sync;
int64_t log_sync_dir;
int64_t log_write_lsn;
@@ -400,6 +406,7 @@ struct __wt_dsrc_stats {
int64_t btree_column_deleted;
int64_t btree_column_fix;
int64_t btree_column_internal;
+ int64_t btree_column_rle;
int64_t btree_column_variable;
int64_t btree_compact_rewrite;
int64_t btree_entries;
@@ -424,10 +431,14 @@ struct __wt_dsrc_stats {
int64_t cache_eviction_internal;
int64_t cache_eviction_split;
int64_t cache_inmem_split;
+ int64_t cache_inmem_splittable;
int64_t cache_overflow_value;
int64_t cache_read;
+ int64_t cache_read_lookaside;
int64_t cache_read_overflow;
int64_t cache_write;
+ int64_t cache_write_lookaside;
+ int64_t cache_write_restore;
int64_t compress_raw_fail;
int64_t compress_raw_fail_temporary;
int64_t compress_raw_ok;
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 0e7be1be6bc..4a325c70a95 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -78,9 +78,8 @@ struct __wt_txn_global {
};
typedef enum __wt_txn_isolation {
- WT_ISO_EVICTION, /* Internal: eviction context */
- WT_ISO_READ_UNCOMMITTED,
WT_ISO_READ_COMMITTED,
+ WT_ISO_READ_UNCOMMITTED,
WT_ISO_SNAPSHOT
} WT_TXN_ISOLATION;
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 1228893871f..2b42990f5e5 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -140,12 +140,22 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
}
/*
+ * __wt_txn_committed --
+ * Return if a transaction has been committed.
+ */
+static inline bool
+__wt_txn_committed(WT_SESSION_IMPL *session, uint64_t id)
+{
+ return (WT_TXNID_LT(id, S2C(session)->txn_global.last_running));
+}
+
+/*
* __wt_txn_visible_all --
* Check if a given transaction ID is "globally visible". This is, if
* all sessions in the system will see the transaction ID including the
* ID that belongs to a running checkpoint.
*/
-static inline int
+static inline bool
__wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id)
{
uint64_t oldest_id;
@@ -159,28 +169,21 @@ __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id)
* __wt_txn_visible --
* Can the current transaction see the given ID?
*/
-static inline int
+static inline bool
__wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
{
WT_TXN *txn;
- int found;
+ bool found;
txn = &session->txn;
/* Changes with no associated transaction are always visible. */
if (id == WT_TXN_NONE)
- return (1);
+ return (true);
/* Nobody sees the results of aborted transactions. */
if (id == WT_TXN_ABORTED)
- return (0);
-
- /*
- * Eviction only sees globally visible updates, or if there is a
- * checkpoint transaction running, use its transaction.
- */
- if (txn->isolation == WT_ISO_EVICTION)
- return (__wt_txn_visible_all(session, id));
+ return (false);
/*
* Read-uncommitted transactions see all other changes.
@@ -194,11 +197,11 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
*/
if (txn->isolation == WT_ISO_READ_UNCOMMITTED ||
session->dhandle == session->meta_dhandle)
- return (1);
+ return (true);
/* Transactions see their own changes. */
if (id == txn->id)
- return (1);
+ return (true);
/*
* WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is
@@ -210,9 +213,9 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
* snapshot is empty.
*/
if (WT_TXNID_LE(txn->snap_max, id))
- return (0);
+ return (false);
if (txn->snapshot_count == 0 || WT_TXNID_LT(id, txn->snap_min))
- return (1);
+ return (true);
WT_BINARY_SEARCH(id, txn->snapshot, txn->snapshot_count, found);
return (!found);
@@ -266,7 +269,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
}
F_SET(txn, WT_TXN_RUNNING);
- return (0);
+ return (false);
}
/*
@@ -477,7 +480,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
* __wt_txn_am_oldest --
* Am I the oldest transaction in the system?
*/
-static inline int
+static inline bool
__wt_txn_am_oldest(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
@@ -492,12 +495,12 @@ __wt_txn_am_oldest(WT_SESSION_IMPL *session)
txn_global = &conn->txn_global;
if (txn->id == WT_TXN_NONE)
- return (0);
+ return (false);
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (i = 0, s = txn_global->states; i < session_cnt; i++, s++)
if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, txn->id))
- return (0);
+ return (false);
- return (1);
+ return (true);
}
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index ddcbf19b847..71ba3f41a44 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -1750,6 +1750,9 @@ struct __wt_connection {
* @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the name of a cache that
* is shared between databases or \c "none" when no shared cache is
* configured., a string; default \c none.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;quota, maximum size of cache this
+ * database can be allocated from the shared cache. Defaults to the
+ * entire shared cache size., an integer; default \c 0.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this
* database is guaranteed to have available from the shared cache. This
* setting is per database. Defaults to the chunk size., an integer;
@@ -2216,10 +2219,12 @@ struct __wt_connection {
* @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the name of a cache that is shared
* between databases or \c "none" when no shared cache is configured., a string;
* default \c none.}
- * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache
- * this database is guaranteed to have available from the shared cache. This
- * setting is per database. Defaults to the chunk size., an integer; default \c
- * 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;quota, maximum size of
+ * cache this database can be allocated from the shared cache. Defaults to the
+ * entire shared cache size., an integer; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this database is
+ * guaranteed to have available from the shared cache. This setting is per
+ * database. Defaults to the chunk size., an integer; default \c 0.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory to allocate for the
* shared cache. Setting this will update the value if one is already set., an
* integer between 1MB and 10TB; default \c 500MB.}
@@ -3642,198 +3647,210 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1047
/*! cache: in-memory page splits */
#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1048
+/*! cache: in-memory page passed criteria to be split */
+#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1049
+/*! cache: lookaside table insert calls */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1050
+/*! cache: lookaside table remove calls */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1051
/*! cache: percentage overhead */
-#define WT_STAT_CONN_CACHE_OVERHEAD 1049
+#define WT_STAT_CONN_CACHE_OVERHEAD 1052
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1050
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1053
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1051
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1054
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1052
+#define WT_STAT_CONN_CACHE_READ 1055
+/*! cache: pages read into cache requiring lookaside entries */
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1056
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1053
+#define WT_STAT_CONN_CACHE_WRITE 1057
+/*! cache: pages written requiring lookaside records */
+#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1058
+/*! cache: pages written requiring in-memory restoration */
+#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1059
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1054
+#define WT_STAT_CONN_COND_WAIT 1060
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1055
+#define WT_STAT_CONN_CURSOR_CREATE 1061
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1056
+#define WT_STAT_CONN_CURSOR_INSERT 1062
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1057
+#define WT_STAT_CONN_CURSOR_NEXT 1063
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1058
+#define WT_STAT_CONN_CURSOR_PREV 1064
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1059
+#define WT_STAT_CONN_CURSOR_REMOVE 1065
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1060
+#define WT_STAT_CONN_CURSOR_RESET 1066
/*! cursor: cursor restarted searches */
-#define WT_STAT_CONN_CURSOR_RESTART 1061
+#define WT_STAT_CONN_CURSOR_RESTART 1067
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1062
+#define WT_STAT_CONN_CURSOR_SEARCH 1068
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1063
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1069
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1064
+#define WT_STAT_CONN_CURSOR_UPDATE 1070
/*! data-handle: connection data handles currently active */
-#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1065
+#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1071
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1066
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1072
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1067
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1073
/*! data-handle: connection sweep dhandles closed */
-#define WT_STAT_CONN_DH_SWEEP_CLOSE 1068
+#define WT_STAT_CONN_DH_SWEEP_CLOSE 1074
/*! data-handle: connection sweep candidate became referenced */
-#define WT_STAT_CONN_DH_SWEEP_REF 1069
+#define WT_STAT_CONN_DH_SWEEP_REF 1075
/*! data-handle: connection sweep dhandles removed from hash list */
-#define WT_STAT_CONN_DH_SWEEP_REMOVE 1070
+#define WT_STAT_CONN_DH_SWEEP_REMOVE 1076
/*! data-handle: connection sweep time-of-death sets */
-#define WT_STAT_CONN_DH_SWEEP_TOD 1071
+#define WT_STAT_CONN_DH_SWEEP_TOD 1077
/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_SWEEPS 1072
+#define WT_STAT_CONN_DH_SWEEPS 1078
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1073
+#define WT_STAT_CONN_FILE_OPEN 1079
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1074
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1080
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1075
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1081
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1076
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1082
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1077
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1083
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1078
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1084
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1079
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1085
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1080
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1086
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1081
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1087
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1082
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1088
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1083
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1089
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1084
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1090
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1085
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1091
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1086
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1092
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1087
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1093
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1088
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1094
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1089
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1095
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1090
+#define WT_STAT_CONN_LOG_SCANS 1096
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1091
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1097
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1092
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1098
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1093
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1099
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1094
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1100
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1095
-/*! log: record size exceeded maximum */
-#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1096
-/*! log: failed to find a slot large enough for record */
-#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1097
+#define WT_STAT_CONN_LOG_SLOT_RACES 1101
+/*! log: busy returns attempting to switch slots */
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1102
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1098
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1103
+/*! log: consolidated slot unbuffered writes */
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1104
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1099
+#define WT_STAT_CONN_LOG_SYNC 1105
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1100
+#define WT_STAT_CONN_LOG_SYNC_DIR 1106
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1101
+#define WT_STAT_CONN_LOG_WRITE_LSN 1107
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1102
+#define WT_STAT_CONN_LOG_WRITES 1108
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1103
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1109
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1104
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1110
/*! LSM: rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1105
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1111
/*! LSM: application work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1106
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1112
/*! LSM: merge work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1107
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1113
/*! LSM: tree queue hit maximum */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1108
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1114
/*! LSM: switch work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1109
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1115
/*! LSM: tree maintenance operations scheduled */
-#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1110
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1116
/*! LSM: tree maintenance operations discarded */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1111
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1117
/*! LSM: tree maintenance operations executed */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1112
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1118
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1113
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1119
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1114
+#define WT_STAT_CONN_MEMORY_FREE 1120
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1115
+#define WT_STAT_CONN_MEMORY_GROW 1121
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1116
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1122
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1117
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1123
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1118
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1124
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1119
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1125
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1120
+#define WT_STAT_CONN_PAGE_SLEEP 1126
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1121
+#define WT_STAT_CONN_READ_IO 1127
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1122
+#define WT_STAT_CONN_REC_PAGES 1128
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1123
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1129
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1124
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1130
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1125
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1131
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1126
+#define WT_STAT_CONN_RWLOCK_READ 1132
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1127
+#define WT_STAT_CONN_RWLOCK_WRITE 1133
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1128
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1134
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1129
+#define WT_STAT_CONN_SESSION_OPEN 1135
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1130
+#define WT_STAT_CONN_TXN_BEGIN 1136
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1131
+#define WT_STAT_CONN_TXN_CHECKPOINT 1137
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1132
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1138
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1133
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1139
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1134
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1140
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1135
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1141
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1136
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1142
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1137
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1143
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1138
+#define WT_STAT_CONN_TXN_COMMIT 1144
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1139
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1145
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1140
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1146
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1141
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1147
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1142
+#define WT_STAT_CONN_TXN_ROLLBACK 1148
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1143
+#define WT_STAT_CONN_TXN_SYNC 1149
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1144
+#define WT_STAT_CONN_WRITE_IO 1150
/*!
* @}
@@ -3883,148 +3900,158 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2019
/*! btree: column-store internal pages */
#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2020
+/*! btree: column-store variable-size RLE encoded values */
+#define WT_STAT_DSRC_BTREE_COLUMN_RLE 2021
/*! btree: column-store variable-size leaf pages */
-#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2021
+#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2022
/*! btree: pages rewritten by compaction */
-#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2022
+#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2023
/*! btree: number of key/value pairs */
-#define WT_STAT_DSRC_BTREE_ENTRIES 2023
+#define WT_STAT_DSRC_BTREE_ENTRIES 2024
/*! btree: fixed-record size */
-#define WT_STAT_DSRC_BTREE_FIXED_LEN 2024
+#define WT_STAT_DSRC_BTREE_FIXED_LEN 2025
/*! btree: maximum tree depth */
-#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2025
+#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2026
/*! btree: maximum internal page key size */
-#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2026
+#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2027
/*! btree: maximum internal page size */
-#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2027
+#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2028
/*! btree: maximum leaf page key size */
-#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2028
+#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2029
/*! btree: maximum leaf page size */
-#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2029
+#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2030
/*! btree: maximum leaf page value size */
-#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2030
+#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2031
/*! btree: overflow pages */
-#define WT_STAT_DSRC_BTREE_OVERFLOW 2031
+#define WT_STAT_DSRC_BTREE_OVERFLOW 2032
/*! btree: row-store internal pages */
-#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2032
+#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2033
/*! btree: row-store leaf pages */
-#define WT_STAT_DSRC_BTREE_ROW_LEAF 2033
+#define WT_STAT_DSRC_BTREE_ROW_LEAF 2034
/*! cache: bytes read into cache */
-#define WT_STAT_DSRC_CACHE_BYTES_READ 2034
+#define WT_STAT_DSRC_CACHE_BYTES_READ 2035
/*! cache: bytes written from cache */
-#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2035
+#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2036
/*! cache: checkpoint blocked page eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2036
+#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2037
/*! cache: unmodified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2037
+#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2038
/*! cache: page split during eviction deepened the tree */
-#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2038
+#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2039
/*! cache: modified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2039
+#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2040
/*! cache: data source pages selected for eviction unable to be evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2040
+#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2041
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2041
+#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042
/*! cache: internal pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2042
+#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043
/*! cache: pages split during eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2043
+#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2044
/*! cache: in-memory page splits */
-#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2044
+#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2045
+/*! cache: in-memory page passed criteria to be split */
+#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2046
/*! cache: overflow values cached in memory */
-#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2045
+#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2047
/*! cache: pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ 2046
+#define WT_STAT_DSRC_CACHE_READ 2048
+/*! cache: pages read into cache requiring lookaside entries */
+#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2049
/*! cache: overflow pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2047
+#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2050
/*! cache: pages written from cache */
-#define WT_STAT_DSRC_CACHE_WRITE 2048
+#define WT_STAT_DSRC_CACHE_WRITE 2051
+/*! cache: pages written requiring lookaside records */
+#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2052
+/*! cache: pages written requiring in-memory restoration */
+#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2053
/*! compression: raw compression call failed, no additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2049
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2054
/*! compression: raw compression call failed, additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2050
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2055
/*! compression: raw compression call succeeded */
-#define WT_STAT_DSRC_COMPRESS_RAW_OK 2051
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 2056
/*! compression: compressed pages read */
-#define WT_STAT_DSRC_COMPRESS_READ 2052
+#define WT_STAT_DSRC_COMPRESS_READ 2057
/*! compression: compressed pages written */
-#define WT_STAT_DSRC_COMPRESS_WRITE 2053
+#define WT_STAT_DSRC_COMPRESS_WRITE 2058
/*! compression: page written failed to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2054
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2059
/*! compression: page written was too small to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2055
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2060
/*! cursor: create calls */
-#define WT_STAT_DSRC_CURSOR_CREATE 2056
+#define WT_STAT_DSRC_CURSOR_CREATE 2061
/*! cursor: insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT 2057
+#define WT_STAT_DSRC_CURSOR_INSERT 2062
/*! cursor: bulk-loaded cursor-insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2058
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2063
/*! cursor: cursor-insert key and value bytes inserted */
-#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2059
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2064
/*! cursor: next calls */
-#define WT_STAT_DSRC_CURSOR_NEXT 2060
+#define WT_STAT_DSRC_CURSOR_NEXT 2065
/*! cursor: prev calls */
-#define WT_STAT_DSRC_CURSOR_PREV 2061
+#define WT_STAT_DSRC_CURSOR_PREV 2066
/*! cursor: remove calls */
-#define WT_STAT_DSRC_CURSOR_REMOVE 2062
+#define WT_STAT_DSRC_CURSOR_REMOVE 2067
/*! cursor: cursor-remove key bytes removed */
-#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2063
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2068
/*! cursor: reset calls */
-#define WT_STAT_DSRC_CURSOR_RESET 2064
+#define WT_STAT_DSRC_CURSOR_RESET 2069
/*! cursor: restarted searches */
-#define WT_STAT_DSRC_CURSOR_RESTART 2065
+#define WT_STAT_DSRC_CURSOR_RESTART 2070
/*! cursor: search calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH 2066
+#define WT_STAT_DSRC_CURSOR_SEARCH 2071
/*! cursor: search near calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2067
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2072
/*! cursor: update calls */
-#define WT_STAT_DSRC_CURSOR_UPDATE 2068
+#define WT_STAT_DSRC_CURSOR_UPDATE 2073
/*! cursor: cursor-update value bytes updated */
-#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2069
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2074
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2070
+#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2075
/*! LSM: chunks in the LSM tree */
-#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2071
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2076
/*! LSM: highest merge generation in the LSM tree */
-#define WT_STAT_DSRC_LSM_GENERATION_MAX 2072
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 2077
/*! LSM: queries that could have benefited from a Bloom filter that did
* not exist */
-#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2073
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2078
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2074
+#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2079
/*! reconciliation: dictionary matches */
-#define WT_STAT_DSRC_REC_DICTIONARY 2075
+#define WT_STAT_DSRC_REC_DICTIONARY 2080
/*! reconciliation: internal page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2076
+#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2081
/*! reconciliation: leaf page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2077
+#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2082
/*! reconciliation: maximum blocks required for a page */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2078
+#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2083
/*! reconciliation: internal-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2079
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2084
/*! reconciliation: leaf-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2080
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2085
/*! reconciliation: overflow values written */
-#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2081
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2086
/*! reconciliation: pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE 2082
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2087
/*! reconciliation: page checksum matches */
-#define WT_STAT_DSRC_REC_PAGE_MATCH 2083
+#define WT_STAT_DSRC_REC_PAGE_MATCH 2088
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_DSRC_REC_PAGES 2084
+#define WT_STAT_DSRC_REC_PAGES 2089
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_DSRC_REC_PAGES_EVICTION 2085
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2090
/*! reconciliation: leaf page key bytes discarded using prefix compression */
-#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2086
+#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2091
/*! reconciliation: internal page key bytes discarded using suffix
* compression */
-#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2087
+#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2092
/*! session: object compaction */
-#define WT_STAT_DSRC_SESSION_COMPACT 2088
+#define WT_STAT_DSRC_SESSION_COMPACT 2093
/*! session: open cursor count */
-#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2089
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2094
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2090
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2095
/*! @} */
/*
* Statistics section: END
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index 9cc2ce2135a..4d46a25b63c 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -41,6 +41,7 @@ extern "C" {
#else
#include <pthread.h>
#endif
+#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdint.h>
@@ -245,6 +246,8 @@ struct __wt_rwlock;
typedef struct __wt_rwlock WT_RWLOCK;
struct __wt_salvage_cookie;
typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE;
+struct __wt_save_upd;
+ typedef struct __wt_save_upd WT_SAVE_UPD;
struct __wt_scratch_track;
typedef struct __wt_scratch_track WT_SCRATCH_TRACK;
struct __wt_session_impl;
@@ -265,8 +268,6 @@ struct __wt_txn_op;
typedef struct __wt_txn_op WT_TXN_OP;
struct __wt_txn_state;
typedef struct __wt_txn_state WT_TXN_STATE;
-struct __wt_upd_skipped;
- typedef struct __wt_upd_skipped WT_UPD_SKIPPED;
struct __wt_update;
typedef struct __wt_update WT_UPDATE;
union __wt_rand_state;
@@ -335,6 +336,7 @@ union __wt_rand_state;
#include "cache.i" /* required by txn.i */
#include "cell.i" /* required by btree.i */
+#include "log.i"
#include "mutex.i" /* required by btree.i */
#include "txn.i" /* required by btree.i */
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index 26ba34c7f93..574442f645c 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -34,6 +34,24 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
}
/*
+ * __wt_log_ckpt_lsn --
+ * Force out buffered records and return an LSN for checkpoint.
+ */
+int
+__wt_log_ckpt_lsn(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ WT_RET(__wt_log_force_write(session, 1));
+ WT_RET(__wt_log_wrlsn(session));
+ *ckp_lsn = log->write_start_lsn;
+ return (0);
+}
+
+/*
* __wt_log_background --
* Record the given LSN as the background LSN and signal the
* thread as needed.
@@ -53,7 +71,7 @@ __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn)
* needed.
*/
__wt_spin_lock(session, &log->log_sync_lock);
- if (WT_LOG_CMP(lsn, &log->bg_sync_lsn) > 0)
+ if (__wt_log_cmp(lsn, &log->bg_sync_lsn) > 0)
log->bg_sync_lsn = *lsn;
__wt_spin_unlock(session, &log->log_sync_lock);
return (__wt_cond_signal(session, conn->log_file_cond));
@@ -100,7 +118,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
/*
* Sync the log file if needed.
*/
- if (WT_LOG_CMP(&log->sync_lsn, min_lsn) < 0) {
+ if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) {
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_force_sync: sync to LSN %d/%lu",
min_lsn->file, min_lsn->offset));
@@ -241,6 +259,11 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session,
log = S2C(session)->log;
*maxid = 0;
+ /*
+ * These may be files needed by backup. Force the current slot
+ * to get written to the file.
+ */
+ WT_RET(__wt_log_force_write(session, 1));
WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count));
/* Filter out any files that are below the checkpoint LSN. */
@@ -354,70 +377,12 @@ static int
__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
{
WT_CONNECTION_IMPL *conn;
-
- conn = S2C(session);
- return (lsn->offset + (wt_off_t)recsize < conn->log_file_max);
-}
-
-/*
- * __log_acquire --
- * Called with the log slot lock held. Can be called recursively
- * from __wt_log_newfile when we change log files.
- */
-static int
-__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
-{
- WT_CONNECTION_IMPL *conn;
WT_LOG *log;
- int created_log;
conn = S2C(session);
log = conn->log;
- created_log = 1;
- /*
- * Called locked. Add recsize to alloc_lsn. Save our starting LSN
- * where the previous allocation finished for the release LSN.
- * That way when log files switch, we're waiting for the correct LSN
- * from outstanding writes.
- */
- slot->slot_release_lsn = log->alloc_lsn;
- if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
- WT_RET(__wt_log_newfile(session, 0, &created_log));
- if (log->log_close_fh != NULL)
- F_SET(slot, WT_SLOT_CLOSEFH);
- }
-
- /*
- * Checkpoints can be configured based on amount of log written.
- * Add in this log record to the sum and if needed, signal the
- * checkpoint condition. The logging subsystem manages the
- * accumulated field. There is a bit of layering violation
- * here checking the connection ckpt field and using its
- * condition.
- */
- if (WT_CKPT_LOGSIZE(conn)) {
- log->log_written += (wt_off_t)recsize;
- WT_RET(__wt_checkpoint_signal(session, log->log_written));
- }
-
- /*
- * Need to minimally fill in slot info here. Our slot start LSN
- * comes after any potential new log file creations.
- */
- slot->slot_start_lsn = log->alloc_lsn;
- slot->slot_start_offset = log->alloc_lsn.offset;
- /*
- * Pre-allocate on the first real write into the log file, if it
- * was just created (i.e. not pre-allocated).
- */
- if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log)
- WT_RET(__log_prealloc(session, log->log_fh));
-
- log->alloc_lsn.offset += (wt_off_t)recsize;
- slot->slot_end_lsn = log->alloc_lsn;
- slot->slot_error = 0;
- slot->slot_fh = log->log_fh;
- return (0);
+ return (lsn->offset == WT_LOG_FIRST_RECORD ||
+ lsn->offset + (wt_off_t)recsize < conn->log_file_max);
}
/*
@@ -490,24 +455,32 @@ __log_decrypt(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out)
*/
static int
__log_fill(WT_SESSION_IMPL *session,
- WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
+ WT_MYSLOT *myslot, int force, WT_ITEM *record, WT_LSN *lsnp)
{
WT_DECL_RET;
WT_LOG_RECORD *logrec;
+ /*
+ * The WT_LOG_SLOT_BUF_MAX macro uses log.
+ */
logrec = (WT_LOG_RECORD *)record->mem;
/*
- * Call __wt_write. For now the offset is the real byte offset. If the
- * offset becomes a unit of WT_LOG_ALIGN this is where we would multiply
- * by WT_LOG_ALIGN to get the real file byte offset for write().
+ * Call __wt_write or copy into the buffer. For now the offset is the
+ * real byte offset. If the offset becomes a unit of WT_LOG_ALIGN this
+ * is where we would multiply by WT_LOG_ALIGN to get the real file byte
+ * offset for write().
*/
- if (direct)
+ if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
+ memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
+ logrec, logrec->len);
+ else
+ /*
+ * If this is a force or unbuffered write, write it now.
+ * A forced write sends in a temporary, local slot.
+ */
WT_ERR(__wt_write(session, myslot->slot->slot_fh,
myslot->offset + myslot->slot->slot_start_offset,
(size_t)logrec->len, (void *)logrec));
- else
- memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
- logrec, logrec->len);
WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len);
if (lsnp != NULL) {
@@ -563,12 +536,12 @@ __log_file_header(
logrec->checksum = 0;
logrec->checksum = __wt_cksum(logrec, log->allocsize);
WT_CLEAR(tmp);
+ memset(&myslot, 0, sizeof(myslot));
myslot.slot = &tmp;
- myslot.offset = 0;
/*
- * We may recursively call __log_acquire to allocate log space for the
- * log descriptor record. Call __log_fill to write it, but we
+ * We may recursively call __wt_log_acquire to allocate log space for
+ * the log descriptor record. Call __log_fill to write it, but we
* do not need to call __log_release because we're not waiting for
* any earlier operations to complete.
*/
@@ -577,7 +550,7 @@ __log_file_header(
tmp.slot_fh = fh;
} else {
WT_ASSERT(session, fh == NULL);
- WT_ERR(__log_acquire(session, logrec->len, &tmp));
+ WT_ERR(__wt_log_acquire(session, logrec->len, &tmp));
}
WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));
/*
@@ -697,6 +670,146 @@ err: __wt_scr_free(session, &from_path);
}
/*
+ * __log_newfile --
+ * Create the next log file and write the file header record into it.
+ */
+static int
+__log_newfile(WT_SESSION_IMPL *session, int conn_open, int *created)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LSN end_lsn;
+ int create_log, yield_cnt;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ create_log = 1;
+ yield_cnt = 0;
+ /*
+ * Set aside the log file handle to be closed later. Other threads
+ * may still be using it to write to the log. If the log file size
+ * is small we could fill a log file before the previous one is closed.
+ * Wait for that to close.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ while (log->log_close_fh != NULL) {
+ WT_STAT_FAST_CONN_INCR(session, log_close_yields);
+ WT_RET(__wt_log_wrlsn(session));
+ if (++yield_cnt > 10000)
+ return (EBUSY);
+ __wt_yield();
+ }
+ log->log_close_fh = log->log_fh;
+ if (log->log_close_fh != NULL)
+ log->log_close_lsn = log->alloc_lsn;
+ log->fileid++;
+ /*
+ * Make sure everything we set above is visible.
+ */
+ WT_FULL_BARRIER();
+ /*
+ * If we're pre-allocating log files, look for one. If there aren't any
+ * or we're not pre-allocating, then create one.
+ */
+ if (conn->log_prealloc) {
+ ret = __log_alloc_prealloc(session, log->fileid);
+ /*
+ * If ret is 0 it means we found a pre-allocated file.
+ * If ret is non-zero but not WT_NOTFOUND, we return the error.
+ * If ret is WT_NOTFOUND, we leave create_log set and create
+ * the new log file.
+ */
+ if (ret == 0)
+ create_log = 0;
+ /*
+ * If we get any error other than WT_NOTFOUND, return it.
+ */
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+ ret = 0;
+ }
+ /*
+ * If we need to create the log file, do so now.
+ */
+ if (create_log) {
+ log->prep_missed++;
+ WT_RET(__wt_log_allocfile(
+ session, log->fileid, WT_LOG_FILENAME, 1));
+ }
+ WT_RET(__log_openfile(session,
+ 0, &log->log_fh, WT_LOG_FILENAME, log->fileid));
+ /*
+ * We need to setup the LSNs. Set the end LSN and alloc LSN to
+ * the end of the header.
+ */
+ log->alloc_lsn.file = log->fileid;
+ log->alloc_lsn.offset = WT_LOG_FIRST_RECORD;
+ end_lsn = log->alloc_lsn;
+
+ /*
+ * If we're called from connection creation code, we need to update
+ * the LSNs since we're the only write in progress.
+ */
+ if (conn_open) {
+ WT_RET(__wt_fsync(session, log->log_fh));
+ log->sync_lsn = end_lsn;
+ log->write_lsn = end_lsn;
+ log->write_start_lsn = end_lsn;
+ }
+ if (created != NULL)
+ *created = create_log;
+ return (0);
+}
+
+/*
+ * __wt_log_acquire --
+ * Called serially when switching slots. Can be called recursively
+ * from __log_newfile when we change log files.
+ */
+int
+__wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ int created_log;
+
+ conn = S2C(session);
+ log = conn->log;
+ created_log = 1;
+ /*
+ * Add recsize to alloc_lsn. Save our starting LSN
+ * where the previous allocation finished for the release LSN.
+ * That way when log files switch, we're waiting for the correct LSN
+ * from outstanding writes.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ /*
+ * We need to set the release LSN earlier, before a log file change.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
+ WT_RET(__log_newfile(session, 0, &created_log));
+ if (log->log_close_fh != NULL)
+ F_SET(slot, WT_SLOT_CLOSEFH);
+ }
+
+ /*
+ * Pre-allocate on the first real write into the log file, if it
+ * was just created (i.e. not pre-allocated).
+ */
+ if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log)
+ WT_RET(__log_prealloc(session, log->log_fh));
+ /*
+ * Initialize the slot for activation.
+ */
+ __wt_log_slot_activate(session, slot);
+
+ return (0);
+}
+
+/*
* __log_truncate --
* Truncate the log to the given LSN. If this_log is set, it will only
* truncate the log file indicated in the given LSN. If not set,
@@ -842,7 +955,7 @@ err: __wt_scr_free(session, &path);
* __wt_log_open --
* Open the appropriate log file for the connection. The purpose is
* to find the last log file that exists, open it and set our initial
- * LSNs to the end of that file. If none exist, call __wt_log_newfile
+ * LSNs to the end of that file. If none exist, call __log_newfile
* to create it.
*/
int
@@ -917,7 +1030,9 @@ __wt_log_open(WT_SESSION_IMPL *session)
* Start logging at the beginning of the next log file, no matter
* where the previous log file ends.
*/
- WT_ERR(__wt_log_newfile(session, 1, NULL));
+ WT_WITH_SLOT_LOCK(session, log,
+ ret = __log_newfile(session, 1, NULL));
+ WT_ERR(ret);
/* If we found log files, save the new state. */
if (logcount > 0) {
@@ -1065,38 +1180,57 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
WT_DECL_RET;
WT_LOG *log;
WT_LSN sync_lsn;
- size_t write_size;
- int locked, yield_count;
+ int locked, need_relock, yield_count;
+ int64_t release_buffered, release_bytes;
conn = S2C(session);
log = conn->log;
- locked = yield_count = 0;
- *freep = 1;
+ locked = need_relock = yield_count = 0;
+ if (freep != NULL)
+ *freep = 1;
+ release_buffered =
+ WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state);
+ release_bytes = release_buffered + slot->slot_unbuffered;
/* Write the buffered records */
- if (F_ISSET(slot, WT_SLOT_BUFFERED)) {
- write_size = (size_t)
- (slot->slot_end_lsn.offset - slot->slot_start_offset);
- WT_ERR(__wt_write(session, slot->slot_fh,
- slot->slot_start_offset, write_size, slot->slot_buf.mem));
+ /*
+ * Checkpoints can be configured based on amount of log written.
+ * Add in this log record to the sum and if needed, signal the
+ * checkpoint condition. The logging subsystem manages the
+ * accumulated field. There is a bit of layering violation
+ * here checking the connection ckpt field and using its
+ * condition.
+ */
+ if (WT_CKPT_LOGSIZE(conn)) {
+ log->log_written += (wt_off_t)release_bytes;
+ WT_RET(__wt_checkpoint_signal(session, log->log_written));
}
+ if (release_buffered != 0)
+ WT_ERR(__wt_write(session,
+ slot->slot_fh, slot->slot_start_offset,
+ (size_t)release_buffered, slot->slot_buf.mem));
+
/*
- * If this is not a buffered write, meaning the slot we have is a
- * dummy constructed slot, not from the slot pool, or we have to wait
- * for a synchronous operation, we do not pass handling of this slot
- * off to the worker thread. The caller is responsible for freeing
- * the slot in that case. Otherwise the worker thread will free it.
+ * If we have to wait for a synchronous operation, we do not pass
+ * handling of this slot off to the worker thread. The caller is
+ * responsible for freeing the slot in that case. Otherwise the
+ * worker thread will free it.
*/
- if (F_ISSET(slot, WT_SLOT_BUFFERED) &&
- !F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
- *freep = 0;
+ if (!F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
+ if (freep != NULL)
+ *freep = 0;
slot->slot_state = WT_LOG_SLOT_WRITTEN;
/*
* After this point the worker thread owns the slot. There
* is nothing more to do but return.
*/
- WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond));
+ /*
+ * !!! Signalling the wrlsn_cond condition here results in
+ * worse performance because it causes more scheduling churn
+ * and more walking of the slot pool for a very small number
+ * of slots to process. Don't signal here.
+ */
goto done;
}
@@ -1105,15 +1239,31 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
* be holes in the log file.
*/
WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn);
- while (WT_LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) {
+ while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) {
+ /*
+ * If we're on a locked path and the write LSN is not advancing,
+ * unlock in case an earlier thread is trying to switch its
+ * slot and complete its operation.
+ */
+ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) {
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ need_relock = 1;
+ }
if (++yield_count < 1000)
__wt_yield();
else
WT_ERR(__wt_cond_wait(
session, log->log_write_cond, 200));
+ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) {
+ __wt_spin_lock(session, &log->log_slot_lock);
+ need_relock = 0;
+ }
}
+
log->write_start_lsn = slot->slot_start_lsn;
log->write_lsn = slot->slot_end_lsn;
+
+ WT_ASSERT(session, slot != log->active_slot);
WT_ERR(__wt_cond_signal(session, log->log_write_cond));
/*
@@ -1168,7 +1318,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
* Sync the log file if needed.
*/
if (F_ISSET(slot, WT_SLOT_SYNC) &&
- WT_LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_release: sync log %s", log->log_fh->name));
WT_STAT_FAST_CONN_INCR(session, log_sync);
@@ -1186,6 +1336,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
}
err: if (locked)
__wt_spin_unlock(session, &log->log_sync_lock);
+ if (need_relock)
+ __wt_spin_lock(session, &log->log_slot_lock);
if (ret != 0 && slot->slot_error == 0)
slot->slot_error = ret;
done:
@@ -1193,93 +1345,6 @@ done:
}
/*
- * __wt_log_newfile --
- * Create the next log file and write the file header record into it.
- */
-int
-__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created)
-{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
- WT_LSN end_lsn;
- int create_log;
-
- conn = S2C(session);
- log = conn->log;
-
- create_log = 1;
- /*
- * Set aside the log file handle to be closed later. Other threads
- * may still be using it to write to the log. If the log file size
- * is small we could fill a log file before the previous one is closed.
- * Wait for that to close.
- */
- while (log->log_close_fh != NULL) {
- WT_STAT_FAST_CONN_INCR(session, log_close_yields);
- WT_RET(__wt_log_wrlsn(session, NULL, NULL));
- __wt_yield();
- }
- log->log_close_fh = log->log_fh;
- log->fileid++;
-
- /*
- * If we're pre-allocating log files, look for one. If there aren't any
- * or we're not pre-allocating, then create one.
- */
- ret = 0;
- if (conn->log_prealloc) {
- ret = __log_alloc_prealloc(session, log->fileid);
- /*
- * If ret is 0 it means we found a pre-allocated file.
- * If ret is non-zero but not WT_NOTFOUND, we return the error.
- * If ret is WT_NOTFOUND, we leave create_log set and create
- * the new log file.
- */
- if (ret == 0)
- create_log = 0;
- /*
- * If we get any error other than WT_NOTFOUND, return it.
- */
- if (ret != 0 && ret != WT_NOTFOUND)
- return (ret);
- ret = 0;
- }
- /*
- * If we need to create the log file, do so now.
- */
- if (create_log) {
- log->prep_missed++;
- if ((ret = __wt_log_allocfile(
- session, log->fileid, WT_LOG_FILENAME, 0)) != 0)
- return (ret);
- }
- WT_RET(__log_openfile(session,
- 0, &log->log_fh, WT_LOG_FILENAME, log->fileid));
- /*
- * We need to setup the LSNs. Set the end LSN and alloc LSN to
- * the end of the header.
- */
- log->alloc_lsn.file = log->fileid;
- log->alloc_lsn.offset = WT_LOG_FIRST_RECORD;
- end_lsn = log->alloc_lsn;
-
- /*
- * If we're called from connection creation code, we need to update
- * the LSNs since we're the only write in progress.
- */
- if (conn_create) {
- WT_RET(__wt_fsync(session, log->log_fh));
- log->sync_lsn = end_lsn;
- log->write_lsn = end_lsn;
- log->write_start_lsn = end_lsn;
- }
- if (created != NULL)
- *created = create_log;
- return (0);
-}
-
-/*
* __wt_log_scan --
* Scan the logs, calling a function on each record found.
*/
@@ -1535,7 +1600,7 @@ advance:
/* Truncate if we're in recovery. */
if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
- WT_LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+ __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0)
WT_ERR(__log_truncate(session,
&rd_lsn, WT_LOG_FILENAME, 0));
@@ -1559,42 +1624,54 @@ err: WT_STAT_FAST_CONN_INCR(session, log_scans);
}
/*
- * __log_direct_write --
- * Write a log record without using the consolidation arrays.
+ * __log_force_write_internal --
+ * Force a switch and release and write of the current slot.
+ * Must be called with the slot lock held.
*/
static int
-__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
- uint32_t flags)
+__log_force_write_internal(WT_SESSION_IMPL *session)
{
WT_DECL_RET;
WT_LOG *log;
- WT_LOGSLOT tmp;
- WT_MYSLOT myslot;
- int dummy, locked;
+ WT_LOGSLOT *slot;
+ int free_slot, release;
log = S2C(session)->log;
- myslot.slot = &tmp;
- myslot.offset = 0;
- dummy = 0;
- WT_CLEAR(tmp);
+ slot = log->active_slot;
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ /*
+ * If closing the slot returns WT_NOTFOUND, it means that someone else
+ * is processing the slot change: we're done. If we get EBUSY (or any
+ * other error), return that so the caller can decide what to do.
+ */
+ ret = __wt_log_slot_close(session, slot, &release, 1);
+ if (ret == WT_NOTFOUND)
+ return (0);
+ WT_RET(ret);
+ if (release) {
+ WT_RET(__log_release(session, slot, &free_slot));
+ if (free_slot)
+ __wt_log_slot_free(session, slot);
+ }
+ WT_RET(__wt_log_slot_new(session));
+ return (0);
+}
- /* Fast path the contended case. */
- if (__wt_spin_trylock(session, &log->log_slot_lock) != 0)
- return (EAGAIN);
- locked = 1;
-
- if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
- F_SET(&tmp, WT_SLOT_SYNC_DIR);
- if (LF_ISSET(WT_LOG_FSYNC))
- F_SET(&tmp, WT_SLOT_SYNC);
- WT_ERR(__log_acquire(session, record->size, &tmp));
- __wt_spin_unlock(session, &log->log_slot_lock);
- locked = 0;
- WT_ERR(__log_fill(session, &myslot, 1, record, lsnp));
- WT_ERR(__log_release(session, &tmp, &dummy));
+/*
+ * __wt_log_force_write --
+ * Force a switch and release and write of the current slot.
+ * Wrapper function that takes the lock.
+ */
+int
+__wt_log_force_write(WT_SESSION_IMPL *session, int retry)
+{
+ WT_DECL_RET;
+
+ do {
+ WT_WITH_SLOT_LOCK(session, S2C(session)->log,
+ ret = __log_force_write_internal(session));
+ } while (retry && ret == EBUSY);
-err: if (locked)
- __wt_spin_unlock(session, &log->log_slot_lock);
return (ret);
}
@@ -1741,14 +1818,16 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_LOG_RECORD *logrec;
WT_LSN lsn;
WT_MYSLOT myslot;
- uint32_t rdup_len;
- int free_slot, locked;
+ int64_t release_size;
+ uint32_t force, rdup_len;
+ int free_slot;
conn = S2C(session);
log = conn->log;
- free_slot = locked = 0;
+ free_slot = 0;
WT_INIT_LSN(&lsn);
myslot.slot = NULL;
+ memset(&myslot, 0, sizeof(myslot));
/*
* Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a
* header at the beginning for us to fill in.
@@ -1778,87 +1857,67 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_STAT_FAST_CONN_INCR(session, log_writes);
- if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
- ret = __log_direct_write(session, record, &lsn, flags);
- if (ret == 0 && lsnp != NULL)
- *lsnp = lsn;
- /*
- * All needed syncing will be handled directly except
- * a background sync. Handle that here.
- */
- if (ret == 0) {
- if (LF_ISSET(WT_LOG_BACKGROUND))
- goto bg;
- else
- return (0);
- }
- if (ret != EAGAIN)
- WT_ERR(ret);
- /*
- * An EAGAIN return means we failed to get the try lock -
- * fall through to the consolidation code in that case.
- */
- }
-
+ __wt_log_slot_join(session, rdup_len, flags, &myslot);
/*
- * As soon as we see contention for the log slot, disable direct
- * log writes. We get better performance by forcing writes through
- * the consolidation code. This is because individual writes flood
- * the I/O system faster than they contend on the log slot lock.
+ * If the addition of this record crosses the buffer boundary,
+ * switch in a new slot.
*/
- F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
- if ((ret = __wt_log_slot_join(
- session, rdup_len, flags, &myslot)) == ENOMEM) {
+ force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC);
+ ret = 0;
+ if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX ||
+ F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force)
+ ret = __wt_log_slot_switch(session, &myslot);
+ if (ret == 0)
+ ret = __log_fill(session, &myslot, 0, record, &lsn);
+ release_size = __wt_log_slot_release(
+ session, &myslot, (int64_t)rdup_len);
+ /*
+ * If we get an error we still need to do proper accounting in
+ * the slot fields.
+ * XXX On error we may still need to call release and free.
+ */
+ if (ret != 0)
+ myslot.slot->slot_error = ret;
+ WT_ASSERT(session, ret == 0);
+ if (WT_LOG_SLOT_DONE(release_size)) {
+ WT_ERR(__log_release(session, myslot.slot, &free_slot));
+ if (free_slot)
+ __wt_log_slot_free(session, myslot.slot);
+ } else if (force) {
/*
- * If we couldn't find a consolidated slot for this record
- * write the record directly.
+ * If we are going to wait for this slot to get written,
+ * signal the wrlsn thread.
+ *
+ * XXX I've seen times when conditions are NULL.
*/
- while ((ret = __log_direct_write(
- session, record, lsnp, flags)) == EAGAIN)
- ;
- WT_ERR(ret);
- return (0);
+ if (conn->log_cond != NULL) {
+ WT_ERR(__wt_cond_signal(session, conn->log_cond));
+ __wt_yield();
+ } else
+ WT_ERR(__wt_log_force_write(session, 1));
}
- WT_ERR(ret);
- if (myslot.offset == 0) {
- __wt_spin_lock(session, &log->log_slot_lock);
- locked = 1;
- WT_ERR(__wt_log_slot_close(session, myslot.slot));
- WT_ERR(__log_acquire(
- session, myslot.slot->slot_group_size, myslot.slot));
- __wt_spin_unlock(session, &log->log_slot_lock);
- locked = 0;
- WT_ERR(__wt_log_slot_notify(session, myslot.slot));
- } else
- WT_ERR(__wt_log_slot_wait(session, myslot.slot));
- WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
- if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
- WT_ERR(__log_release(session, myslot.slot, &free_slot));
- if (free_slot)
- WT_ERR(__wt_log_slot_free(session, myslot.slot));
+ if (LF_ISSET(WT_LOG_FLUSH)) {
+ /* Wait for our writes to reach the OS */
+ while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 &&
+ myslot.slot->slot_error == 0)
+ (void)__wt_cond_wait(
+ session, log->log_write_cond, 10000);
} else if (LF_ISSET(WT_LOG_FSYNC)) {
/* Wait for our writes to reach disk */
- while (WT_LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
+ while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 &&
myslot.slot->slot_error == 0)
(void)__wt_cond_wait(
session, log->log_sync_cond, 10000);
- } else if (LF_ISSET(WT_LOG_FLUSH)) {
- /* Wait for our writes to reach the OS */
- while (WT_LOG_CMP(&log->write_lsn, &lsn) <= 0 &&
- myslot.slot->slot_error == 0)
- (void)__wt_cond_wait(
- session, log->log_write_cond, 10000);
}
/*
* Advance the background sync LSN if needed.
*/
-bg: if (LF_ISSET(WT_LOG_BACKGROUND) &&
- WT_LOG_CMP(&session->bg_sync_lsn, &lsn) <= 0)
+ if (LF_ISSET(WT_LOG_BACKGROUND) &&
+ __wt_log_cmp(&session->bg_sync_lsn, &lsn) <= 0)
WT_ERR(__wt_log_background(session, &lsn));
-err: if (locked)
- __wt_spin_unlock(session, &log->log_slot_lock);
+err:
if (ret == 0 && lsnp != NULL)
*lsnp = lsn;
/*
diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c
index 07878d1ae1e..a1a68557f93 100644
--- a/src/third_party/wiredtiger/src/log/log_slot.c
+++ b/src/third_party/wiredtiger/src/log/log_slot.c
@@ -9,325 +9,493 @@
#include "wt_internal.h"
/*
- * This file implements the consolidated array algorithm as described in
- * the paper:
- * Scalability of write-ahead logging on multicore and multisocket hardware
- * by Ryan Johnson, Ippokratis Pandis, Radu Stoica, Manos Athanassoulis
- * and Anastasia Ailamaki.
- *
- * It appeared in The VLDB Journal, DOI 10.1007/s00778-011-0260-8 and can
- * be found at:
- * http://infoscience.epfl.ch/record/170505/files/aether-smpfulltext.pdf
+ * __wt_log_slot_activate --
+ * Initialize a slot to become active.
*/
-
-/*
- * __wt_log_slot_init --
- * Initialize the slot array.
- */
-int
-__wt_log_slot_init(WT_SESSION_IMPL *session)
+void
+__wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
WT_LOG *log;
- WT_LOGSLOT *slot;
- int32_t i;
conn = S2C(session);
log = conn->log;
- WT_CACHE_LINE_ALIGNMENT_VERIFY(session, log->slot_pool);
- for (i = 0; i < WT_SLOT_POOL; i++) {
- log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;
- log->slot_pool[i].slot_index = WT_SLOT_INVALID_INDEX;
- }
-
- /*
- * Set up the available slots from the pool the first time.
- */
- for (i = 0; i < WT_SLOT_ACTIVE; i++) {
- slot = &log->slot_pool[i];
- slot->slot_index = (uint32_t)i;
- slot->slot_state = WT_LOG_SLOT_READY;
- log->slot_array[i] = slot;
- }
-
- /*
- * Allocate memory for buffers now that the arrays are setup. Split
- * this out to make error handling simpler.
- *
- * Cap the slot buffer to the log file size.
- */
- log->slot_buf_size =
- WT_MIN((size_t)conn->log_file_max, WT_LOG_SLOT_BUF_SIZE);
- for (i = 0; i < WT_SLOT_POOL; i++) {
- WT_ERR(__wt_buf_init(session,
- &log->slot_pool[i].slot_buf, log->slot_buf_size));
- F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
- }
- WT_STAT_FAST_CONN_INCRV(session,
- log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
- if (0) {
-err: while (--i >= 0)
- __wt_buf_free(session, &log->slot_pool[i].slot_buf);
- }
- return (ret);
+ slot->slot_state = 0;
+ slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn;
+ slot->slot_start_offset = log->alloc_lsn.offset;
+ slot->slot_last_offset = log->alloc_lsn.offset;
+ slot->slot_fh = log->log_fh;
+ slot->slot_error = 0;
+ slot->slot_unbuffered = 0;
}
/*
- * __wt_log_slot_destroy --
- * Clean up the slot array on shutdown.
+ * __wt_log_slot_close --
+ * Close out the slot the caller is using. The slot may already be
+ * closed or freed by another thread.
*/
int
-__wt_log_slot_destroy(WT_SESSION_IMPL *session)
+__wt_log_slot_close(
+ WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *releasep, int forced)
{
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
- int i;
+ int64_t end_offset, new_state, old_state;
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
conn = S2C(session);
log = conn->log;
-
- for (i = 0; i < WT_SLOT_POOL; i++)
- __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ if (releasep != NULL)
+ *releasep = 0;
+ if (slot == NULL)
+ return (WT_NOTFOUND);
+retry:
+ old_state = slot->slot_state;
+ /*
+ * If this close is coming from a forced close and a thread is in
+ * the middle of using the slot, return EBUSY. The caller can
+ * decide if retrying is necessary or not.
+ */
+ if (forced && WT_LOG_SLOT_INPROGRESS(old_state))
+ return (EBUSY);
+ /*
+ * If someone else is switching out this slot we lost. Nothing to
+ * do but return. Return WT_NOTFOUND anytime the given slot was
+ * processed by another closing thread. Only return 0 when we
+ * actually closed the slot.
+ */
+ if (WT_LOG_SLOT_CLOSED(old_state))
+ return (WT_NOTFOUND);
+ /*
+ * If someone completely processed this slot, we're done.
+ */
+ if (FLD64_ISSET((uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED))
+ return (WT_NOTFOUND);
+ new_state = (old_state | WT_LOG_SLOT_CLOSE);
+ /*
+ * Close this slot. If we lose the race retry.
+ */
+ if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state))
+ goto retry;
+ /*
+ * We own the slot now. No one else can join.
+ * Set the end LSN.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
+ if (WT_LOG_SLOT_DONE(new_state) && releasep != NULL)
+ *releasep = 1;
+ slot->slot_end_lsn = slot->slot_start_lsn;
+ end_offset =
+ WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered;
+ slot->slot_end_lsn.offset += (wt_off_t)end_offset;
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_slot_consolidated, end_offset);
+ /*
+ * XXX Would like to change so one piece of code advances the LSN.
+ */
+ log->alloc_lsn = slot->slot_end_lsn;
+ WT_ASSERT(session, log->alloc_lsn.file >= log->write_lsn.file);
return (0);
}
/*
- * __wt_log_slot_join --
- * Join a consolidated logging slot. Callers should be prepared to deal
- * with a ENOMEM return - which indicates no slots could accommodate
- * the log record.
+ * __wt_log_slot_switch_internal --
+ * Switch out the current slot and set up a new one.
*/
int
-__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
- uint32_t flags, WT_MYSLOT *myslotp)
+__wt_log_slot_switch_internal(WT_SESSION_IMPL *session, WT_MYSLOT *myslot)
{
- WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_LOG *log;
- WT_LOGSLOT *slot;
- int64_t new_state, old_state;
- uint32_t allocated_slot, slot_attempts;
+ int release;
+#ifdef HAVE_DIAGNOSTIC
+ int64_t r, state;
+ int32_t j;
+#endif
- conn = S2C(session);
- log = conn->log;
- slot_attempts = 0;
+ log = S2C(session)->log;
+ release = 0;
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
- if (mysize >= (uint64_t)log->slot_buf_size) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_toobig);
- return (ENOMEM);
- }
-find_slot:
-#if WT_SLOT_ACTIVE == 1
- allocated_slot = 0;
-#else
- allocated_slot = __wt_random(&session->rnd) % WT_SLOT_ACTIVE;
-#endif
- /*
- * Get the selected slot. Use a barrier to prevent the compiler from
- * caching this read.
- */
- WT_BARRIER();
- slot = log->slot_array[allocated_slot];
-join_slot:
/*
- * Read the current slot state. Use a barrier to prevent the compiler
- * from caching this read.
+ * If someone else raced us to closing this specific slot, we're
+ * done here.
*/
- WT_BARRIER();
- old_state = slot->slot_state;
+ if (myslot->slot != log->active_slot)
+ return (0);
+
/*
- * WT_LOG_SLOT_READY and higher means the slot is available for
- * joining. Any other state means it is in use and transitioning
- * from the active array.
+ * If close returns WT_NOTFOUND, it means that someone else is
+ * processing the slot change. However, we could have retried
+ * from a busy time creating a new slot. If so, we are that
+ * someone else and we need to try setting up a new slot again.
*/
- if (old_state < WT_LOG_SLOT_READY) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_transitions);
- goto find_slot;
+ if (!F_ISSET(myslot, WT_MYSLOT_CLOSE)) {
+ ret = __wt_log_slot_close(session, myslot->slot, &release, 0);
+ if (ret == WT_NOTFOUND)
+ return (0);
}
+
/*
- * Add in our size to the state and then atomically swap that
- * into place if it is still the same value.
+ * Only mainline callers use switch. Our size should be in join
+ * and we have not yet released, so we should never think release
+ * should be done now.
*/
- new_state = old_state + (int64_t)mysize;
- if (new_state < old_state) {
- /* Our size doesn't fit here. */
- WT_STAT_FAST_CONN_INCR(session, log_slot_toobig);
- goto find_slot;
- }
+ WT_ASSERT(session, release == 0);
+ WT_ASSERT(session, ret == 0);
+
/*
- * If the slot buffer isn't big enough to hold this update, try
- * to find another slot.
+ * Set that we have closed this slot because we may call in here
+ * multiple times if we retry creating a new slot.
*/
- if (new_state > (int64_t)slot->slot_buf.memsize) {
- if (++slot_attempts > 5) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall);
- return (ENOMEM);
- }
- goto find_slot;
- }
+ F_SET(myslot, WT_MYSLOT_CLOSE);
+#ifdef HAVE_DIAGNOSTIC
+ state = myslot->slot->slot_state;
+ j = WT_LOG_SLOT_JOINED(state);
+ r = WT_LOG_SLOT_RELEASED(state);
+ WT_ASSERT(session, j > r);
+#endif
+ WT_RET(__wt_log_slot_new(session));
+ return (0);
+}
+
+/*
+ * __wt_log_slot_switch --
+ * Switch out the current slot and set up a new one.
+ */
+int
+__wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot)
+{
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ log = S2C(session)->log;
/*
- * We lost a race to add our size into this slot. Check the state
- * and try again.
+ * !!! Since the WT_WITH_SLOT_LOCK macro is a do-while loop, the
+ * compiler does not like it combined directly with the while loop
+ * here.
*/
- if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state)) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_races);
- goto join_slot;
+ WT_WITH_SLOT_LOCK(session, log,
+ ret = __wt_log_slot_switch_internal(session, myslot));
+ while (ret == EBUSY) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_switch_busy);
+ __wt_yield();
+ WT_WITH_SLOT_LOCK(session, log,
+ ret = __wt_log_slot_switch_internal(session, myslot));
}
- WT_ASSERT(session, myslotp != NULL);
- /*
- * We joined this slot. Fill in our information to return to
- * the caller.
- */
- WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
- if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
- F_SET(slot, WT_SLOT_SYNC_DIR);
- if (LF_ISSET(WT_LOG_FSYNC))
- F_SET(slot, WT_SLOT_SYNC);
- myslotp->slot = slot;
- myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY;
- return (0);
+ WT_ASSERT(session, ret == 0);
+ return (ret);
}
/*
- * __log_slot_find_free --
- * Find and return a free log slot.
+ * __wt_log_slot_new --
+ * Find a free slot and switch it as the new active slot.
+ * Must be called holding the slot lock.
*/
-static int
-__log_slot_find_free(WT_SESSION_IMPL *session, WT_LOGSLOT **slot)
+int
+__wt_log_slot_new(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
- uint32_t pool_i;
+ WT_LOGSLOT *slot;
+ int32_t i;
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
conn = S2C(session);
log = conn->log;
- WT_ASSERT(session, slot != NULL);
/*
- * Encourage processing and moving the write LSN forward.
- * That process has to walk the slots anyway, so do that
- * work and let it give us the index of a free slot along
- * the way.
+ * Although this function is single threaded, multiple threads could
+ * be trying to set a new active slot sequentially. If we find an
+ * active slot that is valid, return.
*/
- WT_RET(__wt_log_wrlsn(session, &pool_i, NULL));
- while (pool_i == WT_SLOT_POOL) {
+ if ((slot = log->active_slot) != NULL &&
+ WT_LOG_SLOT_OPEN(slot->slot_state))
+ return (0);
+
+ /*
+ * Keep trying until we can find a free slot.
+ */
+ for (;;) {
+ /*
+ * For now just restart at 0. We could use log->pool_index
+ * if that is inefficient.
+ */
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ if (slot->slot_state == WT_LOG_SLOT_FREE) {
+ /*
+ * Make sure that the next buffer size can
+ * fit in the file. Proactively switch if
+ * it cannot. This reduces, but does not
+ * eliminate, log files that exceed the
+ * maximum file size.
+ *
+ * We want to minimize the risk of an
+ * error due to no space.
+ */
+ WT_RET(__wt_log_acquire(session,
+ log->slot_buf_size, slot));
+ /*
+ * We have a new, free slot to use.
+ * Set it as the active slot.
+ */
+ WT_STAT_FAST_CONN_INCR(session,
+ log_slot_transitions);
+ log->active_slot = slot;
+ return (0);
+ }
+ }
+ /*
+ * If we didn't find any free slots signal the worker thread.
+ */
+ (void)__wt_cond_signal(session, conn->log_wrlsn_cond);
__wt_yield();
- WT_RET(__wt_log_wrlsn(session, &pool_i, NULL));
}
- *slot = &log->slot_pool[pool_i];
- WT_ASSERT(session, (*slot)->slot_state == WT_LOG_SLOT_FREE);
- return (0);
+ /* NOTREACHED */
}
/*
- * __wt_log_slot_close --
- * Close a slot and do not allow any other threads to join this slot.
- * Remove this from the active slot array and move a new slot from
- * the pool into its place. Set up the size of this group;
- * Must be called with the logging spinlock held.
+ * __wt_log_slot_init --
+ * Initialize the slot array.
*/
int
-__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+__wt_log_slot_init(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_LOG *log;
- WT_LOGSLOT *newslot;
- int64_t old_state;
+ WT_LOGSLOT *slot;
+ int32_t i;
conn = S2C(session);
log = conn->log;
- /*
- * Find an unused slot in the pool.
- */
- WT_RET(__log_slot_find_free(session, &newslot));
+ WT_CACHE_LINE_ALIGNMENT_VERIFY(session, log->slot_pool);
+ for (i = 0; i < WT_SLOT_POOL; i++)
+ log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;
/*
- * Swap out the slot we're going to use and put a free one in the
- * slot array in its place so that threads can use it right away.
+ * Allocate memory for buffers now that the arrays are setup. Split
+ * this out to make error handling simpler.
*/
- WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
- newslot->slot_state = WT_LOG_SLOT_READY;
- newslot->slot_index = slot->slot_index;
- log->slot_array[newslot->slot_index] = newslot;
- old_state =
- __wt_atomic_storeiv64(&slot->slot_state, WT_LOG_SLOT_PENDING);
- slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY);
/*
- * Note that this statistic may be much bigger than in reality,
- * especially when compared with the total bytes written in
- * __log_fill. The reason is that this size reflects any
- * rounding up that is needed and the total bytes in __log_fill
- * is the amount of user bytes.
+ * Cap the slot buffer to the log file size times two if needed.
+ * That means we try to fill to half the buffer but allow some
+ * extra space.
+ *
+ * !!! If the buffer size is too close to the log file size, we will
+ * switch log files very aggressively. Scale back the buffer for
+ * small log file sizes.
*/
+ log->slot_buf_size = (uint32_t)WT_MIN(
+ (size_t)conn->log_file_max/10, WT_LOG_SLOT_BUF_SIZE);
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ WT_ERR(__wt_buf_init(session,
+ &log->slot_pool[i].slot_buf, log->slot_buf_size));
+ F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
+ }
WT_STAT_FAST_CONN_INCRV(session,
- log_slot_consolidated, (uint64_t)slot->slot_group_size);
- return (0);
+ log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
+ /*
+ * Set up the available slot from the pool the first time.
+ */
+ slot = &log->slot_pool[0];
+ /*
+ * We cannot initialize the release LSN in the activate function
+ * because that is called after a log file switch.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ __wt_log_slot_activate(session, slot);
+ log->active_slot = slot;
+
+ if (0) {
+err: while (--i >= 0)
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ }
+ return (ret);
}
/*
- * __wt_log_slot_notify --
- * Notify all threads waiting for the state to be < WT_LOG_SLOT_DONE.
+ * __wt_log_slot_destroy --
+ * Clean up the slot array on shutdown.
*/
int
-__wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+__wt_log_slot_destroy(WT_SESSION_IMPL *session)
{
- WT_UNUSED(session);
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t rel;
+ int i;
+
+ conn = S2C(session);
+ log = conn->log;
- slot->slot_state =
- (int64_t)WT_LOG_SLOT_DONE - (int64_t)slot->slot_group_size;
+ /*
+ * Write out any remaining buffers. Free the buffer.
+ */
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ if (!FLD64_ISSET(
+ (uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) {
+ rel = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state);
+ if (rel != 0)
+ WT_RET(__wt_write(session, slot->slot_fh,
+ slot->slot_start_offset, (size_t)rel,
+ slot->slot_buf.mem));
+ }
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ }
return (0);
}
/*
- * __wt_log_slot_wait --
- * Wait for slot leader to allocate log area and tell us our log offset.
+ * __wt_log_slot_join --
+ * Join a consolidated logging slot. Must be called with
+ * the read lock held.
*/
-int
-__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+void
+__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
+ uint32_t flags, WT_MYSLOT *myslot)
{
- int yield_count;
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t flag_state, new_state, old_state, released;
+ int32_t join_offset, new_join;
+#ifdef HAVE_DIAGNOSTIC
+ int unbuf_force;
+#endif
- yield_count = 0;
- WT_UNUSED(session);
+ conn = S2C(session);
+ log = conn->log;
- while (slot->slot_state > WT_LOG_SLOT_DONE)
- if (++yield_count < 1000)
- __wt_yield();
- else
- __wt_sleep(0, 200);
- return (0);
+ /*
+ * Make sure the length cannot overflow. The caller should not
+ * even call this function if it doesn't fit but use direct
+ * writes.
+ */
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+
+ /*
+ * There should almost always be a slot open.
+ */
+#ifdef HAVE_DIAGNOSTIC
+ unbuf_force = ((++log->write_calls % 1000) == 0);
+#endif
+ for (;;) {
+ WT_BARRIER();
+ slot = log->active_slot;
+ old_state = slot->slot_state;
+ /*
+ * Try to join our size into the existing size and
+ * atomically write it back into the state.
+ */
+ flag_state = WT_LOG_SLOT_FLAGS(old_state);
+ released = WT_LOG_SLOT_RELEASED(old_state);
+ join_offset = WT_LOG_SLOT_JOINED(old_state);
+#ifdef HAVE_DIAGNOSTIC
+ if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) {
+#else
+ if (mysize > WT_LOG_SLOT_BUF_MAX) {
+#endif
+ new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
+ F_SET(myslot, WT_MYSLOT_UNBUFFERED);
+ myslot->slot = slot;
+ } else
+ new_join = join_offset + (int32_t)mysize;
+ new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
+ (int64_t)new_join, (int64_t)released, (int64_t)flag_state);
+
+ /*
+ * Check if the slot is open for joining and we are able to
+ * swap in our size into the state.
+ */
+ if (WT_LOG_SLOT_OPEN(old_state) &&
+ __wt_atomic_casiv64(
+ &slot->slot_state, old_state, new_state))
+ break;
+ /*
+ * The slot is no longer open or we lost the race to
+ * update it. Yield and try again.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_races);
+ __wt_yield();
+ }
+ /*
+ * We joined this slot. Fill in our information to return to
+ * the caller.
+ */
+ if (mysize != 0)
+ WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
+ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
+ F_SET(slot, WT_SLOT_SYNC_DIR);
+ if (LF_ISSET(WT_LOG_FSYNC))
+ F_SET(slot, WT_SLOT_SYNC);
+ if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) {
+ WT_ASSERT(session, slot->slot_unbuffered == 0);
+ WT_STAT_FAST_CONN_INCR(session, log_slot_unbuffered);
+ slot->slot_unbuffered = (int64_t)mysize;
+ }
+ myslot->slot = slot;
+ myslot->offset = join_offset;
+ myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize);
}
/*
* __wt_log_slot_release --
* Each thread in a consolidated group releases its portion to
- * signal it has completed writing its piece of the log.
+ * signal it has completed copying its piece of the log into
+ * the memory buffer.
*/
int64_t
-__wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size)
+__wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size)
{
+ WT_LOGSLOT *slot;
+ wt_off_t cur_offset, my_start;
+ int64_t my_size, rel_size;
+
+ WT_UNUSED(session);
+ slot = myslot->slot;
+ my_start = slot->slot_start_offset + myslot->offset;
+ while ((cur_offset = slot->slot_last_offset) < my_start) {
+ /*
+ * Set our offset if we are larger.
+ */
+ if (__wt_atomic_casiv64(
+ &slot->slot_last_offset, cur_offset, my_start))
+ break;
+ /*
+ * If we raced another thread updating this, try again.
+ */
+ WT_BARRIER();
+ }
/*
- * Add my size into the state. When it reaches WT_LOG_SLOT_DONE
- * all participatory threads have completed copying their piece.
+ * Add my size into the state and return the new size.
*/
- return (__wt_atomic_addiv64(&slot->slot_state, (int64_t)size));
+ rel_size = size;
+ if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
+ rel_size = WT_LOG_SLOT_UNBUFFERED;
+ my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0);
+ return (__wt_atomic_addiv64(&slot->slot_state, my_size));
}
/*
* __wt_log_slot_free --
* Free a slot back into the pool.
*/
-int
+void
__wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
- WT_UNUSED(session);
/*
* Make sure flags don't get retained between uses.
* We have to reset them them here because multiple threads may
* change the flags when joining the slot.
*/
+ WT_UNUSED(session);
slot->flags = WT_SLOT_INIT_FLAGS;
+ slot->slot_error = 0;
slot->slot_state = WT_LOG_SLOT_FREE;
- return (0);
}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index 674b9e6d3a8..6068bb6c559 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -134,7 +134,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
if (have_primary) {
WT_ENTER_PAGE_INDEX(session);
WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
- ovfl = __wt_btree_lsm_size(session, hard_limit ?
+ ovfl = __wt_btree_lsm_over_size(session, hard_limit ?
2 * lsm_tree->chunk_size : lsm_tree->chunk_size));
WT_LEAVE_PAGE_INDEX(session);
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
index 40991f845e4..01a61359949 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_merge.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
@@ -512,7 +512,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
* Don't block if the cache is full: our next unit of work may be to
* discard some trees to free space.
*/
- F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+ F_SET(session, WT_SESSION_NO_EVICTION);
if (create_bloom) {
if (ret == 0)
@@ -632,6 +632,6 @@ err: if (locked)
"Merge failed with %s",
__wt_strerror(session, ret, NULL, 0)));
}
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
index f34f0598261..46db76e099c 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -1144,7 +1144,7 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Diagnostic: avoid deadlocks with the schema lock: if we need it for
* an operation, we should already have it.
*/
- F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
return (0);
}
@@ -1157,7 +1157,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
WT_DECL_RET;
- F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0)
WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
@@ -1177,7 +1177,7 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Diagnostic: avoid deadlocks with the schema lock: if we need it for
* an operation, we should already have it.
*/
- F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
return (0);
}
@@ -1190,7 +1190,7 @@ __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
WT_DECL_RET;
- F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
if ((ret = __wt_writeunlock(session, lsm_tree->rwlock)) != 0)
WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
index 0c36c68e9f5..8eba0127b8b 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -301,17 +301,19 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
* Flush the file before checkpointing: this is the expensive part in
* terms of I/O.
*
- * Use the special eviction isolation level to avoid interfering with
- * an application checkpoint: we have already checked that all of the
- * updates in this chunk are globally visible.
- *
- * !!! We can wait here for checkpoints and fsyncs to complete, which
- * can be a long time.
+ * !!!
+ * We can wait here for checkpoints and fsyncs to complete, which can
+ * take a long time.
*/
if ((ret = __wt_session_get_btree(
session, chunk->uri, NULL, NULL, 0)) == 0) {
+ /*
+ * Set read-uncommitted: we have already checked that all of the
+ * updates in this chunk are globally visible, use the cheapest
+ * possible check in reconciliation.
+ */
saved_isolation = session->txn.isolation;
- session->txn.isolation = WT_ISO_EVICTION;
+ session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
session->txn.isolation = saved_isolation;
WT_TRET(__wt_session_release_btree(session));
@@ -412,7 +414,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
* ourselves to get stuck creating bloom filters, the entire tree
* can stall since there may be no worker threads available to flush.
*/
- F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
WT_ERR(src->get_key(src, &key));
WT_ERR(__wt_bloom_insert(bloom, &key));
@@ -446,7 +448,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
err: if (bloom != NULL)
WT_TRET(__wt_bloom_close(bloom));
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/meta/meta_apply.c b/src/third_party/wiredtiger/src/meta/meta_apply.c
index 6d08ce3aa6a..315621f2ae9 100644
--- a/src/third_party/wiredtiger/src/meta/meta_apply.c
+++ b/src/third_party/wiredtiger/src/meta/meta_apply.c
@@ -32,7 +32,7 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session,
WT_ERR(cursor->get_key(cursor, &uri));
if (!WT_PREFIX_MATCH(uri, "file:"))
break;
- else if (strcmp(uri, WT_METAFILE_URI) == 0)
+ if (strcmp(uri, WT_METAFILE_URI) == 0)
continue;
/*
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
index 227d0fa9a6c..8255f004dab 100644
--- a/src/third_party/wiredtiger/src/meta/meta_table.c
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -12,22 +12,22 @@
* __metadata_turtle --
* Return if a key's value should be taken from the turtle file.
*/
-static int
+static bool
__metadata_turtle(const char *key)
{
switch (key[0]) {
case 'f':
if (strcmp(key, WT_METAFILE_URI) == 0)
- return (1);
+ return (true);
break;
case 'W':
if (strcmp(key, "WiredTiger version") == 0)
- return (1);
+ return (true);
if (strcmp(key, "WiredTiger version string") == 0)
- return (1);
+ return (true);
break;
}
- return (0);
+ return (false);
}
/*
@@ -37,6 +37,8 @@ __metadata_turtle(const char *key)
int
__wt_metadata_open(WT_SESSION_IMPL *session)
{
+ WT_BTREE *btree;
+
if (session->meta_dhandle != NULL)
return (0);
@@ -45,7 +47,24 @@ __wt_metadata_open(WT_SESSION_IMPL *session)
session->meta_dhandle = session->dhandle;
WT_ASSERT(session, session->meta_dhandle != NULL);
- /* The meta_dhandle doesn't need to stay locked -- release it. */
+ /*
+ * Set special flags for the metadata file: eviction (the metadata file
+ * is in-memory and never evicted), logging (the metadata file is always
+ * logged if possible).
+ *
+ * Test flags before setting them so updates can't race in subsequent
+ * opens (the first update is safe because it's single-threaded from
+ * wiredtiger_open).
+ */
+ btree = S2BT(session);
+ if (!F_ISSET(btree, WT_BTREE_IN_MEMORY))
+ F_SET(btree, WT_BTREE_IN_MEMORY);
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ F_SET(btree, WT_BTREE_NO_EVICTION);
+ if (F_ISSET(btree, WT_BTREE_NO_LOGGING))
+ F_CLR(btree, WT_BTREE_NO_LOGGING);
+
+ /* The metadata handle doesn't need to stay locked -- release it. */
return (__wt_session_release_btree(session));
}
@@ -59,9 +78,9 @@ __wt_metadata_cursor(
{
WT_DATA_HANDLE *saved_dhandle;
WT_DECL_RET;
+ int is_dead;
const char *cfg[] =
{ WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL };
- int is_dead;
saved_dhandle = session->dhandle;
WT_ERR(__wt_metadata_open(session));
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
index baf9b475777..7946b4ab0cc 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
@@ -41,11 +41,13 @@ err: __wt_free(session, cond);
}
/*
- * __wt_cond_wait --
- * Wait on a mutex, optionally timing out.
+ * __wt_cond_wait_signal --
+ *	Wait on a mutex, optionally timing out. If the condition is
+ *	signalled before the timeout expires, let the caller know.
*/
int
-__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+__wt_cond_wait_signal(
+ WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled)
{
struct timespec ts;
WT_DECL_RET;
@@ -54,6 +56,7 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
locked = 0;
/* Fast path if already signalled. */
+ *signalled = 1;
if (__wt_atomic_addi32(&cond->waiters, 1) == 0)
return (0);
@@ -88,8 +91,10 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
#ifdef ETIME
ret == ETIME ||
#endif
- ret == ETIMEDOUT)
+ ret == ETIMEDOUT) {
+ *signalled = 0;
ret = 0;
+ }
(void)__wt_atomic_subi32(&cond->waiters, 1);
diff --git a/src/third_party/wiredtiger/src/os_posix/os_open.c b/src/third_party/wiredtiger/src/os_posix/os_open.c
index 8622bb5b4ca..ef4662aa369 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_open.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_open.c
@@ -213,6 +213,8 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
fh = *fhp;
*fhp = NULL;
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: close", fh->name));
+
__wt_spin_lock(session, &conn->fh_lock);
if (fh == NULL || fh->ref == 0 || --fh->ref > 0) {
__wt_spin_unlock(session, &conn->fh_lock);
diff --git a/src/third_party/wiredtiger/src/os_posix/os_path.c b/src/third_party/wiredtiger/src/os_posix/os_path.c
index 07b14b55b44..af28e1b3b56 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_path.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_path.c
@@ -12,10 +12,10 @@
* __wt_absolute_path --
* Return if a filename is an absolute path.
*/
-int
+bool
__wt_absolute_path(const char *path)
{
- return (path[0] == '/' ? 1 : 0);
+ return (path[0] == '/');
}
/*
diff --git a/src/third_party/wiredtiger/src/os_win/os_errno.c b/src/third_party/wiredtiger/src/os_win/os_errno.c
index 097c73b5731..a9d3d521052 100644
--- a/src/third_party/wiredtiger/src/os_win/os_errno.c
+++ b/src/third_party/wiredtiger/src/os_win/os_errno.c
@@ -22,7 +22,7 @@ __wt_map_error_to_windows_error(int error) {
Also validate he do not get any COM errors
(which are negative integers)
*/
- WT_ASSERT(NULL, error > 0 && error > -(windows_error_offset));
+ WT_ASSERT(NULL, error < 0);
return (error + -(windows_error_offset));
}
@@ -96,7 +96,7 @@ __wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen)
snprintf(errbuf, errlen, "%s", buf) > 0)
return (errbuf);
if (lasterror != 0 && session != NULL &&
- __wt_buf_set(session, &session->err, buf, strlen(buf)) == 0)
+ __wt_buf_fmt(session, &session->err, "%s", buf) == 0)
return (session->err.data);
}
diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
index 565928cb863..14ca5d61282 100644
--- a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
+++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
@@ -37,13 +37,15 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
}
/*
- * __wt_cond_wait --
- * Wait on a mutex, optionally timing out.
+ * __wt_cond_wait_signal --
+ *	Wait on a mutex, optionally timing out. If the condition is
+ *	signalled before the timeout expires, let the caller know.
*/
int
-__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+__wt_cond_wait_signal(
+ WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled)
{
- DWORD milliseconds;
+ DWORD err, milliseconds;
WT_DECL_RET;
uint64_t milliseconds64;
int locked;
@@ -51,6 +53,7 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
locked = 0;
/* Fast path if already signalled. */
+ *signalled = 1;
if (__wt_atomic_addi32(&cond->waiters, 1) == 0)
return (0);
@@ -91,17 +94,25 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
ret = SleepConditionVariableCS(
&cond->cond, &cond->mtx, INFINITE);
+ /*
+ * SleepConditionVariableCS returns non-zero on success, 0 on timeout
+ * or failure. Check for timeout, else convert to a WiredTiger error
+ * value and fail.
+ */
if (ret == 0) {
- if (GetLastError() == ERROR_TIMEOUT) {
- ret = 1;
- }
- }
+ if ((err = GetLastError()) == ERROR_TIMEOUT)
+ *signalled = 0;
+ else
+ ret = __wt_errno();
+ } else
+ ret = 0;
(void)__wt_atomic_subi32(&cond->waiters, 1);
if (locked)
LeaveCriticalSection(&cond->mtx);
- if (ret != 0)
+
+ if (ret == 0)
return (0);
WT_RET_MSG(session, ret, "SleepConditionVariableCS");
}
diff --git a/src/third_party/wiredtiger/src/os_win/os_path.c b/src/third_party/wiredtiger/src/os_win/os_path.c
index 89f05e238c4..9d001e50571 100644
--- a/src/third_party/wiredtiger/src/os_win/os_path.c
+++ b/src/third_party/wiredtiger/src/os_win/os_path.c
@@ -12,7 +12,7 @@
* __wt_absolute_path --
* Return if a filename is an absolute path.
*/
-int
+bool
__wt_absolute_path(const char *path)
{
/*
@@ -21,7 +21,7 @@ __wt_absolute_path(const char *path)
*/
if (strlen(path) >= 3 && isalpha(path[0]) && path[1] == ':')
path += 2;
- return (path[0] == '/' || path[0] == '\\' ? 1 : 0);
+ return (path[0] == '/' || path[0] == '\\');
}
/*
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 6b0ca54065e..10daa8b717c 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -27,18 +27,30 @@ typedef struct {
WT_ITEM dsk; /* Temporary disk-image buffer */
- /* Track whether all changes to the page are written. */
+ /*
+ * Track start/stop write generation to decide if all changes to the
+ * page are written.
+ */
+ uint32_t orig_write_gen;
+
+ /*
+ * Track start/stop checkpoint generations to decide if lookaside table
+ * records are correct.
+ */
+ uint64_t orig_btree_checkpoint_gen;
+ uint64_t orig_txn_checkpoint_gen;
+
+ /*
+ * Track maximum transaction ID seen and first unwritten transaction ID.
+ */
uint64_t max_txn;
uint64_t first_dirty_txn;
- uint32_t orig_write_gen;
/*
- * If page updates are skipped because they are as yet unresolved, or
- * the page has updates we cannot discard, the page is left "dirty":
- * the page cannot be discarded and a subsequent reconciliation will
- * be necessary to discard the page.
+ * When we can't mark the page clean (for example, checkpoint found some
+ * uncommitted updates), there's a leave-dirty flag.
*/
- int leave_dirty;
+ int leave_dirty;
/*
* Raw compression (don't get me started, as if normal reconciliation
@@ -153,18 +165,12 @@ typedef struct {
void *dsk; /* Split's disk image */
/*
- * When busy pages get large, we need to be able to evict them
- * even when they contain unresolved updates, or updates which
- * cannot be evicted because of running transactions. In such
- * cases, break the page into multiple blocks, write the blocks
- * that can be evicted, saving lists of updates for blocks that
- * cannot be evicted, then re-instantiate the blocks that cannot
- * be evicted as new, in-memory pages, restoring the updates on
- * those pages.
+ * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and
+ * WT_EVICT_LOOKASIDE configurations.
*/
- WT_UPD_SKIPPED *skip; /* Skipped updates */
- uint32_t skip_next;
- size_t skip_allocated;
+ WT_SAVE_UPD *supd; /* Saved updates */
+ uint32_t supd_next;
+ size_t supd_allocated;
/*
* The key for a row-store page; no column-store key is needed
@@ -220,12 +226,14 @@ typedef struct {
size_t space_avail; /* Remaining space in this chunk */
/*
- * While reviewing updates for each page, we store skipped updates here,
- * and then move them to per-block areas as the blocks are defined.
+ * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and
+ * WT_EVICT_LOOKASIDE configurations. While reviewing updates for each
+ * page, we save WT_UPDATE lists here, and then move them to per-block
+ * areas as the blocks are defined.
*/
- WT_UPD_SKIPPED *skip; /* Skipped updates */
- uint32_t skip_next;
- size_t skip_allocated;
+ WT_SAVE_UPD *supd; /* Saved updates */
+ uint32_t supd_next;
+ size_t supd_allocated;
/*
* We don't need to keep the 0th key around on internal pages, the
@@ -277,6 +285,9 @@ typedef struct {
WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */
+ int cache_write_lookaside; /* Used the lookaside table */
+ int cache_write_restore; /* Used update/restoration */
+
uint32_t tested_ref_state; /* Debugging information */
} WT_RECONCILE;
@@ -318,8 +329,11 @@ static int __rec_split_row_promote(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
static int __rec_split_write(WT_SESSION_IMPL *,
WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int);
+static int __rec_update_las(
+ WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *);
static int __rec_write_init(WT_SESSION_IMPL *,
WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
+static int __rec_write_status(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup_err(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
@@ -338,31 +352,19 @@ int
__wt_reconcile(WT_SESSION_IMPL *session,
WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags)
{
- WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_RECONCILE *r;
- int page_lock, scan_lock, split_lock;
- conn = S2C(session);
page = ref->page;
mod = page->modify;
- page_lock = scan_lock = split_lock = 0;
-
- /* We're shouldn't get called with a clean page, that's an error. */
- if (!__wt_page_is_modified(page))
- WT_RET_MSG(session, WT_ERROR,
- "Attempt to reconcile a clean page.");
WT_RET(__wt_verbose(session,
WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type)));
- WT_STAT_FAST_CONN_INCR(session, rec_pages);
- WT_STAT_FAST_DATA_INCR(session, rec_pages);
- if (LF_ISSET(WT_EVICTING)) {
- WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
- WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
- }
+
+ /* We shouldn't get called with a clean page, that's an error. */
+ WT_ASSERT(session, __wt_page_is_modified(page));
#ifdef HAVE_DIAGNOSTIC
{
@@ -386,39 +388,15 @@ __wt_reconcile(WT_SESSION_IMPL *session,
r = session->reconcile;
/*
- * The compaction process looks at the page's modification information;
- * if compaction is running, acquire the page's lock.
- */
- if (conn->compact_in_memory_pass) {
- WT_PAGE_LOCK(session, page);
- page_lock = 1;
- }
-
- /*
- * Reconciliation reads the lists of updates, so obsolete updates cannot
- * be discarded while reconciliation is in progress.
- */
- for (;;) {
- F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
- if (ret == 0)
- break;
- __wt_yield();
- }
- scan_lock = 1;
-
- /*
- * Mark internal pages as splitting to ensure we don't deadlock when
- * performing an in-memory split during a checkpoint.
+ * Reconciliation locks the page for three reasons:
+ * Reconciliation reads the lists of page updates, obsolete updates
+ * cannot be discarded while reconciliation is in progress;
+ * The compaction process reads page modification information, which
+ * reconciliation modifies;
+ * In-memory splits: reconciliation of an internal page cannot handle
+ * a child page splitting during the reconciliation.
*/
- if (WT_PAGE_IS_INTERNAL(page)) {
- for (;;) {
- F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret);
- if (ret == 0)
- break;
- __wt_yield();
- }
- split_lock = 1;
- }
+ F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
/* Reconcile the page. */
switch (page->type) {
@@ -445,19 +423,34 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_ILLEGAL_VALUE_SET(session);
}
+ /* Get the final status for the reconciliation. */
+ if (ret == 0)
+ ret = __rec_write_status(session, r, page);
+
/* Wrap up the page reconciliation. */
if (ret == 0)
ret = __rec_write_wrapup(session, r, page);
else
WT_TRET(__rec_write_wrapup_err(session, r, page));
- /* Release the locks we're holding. */
- if (split_lock)
- F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED);
- if (scan_lock)
- F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
- if (page_lock)
- WT_PAGE_UNLOCK(session, page);
+ /* Release the reconciliation lock. */
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+
+ /* Update statistics. */
+ WT_STAT_FAST_CONN_INCR(session, rec_pages);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages);
+ if (LF_ISSET(WT_EVICTING)) {
+ WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
+ }
+ if (r->cache_write_lookaside) {
+ WT_STAT_FAST_CONN_INCR(session, cache_write_lookaside);
+ WT_STAT_FAST_DATA_INCR(session, cache_write_lookaside);
+ }
+ if (r->cache_write_restore) {
+ WT_STAT_FAST_CONN_INCR(session, cache_write_restore);
+ WT_STAT_FAST_DATA_INCR(session, cache_write_restore);
+ }
/*
* Clean up the boundary structures: some workloads result in millions
@@ -489,6 +482,125 @@ __wt_reconcile(WT_SESSION_IMPL *session,
}
/*
+ * __rec_las_checkpoint_test --
+ * Return if the lookaside table is going to collide with a checkpoint.
+ */
+static inline bool
+__rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_BTREE *btree;
+
+ conn = S2C(session);
+ btree = S2BT(session);
+
+ /*
+ * Running checkpoints can collide with the lookaside table because
+ * reconciliation using the lookaside table writes the key's last
+ * committed value, which might not be the value checkpoint would write.
+ * If reconciliation was configured for lookaside table eviction, this
+ * file participates in checkpoints, and any of the tree or system
+ * transactional generation numbers don't match, there's a possible
+ * collision.
+ *
+ * It's a complicated test, but the alternative is to have checkpoint
+ * drain lookaside table reconciliations, and this isn't a problem for
+ * most workloads.
+ */
+ if (!F_ISSET(r, WT_EVICT_LOOKASIDE))
+ return (false);
+ if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ return (false);
+ if (r->orig_btree_checkpoint_gen == btree->checkpoint_gen &&
+ r->orig_txn_checkpoint_gen == conn->txn_global.checkpoint_gen &&
+ r->orig_btree_checkpoint_gen == r->orig_txn_checkpoint_gen)
+ return (false);
+ return (true);
+}
+
+/*
+ * __rec_write_status --
+ * Return the final status for reconciliation.
+ */
+static int
+__rec_write_status(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_PAGE_MODIFY *mod;
+
+ btree = S2BT(session);
+ mod = page->modify;
+
+ /* Check for a lookaside table and checkpoint collision. */
+ if (__rec_las_checkpoint_test(session, r))
+ return (EBUSY);
+
+ /*
+ * Set the page's status based on whether or not we cleaned the page.
+ */
+ if (r->leave_dirty) {
+ /*
+ * Update the page's first unwritten transaction ID.
+ */
+ mod->first_dirty_txn = r->first_dirty_txn;
+
+ /*
+ * The page remains dirty.
+ *
+ * Any checkpoint call cleared the tree's modified flag before
+ * writing pages, so we must explicitly reset it. We insert a
+ * barrier after the change for clarity (the requirement is the
+ * flag be set before a subsequent checkpoint reads it, and
+ * as the current checkpoint is waiting on this reconciliation
+ * to complete, there's no risk of that happening).
+ */
+ btree->modified = 1;
+ WT_FULL_BARRIER();
+
+ /*
+ * Eviction should only be here if following the save/restore
+ * eviction path.
+ */
+ WT_ASSERT(session,
+ !F_ISSET(r, WT_EVICTING) ||
+ F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
+ } else {
+ /*
+ * Track the page's maximum transaction ID (used to decide if
+ * we're likely to be able to evict this page in the future).
+ */
+ mod->rec_max_txn = r->max_txn;
+
+ /*
+ * Track the tree's maximum transaction ID (used to decide if
+ * it's safe to discard the tree). Reconciliation for eviction
+ * is multi-threaded, only update the tree's maximum transaction
+ * ID when doing a checkpoint. That's sufficient, we only care
+ * about the maximum transaction ID of current updates in the
+ * tree, and checkpoint visits every dirty page in the tree.
+ */
+ if (!F_ISSET(r, WT_EVICTING) &&
+ WT_TXNID_LT(btree->rec_max_txn, r->max_txn))
+ btree->rec_max_txn = r->max_txn;
+
+ /*
+ * The page only might be clean; if the write generation is
+ * unchanged since reconciliation started, it's clean.
+ *
+ * If the write generation changed, the page has been written
+ * since reconciliation started and remains dirty (that can't
+ * happen when evicting, the page is exclusively locked).
+ */
+ if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0))
+ __wt_cache_dirty_decr(session, page);
+ else
+ WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+ }
+
+ return (0);
+}
+
+/*
* __rec_root_write --
* Handle the write of a root page.
*/
@@ -577,7 +689,7 @@ err: __wt_page_out(session, &next);
* __rec_raw_compression_config --
* Configure raw compression.
*/
-static inline int
+static inline bool
__rec_raw_compression_config(
WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
{
@@ -588,11 +700,11 @@ __rec_raw_compression_config(
/* Check if raw compression configured. */
if (btree->compressor == NULL ||
btree->compressor->compress_raw == NULL)
- return (0);
+ return (false);
/* Only for row-store and variable-length column-store objects. */
if (page->type == WT_PAGE_COL_FIX)
- return (0);
+ return (false);
/*
* Raw compression cannot support dictionary compression. (Technically,
@@ -602,11 +714,11 @@ __rec_raw_compression_config(
* that seems an unlikely use case.)
*/
if (btree->dictionary != 0)
- return (0);
+ return (false);
/* Raw compression cannot support prefix compression. */
if (btree->prefix_compression != 0)
- return (0);
+ return (false);
/*
* Raw compression is also turned off during salvage: we can't allow
@@ -614,9 +726,9 @@ __rec_raw_compression_config(
* can't manipulate the page size.
*/
if (salvage != NULL)
- return (0);
+ return (false);
- return (1);
+ return (true);
}
/*
@@ -628,10 +740,12 @@ __rec_write_init(WT_SESSION_IMPL *session,
WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep)
{
WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
WT_PAGE *page;
WT_RECONCILE *r;
btree = S2BT(session);
+ conn = S2C(session);
page = ref->page;
if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
@@ -648,9 +762,59 @@ __rec_write_init(WT_SESSION_IMPL *session,
F_SET(&r->dsk, WT_ITEM_ALIGNED);
}
+ /* Reconciliation is not re-entrant, make sure that doesn't happen. */
+ WT_ASSERT(session, r->ref == NULL);
+
/* Remember the configuration. */
r->ref = ref;
r->page = page;
+
+ /*
+ * Save the page's write generation before reading the page.
+ * Save the transaction generations before reading the page.
+ * These are all ordered reads, but we only need one.
+ */
+ r->orig_btree_checkpoint_gen = btree->checkpoint_gen;
+ r->orig_txn_checkpoint_gen = conn->txn_global.checkpoint_gen;
+ WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Lookaside table eviction is configured when eviction gets aggressive,
+ * adjust the flags for cases we don't support.
+ */
+ if (LF_ISSET(WT_EVICT_LOOKASIDE)) {
+ /*
+ * Saving lookaside table updates into the lookaside table won't
+ * work.
+ */
+ if (F_ISSET(btree, WT_BTREE_LOOKASIDE))
+ LF_CLR(WT_EVICT_LOOKASIDE);
+
+ /*
+ * We don't yet support fixed-length column-store combined with
+ * the lookaside table. It's not hard to do, but the underlying
+ * function that reviews which updates can be written to the
+ * evicted page and which updates need to be written to the
+ * lookaside table needs access to the original value from the
+ * page being evicted, and there's no code path for that in the
+ * case of fixed-length column-store objects. (Row-store and
+ * variable-width column-store objects provide a reference to
+ * the unpacked on-page cell for this purpose, but there isn't
+ * an on-page cell for fixed-length column-store objects.) For
+ * now, turn it off.
+ */
+ if (page->type == WT_PAGE_COL_FIX)
+ LF_CLR(WT_EVICT_LOOKASIDE);
+
+ /*
+ * Check for a lookaside table and checkpoint collision, and if
+ * we find one, turn off the lookaside file (we've gone to all
+ * the effort of getting exclusive access to the page, might as
+ * well try and evict it).
+ */
+ if (__rec_las_checkpoint_test(session, r))
+ LF_CLR(WT_EVICT_LOOKASIDE);
+ }
r->flags = flags;
/* Track if the page can be marked clean. */
@@ -668,8 +832,8 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->all_empty_value = 1;
r->any_empty_value = 0;
- /* The list of cached, skipped updates. */
- r->skip_next = 0;
+ /* The list of saved updates. */
+ r->supd_next = 0;
/*
* Dictionary compression only writes repeated values once. We grow
@@ -714,14 +878,11 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->salvage = salvage;
- /* Save the page's write generation before reading the page. */
- WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
-
/*
* Running transactions may update the page after we write it, so
* this is the highest ID we can be confident we will see.
*/
- r->first_dirty_txn = S2C(session)->txn_global.last_running;
+ r->first_dirty_txn = conn->txn_global.last_running;
return (0);
}
@@ -748,7 +909,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
__rec_bnd_cleanup(session, r, 1);
- __wt_free(session, r->skip);
+ __wt_free(session, r->supd);
__wt_buf_free(session, &r->k.buf);
__wt_buf_free(session, &r->v.buf);
@@ -784,6 +945,9 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
if (r->bnd == NULL)
return;
+ /* Reconciliation is not re-entrant, make sure that doesn't happen. */
+ r->ref = NULL;
+
/*
* Free the boundary structures' memory. In the case of normal cleanup,
* discard any memory we won't reuse in the next reconciliation; in the
@@ -799,7 +963,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->dsk);
- __wt_free(session, bnd->skip);
+ __wt_free(session, bnd->supd);
__wt_buf_free(session, &bnd->key);
}
__wt_free(session, r->bnd);
@@ -820,66 +984,84 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->dsk);
- __wt_free(session, bnd->skip);
+ __wt_free(session, bnd->supd);
}
}
}
/*
- * __rec_skip_update_save --
- * Save a skipped WT_UPDATE list for later restoration.
+ * __rec_block_free --
+ * Helper function to free a block.
*/
static int
-__rec_skip_update_save(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip)
+__rec_block_free(
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ return (bm->free(bm, session, addr, addr_size));
+}
+
+/*
+ * __rec_update_save --
+ * Save a WT_UPDATE list for later restoration.
+ */
+static int
+__rec_update_save(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, uint64_t txnid)
{
WT_RET(__wt_realloc_def(
- session, &r->skip_allocated, r->skip_next + 1, &r->skip));
- r->skip[r->skip_next].ins = ins;
- r->skip[r->skip_next].rip = rip;
- ++r->skip_next;
+ session, &r->supd_allocated, r->supd_next + 1, &r->supd));
+ r->supd[r->supd_next].ins = ins;
+ r->supd[r->supd_next].rip = rip;
+ r->supd[r->supd_next].onpage_txn = txnid;
+ ++r->supd_next;
return (0);
}
/*
- * __rec_skip_update_move --
- * Move a skipped WT_UPDATE list from the per-page cache to a specific
+ * __rec_update_move --
+ * Move a saved WT_UPDATE list from the per-page cache to a specific
* block's list.
*/
static int
-__rec_skip_update_move(
- WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip)
+__rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd)
{
WT_RET(__wt_realloc_def(
- session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip));
- bnd->skip[bnd->skip_next] = *skip;
- ++bnd->skip_next;
+ session, &bnd->supd_allocated, bnd->supd_next + 1, &bnd->supd));
+ bnd->supd[bnd->supd_next] = *supd;
+ ++bnd->supd_next;
- skip->ins = NULL;
- skip->rip = NULL;
+ supd->ins = NULL;
+ supd->rip = NULL;
return (0);
}
/*
* __rec_txn_read --
- * Return the first visible update in a list (or NULL if none are visible),
- * set a flag if any updates were skipped, track the maximum transaction ID on
- * the page.
+ * Return the update in a list that should be written (or NULL if none can
+ * be written).
*/
-static inline int
+static int
__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
+ WT_BTREE *btree;
WT_DECL_RET;
- WT_ITEM ovfl;
+ WT_DECL_ITEM(tmp);
WT_PAGE *page;
- WT_UPDATE *upd, *upd_list, *upd_ovfl;
+ WT_UPDATE *append, *upd, *upd_list;
size_t notused;
uint64_t max_txn, min_txn, txnid;
- int skipped;
+ int append_origv, skipped;
*updp = NULL;
+ btree = S2BT(session);
page = r->page;
/*
@@ -893,13 +1075,16 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
} else
upd_list = ins->upd;
- skipped = 0;
- for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list;
- upd != NULL; upd = upd->next) {
+ for (skipped = 0,
+ max_txn = WT_TXN_NONE, min_txn = UINT64_MAX,
+ upd = upd_list; upd != NULL; upd = upd->next) {
if ((txnid = upd->txnid) == WT_TXN_ABORTED)
continue;
- /* Track the largest/smallest transaction IDs on the list. */
+ /*
+ * Track the largest/smallest transaction IDs on the list and
+ * the smallest not-globally-visible transaction on the page.
+ */
if (WT_TXNID_LT(max_txn, txnid))
max_txn = txnid;
if (WT_TXNID_LT(txnid, min_txn))
@@ -909,132 +1094,231 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
r->first_dirty_txn = txnid;
/*
- * Record whether any updates were skipped on the way to finding
- * the first visible update.
- *
- * If updates were skipped before the one being written, future
- * reads without intervening modifications to the page could
- * see a different value; if no updates were skipped, the page
- * can safely be marked clean and does not need to be
- * reconciled until modified again.
+ * Find the first update we can use.
*/
- if (*updp == NULL) {
- if (__wt_txn_visible(session, txnid))
- *updp = upd;
- else
+ if (F_ISSET(r, WT_EVICTING)) {
+ /*
+ * Eviction can write any committed update.
+ *
+ * When reconciling for eviction, track whether any
+ * uncommitted updates are found.
+ */
+ if (__wt_txn_committed(session, txnid)) {
+ if (*updp == NULL)
+ *updp = upd;
+ } else
skipped = 1;
+ } else {
+ /*
+ * Checkpoint can only write updates visible as of its
+ * snapshot.
+ *
+ * When reconciling for a checkpoint, track whether any
+ * updates were skipped on the way to finding the first
+ * visible update.
+ */
+ if (*updp == NULL) {
+ if (__wt_txn_visible(session, txnid))
+ *updp = upd;
+ else
+ skipped = 1;
+ }
}
}
/*
+ * If all of the updates were aborted, quit. This test is not strictly
+ * necessary because the above loop exits with skipped not set and the
+ * maximum transaction left at its initial value of WT_TXN_NONE, so
+ * the test below will be branch true and return, but it's cheap and a
+ * little more explicit, and makes Coverity happy.
+ */
+ if (max_txn == WT_TXN_NONE)
+ return (0);
+
+ /*
* Track the maximum transaction ID in the page. We store this in the
- * page at the end of reconciliation if no updates are skipped, it's
- * used to avoid evicting clean pages from memory with changes required
- * to satisfy a snapshot read.
+ * tree at the end of reconciliation in the service of checkpoints, it
+ * is used to avoid discarding trees from memory when they have changes
+ * required to satisfy a snapshot read.
*/
if (WT_TXNID_LT(r->max_txn, max_txn))
r->max_txn = max_txn;
/*
- * If no updates were skipped and all updates are globally visible, the
- * page can be marked clean and we're done, regardless of whether we're
- * evicting or checkpointing.
+ * If there are no skipped updates and all updates are globally visible,
+ * the page can be marked clean and we're done, regardless of whether
+ * we're evicting or checkpointing.
*
* We have to check both: the oldest transaction ID may have moved while
- * we were scanning the update list, so it is possible to skip an update
- * but then find that by the end of the scan, all updates are stable.
+ * we were scanning the update list, so it is possible to find a skipped
+ * update, but then find all updates are stable at the end of the scan.
+ *
+ * Skip the visibility check for the lookaside table as a special-case,
+ * we know there are no older readers of that table.
*/
- if (!skipped && __wt_txn_visible_all(session, max_txn))
+ if (!skipped &&
+ (F_ISSET(btree, WT_BTREE_LOOKASIDE) ||
+ __wt_txn_visible_all(session, max_txn)))
return (0);
/*
- * If some updates are not globally visible, or were skipped, the page
- * cannot be marked clean.
+ * In some cases, there had better not be skipped updates or updates not
+ * yet globally visible.
*/
- r->leave_dirty = 1;
-
- /* If we're not evicting, we're done, we know what we'll write. */
- if (!F_ISSET(r, WT_EVICTING))
- return (0);
-
- /* In some cases, there had better not be any updates we can't write. */
- if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ if (F_ISSET(r, WT_VISIBILITY_ERR))
WT_PANIC_RET(session, EINVAL,
- "reconciliation illegally skipped an update");
+ "reconciliation error, uncommitted update or update not "
+ "globally visible");
/*
- * If evicting and we aren't able to save/restore the not-yet-visible
- * updates, the page can't be evicted.
+ * If not trying to evict the page, we know what we'll write and we're
+ * done. Because some updates were skipped or are not globally visible,
+ * the page can't be marked clean.
*/
- if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE))
- return (EBUSY);
+ if (!F_ISSET(r, WT_EVICTING)) {
+ r->leave_dirty = 1;
+ return (0);
+ }
/*
- * Evicting a page with not-yet-visible updates: save and restore the
- * list of updates on a newly instantiated page.
- *
- * The order of the updates on the list matters so we can't move only
- * the unresolved updates, we have to move the entire update list.
+ * Evicting with either uncommitted changes or not-yet-globally-visible
+ * changes. There are two ways to continue, the save/restore eviction
+ * path or the lookaside table eviction path. Both cannot be configured
+ * because the paths track different information. The save/restore path
+ * can handle both uncommitted and not-yet-globally-visible changes, by
+ * evicting most of the page and then creating a new, smaller page into
+ * which we re-instantiate those changes. The lookaside table path can
+ * only handle not-yet-globally-visible changes by writing those changes
+ * into the lookaside table and restoring them on demand if and when the
+ * page is read back into memory.
*
- * Clear the returned update so our caller ignores the key/value pair
- * in the case of an insert/append entry (everything we need is in the
- * update list), and otherwise writes the original on-page key/value
- * pair to which the update list applies.
+ * Both paths are configured outside of reconciliation: the save/restore
+ * path is the WT_EVICT_UPDATE_RESTORE flag, the lookaside table path is
+ * the WT_EVICT_LOOKASIDE flag.
*/
- *updp = NULL;
+ if (!F_ISSET(r, WT_EVICT_LOOKASIDE | WT_EVICT_UPDATE_RESTORE))
+ return (EBUSY);
+ if (skipped && !F_ISSET(r, WT_EVICT_UPDATE_RESTORE))
+ return (EBUSY);
+
+ append_origv = 0;
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) {
+ /*
+ * The save/restore eviction path.
+ *
+ * Clear the returned update so our caller ignores the key/value
+ * pair in the case of an insert/append list entry (everything
+ * we need is in the update list), and otherwise writes the
+ * original on-page key/value pair to which the update list
+ * applies.
+ */
+ *updp = NULL;
+
+ /* The page can't be marked clean. */
+ r->leave_dirty = 1;
+
+ /*
+ * A special-case for overflow values, where we can't write the
+ * original on-page value item to disk because it's been updated
+ * or removed.
+ *
+ * What happens is that an overflow value is updated or removed
+ * and its backing blocks freed. If any reader in the system
+ * might still want the value, a copy was cached in the page
+ * reconciliation tracking memory, and the page cell set to
+ * WT_CELL_VALUE_OVFL_RM. Eviction then chose the page and
+ * we're splitting it up in order to push parts of it out of
+ * memory.
+ *
+ * We could write the original on-page value item to disk... if
+ * we had a copy. The cache may not have a copy (a globally
+ * visible update would have kept a value from being cached), or
+ * an update that subsequently became globally visible could
+ * cause a cached value to be discarded. Either way, once there
+ * is a globally visible update, we may not have the original
+ * value.
+ *
+ * Fortunately, if there's a globally visible update we don't
+ * care about the original version, so we simply ignore it, no
+ * transaction can ever try and read it. If there isn't a
+ * globally visible update, there had better be a cached value.
+ *
+ * In the latter case, we could write the value out to disk, but
+ * (1) we are planning on re-instantiating this page in memory,
+ * it isn't going to disk, and (2) the value item is eventually
+ * going to be discarded, that seems like a waste of a write.
+ * Instead, find the cached value and append it to the update
+ * list we're saving for later restoration.
+ */
+ if (vpack != NULL &&
+ vpack->raw == WT_CELL_VALUE_OVFL_RM &&
+ !__wt_txn_visible_all(session, min_txn))
+ append_origv = 1;
+ } else {
+ /*
+ * The lookaside table eviction path.
+ *
+ * If at least one update is globally visible, copy the update
+ * list and ignore the current on-page value. If no update is
+ * globally visible, readers require the page's original value.
+ */
+ if (!__wt_txn_visible_all(session, min_txn))
+ append_origv = 1;
+ }
/*
- * Handle the case were we don't want to write an original on-page value
- * item to disk because it's been updated or removed.
- *
- * Here's the deal: an overflow value was updated or removed and its
- * backing blocks freed. If any transaction in the system might still
- * read the value, a copy was cached in page reconciliation tracking
- * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction
- * then chose the page and we're splitting it up in order to push parts
- * of it out of memory.
- *
- * We could write the original on-page value item to disk... if we had
- * a copy. The cache may not have a copy (a globally visible update
- * would have kept a value from ever being cached), or an update that
- * subsequent became globally visible could cause a cached value to be
- * discarded. Either way, once there's a globally visible update, we
- * may not have the value.
- *
- * Fortunately, if there's a globally visible update we don't care about
- * the original version, so we simply ignore it, no transaction can ever
- * try and read it. If there isn't a globally visible update, there had
- * better be a cached value.
- *
- * In the latter case, we could write the value out to disk, but (1) we
- * are planning on re-instantiating this page in memory, it isn't going
- * to disk, and (2) the value item is eventually going to be discarded,
- * that seems like a waste of a write. Instead, find the cached value
- * and append it to the update list we're saving for later restoration.
- */
- if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM &&
- !__wt_txn_visible_all(session, min_txn)) {
- if ((ret = __wt_ovfl_txnc_search(
- page, vpack->data, vpack->size, &ovfl)) != 0)
- WT_PANIC_RET(session, ret,
- "cached overflow item discarded early");
+ * We need the original on-page value for some reason: get a copy and
+ * append it to the end of the update list with a transaction ID that
+ * guarantees its visibility.
+ */
+ if (append_origv) {
+ /*
+ * If we don't have a value cell, it's an insert/append list
+ * key/value pair which simply doesn't exist for some reader;
+ * place a deleted record at the end of the update list.
+ */
+ if (vpack == NULL || vpack->type == WT_CELL_DEL)
+ WT_RET(__wt_update_alloc(
+ session, NULL, &append, &notused));
+ else {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ if ((ret = __wt_page_cell_data_ref(
+ session, page, vpack, tmp)) == 0)
+ ret = __wt_update_alloc(
+ session, tmp, &append, &notused);
+ __wt_scr_free(session, &tmp);
+ WT_RET(ret);
+ }
/*
- * Create an update structure with an impossibly low transaction
- * ID and append it to the update list we're about to save.
- * Restoring that update list when this page is re-instantiated
- * creates an update for the key/value pair visible to every
- * running transaction in the system, ensuring the on-page value
- * will be ignored.
+ * Give the entry an impossibly low transaction ID to ensure its
+ * global visibility, append it to the update list.
+ *
+ * Note the change to the actual reader-accessible update list:
+ * from now on, the original on-page value appears at the end
+ * of the update list, even if this reconciliation subsequently
+ * fails.
*/
- WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, &notused));
- upd_ovfl->txnid = WT_TXN_NONE;
+ append->txnid = WT_TXN_NONE;
for (upd = upd_list; upd->next != NULL; upd = upd->next)
;
- upd->next = upd_ovfl;
+ upd->next = append;
}
- return (__rec_skip_update_save(session, r, ins, rip));
+ /*
+ * The order of the updates on the list matters, we can't move only the
+ * unresolved updates, move the entire update list.
+ *
+ * If we skipped updates, the transaction value is never used. If we
+ * didn't skip updates, the list of updates are eventually written to
+ * the lookaside table, and associated with each update record is the
+ * transaction ID of the update we wrote in the reconciled page; once
+ * that transaction ID is globally visible, we know we no longer need
+ * the lookaside table records, allowing them to be discarded.
+ */
+ return (__rec_update_save(session,
+ r, ins, rip, (*updp == NULL) ? WT_TXN_NONE : (*updp)->txnid));
}
/*
@@ -1155,10 +1439,10 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* If called during checkpoint, acquire a hazard pointer
* so the child isn't evicted, it's an in-memory case.
*
- * This call cannot return split/restart, dirty page
- * eviction is shutout during checkpoint, all splits in
- * process will have completed before we walk any pages
- * for checkpoint.
+ * This call cannot return split/restart, eviction of
+ * pages that split into their parent is shutout during
+ * checkpoint, all splits in process will have completed
+ * before we walk any pages for checkpoint.
*/
ret = __wt_page_in(session, ref,
WT_READ_CACHE | WT_READ_NO_EVICT |
@@ -1215,7 +1499,7 @@ in_memory:
* reason to write the cell.
*/
mod = ref->page->modify;
- if (mod != NULL && mod->flags != 0)
+ if (mod != NULL && F_ISSET(mod, WT_PM_REC_MASK))
*statep = WT_CHILD_MODIFIED;
else if (ref->addr == NULL) {
*statep = WT_CHILD_IGNORE;
@@ -1234,37 +1518,32 @@ static int
__rec_child_deleted(
WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep)
{
- WT_BM *bm;
WT_PAGE_DELETED *page_del;
size_t addr_size;
const uint8_t *addr;
- bm = S2BT(session)->bm;
page_del = ref->page_del;
/*
* Internal pages with child leaf pages in the WT_REF_DELETED state are
* a special case during reconciliation. First, if the deletion was a
* result of a session truncate call, the deletion may not be visible to
- * us. In that case, we proceed as with any change that's not visible
- * during reconciliation by setting the skipped flag and ignoring the
- * change for the purposes of writing the internal page.
+ * us. In that case, we proceed as with any change not visible during
+ * reconciliation by ignoring the change for the purposes of writing the
+ * internal page.
*
* In this case, there must be an associated page-deleted structure, and
* it holds the transaction ID we care about.
+ *
+ * In some cases, there had better not be any updates we can't see.
*/
- if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) {
- /*
- * In some cases, there had better not be any updates we can't
- * write.
- */
- if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
- WT_PANIC_RET(session, EINVAL,
- "reconciliation illegally skipped an update");
- }
+ if (F_ISSET(r, WT_VISIBILITY_ERR) &&
+ page_del != NULL && !__wt_txn_visible(session, page_del->txnid))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
/*
- * The deletion is visible to us, deal with any underlying disk blocks.
+ * Deal with any underlying disk blocks.
*
* First, check to see if there is an address associated with this leaf:
* if there isn't, we're done, the underlying page is already gone. If
@@ -1291,7 +1570,7 @@ __rec_child_deleted(
(page_del == NULL ||
__wt_txn_visible_all(session, page_del->txnid))) {
WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
- WT_RET(bm->free(bm, session, addr, addr_size));
+ WT_RET(__rec_block_free(session, addr, addr_size));
if (__wt_off_page(ref->home, ref->addr)) {
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
@@ -1562,7 +1841,7 @@ static void
__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
{
bnd->offset = 0;
- bnd->recno = 0;
+ bnd->recno = WT_RECNO_OOB;
bnd->entries = 0;
__wt_free(session, bnd->addr.addr);
@@ -1571,9 +1850,9 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
bnd->cksum = 0;
__wt_free(session, bnd->dsk);
- __wt_free(session, bnd->skip);
- bnd->skip_next = 0;
- bnd->skip_allocated = 0;
+ __wt_free(session, bnd->supd);
+ bnd->supd_next = 0;
+ bnd->supd_allocated = 0;
/*
* Don't touch the key, we re-use that memory in each new
@@ -1775,9 +2054,13 @@ __rec_split_init(WT_SESSION_IMPL *session,
* __rec_is_checkpoint --
* Return if we're writing a checkpoint.
*/
-static int
-__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
+static bool
+__rec_is_checkpoint(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_BOUNDARY *bnd)
{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
/*
* Check to see if we're going to create a checkpoint.
*
@@ -1792,13 +2075,14 @@ __rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
* we don't do checkpoint writes here; clear the boundary information as
* a reminder and create the checkpoint during wrapup.
*/
- if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) {
+ if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT) &&
+ bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) {
bnd->addr.addr = NULL;
bnd->addr.size = 0;
bnd->addr.type = 0;
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
/*
@@ -1841,7 +2125,7 @@ __rec_split_row_promote(
WT_DECL_ITEM(update);
WT_DECL_RET;
WT_ITEM *max;
- WT_UPD_SKIPPED *skip;
+ WT_SAVE_UPD *supd;
size_t cnt, len, size;
uint32_t i;
const uint8_t *pa, *pb;
@@ -1892,36 +2176,37 @@ __rec_split_row_promote(
* the last key and smaller than the current key.
*/
max = r->last;
- for (i = r->skip_next; i > 0; --i) {
- skip = &r->skip[i - 1];
- if (skip->ins == NULL)
- WT_ERR(__wt_row_leaf_key(
- session, r->page, skip->rip, update, 0));
- else {
- update->data = WT_INSERT_KEY(skip->ins);
- update->size = WT_INSERT_KEY_SIZE(skip->ins);
- }
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE))
+ for (i = r->supd_next; i > 0; --i) {
+ supd = &r->supd[i - 1];
+ if (supd->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, r->page, supd->rip, update, 0));
+ else {
+ update->data = WT_INSERT_KEY(supd->ins);
+ update->size = WT_INSERT_KEY_SIZE(supd->ins);
+ }
- /* Compare against the current key, it must be less. */
- WT_ERR(__wt_compare(
- session, btree->collator, update, r->cur, &cmp));
- if (cmp >= 0)
- continue;
+ /* Compare against the current key, it must be less. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->cur, &cmp));
+ if (cmp >= 0)
+ continue;
- /* Compare against the last key, it must be greater. */
- WT_ERR(__wt_compare(
- session, btree->collator, update, r->last, &cmp));
- if (cmp >= 0)
- max = update;
+ /* Compare against the last key, it must be greater. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->last, &cmp));
+ if (cmp >= 0)
+ max = update;
- /*
- * The skipped updates are in key-sort order so the entry we're
- * looking for is either the last one or the next-to-last one
- * in the list. Once we've compared an entry against the last
- * key on the page, we're done.
- */
- break;
- }
+ /*
+ * The saved updates are in key-sort order so the entry
+ * we're looking for is either the last or the next-to-
+ * last one in the list. Once we've compared an entry
+ * against the last key on the page, we're done.
+ */
+ break;
+ }
/*
* The largest key on the last block must sort before the current key,
@@ -2228,7 +2513,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
* We track the record number at each column-store split point, set an
* initial value.
*/
- recno = 0;
+ recno = WT_RECNO_OOB;
if (dsk->type == WT_PAGE_COL_VAR)
recno = last->recno;
@@ -2326,10 +2611,8 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
WT_RET(compressor->pre_size(compressor, wt_session,
(uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
(size_t)r->raw_offsets[slots], &result_len));
- extra_skip = 0;
- if (btree->kencryptor != NULL)
- extra_skip = btree->kencryptor->size_const +
- WT_ENCRYPT_LEN_SIZE;
+ extra_skip = btree->kencryptor == NULL ? 0 :
+ btree->kencryptor->size_const + WT_ENCRYPT_LEN_SIZE;
corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP;
WT_RET(bm->write_size(bm, session, &corrected_page_size));
@@ -2477,7 +2760,7 @@ no_slots:
break;
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- next->recno = 0;
+ next->recno = WT_RECNO_OOB;
if (!last_block) {
/*
* Confirm there was uncompressed data remaining
@@ -2530,7 +2813,8 @@ no_slots:
*
* If it's not a checkpoint, write the block.
*/
- if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) {
+ if (r->bnd_next == 1 &&
+ last_block && __rec_is_checkpoint(session, r, last)) {
if (write_ref == dst)
WT_RET(__wt_buf_set(
session, &r->dsk, dst->mem, dst->size));
@@ -2647,13 +2931,29 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * We only arrive here with no entries to write if the page was entirely
- * empty, and if the page is empty, we merge it into its parent during
- * the parent's reconciliation. A page with skipped updates isn't truly
- * empty, continue on.
+ * We may arrive here with no entries to write if the page was entirely
+ * empty or if nothing on the page was visible to us.
*/
- if (r->entries == 0 && r->skip_next == 0)
- return (0);
+ if (r->entries == 0) {
+ /*
+ * Pages with skipped or not-yet-globally visible updates aren't
+ * really empty; otherwise, the page is truly empty and we will
+ * merge it into its parent during the parent's reconciliation.
+ */
+ if (r->supd_next == 0)
+ return (0);
+
+ /*
+ * If using the save/restore eviction path, continue with the
+ * write, the page will be restored after we finish.
+ *
+ * If using the lookaside table eviction path, we can't continue
+ * (we need a page to be written, otherwise we won't ever find
+ * the updates for future reads).
+ */
+ if (F_ISSET(r, WT_EVICT_LOOKASIDE))
+ return (EBUSY);
+ }
/* Set the boundary reference and increment the count. */
bnd = &r->bnd[r->bnd_next++];
@@ -2666,9 +2966,8 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
/* If this is a checkpoint, we're done, otherwise write the page. */
- return (
- __rec_is_checkpoint(r, bnd) ? 0 :
- __rec_split_write(session, r, bnd, &r->dsk, 1));
+ return (__rec_is_checkpoint(session, r, bnd) ?
+ 0 : __rec_split_write(session, r, bnd, &r->dsk, 1));
}
/*
@@ -2794,7 +3093,7 @@ __rec_split_write(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_PAGE_HEADER *dsk;
WT_PAGE_MODIFY *mod;
- WT_UPD_SKIPPED *skip;
+ WT_SAVE_UPD *supd;
size_t addr_size;
uint32_t bnd_slot, i, j;
int cmp;
@@ -2837,23 +3136,23 @@ __rec_split_write(WT_SESSION_IMPL *session,
bnd->cksum = 0;
/*
- * Check if we've skipped updates that belong to this block, and move
- * any to the per-block structure. Quit as soon as we find a skipped
+ * Check if we've saved updates that belong to this block, and move
+ * any to the per-block structure. Quit as soon as we find a saved
* update that doesn't belong to the block, they're in sorted order.
*
* This code requires a key be filled in for the next block (or the
* last block flag be set, if there's no next block).
*/
- for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) {
- /* The last block gets all remaining skipped updates. */
+ for (i = 0, supd = r->supd; i < r->supd_next; ++i, ++supd) {
+ /* The last block gets all remaining saved updates. */
if (last_block) {
- WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ WT_ERR(__rec_update_move(session, bnd, supd));
continue;
}
/*
- * Get the skipped update's key and compare it with this block's
- * key range. If the skipped update list belongs with the block
+ * Get the saved update's key and compare it with this block's
+ * key range. If the saved update list belongs with the block
* we're about to write, move it to the per-block memory. Check
* only to the first update that doesn't go with the block, they
* must be in sorted order.
@@ -2861,43 +3160,56 @@ __rec_split_write(WT_SESSION_IMPL *session,
switch (page->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
- if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno)
- goto skip_check_complete;
+ if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno)
+ goto supd_check_complete;
break;
case WT_PAGE_ROW_LEAF:
- if (skip->ins == NULL)
+ if (supd->ins == NULL)
WT_ERR(__wt_row_leaf_key(
- session, page, skip->rip, key, 0));
+ session, page, supd->rip, key, 0));
else {
- key->data = WT_INSERT_KEY(skip->ins);
- key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ key->data = WT_INSERT_KEY(supd->ins);
+ key->size = WT_INSERT_KEY_SIZE(supd->ins);
}
WT_ERR(__wt_compare(session,
btree->collator, key, &(bnd + 1)->key, &cmp));
if (cmp >= 0)
- goto skip_check_complete;
+ goto supd_check_complete;
break;
WT_ILLEGAL_VALUE_ERR(session);
}
- WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ WT_ERR(__rec_update_move(session, bnd, supd));
}
-skip_check_complete:
+supd_check_complete:
/*
* If there are updates that weren't moved to the block, shuffle them to
- * the beginning of the cached list (we maintain the skipped updates in
- * sorted order, new skipped updates must be appended to the list).
+ * the beginning of the cached list (we maintain the saved updates in
+ * sorted order, new saved updates must be appended to the list).
+ */
+ for (j = 0; i < r->supd_next; ++j, ++i)
+ r->supd[j] = r->supd[i];
+ r->supd_next = j;
+
+ /*
+ * If using the lookaside table eviction path and we found updates that
+ * weren't globally visible when reconciling this page, note that in the
+ * page header.
*/
- for (j = 0; i < r->skip_next; ++j, ++i)
- r->skip[j] = r->skip[i];
- r->skip_next = j;
+ if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL) {
+ F_SET(dsk, WT_PAGE_LAS_UPDATE);
+ r->cache_write_lookaside = 1;
+ }
/*
- * If we had to skip updates in order to build this disk image, we can't
- * actually write it. Instead, we will re-instantiate the page using the
- * disk image and the list of updates we skipped.
+ * If using the save/restore eviction path and we had to skip updates in
+ * order to build this disk image, we can't actually write it. Instead,
+ * we will re-instantiate the page using the disk image and the list of
+ * updates we skipped.
*/
- if (bnd->skip != NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
+ r->cache_write_restore = 1;
+
/*
* If the buffer is compressed (raw compression was configured),
* we have to decompress it so we can instantiate it later. It's
@@ -2963,12 +3275,148 @@ skip_check_complete:
WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr));
bnd->addr.size = (uint8_t)addr_size;
+ /*
+ * If using the lookaside table eviction path and we found updates that
+ * weren't globally visible when reconciling this page, copy them into
+ * the database's lookaside store.
+ */
+ if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL)
+ ret = __rec_update_las(session, r, btree->id, bnd);
+
done:
err: __wt_scr_free(session, &key);
return (ret);
}
/*
+ * __rec_update_las --
+ * Copy a set of updates into the database's lookaside buffer.
+ */
+static int
+__rec_update_las(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, uint32_t btree_id, WT_BOUNDARY *bnd)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_ITEM las_addr, las_value;
+ WT_PAGE *page;
+ WT_SAVE_UPD *list;
+ WT_UPDATE *upd;
+ uint64_t las_counter;
+ uint32_t i, session_flags, slot;
+ uint8_t *p;
+
+ cursor = NULL;
+ WT_CLEAR(las_addr);
+ WT_CLEAR(las_value);
+ page = r->page;
+
+ /*
+ * We're writing lookaside records: start instantiating them on pages
+ * we read (with the right flag set), and start sweeping the file.
+ */
+ __wt_las_set_written(session);
+
+ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+
+ /* Ensure enough room for a column-store key without checking. */
+ WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
+
+ /*
+ * Each key in the lookaside table is associated with a block, and those
+ * blocks are freed and reallocated to other pages as pages in the tree
+ * are modified and reconciled. We want to be sure we don't add records
+ * to the lookaside table, then discard the block to which they apply,
+ * then write a new block to the same address, and then apply the old
+ * records to the new block when it's read. We don't want to clean old
+ * records out of the lookaside table every time we free a block because
+ * that happens a lot and would be costly; instead, we clean out the old
+ * records when adding new records into the lookaside table. This works
+ * because we only read from the lookaside table for pages marked with
+ * the WT_PAGE_LAS_UPDATE flag: that flag won't be set if we rewrite a
+ * block with no lookaside records, so the lookaside table won't be
+ * checked when the block is read, even if there are lookaside table
+ * records matching that block. If we rewrite a block that has lookaside
+ * records, we'll run this code, discarding any old records that might
+ * exist.
+ */
+ WT_ERR(__wt_las_remove_block(
+ session, cursor, btree_id, bnd->addr.addr, bnd->addr.size));
+
+ /* Lookaside table key component: block address. */
+ las_addr.data = bnd->addr.addr;
+ las_addr.size = bnd->addr.size;
+
+ /* Enter each update in the boundary's list into the lookaside store. */
+ for (las_counter = 0, i = 0,
+ list = bnd->supd; i < bnd->supd_next; ++i, ++list) {
+ /* Lookaside table key component: source key. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ p = key->mem;
+ WT_ERR(
+ __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins)));
+ key->size = WT_PTRDIFF(p, key->data);
+
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (list->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, page, list->rip, key, 0));
+ else {
+ key->data = WT_INSERT_KEY(list->ins);
+ key->size = WT_INSERT_KEY_SIZE(list->ins);
+ }
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Lookaside table value component: update reference. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ upd = list->ins->upd;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (list->ins == NULL) {
+ slot = WT_ROW_SLOT(page, list->rip);
+ upd = page->pg_row_upd[slot];
+ } else
+ upd = list->ins->upd;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * Walk the list of updates, storing each key/value pair into
+ * the lookaside table.
+ */
+ do {
+ cursor->set_key(cursor, btree_id,
+ &las_addr, ++las_counter, list->onpage_txn, key);
+
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ las_value.size = 0;
+ else {
+ las_value.data = WT_UPDATE_DATA(upd);
+ las_value.size = upd->size;
+ }
+ cursor->set_value(
+ cursor, upd->txnid, upd->size, &las_value);
+
+ WT_ERR(cursor->insert(cursor));
+ } while ((upd = upd->next) != NULL);
+ }
+
+err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ __wt_scr_free(session, &key);
+ return (ret);
+}
+
+/*
* __wt_bulk_init --
* Bulk insert initialization.
*/
@@ -3008,7 +3456,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
recno = 1;
break;
case BTREE_ROW:
- recno = 0;
+ recno = WT_RECNO_OOB;
break;
WT_ILLEGAL_VALUE(session);
}
@@ -3049,6 +3497,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_RET(__rec_split_finish(session, r));
WT_RET(__rec_write_wrapup(session, r, r->page));
+ WT_RET(__rec_write_status(session, r, r->page));
/* Mark the page's parent and the tree dirty. */
parent = r->ref->home;
@@ -3824,7 +4273,7 @@ record_loop: /*
* Write a placeholder.
*/
WT_ASSERT(session,
- F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+ F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
data = "@";
size = 1;
@@ -4207,7 +4656,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
vtype = state == WT_CHILD_PROXY ?
WT_CELL_ADDR_DEL : (u_int)vpack->raw;
}
- __rec_cell_build_addr(r, p, size, vtype, 0);
+ __rec_cell_build_addr(r, p, size, vtype, WT_RECNO_OOB);
CHILD_RELEASE_ERR(session, hazard, ref);
/*
@@ -4294,7 +4743,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
addr = &multi->addr;
__rec_cell_build_addr(
- r, addr->addr, addr->size, __rec_vtype(addr), 0);
+ r, addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB);
/* Boundary: split or write the page. */
if (key->len + val->len > r->space_avail)
@@ -4450,7 +4899,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* Assert the case.
*/
WT_ASSERT(session,
- F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+ F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
/*
* If the key is also a removed overflow item,
@@ -4777,13 +5226,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
static int
__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- WT_BM *bm;
WT_DECL_RET;
WT_PAGE_MODIFY *mod;
WT_MULTI *multi;
uint32_t i;
- bm = S2BT(session)->bm;
mod = page->modify;
/*
@@ -4799,17 +5246,17 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_free(session, multi->key.ikey);
break;
}
- if (multi->skip == NULL) {
+ if (multi->supd == NULL) {
if (multi->addr.reuse)
multi->addr.addr = NULL;
else {
- WT_RET(bm->free(bm, session,
+ WT_RET(__rec_block_free(session,
multi->addr.addr, multi->addr.size));
__wt_free(session, multi->addr.addr);
}
} else {
- __wt_free(session, multi->skip);
- __wt_free(session, multi->skip_dsk);
+ __wt_free(session, multi->supd);
+ __wt_free(session, multi->supd_dsk);
}
}
__wt_free(session, mod->mod_multi);
@@ -4882,7 +5329,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
WT_RET(__wt_ref_info(
session, ref, &addr, &addr_size, NULL));
- WT_RET(bm->free(bm, session, addr, addr_size));
+ WT_RET(__rec_block_free(session, addr, addr_size));
if (__wt_off_page(ref->home, ref->addr)) {
__wt_free(
session, ((WT_ADDR *)ref->addr)->addr);
@@ -4908,7 +5355,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* are checkpoints, and must be explicitly dropped.
*/
if (!__wt_ref_is_root(ref))
- WT_RET(bm->free(bm, session,
+ WT_RET(__rec_block_free(session,
mod->mod_replace.addr, mod->mod_replace.size));
/* Discard the replacement page's address. */
@@ -4962,14 +5409,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* nothing to write. Allocate, then initialize the array of
* replacement blocks.
*/
- if (bnd->skip != NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
WT_RET(__wt_calloc_def(
session, r->bnd_next, &mod->mod_multi));
multi = mod->mod_multi;
- multi->skip = bnd->skip;
- multi->skip_entries = bnd->skip_next;
- bnd->skip = NULL;
- multi->skip_dsk = bnd->dsk;
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->supd_dsk = bnd->dsk;
bnd->dsk = NULL;
mod->mod_multi_entries = 1;
@@ -5068,50 +5515,6 @@ err: __wt_scr_free(session, &tkey);
F_SET(mod, WT_PM_REC_MULTIBLOCK);
break;
}
-
- /*
- * If updates were skipped, the tree isn't clean. The checkpoint call
- * cleared the tree's modified value before calling the eviction thread,
- * so we must explicitly reset the tree's modified flag. We insert a
- * barrier after the change for clarity (the requirement is the value
- * be set before a subsequent checkpoint reads it, and because the
- * current checkpoint is waiting on this reconciliation to complete,
- * there's no risk of that happening).
- */
- if (r->leave_dirty) {
- mod->first_dirty_txn = r->first_dirty_txn;
-
- btree->modified = 1;
- WT_FULL_BARRIER();
- } else {
- /*
- * If no updates were skipped, we have a new maximum transaction
- * written for the page (used to decide if a clean page can be
- * evicted). Set the highest transaction ID for the page.
- *
- * Track the highest transaction ID for the tree (used to decide
- * if it's safe to discard all of the pages in the tree without
- * further checking). Reconciliation in the service of eviction
- * is multi-threaded, only update the tree's maximum transaction
- * ID when doing a checkpoint. That's sufficient, we only care
- * about the highest transaction ID of any update currently in
- * the tree, and checkpoint visits every dirty page in the tree.
- */
- mod->rec_max_txn = r->max_txn;
- if (!F_ISSET(r, WT_EVICTING) &&
- WT_TXNID_LT(btree->rec_max_txn, r->max_txn))
- btree->rec_max_txn = r->max_txn;
-
- /*
- * The page only might be clean; if the write generation is
- * unchanged since reconciliation started, it's clean. If the
- * write generation changed, the page has been written since
- * we started reconciliation and remains dirty.
- */
- if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0))
- __wt_cache_dirty_decr(session, page);
- }
-
return (0);
}
@@ -5122,14 +5525,12 @@ err: __wt_scr_free(session, &tkey);
static int
__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
{
- WT_BM *bm;
WT_BOUNDARY *bnd;
WT_DECL_RET;
WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
uint32_t i;
- bm = S2BT(session)->bm;
mod = page->modify;
/*
@@ -5160,7 +5561,7 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
if (bnd->addr.reuse)
bnd->addr.addr = NULL;
else {
- WT_TRET(bm->free(bm, session,
+ WT_TRET(__rec_block_free(session,
bnd->addr.addr, bnd->addr.size));
__wt_free(session, bnd->addr.addr);
}
@@ -5203,18 +5604,18 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_RET(__wt_row_ikey_alloc(session, 0,
bnd->key.data, bnd->key.size, &multi->key.ikey));
- if (bnd->skip == NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->supd_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ } else {
multi->addr = bnd->addr;
multi->addr.reuse = 0;
multi->size = bnd->size;
multi->cksum = bnd->cksum;
bnd->addr.addr = NULL;
- } else {
- multi->skip = bnd->skip;
- multi->skip_entries = bnd->skip_next;
- bnd->skip = NULL;
- multi->skip_dsk = bnd->dsk;
- bnd->dsk = NULL;
}
}
mod->mod_multi_entries = r->bnd_next;
@@ -5243,18 +5644,18 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
multi->key.recno = bnd->recno;
- if (bnd->skip == NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->supd_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ } else {
multi->addr = bnd->addr;
multi->addr.reuse = 0;
multi->size = bnd->size;
multi->cksum = bnd->cksum;
bnd->addr.addr = NULL;
- } else {
- multi->skip = bnd->skip;
- multi->skip_entries = bnd->skip_next;
- bnd->skip = NULL;
- multi->skip_dsk = bnd->dsk;
- bnd->dsk = NULL;
}
}
mod->mod_multi_entries = r->bnd_next;
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 06786db2f6d..a1f5618a317 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -383,6 +383,22 @@ err: if (cursor != NULL)
}
/*
+ * __wt_session_create --
+ * Internal version of WT_SESSION::create.
+ */
+int
+__wt_session_create(
+ WT_SESSION_IMPL *session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+
+ WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_TABLE_LOCK(session,
+ ret = __wt_schema_create(session, uri, config)));
+ return (ret);
+}
+
+/*
* __session_create --
* WT_SESSION->create method.
*/
@@ -423,9 +439,7 @@ __session_create(WT_SESSION *wt_session, const char *uri, const char *config)
WT_ERR_NOTFOUND_OK(ret);
}
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
- ret = __wt_schema_create(session, uri, config)));
+ ret = __wt_session_create(session, uri, config);
err: API_END_RET_NOTFOUND_MAP(session, ret);
}
@@ -529,6 +543,21 @@ __session_compact(WT_SESSION *wt_session, const char *uri, const char *config)
}
/*
+ * __wt_session_drop --
+ * Internal version of WT_SESSION::drop.
+ */
+int
+__wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_DECL_RET;
+
+ WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_TABLE_LOCK(session,
+ ret = __wt_schema_drop(session, uri, cfg)));
+ return (ret);
+}
+
+/*
* __session_drop --
* WT_SESSION->drop method.
*/
@@ -544,9 +573,7 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
/* Disallow objects in the WiredTiger name space. */
WT_ERR(__wt_str_name_check(session, uri));
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
- ret = __wt_schema_drop(session, uri, cfg)));
+ ret = __wt_session_drop(session, uri, cfg);
err: /* Note: drop operations cannot be unrolled (yet?). */
API_END_RET_NOTFOUND_MAP(session, ret);
@@ -915,7 +942,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
* If our LSN is smaller than the current sync LSN then our
* transaction is stable. We're done.
*/
- if (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) <= 0)
+ if (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) <= 0)
goto err;
/*
@@ -937,7 +964,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
* Keep checking the LSNs until we find it is stable or we reach
* our timeout.
*/
- while (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) > 0) {
+ while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) {
WT_ERR(__wt_cond_signal(session, conn->log_file_cond));
WT_ERR(__wt_epoch(session, &now));
waited_ms = WT_TIMEDIFF(now, start) / WT_MILLION;
@@ -1001,7 +1028,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
* operations, but checkpoint does enough I/O it may be called upon to
* perform slow operations for the block manager.
*/
- F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+ F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION);
/*
* Only one checkpoint can be active at a time, and checkpoints must run
@@ -1016,7 +1043,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
-err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION);
API_END_RET_NOTFOUND_MAP(session, ret);
}
diff --git a/src/third_party/wiredtiger/src/support/pow.c b/src/third_party/wiredtiger/src/support/pow.c
index 8e42113a2ee..0f50bfe56a1 100644
--- a/src/third_party/wiredtiger/src/support/pow.c
+++ b/src/third_party/wiredtiger/src/support/pow.c
@@ -100,7 +100,7 @@ __wt_log2_int(uint32_t n)
* __wt_ispo2 --
* Return if a number is a power-of-two.
*/
-int
+bool
__wt_ispo2(uint32_t v)
{
/*
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 6af357202be..79248b0652c 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -24,6 +24,7 @@ static const char * const __stats_dsrc_desc[] = {
"btree: column-store variable-size deleted values",
"btree: column-store fixed-size leaf pages",
"btree: column-store internal pages",
+ "btree: column-store variable-size RLE encoded values",
"btree: column-store variable-size leaf pages",
"btree: pages rewritten by compaction",
"btree: number of key/value pairs",
@@ -48,10 +49,14 @@ static const char * const __stats_dsrc_desc[] = {
"cache: internal pages evicted",
"cache: pages split during eviction",
"cache: in-memory page splits",
+ "cache: in-memory page passed criteria to be split",
"cache: overflow values cached in memory",
"cache: pages read into cache",
+ "cache: pages read into cache requiring lookaside entries",
"cache: overflow pages read into cache",
"cache: pages written from cache",
+ "cache: page written requiring lookaside records",
+ "cache: pages written requiring in-memory restoration",
"compression: raw compression call failed, no additional data available",
"compression: raw compression call failed, additional data available",
"compression: raw compression call succeeded",
@@ -137,6 +142,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->btree_column_internal = 0;
stats->btree_column_deleted = 0;
stats->btree_column_variable = 0;
+ stats->btree_column_rle = 0;
stats->btree_fixed_len = 0;
stats->btree_maxintlkey = 0;
stats->btree_maxintlpage = 0;
@@ -154,15 +160,19 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->cache_eviction_checkpoint = 0;
stats->cache_eviction_fail = 0;
stats->cache_eviction_hazard = 0;
+ stats->cache_inmem_splittable = 0;
stats->cache_inmem_split = 0;
stats->cache_eviction_internal = 0;
stats->cache_eviction_dirty = 0;
stats->cache_read_overflow = 0;
stats->cache_overflow_value = 0;
stats->cache_eviction_deepen = 0;
+ stats->cache_write_lookaside = 0;
stats->cache_read = 0;
+ stats->cache_read_lookaside = 0;
stats->cache_eviction_split = 0;
stats->cache_write = 0;
+ stats->cache_write_restore = 0;
stats->cache_eviction_clean = 0;
stats->compress_read = 0;
stats->compress_write = 0;
@@ -243,6 +253,7 @@ __wt_stat_dsrc_aggregate_single(
to->btree_column_internal += from->btree_column_internal;
to->btree_column_deleted += from->btree_column_deleted;
to->btree_column_variable += from->btree_column_variable;
+ to->btree_column_rle += from->btree_column_rle;
to->btree_fixed_len = from->btree_fixed_len;
if (from->btree_maxintlkey > to->btree_maxintlkey)
to->btree_maxintlkey = from->btree_maxintlkey;
@@ -266,15 +277,19 @@ __wt_stat_dsrc_aggregate_single(
to->cache_eviction_checkpoint += from->cache_eviction_checkpoint;
to->cache_eviction_fail += from->cache_eviction_fail;
to->cache_eviction_hazard += from->cache_eviction_hazard;
+ to->cache_inmem_splittable += from->cache_inmem_splittable;
to->cache_inmem_split += from->cache_inmem_split;
to->cache_eviction_internal += from->cache_eviction_internal;
to->cache_eviction_dirty += from->cache_eviction_dirty;
to->cache_read_overflow += from->cache_read_overflow;
to->cache_overflow_value += from->cache_overflow_value;
to->cache_eviction_deepen += from->cache_eviction_deepen;
+ to->cache_write_lookaside += from->cache_write_lookaside;
to->cache_read += from->cache_read;
+ to->cache_read_lookaside += from->cache_read_lookaside;
to->cache_eviction_split += from->cache_eviction_split;
to->cache_write += from->cache_write;
+ to->cache_write_restore += from->cache_write_restore;
to->cache_eviction_clean += from->cache_eviction_clean;
to->compress_read += from->compress_read;
to->compress_write += from->compress_write;
@@ -354,6 +369,7 @@ __wt_stat_dsrc_aggregate(
to->btree_column_deleted += WT_STAT_READ(from, btree_column_deleted);
to->btree_column_variable +=
WT_STAT_READ(from, btree_column_variable);
+ to->btree_column_rle += WT_STAT_READ(from, btree_column_rle);
to->btree_fixed_len = from[0]->btree_fixed_len;
if ((v = WT_STAT_READ(from, btree_maxintlkey)) >
to->btree_maxintlkey)
@@ -386,6 +402,8 @@ __wt_stat_dsrc_aggregate(
to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail);
to->cache_eviction_hazard +=
WT_STAT_READ(from, cache_eviction_hazard);
+ to->cache_inmem_splittable +=
+ WT_STAT_READ(from, cache_inmem_splittable);
to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split);
to->cache_eviction_internal +=
WT_STAT_READ(from, cache_eviction_internal);
@@ -394,9 +412,13 @@ __wt_stat_dsrc_aggregate(
to->cache_overflow_value += WT_STAT_READ(from, cache_overflow_value);
to->cache_eviction_deepen +=
WT_STAT_READ(from, cache_eviction_deepen);
+ to->cache_write_lookaside +=
+ WT_STAT_READ(from, cache_write_lookaside);
to->cache_read += WT_STAT_READ(from, cache_read);
+ to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split);
to->cache_write += WT_STAT_READ(from, cache_write);
+ to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean);
to->compress_read += WT_STAT_READ(from, compress_read);
to->compress_write += WT_STAT_READ(from, compress_write);
@@ -511,11 +533,17 @@ static const char * const __stats_connection_desc[] = {
"cache: pages walked for eviction",
"cache: eviction worker thread evicting pages",
"cache: in-memory page splits",
+ "cache: in-memory page passed criteria to be split",
+ "cache: lookaside table insert calls",
+ "cache: lookaside table remove calls",
"cache: percentage overhead",
"cache: tracked dirty pages in the cache",
"cache: pages currently held in the cache",
"cache: pages read into cache",
+ "cache: pages read into cache requiring lookaside entries",
"cache: pages written from cache",
+ "cache: page written requiring lookaside records",
+ "cache: pages written requiring in-memory restoration",
"connection: pthread mutex condition wait calls",
"cursor: cursor create calls",
"cursor: cursor insert calls",
@@ -558,9 +586,9 @@ static const char * const __stats_connection_desc[] = {
"log: logging bytes consolidated",
"log: consolidated slot joins",
"log: consolidated slot join races",
- "log: record size exceeded maximum",
- "log: failed to find a slot large enough for record",
+ "log: busy returns attempting to switch slots",
"log: consolidated slot join transitions",
+ "log: consolidated slot unbuffered writes",
"log: log sync operations",
"log: log sync_dir operations",
"log: log server thread advances write LSN",
@@ -667,21 +695,27 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cache_eviction_worker_evicting = 0;
stats->cache_eviction_force_fail = 0;
stats->cache_eviction_hazard = 0;
+ stats->cache_inmem_splittable = 0;
stats->cache_inmem_split = 0;
stats->cache_eviction_internal = 0;
+ stats->cache_lookaside_insert = 0;
+ stats->cache_lookaside_remove = 0;
/* not clearing cache_bytes_max */
/* not clearing cache_eviction_maximum_page_size */
stats->cache_eviction_dirty = 0;
stats->cache_eviction_deepen = 0;
+ stats->cache_write_lookaside = 0;
/* not clearing cache_pages_inuse */
stats->cache_eviction_force = 0;
stats->cache_eviction_force_delete = 0;
stats->cache_eviction_app = 0;
stats->cache_read = 0;
+ stats->cache_read_lookaside = 0;
stats->cache_eviction_fail = 0;
stats->cache_eviction_split = 0;
stats->cache_eviction_walk = 0;
stats->cache_write = 0;
+ stats->cache_write_restore = 0;
/* not clearing cache_overhead */
/* not clearing cache_bytes_internal */
/* not clearing cache_bytes_leaf */
@@ -716,11 +750,12 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->dh_sweeps = 0;
stats->dh_session_handles = 0;
stats->dh_session_sweeps = 0;
+ stats->log_slot_switch_busy = 0;
stats->log_slot_closes = 0;
stats->log_slot_races = 0;
stats->log_slot_transitions = 0;
stats->log_slot_joins = 0;
- stats->log_slot_toosmall = 0;
+ stats->log_slot_unbuffered = 0;
stats->log_bytes_payload = 0;
stats->log_bytes_written = 0;
stats->log_compress_writes = 0;
@@ -738,7 +773,6 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing log_prealloc_max */
stats->log_prealloc_files = 0;
stats->log_prealloc_used = 0;
- stats->log_slot_toobig = 0;
stats->log_scan_records = 0;
stats->log_compress_mem = 0;
/* not clearing log_buffer_size */
@@ -835,25 +869,35 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, cache_eviction_force_fail);
to->cache_eviction_hazard +=
WT_STAT_READ(from, cache_eviction_hazard);
+ to->cache_inmem_splittable +=
+ WT_STAT_READ(from, cache_inmem_splittable);
to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split);
to->cache_eviction_internal +=
WT_STAT_READ(from, cache_eviction_internal);
+ to->cache_lookaside_insert +=
+ WT_STAT_READ(from, cache_lookaside_insert);
+ to->cache_lookaside_remove +=
+ WT_STAT_READ(from, cache_lookaside_remove);
to->cache_bytes_max += WT_STAT_READ(from, cache_bytes_max);
to->cache_eviction_maximum_page_size +=
WT_STAT_READ(from, cache_eviction_maximum_page_size);
to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty);
to->cache_eviction_deepen +=
WT_STAT_READ(from, cache_eviction_deepen);
+ to->cache_write_lookaside +=
+ WT_STAT_READ(from, cache_write_lookaside);
to->cache_pages_inuse += WT_STAT_READ(from, cache_pages_inuse);
to->cache_eviction_force += WT_STAT_READ(from, cache_eviction_force);
to->cache_eviction_force_delete +=
WT_STAT_READ(from, cache_eviction_force_delete);
to->cache_eviction_app += WT_STAT_READ(from, cache_eviction_app);
to->cache_read += WT_STAT_READ(from, cache_read);
+ to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail);
to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split);
to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk);
to->cache_write += WT_STAT_READ(from, cache_write);
+ to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
to->cache_overhead += WT_STAT_READ(from, cache_overhead);
to->cache_bytes_internal += WT_STAT_READ(from, cache_bytes_internal);
to->cache_bytes_leaf += WT_STAT_READ(from, cache_bytes_leaf);
@@ -888,11 +932,12 @@ __wt_stat_connection_aggregate(
to->dh_sweeps += WT_STAT_READ(from, dh_sweeps);
to->dh_session_handles += WT_STAT_READ(from, dh_session_handles);
to->dh_session_sweeps += WT_STAT_READ(from, dh_session_sweeps);
+ to->log_slot_switch_busy += WT_STAT_READ(from, log_slot_switch_busy);
to->log_slot_closes += WT_STAT_READ(from, log_slot_closes);
to->log_slot_races += WT_STAT_READ(from, log_slot_races);
to->log_slot_transitions += WT_STAT_READ(from, log_slot_transitions);
to->log_slot_joins += WT_STAT_READ(from, log_slot_joins);
- to->log_slot_toosmall += WT_STAT_READ(from, log_slot_toosmall);
+ to->log_slot_unbuffered += WT_STAT_READ(from, log_slot_unbuffered);
to->log_bytes_payload += WT_STAT_READ(from, log_bytes_payload);
to->log_bytes_written += WT_STAT_READ(from, log_bytes_written);
to->log_compress_writes += WT_STAT_READ(from, log_compress_writes);
@@ -913,7 +958,6 @@ __wt_stat_connection_aggregate(
to->log_prealloc_max += WT_STAT_READ(from, log_prealloc_max);
to->log_prealloc_files += WT_STAT_READ(from, log_prealloc_files);
to->log_prealloc_used += WT_STAT_READ(from, log_prealloc_used);
- to->log_slot_toobig += WT_STAT_READ(from, log_slot_toobig);
to->log_scan_records += WT_STAT_READ(from, log_scan_records);
to->log_compress_mem += WT_STAT_READ(from, log_compress_mem);
to->log_buffer_size += WT_STAT_READ(from, log_buffer_size);
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index e671ce28ffb..9f59c53314e 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -246,6 +246,10 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
WT_ASSERT(session, session->dhandle->checkpoint == NULL);
WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:"));
+ /* Skip files that are never involved in a checkpoint. */
+ if (F_ISSET(S2BT(session), WT_BTREE_NO_CHECKPOINT))
+ return (0);
+
/* Make sure there is space for the next entry. */
WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
session->ckpt_handle_next + 1, &session->ckpt_handle));
@@ -1164,7 +1168,15 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final)
btree = S2BT(session);
bulk = F_ISSET(btree, WT_BTREE_BULK) ? 1 : 0;
- /* If the handle is already dead, discard it. */
+ /*
+ * If the handle is already dead or the file isn't durable, force the
+ * discard.
+ *
+ * If the file isn't durable, mark the handle dead, there are asserts
+ * later on that only dead handles can have modified pages.
+ */
+ if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ F_SET(session->dhandle, WT_DHANDLE_DEAD);
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
index 0d66eccd7dc..a63720d736f 100644
--- a/src/third_party/wiredtiger/src/txn/txn_log.c
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -33,18 +33,7 @@ __txn_op_log(WT_SESSION_IMPL *session,
* 3) row store remove; or
* 4) row store insert/update.
*/
- if (cbt->btree->type != BTREE_ROW) {
- WT_ASSERT(session, cbt->ins != NULL);
- recno = WT_INSERT_RECNO(cbt->ins);
- WT_ASSERT(session, recno != 0);
-
- if (WT_UPDATE_DELETED_ISSET(upd))
- WT_ERR(__wt_logop_col_remove_pack(session, logrec,
- op->fileid, recno));
- else
- WT_ERR(__wt_logop_col_put_pack(session, logrec,
- op->fileid, recno, &value));
- } else {
+ if (cbt->btree->type == BTREE_ROW) {
WT_ERR(__wt_cursor_row_leaf_key(cbt, &key));
if (WT_UPDATE_DELETED_ISSET(upd))
@@ -53,6 +42,16 @@ __txn_op_log(WT_SESSION_IMPL *session,
else
WT_ERR(__wt_logop_row_put_pack(session, logrec,
op->fileid, &key, &value));
+ } else {
+ recno = WT_INSERT_RECNO(cbt->ins);
+ WT_ASSERT(session, recno != WT_RECNO_OOB);
+
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ WT_ERR(__wt_logop_col_remove_pack(session, logrec,
+ op->fileid, recno));
+ else
+ WT_ERR(__wt_logop_col_put_pack(session, logrec,
+ op->fileid, recno, &value));
}
err: __wt_buf_free(session, &key);
@@ -308,7 +307,7 @@ __wt_txn_checkpoint_log(
switch (flags) {
case WT_TXN_LOG_CKPT_PREPARE:
txn->full_ckpt = 1;
- *ckpt_lsn = S2C(session)->log->write_start_lsn;
+ WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn));
/*
* We need to make sure that the log records in the checkpoint
* LSN are on disk. In particular to make sure that the
@@ -337,7 +336,7 @@ __wt_txn_checkpoint_log(
txn->ckpt_nsnapshot = 0;
WT_CLEAR(empty);
ckpt_snapshot = &empty;
- *ckpt_lsn = S2C(session)->log->write_start_lsn;
+ WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn));
} else
ckpt_snapshot = txn->ckpt_snapshot;
@@ -419,9 +418,9 @@ __wt_txn_truncate_log(
} else {
op->type = WT_TXN_OP_TRUNCATE_COL;
op->u.truncate_col.start =
- (start == NULL) ? 0 : start->recno;
+ (start == NULL) ? WT_RECNO_OOB : start->recno;
op->u.truncate_col.stop =
- (stop == NULL) ? 0 : stop->recno;
+ (stop == NULL) ? WT_RECNO_OOB : stop->recno;
}
/* Write that operation into the in-memory log. */
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index f321da303d7..240d0a5ffd3 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -65,7 +65,7 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
"No file found with ID %u (max %u)",
id, r->nfiles));
r->missing = 1;
- } else if (WT_LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) {
+ } else if (__wt_log_cmp(lsnp, &r->files[id].ckpt_lsn) >= 0) {
/*
* We're going to apply the operation. Get the cursor, opening
* one if none is cached.
@@ -144,10 +144,10 @@ __txn_op_apply(
GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
/* Set up the cursors. */
- if (start_recno == 0) {
+ if (start_recno == WT_RECNO_OOB) {
start = NULL;
stop = cursor;
- } else if (stop_recno == 0) {
+ } else if (stop_recno == WT_RECNO_OOB) {
start = cursor;
stop = NULL;
} else {
diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c
index 1888c7d967b..1d35f2efc72 100644
--- a/src/third_party/wiredtiger/src/utilities/util_list.c
+++ b/src/third_party/wiredtiger/src/utilities/util_list.c
@@ -97,12 +97,15 @@ list_print(WT_SESSION *session, const char *name, int cflag, int vflag)
}
/*
- * XXX
- * We don't normally say anything about the WiredTiger
- * metadata, it's not a normal "object" in the database. I'm
- * making an exception for the checkpoint and verbose options.
+ * !!!
+ * We don't normally say anything about the WiredTiger metadata
+ * and lookaside tables, they're not application/user "objects"
+ * in the database. I'm making an exception for the checkpoint
+ * and verbose options.
*/
- if (strcmp(key, WT_METADATA_URI) != 0 || cflag || vflag)
+ if (cflag || vflag ||
+ (strcmp(key, WT_METADATA_URI) != 0 &&
+ strcmp(key, WT_LAS_URI) != 0))
printf("%s\n", key);
if (!cflag && !vflag)