summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@wiredtiger.com>2015-03-09 17:47:27 +1100
committerMichael Cahill <michael.cahill@wiredtiger.com>2015-03-09 17:47:27 +1100
commit3a3bda539cdd34428b7489fa0fa102ff0605e8d8 (patch)
treefc901ad7b45300181356b8305ecb02fff13f4bfc
parent89f45aafdff48bf7c8e191b788a144cab0b86122 (diff)
parent0afa07b0cd666adf7576901540a699b0bec396e3 (diff)
downloadmongo-3a3bda539cdd34428b7489fa0fa102ff0605e8d8.tar.gz
Merge branch 'develop' into mongodb-3.0
Conflicts: NEWS.MONGODB
-rw-r--r--README6
-rw-r--r--RELEASE_INFO2
-rw-r--r--build_posix/Make.subdirs1
-rw-r--r--build_posix/aclocal/version-set.m44
-rw-r--r--build_posix/aclocal/version.m42
-rw-r--r--dist/flags.py6
-rw-r--r--dist/package/wiredtiger.spec2
-rw-r--r--dist/s_define.list1
-rw-r--r--dist/s_string.ok2
-rw-r--r--dist/stat_data.py3
-rw-r--r--examples/c/ex_pack.c7
-rw-r--r--src/btree/bt_delete.c3
-rw-r--r--src/btree/bt_handle.c5
-rw-r--r--src/btree/bt_page.c3
-rw-r--r--src/btree/bt_slvg.c7
-rw-r--r--src/btree/bt_split.c41
-rw-r--r--src/btree/bt_stat.c155
-rw-r--r--src/btree/bt_sync.c41
-rw-r--r--src/btree/bt_walk.c110
-rw-r--r--src/btree/row_key.c57
-rw-r--r--src/conn/conn_api.c3
-rw-r--r--src/conn/conn_cache.c82
-rw-r--r--src/conn/conn_cache_pool.c25
-rw-r--r--src/conn/conn_dhandle.c61
-rw-r--r--src/conn/conn_log.c143
-rw-r--r--src/conn/conn_open.c6
-rw-r--r--src/conn/conn_stat.c2
-rw-r--r--src/cursor/cur_index.c2
-rw-r--r--src/cursor/cur_stat.c1
-rw-r--r--src/docs/spell.ok1
-rw-r--r--src/docs/tune-memory-allocator.dox5
-rw-r--r--src/evict/evict_file.c34
-rw-r--r--src/evict/evict_lru.c95
-rw-r--r--src/evict/evict_page.c4
-rw-r--r--src/include/btmem.h55
-rw-r--r--src/include/btree.i6
-rw-r--r--src/include/cache.h7
-rw-r--r--src/include/connection.h4
-rw-r--r--src/include/error.h4
-rw-r--r--src/include/extern.h10
-rw-r--r--src/include/flags.h30
-rw-r--r--src/include/log.h10
-rw-r--r--src/include/stat.h3
-rw-r--r--src/include/wiredtiger.in111
-rw-r--r--src/log/log.c95
-rw-r--r--src/log/log_slot.c30
-rw-r--r--src/lsm/lsm_cursor.c24
-rw-r--r--src/lsm/lsm_work_unit.c18
-rw-r--r--src/reconcile/rec_write.c76
-rw-r--r--src/schema/schema_worker.c16
-rw-r--r--src/support/stat.c8
-rw-r--r--src/txn/txn_ckpt.c27
-rw-r--r--test/bloom/test_bloom.c44
-rw-r--r--test/format/bdb.c5
-rw-r--r--test/format/config.c58
-rw-r--r--test/format/config.h15
-rw-r--r--test/format/format.h2
-rw-r--r--test/format/ops.c58
-rw-r--r--test/format/recover.sh14
-rwxr-xr-xtest/format/smoke.sh2
-rw-r--r--test/format/t.c66
-rw-r--r--test/packing/Makefile.am5
-rw-r--r--test/packing/intpack-test.c28
-rw-r--r--test/packing/intpack-test2.c17
-rw-r--r--test/packing/packing-test.c20
-rw-r--r--test/salvage/salvage.c32
-rw-r--r--test/suite/test_bug009.py67
-rw-r--r--test/suite/test_bug010.py103
-rw-r--r--test/suite/test_dump.py34
-rw-r--r--test/suite/test_shared_cache01.py (renamed from test/suite/test_shared_cache.py)39
-rw-r--r--test/suite/test_shared_cache02.py169
-rw-r--r--test/suite/test_stat01.py98
72 files changed, 1612 insertions, 720 deletions
diff --git a/README b/README
index d9994f0526d..d89756c9f42 100644
--- a/README
+++ b/README
@@ -1,6 +1,6 @@
-WiredTiger 2.5.1: (March 9, 2015)
+WiredTiger 2.5.2: (March 9, 2015)
-This is version 2.5.1 of WiredTiger.
+This is version 2.5.2 of WiredTiger.
WiredTiger release packages and documentation can be found at:
@@ -9,7 +9,7 @@ WiredTiger release packages and documentation can be found at:
Information on configuring, building and installing WiredTiger can be
found at:
- http://source.wiredtiger.com/2.5.1/install.html
+ http://source.wiredtiger.com/2.5.2/install.html
WiredTiger licensing information can be found at:
diff --git a/RELEASE_INFO b/RELEASE_INFO
index 6c7da8cb961..ac5ff8ac028 100644
--- a/RELEASE_INFO
+++ b/RELEASE_INFO
@@ -1,6 +1,6 @@
WIREDTIGER_VERSION_MAJOR=2
WIREDTIGER_VERSION_MINOR=5
-WIREDTIGER_VERSION_PATCH=1
+WIREDTIGER_VERSION_PATCH=2
WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH"
WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"`
diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs
index d37acef50e1..82feee58aa1 100644
--- a/build_posix/Make.subdirs
+++ b/build_posix/Make.subdirs
@@ -24,5 +24,6 @@ test/checkpoint
test/fops
test/format HAVE_BERKELEY_DB
test/huge
+test/packing
test/salvage
test/thread
diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4
index 7f4d68e8b39..cbd389ea40d 100644
--- a/build_posix/aclocal/version-set.m4
+++ b/build_posix/aclocal/version-set.m4
@@ -2,8 +2,8 @@ dnl build by dist/s_version
VERSION_MAJOR=2
VERSION_MINOR=5
-VERSION_PATCH=1
-VERSION_STRING='"WiredTiger 2.5.1: (March 9, 2015)"'
+VERSION_PATCH=2
+VERSION_STRING='"WiredTiger 2.5.2: (March 9, 2015)"'
AC_SUBST(VERSION_MAJOR)
AC_SUBST(VERSION_MINOR)
diff --git a/build_posix/aclocal/version.m4 b/build_posix/aclocal/version.m4
index 71598b276eb..340f77e5474 100644
--- a/build_posix/aclocal/version.m4
+++ b/build_posix/aclocal/version.m4
@@ -1,2 +1,2 @@
dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version
-2.5.1
+2.5.2
diff --git a/dist/flags.py b/dist/flags.py
index a0e307debf6..f1eb6b24968 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -36,12 +36,11 @@ flags = {
'page_read' : [
'READ_CACHE',
'READ_COMPACT',
- 'READ_NO_GEN',
'READ_NO_EVICT',
+ 'READ_NO_GEN',
'READ_NO_WAIT',
'READ_PREV',
'READ_SKIP_INTL',
- 'READ_SKIP_LEAF',
'READ_TRUNCATE',
'READ_WONT_NEED',
],
@@ -88,15 +87,16 @@ flags = {
'conn' : [
'CONN_CACHE_POOL',
'CONN_CKPT_SYNC',
+ 'CONN_CLOSING',
'CONN_EVICTION_RUN',
'CONN_LEAK_MEMORY',
'CONN_LOG_SERVER_RUN',
'CONN_LSM_MERGE',
'CONN_PANIC',
- 'CONN_SERVER_RUN',
'CONN_SERVER_ASYNC',
'CONN_SERVER_CHECKPOINT',
'CONN_SERVER_LSM',
+ 'CONN_SERVER_RUN',
'CONN_SERVER_STATISTICS',
'CONN_SERVER_SWEEP',
'CONN_WAS_BACKUP',
diff --git a/dist/package/wiredtiger.spec b/dist/package/wiredtiger.spec
index ab762ef17fd..11eca316ffd 100644
--- a/dist/package/wiredtiger.spec
+++ b/dist/package/wiredtiger.spec
@@ -1,5 +1,5 @@
Name: wiredtiger
-Version: 2.5.1
+Version: 2.5.2
Release: 1%{?dist}
Summary: WiredTiger data storage engine
diff --git a/dist/s_define.list b/dist/s_define.list
index 91fbc971afa..4924a1935ae 100644
--- a/dist/s_define.list
+++ b/dist/s_define.list
@@ -49,6 +49,7 @@ WT_STAT_ATOMIC_DECR
WT_STAT_ATOMIC_DECRV
WT_STAT_ATOMIC_INCR
WT_STAT_ATOMIC_INCRV
+WT_STAT_DECR
WT_STAT_DECRV
WT_STAT_FAST_ATOMIC_DECR
WT_STAT_FAST_ATOMIC_DECRV
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 66439faf161..8b0335a6480 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -551,6 +551,7 @@ dest
dev
dhandle
dhandles
+dir
dirlist
dl
dlclose
@@ -1161,6 +1162,7 @@ wrapup
writelock
writeunlock
wrlock
+wrlsn
ws
wti
wtperf
diff --git a/dist/stat_data.py b/dist/stat_data.py
index 5a42f2ff318..dd4d292c8b6 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -221,11 +221,14 @@ connection_stats = [
LogStat('log_prealloc_max', 'number of pre-allocated log files to create'),
LogStat('log_prealloc_used', 'pre-allocated log files used'),
LogStat('log_reads', 'log read operations'),
+ LogStat('log_release_write_lsn', 'log release advances write LSN'),
LogStat('log_scan_records', 'records processed by log scan'),
LogStat('log_scan_rereads', 'log scan records requiring two reads'),
LogStat('log_scans', 'log scan operations'),
LogStat('log_sync', 'log sync operations'),
+ LogStat('log_sync_dir', 'log sync_dir operations'),
LogStat('log_writes', 'log write operations'),
+ LogStat('log_write_lsn', 'log server thread advances write LSN'),
LogStat('log_slot_consolidated', 'logging bytes consolidated'),
LogStat('log_slot_closes', 'consolidated slot closures'),
diff --git a/examples/c/ex_pack.c b/examples/c/ex_pack.c
index 19be35119af..c24805ade29 100644
--- a/examples/c/ex_pack.c
+++ b/examples/c/ex_pack.c
@@ -42,8 +42,6 @@ main(void)
{
WT_CONNECTION *conn;
WT_SESSION *session;
- char buf[50];
- size_t size;
int i, j, k, ret;
/*
@@ -66,7 +64,11 @@ main(void)
fprintf(stderr, "Error opening a session on %s: %s\n",
home, wiredtiger_strerror(ret));
+ {
/*! [packing] */
+ size_t size;
+ char buf[50];
+
ret = wiredtiger_struct_size(session, &size, "iii", 42, 1000, -9);
if (size > sizeof(buf)) {
/* Allocate a bigger buffer. */
@@ -76,6 +78,7 @@ main(void)
ret = wiredtiger_struct_unpack(session, buf, size, "iii", &i, &j, &k);
/*! [packing] */
+ }
/* Note: closing the connection implicitly closes open session(s). */
if ((ret = conn->close(conn, NULL)) != 0)
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index 9c4ab05ce40..479f6547e42 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -221,9 +221,6 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
int skip;
- if (ref->state != WT_REF_DELETED)
- return (0);
-
/*
* Deleted pages come from two sources: either it's a fast-delete as
* described above, or the page has been emptied by other operations
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 299849ad365..5b3624a4a2d 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -453,8 +453,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation)
ref->page = NULL;
ref->addr = NULL;
ref->state = WT_REF_DELETED;
- WT_ERR(__wt_row_ikey_incr(
- session, root, 0, "", 1, &ref->key.ikey));
+ WT_ERR(__wt_row_ikey_incr(session, root, 0, "", 1, ref));
break;
WT_ILLEGAL_VALUE_ERR(session);
}
@@ -634,7 +633,7 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage);
cache_size = S2C(session)->cache_size;
if (cache_size > 0)
- btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 2);
+ btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 4);
/*
* Get the split percentage (reconciliation splits pages into smaller
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index b5140beb792..e177b05cd24 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -165,6 +165,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
if (oldgen && page->read_gen == WT_READGEN_NOTSET)
__wt_page_evict_soon(page);
else if (!LF_ISSET(WT_READ_NO_GEN) &&
+ page->read_gen != WT_READGEN_OLDEST &&
page->read_gen < __wt_cache_read_gen(session))
page->read_gen =
__wt_cache_read_gen_set(session);
@@ -611,7 +612,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
WT_ERR(__wt_row_ikey_incr(session, page,
WT_PAGE_DISK_OFFSET(page, cell),
- current->data, current->size, &ref->key.ikey));
+ current->data, current->size, ref));
*sizep += sizeof(WT_IKEY) + current->size;
break;
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 1cf616a2f6b..d6c20556a9a 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -1858,8 +1858,7 @@ __slvg_row_build_internal(
WT_ERR(__slvg_row_build_leaf(session, trk, ref, ss));
} else {
WT_ERR(__wt_row_ikey_incr(session, page, 0,
- trk->row_start.data, trk->row_start.size,
- &ref->key.ikey));
+ trk->row_start.data, trk->row_start.size, ref));
WT_ERR(__slvg_ovfl_ref_all(session, trk));
}
@@ -1981,8 +1980,8 @@ __slvg_row_build_leaf(
*/
rip = page->pg_row_d + skip_start;
WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
- WT_ERR(__wt_row_ikey_incr(session,
- ref->home, 0, key->data, key->size, &ref->key.ikey));
+ WT_ERR(__wt_row_ikey_incr(
+ session, ref->home, 0, key->data, key->size, ref));
/* Set the referenced flag on overflow pages we're using. */
if (trk->trk_ovfl_cnt != 0)
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 6ebd4609efa..95fb9c68a86 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -281,8 +281,8 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session,
if (parent->type == WT_PAGE_ROW_INT) {
if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
__wt_ref_key(parent, ref, &key, &size);
- WT_RET(__wt_row_ikey(session, 0, key, size, &ikey));
- ref->key.ikey = ikey;
+ WT_RET(__wt_row_ikey(session, 0, key, size, ref));
+ ikey = ref->key.ikey;
} else {
WT_RET(__split_ovfl_key_cleanup(session, parent, ref));
*parent_decrp += sizeof(WT_IKEY) + ikey->size;
@@ -454,8 +454,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
ref->addr = NULL;
if (parent->type == WT_PAGE_ROW_INT) {
__wt_ref_key(parent, *parent_refp, &p, &size);
- WT_ERR(
- __wt_row_ikey(session, 0, p, size, &ref->key.ikey));
+ WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
parent_incr += sizeof(WT_IKEY) + size;
} else
ref->key.recno = (*parent_refp)->key.recno;
@@ -468,7 +467,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
/* Mark it dirty. */
WT_ERR(__wt_page_modify_init(session, child));
- __wt_page_only_modify_set(session, child);
+ __wt_page_modify_set(session, child);
/*
* Once the split goes live, the newly created internal pages
@@ -761,8 +760,8 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
ikey = multi->key.ikey;
- WT_RET(__wt_row_ikey(session, 0,
- WT_IKEY_DATA(ikey), ikey->size, &ref->key.ikey));
+ WT_RET(__wt_row_ikey(
+ session, 0, WT_IKEY_DATA(ikey), ikey->size, ref));
incr += sizeof(WT_IKEY) + ikey->size;
break;
default:
@@ -855,7 +854,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
for (i = 0, deleted_entries = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
- if (__wt_delete_page_skip(session, next_ref) &&
+ if (next_ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, next_ref) &&
WT_ATOMIC_CAS4(next_ref->state,
WT_REF_DELETED, WT_REF_SPLIT))
deleted_entries++;
@@ -1139,15 +1139,23 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT);
/*
- * The first page in the split is the current page, but we still need to
- * create a replacement WT_REF and make a copy of the key (the original
- * WT_REF is set to split-status and eventually freed).
- *
- * The new reference is visible to readers once the split completes.
+ * The first page in the split is the current page, but we still have
+ * to create a replacement WT_REF, the original WT_REF will be set to
+ * split status and eventually freed.
*/
WT_ERR(__wt_calloc_one(session, &split_ref[0]));
child = split_ref[0];
*child = *ref;
+
+ /*
+ * The new WT_REF is not quite identical: we have to instantiate a key,
+ * and the new reference is visible to readers once the split completes.
+ *
+ * The key-instantiation code checks for races, clear the key fields so
+ * we don't trigger them.
+ */
+ child->key.recno = 0;
+ child->key.ikey = NULL;
child->state = WT_REF_MEM;
/*
@@ -1167,8 +1175,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
} else
WT_ERR(__wt_row_leaf_key(
session, page, &page->pg_row_d[0], key, 1));
- WT_ERR(__wt_row_ikey(
- session, 0, key->data, key->size, &child->key.ikey));
+ WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child));
parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + key->size;
__wt_scr_free(session, &key);
@@ -1187,7 +1194,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
child->state = WT_REF_MEM;
WT_ERR(__wt_row_ikey(session, 0,
WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins),
- &child->key.ikey));
+ child));
parent_incr +=
sizeof(WT_REF) + sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins);
@@ -1203,7 +1210,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
/* The new page is dirty by definition. */
WT_ERR(__wt_page_modify_init(session, right));
- __wt_page_only_modify_set(session, right);
+ __wt_page_modify_set(session, right);
/*
* We modified the page above, which will have set the first dirty
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 2e34a925f84..b550158a5a9 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -9,8 +9,9 @@
#include "wt_internal.h"
static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
-static int __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *);
-static int __stat_page_row_leaf(WT_PAGE *, WT_DSRC_STATS *);
+static void __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *);
+static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
+static void __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
/*
* __wt_btree_stat_init --
@@ -89,18 +90,13 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
WT_STAT_INCRV(stats, btree_entries, pindex->entries);
break;
case WT_PAGE_COL_VAR:
- WT_RET(__stat_page_col_var(page, stats));
- break;
- case WT_PAGE_OVFL:
- WT_STAT_INCR(stats, btree_overflow);
+ __stat_page_col_var(page, stats);
break;
case WT_PAGE_ROW_INT:
- WT_STAT_INCR(stats, btree_row_internal);
- pindex = WT_INTL_INDEX_COPY(page);
- WT_STAT_INCRV(stats, btree_entries, pindex->entries);
+ __stat_page_row_int(session, page, stats);
break;
case WT_PAGE_ROW_LEAF:
- WT_RET(__stat_page_row_leaf(page, stats));
+ __stat_page_row_leaf(session, page, stats);
break;
WT_ILLEGAL_VALUE(session);
}
@@ -111,7 +107,7 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
* __stat_page_col_var --
* Stat a WT_PAGE_COL_VAR page.
*/
-static int
+static void
__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
{
WT_CELL *cell;
@@ -119,29 +115,33 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
WT_COL *cip;
WT_INSERT *ins;
WT_UPDATE *upd;
+ uint64_t deleted_cnt, entry_cnt, ovfl_cnt;
uint32_t i;
int orig_deleted;
unpack = &_unpack;
+ deleted_cnt = entry_cnt = ovfl_cnt = 0;
WT_STAT_INCR(stats, btree_column_variable);
/*
- * Walk the page, counting regular and overflow data items, and checking
- * to be sure any updates weren't deletions. If the item was updated,
- * assume it was updated by an item of the same size (it's expensive to
- * figure out if it will require the same space or not, especially if
- * there's Huffman encoding).
+ * Walk the page counting regular items, adjusting if the item has been
+ * subsequently deleted or not. This is a mess because 10-item RLE might
+ * have 3 of the items subsequently deleted. Overflow items are harder,
+ * we can't know if an updated item will be an overflow item or not; do
+ * our best, and simply count every overflow item (or RLE set of items)
+ * we see.
*/
WT_COL_FOREACH(page, cip, i) {
if ((cell = WT_COL_PTR(page, cip)) == NULL) {
orig_deleted = 1;
- WT_STAT_INCR(stats, btree_column_deleted);
+ ++deleted_cnt;
} else {
orig_deleted = 0;
__wt_cell_unpack(cell, unpack);
- WT_STAT_INCRV(
- stats, btree_entries, __wt_cell_rle(unpack));
+ entry_cnt += __wt_cell_rle(unpack);
+ if (unpack->ovfl)
+ ++ovfl_cnt;
}
/*
@@ -151,57 +151,128 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
upd = ins->upd;
if (WT_UPDATE_DELETED_ISSET(upd)) {
- if (orig_deleted)
- continue;
- WT_STAT_INCR(stats, btree_column_deleted);
- WT_STAT_DECR(stats, btree_entries);
- } else {
- if (!orig_deleted)
- continue;
- WT_STAT_DECR(stats, btree_column_deleted);
- WT_STAT_INCR(stats, btree_entries);
- }
+ if (!orig_deleted) {
+ ++deleted_cnt;
+ --entry_cnt;
+ }
+ } else
+ if (orig_deleted) {
+ --deleted_cnt;
+ ++entry_cnt;
+ }
}
}
- return (0);
+
+ /* Walk any append list. */
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page))
+ ++entry_cnt;
+
+ WT_STAT_INCRV(stats, btree_column_deleted, deleted_cnt);
+ WT_STAT_INCRV(stats, btree_entries, entry_cnt);
+ WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt);
+}
+
+/*
+ * __stat_page_row_int --
+ * Stat a WT_PAGE_ROW_INT page.
+ */
+static void
+__stat_page_row_int(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_PAGE_INDEX *pindex;
+ uint32_t i, ovfl_cnt;
+
+ btree = S2BT(session);
+ ovfl_cnt = 0;
+
+ WT_STAT_INCR(stats, btree_row_internal);
+
+ /*
+ * The number of entries tells us the number of items on a row-store
+ * internal page.
+ */
+ pindex = WT_INTL_INDEX_COPY(page);
+ WT_STAT_INCRV(stats, btree_entries, pindex->entries);
+
+ /*
+ * Overflow keys are hard: we have to walk the disk image to count them,
+ * the in-memory representation of the page doesn't necessarily contain
+ * a reference to the original cell.
+ */
+ if (page->dsk != NULL)
+ WT_CELL_FOREACH(btree, page->dsk, cell, &unpack, i) {
+ __wt_cell_unpack(cell, &unpack);
+ if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL)
+ ++ovfl_cnt;
+ }
+
+ WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt);
}
/*
* __stat_page_row_leaf --
* Stat a WT_PAGE_ROW_LEAF page.
*/
-static int
-__stat_page_row_leaf(WT_PAGE *page, WT_DSRC_STATS *stats)
+static void
+__stat_page_row_leaf(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
WT_INSERT *ins;
WT_ROW *rip;
WT_UPDATE *upd;
- uint32_t cnt, i;
+ uint32_t entry_cnt, i, ovfl_cnt;
+
+ btree = S2BT(session);
+ entry_cnt = ovfl_cnt = 0;
WT_STAT_INCR(stats, btree_row_leaf);
/*
- * Stat any K/V pairs inserted into the page before the first from-disk
+ * Walk any K/V pairs inserted into the page before the first from-disk
* key on the page.
*/
- cnt = 0;
WT_SKIP_FOREACH(ins, WT_ROW_INSERT_SMALLEST(page))
if (!WT_UPDATE_DELETED_ISSET(ins->upd))
- ++cnt;
+ ++entry_cnt;
- /* Stat the page's K/V pairs. */
+ /*
+ * Walk the page's K/V pairs. Count overflow values, where an overflow
+ * item is any on-disk overflow item that hasn't been updated.
+ */
WT_ROW_FOREACH(page, rip, i) {
upd = WT_ROW_UPDATE(page, rip);
if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd))
- ++cnt;
+ ++entry_cnt;
+ if (upd == NULL && (cell =
+ __wt_row_leaf_value_cell(page, rip, NULL)) != NULL &&
+ __wt_cell_type(cell) == WT_CELL_VALUE_OVFL)
+ ++ovfl_cnt;
- /* Stat inserted K/V pairs. */
+ /* Walk K/V pairs inserted after the on-page K/V pair. */
WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip))
if (!WT_UPDATE_DELETED_ISSET(ins->upd))
- ++cnt;
+ ++entry_cnt;
}
- WT_STAT_INCRV(stats, btree_entries, cnt);
+ /*
+ * Overflow keys are hard: we have to walk the disk image to count them,
+ * the in-memory representation of the page doesn't necessarily contain
+ * a reference to the original cell.
+ */
+ if (page->dsk != NULL)
+ WT_CELL_FOREACH(btree, page->dsk, cell, &unpack, i) {
+ __wt_cell_unpack(cell, &unpack);
+ if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL)
+ ++ovfl_cnt;
+ }
- return (0);
+ WT_STAT_INCRV(stats, btree_entries, entry_cnt);
+ WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt);
}
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index d925eefc2fe..bc5d1051b1e 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -113,6 +113,13 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
if (walk == NULL)
break;
+ page = walk->page;
+ mod = page->modify;
+
+ /* Skip clean pages. */
+ if (!__wt_page_is_modified(page))
+ continue;
+
/*
* Write dirty pages, unless we can be sure they only
* became dirty after the checkpoint started.
@@ -125,23 +132,27 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
* (3) the first dirty update on the page is
* sufficiently recent that the checkpoint
* transaction would skip them.
+ *
+ * Mark the tree dirty: the checkpoint marked it clean
+ * and we can't skip future checkpoints until this page
+ * is written.
*/
- page = walk->page;
- mod = page->modify;
- if (__wt_page_is_modified(page) &&
- (WT_PAGE_IS_INTERNAL(page) ||
- !F_ISSET(txn, TXN_HAS_SNAPSHOT) ||
- TXNID_LE(mod->first_dirty_txn, txn->snap_max))) {
- if (WT_PAGE_IS_INTERNAL(page)) {
- internal_bytes +=
- page->memory_footprint;
- ++internal_pages;
- } else {
- leaf_bytes += page->memory_footprint;
- ++leaf_pages;
- }
- WT_ERR(__wt_reconcile(session, walk, NULL, 0));
+ if (!WT_PAGE_IS_INTERNAL(page) &&
+ F_ISSET(txn, TXN_HAS_SNAPSHOT) &&
+ TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
+ __wt_page_modify_set(session, page);
+ continue;
+ }
+
+ if (WT_PAGE_IS_INTERNAL(page)) {
+ internal_bytes +=
+ page->memory_footprint;
+ ++internal_pages;
+ } else {
+ leaf_bytes += page->memory_footprint;
+ ++leaf_pages;
}
+ WT_ERR(__wt_reconcile(session, walk, NULL, 0));
}
break;
}
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index 10dd5b12936..917e0c54a30 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -20,12 +20,11 @@ __wt_tree_walk(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_PAGE *page;
WT_PAGE_INDEX *pindex;
- WT_REF *couple, *ref;
- int descending, prev, skip;
+ WT_REF *couple, *couple_orig, *ref;
+ int prev, skip;
uint32_t slot;
btree = S2BT(session);
- descending = 0;
/*
* Tree walks are special: they look inside page structures that splits
@@ -79,7 +78,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session,
* here. We check when discarding pages that we're not discarding that
* page, so this clear must be done before the page is released.
*/
- couple = ref = *refp;
+ couple = couple_orig = ref = *refp;
*refp = NULL;
/* If no page is active, begin a walk from the start of the tree. */
@@ -102,29 +101,6 @@ ascend: /*
/* Figure out the current slot in the WT_REF array. */
__wt_page_refp(session, ref, &pindex, &slot);
- if (0) {
-restart: /*
- * The page we're moving to might have split, in which case find
- * the last position we held.
- *
- * If we were starting a tree walk, begin again.
- *
- * If we were in the process of descending, repeat the descent.
- * If we were moving within a single level of the tree, repeat
- * the last move.
- */
- ref = couple;
- if (ref == &btree->root) {
- ref = &btree->root;
- if (ref->page == NULL)
- goto done;
- goto descend;
- }
- __wt_page_refp(session, ref, &pindex, &slot);
- if (descending)
- goto descend;
- }
-
for (;;) {
/*
* If we're at the last/first slot on the page, return this page
@@ -152,14 +128,11 @@ restart: /*
/*
* Locate the reference to our parent page then
* swap our child hazard pointer for the parent.
- * We don't handle a restart return because it
- * would require additional complexity in the
- * restart code (ascent code somewhat like the
- * descent code already there), and it's not a
- * possible return: we're moving to the parent
- * of the current child, not another child of
- * the same parent, there's no way our parent
- * split.
+ * We don't handle restart or not-found returns.
+ * It would require additional complexity and is
+ * not a possible return: we're moving to the
+ * parent of the current child page, our parent
+ * reference can't have split or been evicted.
*/
__wt_page_refp(session, ref, &pindex, &slot);
if ((ret = __wt_page_swap(
@@ -182,7 +155,7 @@ restart: /*
if (walkcntp != NULL)
++*walkcntp;
- for (descending = 0;;) {
+ for (;;) {
ref = pindex->index[slot];
if (LF_ISSET(WT_READ_CACHE)) {
@@ -198,7 +171,8 @@ restart: /*
* Avoid pulling a deleted page back in to try
* to delete it again.
*/
- if (__wt_delete_page_skip(session, ref))
+ if (ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, ref))
break;
/*
* If deleting a range, try to delete the page
@@ -232,26 +206,67 @@ restart: /*
}
} else {
/*
- * If iterating a cursor, try to skip deleted
- * pages that are visible to us.
+ * Try to skip deleted pages visible to us.
*/
- if (__wt_delete_page_skip(session, ref))
+ if (ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, ref))
break;
}
ret = __wt_page_swap(session, couple, ref, flags);
+
+ /*
+ * Not-found is an expected return when only walking
+ * in-cache pages.
+ */
if (ret == WT_NOTFOUND) {
ret = 0;
break;
}
- if (ret == WT_RESTART)
- goto restart;
+
+ /*
+ * The page we're moving to might have split, in which
+ * case move to the last position we held.
+ */
+ if (ret == WT_RESTART) {
+ ret = 0;
+
+ /*
+ * If this is a new walk that never coupled from
+ * the root to a new saved position in the tree,
+ * restart the walk.
+ */
+ if (couple == &btree->root) {
+ ref = &btree->root;
+ if (ref->page == NULL)
+ goto done;
+ goto descend;
+ }
+
+ /*
+ * If restarting from some original position,
+ * repeat the increment or decrement we made at
+ * that time. Otherwise, couple is an internal
+ * page we've acquired after moving from that
+ * starting position and we can treat it as a
+ * new page. This works because we never acquire
+ * a hazard pointer on a leaf page we're not
+ * going to return to our caller; this will quit
+ * working if that ever changes.
+ */
+ WT_ASSERT(session,
+ couple == couple_orig ||
+ WT_PAGE_IS_INTERNAL(couple->page));
+ ref = couple;
+ __wt_page_refp(session, ref, &pindex, &slot);
+ if (couple == couple_orig)
+ break;
+ }
WT_ERR(ret);
/*
- * Entering a new page: configure for traversal of any
- * internal page's children, else return (or optionally
- * skip), the leaf page.
+ * A new page: configure for traversal of any internal
+ * page's children, else return the leaf page.
*/
descend: couple = ref;
page = ref->page;
@@ -259,10 +274,7 @@ descend: couple = ref;
page->type == WT_PAGE_COL_INT) {
pindex = WT_INTL_INDEX_COPY(page);
slot = prev ? pindex->entries - 1 : 0;
- descending = 1;
- } else if (LF_ISSET(WT_READ_SKIP_LEAF))
- goto ascend;
- else {
+ } else {
*refp = ref;
goto done;
}
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index 92cfd1e4273..f2868afe13a 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -439,7 +439,7 @@ next: switch (direction) {
(void)__wt_row_leaf_key_info(
page, copy, &ikey, &cell, NULL, NULL);
if (ikey == NULL) {
- WT_ERR(__wt_row_ikey(session,
+ WT_ERR(__wt_row_ikey_alloc(session,
WT_PAGE_DISK_OFFSET(page, cell),
keyb->data, keyb->size, &ikey));
@@ -462,15 +462,37 @@ err: __wt_scr_free(session, &tmp);
}
/*
+ * __wt_row_ikey_alloc --
+ * Instantiate a key in a WT_IKEY structure.
+ */
+int
+__wt_row_ikey_alloc(WT_SESSION_IMPL *session,
+ uint32_t cell_offset, const void *key, size_t size, WT_IKEY **ikeyp)
+{
+ WT_IKEY *ikey;
+
+ /*
+ * Allocate memory for the WT_IKEY structure and the key, then copy
+ * the key into place.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey));
+ ikey->size = WT_STORE_SIZE(size);
+ ikey->cell_offset = cell_offset;
+ memcpy(WT_IKEY_DATA(ikey), key, size);
+ *ikeyp = ikey;
+ return (0);
+}
+
+/*
* __wt_row_ikey_incr --
* Instantiate a key in a WT_IKEY structure and increment the page's
* memory footprint.
*/
int
__wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page,
- uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
+ uint32_t cell_offset, const void *key, size_t size, WT_REF *ref)
{
- WT_RET(__wt_row_ikey(session, cell_offset, key, size, ikeyp));
+ WT_RET(__wt_row_ikey(session, cell_offset, key, size, ref));
__wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size);
@@ -483,19 +505,30 @@ __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page,
*/
int
__wt_row_ikey(WT_SESSION_IMPL *session,
- uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
+ uint32_t cell_offset, const void *key, size_t size, WT_REF *ref)
{
WT_IKEY *ikey;
+ WT_RET(__wt_row_ikey_alloc(session, cell_offset, key, size, &ikey));
+
+#ifdef HAVE_DIAGNOSTIC
+ {
+ uintptr_t oldv;
+
+ oldv = (uintptr_t)ref->key.ikey;
+ WT_DIAGNOSTIC_YIELD;
+
/*
- * Allocate memory for the WT_IKEY structure and the key, then copy
- * the key into place.
+ * We should never overwrite an instantiated key, and we should
+ * never instantiate a key after a split.
*/
- WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey));
- ikey->size = WT_STORE_SIZE(size);
- ikey->cell_offset = cell_offset;
- memcpy(WT_IKEY_DATA(ikey), key, size);
-
- *(WT_IKEY **)ikeyp = ikey;
+ WT_ASSERT(session, oldv == 0 || (oldv & WT_IK_FLAG) != 0);
+ WT_ASSERT(session, ref->state != WT_REF_SPLIT);
+ WT_ASSERT(session,
+ WT_ATOMIC_CAS8(ref->key.ikey, (WT_IKEY *)oldv, ikey));
+ }
+#else
+ ref->key.ikey = ikey;
+#endif
return (0);
}
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index 0562f9cfc34..6b9824fc415 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -762,8 +762,7 @@ __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config)
WT_ERR(__conn_statistics_config(session, config_cfg));
WT_ERR(__wt_async_reconfig(session, config_cfg));
- WT_ERR(__wt_cache_config(session, config_cfg));
- WT_ERR(__wt_cache_pool_config(session, config_cfg));
+ WT_ERR(__wt_cache_config(session, 1, config_cfg));
WT_ERR(__wt_checkpoint_server_create(session, config_cfg));
WT_ERR(__wt_lsm_manager_reconfig(session, config_cfg));
WT_ERR(__wt_statlog_create(session, config_cfg));
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index c513d46137c..4a7e15044de 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -9,33 +9,28 @@
#include "wt_internal.h"
/*
- * __wt_cache_config --
+ * __cache_config_local --
* Configure the underlying cache.
*/
-int
-__wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[])
+static int
+__cache_config_local(WT_SESSION_IMPL *session, int shared, const char *cfg[])
{
WT_CACHE *cache;
WT_CONFIG_ITEM cval;
WT_CONNECTION_IMPL *conn;
+ uint32_t evict_workers_max, evict_workers_min;
conn = S2C(session);
cache = conn->cache;
/*
* If not using a shared cache configure the cache size, otherwise
- * check for a reserved size.
+ * check for a reserved size. All other settings are independent of
+ * whether we are using a shared cache or not.
*/
- if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) {
+ if (!shared) {
WT_RET(__wt_config_gets(session, cfg, "cache_size", &cval));
conn->cache_size = (uint64_t)cval.val;
- } else {
- WT_RET(__wt_config_gets(
- session, cfg, "shared_cache.reserve", &cval));
- if (cval.val == 0)
- WT_RET(__wt_config_gets(
- session, cfg, "shared_cache.chunk", &cval));
- cache->cp_reserved = (uint64_t)cval.val;
}
WT_RET(__wt_config_gets(session, cfg, "cache_overhead", &cval));
@@ -57,16 +52,64 @@ __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[])
*/
WT_RET(__wt_config_gets(session, cfg, "eviction.threads_max", &cval));
WT_ASSERT(session, cval.val > 0);
- conn->evict_workers_max = (u_int)cval.val - 1;
+ evict_workers_max = (uint32_t)cval.val - 1;
WT_RET(__wt_config_gets(session, cfg, "eviction.threads_min", &cval));
WT_ASSERT(session, cval.val > 0);
- conn->evict_workers_min = (u_int)cval.val - 1;
+ evict_workers_min = (uint32_t)cval.val - 1;
- if (conn->evict_workers_min > conn->evict_workers_max)
+ if (evict_workers_min > evict_workers_max)
WT_RET_MSG(session, EINVAL,
"eviction=(threads_min) cannot be greater than "
"eviction=(threads_max)");
+ conn->evict_workers_max = evict_workers_max;
+ conn->evict_workers_min = evict_workers_min;
+
+ return (0);
+}
+
+/*
+ * __wt_cache_config --
+ * Configure or reconfigure the current cache and shared cache.
+ *
+ * The cache structure must already exist (conn->cache is asserted to be
+ * non-NULL). Pass a non-zero "reconfigure" when changing an already
+ * configured connection: only then are the transitions into and out of
+ * a shared cache handled. The new state is "shared" when cfg carries a
+ * non-empty "shared_cache.name"; the old state is read from the
+ * WT_CONN_CACHE_POOL connection flag.
+ *
+ * Returns 0 on success, otherwise the error handed back through WT_RET
+ * by the first failing call.
+ */
+int
+__wt_cache_config(WT_SESSION_IMPL *session, int reconfigure, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ int now_shared, was_shared;
+
+ conn = S2C(session);
+
+ WT_ASSERT(session, conn->cache != NULL);
+
+ WT_RET(__wt_config_gets_none(session, cfg, "shared_cache.name", &cval));
+ now_shared = cval.len != 0;
+ was_shared = F_ISSET(conn, WT_CONN_CACHE_POOL);
+
+ /*
+ * Cleanup if reconfiguring: deal with a switch between a private
+ * cache and a shared cache before the new settings are applied.
+ * Nothing is done here when the shared/private state is unchanged.
+ */
+ if (reconfigure && was_shared && !now_shared)
+ /* Leaving the shared cache: remove ourselves from the pool. */
+ WT_RET(__wt_conn_cache_pool_destroy(session));
+ else if (reconfigure && !was_shared && now_shared)
+ /*
+ * Joining the shared cache: the cache size will now be managed
+ * by the cache pool - the start size always needs to be zero
+ * to allow the pool to manage how much memory is in-use.
+ */
+ conn->cache_size = 0;
+
+ /*
+ * Always setup the local cache - it's used even if we are
+ * participating in a shared cache. When a shared cache is
+ * configured the pool configuration is applied afterwards (which is
+ * expected to leave WT_CONN_CACHE_POOL set), and the connection is
+ * opened in the pool only if it was not already a member.
+ */
+ WT_RET(__cache_config_local(session, now_shared, cfg));
+ if (now_shared) {
+ WT_RET(__wt_cache_pool_config(session, cfg));
+ WT_ASSERT(session, F_ISSET(conn, WT_CONN_CACHE_POOL));
+ if (!was_shared)
+ WT_RET(__wt_conn_cache_pool_open(session));
+ }
return (0);
}
@@ -84,19 +127,14 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
- WT_ASSERT(session, conn->cache == NULL ||
- (F_ISSET(conn, WT_CONN_CACHE_POOL) && conn->cache != NULL));
+ WT_ASSERT(session, conn->cache == NULL);
WT_RET(__wt_calloc_one(session, &conn->cache));
cache = conn->cache;
/* Use a common routine for run-time configuration options. */
- WT_RET(__wt_cache_config(session, cfg));
-
- /* Add the configured cache to the cache pool. */
- if (F_ISSET(conn, WT_CONN_CACHE_POOL))
- WT_RET(__wt_conn_cache_pool_open(session));
+ WT_RET(__wt_cache_config(session, 0, cfg));
/*
* The target size must be lower than the trigger size or we will never
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index f5b78e33b04..7bf090496a8 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -36,17 +36,17 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
WT_CONNECTION_IMPL *conn, *entry;
WT_DECL_RET;
char *pool_name;
- int created, reconfiguring;
+ int created, updating;
uint64_t chunk, reserve, size, used_cache;
conn = S2C(session);
- created = reconfiguring = 0;
+ created = updating = 0;
pool_name = NULL;
cp = NULL;
size = 0;
if (F_ISSET(conn, WT_CONN_CACHE_POOL))
- reconfiguring = 1;
+ updating = 1;
else {
WT_RET(__wt_config_gets_none(
session, cfg, "shared_cache.name", &cval));
@@ -81,7 +81,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
__wt_spin_lock(session, &__wt_process.spinlock);
if (__wt_process.cache_pool == NULL) {
- WT_ASSERT(session, !reconfiguring);
+ WT_ASSERT(session, !updating);
/* Create a cache pool. */
WT_ERR(__wt_calloc_one(session, &cp));
created = 1;
@@ -96,7 +96,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
__wt_process.cache_pool = cp;
WT_ERR(__wt_verbose(session,
WT_VERB_SHARED_CACHE, "Created cache pool %s", cp->name));
- } else if (!reconfiguring && !WT_STRING_MATCH(
+ } else if (!updating && !WT_STRING_MATCH(
__wt_process.cache_pool->name, pool_name, strlen(pool_name)))
/* Only a single cache pool is supported. */
WT_ERR_MSG(session, WT_ERROR,
@@ -109,7 +109,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
* The cache pool requires a reference count to avoid a race between
* configuration/open and destroy.
*/
- if (!reconfiguring)
+ if (!updating)
++cp->refs;
/*
@@ -157,7 +157,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
if (__wt_config_gets(session, &cfg[1],
"shared_cache.reserve", &cval) == 0 && cval.val != 0)
reserve = (uint64_t)cval.val;
- else if (reconfiguring)
+ else if (updating)
reserve = conn->cache->cp_reserved;
else
reserve = chunk;
@@ -171,18 +171,23 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq)
used_cache += entry->cache->cp_reserved;
}
+ /* Ignore our old allocation if reconfiguring */
+ if (updating)
+ used_cache -= conn->cache->cp_reserved;
if (used_cache + reserve > size)
WT_ERR_MSG(session, EINVAL,
"Shared cache unable to accommodate this configuration. "
- "Shared cache size: %" PRIu64 ", reserved: %" PRIu64,
+ "Shared cache size: %" PRIu64 ", requested min: %" PRIu64,
size, used_cache + reserve);
/* The configuration is verified - it's safe to update the pool. */
cp->size = size;
cp->chunk = chunk;
+ conn->cache->cp_reserved = reserve;
+
/* Wake up the cache pool server so any changes are noticed. */
- if (reconfiguring)
+ if (updating)
WT_ERR(__wt_cond_signal(
session, __wt_process.cache_pool->cache_pool_cond));
@@ -192,7 +197,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
F_SET(conn, WT_CONN_CACHE_POOL);
err: __wt_spin_unlock(session, &__wt_process.spinlock);
- if (!reconfiguring)
+ if (!updating)
__wt_free(session, pool_name);
if (ret != 0 && created) {
__wt_free(session, cp->name);
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index a5512352f2c..7756158594c 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -375,6 +375,8 @@ __conn_btree_open(
F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) &&
!LF_ISSET(WT_DHANDLE_LOCK_ONLY));
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING));
+
/*
* If the handle is already open, it has to be closed so it can be
* reopened with a new configuration. We don't need to check again:
@@ -539,6 +541,48 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session,
}
/*
+ * __wt_conn_btree_apply_single_ckpt --
+ * Decode any checkpoint information from the configuration string then
+ * call btree apply single.
+ *
+ * The "checkpoint" value is looked up in cfg; when it is empty the
+ * checkpoint name passed on is NULL. The uri, func and cfg arguments
+ * are forwarded unchanged to __wt_conn_btree_apply_single, and the
+ * value that call returns is the value returned here. Any checkpoint
+ * name obtained along the way is released with __wt_free after the
+ * apply call, whether or not it succeeded.
+ */
+int
+__wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session,
+ const char *uri,
+ int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ const char *checkpoint;
+
+ checkpoint = NULL;
+
+ /*
+ * This function exists to handle checkpoint configuration. Callers
+ * that never open a checkpoint call the underlying function directly.
+ * A "not found" result from the lookup is tolerated rather than
+ * treated as an error.
+ */
+ WT_RET_NOTFOUND_OK(
+ __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0) {
+ /*
+ * The internal checkpoint name is special, find the last
+ * unnamed checkpoint of the object. Any other name is simply
+ * copied out of the configuration string.
+ */
+ if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
+ WT_RET(__wt_meta_checkpoint_last_name(
+ session, uri, &checkpoint));
+ } else
+ WT_RET(__wt_strndup(
+ session, cval.str, cval.len, &checkpoint));
+ }
+
+ ret = __wt_conn_btree_apply_single(session, uri, checkpoint, func, cfg);
+
+ __wt_free(session, checkpoint);
+
+ return (ret);
+}
+
+/*
* __wt_conn_btree_apply_single --
* Apply a function to a single btree handle that couldn't be locked
* (attempting to get the handle returned EBUSY).
@@ -580,10 +624,10 @@ __wt_conn_btree_apply_single(WT_SESSION_IMPL *session,
ret = func(session, cfg));
}
__wt_spin_unlock(session, &dhandle->close_lock);
- WT_ERR(ret);
+ WT_RET(ret);
}
-err: return (ret);
+ return (0);
}
/*
@@ -683,20 +727,25 @@ __wt_conn_dhandle_discard_single(WT_SESSION_IMPL *session, int final)
{
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
+ int tret;
dhandle = session->dhandle;
if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
- ret = __wt_conn_btree_sync_and_close(session, 0);
- if (!final)
- WT_RET(ret);
+ tret = __wt_conn_btree_sync_and_close(session, 0);
+ if (final && tret != 0) {
+ __wt_err(session, tret,
+ "Final close of %s failed", dhandle->name);
+ WT_TRET(tret);
+ } else if (!final)
+ WT_RET(tret);
}
/*
* Kludge: interrupt the eviction server in case it is holding the
* handle list lock.
*/
- F_SET(S2C(session)->cache, WT_EVICT_CLEAR_WALKS);
+ F_SET(S2C(session)->cache, WT_CACHE_CLEAR_WALKS);
/* Try to remove the handle, protected by the data handle lock. */
WT_WITH_DHANDLE_LOCK(session,
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 36d4d539d92..315e93c1875 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -347,6 +347,124 @@ err: __wt_err(session, ret, "log close server error");
}
/*
+ * Simple structure for sorting written slots.
+ *
+ * The log wrlsn server fills one entry per slot it finds in the
+ * WT_LOG_SLOT_WRITTEN state and sorts the entries by LSN, so the slot's
+ * position in the pool has to travel with its LSN.
+ */
+typedef struct {
+ WT_LSN lsn; /* Copy of the slot's release LSN */
+ uint32_t slot_index; /* Index of the slot in the slot pool */
+} WT_LOG_WRLSN_ENTRY;
+
+/*
+ * __log_wrlsn_cmp --
+ * The log wrlsn comparison function for qsort.
+ *
+ * Both arguments point at WT_LOG_WRLSN_ENTRY elements. Only the lsn
+ * field takes part in the comparison; the result is whatever LOG_CMP
+ * yields for the two LSNs (LOG_CMP is defined elsewhere, presumably
+ * giving the negative/zero/positive ordering qsort expects).
+ */
+static int
+__log_wrlsn_cmp(const void *a, const void *b)
+{
+ WT_LOG_WRLSN_ENTRY *ae, *be;
+
+ ae = (WT_LOG_WRLSN_ENTRY *)a;
+ be = (WT_LOG_WRLSN_ENTRY *)b;
+ return (LOG_CMP(&ae->lsn, &be->lsn));
+}
+
+/*
+ * __log_wrlsn_server --
+ * The log wrlsn server thread.
+ *
+ * The argument is the thread's WT_SESSION_IMPL. While the connection
+ * has WT_CONN_LOG_SERVER_RUN set, each pass scans the slot pool for
+ * slots in the WT_LOG_SLOT_WRITTEN state, sorts them by release LSN
+ * and, for every slot whose release LSN equals the log's current
+ * write_lsn, advances write_lsn to the slot's end LSN, signals
+ * log_write_cond and frees the slot.
+ *
+ * Always returns NULL. A failing call jumps to the err label, which
+ * reports the error with __wt_err and ends the thread.
+ */
+static void *
+__log_wrlsn_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_WRLSN_ENTRY written[SLOT_POOL];
+ WT_LOGSLOT *slot;
+ WT_SESSION_IMPL *session;
+ size_t written_i;
+ uint32_t i, save_i;
+ int yield;
+
+ session = arg;
+ conn = S2C(session);
+ log = conn->log;
+ yield = 0;
+ while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
+ /*
+ * No need to use the log_slot_lock because the slot pool
+ * is statically allocated and any slot in the
+ * WT_LOG_SLOT_WRITTEN state is exclusively ours for now.
+ */
+ i = 0;
+ written_i = 0;
+ /*
+ * Walk the array once saving any slots that are in the
+ * WT_LOG_SLOT_WRITTEN state. For each one we record its index
+ * in the pool and a copy of its release LSN; the written array
+ * has SLOT_POOL entries, so it can hold every slot.
+ */
+ while (i < SLOT_POOL) {
+ save_i = i;
+ slot = &log->slot_pool[i++];
+ if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
+ continue;
+ written[written_i].slot_index = save_i;
+ written[written_i++].lsn = slot->slot_release_lsn;
+ }
+ /*
+ * If we found any written slots process them. We sort them
+ * based on the release LSN, and then look for them in order.
+ * Finding written slots also resets the yield counter.
+ */
+ if (written_i > 0) {
+ yield = 0;
+ qsort(written, written_i, sizeof(WT_LOG_WRLSN_ENTRY),
+ __log_wrlsn_cmp);
+ /*
+ * We know the written array is sorted by LSN. Go
+ * through them either advancing write_lsn or stop
+ * as soon as one is not in order, that is, as soon as
+ * an entry's release LSN differs from the current
+ * write_lsn. Entries left unprocessed are picked up
+ * again by a later pass over the pool.
+ */
+ for (i = 0; i < written_i; i++) {
+ if (LOG_CMP(&log->write_lsn,
+ &written[i].lsn) != 0)
+ break;
+ /*
+ * If we get here we have a slot to process.
+ * Advance the LSN and process the slot. The
+ * assertion checks the slot's release LSN is
+ * still the one we copied during the scan.
+ */
+ slot = &log->slot_pool[written[i].slot_index];
+ WT_ASSERT(session, LOG_CMP(&written[i].lsn,
+ &slot->slot_release_lsn) == 0);
+ log->write_lsn = slot->slot_end_lsn;
+ WT_ERR(__wt_cond_signal(session,
+ log->log_write_cond));
+ WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
+
+ /*
+ * Signal the close thread if needed: the slot
+ * carries the SLOT_CLOSEFH flag in that case.
+ */
+ if (F_ISSET(slot, SLOT_CLOSEFH))
+ WT_ERR(__wt_cond_signal(session,
+ conn->log_close_cond));
+ WT_ERR(__wt_log_slot_free(session, slot));
+ }
+ }
+ /*
+ * If we saw a later write, we always want to yield because
+ * we know something is in progress. The yield counter is reset
+ * whenever a pass finds written slots, so we keep yielding for
+ * up to 1000 passes after the last such pass and only then
+ * fall back to a timed wait on the condition variable.
+ */
+ if (yield++ < 1000)
+ __wt_yield();
+ else
+ /* Wait until the next event (or until the wait times out). */
+ WT_ERR(__wt_cond_wait(session,
+ conn->log_wrlsn_cond, 100000));
+ }
+
+ if (0)
+err: __wt_err(session, ret, "log wrlsn server error");
+ return (NULL);
+}
+
+/*
* __log_server --
* The log server thread.
*/
@@ -479,12 +597,24 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
"log close server", 0, &conn->log_close_cond));
/*
- * Start the thread.
+ * Start the log file close thread.
*/
WT_RET(__wt_thread_create(conn->log_close_session,
&conn->log_close_tid, __log_close_server, conn->log_close_session));
conn->log_close_tid_set = 1;
+ /*
+ * Start the log write LSN thread. It is not configurable.
+ * If logging is enabled, this thread runs.
+ */
+ WT_RET(__wt_open_internal_session(
+ conn, "log-wrlsn-server", 0, 0, &conn->log_wrlsn_session));
+ WT_RET(__wt_cond_alloc(conn->log_wrlsn_session,
+ "log write lsn server", 0, &conn->log_wrlsn_cond));
+ WT_RET(__wt_thread_create(conn->log_wrlsn_session,
+ &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session));
+ conn->log_wrlsn_tid_set = 1;
+
/* If no log thread services are configured, we're done. */
if (!FLD_ISSET(conn->log_flags,
(WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC)))
@@ -557,6 +687,17 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
WT_TRET(wt_session->close(wt_session, NULL));
conn->log_close_session = NULL;
}
+ if (conn->log_wrlsn_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond));
+ WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
+ conn->log_wrlsn_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
+ if (conn->log_wrlsn_session != NULL) {
+ wt_session = &conn->log_wrlsn_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ conn->log_wrlsn_session = NULL;
+ }
WT_TRET(__wt_log_close(session));
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index d4f6cf4869c..0a3d35ac0b1 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -55,9 +55,6 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
*/
WT_WRITE_BARRIER();
- /* Connect to a cache pool. */
- WT_RET(__wt_cache_pool_config(session, cfg));
-
/* Create the cache. */
WT_RET(__wt_cache_create(session, cfg));
@@ -113,6 +110,9 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
F_CLR(conn, WT_CONN_SERVER_RUN);
WT_TRET(__wt_async_destroy(session));
WT_TRET(__wt_lsm_manager_destroy(session));
+
+ F_SET(conn, WT_CONN_CLOSING);
+
WT_TRET(__wt_checkpoint_server_destroy(session));
WT_TRET(__wt_statlog_destroy(session, 1));
WT_TRET(__wt_sweep_destroy(session));
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index 67814dc330b..c38e0ef125f 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -205,7 +205,7 @@ __statlog_apply(WT_SESSION_IMPL *session, const char *cfg[])
if (WT_PREFIX_MATCH(dhandle->name, *p)) {
WT_WITHOUT_DHANDLE(session,
ret = __statlog_dump(session, dhandle->name, 0));
- WT_RET(ret);
+ return (ret);
}
return (0);
}
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index abc6a106cc9..bf086bcc813 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -26,7 +26,7 @@ __curindex_get_value(WT_CURSOR *cursor, ...)
WT_CURSOR_NEEDVALUE(cursor);
va_start(ap, cursor);
- if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) {
ret = __wt_schema_project_merge(session,
cindex->cg_cursors, cindex->value_plan,
cursor->value_format, &cursor->value);
diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c
index bebce217a6a..74b998876c2 100644
--- a/src/cursor/cur_stat.c
+++ b/src/cursor/cur_stat.c
@@ -354,7 +354,6 @@ __curstat_file_init(WT_SESSION_IMPL *session,
/* Release the handle, we're done with it. */
WT_TRET(__wt_session_release_btree(session));
- WT_RET(ret);
return (ret);
}
diff --git a/src/docs/spell.ok b/src/docs/spell.ok
index f333a8fff58..df31a272361 100644
--- a/src/docs/spell.ok
+++ b/src/docs/spell.ok
@@ -87,6 +87,7 @@ ack'ed
ajn
alloc
allocator
+allocators
allocsize
ao
api
diff --git a/src/docs/tune-memory-allocator.dox b/src/docs/tune-memory-allocator.dox
index ad052bc4ec3..a619708f816 100644
--- a/src/docs/tune-memory-allocator.dox
+++ b/src/docs/tune-memory-allocator.dox
@@ -10,4 +10,9 @@ Google's tcmalloc</a>, or
<a href="http://www.canonware.com/jemalloc">FreeBSD's jemalloc</a>),
can dramatically improve throughput.
+As different memory allocators have different overhead and different
+workloads will have different heap allocation sizes and patterns,
+applications may need to set their allocator overhead using the
+\c cache_overhead configuration to the ::wiredtiger_open call.
+
*/
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index 9e39fcc7a2c..1030c0aa818 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -72,6 +72,17 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
WT_READ_CACHE | WT_READ_NO_EVICT));
switch (syncop) {
+ case WT_SYNC_DISCARD:
+ /*
+ * Check that the page is clean: if we see a dirty page
+ * (including a dirty parent page after evicting a
+ * child), give up. The higher level can try to
+ * checkpoint, but during discard we aren't set up to
+ * manage checkpoints.
+ */
+ if (__wt_page_is_modified(page))
+ WT_ERR(EBUSY);
+ /* FALLTHROUGH */
case WT_SYNC_CLOSE:
/*
* Evict the page.
@@ -84,29 +95,6 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
!F_ISSET(page->modify, WT_PM_REC_EMPTY))
WT_ERR(__wt_evict(session, ref, 1));
break;
- case WT_SYNC_DISCARD:
- /*
- * Ordinary discard of the page, whether clean or dirty.
- * If we see a dirty page in an ordinary discard (e.g.,
- * from sweep), give up: an update must have happened
- * since the file was selected for sweeping.
- */
- if (__wt_page_is_modified(page))
- WT_ERR(EBUSY);
-
- /*
- * If the page contains an update that is too recent to
- * evict, stop. This should never happen during
- * connection close, but in other paths our caller
- * should be prepared to deal with this case.
- */
- if (page->modify != NULL &&
- !__wt_txn_visible_all(session,
- page->modify->rec_max_txn))
- WT_ERR(EBUSY);
-
- __wt_evict_page_clean_update(session, ref);
- break;
case WT_SYNC_DISCARD_FORCE:
/*
* Forced discard of the page, whether clean or dirty.
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index e3d8ea6a4e0..640c9b0541d 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -194,6 +194,17 @@ __evict_server(void *arg)
ret = 0;
}
}
+ /*
+ * Clear the walks so we don't pin pages while asleep,
+ * otherwise we can block applications evicting large pages.
+ */
+ if (!F_ISSET(cache, WT_CACHE_STUCK)) {
+ WT_ERR(__evict_clear_walks(session));
+
+ /* Next time we wake up, reverse the sweep direction. */
+ cache->flags ^= WT_CACHE_WALK_REVERSE;
+ }
+
WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"));
/* Don't rely on signals: check periodically. */
WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000));
@@ -237,7 +248,7 @@ __evict_workers_resize(WT_SESSION_IMPL *session)
WT_DECL_RET;
WT_EVICT_WORKER *workers;
size_t alloc;
- u_int i;
+ uint32_t i;
conn = S2C(session);
@@ -321,7 +332,7 @@ __wt_evict_destroy(WT_SESSION_IMPL *session)
WT_DECL_RET;
WT_EVICT_WORKER *workers;
WT_SESSION *wt_session;
- u_int i;
+ uint32_t i;
conn = S2C(session);
cache = conn->cache;
@@ -432,17 +443,17 @@ __evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp)
(cache->eviction_dirty_target * bytes_max) / 100)
/* Ignore clean pages unless the cache is too large */
LF_SET(WT_EVICT_PASS_DIRTY);
- else if (F_ISSET(cache, WT_EVICT_WOULD_BLOCK)) {
+ else if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) {
/*
* Evict pages with oldest generation (which would otherwise
* block application threads) set regardless of whether we have
* reached the eviction trigger.
*/
LF_SET(WT_EVICT_PASS_WOULD_BLOCK);
- F_CLR(cache, WT_EVICT_WOULD_BLOCK);
+ F_CLR(cache, WT_CACHE_WOULD_BLOCK);
}
- if (F_ISSET(cache, WT_EVICT_STUCK))
+ if (F_ISSET(cache, WT_CACHE_STUCK))
LF_SET(WT_EVICT_PASS_AGGRESSIVE);
*flagsp = flags;
@@ -475,8 +486,8 @@ __evict_pass(WT_SESSION_IMPL *session)
* If there is a request to clear eviction walks, do that now,
* before checking if the cache is full.
*/
- if (F_ISSET(cache, WT_EVICT_CLEAR_WALKS)) {
- F_CLR(cache, WT_EVICT_CLEAR_WALKS);
+ if (F_ISSET(cache, WT_CACHE_CLEAR_WALKS)) {
+ F_CLR(cache, WT_CACHE_CLEAR_WALKS);
WT_RET(__evict_clear_walks(session));
WT_RET(__wt_cond_signal(
session, cache->evict_waiter_cond));
@@ -493,7 +504,8 @@ __evict_pass(WT_SESSION_IMPL *session)
* Start a worker if we have capacity and we haven't reached
* the eviction targets.
*/
- if (LF_ISSET(WT_EVICT_PASS_ALL | WT_EVICT_PASS_DIRTY) &&
+ if (LF_ISSET(WT_EVICT_PASS_ALL |
+ WT_EVICT_PASS_DIRTY | WT_EVICT_PASS_WOULD_BLOCK) &&
conn->evict_workers < conn->evict_workers_max) {
WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
"Starting evict worker: %"PRIu32"\n",
@@ -527,10 +539,8 @@ __evict_pass(WT_SESSION_IMPL *session)
* handles.
*/
__wt_sleep(0, 1000 * (long)loop);
- if (F_ISSET(cache, WT_EVICT_STUCK))
- break;
if (loop == 100) {
- F_SET(cache, WT_EVICT_STUCK);
+ F_SET(cache, WT_CACHE_STUCK);
WT_STAT_FAST_CONN_INCR(
session, cache_eviction_slow);
WT_RET(__wt_verbose(
@@ -605,7 +615,7 @@ __evict_tree_walk_clear(WT_SESSION_IMPL *session)
F_SET(session, WT_SESSION_CLEAR_EVICT_WALK);
while (btree->evict_ref != NULL && ret == 0) {
- F_SET(cache, WT_EVICT_CLEAR_WALKS);
+ F_SET(cache, WT_CACHE_CLEAR_WALKS);
ret = __wt_cond_wait(
session, cache->evict_waiter_cond, 100000);
}
@@ -792,21 +802,29 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
WT_ASSERT(session, cache->evict[0].ref != NULL);
- /* Find the bottom 25% of read generations. */
- cutoff = (3 * __evict_read_gen(&cache->evict[0]) +
- __evict_read_gen(&cache->evict[entries - 1])) / 4;
-
- /*
- * Don't take less than 10% or more than 50% of entries, regardless.
- * That said, if there is only one entry, which is normal when
- * populating an empty file, don't exclude it.
- */
- for (candidates = 1 + entries / 10;
- candidates < entries / 2;
- candidates++)
- if (__evict_read_gen(&cache->evict[candidates]) > cutoff)
- break;
- cache->evict_candidates = candidates;
+ if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK))
+ /*
+ * Take all candidates if we only gathered pages with an oldest
+ * read generation set.
+ */
+ cache->evict_candidates = entries;
+ else {
+ /* Find the bottom 25% of read generations. */
+ cutoff = (3 * __evict_read_gen(&cache->evict[0]) +
+ __evict_read_gen(&cache->evict[entries - 1])) / 4;
+ /*
+ * Don't take less than 10% or more than 50% of entries,
+ * regardless. That said, if there is only one entry, which is
+ * normal when populating an empty file, don't exclude it.
+ */
+ for (candidates = 1 + entries / 10;
+ candidates < entries / 2;
+ candidates++)
+ if (__evict_read_gen(
+ &cache->evict[candidates]) > cutoff)
+ break;
+ cache->evict_candidates = candidates;
+ }
/* If we have more than the minimum number of entries, clear them. */
if (cache->evict_entries > WT_EVICT_WALK_BASE) {
@@ -907,7 +925,7 @@ retry: while (slot < max_entries && ret == 0) {
* If another thread is waiting on the eviction server to clear
* the walk point in a tree, give up.
*/
- if (F_ISSET(cache, WT_EVICT_CLEAR_WALKS))
+ if (F_ISSET(cache, WT_CACHE_CLEAR_WALKS))
break;
/*
@@ -917,7 +935,7 @@ retry: while (slot < max_entries && ret == 0) {
if (!dhandle_locked) {
for (spins = 0; (ret = __wt_spin_trylock(
session, &conn->dhandle_lock, &id)) == EBUSY &&
- !F_ISSET(cache, WT_EVICT_CLEAR_WALKS);
+ !F_ISSET(cache, WT_CACHE_CLEAR_WALKS);
spins++) {
if (spins < 1000)
__wt_yield();
@@ -1029,7 +1047,7 @@ retry: while (slot < max_entries && ret == 0) {
* candidates and we aren't finding more. Take care not to skip files
* on subsequent passes.
*/
- if (!F_ISSET(cache, WT_EVICT_CLEAR_WALKS) && ret == 0 &&
+ if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 &&
slot < max_entries && (retries < 2 ||
(!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 &&
(slot == cache->evict_entries || slot > start_slot)))) {
@@ -1096,8 +1114,11 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
cache->evict + cache->evict_slots);
enough = internal_pages = restarts = 0;
- walk_flags =
- WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
+ walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT |
+ WT_READ_NO_GEN | WT_READ_NO_WAIT;
+
+ if (F_ISSET(cache, WT_CACHE_WALK_REVERSE))
+ walk_flags |= WT_READ_PREV;
/*
* Get some more eviction candidate pages.
@@ -1181,7 +1202,7 @@ fast: /* If the page can't be evicted, give up. */
*/
mod = page->modify;
if (!modified && mod != NULL && !LF_ISSET(
- WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_WOULD_BLOCK) &&
+ WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) &&
!__wt_txn_visible_all(session, mod->rec_max_txn))
continue;
@@ -1355,8 +1376,8 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_server)
WT_RET(ret);
cache = S2C(session)->cache;
- if (F_ISSET(cache, WT_EVICT_STUCK))
- F_CLR(cache, WT_EVICT_STUCK);
+ if (F_ISSET(cache, WT_CACHE_STUCK))
+ F_CLR(cache, WT_CACHE_STUCK);
return (ret);
}
@@ -1400,9 +1421,9 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full)
* abort the transaction to give up all hazard pointers before
* trying again.
*/
- if (F_ISSET(cache, WT_EVICT_STUCK) &&
+ if (F_ISSET(cache, WT_CACHE_STUCK) &&
__wt_txn_am_oldest(session)) {
- F_CLR(cache, WT_EVICT_STUCK);
+ F_CLR(cache, WT_CACHE_STUCK);
WT_STAT_FAST_CONN_INCR(session, txn_fail_cache);
return (WT_ROLLBACK);
}
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 9ba1af897a4..892d5b4ac60 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -130,8 +130,8 @@ done: session->excl_next = 0;
txn_state->snap_min = WT_TXN_NONE;
if ((inmem_split || (forced_eviction && ret == EBUSY)) &&
- !F_ISSET(conn->cache, WT_EVICT_WOULD_BLOCK)) {
- F_SET(conn->cache, WT_EVICT_WOULD_BLOCK);
+ !F_ISSET(conn->cache, WT_CACHE_WOULD_BLOCK)) {
+ F_SET(conn->cache, WT_CACHE_WOULD_BLOCK);
WT_TRET(__wt_evict_server_wake(session));
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 91d0d1eb654..101fd450fc7 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -442,8 +442,6 @@ struct __wt_page {
/* Row-store leaf page. */
struct {
- WT_ROW *d; /* Key/value pairs */
-
/*
* The column-store leaf page modification structures
* live in the WT_PAGE_MODIFY structure to keep the
@@ -457,6 +455,7 @@ struct __wt_page {
WT_INSERT_HEAD **ins; /* Inserts */
WT_UPDATE **upd; /* Updates */
+ WT_ROW *d; /* Key/value pairs */
uint32_t entries; /* Entries */
} row;
#undef pg_row_d
@@ -510,11 +509,31 @@ struct __wt_page {
#define pg_var_entries u.col_var.entries
} u;
- /* Page's on-disk representation: NULL for pages created in memory. */
- const WT_PAGE_HEADER *dsk;
+ /*
+ * The page's type and flags are positioned at the end of the WT_PAGE
+ * union, it reduces cache misses in the row-store search function.
+ */
+#define WT_PAGE_IS_INTERNAL(page) \
+ ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT)
+#define WT_PAGE_INVALID 0 /* Invalid page */
+#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */
+#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */
+#define WT_PAGE_COL_INT 3 /* Col-store internal page */
+#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */
+#define WT_PAGE_OVFL 5 /* Overflow page */
+#define WT_PAGE_ROW_INT 6 /* Row-store internal page */
+#define WT_PAGE_ROW_LEAF 7 /* Row-store leaf page */
+ uint8_t type; /* Page type */
- /* If/when the page is modified, we need lots more information. */
- WT_PAGE_MODIFY *modify;
+#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */
+#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
+#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
+#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
+#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */
+#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */
+#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
+#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */
+ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
/*
* The page's read generation acts as an LRU value for each page in the
@@ -539,27 +558,11 @@ struct __wt_page {
size_t memory_footprint; /* Memory attached to the page */
-#define WT_PAGE_IS_INTERNAL(page) \
- ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT)
-#define WT_PAGE_INVALID 0 /* Invalid page */
-#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */
-#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */
-#define WT_PAGE_COL_INT 3 /* Col-store internal page */
-#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */
-#define WT_PAGE_OVFL 5 /* Overflow page */
-#define WT_PAGE_ROW_INT 6 /* Row-store internal page */
-#define WT_PAGE_ROW_LEAF 7 /* Row-store leaf page */
- uint8_t type; /* Page type */
+ /* Page's on-disk representation: NULL for pages created in memory. */
+ const WT_PAGE_HEADER *dsk;
-#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */
-#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
-#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
-#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
-#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */
-#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */
-#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
-#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */
- uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
+ /* If/when the page is modified, we need lots more information. */
+ WT_PAGE_MODIFY *modify;
};
/*
diff --git a/src/include/btree.i b/src/include/btree.i
index 032178b4755..56fb66abaef 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -404,7 +404,7 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* __wt_page_parent_modify_set --
- * Mark the parent page and tree dirty.
+ * Mark the parent page, and optionally the tree, dirty.
*/
static inline int
__wt_page_parent_modify_set(
@@ -957,6 +957,10 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits)
if (mod == NULL)
return (1);
+ /* Skip pages that are already being evicted. */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+ return (0);
+
/*
* If the tree was deepened, there's a requirement that newly created
* internal pages not be evicted until all threads are known to have
diff --git a/src/include/cache.h b/src/include/cache.h
index 84b18082a25..8ed3176492f 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -118,9 +118,10 @@ struct __wt_cache {
*/
#define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */
#define WT_CACHE_POOL_RUN 0x02 /* Cache pool thread running */
-#define WT_EVICT_CLEAR_WALKS 0x04 /* Clear eviction walks */
-#define WT_EVICT_STUCK 0x08 /* Eviction server is stuck */
-#define WT_EVICT_WOULD_BLOCK 0x10 /* Pages that would block apps */
+#define WT_CACHE_CLEAR_WALKS 0x04 /* Clear eviction walks */
+#define WT_CACHE_STUCK 0x08 /* Eviction server is stuck */
+#define WT_CACHE_WALK_REVERSE 0x10 /* Scan backwards for candidates */
+#define WT_CACHE_WOULD_BLOCK 0x20 /* Pages that would block apps */
uint32_t flags;
};
diff --git a/src/include/connection.h b/src/include/connection.h
index 9cb42ae7c80..78b2949ab98 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -320,6 +320,10 @@ struct __wt_connection_impl {
WT_SESSION_IMPL *log_close_session;/* Log close thread session */
wt_thread_t log_close_tid; /* Log close thread thread */
int log_close_tid_set;/* Log close thread set */
+ WT_CONDVAR *log_wrlsn_cond;/* Log write lsn thread wait mutex */
+ WT_SESSION_IMPL *log_wrlsn_session;/* Log write lsn thread session */
+ wt_thread_t log_wrlsn_tid; /* Log write lsn thread thread */
+ int log_wrlsn_tid_set;/* Log write lsn thread set */
WT_LOG *log; /* Logging structure */
WT_COMPRESSOR *log_compressor;/* Logging compressor */
wt_off_t log_file_max; /* Log file max size */
diff --git a/src/include/error.h b/src/include/error.h
index b732776badf..efc1617fcd3 100644
--- a/src/include/error.h
+++ b/src/include/error.h
@@ -11,11 +11,11 @@
/* In DIAGNOSTIC mode, yield in places where we want to encourage races. */
#ifdef HAVE_DIAGNOSTIC
-#define WT_HAVE_DIAGNOSTIC_YIELD do { \
+#define WT_DIAGNOSTIC_YIELD do { \
__wt_yield(); \
} while (0)
#else
-#define WT_HAVE_DIAGNOSTIC_YIELD
+#define WT_DIAGNOSTIC_YIELD
#endif
/* Set "ret" and branch-to-err-label tests. */
diff --git a/src/include/extern.h b/src/include/extern.h
index 5d3ee5bc8f8..bddbb5e01eb 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -163,8 +163,9 @@ extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *lea
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key);
extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate);
-extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, void *ikeyp);
-extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, void *ikeyp);
+extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_IKEY **ikeyp);
+extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref);
+extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref);
extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep);
@@ -206,7 +207,7 @@ extern int __wt_conn_remove_data_source(WT_SESSION_IMPL *session);
extern int __wt_extractor_config(WT_SESSION_IMPL *session, const char *config, WT_EXTRACTOR **extractorp, int *ownp);
extern int __wt_conn_remove_extractor(WT_SESSION_IMPL *session);
extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]);
-extern int __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_cache_config(WT_SESSION_IMPL *session, int reconfigure, const char *cfg[]);
extern int __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]);
extern void __wt_cache_stats_update(WT_SESSION_IMPL *session);
extern int __wt_cache_destroy(WT_SESSION_IMPL *session);
@@ -221,6 +222,7 @@ extern int __wt_conn_dhandle_find(WT_SESSION_IMPL *session, const char *name, co
extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force);
extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, const char *cfg[], uint32_t flags);
extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, int apply_checkpoints, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
+extern int __wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *name, int force);
extern int __wt_conn_dhandle_discard_single(WT_SESSION_IMPL *session, int final);
@@ -349,7 +351,7 @@ extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size);
-extern int __wt_log_slot_free(WT_LOGSLOT *slot);
+extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize);
extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks);
extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
diff --git a/src/include/flags.h b/src/include/flags.h
index 9664fce3f9f..30b2ab1c0e3 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -4,18 +4,19 @@
*/
#define WT_CONN_CACHE_POOL 0x00000001
#define WT_CONN_CKPT_SYNC 0x00000002
-#define WT_CONN_EVICTION_RUN 0x00000004
-#define WT_CONN_LEAK_MEMORY 0x00000008
-#define WT_CONN_LOG_SERVER_RUN 0x00000010
-#define WT_CONN_LSM_MERGE 0x00000020
-#define WT_CONN_PANIC 0x00000040
-#define WT_CONN_SERVER_ASYNC 0x00000080
-#define WT_CONN_SERVER_CHECKPOINT 0x00000100
-#define WT_CONN_SERVER_LSM 0x00000200
-#define WT_CONN_SERVER_RUN 0x00000400
-#define WT_CONN_SERVER_STATISTICS 0x00000800
-#define WT_CONN_SERVER_SWEEP 0x00001000
-#define WT_CONN_WAS_BACKUP 0x00002000
+#define WT_CONN_CLOSING 0x00000004
+#define WT_CONN_EVICTION_RUN 0x00000008
+#define WT_CONN_LEAK_MEMORY 0x00000010
+#define WT_CONN_LOG_SERVER_RUN 0x00000020
+#define WT_CONN_LSM_MERGE 0x00000040
+#define WT_CONN_PANIC 0x00000080
+#define WT_CONN_SERVER_ASYNC 0x00000100
+#define WT_CONN_SERVER_CHECKPOINT 0x00000200
+#define WT_CONN_SERVER_LSM 0x00000400
+#define WT_CONN_SERVER_RUN 0x00000800
+#define WT_CONN_SERVER_STATISTICS 0x00001000
+#define WT_CONN_SERVER_SWEEP 0x00002000
+#define WT_CONN_WAS_BACKUP 0x00004000
#define WT_EVICTING 0x00000001
#define WT_FILE_TYPE_CHECKPOINT 0x00000001
#define WT_FILE_TYPE_DATA 0x00000002
@@ -36,9 +37,8 @@
#define WT_READ_NO_WAIT 0x00000010
#define WT_READ_PREV 0x00000020
#define WT_READ_SKIP_INTL 0x00000040
-#define WT_READ_SKIP_LEAF 0x00000080
-#define WT_READ_TRUNCATE 0x00000100
-#define WT_READ_WONT_NEED 0x00000200
+#define WT_READ_TRUNCATE 0x00000080
+#define WT_READ_WONT_NEED 0x00000100
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
#define WT_SESSION_DISCARD_FORCE 0x00000004
diff --git a/src/include/log.h b/src/include/log.h
index 82d90070609..760321d9abb 100644
--- a/src/include/log.h
+++ b/src/include/log.h
@@ -59,17 +59,21 @@
/*
* Possible values for the consolidation array slot states:
+ * (NOTE: Any new states must be > WT_LOG_SLOT_DONE and < WT_LOG_SLOT_READY.)
+ *
* < WT_LOG_SLOT_DONE - threads are actively writing to the log.
* WT_LOG_SLOT_DONE - all activity on this slot is complete.
* WT_LOG_SLOT_FREE - slot is available for allocation.
* WT_LOG_SLOT_PENDING - slot is transitioning from ready to active.
+ * WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker.
* WT_LOG_SLOT_READY - slot is ready for threads to join.
* > WT_LOG_SLOT_READY - threads are actively consolidating on this slot.
*/
#define WT_LOG_SLOT_DONE 0
#define WT_LOG_SLOT_FREE 1
#define WT_LOG_SLOT_PENDING 2
-#define WT_LOG_SLOT_READY 3
+#define WT_LOG_SLOT_WRITTEN 3
+#define WT_LOG_SLOT_READY 4
typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct {
int64_t slot_state; /* Slot state */
uint64_t slot_group_size; /* Group size */
@@ -92,9 +96,11 @@ typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct {
uint32_t flags; /* Flags */
} WT_LOGSLOT;
+#define SLOT_INIT_FLAGS (SLOT_BUFFERED)
+
typedef struct {
WT_LOGSLOT *slot;
- wt_off_t offset;
+ wt_off_t offset;
} WT_MYSLOT;
/* Offset of first record */
diff --git a/src/include/stat.h b/src/include/stat.h
index 3f684478358..21eaff0677f 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -215,6 +215,7 @@ struct __wt_connection_stats {
WT_STATS log_prealloc_max;
WT_STATS log_prealloc_used;
WT_STATS log_reads;
+ WT_STATS log_release_write_lsn;
WT_STATS log_scan_records;
WT_STATS log_scan_rereads;
WT_STATS log_scans;
@@ -227,6 +228,8 @@ struct __wt_connection_stats {
WT_STATS log_slot_toosmall;
WT_STATS log_slot_transitions;
WT_STATS log_sync;
+ WT_STATS log_sync_dir;
+ WT_STATS log_write_lsn;
WT_STATS log_writes;
WT_STATS lsm_checkpoint_throttle;
WT_STATS lsm_merge_throttle;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index d0d0f9eec77..fed6042c67a 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -561,7 +561,6 @@ struct __wt_cursor {
* user on open.
*/
const char *internal_uri;
- /* Saved modification methods. */
#define WT_CURSTD_APPEND 0x0001
#define WT_CURSTD_BULK 0x0002
@@ -3336,110 +3335,116 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_LOG_PREALLOC_USED 1081
/*! log: log read operations */
#define WT_STAT_CONN_LOG_READS 1082
+/*! log: log release advances write LSN */
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1083
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1083
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1084
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1084
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1085
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1085
+#define WT_STAT_CONN_LOG_SCANS 1086
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1086
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1087
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1087
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1088
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1088
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1089
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1089
+#define WT_STAT_CONN_LOG_SLOT_RACES 1090
/*! log: slots selected for switching that were unavailable */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1090
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1091
/*! log: record size exceeded maximum */
-#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1091
+#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1092
/*! log: failed to find a slot large enough for record */
-#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1092
+#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1093
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1093
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1094
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1094
+#define WT_STAT_CONN_LOG_SYNC 1095
+/*! log: log sync_dir operations */
+#define WT_STAT_CONN_LOG_SYNC_DIR 1096
+/*! log: log server thread advances write LSN */
+#define WT_STAT_CONN_LOG_WRITE_LSN 1097
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1095
+#define WT_STAT_CONN_LOG_WRITES 1098
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1096
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1099
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1097
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1100
/*! LSM: rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1098
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1101
/*! LSM: application work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1099
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1102
/*! LSM: merge work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1100
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1103
/*! LSM: tree queue hit maximum */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1101
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1104
/*! LSM: switch work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1102
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1105
/*! LSM: tree maintenance operations scheduled */
-#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1103
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1106
/*! LSM: tree maintenance operations discarded */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1104
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1107
/*! LSM: tree maintenance operations executed */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1105
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1108
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1106
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1109
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1107
+#define WT_STAT_CONN_MEMORY_FREE 1110
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1108
+#define WT_STAT_CONN_MEMORY_GROW 1111
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1109
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1112
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1110
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1113
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1111
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1114
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1112
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1115
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1113
+#define WT_STAT_CONN_PAGE_SLEEP 1116
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1114
+#define WT_STAT_CONN_READ_IO 1117
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1115
+#define WT_STAT_CONN_REC_PAGES 1118
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1116
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1119
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1117
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1120
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1118
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1121
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1119
+#define WT_STAT_CONN_RWLOCK_READ 1122
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1120
+#define WT_STAT_CONN_RWLOCK_WRITE 1123
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1121
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1124
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1122
+#define WT_STAT_CONN_SESSION_OPEN 1125
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1123
+#define WT_STAT_CONN_TXN_BEGIN 1126
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1124
+#define WT_STAT_CONN_TXN_CHECKPOINT 1127
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1125
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1128
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1126
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1129
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1127
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1130
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1128
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1131
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1129
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1132
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1130
+#define WT_STAT_CONN_TXN_COMMIT 1133
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1131
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1134
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1132
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1135
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1133
+#define WT_STAT_CONN_TXN_ROLLBACK 1136
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1134
+#define WT_STAT_CONN_WRITE_IO 1137
/*!
* @}
diff --git a/src/log/log.c b/src/log/log.c
index f76ec402b0d..f485f0a09e5 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -61,16 +61,23 @@ __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec)
WT_RET(__wt_curlog_open(session, "log:", NULL, &c));
c->set_key(c, ckp_lsn->file, ckp_lsn->offset, 0);
- WT_ERR(c->search(c));
-
- /*
- * If the checkpoint LSN we're given is the last record, then recovery
- * is not needed.
- */
- if ((ret = c->next(c)) == WT_NOTFOUND) {
- *rec = 0;
+ if ((ret = c->search(c)) == 0) {
+ /*
+ * If the checkpoint LSN we're given is the last record,
+ * then recovery is not needed.
+ */
+ if ((ret = c->next(c)) == WT_NOTFOUND) {
+ *rec = 0;
+ ret = 0;
+ }
+ } else if (ret == WT_NOTFOUND)
+ /*
+ * If we didn't find that LSN, we need to run recovery,
+ * but not return any error.
+ */
ret = 0;
- }
+ else
+ WT_ERR(ret);
err: WT_TRET(c->close(c));
return (ret);
@@ -455,6 +462,10 @@ __log_file_header(
WT_ERR(__log_acquire(session, logrec->len, &tmp));
}
WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));
+ /*
+ * Make sure the header gets to disk.
+ */
+ WT_ERR(__wt_fsync(session, tmp.slot_fh));
if (end_lsn != NULL)
*end_lsn = tmp.slot_end_lsn;
@@ -573,6 +584,7 @@ __log_truncate(WT_SESSION_IMPL *session,
WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset));
tmp_fh = log_fh;
log_fh = NULL;
+ WT_ERR(__wt_fsync(session, tmp_fh));
WT_ERR(__wt_close(session, tmp_fh));
/*
@@ -596,6 +608,7 @@ __log_truncate(WT_SESSION_IMPL *session,
log_fh, LOG_FIRST_RECORD));
tmp_fh = log_fh;
log_fh = NULL;
+ WT_ERR(__wt_fsync(session, tmp_fh));
WT_ERR(__wt_close(session, tmp_fh));
}
}
@@ -646,6 +659,7 @@ __wt_log_allocfile(
WT_ERR(__log_prealloc(session, log_fh));
tmp_fh = log_fh;
log_fh = NULL;
+ WT_ERR(__wt_fsync(session, tmp_fh));
WT_ERR(__wt_close(session, tmp_fh));
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_prealloc: rename %s to %s",
@@ -790,17 +804,20 @@ __wt_log_close(WT_SESSION_IMPL *session)
if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) {
WT_RET(__wt_verbose(session, WT_VERB_LOG,
"closing old log %s", log->log_close_fh->name));
+ WT_RET(__wt_fsync(session, log->log_close_fh));
WT_RET(__wt_close(session, log->log_close_fh));
}
if (log->log_fh != NULL) {
WT_RET(__wt_verbose(session, WT_VERB_LOG,
"closing log %s", log->log_fh->name));
+ WT_RET(__wt_fsync(session, log->log_fh));
WT_RET(__wt_close(session, log->log_fh));
log->log_fh = NULL;
}
if (log->log_dir_fh != NULL) {
WT_RET(__wt_verbose(session, WT_VERB_LOG,
"closing log directory %s", log->log_dir_fh->name));
+ WT_RET(__wt_directory_sync_fh(session, log->log_dir_fh));
WT_RET(__wt_close(session, log->log_dir_fh));
log->log_dir_fh = NULL;
}
@@ -900,7 +917,7 @@ err:
* Release a log slot.
*/
static int
-__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
@@ -913,6 +930,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
conn = S2C(session);
log = conn->log;
locked = yield_count = 0;
+ *freep = 1;
/* Write the buffered records */
if (F_ISSET(slot, SLOT_BUFFERED)) {
@@ -923,9 +941,29 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
}
/*
- * Wait for earlier groups to finish, otherwise there could be holes
- * in the log file.
+ * If this is not a buffered write, meaning the slot we have is a
+ * dummy constructed slot, not from the slot pool, or we have to wait
+ * for a synchronous operation, we do not pass handling of this slot
+ * off to the worker thread. The caller is responsible for freeing
+ * the slot in that case. Otherwise the worker thread will free it.
+ */
+ if (F_ISSET(slot, SLOT_BUFFERED) &&
+ !F_ISSET(slot, SLOT_SYNC | SLOT_SYNC_DIR)) {
+ *freep = 0;
+ slot->slot_state = WT_LOG_SLOT_WRITTEN;
+ /*
+ * After this point the worker thread owns the slot. There
+ * is nothing more to do but return.
+ */
+ WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond));
+ goto done;
+ }
+
+ /*
+ * Wait for earlier groups to finish, otherwise there could
+ * be holes in the log file.
*/
+ WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn);
while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) {
if (++yield_count < 1000)
__wt_yield();
@@ -936,6 +974,9 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
log->write_lsn = slot->slot_end_lsn;
WT_ERR(__wt_cond_signal(session, log->log_write_cond));
+ /*
+ * Signal the close thread if needed.
+ */
if (F_ISSET(slot, SLOT_CLOSEFH))
WT_ERR(__wt_cond_signal(session, conn->log_close_cond));
@@ -978,7 +1019,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
WT_ERR(__wt_directory_sync_fh(
session, log->log_dir_fh));
log->sync_dir_lsn = sync_lsn;
- F_CLR(slot, SLOT_SYNC_DIR);
+ WT_STAT_FAST_CONN_INCR(session, log_sync_dir);
}
/*
@@ -990,26 +1031,22 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
"log_release: sync log %s", log->log_fh->name));
WT_STAT_FAST_CONN_INCR(session, log_sync);
WT_ERR(__wt_fsync(session, log->log_fh));
- F_CLR(slot, SLOT_SYNC);
log->sync_lsn = sync_lsn;
WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
}
+ /*
+ * Clear the flags before leaving the loop.
+ */
+ F_CLR(slot, SLOT_SYNC | SLOT_SYNC_DIR);
locked = 0;
__wt_spin_unlock(session, &log->log_sync_lock);
break;
}
- if (F_ISSET(slot, SLOT_BUF_GROW)) {
- WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
- F_CLR(slot, SLOT_BUF_GROW);
- WT_STAT_FAST_CONN_INCRV(session,
- log_buffer_size, slot->slot_buf.memsize);
- WT_ERR(__wt_buf_grow(session,
- &slot->slot_buf, slot->slot_buf.memsize * 2));
- }
err: if (locked)
__wt_spin_unlock(session, &log->log_sync_lock);
if (ret != 0 && slot->slot_error == 0)
slot->slot_error = ret;
+done:
return (ret);
}
@@ -1460,12 +1497,13 @@ __log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_LOG *log;
WT_LOGSLOT tmp;
WT_MYSLOT myslot;
- int locked;
+ int dummy, locked;
WT_DECL_SPINLOCK_ID(id); /* Must appear last */
log = S2C(session)->log;
myslot.slot = &tmp;
myslot.offset = 0;
+ dummy = 0;
WT_CLEAR(tmp);
/* Fast path the contended case. */
@@ -1481,7 +1519,7 @@ __log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
__wt_spin_unlock(session, &log->log_slot_lock);
locked = 0;
WT_ERR(__log_fill(session, &myslot, 1, record, lsnp));
- WT_ERR(__log_release(session, &tmp));
+ WT_ERR(__log_release(session, &tmp, &dummy));
err: if (locked)
__wt_spin_unlock(session, &log->log_slot_lock);
@@ -1609,11 +1647,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_LSN lsn;
WT_MYSLOT myslot;
uint32_t rdup_len;
- int locked;
+ int free_slot, locked;
conn = S2C(session);
log = conn->log;
- locked = 0;
+ free_slot = locked = 0;
WT_INIT_LSN(&lsn);
myslot.slot = NULL;
/*
@@ -1695,8 +1733,9 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_ERR(__wt_log_slot_wait(session, myslot.slot));
WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
- WT_ERR(__log_release(session, myslot.slot));
- WT_ERR(__wt_log_slot_free(myslot.slot));
+ WT_ERR(__log_release(session, myslot.slot, &free_slot));
+ if (free_slot)
+ WT_ERR(__wt_log_slot_free(session, myslot.slot));
} else if (LF_ISSET(WT_LOG_FSYNC)) {
/* Wait for our writes to reach disk */
while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index 8dcb2f9f165..02b3056be6f 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -57,7 +57,7 @@ __wt_log_slot_init(WT_SESSION_IMPL *session)
for (i = 0; i < SLOT_POOL; i++) {
WT_ERR(__wt_buf_init(session,
&log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE));
- F_SET(&log->slot_pool[i], SLOT_BUFFERED);
+ F_SET(&log->slot_pool[i], SLOT_INIT_FLAGS);
}
WT_STAT_FAST_CONN_INCRV(session,
log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL);
@@ -295,10 +295,34 @@ __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size)
* Free a slot back into the pool.
*/
int
-__wt_log_slot_free(WT_LOGSLOT *slot)
+__wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
+ WT_DECL_RET;
+
+ ret = 0;
+ /*
+ * Grow the buffer if needed before returning it to the pool.
+ */
+ if (F_ISSET(slot, SLOT_BUF_GROW)) {
+ WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_buffer_size, slot->slot_buf.memsize);
+ WT_ERR(__wt_buf_grow(session,
+ &slot->slot_buf, slot->slot_buf.memsize * 2));
+ }
+err:
+ /*
+ * No matter if there is an error, we always want to free
+ * the slot back to the pool.
+ */
+ /*
+ * Make sure flags don't get retained between uses.
+ * We have to reset them here because multiple threads may
+ * change the flags when joining the slot.
+ */
+ slot->flags = SLOT_INIT_FLAGS;
slot->slot_state = WT_LOG_SLOT_FREE;
- return (0);
+ return (ret);
}
/*
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 3b4dc639945..8474b6e8b37 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -77,6 +77,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
} else {
primary = clsm->cursors[clsm->nchunks - 1];
primary_chunk = clsm->primary_chunk;
+ WT_ASSERT(session, F_ISSET(&session->txn, TXN_HAS_ID));
have_primary = (primary != NULL && primary_chunk != NULL &&
(primary_chunk->switch_txn == WT_TXN_NONE ||
TXNID_LT(session->txn.id, primary_chunk->switch_txn)));
@@ -177,14 +178,15 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
/* Update the maximum transaction ID in the primary chunk. */
if (update) {
- WT_RET(__clsm_enter_update(clsm));
- if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
- goto open;
-
/*
* Ensure that there is a transaction snapshot active.
*/
WT_RET(__wt_txn_autocommit_check(session));
+ WT_RET(__wt_txn_id_check(session));
+
+ WT_RET(__clsm_enter_update(clsm));
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
if (session->txn.isolation == TXN_ISO_SNAPSHOT)
__wt_txn_cursor_op(session);
@@ -1237,11 +1239,12 @@ __clsm_put(WT_SESSION_IMPL *session,
{
WT_CURSOR *c, *primary;
WT_LSM_TREE *lsm_tree;
- u_int i;
+ u_int i, slot;
lsm_tree = clsm->lsm_tree;
WT_ASSERT(session,
+ F_ISSET(&session->txn, TXN_HAS_ID) &&
clsm->primary_chunk != NULL &&
(clsm->primary_chunk->switch_txn == WT_TXN_NONE ||
TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn)));
@@ -1257,8 +1260,15 @@ __clsm_put(WT_SESSION_IMPL *session,
if (position)
clsm->current = primary;
- for (i = 0; i < clsm->nupdates; i++) {
- c = clsm->cursors[(clsm->nchunks - i) - 1];
+ for (i = 0, slot = clsm->nchunks - 1; i < clsm->nupdates; i++, slot--) {
+ /* Check if we need to keep updating old chunks. */
+ if (i > 0 &&
+ __wt_txn_visible(session, clsm->switch_txn[slot])) {
+ clsm->nupdates = i;
+ break;
+ }
+
+ c = clsm->cursors[slot];
c->set_key(c, key);
c->set_value(c, value);
WT_RET((position && i == 0) ? c->update(c) : c->insert(c));
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index f4ddd4f7e2f..dea012ccb9e 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -401,7 +401,13 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
F_SET(src, WT_CURSTD_RAW);
WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));
- F_SET(session, WT_SESSION_NO_CACHE);
+ /*
+ * Set up so that we don't hold pages we read into cache, and so
+ * that we don't get stuck if the cache is full. If we allow
+ * ourselves to get stuck creating bloom filters, the entire tree
+ * can stall since there may be no worker threads available to flush.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
WT_ERR(src->get_key(src, &key));
WT_ERR(__wt_bloom_insert(bloom, &key));
@@ -414,15 +420,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
F_CLR(session, WT_SESSION_NO_CACHE);
- /*
- * Load the new Bloom filter into cache.
- *
- * We're doing advisory reads to fault the new trees into cache.
- * Don't block if the cache is full: our next unit of work may be to
- * discard some trees to free space.
- */
- F_SET(session, WT_SESSION_NO_CACHE_CHECK);
-
+ /* Load the new Bloom filter into cache. */
WT_CLEAR(key);
WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 4ca1a6bc623..33d79e6d4ce 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -522,6 +522,12 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
*/
mod->mod_root_split = next;
+ /*
+ * Mark the page dirty.
+ * Don't mark the tree dirty: if this reconciliation is in service of a
+ * checkpoint, it's cleared the tree's dirty flag, and we don't want to
+ * set it again as part of that walk.
+ */
WT_ERR(__wt_page_modify_init(session, next));
__wt_page_only_modify_set(session, next);
@@ -1113,12 +1119,14 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* process will have completed before we walk any pages
* for checkpoint.
*/
- if ((ret = __wt_page_in(session, ref,
+ ret = __wt_page_in(session, ref,
WT_READ_CACHE | WT_READ_NO_EVICT |
- WT_READ_NO_GEN | WT_READ_NO_WAIT)) == WT_NOTFOUND) {
+ WT_READ_NO_GEN | WT_READ_NO_WAIT);
+ if (ret == WT_NOTFOUND) {
ret = 0;
break;
}
+ WT_RET(ret);
*hazardp = 1;
goto in_memory;
@@ -1173,7 +1181,7 @@ in_memory:
CHILD_RELEASE(session, *hazardp, ref);
}
-done: WT_HAVE_DIAGNOSTIC_YIELD;
+done: WT_DIAGNOSTIC_YIELD;
return (ret);
}
@@ -1982,16 +1990,20 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
next->start = r->first_free;
next->entries = 0;
- /*
- * Set the space available to another split-size chunk, if we
- * have one. If we don't have room for another split chunk,
- * add whatever space remains in this page.
- */
+ /* Set the space available to another split-size chunk. */
r->space_avail =
r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+
+ /*
+ * Adjust the space available to handle two cases:
+ * - We don't have enough room for another full split-size
+ * chunk on the page.
+ * - We chose to fill past a page boundary because of a
+ * large item.
+ */
if (inuse + r->space_avail > r->page_size) {
- WT_ASSERT(session, r->page_size >= inuse);
- r->space_avail = r->page_size - inuse;
+ r->space_avail =
+ r->page_size > inuse ? (r->page_size - inuse) : 0;
/* There are no further boundary points. */
r->bnd_state = SPLIT_MAX;
@@ -2649,7 +2661,7 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* WT_PAGE_HEADER header onto the scratch buffer, most of the header
* information remains unchanged between the pages.
*/
- WT_RET(__wt_scr_alloc(session, r->page_size, &tmp));
+ WT_RET(__wt_scr_alloc(session, r->dsk.memsize, &tmp));
dsk = tmp->mem;
memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE);
@@ -2977,7 +2989,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_RET(__rec_split_finish(session, r));
WT_RET(__rec_write_wrapup(session, r, r->page));
- /* Mark the page's parent dirty. */
+ /* Mark the page's parent and the tree dirty. */
parent = r->ref->home;
WT_RET(__wt_page_modify_init(session, parent));
__wt_page_modify_set(session, parent);
@@ -3017,8 +3029,6 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_RET(
__rec_split_raw(session, r, key->len + val->len));
else {
- WT_RET(__rec_split(session, r, key->len + val->len));
-
/*
* Turn off prefix compression until a full key written
* to the new page, and (unless already working with an
@@ -3030,6 +3040,8 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_RET(__rec_cell_build_leaf_key(
session, r, NULL, 0, &ovfl_key));
}
+
+ WT_RET(__rec_split(session, r, key->len + val->len));
}
}
@@ -3225,15 +3237,18 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
addr = NULL;
child = ref->page;
- if (state != 0) {
- /*
- * Currently the only non-zero returned stated possible
- * for a column-store page is child-modified (all other
- * states are part of the fast-truncate support, which
- * is row-store only).
- */
- WT_ASSERT(session, state == WT_CHILD_MODIFIED);
+ /* Deleted child we don't have to write. */
+ if (state == WT_CHILD_IGNORE) {
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ }
+
+ /*
+ * Modified child. Empty pages are merged into the parent and
+ * discarded.
+ */
+ if (state == WT_CHILD_MODIFIED) {
switch (F_ISSET(child->modify, WT_PM_REC_MASK)) {
case WT_PM_REC_EMPTY:
/*
@@ -3253,7 +3268,9 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
break;
WT_ILLEGAL_VALUE_ERR(session);
}
- }
+ } else
+ /* No other states are expected for column stores. */
+ WT_ASSERT(session, state == 0);
/*
* Build the value cell. The child page address is in one of 3
@@ -4550,8 +4567,6 @@ build:
WT_PAGE_ROW_LEAF, kpack, r->cur));
key_onpage_ovfl = 0;
}
- WT_ERR(__rec_split(
- session, r, key->len + val->len));
/*
* Turn off prefix compression until a full key
@@ -4567,6 +4582,9 @@ build:
session,
r, NULL, 0, &ovfl_key));
}
+
+ WT_ERR(__rec_split(
+ session, r, key->len + val->len));
}
}
@@ -4636,9 +4654,6 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
WT_RET(__rec_split_raw(
session, r, key->len + val->len));
else {
- WT_RET(__rec_split(
- session, r, key->len + val->len));
-
/*
* Turn off prefix compression until a full key
* written to the new page, and (unless already
@@ -4653,6 +4668,9 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
session,
r, NULL, 0, &ovfl_key));
}
+
+ WT_RET(__rec_split(
+ session, r, key->len + val->len));
}
}
@@ -5085,7 +5103,7 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
for (multi = mod->mod_multi,
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
- WT_RET(__wt_row_ikey(session, 0,
+ WT_RET(__wt_row_ikey_alloc(session, 0,
bnd->key.data, bnd->key.size, &multi->key.ikey));
if (bnd->skip == NULL) {
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index 3dfd068cf9c..e913fcfe69d 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -55,11 +55,17 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
WT_ERR(ret);
}
- WT_ERR(__wt_session_get_btree_ckpt(
- session, uri, cfg, open_flags));
- WT_SAVE_DHANDLE(session,
- ret = file_func(session, cfg));
- WT_TRET(__wt_session_release_btree(session));
+ if ((ret = __wt_session_get_btree_ckpt(
+ session, uri, cfg, open_flags)) == 0) {
+ WT_SAVE_DHANDLE(session,
+ ret = file_func(session, cfg));
+ WT_TRET(__wt_session_release_btree(session));
+ } else if (ret == EBUSY)
+ /* TODO: Decode checkpoint from cfg. */
+ WT_WITH_DHANDLE_LOCK(session,
+ ret = __wt_conn_btree_apply_single_ckpt(
+ session, uri, file_func, cfg));
+ WT_ERR(ret);
}
} else if (WT_PREFIX_MATCH(uri, "colgroup:")) {
WT_ERR(__wt_schema_get_colgroup(
diff --git a/src/support/stat.c b/src/support/stat.c
index 0926636a532..9d10c4d5ca6 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -447,10 +447,15 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
"log: log records not compressed";
stats->log_compress_small.desc =
"log: log records too small to compress";
+ stats->log_release_write_lsn.desc =
+ "log: log release advances write LSN";
stats->log_scans.desc = "log: log scan operations";
stats->log_scan_rereads.desc =
"log: log scan records requiring two reads";
+ stats->log_write_lsn.desc =
+ "log: log server thread advances write LSN";
stats->log_sync.desc = "log: log sync operations";
+ stats->log_sync_dir.desc = "log: log sync_dir operations";
stats->log_writes.desc = "log: log write operations";
stats->log_slot_consolidated.desc = "log: logging bytes consolidated";
stats->log_max_filesize.desc = "log: maximum log file size";
@@ -613,9 +618,12 @@ __wt_stat_refresh_connection_stats(void *stats_arg)
stats->log_compress_writes.v = 0;
stats->log_compress_write_fails.v = 0;
stats->log_compress_small.v = 0;
+ stats->log_release_write_lsn.v = 0;
stats->log_scans.v = 0;
stats->log_scan_rereads.v = 0;
+ stats->log_write_lsn.v = 0;
stats->log_sync.v = 0;
+ stats->log_sync_dir.v = 0;
stats->log_writes.v = 0;
stats->log_slot_consolidated.v = 0;
stats->log_prealloc_max.v = 0;
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index fb590e1a297..87b85eb2d8d 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -259,10 +259,10 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
session->ckpt_handle[session->ckpt_handle_next++].dhandle =
session->dhandle;
else if (ret == EBUSY)
- WT_ERR(__wt_strdup(session, name,
- &session->ckpt_handle[session->ckpt_handle_next++].name));
+ ret = __wt_strdup(session, name,
+ &session->ckpt_handle[session->ckpt_handle_next++].name);
-err: return (ret);
+ return (ret);
}
/*
@@ -988,14 +988,23 @@ __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[])
int
__wt_checkpoint_close(WT_SESSION_IMPL *session, int force)
{
- /* If closing an unmodified file, simply discard its blocks. */
- if (!S2BT(session)->modified || force)
- return (__wt_cache_op(session, NULL,
- force ? WT_SYNC_DISCARD_FORCE : WT_SYNC_DISCARD));
+ WT_DECL_RET;
+
+ /* Handle forced discard (when dropping a file). */
+ if (force)
+ return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD_FORCE));
+
+ /* If closing an unmodified file, try to evict its pages. */
+ if (!S2BT(session)->modified) {
+ ret = __wt_cache_op(session, NULL, WT_SYNC_DISCARD);
+ if (ret != EBUSY)
+ return (ret);
+ }
/*
- * Else, checkpoint the file and optionally flush the writes (the
- * checkpoint call will discard the blocks, there's no additional
+ * If closing a modified file, or closing an unmodified file was blocked
+ * for any reason, checkpoint the file and optionally flush the writes
+ * (the checkpoint call will discard the blocks, there's no additional
* step needed).
*/
WT_RET(__checkpoint_worker(session, NULL, 0));
diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c
index 72f53fed9f8..086faef1a30 100644
--- a/test/bloom/test_bloom.c
+++ b/test/bloom/test_bloom.c
@@ -28,8 +28,6 @@
#include "wt_internal.h"
-#include <signal.h>
-
static struct {
char *progname; /* Program name */
@@ -38,8 +36,7 @@ static struct {
char *config_open; /* Command-line configuration */
- uint32_t c_bitcnt; /* Config values */
- uint32_t c_cache;
+ uint32_t c_cache; /* Config values */
uint32_t c_key_max;
uint32_t c_ops;
uint32_t c_k; /* Number of hash iterations */
@@ -49,12 +46,12 @@ static struct {
uint8_t **entries;
} g;
-static int cleanup(void);
+void cleanup(void);
void die(int e, const char *fmt, ...);
-static int populate_entries(void);
-static int run(void);
-static int setup(void);
-static void usage(void);
+void populate_entries(void);
+void run(void);
+void setup(void);
+void usage(void);
extern char *__wt_optarg;
extern int __wt_optind;
@@ -109,7 +106,7 @@ main(int argc, char *argv[])
return (EXIT_SUCCESS);
}
-int
+void
setup(void)
{
WT_CONNECTION *conn;
@@ -141,13 +138,10 @@ setup(void)
g.wt_conn = conn;
g.wt_session = session;
- if ((ret = populate_entries()) != 0)
- die(ret, "populate_entries");
-
- return (0);
+ populate_entries();
}
-int
+void
run(void)
{
WT_BLOOM *bloomp;
@@ -184,7 +178,8 @@ run(void)
if ((ret = __wt_bloom_close(bloomp)) != 0)
die(ret, "__wt_bloom_close");
- g.wt_session->checkpoint(g.wt_session, NULL);
+ if ((ret = g.wt_session->checkpoint(g.wt_session, NULL)) != 0)
+ die(ret, "WT_SESSION.checkpoint");
if ((ret = __wt_bloom_open(
sess, uri, g.c_factor, g.c_k, NULL, &bloomp)) != 0)
die(ret, "__wt_bloom_open");
@@ -212,28 +207,28 @@ run(void)
g.c_ops, fp, 100.0 * fp/g.c_ops);
if ((ret = __wt_bloom_drop(bloomp, NULL)) != 0)
die(ret, "__wt_bloom_drop");
-
- return (0);
}
-int
+void
cleanup(void)
{
uint32_t i;
+ int ret;
for (i = 0; i < g.c_ops; i++)
free(g.entries[i]);
free(g.entries);
- g.wt_session->close(g.wt_session, NULL);
- g.wt_conn->close(g.wt_conn, NULL);
- return (0);
+ if ((ret = g.wt_session->close(g.wt_session, NULL)) != 0)
+ die(ret, "WT_SESSION.close");
+ if ((ret = g.wt_conn->close(g.wt_conn, NULL)) != 0)
+ die(ret, "WT_CONNECTION.close");
}
/*
* Create and keep all the strings used to populate the bloom filter, so that
* we can do validation with the same set of entries.
*/
-static int
+void
populate_entries(void)
{
uint32_t i, j;
@@ -254,7 +249,6 @@ populate_entries(void)
}
g.entries = entries;
- return (0);
}
/*
@@ -283,7 +277,7 @@ die(int e, const char *fmt, ...)
* usage --
* Display usage statement and exit failure.
*/
-static void
+void
usage(void)
{
fprintf(stderr, "usage: %s [-cfkos]\n", g.progname);
diff --git a/test/format/bdb.c b/test/format/bdb.c
index 563b69b9e27..254dd95e1d3 100644
--- a/test/format/bdb.c
+++ b/test/format/bdb.c
@@ -66,10 +66,7 @@ bdb_open(void)
assert(dbenv->mutex_set_max(dbenv, 10000) == 0);
assert(dbenv->set_cachesize(dbenv, 0, 50 * 1024 * 1024, 1) == 0);
assert(dbenv->open(dbenv, NULL,
- DB_CREATE |
- (g.c_delete_pct == 0 && g.c_insert_pct == 0 && g.c_write_pct == 0 ?
- 0 : DB_INIT_LOCK) |
- DB_INIT_MPOOL | DB_PRIVATE, 0) == 0);
+ DB_CREATE | DB_INIT_LOCK | DB_INIT_MPOOL | DB_PRIVATE, 0) == 0);
assert(db_create(&db, dbenv, 0) == 0);
if (g.type == ROW && g.c_reverse)
diff --git a/test/format/config.c b/test/format/config.c
index 1fbbe90a57e..e801827935c 100644
--- a/test/format/config.c
+++ b/test/format/config.c
@@ -71,14 +71,14 @@ config_setup(void)
}
if (!config_find_is_perm("file_type", strlen("file_type")))
- switch (DATASOURCE("lsm") ? 3 : MMRAND(1, 3)) {
+ switch (DATASOURCE("lsm") ? 5 : MMRAND(1, 10)) {
case 1:
config_single("file_type=fix", 0);
break;
- case 2:
+ case 2: case 3: case 4:
config_single("file_type=var", 0);
break;
- case 3:
+ case 5: case 6: case 7: case 8: case 9: case 10:
config_single("file_type=row", 0);
break;
}
@@ -142,12 +142,6 @@ config_setup(void)
config_compression();
config_isolation();
- /* Clear operations values if the whole run is read-only. */
- if (g.c_ops == 0)
- for (cp = c; cp->name != NULL; ++cp)
- if (cp->flags & C_OPS)
- *cp->v = 0;
-
/*
* Periodically, set the delete percentage to 0 so salvage gets run,
* as long as the delete percentage isn't nailed down.
@@ -174,6 +168,11 @@ config_setup(void)
g.c_insert_pct = MMRAND(50, 85);
}
+ /* Make the default maximum-run length 20 minutes. */
+ cp = config_find("timer", strlen("timer"));
+ if (!(cp->flags & C_PERM))
+ g.c_timer = 20;
+
/*
* Key/value minimum/maximum are related, correct unless specified by
* the configuration.
@@ -238,8 +237,9 @@ config_compression(void)
/*
* Compression: choose something if compression wasn't specified,
* otherwise confirm the appropriate shared library is available.
- * We don't include LZO in the test compression choices, we don't
- * yet have an LZO module of our own.
+ * We used to verify that the libraries existed but that's no longer
+ * robust, since it's possible to build compression libraries into
+ * the WiredTiger library.
*/
cp = config_find("compression", strlen("compression"));
if (!(cp->flags & C_PERM)) {
@@ -249,50 +249,24 @@ config_compression(void)
case 4: case 5: case 6:
break;
case 7: case 8: case 9: case 10: /* 20% bzip */
- if (access(BZIP_PATH, R_OK) == 0)
- cstr = "compression=bzip";
+ cstr = "compression=bzip";
break;
case 11: /* 5% bzip-raw */
- if (access(BZIP_PATH, R_OK) == 0)
- cstr = "compression=bzip-raw";
+ cstr = "compression=bzip-raw";
break;
case 12: case 13: case 14: case 15: /* 20% snappy */
- if (access(SNAPPY_PATH, R_OK) == 0)
- cstr = "compression=snappy";
+ cstr = "compression=snappy";
break;
case 16: case 17: case 18: case 19: /* 20% zlib */
- if (access(ZLIB_PATH, R_OK) == 0)
- cstr = "compression=zlib";
+ cstr = "compression=zlib";
break;
case 20: /* 5% zlib-no-raw */
- if (access(ZLIB_PATH, R_OK) == 0)
- cstr = "compression=zlib-noraw";
+ cstr = "compression=zlib-noraw";
break;
}
config_single(cstr, 0);
}
-
- switch (g.c_compression_flag) {
- case COMPRESS_BZIP:
- case COMPRESS_BZIP_RAW:
- if (access(BZIP_PATH, R_OK) != 0)
- die(0, "bzip library not found or not readable");
- break;
- case COMPRESS_LZO:
- if (access(LZO_PATH, R_OK) != 0)
- die(0, "LZO library not found or not readable");
- break;
- case COMPRESS_SNAPPY:
- if (access(SNAPPY_PATH, R_OK) != 0)
- die(0, "snappy library not found or not readable");
- break;
- case COMPRESS_ZLIB:
- case COMPRESS_ZLIB_NO_RAW:
- if (access(ZLIB_PATH, R_OK) != 0)
- die(0, "zlib library not found or not readable");
- break;
- }
}
/*
diff --git a/test/format/config.h b/test/format/config.h
index 7871127ff26..d5d797f4b50 100644
--- a/test/format/config.h
+++ b/test/format/config.h
@@ -40,14 +40,11 @@ typedef struct {
/* Not a simple randomization, handle outside the main loop. */
#define C_IGNORE 0x002
- /* Operation, only set if doing operations. */
-#define C_OPS 0x004
-
/* Value was set from command-line or file, ignore for all runs. */
-#define C_PERM 0x008
+#define C_PERM 0x004
/* Value isn't random for this run, ignore just for this run. */
-#define C_TEMP 0x010
+#define C_TEMP 0x008
/* Value is a string. */
#define C_STRING 0x020
@@ -134,7 +131,7 @@ static CONFIG c[] = {
{ "delete_pct",
"percent operations that are deletes",
- C_OPS, 0, 45, 90, &g.c_delete_pct, NULL },
+ 0x0, 0, 45, 90, &g.c_delete_pct, NULL },
{ "dictionary",
"if values are dictionary compressed", /* 20% */
@@ -162,7 +159,7 @@ static CONFIG c[] = {
{ "insert_pct",
"percent operations that are inserts",
- C_OPS, 0, 45, 90, &g.c_insert_pct, NULL },
+ 0x0, 0, 45, 90, &g.c_insert_pct, NULL },
{ "internal_key_truncation",
"if internal keys are truncated", /* 95% */
@@ -270,7 +267,7 @@ static CONFIG c[] = {
C_IGNORE, 1, 32, 128, &g.c_threads, NULL },
{ "timer",
- "time to run in minutes",
+ "maximum time to run in minutes (default 20 minutes)",
C_IGNORE, 0, UINT_MAX, UINT_MAX, &g.c_timer, NULL },
{ "value_max",
@@ -287,7 +284,7 @@ static CONFIG c[] = {
{ "write_pct",
"percent operations that are writes",
- C_OPS, 0, 90, 90, &g.c_write_pct, NULL },
+ 0x0, 0, 90, 90, &g.c_write_pct, NULL },
{ NULL, NULL, 0x0, 0, 0, 0, NULL, NULL }
};
diff --git a/test/format/format.h b/test/format/format.h
index e2cd4f19c7e..58940f0c4b8 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -307,7 +307,7 @@ void wts_create(void);
void wts_dump(const char *, int);
void wts_load(void);
void wts_open(const char *, int, WT_CONNECTION **);
-void wts_ops(void);
+void wts_ops(int);
void wts_read_scan(void);
void wts_salvage(void);
void wts_stats(void);
diff --git a/test/format/ops.c b/test/format/ops.c
index 3a0a9110b9c..5fd992e9952 100644
--- a/test/format/ops.c
+++ b/test/format/ops.c
@@ -46,14 +46,14 @@ static void table_append_init(void);
* Perform a number of operations in a set of threads.
*/
void
-wts_ops(void)
+wts_ops(int lastrun)
{
TINFO *tinfo, total;
WT_CONNECTION *conn;
WT_SESSION *session;
pthread_t backup_tid, compact_tid;
- uint64_t thread_ops;
- uint32_t i, fourths;
+ int64_t fourths, thread_ops;
+ uint32_t i;
int ret, running;
conn = g.wts_conn;
@@ -71,20 +71,23 @@ wts_ops(void)
/*
* There are two mechanisms to specify the length of the run, a number
- * of operations or a timer. If the former, each thread does an equal
- * share of the total operations (and make sure that it's not 0). If
- * the latter, calculate how many fourth-of-a-second sleeps until this
- * part of the run finishes.
+ * of operations and a timer; when either expires, the run terminates.
+ * Each thread does an equal share of the total operations (and make
+ * sure that it's not 0).
+ *
+ * Calculate how many fourth-of-a-second sleeps until any timer expires.
*/
- if (g.c_timer == 0) {
- fourths = 0;
+ if (g.c_ops == 0)
+ thread_ops = -1;
+ else {
if (g.c_ops < g.c_threads)
g.c_ops = g.c_threads;
thread_ops = g.c_ops / g.c_threads;
- } else {
- fourths = (g.c_timer * 4 * 60) / FORMAT_OPERATION_REPS;
- thread_ops = 0;
}
+ if (g.c_timer == 0)
+ fourths = -1;
+ else
+ fourths = (g.c_timer * 4 * 60) / FORMAT_OPERATION_REPS;
/* Initialize the table extension code. */
table_append_init();
@@ -117,8 +120,9 @@ wts_ops(void)
die(ret, "pthread_create: compaction");
/* Spin on the threads, calculating the totals. */
- memset(&total, 0, sizeof(total));
for (;;) {
+ /* Clear out the totals each pass. */
+ memset(&total, 0, sizeof(total));
for (i = 0, running = 0; i < g.c_threads; ++i) {
total.commit += tinfo[i].commit;
total.deadlock += tinfo[i].deadlock;
@@ -140,27 +144,29 @@ wts_ops(void)
break;
}
- if (thread_ops == 0) {
+ /*
+ * If the timer has expired or this thread has completed
+ * its operations, notify the thread it should quit.
+ */
+ if (fourths == 0 ||
+ (thread_ops != -1 &&
+ tinfo[i].ops >= (uint64_t)thread_ops)) {
/*
- * Optionally drop core (for testing recovery),
- * otherwise tell the thread it's done.
+ * On the last execution, optionally drop core
+ * for recovery testing.
*/
- if (fourths == 0) {
- if (g.c_abort) {
- static char *core = NULL;
- *core = 0;
- }
- tinfo[i].quit = 1;
+ if (lastrun && g.c_abort) {
+ static char *core = NULL;
+ *core = 0;
}
- } else
- if (tinfo[i].ops >= thread_ops)
- tinfo[i].quit = 1;
+ tinfo[i].quit = 1;
+ }
}
track("ops", 0ULL, &total);
if (!running)
break;
(void)usleep(250000); /* 1/4th of a second */
- if (fourths != 0)
+ if (fourths != -1)
--fourths;
}
free(tinfo);
diff --git a/test/format/recover.sh b/test/format/recover.sh
index de908c71e5d..4177e26a278 100644
--- a/test/format/recover.sh
+++ b/test/format/recover.sh
@@ -37,12 +37,16 @@ while true; do
# Save a copy of the database directory exactly as it was at the crash.
cp -rp RUNDIR $rundir2
- # We aborted, so recovery is required
- if `$wtcmd -R -h RUNDIR list | egrep table > /dev/null`; then
- uri='table:wt'
+ #
+ # Everything is a table unless explicitly a file.
+ #
+ isfile=`grep data_source RUNDIR/CONFIG | grep -c file || exit 0`
+ if test "$isfile" -ne 0; then
+ uri="file:wt"
else
- uri='file:wt'
+ uri="table:wt"
fi
- # Force recovery to run.
+
+ # We know we aborted, so force recovery to run.
$wtcmd -R -h RUNDIR verify $uri || exit 1
done
diff --git a/test/format/smoke.sh b/test/format/smoke.sh
index 62577692d0c..fe53f64229f 100755
--- a/test/format/smoke.sh
+++ b/test/format/smoke.sh
@@ -1,7 +1,7 @@
#! /bin/sh
# Smoke-test format as part of running "make check".
-args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4"
+args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4 compression=none"
./t $args file_type=fix || exit 1
./t $args file_type=row || exit 1
diff --git a/test/format/t.c b/test/format/t.c
index b53913b4623..03b3605a5e4 100644
--- a/test/format/t.c
+++ b/test/format/t.c
@@ -40,6 +40,7 @@ extern char *__wt_optarg;
int
main(int argc, char *argv[])
{
+ time_t start;
int ch, reps, ret;
const char *config, *home;
@@ -174,7 +175,9 @@ main(int argc, char *argv[])
config_print(0); /* Dump run configuration */
key_len_setup(); /* Setup keys */
+ start = time(NULL);
track("starting up", 0ULL, NULL);
+
if (SINGLETHREADED)
bdb_open(); /* Initial file config */
wts_open(g.home, 1, &g.wts_conn);
@@ -183,35 +186,35 @@ main(int argc, char *argv[])
wts_load(); /* Load initial records */
wts_verify("post-bulk verify"); /* Verify */
- /* Loop reading & operations */
- for (reps = 0; reps < FORMAT_OPERATION_REPS; ++reps) {
- wts_read_scan(); /* Read scan */
-
- /* Operations */
- if (g.c_timer != 0 || g.c_ops != 0)
- wts_ops();
-
- /*
- * Statistics.
- *
- * XXX
- * Verify closes the underlying handle and discards the
- * statistics, read them first.
- */
- if (g.c_ops == 0 || reps == 2)
- wts_stats();
-
- /* Verify */
- wts_verify("post-ops verify");
-
- /*
- * If no operation count, quit after a single read pass.
- * (A timer configuration ran out the timer on the first
- * set of operations.)
- */
- if (g.c_ops == 0)
- break;
- }
+ /*
+ * If we're not doing any operations, scan the bulk-load, copy
+ * the statistics and we're done. Otherwise, loop reading and
+ * operations, with a verify after each set.
+ */
+ if (g.c_timer == 0 && g.c_ops == 0) {
+ wts_read_scan(); /* Read scan */
+ wts_stats(); /* Statistics */
+ } else
+ for (reps = 1; reps <= FORMAT_OPERATION_REPS; ++reps) {
+ wts_read_scan(); /* Read scan */
+
+ /* Operations */
+ wts_ops(reps == FORMAT_OPERATION_REPS);
+
+ /*
+ * Copy out the run's statistics after the last
+ * set of operations.
+ *
+ * XXX
+ * Verify closes the underlying handle and
+ * discards the statistics, read them first.
+ */
+ if (reps == FORMAT_OPERATION_REPS)
+ wts_stats();
+
+ /* Verify */
+ wts_verify("post-ops verify");
+ }
track("shutting down", 0ULL, NULL);
if (SINGLETHREADED)
@@ -233,8 +236,9 @@ main(int argc, char *argv[])
/* Overwrite the progress line with a completion line. */
if (g.track)
printf("\r%78s\r", " ");
- printf("%4d: %s, %s\n",
- g.run_cnt, g.c_data_source, g.c_file_type);
+ printf("%4d: %s, %s (%.0f seconds)\n",
+ g.run_cnt, g.c_data_source,
+ g.c_file_type, difftime(time(NULL), start));
}
/* Flush/close any logging information. */
diff --git a/test/packing/Makefile.am b/test/packing/Makefile.am
new file mode 100644
index 00000000000..a8c6c2dc69f
--- /dev/null
+++ b/test/packing/Makefile.am
@@ -0,0 +1,5 @@
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+noinst_PROGRAMS = intpack-test intpack-test2 packing-test
+LDADD = $(top_builddir)/libwiredtiger.la
+LDFLAGS = -static
diff --git a/test/packing/intpack-test.c b/test/packing/intpack-test.c
index 109f37e229a..51acea15506 100644
--- a/test/packing/intpack-test.c
+++ b/test/packing/intpack-test.c
@@ -27,29 +27,29 @@
*/
#include <assert.h>
-#include <stdlib.h>
-#include <time.h>
-#include <wt_internal.h>
-#include "intpack.i"
+#include "wt_internal.h"
-int main() {
+int
+main()
+{
+ const uint8_t *cp;
uint8_t buf[10], *p;
- uint64_t r, r2, ncalls;
- int i, s;
+ uint64_t ncalls, r, r2, s;
+ int i;
ncalls = 0;
for (i = 0; i < 10000000; i++) {
for (s = 0; s < 50; s += 5) {
++ncalls;
- r = 1 << s;
+ r = 1ULL << s;
#if 1
p = buf;
- __wt_vpack_uint(NULL, &p, sizeof buf, r);
- p = buf;
- __wt_vunpack_uint(NULL, &p, sizeof buf, &r2);
+ assert(__wt_vpack_uint(&p, sizeof(buf), r) == 0);
+ cp = buf;
+ assert(__wt_vunpack_uint(&cp, sizeof(buf), &r2) == 0);
#else
/*
* Note: use memmove for comparison because GCC does
@@ -57,9 +57,9 @@ int main() {
* to measure anything.
*/
p = buf;
- memmove(p, &r, sizeof r);
- p = buf;
- memmove(&r2, p, sizeof r2);
+ memmove(p, &r, sizeof(r));
+ cp = buf;
+ memmove(&r2, cp, sizeof(r2));
#endif
if (r != r2) {
fprintf(stderr, "mismatch!\n");
diff --git a/test/packing/intpack-test2.c b/test/packing/intpack-test2.c
index 6b54504f367..d9ac9373cea 100644
--- a/test/packing/intpack-test2.c
+++ b/test/packing/intpack-test2.c
@@ -27,27 +27,26 @@
*/
#include <assert.h>
-#include <stdlib.h>
-#include <time.h>
-#include <wt_internal.h>
-#include "intpack.i"
+#include "wt_internal.h"
-int main() {
+int
+main()
+{
uint8_t buf[10], *p, *end;
int64_t i;
for (i = 1; i < 1LL << 60; i <<= 1) {
end = buf;
- __wt_vpack_uint(NULL, &end, sizeof buf, i);
- printf("%lld ", i);
+ assert(__wt_vpack_uint(&end, sizeof(buf), (uint64_t)i) == 0);
+ printf("%" PRId64 " ", i);
for (p = buf; p < end; p++)
printf("%02x", *p);
printf("\n");
end = buf;
- __wt_vpack_int(NULL, &end, sizeof buf, -i);
- printf("%lld ", -i);
+ assert(__wt_vpack_int(&end, sizeof(buf), -i) == 0);
+ printf("%" PRId64 " ", -i);
for (p = buf; p < end; p++)
printf("%02x", *p);
printf("\n");
diff --git a/test/packing/packing-test.c b/test/packing/packing-test.c
index 2696e8a008d..32b7d3d17ec 100644
--- a/test/packing/packing-test.c
+++ b/test/packing/packing-test.c
@@ -27,26 +27,26 @@
*/
#include <assert.h>
-#include <stdlib.h>
-#include <time.h>
-#include <wiredtiger.h>
-#include <stdarg.h>
+#include "wt_internal.h"
-void check(const char *fmt, ...)
+static void
+check(const char *fmt, ...)
{
char buf[200], *end, *p;
va_list ap;
size_t len;
+ len = 0; /* -Werror=maybe-uninitialized */
+
va_start(ap, fmt);
- len = wiredtiger_struct_sizev(fmt, ap);
+ assert(__wt_struct_sizev(NULL, &len, fmt, ap) == 0);
va_end(ap);
- assert(len < sizeof buf);
+ assert(len < sizeof(buf));
va_start(ap, fmt);
- assert(wiredtiger_struct_packv(buf, sizeof buf, fmt, ap) == 0);
+ assert(__wt_struct_packv(NULL, buf, sizeof(buf), fmt, ap) == 0);
va_end(ap);
printf("%s ", fmt);
@@ -55,7 +55,9 @@ void check(const char *fmt, ...)
printf("\n");
}
-int main() {
+int
+main()
+{
check("iii", 0, 101, -99);
check("3i", 0, 101, -99);
check("iS", 42, "forty two");
diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c
index f1e4f26c255..1c4d54df9e9 100644
--- a/test/salvage/salvage.c
+++ b/test/salvage/salvage.c
@@ -447,6 +447,18 @@ run(int r)
}
/*
+ * file_exists --
+ * Return non-zero if the file exists (stat succeeds), zero otherwise.
+ */
+static int
+file_exists(const char *path)
+{
+ struct stat sb;
+
+ return (stat(path, &sb) == 0);
+}
+
+/*
* build --
* Build a row- or column-store page in a file.
*/
@@ -529,21 +541,16 @@ build(int ikey, int ivalue, int cnt)
}
/*
- * The first time through this routine we put a matching configuration
- * in for the salvage file.
+ * The first time through this routine we create the salvage file and
+ * then remove it (all we want is the appropriate schema entry, we're
+ * creating the salvage file itself by hand).
*/
- new_slvg = (access(SLVG, F_OK) != 0);
+ new_slvg = !file_exists(SLVG);
if (new_slvg) {
assert(session->drop(session, "file:" SLVG, "force") == 0);
assert(session->create(session, "file:" SLVG, config) == 0);
}
-
assert(conn->close(conn, 0) == 0);
-
- /*
- * We created the salvage file above, but all we want is the schema,
- * we're creating the salvage file by hand.
- */
if (new_slvg)
(void)remove(SLVG);
}
@@ -567,12 +574,13 @@ copy(u_int gen, u_int recno)
* copy the first sector (the file description).
* Otherwise, we are appending to an existing file.
*/
- if (access(SLVG, F_OK)) {
+ if (file_exists(SLVG))
+ assert((ofp = fopen(SLVG, "a")) != NULL);
+ else {
assert((ofp = fopen(SLVG, "w")) != NULL);
assert(fread(buf, 1, PSIZE, ifp) == PSIZE);
assert(fwrite(buf, 1, PSIZE, ofp) == PSIZE);
- } else
- assert((ofp = fopen(SLVG, "a")) != NULL);
+ }
/*
* If there's data, copy/update the first formatted page.
diff --git a/test/suite/test_bug009.py b/test/suite/test_bug009.py
new file mode 100644
index 00000000000..9074d45bafd
--- /dev/null
+++ b/test/suite/test_bug009.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_bug009.py
+# check that reconciliation takes into account prefix compression
+# when figuring out how to split pages
+#
+
+import wiredtiger, wttest
+from wiredtiger import stat
+from helper import confirm_empty,\
+ key_populate, value_populate, simple_populate,\
+ complex_populate, complex_value_populate
+from wtscenario import multiply_scenarios, number_scenarios
+
+class test_bug009(wttest.WiredTigerTestCase):
+ name = 'test_bug009'
+ uri = 'file:' + name
+
+ def test_reconciliation_prefix_compression(self):
+ # Configure 4KB pages with prefix compression enabled and support for
+ # large data items.
+ self.session.create(self.uri,
+ 'prefix_compression=1,' +
+ 'key_format=S,value_format=S,' +
+ 'internal_page_max=4KB,leaf_page_max=4KB,' +
+ 'leaf_value_max=3096')
+
+ cursor = self.session.open_cursor(self.uri, None)
+ # Insert two items with keys that will be prefix compressed and data
+ # items sized so that the compression size difference tips the
+ # size over a page boundary.
+ cursor.set_key('fill_2__b_27')
+ cursor.set_value(2294 * '0')
+ cursor.insert()
+
+ cursor.set_key('fill_2__b_28')
+ cursor.set_value(3022 * '0')
+ cursor.insert()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_bug010.py b/test/suite/test_bug010.py
new file mode 100644
index 00000000000..31e9777aa8e
--- /dev/null
+++ b/test/suite/test_bug010.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_bug010.py
+# check that checkpoints don't leave files marked clean when they
+# did not write all updates out.
+#
+
+import wiredtiger, wttest, wtthread
+import threading, time
+
+class test_bug010(wttest.WiredTigerTestCase):
+ name = 'test_bug010'
+ uri = 'table:' + name
+ num_tables = 1000
+
+ # Overrides WiredTigerTestCase
+ def setUpConnectionOpen(self, dir):
+ self.home = dir
+ # Disable checkpoint sync, to make checkpoints faster and
+ # increase the likelihood of triggering the symptom
+ conn_params = ',create,checkpoint_sync=false'
+ conn = wiredtiger.wiredtiger_open(dir, conn_params)
+ return conn
+
+ def test_checkpoint_dirty(self):
+ # Create a lot of tables
+ # insert the same item in each
+ # Start a checkpoint with some of the updates
+ # Create another checkpoint that should contain all data consistently
+ # Read from the checkpoint and make sure the data is consistent
+ for i in range(0, self.num_tables):
+ self.printVerbose(3, 'Creating table ' + str(i))
+ self.session.create(self.uri + str(i),
+ 'key_format=S,value_format=i')
+ c = self.session.open_cursor(self.uri + str(i), None)
+ c.set_key('a')
+ c.set_value(0)
+ c.insert()
+ c.close()
+
+ self.session.checkpoint()
+
+ # Each pass below bumps the value stored under key 'a' in every table.
+ expected_val = 0
+ for its in range(1, 10):
+ self.printVerbose(3, 'Doing iteration ' + str(its))
+
+ # Create a checkpoint thread
+ done = threading.Event()
+ ckpt = wtthread.checkpoint_thread(self.conn, done)
+ ckpt.start()
+ try:
+ expected_val += 1
+ for i in range(0, self.num_tables):
+ c = self.session.open_cursor(self.uri + str(i), None)
+ c.set_key('a')
+ c.set_value(expected_val)
+ c.insert()
+ c.close()
+ finally:
+ done.set()
+ ckpt.join()
+
+ # Execute another checkpoint, to make sure we have a consistent
+ # view of the data.
+ self.session.checkpoint()
+ for i in range(0, self.num_tables):
+ c = self.session.open_cursor(
+ self.uri + str(i), None, 'checkpoint=WiredTigerCheckpoint')
+ c.next()
+ self.assertEqual(c.get_value(), expected_val,
+ msg='Mismatch on iteration ' + str(its) +\
+ ' for table ' + str(i))
+ c.close()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_dump.py b/test/suite/test_dump.py
index 4c7e6f667e4..6d81c102028 100644
--- a/test/suite/test_dump.py
+++ b/test/suite/test_dump.py
@@ -67,6 +67,31 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess):
scenarios = number_scenarios(
multiply_scenarios('.', types, keyfmt, dumpfmt))
+ # Extract the value lines from the dump output.
+ def value_lines(self, fname):
+ # mode:
+ # 0 == we are in the header
+ # 1 == next line is key
+ # 2 == next line is value
+ mode = 0
+ lines = []
+ for line in open(fname).readlines():
+ if mode == 0:
+ if line == 'Data\n':
+ mode = 1
+ elif mode == 1:
+ mode = 2
+ else:
+ # This is a value line, keep it.
+ lines.append(line)
+ mode = 1
+ return sorted(lines)
+
+ def compare_dump_values(self, f1, f2):
+ l1 = self.value_lines(f1)
+ l2 = self.value_lines(f2)
+ self.assertEqual(l1, l2)
+
# Dump, re-load and do a content comparison.
def test_dump(self):
# Create the object.
@@ -105,5 +130,14 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess):
'load', '-n', '-f', 'dump.out'], errfilename='errfile.out')
self.check_non_empty_file('errfile.out')
+ # If there are indices, dump one of them and check the output.
+ if self.populate == complex_populate:
+ indexuri = 'index:' + self.name + ':indx1'
+ hexopt = ['-x'] if self.hex == 1 else []
+ self.runWt(['-h', self.dir, 'dump'] + hexopt + [indexuri],
+ outfilename='dumpidx.out')
+ self.check_non_empty_file('dumpidx.out')
+ self.compare_dump_values('dump.out', 'dumpidx.out')
+
if __name__ == '__main__':
wttest.run()
diff --git a/test/suite/test_shared_cache.py b/test/suite/test_shared_cache01.py
index ff40d31e6df..e6d712e61bc 100644
--- a/test/suite/test_shared_cache.py
+++ b/test/suite/test_shared_cache01.py
@@ -33,12 +33,12 @@ import wiredtiger, wttest
from wttest import unittest
from helper import key_populate, simple_populate
-# test_shared_cache.py
+# test_shared_cache01.py
# Checkpoint tests
# Test shared cache shared amongst multiple connections.
-class test_shared_cache(wttest.WiredTigerTestCase):
+class test_shared_cache01(wttest.WiredTigerTestCase):
- uri = 'table:test_shared_cache'
+ uri = 'table:test_shared_cache01'
# Setup fairly large items to use up cache
data_str = 'abcdefghijklmnopqrstuvwxyz' * 20
@@ -89,7 +89,7 @@ class test_shared_cache(wttest.WiredTigerTestCase):
self.sessions = [] # Implicitly closed when closing sessions.
# Basic test of shared cache
- def test_shared_cache01(self):
+ def test_shared_cache_basic(self):
nops = 1000
self.openConnections(['WT_TEST1', 'WT_TEST2'])
@@ -99,7 +99,7 @@ class test_shared_cache(wttest.WiredTigerTestCase):
self.closeConnections()
# Test of shared cache with more connections
- def test_shared_cache02(self):
+ def test_shared_cache_more_connections(self):
nops = 1000
self.openConnections(['WT_TEST1', 'WT_TEST2', 'WT_TEST3', 'WT_TEST4'])
@@ -109,7 +109,7 @@ class test_shared_cache(wttest.WiredTigerTestCase):
self.closeConnections()
# Do enough work for the shared cache to be fully allocated.
- def test_shared_cache03(self):
+ def test_shared_cache_full(self):
nops = 10000
self.openConnections(['WT_TEST1', 'WT_TEST2'])
for sess in self.sessions:
@@ -121,7 +121,7 @@ class test_shared_cache(wttest.WiredTigerTestCase):
self.closeConnections()
# Switch the work between connections, to test rebalancing.
- def test_shared_cache04(self):
+ def test_shared_cache_rebalance(self):
# About 100 MB of data with ~250 byte values.
nops = 200000
self.openConnections(['WT_TEST1', 'WT_TEST2'])
@@ -132,7 +132,7 @@ class test_shared_cache(wttest.WiredTigerTestCase):
self.closeConnections()
# Add a new connection once the shared cache is already established.
- def test_shared_cache05(self):
+ def test_shared_cache_late_join(self):
nops = 1000
self.openConnections(['WT_TEST1', 'WT_TEST2'])
@@ -147,7 +147,7 @@ class test_shared_cache(wttest.WiredTigerTestCase):
self.closeConnections()
# Close a connection and keep using other connections.
- def test_shared_cache06(self):
+ def test_shared_cache_leaving(self):
nops = 10000
self.openConnections(['WT_TEST1', 'WT_TEST2', 'WT_TEST3'])
@@ -163,7 +163,7 @@ class test_shared_cache(wttest.WiredTigerTestCase):
# Test verbose output
@unittest.skip("Verbose output handling")
- def test_shared_cache07(self):
+ def test_shared_cache_verbose(self):
nops = 1000
self.openConnections(
['WT_TEST1', 'WT_TEST2'], extra_opts="verbose=[shared_cache]")
@@ -174,7 +174,7 @@ class test_shared_cache(wttest.WiredTigerTestCase):
self.closeConnections()
# Test opening a connection outside of the shared cache
- def test_shared_cache08(self):
+ def test_shared_cache_mixed(self):
nops = 1000
self.openConnections(['WT_TEST1', 'WT_TEST2'])
@@ -185,7 +185,7 @@ class test_shared_cache(wttest.WiredTigerTestCase):
self.closeConnections()
# Test default config values
- def test_shared_cache09(self):
+ def test_shared_cache_defaults(self):
nops = 1000
self.openConnections(['WT_TEST1', 'WT_TEST2'], pool_opts=',shared_cache=(name=pool,size=200M)')
@@ -194,21 +194,8 @@ class test_shared_cache(wttest.WiredTigerTestCase):
self.add_records(sess, 0, nops)
self.closeConnections()
- # Test reconfigure API
- def test_shared_cache10(self):
- nops = 1000
- self.openConnections(['WT_TEST1', 'WT_TEST2'])
-
- for sess in self.sessions:
- sess.create(self.uri, "key_format=S,value_format=S")
- self.add_records(sess, 0, nops)
-
- connection = self.conns[0]
- connection.reconfigure("shared_cache=(name=pool,size=300M)")
- self.closeConnections()
-
# Test default config values
- def test_shared_cache11(self):
+ def test_shared_cache_defaults2(self):
nops = 1000
self.openConnections(['WT_TEST1', 'WT_TEST2'], pool_opts=',shared_cache=(name=pool)')
diff --git a/test/suite/test_shared_cache02.py b/test/suite/test_shared_cache02.py
new file mode 100644
index 00000000000..3806e9d0cda
--- /dev/null
+++ b/test/suite/test_shared_cache02.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+# If unittest2 is available, use it in preference to (the old) unittest
+
+import os
+import shutil
+import wiredtiger, wttest
+from wttest import unittest
+from helper import key_populate, simple_populate
+
+# test_shared_cache02.py
+# Shared cache tests
+# Test shared cache shared amongst multiple connections.
+class test_shared_cache02(wttest.WiredTigerTestCase):
+
+ uri = 'table:test_shared_cache02'
+ # Setup fairly large items to use up cache
+ data_str = 'abcdefghijklmnopqrstuvwxyz' * 20
+
+ # Add a set of records
+ def add_records(self, session, start, stop):
+ cursor = session.open_cursor(self.uri, None, "overwrite")
+ for i in range(start, stop+1):
+ cursor.set_key("%010d KEY------" % i)
+ cursor.set_value("%010d VALUE "% i + self.data_str)
+ self.assertEqual(cursor.insert(), 0)
+ cursor.close()
+
+ # Disable default setup/shutdown steps - connections are managed manually.
+ def setUpSessionOpen(self, conn):
+ return None
+
+ def close_conn(self):
+ return None
+
+ def setUpConnectionOpen(self, dir):
+ return None
+
+ def openConnections(
+ self,
+ connections,
+ pool_opts = ',shared_cache=(name=pool,size=200M,chunk=10M,reserve=30M),',
+ extra_opts = '',
+ add=0):
+ if add == 0:
+ self.conns = []
+ self.sessions = []
+ # Open the set of connections.
+ for name in connections:
+ shutil.rmtree(name, True)
+ os.mkdir(name)
+ next_conn = wiredtiger.wiredtiger_open(
+ name,
+ 'create,error_prefix="' + self.shortid() + ': "' +
+ pool_opts + extra_opts)
+ self.conns.append(next_conn)
+ self.sessions.append(next_conn.open_session(None))
+ return None
+
+ def closeConnections(self):
+ for tmp_conn in self.conns:
+ tmp_conn.close()
+ self.conns = []
+ self.sessions = [] # Implicitly closed when closing sessions.
+
+ # Test reconfigure API
+ def test_shared_cache_reconfig01(self):
+ nops = 1000
+ self.openConnections(['WT_TEST1', 'WT_TEST2'])
+
+ for sess in self.sessions:
+ sess.create(self.uri, "key_format=S,value_format=S")
+ self.add_records(sess, 0, nops)
+
+ connection = self.conns[0]
+ connection.reconfigure("shared_cache=(name=pool,size=300M)")
+ self.closeConnections()
+
+ # Test reconfigure that grows the usage over quota fails
+ def test_shared_cache_reconfig02(self):
+ nops = 1000
+ self.openConnections(['WT_TEST1', 'WT_TEST2'],
+ pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),')
+
+ for sess in self.sessions:
+ sess.create(self.uri, "key_format=S,value_format=S")
+ self.add_records(sess, 0, nops)
+
+ connection = self.conns[0]
+ # Reconfigure to over-subscribe, call should fail with an error
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: connection.reconfigure("shared_cache=(name=pool,reserve=40M)"),
+ '/Shared cache unable to accommodate this configuration/')
+ # TODO: Ensure that the reserve size wasn't updated.
+ # cursor = self.sessions[0].open_cursor('config:', None, None)
+ # value = cursor['connection']
+ # self.assertTrue(value.find('reserve') != -1)
+
+ self.closeConnections()
+
+ # Test reconfigure that would grow the usage over quota if the
+ # previous reserve size isn't taken into account
+ def test_shared_cache_reconfig03(self):
+ nops = 1000
+ self.openConnections(['WT_TEST1', 'WT_TEST2'],
+ pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),')
+
+ for sess in self.sessions:
+ sess.create(self.uri, "key_format=S,value_format=S")
+ self.add_records(sess, 0, nops)
+
+ connection = self.conns[0]
+
+ connection.reconfigure("shared_cache=(name=pool,reserve=30M)"),
+
+ # TODO: Ensure that the reserve size was updated.
+ # cursor = self.sessions[0].open_cursor('config:', None, None)
+ # value = cursor['connection']
+ # self.assertTrue(value.find('reserve') != -1)
+
+ self.closeConnections()
+
+ # Test reconfigure that switches to using a shared cache on
+ # connections that were opened without one
+ def test_shared_cache_reconfig04(self):
+ nops = 1000
+ self.openConnections(['WT_TEST1', 'WT_TEST2'], pool_opts = ',')
+
+ for sess in self.sessions:
+ sess.create(self.uri, "key_format=S,value_format=S")
+ self.add_records(sess, 0, nops)
+
+ self.conns[0].reconfigure("shared_cache=(name=pool,reserve=20M)")
+ self.conns[1].reconfigure("shared_cache=(name=pool,reserve=20M)")
+
+ # TODO: Ensure that the reserve size was updated.
+ # cursor = self.sessions[0].open_cursor('config:', None, None)
+ # value = cursor['connection']
+ # self.assertTrue(value.find('reserve') != -1)
+
+ self.closeConnections()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_stat01.py b/test/suite/test_stat01.py
index 0f072a7c473..0b778d63b9d 100644
--- a/test/suite/test_stat01.py
+++ b/test/suite/test_stat01.py
@@ -28,6 +28,8 @@
import helper, wiredtiger, wttest
from wiredtiger import stat
+from helper import key_populate, simple_populate
+from wtscenario import multiply_scenarios, number_scenarios
# test_stat01.py
# Statistics operations
@@ -36,17 +38,23 @@ class test_stat01(wttest.WiredTigerTestCase):
Test statistics
"""
- tablename = 'test_stat01.wt'
- uri = 'file:' + tablename
- config = 'key_format=S,' +\
- 'allocation_size=512,internal_page_max=16K,leaf_page_max=128K'
+ config = 'internal_page_max=4K,leaf_page_max=8K'
nentries = 25
+ types = [
+ ('file', dict(uri='file:test_stat01.wt')),
+ ('table', dict(uri='table:test_stat01.wt'))
+ ]
+ keyfmt = [
+ ('recno', dict(keyfmt='r')),
+ ('string', dict(keyfmt='S')),
+ ]
+ scenarios = number_scenarios(multiply_scenarios('.', types, keyfmt))
+
# Override WiredTigerTestCase, we have extensions.
def setUpConnectionOpen(self, dir):
conn = wiredtiger.wiredtiger_open(dir,
- 'create,statistics=(fast),' +
- 'error_prefix="%s: "' % self.shortid())
+ 'create,statistics=(all),' + 'error_prefix="%s: "' % self.shortid())
return conn
def statstr_to_int(self, str):
@@ -57,17 +65,17 @@ class test_stat01(wttest.WiredTigerTestCase):
parts = str.rpartition('(')
return int(parts[2].rstrip(')'))
- def check_stats(self, statcursor, mincount, lookfor):
- """
- Do a quick check of the entries in the the stats cursor,
- There should be at least 'mincount' entries,
- and the 'lookfor' string should appear
- """
+ # Do a quick check of the entries in the stats cursor, the "lookfor"
+ # string should appear with a minimum value of at least "min".
+ def check_stats(self, statcursor, min, lookfor):
stringclass = ''.__class__
intclass = (0).__class__
- # make sure statistics basically look right
- count = 0
+
+ # Reset the cursor, we're called multiple times.
+ statcursor.reset()
+
found = False
+ foundval = 0
for id, desc, valstr, val in statcursor:
self.assertEqual(type(desc), stringclass)
self.assertEqual(type(valstr), stringclass)
@@ -75,68 +83,76 @@ class test_stat01(wttest.WiredTigerTestCase):
self.assertEqual(val, self.statstr_to_int(valstr))
self.printVerbose(2, ' stat: \'' + desc + '\', \'' +
valstr + '\', ' + str(val))
- count += 1
if desc == lookfor:
found = True
- self.assertTrue(count > mincount)
+ foundval = val
+
self.assertTrue(found, 'in stats, did not see: ' + lookfor)
+ self.assertTrue(foundval >= min)
+ # Test simple connection statistics.
def test_basic_conn_stats(self):
- self.printVerbose(2, 'overall database stats:')
+ # Build an object and force some writes.
+ config = self.config + ',key_format=' + self.keyfmt
+ simple_populate(self, self.uri, config, 1000)
+ self.session.checkpoint(None)
+
+ # See that we can get a specific stat value by its key and verify its
+ # entry is self-consistent.
allstat_cursor = self.session.open_cursor('statistics:', None, None)
self.check_stats(allstat_cursor, 10, 'block-manager: blocks written')
- # See that we can get a specific stat value by its key,
- # and verify that its entry is self-consistent
values = allstat_cursor[stat.conn.block_write]
self.assertEqual(values[0], 'block-manager: blocks written')
val = self.statstr_to_int(values[1])
self.assertEqual(val, values[2])
allstat_cursor.close()
+ # Test simple object statistics.
def test_basic_data_source_stats(self):
- self.session.create(self.uri, self.config)
+ # Build an object.
+ config = self.config + ',key_format=' + self.keyfmt
+ self.session.create(self.uri, config)
cursor = self.session.open_cursor(self.uri, None, None)
value = ""
- for i in range(0, self.nentries):
- key = str(i)
- value = value + key + value # size grows exponentially
- cursor.set_key(key)
+ for i in range(1, self.nentries):
+ value = value + 1000 * "a"
+ cursor.set_key(key_populate(cursor, i))
cursor.set_value(value)
cursor.insert()
cursor.close()
- self.printVerbose(2, 'data source specific stats:')
- cursor = self.session.open_cursor(
- 'statistics:' + self.uri, None, None)
+ # Force the object to disk, otherwise we can't check the overflow count.
+ self.reopen_conn()
+
+ # See that we can get a specific stat value by its key and verify its
+ # entry is self-consistent.
+ cursor = self.session.open_cursor('statistics:' + self.uri, None, None)
+ self.check_stats(cursor, 8192, 'btree: maximum leaf page size')
+ self.check_stats(cursor, 4096, 'btree: maximum internal page size')
self.check_stats(cursor, 10, 'btree: overflow pages')
- # See that we can get a specific stat value by its key,
- # and verify that its entry is self-consistent
values = cursor[stat.dsrc.btree_overflow]
self.assertEqual(values[0], 'btree: overflow pages')
val = self.statstr_to_int(values[1])
self.assertEqual(val, values[2])
cursor.close()
- def test_missing_file_stats(self):
- self.assertRaises(wiredtiger.WiredTigerError, lambda:
- self.session.open_cursor('statistics:file:DoesNotExist'))
-
+ # Test simple per-checkpoint statistics.
def test_checkpoint_stats(self):
- nentries = 0
- last_size = 0
for name in ('first', 'second', 'third'):
- helper.simple_populate(self, self.uri, self.config, nentries)
- nentries += self.nentries
+ config = self.config + ',key_format=' + self.keyfmt
+ helper.simple_populate(self, self.uri, config, self.nentries)
self.session.checkpoint('name=' + name)
cursor = self.session.open_cursor(
'statistics:' + self.uri, None, 'checkpoint=' + name)
- size = cursor[stat.dsrc.btree_overflow][1]
- self.assertTrue(size >= last_size)
- last_size = size
+ self.assertEqual(
+ cursor[stat.dsrc.btree_entries][2], self.nentries + 1)
cursor.close()
- self.session.truncate(self.uri, None, None)
+
+ def test_missing_file_stats(self):
+ self.assertRaises(wiredtiger.WiredTigerError, lambda:
+ self.session.open_cursor('statistics:file:DoesNotExist'))
if __name__ == '__main__':
wttest.run()