diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2015-03-09 17:47:27 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2015-03-09 17:47:27 +1100 |
commit | 3a3bda539cdd34428b7489fa0fa102ff0605e8d8 (patch) | |
tree | fc901ad7b45300181356b8305ecb02fff13f4bfc | |
parent | 89f45aafdff48bf7c8e191b788a144cab0b86122 (diff) | |
parent | 0afa07b0cd666adf7576901540a699b0bec396e3 (diff) | |
download | mongo-3a3bda539cdd34428b7489fa0fa102ff0605e8d8.tar.gz |
Merge branch 'develop' into mongodb-3.0
Conflicts:
NEWS.MONGODB
72 files changed, 1612 insertions, 720 deletions
@@ -1,6 +1,6 @@ -WiredTiger 2.5.1: (March 9, 2015) +WiredTiger 2.5.2: (March 9, 2015) -This is version 2.5.1 of WiredTiger. +This is version 2.5.2 of WiredTiger. WiredTiger release packages and documentation can be found at: @@ -9,7 +9,7 @@ WiredTiger release packages and documentation can be found at: Information on configuring, building and installing WiredTiger can be found at: - http://source.wiredtiger.com/2.5.1/install.html + http://source.wiredtiger.com/2.5.2/install.html WiredTiger licensing information can be found at: diff --git a/RELEASE_INFO b/RELEASE_INFO index 6c7da8cb961..ac5ff8ac028 100644 --- a/RELEASE_INFO +++ b/RELEASE_INFO @@ -1,6 +1,6 @@ WIREDTIGER_VERSION_MAJOR=2 WIREDTIGER_VERSION_MINOR=5 -WIREDTIGER_VERSION_PATCH=1 +WIREDTIGER_VERSION_PATCH=2 WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH" WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"` diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs index d37acef50e1..82feee58aa1 100644 --- a/build_posix/Make.subdirs +++ b/build_posix/Make.subdirs @@ -24,5 +24,6 @@ test/checkpoint test/fops test/format HAVE_BERKELEY_DB test/huge +test/packing test/salvage test/thread diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4 index 7f4d68e8b39..cbd389ea40d 100644 --- a/build_posix/aclocal/version-set.m4 +++ b/build_posix/aclocal/version-set.m4 @@ -2,8 +2,8 @@ dnl build by dist/s_version VERSION_MAJOR=2 VERSION_MINOR=5 -VERSION_PATCH=1 -VERSION_STRING='"WiredTiger 2.5.1: (March 9, 2015)"' +VERSION_PATCH=2 +VERSION_STRING='"WiredTiger 2.5.2: (March 9, 2015)"' AC_SUBST(VERSION_MAJOR) AC_SUBST(VERSION_MINOR) diff --git a/build_posix/aclocal/version.m4 b/build_posix/aclocal/version.m4 index 71598b276eb..340f77e5474 100644 --- a/build_posix/aclocal/version.m4 +++ b/build_posix/aclocal/version.m4 @@ -1,2 +1,2 @@ dnl WiredTiger product version for AC_INIT. 
Maintained by dist/s_version -2.5.1 +2.5.2 diff --git a/dist/flags.py b/dist/flags.py index a0e307debf6..f1eb6b24968 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -36,12 +36,11 @@ flags = { 'page_read' : [ 'READ_CACHE', 'READ_COMPACT', - 'READ_NO_GEN', 'READ_NO_EVICT', + 'READ_NO_GEN', 'READ_NO_WAIT', 'READ_PREV', 'READ_SKIP_INTL', - 'READ_SKIP_LEAF', 'READ_TRUNCATE', 'READ_WONT_NEED', ], @@ -88,15 +87,16 @@ flags = { 'conn' : [ 'CONN_CACHE_POOL', 'CONN_CKPT_SYNC', + 'CONN_CLOSING', 'CONN_EVICTION_RUN', 'CONN_LEAK_MEMORY', 'CONN_LOG_SERVER_RUN', 'CONN_LSM_MERGE', 'CONN_PANIC', - 'CONN_SERVER_RUN', 'CONN_SERVER_ASYNC', 'CONN_SERVER_CHECKPOINT', 'CONN_SERVER_LSM', + 'CONN_SERVER_RUN', 'CONN_SERVER_STATISTICS', 'CONN_SERVER_SWEEP', 'CONN_WAS_BACKUP', diff --git a/dist/package/wiredtiger.spec b/dist/package/wiredtiger.spec index ab762ef17fd..11eca316ffd 100644 --- a/dist/package/wiredtiger.spec +++ b/dist/package/wiredtiger.spec @@ -1,5 +1,5 @@ Name: wiredtiger -Version: 2.5.1 +Version: 2.5.2 Release: 1%{?dist} Summary: WiredTiger data storage engine diff --git a/dist/s_define.list b/dist/s_define.list index 91fbc971afa..4924a1935ae 100644 --- a/dist/s_define.list +++ b/dist/s_define.list @@ -49,6 +49,7 @@ WT_STAT_ATOMIC_DECR WT_STAT_ATOMIC_DECRV WT_STAT_ATOMIC_INCR WT_STAT_ATOMIC_INCRV +WT_STAT_DECR WT_STAT_DECRV WT_STAT_FAST_ATOMIC_DECR WT_STAT_FAST_ATOMIC_DECRV diff --git a/dist/s_string.ok b/dist/s_string.ok index 66439faf161..8b0335a6480 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -551,6 +551,7 @@ dest dev dhandle dhandles +dir dirlist dl dlclose @@ -1161,6 +1162,7 @@ wrapup writelock writeunlock wrlock +wrlsn ws wti wtperf diff --git a/dist/stat_data.py b/dist/stat_data.py index 5a42f2ff318..dd4d292c8b6 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -221,11 +221,14 @@ connection_stats = [ LogStat('log_prealloc_max', 'number of pre-allocated log files to create'), LogStat('log_prealloc_used', 'pre-allocated log files used'), 
LogStat('log_reads', 'log read operations'), + LogStat('log_release_write_lsn', 'log release advances write LSN'), LogStat('log_scan_records', 'records processed by log scan'), LogStat('log_scan_rereads', 'log scan records requiring two reads'), LogStat('log_scans', 'log scan operations'), LogStat('log_sync', 'log sync operations'), + LogStat('log_sync_dir', 'log sync_dir operations'), LogStat('log_writes', 'log write operations'), + LogStat('log_write_lsn', 'log server thread advances write LSN'), LogStat('log_slot_consolidated', 'logging bytes consolidated'), LogStat('log_slot_closes', 'consolidated slot closures'), diff --git a/examples/c/ex_pack.c b/examples/c/ex_pack.c index 19be35119af..c24805ade29 100644 --- a/examples/c/ex_pack.c +++ b/examples/c/ex_pack.c @@ -42,8 +42,6 @@ main(void) { WT_CONNECTION *conn; WT_SESSION *session; - char buf[50]; - size_t size; int i, j, k, ret; /* @@ -66,7 +64,11 @@ main(void) fprintf(stderr, "Error opening a session on %s: %s\n", home, wiredtiger_strerror(ret)); + { /*! [packing] */ + size_t size; + char buf[50]; + ret = wiredtiger_struct_size(session, &size, "iii", 42, 1000, -9); if (size > sizeof(buf)) { /* Allocate a bigger buffer. */ @@ -76,6 +78,7 @@ main(void) ret = wiredtiger_struct_unpack(session, buf, size, "iii", &i, &j, &k); /*! [packing] */ + } /* Note: closing the connection implicitly closes open session(s). 
*/ if ((ret = conn->close(conn, NULL)) != 0) diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index 9c4ab05ce40..479f6547e42 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -221,9 +221,6 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) { int skip; - if (ref->state != WT_REF_DELETED) - return (0); - /* * Deleted pages come from two sources: either it's a fast-delete as * described above, or the page has been emptied by other operations diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 299849ad365..5b3624a4a2d 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -453,8 +453,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) ref->page = NULL; ref->addr = NULL; ref->state = WT_REF_DELETED; - WT_ERR(__wt_row_ikey_incr( - session, root, 0, "", 1, &ref->key.ikey)); + WT_ERR(__wt_row_ikey_incr(session, root, 0, "", 1, ref)); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -634,7 +633,7 @@ __btree_page_sizes(WT_SESSION_IMPL *session) WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage); cache_size = S2C(session)->cache_size; if (cache_size > 0) - btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 2); + btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 4); /* * Get the split percentage (reconciliation splits pages into smaller diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index b5140beb792..e177b05cd24 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -165,6 +165,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && + page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); @@ -611,7 +612,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) WT_ERR(__wt_row_ikey_incr(session, page, WT_PAGE_DISK_OFFSET(page, 
cell), - current->data, current->size, &ref->key.ikey)); + current->data, current->size, ref)); *sizep += sizeof(WT_IKEY) + current->size; break; diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 1cf616a2f6b..d6c20556a9a 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1858,8 +1858,7 @@ __slvg_row_build_internal( WT_ERR(__slvg_row_build_leaf(session, trk, ref, ss)); } else { WT_ERR(__wt_row_ikey_incr(session, page, 0, - trk->row_start.data, trk->row_start.size, - &ref->key.ikey)); + trk->row_start.data, trk->row_start.size, ref)); WT_ERR(__slvg_ovfl_ref_all(session, trk)); } @@ -1981,8 +1980,8 @@ __slvg_row_build_leaf( */ rip = page->pg_row_d + skip_start; WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0)); - WT_ERR(__wt_row_ikey_incr(session, - ref->home, 0, key->data, key->size, &ref->key.ikey)); + WT_ERR(__wt_row_ikey_incr( + session, ref->home, 0, key->data, key->size, ref)); /* Set the referenced flag on overflow pages we're using. */ if (trk->trk_ovfl_cnt != 0) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 6ebd4609efa..95fb9c68a86 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -281,8 +281,8 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, if (parent->type == WT_PAGE_ROW_INT) { if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) { __wt_ref_key(parent, ref, &key, &size); - WT_RET(__wt_row_ikey(session, 0, key, size, &ikey)); - ref->key.ikey = ikey; + WT_RET(__wt_row_ikey(session, 0, key, size, ref)); + ikey = ref->key.ikey; } else { WT_RET(__split_ovfl_key_cleanup(session, parent, ref)); *parent_decrp += sizeof(WT_IKEY) + ikey->size; @@ -454,8 +454,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) ref->addr = NULL; if (parent->type == WT_PAGE_ROW_INT) { __wt_ref_key(parent, *parent_refp, &p, &size); - WT_ERR( - __wt_row_ikey(session, 0, p, size, &ref->key.ikey)); + WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); parent_incr += sizeof(WT_IKEY) + size; } else 
ref->key.recno = (*parent_refp)->key.recno; @@ -468,7 +467,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) /* Mark it dirty. */ WT_ERR(__wt_page_modify_init(session, child)); - __wt_page_only_modify_set(session, child); + __wt_page_modify_set(session, child); /* * Once the split goes live, the newly created internal pages @@ -761,8 +760,8 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: ikey = multi->key.ikey; - WT_RET(__wt_row_ikey(session, 0, - WT_IKEY_DATA(ikey), ikey->size, &ref->key.ikey)); + WT_RET(__wt_row_ikey( + session, 0, WT_IKEY_DATA(ikey), ikey->size, ref)); incr += sizeof(WT_IKEY) + ikey->size; break; default: @@ -855,7 +854,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, for (i = 0, deleted_entries = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); - if (__wt_delete_page_skip(session, next_ref) && + if (next_ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, next_ref) && WT_ATOMIC_CAS4(next_ref->state, WT_REF_DELETED, WT_REF_SPLIT)) deleted_entries++; @@ -1139,15 +1139,23 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); /* - * The first page in the split is the current page, but we still need to - * create a replacement WT_REF and make a copy of the key (the original - * WT_REF is set to split-status and eventually freed). - * - * The new reference is visible to readers once the split completes. + * The first page in the split is the current page, but we still have + * to create a replacement WT_REF, the original WT_REF will be set to + * split status and eventually freed. 
*/ WT_ERR(__wt_calloc_one(session, &split_ref[0])); child = split_ref[0]; *child = *ref; + + /* + * The new WT_REF is not quite identical: we have to instantiate a key, + * and the new reference is visible to readers once the split completes. + * + * The key-instantiation code checks for races, clear the key fields so + * we don't trigger them. + */ + child->key.recno = 0; + child->key.ikey = NULL; child->state = WT_REF_MEM; /* @@ -1167,8 +1175,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) } else WT_ERR(__wt_row_leaf_key( session, page, &page->pg_row_d[0], key, 1)); - WT_ERR(__wt_row_ikey( - session, 0, key->data, key->size, &child->key.ikey)); + WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + key->size; __wt_scr_free(session, &key); @@ -1187,7 +1194,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) child->state = WT_REF_MEM; WT_ERR(__wt_row_ikey(session, 0, WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), - &child->key.ikey)); + child)); parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins); @@ -1203,7 +1210,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) /* The new page is dirty by definition. 
*/ WT_ERR(__wt_page_modify_init(session, right)); - __wt_page_only_modify_set(session, right); + __wt_page_modify_set(session, right); /* * We modified the page above, which will have set the first dirty diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 2e34a925f84..b550158a5a9 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -9,8 +9,9 @@ #include "wt_internal.h" static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *); -static int __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *); -static int __stat_page_row_leaf(WT_PAGE *, WT_DSRC_STATS *); +static void __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *); +static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *); +static void __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *); /* * __wt_btree_stat_init -- @@ -89,18 +90,13 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) WT_STAT_INCRV(stats, btree_entries, pindex->entries); break; case WT_PAGE_COL_VAR: - WT_RET(__stat_page_col_var(page, stats)); - break; - case WT_PAGE_OVFL: - WT_STAT_INCR(stats, btree_overflow); + __stat_page_col_var(page, stats); break; case WT_PAGE_ROW_INT: - WT_STAT_INCR(stats, btree_row_internal); - pindex = WT_INTL_INDEX_COPY(page); - WT_STAT_INCRV(stats, btree_entries, pindex->entries); + __stat_page_row_int(session, page, stats); break; case WT_PAGE_ROW_LEAF: - WT_RET(__stat_page_row_leaf(page, stats)); + __stat_page_row_leaf(session, page, stats); break; WT_ILLEGAL_VALUE(session); } @@ -111,7 +107,7 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) * __stat_page_col_var -- * Stat a WT_PAGE_COL_VAR page. 
*/ -static int +static void __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) { WT_CELL *cell; @@ -119,29 +115,33 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) WT_COL *cip; WT_INSERT *ins; WT_UPDATE *upd; + uint64_t deleted_cnt, entry_cnt, ovfl_cnt; uint32_t i; int orig_deleted; unpack = &_unpack; + deleted_cnt = entry_cnt = ovfl_cnt = 0; WT_STAT_INCR(stats, btree_column_variable); /* - * Walk the page, counting regular and overflow data items, and checking - * to be sure any updates weren't deletions. If the item was updated, - * assume it was updated by an item of the same size (it's expensive to - * figure out if it will require the same space or not, especially if - * there's Huffman encoding). + * Walk the page counting regular items, adjusting if the item has been + * subsequently deleted or not. This is a mess because 10-item RLE might + * have 3 of the items subsequently deleted. Overflow items are harder, + * we can't know if an updated item will be an overflow item or not; do + * our best, and simply count every overflow item (or RLE set of items) + * we see. 
*/ WT_COL_FOREACH(page, cip, i) { if ((cell = WT_COL_PTR(page, cip)) == NULL) { orig_deleted = 1; - WT_STAT_INCR(stats, btree_column_deleted); + ++deleted_cnt; } else { orig_deleted = 0; __wt_cell_unpack(cell, unpack); - WT_STAT_INCRV( - stats, btree_entries, __wt_cell_rle(unpack)); + entry_cnt += __wt_cell_rle(unpack); + if (unpack->ovfl) + ++ovfl_cnt; } /* @@ -151,57 +151,128 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) { upd = ins->upd; if (WT_UPDATE_DELETED_ISSET(upd)) { - if (orig_deleted) - continue; - WT_STAT_INCR(stats, btree_column_deleted); - WT_STAT_DECR(stats, btree_entries); - } else { - if (!orig_deleted) - continue; - WT_STAT_DECR(stats, btree_column_deleted); - WT_STAT_INCR(stats, btree_entries); - } + if (!orig_deleted) { + ++deleted_cnt; + --entry_cnt; + } + } else + if (orig_deleted) { + --deleted_cnt; + ++entry_cnt; + } } } - return (0); + + /* Walk any append list. */ + WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) + ++entry_cnt; + + WT_STAT_INCRV(stats, btree_column_deleted, deleted_cnt); + WT_STAT_INCRV(stats, btree_entries, entry_cnt); + WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt); +} + +/* + * __stat_page_row_int -- + * Stat a WT_PAGE_ROW_INT page. + */ +static void +__stat_page_row_int( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK unpack; + WT_PAGE_INDEX *pindex; + uint32_t i, ovfl_cnt; + + btree = S2BT(session); + ovfl_cnt = 0; + + WT_STAT_INCR(stats, btree_row_internal); + + /* + * The number of entries tells us the number of items on row-store + * internal page. + */ + pindex = WT_INTL_INDEX_COPY(page); + WT_STAT_INCRV(stats, btree_entries, pindex->entries); + + /* + * Overflow keys are hard: we have to walk the disk image to count them, + * the in-memory representation of the page doesn't necessarily contain + * a reference to the original cell. 
+ */ + if (page->dsk != NULL) + WT_CELL_FOREACH(btree, page->dsk, cell, &unpack, i) { + __wt_cell_unpack(cell, &unpack); + if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) + ++ovfl_cnt; + } + + WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt); } /* * __stat_page_row_leaf -- * Stat a WT_PAGE_ROW_LEAF page. */ -static int -__stat_page_row_leaf(WT_PAGE *page, WT_DSRC_STATS *stats) +static void +__stat_page_row_leaf( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) { + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK unpack; WT_INSERT *ins; WT_ROW *rip; WT_UPDATE *upd; - uint32_t cnt, i; + uint32_t entry_cnt, i, ovfl_cnt; + + btree = S2BT(session); + entry_cnt = ovfl_cnt = 0; WT_STAT_INCR(stats, btree_row_leaf); /* - * Stat any K/V pairs inserted into the page before the first from-disk + * Walk any K/V pairs inserted into the page before the first from-disk * key on the page. */ - cnt = 0; WT_SKIP_FOREACH(ins, WT_ROW_INSERT_SMALLEST(page)) if (!WT_UPDATE_DELETED_ISSET(ins->upd)) - ++cnt; + ++entry_cnt; - /* Stat the page's K/V pairs. */ + /* + * Walk the page's K/V pairs. Count overflow values, where an overflow + * item is any on-disk overflow item that hasn't been updated. + */ WT_ROW_FOREACH(page, rip, i) { upd = WT_ROW_UPDATE(page, rip); if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd)) - ++cnt; + ++entry_cnt; + if (upd == NULL && (cell = + __wt_row_leaf_value_cell(page, rip, NULL)) != NULL && + __wt_cell_type(cell) == WT_CELL_VALUE_OVFL) + ++ovfl_cnt; - /* Stat inserted K/V pairs. */ + /* Walk K/V pairs inserted after the on-page K/V pair. */ WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip)) if (!WT_UPDATE_DELETED_ISSET(ins->upd)) - ++cnt; + ++entry_cnt; } - WT_STAT_INCRV(stats, btree_entries, cnt); + /* + * Overflow keys are hard: we have to walk the disk image to count them, + * the in-memory representation of the page doesn't necessarily contain + * a reference to the original cell. 
+ */ + if (page->dsk != NULL) + WT_CELL_FOREACH(btree, page->dsk, cell, &unpack, i) { + __wt_cell_unpack(cell, &unpack); + if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL) + ++ovfl_cnt; + } - return (0); + WT_STAT_INCRV(stats, btree_entries, entry_cnt); + WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt); } diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index d925eefc2fe..bc5d1051b1e 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -113,6 +113,13 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) if (walk == NULL) break; + page = walk->page; + mod = page->modify; + + /* Skip clean pages. */ + if (!__wt_page_is_modified(page)) + continue; + /* * Write dirty pages, unless we can be sure they only * became dirty after the checkpoint started. @@ -125,23 +132,27 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) * (3) the first dirty update on the page is * sufficiently recent that the checkpoint * transaction would skip them. + * + * Mark the tree dirty: the checkpoint marked it clean + * and we can't skip future checkpoints until this page + * is written. 
*/ - page = walk->page; - mod = page->modify; - if (__wt_page_is_modified(page) && - (WT_PAGE_IS_INTERNAL(page) || - !F_ISSET(txn, TXN_HAS_SNAPSHOT) || - TXNID_LE(mod->first_dirty_txn, txn->snap_max))) { - if (WT_PAGE_IS_INTERNAL(page)) { - internal_bytes += - page->memory_footprint; - ++internal_pages; - } else { - leaf_bytes += page->memory_footprint; - ++leaf_pages; - } - WT_ERR(__wt_reconcile(session, walk, NULL, 0)); + if (!WT_PAGE_IS_INTERNAL(page) && + F_ISSET(txn, TXN_HAS_SNAPSHOT) && + TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { + __wt_page_modify_set(session, page); + continue; + } + + if (WT_PAGE_IS_INTERNAL(page)) { + internal_bytes += + page->memory_footprint; + ++internal_pages; + } else { + leaf_bytes += page->memory_footprint; + ++leaf_pages; } + WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } break; } diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 10dd5b12936..917e0c54a30 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -20,12 +20,11 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_DECL_RET; WT_PAGE *page; WT_PAGE_INDEX *pindex; - WT_REF *couple, *ref; - int descending, prev, skip; + WT_REF *couple, *couple_orig, *ref; + int prev, skip; uint32_t slot; btree = S2BT(session); - descending = 0; /* * Tree walks are special: they look inside page structures that splits @@ -79,7 +78,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session, * here. We check when discarding pages that we're not discarding that * page, so this clear must be done before the page is released. */ - couple = ref = *refp; + couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start of the tree. */ @@ -102,29 +101,6 @@ ascend: /* /* Figure out the current slot in the WT_REF array. */ __wt_page_refp(session, ref, &pindex, &slot); - if (0) { -restart: /* - * The page we're moving to might have split, in which case find - * the last position we held. - * - * If we were starting a tree walk, begin again. 
- * - * If we were in the process of descending, repeat the descent. - * If we were moving within a single level of the tree, repeat - * the last move. - */ - ref = couple; - if (ref == &btree->root) { - ref = &btree->root; - if (ref->page == NULL) - goto done; - goto descend; - } - __wt_page_refp(session, ref, &pindex, &slot); - if (descending) - goto descend; - } - for (;;) { /* * If we're at the last/first slot on the page, return this page @@ -152,14 +128,11 @@ restart: /* /* * Locate the reference to our parent page then * swap our child hazard pointer for the parent. - * We don't handle a restart return because it - * would require additional complexity in the - * restart code (ascent code somewhat like the - * descent code already there), and it's not a - * possible return: we're moving to the parent - * of the current child, not another child of - * the same parent, there's no way our parent - * split. + * We don't handle restart or not-found returns. + * It would require additional complexity and is + * not a possible return: we're moving to the + * parent of the current child page, our parent + * reference can't have split or been evicted. */ __wt_page_refp(session, ref, &pindex, &slot); if ((ret = __wt_page_swap( @@ -182,7 +155,7 @@ restart: /* if (walkcntp != NULL) ++*walkcntp; - for (descending = 0;;) { + for (;;) { ref = pindex->index[slot]; if (LF_ISSET(WT_READ_CACHE)) { @@ -198,7 +171,8 @@ restart: /* * Avoid pulling a deleted page back in to try * to delete it again. */ - if (__wt_delete_page_skip(session, ref)) + if (ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, ref)) break; /* * If deleting a range, try to delete the page @@ -232,26 +206,67 @@ restart: /* } } else { /* - * If iterating a cursor, try to skip deleted - * pages that are visible to us. + * Try to skip deleted pages visible to us. 
*/ - if (__wt_delete_page_skip(session, ref)) + if (ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, ref)) break; } ret = __wt_page_swap(session, couple, ref, flags); + + /* + * Not-found is an expected return when only walking + * in-cache pages. + */ if (ret == WT_NOTFOUND) { ret = 0; break; } - if (ret == WT_RESTART) - goto restart; + + /* + * The page we're moving to might have split, in which + * case move to the last position we held. + */ + if (ret == WT_RESTART) { + ret = 0; + + /* + * If a new walk that never coupled from the + * root to a new saved position in the tree, + * restart the walk. + */ + if (couple == &btree->root) { + ref = &btree->root; + if (ref->page == NULL) + goto done; + goto descend; + } + + /* + * If restarting from some original position, + * repeat the increment or decrement we made at + * that time. Otherwise, couple is an internal + * page we've acquired after moving from that + * starting position and we can treat it as a + * new page. This works because we never acquire + * a hazard pointer on a leaf page we're not + * going to return to our caller, this will quit + * work if that ever changes. + */ + WT_ASSERT(session, + couple == couple_orig || + WT_PAGE_IS_INTERNAL(couple->page)); + ref = couple; + __wt_page_refp(session, ref, &pindex, &slot); + if (couple == couple_orig) + break; + } WT_ERR(ret); /* - * Entering a new page: configure for traversal of any - * internal page's children, else return (or optionally - * skip), the leaf page. + * A new page: configure for traversal of any internal + * page's children, else return the leaf page. */ descend: couple = ref; page = ref->page; @@ -259,10 +274,7 @@ descend: couple = ref; page->type == WT_PAGE_COL_INT) { pindex = WT_INTL_INDEX_COPY(page); slot = prev ? 
pindex->entries - 1 : 0; - descending = 1; - } else if (LF_ISSET(WT_READ_SKIP_LEAF)) - goto ascend; - else { + } else { *refp = ref; goto done; } diff --git a/src/btree/row_key.c b/src/btree/row_key.c index 92cfd1e4273..f2868afe13a 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -439,7 +439,7 @@ next: switch (direction) { (void)__wt_row_leaf_key_info( page, copy, &ikey, &cell, NULL, NULL); if (ikey == NULL) { - WT_ERR(__wt_row_ikey(session, + WT_ERR(__wt_row_ikey_alloc(session, WT_PAGE_DISK_OFFSET(page, cell), keyb->data, keyb->size, &ikey)); @@ -462,15 +462,37 @@ err: __wt_scr_free(session, &tmp); } /* + * __wt_row_ikey_alloc -- + * Instantiate a key in a WT_IKEY structure. + */ +int +__wt_row_ikey_alloc(WT_SESSION_IMPL *session, + uint32_t cell_offset, const void *key, size_t size, WT_IKEY **ikeyp) +{ + WT_IKEY *ikey; + + /* + * Allocate memory for the WT_IKEY structure and the key, then copy + * the key into place. + */ + WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey)); + ikey->size = WT_STORE_SIZE(size); + ikey->cell_offset = cell_offset; + memcpy(WT_IKEY_DATA(ikey), key, size); + *ikeyp = ikey; + return (0); +} + +/* * __wt_row_ikey_incr -- * Instantiate a key in a WT_IKEY structure and increment the page's * memory footprint. 
*/ int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, - uint32_t cell_offset, const void *key, size_t size, void *ikeyp) + uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) { - WT_RET(__wt_row_ikey(session, cell_offset, key, size, ikeyp)); + WT_RET(__wt_row_ikey(session, cell_offset, key, size, ref)); __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size); @@ -483,19 +505,30 @@ __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, */ int __wt_row_ikey(WT_SESSION_IMPL *session, - uint32_t cell_offset, const void *key, size_t size, void *ikeyp) + uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) { WT_IKEY *ikey; + WT_RET(__wt_row_ikey_alloc(session, cell_offset, key, size, &ikey)); + +#ifdef HAVE_DIAGNOSTIC + { + uintptr_t oldv; + + oldv = (uintptr_t)ref->key.ikey; + WT_DIAGNOSTIC_YIELD; + /* - * Allocate memory for the WT_IKEY structure and the key, then copy - * the key into place. + * We should never overwrite an instantiated key, and we should + * never instantiate a key after a split. 
*/ - WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey)); - ikey->size = WT_STORE_SIZE(size); - ikey->cell_offset = cell_offset; - memcpy(WT_IKEY_DATA(ikey), key, size); - - *(WT_IKEY **)ikeyp = ikey; + WT_ASSERT(session, oldv == 0 || (oldv & WT_IK_FLAG) != 0); + WT_ASSERT(session, ref->state != WT_REF_SPLIT); + WT_ASSERT(session, + WT_ATOMIC_CAS8(ref->key.ikey, (WT_IKEY *)oldv, ikey)); + } +#else + ref->key.ikey = ikey; +#endif return (0); } diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 0562f9cfc34..6b9824fc415 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -762,8 +762,7 @@ __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config) WT_ERR(__conn_statistics_config(session, config_cfg)); WT_ERR(__wt_async_reconfig(session, config_cfg)); - WT_ERR(__wt_cache_config(session, config_cfg)); - WT_ERR(__wt_cache_pool_config(session, config_cfg)); + WT_ERR(__wt_cache_config(session, 1, config_cfg)); WT_ERR(__wt_checkpoint_server_create(session, config_cfg)); WT_ERR(__wt_lsm_manager_reconfig(session, config_cfg)); WT_ERR(__wt_statlog_create(session, config_cfg)); diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index c513d46137c..4a7e15044de 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -9,33 +9,28 @@ #include "wt_internal.h" /* - * __wt_cache_config -- + * __cache_config_local -- * Configure the underlying cache. */ -int -__wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]) +static int +__cache_config_local(WT_SESSION_IMPL *session, int shared, const char *cfg[]) { WT_CACHE *cache; WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; + uint32_t evict_workers_max, evict_workers_min; conn = S2C(session); cache = conn->cache; /* * If not using a shared cache configure the cache size, otherwise - * check for a reserved size. + * check for a reserved size. All other settings are independent of + * whether we are using a shared cache or not. 
*/ - if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) { + if (!shared) { WT_RET(__wt_config_gets(session, cfg, "cache_size", &cval)); conn->cache_size = (uint64_t)cval.val; - } else { - WT_RET(__wt_config_gets( - session, cfg, "shared_cache.reserve", &cval)); - if (cval.val == 0) - WT_RET(__wt_config_gets( - session, cfg, "shared_cache.chunk", &cval)); - cache->cp_reserved = (uint64_t)cval.val; } WT_RET(__wt_config_gets(session, cfg, "cache_overhead", &cval)); @@ -57,16 +52,64 @@ __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_RET(__wt_config_gets(session, cfg, "eviction.threads_max", &cval)); WT_ASSERT(session, cval.val > 0); - conn->evict_workers_max = (u_int)cval.val - 1; + evict_workers_max = (uint32_t)cval.val - 1; WT_RET(__wt_config_gets(session, cfg, "eviction.threads_min", &cval)); WT_ASSERT(session, cval.val > 0); - conn->evict_workers_min = (u_int)cval.val - 1; + evict_workers_min = (uint32_t)cval.val - 1; - if (conn->evict_workers_min > conn->evict_workers_max) + if (evict_workers_min > evict_workers_max) WT_RET_MSG(session, EINVAL, "eviction=(threads_min) cannot be greater than " "eviction=(threads_max)"); + conn->evict_workers_max = evict_workers_max; + conn->evict_workers_min = evict_workers_min; + + return (0); +} + +/* + * __wt_cache_config -- + * Configure or reconfigure the current cache and shared cache. 
+ */ +int +__wt_cache_config(WT_SESSION_IMPL *session, int reconfigure, const char *cfg[]) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + int now_shared, was_shared; + + conn = S2C(session); + + WT_ASSERT(session, conn->cache != NULL); + + WT_RET(__wt_config_gets_none(session, cfg, "shared_cache.name", &cval)); + now_shared = cval.len != 0; + was_shared = F_ISSET(conn, WT_CONN_CACHE_POOL); + + /* Cleanup if reconfiguring */ + if (reconfigure && was_shared && !now_shared) + /* Remove ourselves from the pool if necessary */ + WT_RET(__wt_conn_cache_pool_destroy(session)); + else if (reconfigure && !was_shared && now_shared) + /* + * Cache size will now be managed by the cache pool - the + * start size always needs to be zero to allow the pool to + * manage how much memory is in-use. + */ + conn->cache_size = 0; + + /* + * Always setup the local cache - it's used even if we are + * participating in a shared cache. + */ + WT_RET(__cache_config_local(session, now_shared, cfg)); + if (now_shared) { + WT_RET(__wt_cache_pool_config(session, cfg)); + WT_ASSERT(session, F_ISSET(conn, WT_CONN_CACHE_POOL)); + if (!was_shared) + WT_RET(__wt_conn_cache_pool_open(session)); + } return (0); } @@ -84,19 +127,14 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); - WT_ASSERT(session, conn->cache == NULL || - (F_ISSET(conn, WT_CONN_CACHE_POOL) && conn->cache != NULL)); + WT_ASSERT(session, conn->cache == NULL); WT_RET(__wt_calloc_one(session, &conn->cache)); cache = conn->cache; /* Use a common routine for run-time configuration options. */ - WT_RET(__wt_cache_config(session, cfg)); - - /* Add the configured cache to the cache pool. 
*/ - if (F_ISSET(conn, WT_CONN_CACHE_POOL)) - WT_RET(__wt_conn_cache_pool_open(session)); + WT_RET(__wt_cache_config(session, 0, cfg)); /* * The target size must be lower than the trigger size or we will never diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index f5b78e33b04..7bf090496a8 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -36,17 +36,17 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) WT_CONNECTION_IMPL *conn, *entry; WT_DECL_RET; char *pool_name; - int created, reconfiguring; + int created, updating; uint64_t chunk, reserve, size, used_cache; conn = S2C(session); - created = reconfiguring = 0; + created = updating = 0; pool_name = NULL; cp = NULL; size = 0; if (F_ISSET(conn, WT_CONN_CACHE_POOL)) - reconfiguring = 1; + updating = 1; else { WT_RET(__wt_config_gets_none( session, cfg, "shared_cache.name", &cval)); @@ -81,7 +81,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) __wt_spin_lock(session, &__wt_process.spinlock); if (__wt_process.cache_pool == NULL) { - WT_ASSERT(session, !reconfiguring); + WT_ASSERT(session, !updating); /* Create a cache pool. */ WT_ERR(__wt_calloc_one(session, &cp)); created = 1; @@ -96,7 +96,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) __wt_process.cache_pool = cp; WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Created cache pool %s", cp->name)); - } else if (!reconfiguring && !WT_STRING_MATCH( + } else if (!updating && !WT_STRING_MATCH( __wt_process.cache_pool->name, pool_name, strlen(pool_name))) /* Only a single cache pool is supported. */ WT_ERR_MSG(session, WT_ERROR, @@ -109,7 +109,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) * The cache pool requires a reference count to avoid a race between * configuration/open and destroy. 
*/ - if (!reconfiguring) + if (!updating) ++cp->refs; /* @@ -157,7 +157,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) if (__wt_config_gets(session, &cfg[1], "shared_cache.reserve", &cval) == 0 && cval.val != 0) reserve = (uint64_t)cval.val; - else if (reconfiguring) + else if (updating) reserve = conn->cache->cp_reserved; else reserve = chunk; @@ -171,18 +171,23 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) used_cache += entry->cache->cp_reserved; } + /* Ignore our old allocation if reconfiguring */ + if (updating) + used_cache -= conn->cache->cp_reserved; if (used_cache + reserve > size) WT_ERR_MSG(session, EINVAL, "Shared cache unable to accommodate this configuration. " - "Shared cache size: %" PRIu64 ", reserved: %" PRIu64, + "Shared cache size: %" PRIu64 ", requested min: %" PRIu64, size, used_cache + reserve); /* The configuration is verified - it's safe to update the pool. */ cp->size = size; cp->chunk = chunk; + conn->cache->cp_reserved = reserve; + /* Wake up the cache pool server so any changes are noticed. */ - if (reconfiguring) + if (updating) WT_ERR(__wt_cond_signal( session, __wt_process.cache_pool->cache_pool_cond)); @@ -192,7 +197,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) F_SET(conn, WT_CONN_CACHE_POOL); err: __wt_spin_unlock(session, &__wt_process.spinlock); - if (!reconfiguring) + if (!updating) __wt_free(session, pool_name); if (ret != 0 && created) { __wt_free(session, cp->name); diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index a5512352f2c..7756158594c 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -375,6 +375,8 @@ __conn_btree_open( F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && !LF_ISSET(WT_DHANDLE_LOCK_ONLY)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING)); + /* * If the handle is already open, it has to be closed so it can be * reopened with a new configuration. 
We don't need to check again: @@ -539,6 +541,48 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, } /* + * __wt_conn_btree_apply_single_ckpt -- + * Decode any checkpoint information from the configuration string then + * call btree apply single. + */ +int +__wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, + const char *uri, + int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + const char *checkpoint; + + checkpoint = NULL; + + /* + * This function exists to handle checkpoint configuration. Callers + * that never open a checkpoint call the underlying function directly. + */ + WT_RET_NOTFOUND_OK( + __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval)); + if (cval.len != 0) { + /* + * The internal checkpoint name is special, find the last + * unnamed checkpoint of the object. + */ + if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) { + WT_RET(__wt_meta_checkpoint_last_name( + session, uri, &checkpoint)); + } else + WT_RET(__wt_strndup( + session, cval.str, cval.len, &checkpoint)); + } + + ret = __wt_conn_btree_apply_single(session, uri, checkpoint, func, cfg); + + __wt_free(session, checkpoint); + + return (ret); +} + +/* * __wt_conn_btree_apply_single -- * Apply a function to a single btree handle that couldn't be locked * (attempting to get the handle returned EBUSY). 
@@ -580,10 +624,10 @@ __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, ret = func(session, cfg)); } __wt_spin_unlock(session, &dhandle->close_lock); - WT_ERR(ret); + WT_RET(ret); } -err: return (ret); + return (0); } /* @@ -683,20 +727,25 @@ __wt_conn_dhandle_discard_single(WT_SESSION_IMPL *session, int final) { WT_DATA_HANDLE *dhandle; WT_DECL_RET; + int tret; dhandle = session->dhandle; if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) { - ret = __wt_conn_btree_sync_and_close(session, 0); - if (!final) - WT_RET(ret); + tret = __wt_conn_btree_sync_and_close(session, 0); + if (final && tret != 0) { + __wt_err(session, tret, + "Final close of %s failed", dhandle->name); + WT_TRET(tret); + } else if (!final) + WT_RET(tret); } /* * Kludge: interrupt the eviction server in case it is holding the * handle list lock. */ - F_SET(S2C(session)->cache, WT_EVICT_CLEAR_WALKS); + F_SET(S2C(session)->cache, WT_CACHE_CLEAR_WALKS); /* Try to remove the handle, protected by the data handle lock. */ WT_WITH_DHANDLE_LOCK(session, diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 36d4d539d92..315e93c1875 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -347,6 +347,124 @@ err: __wt_err(session, ret, "log close server error"); } /* + * Simple structure for sorting written slots. + */ +typedef struct { + WT_LSN lsn; + uint32_t slot_index; +} WT_LOG_WRLSN_ENTRY; + +/* + * __log_wrlsn_cmp -- + * The log wrlsn comparison function for qsort. + */ +static int +__log_wrlsn_cmp(const void *a, const void *b) +{ + WT_LOG_WRLSN_ENTRY *ae, *be; + + ae = (WT_LOG_WRLSN_ENTRY *)a; + be = (WT_LOG_WRLSN_ENTRY *)b; + return (LOG_CMP(&ae->lsn, &be->lsn)); +} + +/* + * __log_wrlsn_server -- + * The log wrlsn server thread. 
+ */ +static void * +__log_wrlsn_server(void *arg) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LOG_WRLSN_ENTRY written[SLOT_POOL]; + WT_LOGSLOT *slot; + WT_SESSION_IMPL *session; + size_t written_i; + uint32_t i, save_i; + int yield; + + session = arg; + conn = S2C(session); + log = conn->log; + yield = 0; + while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { + /* + * No need to use the log_slot_lock because the slot pool + * is statically allocated and any slot in the + * WT_LOG_SLOT_WRITTEN state is exclusively ours for now. + */ + i = 0; + written_i = 0; + /* + * Walk the array once saving any slots that are in the + * WT_LOG_SLOT_WRITTEN state. + */ + while (i < SLOT_POOL) { + save_i = i; + slot = &log->slot_pool[i++]; + if (slot->slot_state != WT_LOG_SLOT_WRITTEN) + continue; + written[written_i].slot_index = save_i; + written[written_i++].lsn = slot->slot_release_lsn; + } + /* + * If we found any written slots process them. We sort them + * based on the release LSN, and then look for them in order. + */ + if (written_i > 0) { + yield = 0; + qsort(written, written_i, sizeof(WT_LOG_WRLSN_ENTRY), + __log_wrlsn_cmp); + /* + * We know the written array is sorted by LSN. Go + * through them either advancing write_lsn or stop + * as soon as one is not in order. + */ + for (i = 0; i < written_i; i++) { + if (LOG_CMP(&log->write_lsn, + &written[i].lsn) != 0) + break; + /* + * If we get here we have a slot to process. + * Advance the LSN and process the slot. + */ + slot = &log->slot_pool[written[i].slot_index]; + WT_ASSERT(session, LOG_CMP(&written[i].lsn, + &slot->slot_release_lsn) == 0); + log->write_lsn = slot->slot_end_lsn; + WT_ERR(__wt_cond_signal(session, + log->log_write_cond)); + WT_STAT_FAST_CONN_INCR(session, log_write_lsn); + + /* + * Signal the close thread if needed. 
+ */ + if (F_ISSET(slot, SLOT_CLOSEFH)) + WT_ERR(__wt_cond_signal(session, + conn->log_close_cond)); + WT_ERR(__wt_log_slot_free(session, slot)); + } + } + /* + * If we saw a later write, we always want to yield because + * we know something is in progress. + */ + if (yield++ < 1000) + __wt_yield(); + else + /* Wait until the next event. */ + WT_ERR(__wt_cond_wait(session, + conn->log_wrlsn_cond, 100000)); + } + + if (0) +err: __wt_err(session, ret, "log wrlsn server error"); + return (NULL); +} + +/* * __log_server -- * The log server thread. */ @@ -479,12 +597,24 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) "log close server", 0, &conn->log_close_cond)); /* - * Start the thread. + * Start the log file close thread. */ WT_RET(__wt_thread_create(conn->log_close_session, &conn->log_close_tid, __log_close_server, conn->log_close_session)); conn->log_close_tid_set = 1; + /* + * Start the log write LSN thread. It is not configurable. + * If logging is enabled, this thread runs. + */ + WT_RET(__wt_open_internal_session( + conn, "log-wrlsn-server", 0, 0, &conn->log_wrlsn_session)); + WT_RET(__wt_cond_alloc(conn->log_wrlsn_session, + "log write lsn server", 0, &conn->log_wrlsn_cond)); + WT_RET(__wt_thread_create(conn->log_wrlsn_session, + &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session)); + conn->log_wrlsn_tid_set = 1; + /* If no log thread services are configured, we're done. 
*/ if (!FLD_ISSET(conn->log_flags, (WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC))) @@ -557,6 +687,17 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) WT_TRET(wt_session->close(wt_session, NULL)); conn->log_close_session = NULL; } + if (conn->log_wrlsn_tid_set) { + WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond)); + WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); + conn->log_wrlsn_tid_set = 0; + } + WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); + if (conn->log_wrlsn_session != NULL) { + wt_session = &conn->log_wrlsn_session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + conn->log_wrlsn_session = NULL; + } WT_TRET(__wt_log_close(session)); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index d4f6cf4869c..0a3d35ac0b1 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -55,9 +55,6 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) */ WT_WRITE_BARRIER(); - /* Connect to a cache pool. */ - WT_RET(__wt_cache_pool_config(session, cfg)); - /* Create the cache. 
*/ WT_RET(__wt_cache_create(session, cfg)); @@ -113,6 +110,9 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) F_CLR(conn, WT_CONN_SERVER_RUN); WT_TRET(__wt_async_destroy(session)); WT_TRET(__wt_lsm_manager_destroy(session)); + + F_SET(conn, WT_CONN_CLOSING); + WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, 1)); WT_TRET(__wt_sweep_destroy(session)); diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 67814dc330b..c38e0ef125f 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -205,7 +205,7 @@ __statlog_apply(WT_SESSION_IMPL *session, const char *cfg[]) if (WT_PREFIX_MATCH(dhandle->name, *p)) { WT_WITHOUT_DHANDLE(session, ret = __statlog_dump(session, dhandle->name, 0)); - WT_RET(ret); + return (ret); } return (0); } diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index abc6a106cc9..bf086bcc813 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -26,7 +26,7 @@ __curindex_get_value(WT_CURSOR *cursor, ...) WT_CURSOR_NEEDVALUE(cursor); va_start(ap, cursor); - if (F_ISSET(cursor, WT_CURSTD_RAW)) { + if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { ret = __wt_schema_project_merge(session, cindex->cg_cursors, cindex->value_plan, cursor->value_format, &cursor->value); diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index bebce217a6a..74b998876c2 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -354,7 +354,6 @@ __curstat_file_init(WT_SESSION_IMPL *session, /* Release the handle, we're done with it. 
*/ WT_TRET(__wt_session_release_btree(session)); - WT_RET(ret); return (ret); } diff --git a/src/docs/spell.ok b/src/docs/spell.ok index f333a8fff58..df31a272361 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -87,6 +87,7 @@ ack'ed ajn alloc allocator +allocators allocsize ao api diff --git a/src/docs/tune-memory-allocator.dox b/src/docs/tune-memory-allocator.dox index ad052bc4ec3..a619708f816 100644 --- a/src/docs/tune-memory-allocator.dox +++ b/src/docs/tune-memory-allocator.dox @@ -10,4 +10,9 @@ Google's tcmalloc</a>, or <a href="http://www.canonware.com/jemalloc">FreeBSD's jemalloc</a>), can dramatically improve throughput. +As different memory allocators have different overhead and different +workloads will have different heap allocation sizes and patterns, +applications may need to set their allocator overhead using the +\c cache_overhead configuration to the wiredtiger_open:: call. + */ diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 9e39fcc7a2c..1030c0aa818 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -72,6 +72,17 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) WT_READ_CACHE | WT_READ_NO_EVICT)); switch (syncop) { + case WT_SYNC_DISCARD: + /* + * Check that the page is clean: if we see a dirty page + * (including a dirty parent page after evicting a + * child), give up. The higher level can try to + * checkpoint, but during discard we aren't set up to + * manage checkpoints. + */ + if (__wt_page_is_modified(page)) + WT_ERR(EBUSY); + /* FALLTHROUGH */ case WT_SYNC_CLOSE: /* * Evict the page. @@ -84,29 +95,6 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) !F_ISSET(page->modify, WT_PM_REC_EMPTY)) WT_ERR(__wt_evict(session, ref, 1)); break; - case WT_SYNC_DISCARD: - /* - * Ordinary discard of the page, whether clean or dirty. - * If we see a dirty page in an ordinary discard (e.g., - * from sweep), give up: an update must have happened - * since the file was selected for sweeping. 
- */ - if (__wt_page_is_modified(page)) - WT_ERR(EBUSY); - - /* - * If the page contains an update that is too recent to - * evict, stop. This should never happen during - * connection close, but in other paths our caller - * should be prepared to deal with this case. - */ - if (page->modify != NULL && - !__wt_txn_visible_all(session, - page->modify->rec_max_txn)) - WT_ERR(EBUSY); - - __wt_evict_page_clean_update(session, ref); - break; case WT_SYNC_DISCARD_FORCE: /* * Forced discard of the page, whether clean or dirty. diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index e3d8ea6a4e0..640c9b0541d 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -194,6 +194,17 @@ __evict_server(void *arg) ret = 0; } } + /* + * Clear the walks so we don't pin pages while asleep, + * otherwise we can block applications evicting large pages. + */ + if (!F_ISSET(cache, WT_CACHE_STUCK)) { + WT_ERR(__evict_clear_walks(session)); + + /* Next time we wake up, reverse the sweep direction. */ + cache->flags ^= WT_CACHE_WALK_REVERSE; + } + WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping")); /* Don't rely on signals: check periodically. 
*/ WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000)); @@ -237,7 +248,7 @@ __evict_workers_resize(WT_SESSION_IMPL *session) WT_DECL_RET; WT_EVICT_WORKER *workers; size_t alloc; - u_int i; + uint32_t i; conn = S2C(session); @@ -321,7 +332,7 @@ __wt_evict_destroy(WT_SESSION_IMPL *session) WT_DECL_RET; WT_EVICT_WORKER *workers; WT_SESSION *wt_session; - u_int i; + uint32_t i; conn = S2C(session); cache = conn->cache; @@ -432,17 +443,17 @@ __evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp) (cache->eviction_dirty_target * bytes_max) / 100) /* Ignore clean pages unless the cache is too large */ LF_SET(WT_EVICT_PASS_DIRTY); - else if (F_ISSET(cache, WT_EVICT_WOULD_BLOCK)) { + else if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) { /* * Evict pages with oldest generation (which would otherwise * block application threads) set regardless of whether we have * reached the eviction trigger. */ LF_SET(WT_EVICT_PASS_WOULD_BLOCK); - F_CLR(cache, WT_EVICT_WOULD_BLOCK); + F_CLR(cache, WT_CACHE_WOULD_BLOCK); } - if (F_ISSET(cache, WT_EVICT_STUCK)) + if (F_ISSET(cache, WT_CACHE_STUCK)) LF_SET(WT_EVICT_PASS_AGGRESSIVE); *flagsp = flags; @@ -475,8 +486,8 @@ __evict_pass(WT_SESSION_IMPL *session) * If there is a request to clear eviction walks, do that now, * before checking if the cache is full. */ - if (F_ISSET(cache, WT_EVICT_CLEAR_WALKS)) { - F_CLR(cache, WT_EVICT_CLEAR_WALKS); + if (F_ISSET(cache, WT_CACHE_CLEAR_WALKS)) { + F_CLR(cache, WT_CACHE_CLEAR_WALKS); WT_RET(__evict_clear_walks(session)); WT_RET(__wt_cond_signal( session, cache->evict_waiter_cond)); @@ -493,7 +504,8 @@ __evict_pass(WT_SESSION_IMPL *session) * Start a worker if we have capacity and we haven't reached * the eviction targets. 
*/ - if (LF_ISSET(WT_EVICT_PASS_ALL | WT_EVICT_PASS_DIRTY) && + if (LF_ISSET(WT_EVICT_PASS_ALL | + WT_EVICT_PASS_DIRTY | WT_EVICT_PASS_WOULD_BLOCK) && conn->evict_workers < conn->evict_workers_max) { WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "Starting evict worker: %"PRIu32"\n", @@ -527,10 +539,8 @@ __evict_pass(WT_SESSION_IMPL *session) * handles. */ __wt_sleep(0, 1000 * (long)loop); - if (F_ISSET(cache, WT_EVICT_STUCK)) - break; if (loop == 100) { - F_SET(cache, WT_EVICT_STUCK); + F_SET(cache, WT_CACHE_STUCK); WT_STAT_FAST_CONN_INCR( session, cache_eviction_slow); WT_RET(__wt_verbose( @@ -605,7 +615,7 @@ __evict_tree_walk_clear(WT_SESSION_IMPL *session) F_SET(session, WT_SESSION_CLEAR_EVICT_WALK); while (btree->evict_ref != NULL && ret == 0) { - F_SET(cache, WT_EVICT_CLEAR_WALKS); + F_SET(cache, WT_CACHE_CLEAR_WALKS); ret = __wt_cond_wait( session, cache->evict_waiter_cond, 100000); } @@ -792,21 +802,29 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) WT_ASSERT(session, cache->evict[0].ref != NULL); - /* Find the bottom 25% of read generations. */ - cutoff = (3 * __evict_read_gen(&cache->evict[0]) + - __evict_read_gen(&cache->evict[entries - 1])) / 4; - - /* - * Don't take less than 10% or more than 50% of entries, regardless. - * That said, if there is only one entry, which is normal when - * populating an empty file, don't exclude it. - */ - for (candidates = 1 + entries / 10; - candidates < entries / 2; - candidates++) - if (__evict_read_gen(&cache->evict[candidates]) > cutoff) - break; - cache->evict_candidates = candidates; + if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) + /* + * Take all candidates if we only gathered pages with an oldest + * read generation set. + */ + cache->evict_candidates = entries; + else { + /* Find the bottom 25% of read generations. 
*/ + cutoff = (3 * __evict_read_gen(&cache->evict[0]) + + __evict_read_gen(&cache->evict[entries - 1])) / 4; + /* + * Don't take less than 10% or more than 50% of entries, + * regardless. That said, if there is only one entry, which is + * normal when populating an empty file, don't exclude it. + */ + for (candidates = 1 + entries / 10; + candidates < entries / 2; + candidates++) + if (__evict_read_gen( + &cache->evict[candidates]) > cutoff) + break; + cache->evict_candidates = candidates; + } /* If we have more than the minimum number of entries, clear them. */ if (cache->evict_entries > WT_EVICT_WALK_BASE) { @@ -907,7 +925,7 @@ retry: while (slot < max_entries && ret == 0) { * If another thread is waiting on the eviction server to clear * the walk point in a tree, give up. */ - if (F_ISSET(cache, WT_EVICT_CLEAR_WALKS)) + if (F_ISSET(cache, WT_CACHE_CLEAR_WALKS)) break; /* @@ -917,7 +935,7 @@ retry: while (slot < max_entries && ret == 0) { if (!dhandle_locked) { for (spins = 0; (ret = __wt_spin_trylock( session, &conn->dhandle_lock, &id)) == EBUSY && - !F_ISSET(cache, WT_EVICT_CLEAR_WALKS); + !F_ISSET(cache, WT_CACHE_CLEAR_WALKS); spins++) { if (spins < 1000) __wt_yield(); @@ -1029,7 +1047,7 @@ retry: while (slot < max_entries && ret == 0) { * candidates and we aren't finding more. Take care not to skip files * on subsequent passes. 
*/ - if (!F_ISSET(cache, WT_EVICT_CLEAR_WALKS) && ret == 0 && + if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 && slot < max_entries && (retries < 2 || (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 && (slot == cache->evict_entries || slot > start_slot)))) { @@ -1096,8 +1114,11 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) cache->evict + cache->evict_slots); enough = internal_pages = restarts = 0; - walk_flags = - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; + walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT | + WT_READ_NO_GEN | WT_READ_NO_WAIT; + + if (F_ISSET(cache, WT_CACHE_WALK_REVERSE)) + walk_flags |= WT_READ_PREV; /* * Get some more eviction candidate pages. @@ -1181,7 +1202,7 @@ fast: /* If the page can't be evicted, give up. */ */ mod = page->modify; if (!modified && mod != NULL && !LF_ISSET( - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_WOULD_BLOCK) && + WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) && !__wt_txn_visible_all(session, mod->rec_max_txn)) continue; @@ -1355,8 +1376,8 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_server) WT_RET(ret); cache = S2C(session)->cache; - if (F_ISSET(cache, WT_EVICT_STUCK)) - F_CLR(cache, WT_EVICT_STUCK); + if (F_ISSET(cache, WT_CACHE_STUCK)) + F_CLR(cache, WT_CACHE_STUCK); return (ret); } @@ -1400,9 +1421,9 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full) * abort the transaction to give up all hazard pointers before * trying again. 
*/ - if (F_ISSET(cache, WT_EVICT_STUCK) && + if (F_ISSET(cache, WT_CACHE_STUCK) && __wt_txn_am_oldest(session)) { - F_CLR(cache, WT_EVICT_STUCK); + F_CLR(cache, WT_CACHE_STUCK); WT_STAT_FAST_CONN_INCR(session, txn_fail_cache); return (WT_ROLLBACK); } diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 9ba1af897a4..892d5b4ac60 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -130,8 +130,8 @@ done: session->excl_next = 0; txn_state->snap_min = WT_TXN_NONE; if ((inmem_split || (forced_eviction && ret == EBUSY)) && - !F_ISSET(conn->cache, WT_EVICT_WOULD_BLOCK)) { - F_SET(conn->cache, WT_EVICT_WOULD_BLOCK); + !F_ISSET(conn->cache, WT_CACHE_WOULD_BLOCK)) { + F_SET(conn->cache, WT_CACHE_WOULD_BLOCK); WT_TRET(__wt_evict_server_wake(session)); } diff --git a/src/include/btmem.h b/src/include/btmem.h index 91d0d1eb654..101fd450fc7 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -442,8 +442,6 @@ struct __wt_page { /* Row-store leaf page. */ struct { - WT_ROW *d; /* Key/value pairs */ - /* * The column-store leaf page modification structures * live in the WT_PAGE_MODIFY structure to keep the @@ -457,6 +455,7 @@ struct __wt_page { WT_INSERT_HEAD **ins; /* Inserts */ WT_UPDATE **upd; /* Updates */ + WT_ROW *d; /* Key/value pairs */ uint32_t entries; /* Entries */ } row; #undef pg_row_d @@ -510,11 +509,31 @@ struct __wt_page { #define pg_var_entries u.col_var.entries } u; - /* Page's on-disk representation: NULL for pages created in memory. */ - const WT_PAGE_HEADER *dsk; + /* + * The page's type and flags are positioned at the end of the WT_PAGE + * union, it reduces cache misses in the row-store search function. 
+ */ +#define WT_PAGE_IS_INTERNAL(page) \ + ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT) +#define WT_PAGE_INVALID 0 /* Invalid page */ +#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */ +#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */ +#define WT_PAGE_COL_INT 3 /* Col-store internal page */ +#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */ +#define WT_PAGE_OVFL 5 /* Overflow page */ +#define WT_PAGE_ROW_INT 6 /* Row-store internal page */ +#define WT_PAGE_ROW_LEAF 7 /* Row-store leaf page */ + uint8_t type; /* Page type */ - /* If/when the page is modified, we need lots more information. */ - WT_PAGE_MODIFY *modify; +#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */ +#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ +#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ +#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ +#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */ +#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */ +#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ +#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */ + uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ /* * The page's read generation acts as an LRU value for each page in the @@ -539,27 +558,11 @@ struct __wt_page { size_t memory_footprint; /* Memory attached to the page */ -#define WT_PAGE_IS_INTERNAL(page) \ - ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT) -#define WT_PAGE_INVALID 0 /* Invalid page */ -#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */ -#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */ -#define WT_PAGE_COL_INT 3 /* Col-store internal page */ -#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */ -#define WT_PAGE_OVFL 5 /* Overflow page */ -#define WT_PAGE_ROW_INT 6 /* Row-store internal page */ -#define WT_PAGE_ROW_LEAF 7 /* 
Row-store leaf page */ - uint8_t type; /* Page type */ + /* Page's on-disk representation: NULL for pages created in memory. */ + const WT_PAGE_HEADER *dsk; -#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */ -#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ -#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ -#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ -#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */ -#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */ -#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ -#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */ - uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ + /* If/when the page is modified, we need lots more information. */ + WT_PAGE_MODIFY *modify; }; /* diff --git a/src/include/btree.i b/src/include/btree.i index 032178b4755..56fb66abaef 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -404,7 +404,7 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) /* * __wt_page_parent_modify_set -- - * Mark the parent page and tree dirty. + * Mark the parent page, and optionally the tree, dirty. */ static inline int __wt_page_parent_modify_set( @@ -957,6 +957,10 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits) if (mod == NULL) return (1); + /* Skip pages that are already being evicted. 
*/ + if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) + return (0); + /* * If the tree was deepened, there's a requirement that newly created * internal pages not be evicted until all threads are known to have diff --git a/src/include/cache.h b/src/include/cache.h index 84b18082a25..8ed3176492f 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -118,9 +118,10 @@ struct __wt_cache { */ #define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */ #define WT_CACHE_POOL_RUN 0x02 /* Cache pool thread running */ -#define WT_EVICT_CLEAR_WALKS 0x04 /* Clear eviction walks */ -#define WT_EVICT_STUCK 0x08 /* Eviction server is stuck */ -#define WT_EVICT_WOULD_BLOCK 0x10 /* Pages that would block apps */ +#define WT_CACHE_CLEAR_WALKS 0x04 /* Clear eviction walks */ +#define WT_CACHE_STUCK 0x08 /* Eviction server is stuck */ +#define WT_CACHE_WALK_REVERSE 0x10 /* Scan backwards for candidates */ +#define WT_CACHE_WOULD_BLOCK 0x20 /* Pages that would block apps */ uint32_t flags; }; diff --git a/src/include/connection.h b/src/include/connection.h index 9cb42ae7c80..78b2949ab98 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -320,6 +320,10 @@ struct __wt_connection_impl { WT_SESSION_IMPL *log_close_session;/* Log close thread session */ wt_thread_t log_close_tid; /* Log close thread thread */ int log_close_tid_set;/* Log close thread set */ + WT_CONDVAR *log_wrlsn_cond;/* Log write lsn thread wait mutex */ + WT_SESSION_IMPL *log_wrlsn_session;/* Log write lsn thread session */ + wt_thread_t log_wrlsn_tid; /* Log write lsn thread thread */ + int log_wrlsn_tid_set;/* Log write lsn thread set */ WT_LOG *log; /* Logging structure */ WT_COMPRESSOR *log_compressor;/* Logging compressor */ wt_off_t log_file_max; /* Log file max size */ diff --git a/src/include/error.h b/src/include/error.h index b732776badf..efc1617fcd3 100644 --- a/src/include/error.h +++ b/src/include/error.h @@ -11,11 +11,11 @@ /* In DIAGNOSTIC mode, yield in places where we 
want to encourage races. */ #ifdef HAVE_DIAGNOSTIC -#define WT_HAVE_DIAGNOSTIC_YIELD do { \ +#define WT_DIAGNOSTIC_YIELD do { \ __wt_yield(); \ } while (0) #else -#define WT_HAVE_DIAGNOSTIC_YIELD +#define WT_DIAGNOSTIC_YIELD #endif /* Set "ret" and branch-to-err-label tests. */ diff --git a/src/include/extern.h b/src/include/extern.h index 5d3ee5bc8f8..bddbb5e01eb 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -163,8 +163,9 @@ extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *lea extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key); extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate); -extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, void *ikeyp); -extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, void *ikeyp); +extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_IKEY **ikeyp); +extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref); +extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref); extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove); extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep); @@ -206,7 +207,7 @@ extern int __wt_conn_remove_data_source(WT_SESSION_IMPL *session); extern int __wt_extractor_config(WT_SESSION_IMPL *session, const char *config, WT_EXTRACTOR 
**extractorp, int *ownp); extern int __wt_conn_remove_extractor(WT_SESSION_IMPL *session); extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]); -extern int __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_cache_config(WT_SESSION_IMPL *session, int reconfigure, const char *cfg[]); extern int __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]); extern void __wt_cache_stats_update(WT_SESSION_IMPL *session); extern int __wt_cache_destroy(WT_SESSION_IMPL *session); @@ -221,6 +222,7 @@ extern int __wt_conn_dhandle_find(WT_SESSION_IMPL *session, const char *name, co extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force); extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, const char *cfg[], uint32_t flags); extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, int apply_checkpoints, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); +extern int __wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *name, int force); extern int __wt_conn_dhandle_discard_single(WT_SESSION_IMPL *session, int final); @@ -349,7 +351,7 @@ extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size); -extern int __wt_log_slot_free(WT_LOGSLOT *slot); +extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int 
__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize); extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks); extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); diff --git a/src/include/flags.h b/src/include/flags.h index 9664fce3f9f..30b2ab1c0e3 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -4,18 +4,19 @@ */ #define WT_CONN_CACHE_POOL 0x00000001 #define WT_CONN_CKPT_SYNC 0x00000002 -#define WT_CONN_EVICTION_RUN 0x00000004 -#define WT_CONN_LEAK_MEMORY 0x00000008 -#define WT_CONN_LOG_SERVER_RUN 0x00000010 -#define WT_CONN_LSM_MERGE 0x00000020 -#define WT_CONN_PANIC 0x00000040 -#define WT_CONN_SERVER_ASYNC 0x00000080 -#define WT_CONN_SERVER_CHECKPOINT 0x00000100 -#define WT_CONN_SERVER_LSM 0x00000200 -#define WT_CONN_SERVER_RUN 0x00000400 -#define WT_CONN_SERVER_STATISTICS 0x00000800 -#define WT_CONN_SERVER_SWEEP 0x00001000 -#define WT_CONN_WAS_BACKUP 0x00002000 +#define WT_CONN_CLOSING 0x00000004 +#define WT_CONN_EVICTION_RUN 0x00000008 +#define WT_CONN_LEAK_MEMORY 0x00000010 +#define WT_CONN_LOG_SERVER_RUN 0x00000020 +#define WT_CONN_LSM_MERGE 0x00000040 +#define WT_CONN_PANIC 0x00000080 +#define WT_CONN_SERVER_ASYNC 0x00000100 +#define WT_CONN_SERVER_CHECKPOINT 0x00000200 +#define WT_CONN_SERVER_LSM 0x00000400 +#define WT_CONN_SERVER_RUN 0x00000800 +#define WT_CONN_SERVER_STATISTICS 0x00001000 +#define WT_CONN_SERVER_SWEEP 0x00002000 +#define WT_CONN_WAS_BACKUP 0x00004000 #define WT_EVICTING 0x00000001 #define WT_FILE_TYPE_CHECKPOINT 0x00000001 #define WT_FILE_TYPE_DATA 0x00000002 @@ -36,9 +37,8 @@ #define WT_READ_NO_WAIT 0x00000010 #define WT_READ_PREV 0x00000020 #define WT_READ_SKIP_INTL 0x00000040 -#define WT_READ_SKIP_LEAF 0x00000080 -#define WT_READ_TRUNCATE 0x00000100 -#define WT_READ_WONT_NEED 0x00000200 +#define WT_READ_TRUNCATE 0x00000080 +#define WT_READ_WONT_NEED 0x00000100 #define 
WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_CLEAR_EVICT_WALK 0x00000002 #define WT_SESSION_DISCARD_FORCE 0x00000004 diff --git a/src/include/log.h b/src/include/log.h index 82d90070609..760321d9abb 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -59,17 +59,21 @@ /* * Possible values for the consolidation array slot states: + * (NOTE: Any new states must be > WT_LOG_SLOT_DONE and < WT_LOG_SLOT_READY.) + * * < WT_LOG_SLOT_DONE - threads are actively writing to the log. * WT_LOG_SLOT_DONE - all activity on this slot is complete. * WT_LOG_SLOT_FREE - slot is available for allocation. * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active. + * WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker. * WT_LOG_SLOT_READY - slot is ready for threads to join. * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot. */ #define WT_LOG_SLOT_DONE 0 #define WT_LOG_SLOT_FREE 1 #define WT_LOG_SLOT_PENDING 2 -#define WT_LOG_SLOT_READY 3 +#define WT_LOG_SLOT_WRITTEN 3 +#define WT_LOG_SLOT_READY 4 typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { int64_t slot_state; /* Slot state */ uint64_t slot_group_size; /* Group size */ @@ -92,9 +96,11 @@ typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { uint32_t flags; /* Flags */ } WT_LOGSLOT; +#define SLOT_INIT_FLAGS (SLOT_BUFFERED) + typedef struct { WT_LOGSLOT *slot; - wt_off_t offset; + wt_off_t offset; } WT_MYSLOT; /* Offset of first record */ diff --git a/src/include/stat.h b/src/include/stat.h index 3f684478358..21eaff0677f 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -215,6 +215,7 @@ struct __wt_connection_stats { WT_STATS log_prealloc_max; WT_STATS log_prealloc_used; WT_STATS log_reads; + WT_STATS log_release_write_lsn; WT_STATS log_scan_records; WT_STATS log_scan_rereads; WT_STATS log_scans; @@ -227,6 +228,8 @@ struct __wt_connection_stats { WT_STATS log_slot_toosmall; WT_STATS log_slot_transitions; WT_STATS log_sync; + WT_STATS 
log_sync_dir; + WT_STATS log_write_lsn; WT_STATS log_writes; WT_STATS lsm_checkpoint_throttle; WT_STATS lsm_merge_throttle; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index d0d0f9eec77..fed6042c67a 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -561,7 +561,6 @@ struct __wt_cursor { * user on open. */ const char *internal_uri; - /* Saved modification methods. */ #define WT_CURSTD_APPEND 0x0001 #define WT_CURSTD_BULK 0x0002 @@ -3336,110 +3335,116 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_LOG_PREALLOC_USED 1081 /*! log: log read operations */ #define WT_STAT_CONN_LOG_READS 1082 +/*! log: log release advances write LSN */ +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1083 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1083 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1084 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1084 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1085 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1085 +#define WT_STAT_CONN_LOG_SCANS 1086 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1086 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1087 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1087 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1088 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1088 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1089 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1089 +#define WT_STAT_CONN_LOG_SLOT_RACES 1090 /*! log: slots selected for switching that were unavailable */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1090 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1091 /*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1091 +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1092 /*! 
log: failed to find a slot large enough for record */ -#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1092 +#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1093 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1093 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1094 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1094 +#define WT_STAT_CONN_LOG_SYNC 1095 +/*! log: log sync_dir operations */ +#define WT_STAT_CONN_LOG_SYNC_DIR 1096 +/*! log: log server thread advances write LSN */ +#define WT_STAT_CONN_LOG_WRITE_LSN 1097 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1095 +#define WT_STAT_CONN_LOG_WRITES 1098 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1096 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1099 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1097 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1100 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1098 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1101 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1099 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1102 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1100 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1103 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1101 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1104 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1102 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1105 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1103 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1106 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1104 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1107 /*! 
LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1105 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1108 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1106 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1109 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1107 +#define WT_STAT_CONN_MEMORY_FREE 1110 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1108 +#define WT_STAT_CONN_MEMORY_GROW 1111 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1109 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1112 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1110 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1113 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1111 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1114 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1112 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1115 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1113 +#define WT_STAT_CONN_PAGE_SLEEP 1116 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1114 +#define WT_STAT_CONN_READ_IO 1117 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1115 +#define WT_STAT_CONN_REC_PAGES 1118 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1116 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1119 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1117 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1120 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1118 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1121 /*! 
connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1119 +#define WT_STAT_CONN_RWLOCK_READ 1122 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1120 +#define WT_STAT_CONN_RWLOCK_WRITE 1123 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1121 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1124 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1122 +#define WT_STAT_CONN_SESSION_OPEN 1125 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1123 +#define WT_STAT_CONN_TXN_BEGIN 1126 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1124 +#define WT_STAT_CONN_TXN_CHECKPOINT 1127 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1125 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1128 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1126 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1129 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1127 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1130 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1128 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1131 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1129 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1132 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1130 +#define WT_STAT_CONN_TXN_COMMIT 1133 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1131 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1134 /*! 
transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1132 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1135 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1133 +#define WT_STAT_CONN_TXN_ROLLBACK 1136 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1134 +#define WT_STAT_CONN_WRITE_IO 1137 /*! * @} diff --git a/src/log/log.c b/src/log/log.c index f76ec402b0d..f485f0a09e5 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -61,16 +61,23 @@ __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec) WT_RET(__wt_curlog_open(session, "log:", NULL, &c)); c->set_key(c, ckp_lsn->file, ckp_lsn->offset, 0); - WT_ERR(c->search(c)); - - /* - * If the checkpoint LSN we're given is the last record, then recovery - * is not needed. - */ - if ((ret = c->next(c)) == WT_NOTFOUND) { - *rec = 0; + if ((ret = c->search(c)) == 0) { + /* + * If the checkpoint LSN we're given is the last record, + * then recovery is not needed. + */ + if ((ret = c->next(c)) == WT_NOTFOUND) { + *rec = 0; + ret = 0; + } + } else if (ret == WT_NOTFOUND) + /* + * If we didn't find that LSN, we need to run recovery, + * but not return any error. + */ ret = 0; - } + else + WT_ERR(ret); err: WT_TRET(c->close(c)); return (ret); @@ -455,6 +462,10 @@ __log_file_header( WT_ERR(__log_acquire(session, logrec->len, &tmp)); } WT_ERR(__log_fill(session, &myslot, 1, buf, NULL)); + /* + * Make sure the header gets to disk. 
+ */ + WT_ERR(__wt_fsync(session, tmp.slot_fh)); if (end_lsn != NULL) *end_lsn = tmp.slot_end_lsn; @@ -573,6 +584,7 @@ __log_truncate(WT_SESSION_IMPL *session, WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset)); tmp_fh = log_fh; log_fh = NULL; + WT_ERR(__wt_fsync(session, tmp_fh)); WT_ERR(__wt_close(session, tmp_fh)); /* @@ -596,6 +608,7 @@ __log_truncate(WT_SESSION_IMPL *session, log_fh, LOG_FIRST_RECORD)); tmp_fh = log_fh; log_fh = NULL; + WT_ERR(__wt_fsync(session, tmp_fh)); WT_ERR(__wt_close(session, tmp_fh)); } } @@ -646,6 +659,7 @@ __wt_log_allocfile( WT_ERR(__log_prealloc(session, log_fh)); tmp_fh = log_fh; log_fh = NULL; + WT_ERR(__wt_fsync(session, tmp_fh)); WT_ERR(__wt_close(session, tmp_fh)); WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_prealloc: rename %s to %s", @@ -790,17 +804,20 @@ __wt_log_close(WT_SESSION_IMPL *session) if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing old log %s", log->log_close_fh->name)); + WT_RET(__wt_fsync(session, log->log_close_fh)); WT_RET(__wt_close(session, log->log_close_fh)); } if (log->log_fh != NULL) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing log %s", log->log_fh->name)); + WT_RET(__wt_fsync(session, log->log_fh)); WT_RET(__wt_close(session, log->log_fh)); log->log_fh = NULL; } if (log->log_dir_fh != NULL) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing log directory %s", log->log_dir_fh->name)); + WT_RET(__wt_directory_sync_fh(session, log->log_dir_fh)); WT_RET(__wt_close(session, log->log_dir_fh)); log->log_dir_fh = NULL; } @@ -900,7 +917,7 @@ err: * Release a log slot. 
*/ static int -__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; @@ -913,6 +930,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) conn = S2C(session); log = conn->log; locked = yield_count = 0; + *freep = 1; /* Write the buffered records */ if (F_ISSET(slot, SLOT_BUFFERED)) { @@ -923,9 +941,29 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) } /* - * Wait for earlier groups to finish, otherwise there could be holes - * in the log file. + * If this is not a buffered write, meaning the slot we have is a + * dummy constructed slot, not from the slot pool, or we have to wait + * for a synchronous operation, we do not pass handling of this slot + * off to the worker thread. The caller is responsible for freeing + * the slot in that case. Otherwise the worker thread will free it. + */ + if (F_ISSET(slot, SLOT_BUFFERED) && + !F_ISSET(slot, SLOT_SYNC | SLOT_SYNC_DIR)) { + *freep = 0; + slot->slot_state = WT_LOG_SLOT_WRITTEN; + /* + * After this point the worker thread owns the slot. There + * is nothing more to do but return. + */ + WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond)); + goto done; + } + + /* + * Wait for earlier groups to finish, otherwise there could + * be holes in the log file. */ + WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn); while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) { if (++yield_count < 1000) __wt_yield(); @@ -936,6 +974,9 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) log->write_lsn = slot->slot_end_lsn; WT_ERR(__wt_cond_signal(session, log->log_write_cond)); + /* + * Signal the close thread if needed. 
+ */ if (F_ISSET(slot, SLOT_CLOSEFH)) WT_ERR(__wt_cond_signal(session, conn->log_close_cond)); @@ -978,7 +1019,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_ERR(__wt_directory_sync_fh( session, log->log_dir_fh)); log->sync_dir_lsn = sync_lsn; - F_CLR(slot, SLOT_SYNC_DIR); + WT_STAT_FAST_CONN_INCR(session, log_sync_dir); } /* @@ -990,26 +1031,22 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) "log_release: sync log %s", log->log_fh->name)); WT_STAT_FAST_CONN_INCR(session, log_sync); WT_ERR(__wt_fsync(session, log->log_fh)); - F_CLR(slot, SLOT_SYNC); log->sync_lsn = sync_lsn; WT_ERR(__wt_cond_signal(session, log->log_sync_cond)); } + /* + * Clear the flags before leaving the loop. + */ + F_CLR(slot, SLOT_SYNC | SLOT_SYNC_DIR); locked = 0; __wt_spin_unlock(session, &log->log_sync_lock); break; } - if (F_ISSET(slot, SLOT_BUF_GROW)) { - WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); - F_CLR(slot, SLOT_BUF_GROW); - WT_STAT_FAST_CONN_INCRV(session, - log_buffer_size, slot->slot_buf.memsize); - WT_ERR(__wt_buf_grow(session, - &slot->slot_buf, slot->slot_buf.memsize * 2)); - } err: if (locked) __wt_spin_unlock(session, &log->log_sync_lock); if (ret != 0 && slot->slot_error == 0) slot->slot_error = ret; +done: return (ret); } @@ -1460,12 +1497,13 @@ __log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LOG *log; WT_LOGSLOT tmp; WT_MYSLOT myslot; - int locked; + int dummy, locked; WT_DECL_SPINLOCK_ID(id); /* Must appear last */ log = S2C(session)->log; myslot.slot = &tmp; myslot.offset = 0; + dummy = 0; WT_CLEAR(tmp); /* Fast path the contended case. 
*/ @@ -1481,7 +1519,7 @@ __log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, __wt_spin_unlock(session, &log->log_slot_lock); locked = 0; WT_ERR(__log_fill(session, &myslot, 1, record, lsnp)); - WT_ERR(__log_release(session, &tmp)); + WT_ERR(__log_release(session, &tmp, &dummy)); err: if (locked) __wt_spin_unlock(session, &log->log_slot_lock); @@ -1609,11 +1647,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN lsn; WT_MYSLOT myslot; uint32_t rdup_len; - int locked; + int free_slot, locked; conn = S2C(session); log = conn->log; - locked = 0; + free_slot = locked = 0; WT_INIT_LSN(&lsn); myslot.slot = NULL; /* @@ -1695,8 +1733,9 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_ERR(__wt_log_slot_wait(session, myslot.slot)); WT_ERR(__log_fill(session, &myslot, 0, record, &lsn)); if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) { - WT_ERR(__log_release(session, myslot.slot)); - WT_ERR(__wt_log_slot_free(myslot.slot)); + WT_ERR(__log_release(session, myslot.slot, &free_slot)); + if (free_slot) + WT_ERR(__wt_log_slot_free(session, myslot.slot)); } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 && diff --git a/src/log/log_slot.c b/src/log/log_slot.c index 8dcb2f9f165..02b3056be6f 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -57,7 +57,7 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) for (i = 0; i < SLOT_POOL; i++) { WT_ERR(__wt_buf_init(session, &log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE)); - F_SET(&log->slot_pool[i], SLOT_BUFFERED); + F_SET(&log->slot_pool[i], SLOT_INIT_FLAGS); } WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL); @@ -295,10 +295,34 @@ __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size) * Free a slot back into the pool. 
*/ int -__wt_log_slot_free(WT_LOGSLOT *slot) +__wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { + WT_DECL_RET; + + ret = 0; + /* + * Grow the buffer if needed before returning it to the pool. + */ + if (F_ISSET(slot, SLOT_BUF_GROW)) { + WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); + WT_STAT_FAST_CONN_INCRV(session, + log_buffer_size, slot->slot_buf.memsize); + WT_ERR(__wt_buf_grow(session, + &slot->slot_buf, slot->slot_buf.memsize * 2)); + } +err: + /* + * No matter if there is an error, we always want to free + * the slot back to the pool. + */ + /* + * Make sure flags don't get retained between uses. + * We have to reset them them here because multiple threads may + * change the flags when joining the slot. + */ + slot->flags = SLOT_INIT_FLAGS; slot->slot_state = WT_LOG_SLOT_FREE; - return (0); + return (ret); } /* diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 3b4dc639945..8474b6e8b37 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -77,6 +77,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) } else { primary = clsm->cursors[clsm->nchunks - 1]; primary_chunk = clsm->primary_chunk; + WT_ASSERT(session, F_ISSET(&session->txn, TXN_HAS_ID)); have_primary = (primary != NULL && primary_chunk != NULL && (primary_chunk->switch_txn == WT_TXN_NONE || TXNID_LT(session->txn.id, primary_chunk->switch_txn))); @@ -177,14 +178,15 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update) /* Update the maximum transaction ID in the primary chunk. */ if (update) { - WT_RET(__clsm_enter_update(clsm)); - if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) - goto open; - /* * Ensure that there is a transaction snapshot active. 
*/ WT_RET(__wt_txn_autocommit_check(session)); + WT_RET(__wt_txn_id_check(session)); + + WT_RET(__clsm_enter_update(clsm)); + if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) + goto open; if (session->txn.isolation == TXN_ISO_SNAPSHOT) __wt_txn_cursor_op(session); @@ -1237,11 +1239,12 @@ __clsm_put(WT_SESSION_IMPL *session, { WT_CURSOR *c, *primary; WT_LSM_TREE *lsm_tree; - u_int i; + u_int i, slot; lsm_tree = clsm->lsm_tree; WT_ASSERT(session, + F_ISSET(&session->txn, TXN_HAS_ID) && clsm->primary_chunk != NULL && (clsm->primary_chunk->switch_txn == WT_TXN_NONE || TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn))); @@ -1257,8 +1260,15 @@ __clsm_put(WT_SESSION_IMPL *session, if (position) clsm->current = primary; - for (i = 0; i < clsm->nupdates; i++) { - c = clsm->cursors[(clsm->nchunks - i) - 1]; + for (i = 0, slot = clsm->nchunks - 1; i < clsm->nupdates; i++, slot--) { + /* Check if we need to keep updating old chunks. */ + if (i > 0 && + __wt_txn_visible(session, clsm->switch_txn[slot])) { + clsm->nupdates = i; + break; + } + + c = clsm->cursors[slot]; c->set_key(c, key); c->set_value(c, value); WT_RET((position && i == 0) ? c->update(c) : c->insert(c)); diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index f4ddd4f7e2f..dea012ccb9e 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -401,7 +401,13 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1)); - F_SET(session, WT_SESSION_NO_CACHE); + /* + * Setup so that we don't hold pages we read into cache, and so + * that we don't get stuck if the cache is full. If we allow + * ourselves to get stuck creating bloom filters, the entire tree + * can stall since there may be no worker threads available to flush. 
+ */ + F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); WT_ERR(__wt_bloom_insert(bloom, &key)); @@ -414,15 +420,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, F_CLR(session, WT_SESSION_NO_CACHE); - /* - * Load the new Bloom filter into cache. - * - * We're doing advisory reads to fault the new trees into cache. - * Don't block if the cache is full: our next unit of work may be to - * discard some trees to free space. - */ - F_SET(session, WT_SESSION_NO_CACHE_CHECK); - + /* Load the new Bloom filter into cache. */ WT_CLEAR(key); WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 4ca1a6bc623..33d79e6d4ce 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -522,6 +522,12 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) */ mod->mod_root_split = next; + /* + * Mark the page dirty. + * Don't mark the tree dirty: if this reconciliation is in service of a + * checkpoint, it's cleared the tree's dirty flag, and we don't want to + * set it again as part of that walk. + */ WT_ERR(__wt_page_modify_init(session, next)); __wt_page_only_modify_set(session, next); @@ -1113,12 +1119,14 @@ __rec_child_modify(WT_SESSION_IMPL *session, * process will have completed before we walk any pages * for checkpoint. 
*/ - if ((ret = __wt_page_in(session, ref, + ret = __wt_page_in(session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | - WT_READ_NO_GEN | WT_READ_NO_WAIT)) == WT_NOTFOUND) { + WT_READ_NO_GEN | WT_READ_NO_WAIT); + if (ret == WT_NOTFOUND) { ret = 0; break; } + WT_RET(ret); *hazardp = 1; goto in_memory; @@ -1173,7 +1181,7 @@ in_memory: CHILD_RELEASE(session, *hazardp, ref); } -done: WT_HAVE_DIAGNOSTIC_YIELD; +done: WT_DIAGNOSTIC_YIELD; return (ret); } @@ -1982,16 +1990,20 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) next->start = r->first_free; next->entries = 0; - /* - * Set the space available to another split-size chunk, if we - * have one. If we don't have room for another split chunk, - * add whatever space remains in this page. - */ + /* Set the space available to another split-size chunk. */ r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + + /* + * Adjust the space available to handle two cases: + * - We don't have enough room for another full split-size + * chunk on the page. + * - We chose to fill past a page boundary because of a + * large item. + */ if (inuse + r->space_avail > r->page_size) { - WT_ASSERT(session, r->page_size >= inuse); - r->space_avail = r->page_size - inuse; + r->space_avail = + r->page_size > inuse ? (r->page_size - inuse) : 0; /* There are no further boundary points. */ r->bnd_state = SPLIT_MAX; @@ -2649,7 +2661,7 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) * WT_PAGE_HEADER header onto the scratch buffer, most of the header * information remains unchanged between the pages. */ - WT_RET(__wt_scr_alloc(session, r->page_size, &tmp)); + WT_RET(__wt_scr_alloc(session, r->dsk.memsize, &tmp)); dsk = tmp->mem; memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE); @@ -2977,7 +2989,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_split_finish(session, r)); WT_RET(__rec_write_wrapup(session, r, r->page)); - /* Mark the page's parent dirty. 
*/ + /* Mark the page's parent and the tree dirty. */ parent = r->ref->home; WT_RET(__wt_page_modify_init(session, parent)); __wt_page_modify_set(session, parent); @@ -3017,8 +3029,6 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET( __rec_split_raw(session, r, key->len + val->len)); else { - WT_RET(__rec_split(session, r, key->len + val->len)); - /* * Turn off prefix compression until a full key written * to the new page, and (unless already working with an @@ -3030,6 +3040,8 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_cell_build_leaf_key( session, r, NULL, 0, &ovfl_key)); } + + WT_RET(__rec_split(session, r, key->len + val->len)); } } @@ -3225,15 +3237,18 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state)); addr = NULL; child = ref->page; - if (state != 0) { - /* - * Currently the only non-zero returned stated possible - * for a column-store page is child-modified (all other - * states are part of the fast-truncate support, which - * is row-store only). - */ - WT_ASSERT(session, state == WT_CHILD_MODIFIED); + /* Deleted child we don't have to write. */ + if (state == WT_CHILD_IGNORE) { + CHILD_RELEASE_ERR(session, hazard, ref); + continue; + } + + /* + * Modified child. Empty pages are merged into the parent and + * discarded. + */ + if (state == WT_CHILD_MODIFIED) { switch (F_ISSET(child->modify, WT_PM_REC_MASK)) { case WT_PM_REC_EMPTY: /* @@ -3253,7 +3268,9 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) break; WT_ILLEGAL_VALUE_ERR(session); } - } + } else + /* No other states are expected for column stores. */ + WT_ASSERT(session, state == 0); /* * Build the value cell. 
The child page address is in one of 3 @@ -4550,8 +4567,6 @@ build: WT_PAGE_ROW_LEAF, kpack, r->cur)); key_onpage_ovfl = 0; } - WT_ERR(__rec_split( - session, r, key->len + val->len)); /* * Turn off prefix compression until a full key @@ -4567,6 +4582,9 @@ build: session, r, NULL, 0, &ovfl_key)); } + + WT_ERR(__rec_split( + session, r, key->len + val->len)); } } @@ -4636,9 +4654,6 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_RET(__rec_split_raw( session, r, key->len + val->len)); else { - WT_RET(__rec_split( - session, r, key->len + val->len)); - /* * Turn off prefix compression until a full key * written to the new page, and (unless already @@ -4653,6 +4668,9 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) session, r, NULL, 0, &ovfl_key)); } + + WT_RET(__rec_split( + session, r, key->len + val->len)); } } @@ -5085,7 +5103,7 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - WT_RET(__wt_row_ikey(session, 0, + WT_RET(__wt_row_ikey_alloc(session, 0, bnd->key.data, bnd->key.size, &multi->key.ikey)); if (bnd->skip == NULL) { diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index 3dfd068cf9c..e913fcfe69d 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -55,11 +55,17 @@ __wt_schema_worker(WT_SESSION_IMPL *session, WT_ERR(ret); } - WT_ERR(__wt_session_get_btree_ckpt( - session, uri, cfg, open_flags)); - WT_SAVE_DHANDLE(session, - ret = file_func(session, cfg)); - WT_TRET(__wt_session_release_btree(session)); + if ((ret = __wt_session_get_btree_ckpt( + session, uri, cfg, open_flags)) == 0) { + WT_SAVE_DHANDLE(session, + ret = file_func(session, cfg)); + WT_TRET(__wt_session_release_btree(session)); + } else if (ret == EBUSY) + /* TODO: Decode checkpoint from cfg. 
*/ + WT_WITH_DHANDLE_LOCK(session, + ret = __wt_conn_btree_apply_single_ckpt( + session, uri, file_func, cfg)); + WT_ERR(ret); } } else if (WT_PREFIX_MATCH(uri, "colgroup:")) { WT_ERR(__wt_schema_get_colgroup( diff --git a/src/support/stat.c b/src/support/stat.c index 0926636a532..9d10c4d5ca6 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -447,10 +447,15 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) "log: log records not compressed"; stats->log_compress_small.desc = "log: log records too small to compress"; + stats->log_release_write_lsn.desc = + "log: log release advances write LSN"; stats->log_scans.desc = "log: log scan operations"; stats->log_scan_rereads.desc = "log: log scan records requiring two reads"; + stats->log_write_lsn.desc = + "log: log server thread advances write LSN"; stats->log_sync.desc = "log: log sync operations"; + stats->log_sync_dir.desc = "log: log sync_dir operations"; stats->log_writes.desc = "log: log write operations"; stats->log_slot_consolidated.desc = "log: logging bytes consolidated"; stats->log_max_filesize.desc = "log: maximum log file size"; @@ -613,9 +618,12 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->log_compress_writes.v = 0; stats->log_compress_write_fails.v = 0; stats->log_compress_small.v = 0; + stats->log_release_write_lsn.v = 0; stats->log_scans.v = 0; stats->log_scan_rereads.v = 0; + stats->log_write_lsn.v = 0; stats->log_sync.v = 0; + stats->log_sync_dir.v = 0; stats->log_writes.v = 0; stats->log_slot_consolidated.v = 0; stats->log_prealloc_max.v = 0; diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index fb590e1a297..87b85eb2d8d 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -259,10 +259,10 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]) session->ckpt_handle[session->ckpt_handle_next++].dhandle = session->dhandle; else if (ret == EBUSY) - WT_ERR(__wt_strdup(session, name, - &session->ckpt_handle[session->ckpt_handle_next++].name)); + 
ret = __wt_strdup(session, name, + &session->ckpt_handle[session->ckpt_handle_next++].name); -err: return (ret); + return (ret); } /* @@ -988,14 +988,23 @@ __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]) int __wt_checkpoint_close(WT_SESSION_IMPL *session, int force) { - /* If closing an unmodified file, simply discard its blocks. */ - if (!S2BT(session)->modified || force) - return (__wt_cache_op(session, NULL, - force ? WT_SYNC_DISCARD_FORCE : WT_SYNC_DISCARD)); + WT_DECL_RET; + + /* Handle forced discard (when dropping a file). */ + if (force) + return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD_FORCE)); + + /* If closing an unmodified file, try to evict its pages. */ + if (!S2BT(session)->modified) { + ret = __wt_cache_op(session, NULL, WT_SYNC_DISCARD); + if (ret != EBUSY) + return (ret); + } /* - * Else, checkpoint the file and optionally flush the writes (the - * checkpoint call will discard the blocks, there's no additional + * If closing a modified file, or closing an unmodified file was blocked + * for any reason, checkpoint the file and optionally flush the writes + * (the checkpoint call will discard the blocks, there's no additional * step needed). 
*/ WT_RET(__checkpoint_worker(session, NULL, 0)); diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c index 72f53fed9f8..086faef1a30 100644 --- a/test/bloom/test_bloom.c +++ b/test/bloom/test_bloom.c @@ -28,8 +28,6 @@ #include "wt_internal.h" -#include <signal.h> - static struct { char *progname; /* Program name */ @@ -38,8 +36,7 @@ static struct { char *config_open; /* Command-line configuration */ - uint32_t c_bitcnt; /* Config values */ - uint32_t c_cache; + uint32_t c_cache; /* Config values */ uint32_t c_key_max; uint32_t c_ops; uint32_t c_k; /* Number of hash iterations */ @@ -49,12 +46,12 @@ static struct { uint8_t **entries; } g; -static int cleanup(void); +void cleanup(void); void die(int e, const char *fmt, ...); -static int populate_entries(void); -static int run(void); -static int setup(void); -static void usage(void); +void populate_entries(void); +void run(void); +void setup(void); +void usage(void); extern char *__wt_optarg; extern int __wt_optind; @@ -109,7 +106,7 @@ main(int argc, char *argv[]) return (EXIT_SUCCESS); } -int +void setup(void) { WT_CONNECTION *conn; @@ -141,13 +138,10 @@ setup(void) g.wt_conn = conn; g.wt_session = session; - if ((ret = populate_entries()) != 0) - die(ret, "populate_entries"); - - return (0); + populate_entries(); } -int +void run(void) { WT_BLOOM *bloomp; @@ -184,7 +178,8 @@ run(void) if ((ret = __wt_bloom_close(bloomp)) != 0) die(ret, "__wt_bloom_close"); - g.wt_session->checkpoint(g.wt_session, NULL); + if ((ret = g.wt_session->checkpoint(g.wt_session, NULL)) != 0) + die(ret, "WT_SESSION.checkpoint"); if ((ret = __wt_bloom_open( sess, uri, g.c_factor, g.c_k, NULL, &bloomp)) != 0) die(ret, "__wt_bloom_open"); @@ -212,28 +207,28 @@ run(void) g.c_ops, fp, 100.0 * fp/g.c_ops); if ((ret = __wt_bloom_drop(bloomp, NULL)) != 0) die(ret, "__wt_bloom_drop"); - - return (0); } -int +void cleanup(void) { uint32_t i; + int ret; for (i = 0; i < g.c_ops; i++) free(g.entries[i]); free(g.entries); - 
g.wt_session->close(g.wt_session, NULL); - g.wt_conn->close(g.wt_conn, NULL); - return (0); + if ((ret = g.wt_session->close(g.wt_session, NULL)) != 0) + die(ret, "WT_SESSION.close"); + if ((g.wt_conn->close(g.wt_conn, NULL)) != 0) + die(ret, "WT_CONNECTION.close"); } /* * Create and keep all the strings used to populate the bloom filter, so that * we can do validation with the same set of entries. */ -static int +void populate_entries(void) { uint32_t i, j; @@ -254,7 +249,6 @@ populate_entries(void) } g.entries = entries; - return (0); } /* @@ -283,7 +277,7 @@ die(int e, const char *fmt, ...) * usage -- * Display usage statement and exit failure. */ -static void +void usage(void) { fprintf(stderr, "usage: %s [-cfkos]\n", g.progname); diff --git a/test/format/bdb.c b/test/format/bdb.c index 563b69b9e27..254dd95e1d3 100644 --- a/test/format/bdb.c +++ b/test/format/bdb.c @@ -66,10 +66,7 @@ bdb_open(void) assert(dbenv->mutex_set_max(dbenv, 10000) == 0); assert(dbenv->set_cachesize(dbenv, 0, 50 * 1024 * 1024, 1) == 0); assert(dbenv->open(dbenv, NULL, - DB_CREATE | - (g.c_delete_pct == 0 && g.c_insert_pct == 0 && g.c_write_pct == 0 ? - 0 : DB_INIT_LOCK) | - DB_INIT_MPOOL | DB_PRIVATE, 0) == 0); + DB_CREATE | DB_INIT_LOCK | DB_INIT_MPOOL | DB_PRIVATE, 0) == 0); assert(db_create(&db, dbenv, 0) == 0); if (g.type == ROW && g.c_reverse) diff --git a/test/format/config.c b/test/format/config.c index 1fbbe90a57e..e801827935c 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -71,14 +71,14 @@ config_setup(void) } if (!config_find_is_perm("file_type", strlen("file_type"))) - switch (DATASOURCE("lsm") ? 3 : MMRAND(1, 3)) { + switch (DATASOURCE("lsm") ? 
5 : MMRAND(1, 10)) { case 1: config_single("file_type=fix", 0); break; - case 2: + case 2: case 3: case 4: config_single("file_type=var", 0); break; - case 3: + case 5: case 6: case 7: case 8: case 9: case 10: config_single("file_type=row", 0); break; } @@ -142,12 +142,6 @@ config_setup(void) config_compression(); config_isolation(); - /* Clear operations values if the whole run is read-only. */ - if (g.c_ops == 0) - for (cp = c; cp->name != NULL; ++cp) - if (cp->flags & C_OPS) - *cp->v = 0; - /* * Periodically, set the delete percentage to 0 so salvage gets run, * as long as the delete percentage isn't nailed down. @@ -174,6 +168,11 @@ config_setup(void) g.c_insert_pct = MMRAND(50, 85); } + /* Make the default maximum-run length 20 minutes. */ + cp = config_find("timer", strlen("timer")); + if (!(cp->flags & C_PERM)) + g.c_timer = 20; + /* * Key/value minimum/maximum are related, correct unless specified by * the configuration. @@ -238,8 +237,9 @@ config_compression(void) /* * Compression: choose something if compression wasn't specified, * otherwise confirm the appropriate shared library is available. - * We don't include LZO in the test compression choices, we don't - * yet have an LZO module of our own. + * We used to verify that the libraries existed but that's no longer + * robust, since it's possible to build compression libraries into + * the WiredTiger library. 
*/ cp = config_find("compression", strlen("compression")); if (!(cp->flags & C_PERM)) { @@ -249,50 +249,24 @@ config_compression(void) case 4: case 5: case 6: break; case 7: case 8: case 9: case 10: /* 20% bzip */ - if (access(BZIP_PATH, R_OK) == 0) - cstr = "compression=bzip"; + cstr = "compression=bzip"; break; case 11: /* 5% bzip-raw */ - if (access(BZIP_PATH, R_OK) == 0) - cstr = "compression=bzip-raw"; + cstr = "compression=bzip-raw"; break; case 12: case 13: case 14: case 15: /* 20% snappy */ - if (access(SNAPPY_PATH, R_OK) == 0) - cstr = "compression=snappy"; + cstr = "compression=snappy"; break; case 16: case 17: case 18: case 19: /* 20% zlib */ - if (access(ZLIB_PATH, R_OK) == 0) - cstr = "compression=zlib"; + cstr = "compression=zlib"; break; case 20: /* 5% zlib-no-raw */ - if (access(ZLIB_PATH, R_OK) == 0) - cstr = "compression=zlib-noraw"; + cstr = "compression=zlib-noraw"; break; } config_single(cstr, 0); } - - switch (g.c_compression_flag) { - case COMPRESS_BZIP: - case COMPRESS_BZIP_RAW: - if (access(BZIP_PATH, R_OK) != 0) - die(0, "bzip library not found or not readable"); - break; - case COMPRESS_LZO: - if (access(LZO_PATH, R_OK) != 0) - die(0, "LZO library not found or not readable"); - break; - case COMPRESS_SNAPPY: - if (access(SNAPPY_PATH, R_OK) != 0) - die(0, "snappy library not found or not readable"); - break; - case COMPRESS_ZLIB: - case COMPRESS_ZLIB_NO_RAW: - if (access(ZLIB_PATH, R_OK) != 0) - die(0, "zlib library not found or not readable"); - break; - } } /* diff --git a/test/format/config.h b/test/format/config.h index 7871127ff26..d5d797f4b50 100644 --- a/test/format/config.h +++ b/test/format/config.h @@ -40,14 +40,11 @@ typedef struct { /* Not a simple randomization, handle outside the main loop. */ #define C_IGNORE 0x002 - /* Operation, only set if doing operations. */ -#define C_OPS 0x004 - /* Value was set from command-line or file, ignore for all runs. 
*/ -#define C_PERM 0x008 +#define C_PERM 0x004 /* Value isn't random for this run, ignore just for this run. */ -#define C_TEMP 0x010 +#define C_TEMP 0x008 /* Value is a string. */ #define C_STRING 0x020 @@ -134,7 +131,7 @@ static CONFIG c[] = { { "delete_pct", "percent operations that are deletes", - C_OPS, 0, 45, 90, &g.c_delete_pct, NULL }, + 0x0, 0, 45, 90, &g.c_delete_pct, NULL }, { "dictionary", "if values are dictionary compressed", /* 20% */ @@ -162,7 +159,7 @@ static CONFIG c[] = { { "insert_pct", "percent operations that are inserts", - C_OPS, 0, 45, 90, &g.c_insert_pct, NULL }, + 0x0, 0, 45, 90, &g.c_insert_pct, NULL }, { "internal_key_truncation", "if internal keys are truncated", /* 95% */ @@ -270,7 +267,7 @@ static CONFIG c[] = { C_IGNORE, 1, 32, 128, &g.c_threads, NULL }, { "timer", - "time to run in minutes", + "maximum time to run in minutes (default 20 minutes)", C_IGNORE, 0, UINT_MAX, UINT_MAX, &g.c_timer, NULL }, { "value_max", @@ -287,7 +284,7 @@ static CONFIG c[] = { { "write_pct", "percent operations that are writes", - C_OPS, 0, 90, 90, &g.c_write_pct, NULL }, + 0x0, 0, 90, 90, &g.c_write_pct, NULL }, { NULL, NULL, 0x0, 0, 0, 0, NULL, NULL } }; diff --git a/test/format/format.h b/test/format/format.h index e2cd4f19c7e..58940f0c4b8 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -307,7 +307,7 @@ void wts_create(void); void wts_dump(const char *, int); void wts_load(void); void wts_open(const char *, int, WT_CONNECTION **); -void wts_ops(void); +void wts_ops(int); void wts_read_scan(void); void wts_salvage(void); void wts_stats(void); diff --git a/test/format/ops.c b/test/format/ops.c index 3a0a9110b9c..5fd992e9952 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -46,14 +46,14 @@ static void table_append_init(void); * Perform a number of operations in a set of threads. 
*/ void -wts_ops(void) +wts_ops(int lastrun) { TINFO *tinfo, total; WT_CONNECTION *conn; WT_SESSION *session; pthread_t backup_tid, compact_tid; - uint64_t thread_ops; - uint32_t i, fourths; + int64_t fourths, thread_ops; + uint32_t i; int ret, running; conn = g.wts_conn; @@ -71,20 +71,23 @@ wts_ops(void) /* * There are two mechanisms to specify the length of the run, a number - * of operations or a timer. If the former, each thread does an equal - * share of the total operations (and make sure that it's not 0). If - * the latter, calculate how many fourth-of-a-second sleeps until this - * part of the run finishes. + * of operations and a timer, when either expire the run terminates. + * Each thread does an equal share of the total operations (and make + * sure that it's not 0). + * + * Calculate how many fourth-of-a-second sleeps until any timer expires. */ - if (g.c_timer == 0) { - fourths = 0; + if (g.c_ops == 0) + thread_ops = -1; + else { if (g.c_ops < g.c_threads) g.c_ops = g.c_threads; thread_ops = g.c_ops / g.c_threads; - } else { - fourths = (g.c_timer * 4 * 60) / FORMAT_OPERATION_REPS; - thread_ops = 0; } + if (g.c_timer == 0) + fourths = -1; + else + fourths = (g.c_timer * 4 * 60) / FORMAT_OPERATION_REPS; /* Initialize the table extension code. */ table_append_init(); @@ -117,8 +120,9 @@ wts_ops(void) die(ret, "pthread_create: compaction"); /* Spin on the threads, calculating the totals. */ - memset(&total, 0, sizeof(total)); for (;;) { + /* Clear out the totals each pass. */ + memset(&total, 0, sizeof(total)); for (i = 0, running = 0; i < g.c_threads; ++i) { total.commit += tinfo[i].commit; total.deadlock += tinfo[i].deadlock; @@ -140,27 +144,29 @@ wts_ops(void) break; } - if (thread_ops == 0) { + /* + * If the timer has expired or this thread has completed + * its operations, notify the thread it should quit. 
+ */ + if (fourths == 0 || + (thread_ops != -1 && + tinfo[i].ops >= (uint64_t)thread_ops)) { /* - * Optionally drop core (for testing recovery), - * otherwise tell the thread it's done. + * On the last execution, optionally drop core + * for recovery testing. */ - if (fourths == 0) { - if (g.c_abort) { - static char *core = NULL; - *core = 0; - } - tinfo[i].quit = 1; + if (lastrun && g.c_abort) { + static char *core = NULL; + *core = 0; } - } else - if (tinfo[i].ops >= thread_ops) - tinfo[i].quit = 1; + tinfo[i].quit = 1; + } } track("ops", 0ULL, &total); if (!running) break; (void)usleep(250000); /* 1/4th of a second */ - if (fourths != 0) + if (fourths != -1) --fourths; } free(tinfo); diff --git a/test/format/recover.sh b/test/format/recover.sh index de908c71e5d..4177e26a278 100644 --- a/test/format/recover.sh +++ b/test/format/recover.sh @@ -37,12 +37,16 @@ while true; do # Save a copy of the database directory exactly as it was at the crash. cp -rp RUNDIR $rundir2 - # We aborted, so recovery is required - if `$wtcmd -R -h RUNDIR list | egrep table > /dev/null`; then - uri='table:wt' + # + # Everything is a table unless explicitly a file. + # + isfile=`grep data_source RUNDIR/CONFIG | grep -c file || exit 0` + if test "$isfile" -ne 0; then + uri="file:wt" else - uri='file:wt' + uri="table:wt" fi - # Force recovery to run. + + # We know we aborted, so force recovery to run. $wtcmd -R -h RUNDIR verify $uri || exit 1 done diff --git a/test/format/smoke.sh b/test/format/smoke.sh index 62577692d0c..fe53f64229f 100755 --- a/test/format/smoke.sh +++ b/test/format/smoke.sh @@ -1,7 +1,7 @@ #! /bin/sh # Smoke-test format as part of running "make check". -args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4" +args="-1 -c "." 
data_source=table ops=100000 rows=10000 threads=4 compression=none" ./t $args file_type=fix || exit 1 ./t $args file_type=row || exit 1 diff --git a/test/format/t.c b/test/format/t.c index b53913b4623..03b3605a5e4 100644 --- a/test/format/t.c +++ b/test/format/t.c @@ -40,6 +40,7 @@ extern char *__wt_optarg; int main(int argc, char *argv[]) { + time_t start; int ch, reps, ret; const char *config, *home; @@ -174,7 +175,9 @@ main(int argc, char *argv[]) config_print(0); /* Dump run configuration */ key_len_setup(); /* Setup keys */ + start = time(NULL); track("starting up", 0ULL, NULL); + if (SINGLETHREADED) bdb_open(); /* Initial file config */ wts_open(g.home, 1, &g.wts_conn); @@ -183,35 +186,35 @@ main(int argc, char *argv[]) wts_load(); /* Load initial records */ wts_verify("post-bulk verify"); /* Verify */ - /* Loop reading & operations */ - for (reps = 0; reps < FORMAT_OPERATION_REPS; ++reps) { - wts_read_scan(); /* Read scan */ - - /* Operations */ - if (g.c_timer != 0 || g.c_ops != 0) - wts_ops(); - - /* - * Statistics. - * - * XXX - * Verify closes the underlying handle and discards the - * statistics, read them first. - */ - if (g.c_ops == 0 || reps == 2) - wts_stats(); - - /* Verify */ - wts_verify("post-ops verify"); - - /* - * If no operation count, quit after a single read pass. - * (A timer configuration ran out the timer on the first - * set of operations.) - */ - if (g.c_ops == 0) - break; - } + /* + * If we're not doing any operations, scan the bulk-load, copy + * the statistics and we're done. Otherwise, loop reading and + * operations, with a verify after each set. + */ + if (g.c_timer == 0 && g.c_ops == 0) { + wts_read_scan(); /* Read scan */ + wts_stats(); /* Statistics */ + } else + for (reps = 1; reps <= FORMAT_OPERATION_REPS; ++reps) { + wts_read_scan(); /* Read scan */ + + /* Operations */ + wts_ops(reps == FORMAT_OPERATION_REPS); + + /* + * Copy out the run's statistics after the last + * set of operations. 
+ * + * XXX + * Verify closes the underlying handle and + * discards the statistics, read them first. + */ + if (reps == FORMAT_OPERATION_REPS) + wts_stats(); + + /* Verify */ + wts_verify("post-ops verify"); + } track("shutting down", 0ULL, NULL); if (SINGLETHREADED) @@ -233,8 +236,9 @@ main(int argc, char *argv[]) /* Overwrite the progress line with a completion line. */ if (g.track) printf("\r%78s\r", " "); - printf("%4d: %s, %s\n", - g.run_cnt, g.c_data_source, g.c_file_type); + printf("%4d: %s, %s (%.0f seconds)\n", + g.run_cnt, g.c_data_source, + g.c_file_type, difftime(time(NULL), start)); } /* Flush/close any logging information. */ diff --git a/test/packing/Makefile.am b/test/packing/Makefile.am new file mode 100644 index 00000000000..a8c6c2dc69f --- /dev/null +++ b/test/packing/Makefile.am @@ -0,0 +1,5 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include + +noinst_PROGRAMS = intpack-test intpack-test2 packing-test +LDADD = $(top_builddir)/libwiredtiger.la +LDFLAGS = -static diff --git a/test/packing/intpack-test.c b/test/packing/intpack-test.c index 109f37e229a..51acea15506 100644 --- a/test/packing/intpack-test.c +++ b/test/packing/intpack-test.c @@ -27,29 +27,29 @@ */ #include <assert.h> -#include <stdlib.h> -#include <time.h> -#include <wt_internal.h> -#include "intpack.i" +#include "wt_internal.h" -int main() { +int +main() +{ + const uint8_t *cp; uint8_t buf[10], *p; - uint64_t r, r2, ncalls; - int i, s; + uint64_t ncalls, r, r2, s; + int i; ncalls = 0; for (i = 0; i < 10000000; i++) { for (s = 0; s < 50; s += 5) { ++ncalls; - r = 1 << s; + r = 1ULL << s; #if 1 p = buf; - __wt_vpack_uint(NULL, &p, sizeof buf, r); - p = buf; - __wt_vunpack_uint(NULL, &p, sizeof buf, &r2); + assert(__wt_vpack_uint(&p, sizeof(buf), r) == 0); + cp = buf; + assert(__wt_vunpack_uint(&cp, sizeof(buf), &r2) == 0); #else /* * Note: use memmove for comparison because GCC does @@ -57,9 +57,9 @@ int main() { * to measure anything. 
*/ p = buf; - memmove(p, &r, sizeof r); - p = buf; - memmove(&r2, p, sizeof r2); + memmove(p, &r, sizeof(r)); + cp = buf; + memmove(&r2, cp, sizeof(r2)); #endif if (r != r2) { fprintf(stderr, "mismatch!\n"); diff --git a/test/packing/intpack-test2.c b/test/packing/intpack-test2.c index 6b54504f367..d9ac9373cea 100644 --- a/test/packing/intpack-test2.c +++ b/test/packing/intpack-test2.c @@ -27,27 +27,26 @@ */ #include <assert.h> -#include <stdlib.h> -#include <time.h> -#include <wt_internal.h> -#include "intpack.i" +#include "wt_internal.h" -int main() { +int +main() +{ uint8_t buf[10], *p, *end; int64_t i; for (i = 1; i < 1LL << 60; i <<= 1) { end = buf; - __wt_vpack_uint(NULL, &end, sizeof buf, i); - printf("%lld ", i); + assert(__wt_vpack_uint(&end, sizeof(buf), (uint64_t)i) == 0); + printf("%" PRId64 " ", i); for (p = buf; p < end; p++) printf("%02x", *p); printf("\n"); end = buf; - __wt_vpack_int(NULL, &end, sizeof buf, -i); - printf("%lld ", -i); + assert(__wt_vpack_int(&end, sizeof(buf), -i) == 0); + printf("%" PRId64 " ", -i); for (p = buf; p < end; p++) printf("%02x", *p); printf("\n"); diff --git a/test/packing/packing-test.c b/test/packing/packing-test.c index 2696e8a008d..32b7d3d17ec 100644 --- a/test/packing/packing-test.c +++ b/test/packing/packing-test.c @@ -27,26 +27,26 @@ */ #include <assert.h> -#include <stdlib.h> -#include <time.h> -#include <wiredtiger.h> -#include <stdarg.h> +#include "wt_internal.h" -void check(const char *fmt, ...) +static void +check(const char *fmt, ...) 
{ char buf[200], *end, *p; va_list ap; size_t len; + len = 0; /* -Werror=maybe-uninitialized */ + va_start(ap, fmt); - len = wiredtiger_struct_sizev(fmt, ap); + assert(__wt_struct_sizev(NULL, &len, fmt, ap) == 0); va_end(ap); - assert(len < sizeof buf); + assert(len < sizeof(buf)); va_start(ap, fmt); - assert(wiredtiger_struct_packv(buf, sizeof buf, fmt, ap) == 0); + assert(__wt_struct_packv(NULL, buf, sizeof(buf), fmt, ap) == 0); va_end(ap); printf("%s ", fmt); @@ -55,7 +55,9 @@ void check(const char *fmt, ...) printf("\n"); } -int main() { +int +main() +{ check("iii", 0, 101, -99); check("3i", 0, 101, -99); check("iS", 42, "forty two"); diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c index f1e4f26c255..1c4d54df9e9 100644 --- a/test/salvage/salvage.c +++ b/test/salvage/salvage.c @@ -447,6 +447,18 @@ run(int r) } /* + * file_exists -- + * Return if the file exists. + */ +static int +file_exists(const char *path) +{ + struct stat sb; + + return (stat(path, &sb) == 0); +} + +/* * build -- * Build a row- or column-store page in a file. */ @@ -529,21 +541,16 @@ build(int ikey, int ivalue, int cnt) } /* - * The first time through this routine we put a matching configuration - * in for the salvage file. + * The first time through this routine we create the salvage file and + * then remove it (all we want is the appropriate schema entry, we're + * creating the salvage file itself by hand). */ - new_slvg = (access(SLVG, F_OK) != 0); + new_slvg = !file_exists(SLVG); if (new_slvg) { assert(session->drop(session, "file:" SLVG, "force") == 0); assert(session->create(session, "file:" SLVG, config) == 0); } - assert(conn->close(conn, 0) == 0); - - /* - * We created the salvage file above, but all we want is the schema, - * we're creating the salvage file by hand. - */ if (new_slvg) (void)remove(SLVG); } @@ -567,12 +574,13 @@ copy(u_int gen, u_int recno) * copy the first sector (the file description). * Otherwise, we are appending to an existing file. 
*/ - if (access(SLVG, F_OK)) { + if (file_exists(SLVG)) + assert((ofp = fopen(SLVG, "a")) != NULL); + else { assert((ofp = fopen(SLVG, "w")) != NULL); assert(fread(buf, 1, PSIZE, ifp) == PSIZE); assert(fwrite(buf, 1, PSIZE, ofp) == PSIZE); - } else - assert((ofp = fopen(SLVG, "a")) != NULL); + } /* * If there's data, copy/update the first formatted page. diff --git a/test/suite/test_bug009.py b/test/suite/test_bug009.py new file mode 100644 index 00000000000..9074d45bafd --- /dev/null +++ b/test/suite/test_bug009.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# +# test_bug009.py +# check that reconciliation takes into account prefix compression +# when figuring out how to split pages +# + +import wiredtiger, wttest +from wiredtiger import stat +from helper import confirm_empty,\ + key_populate, value_populate, simple_populate,\ + complex_populate, complex_value_populate +from wtscenario import multiply_scenarios, number_scenarios + +class test_bug009(wttest.WiredTigerTestCase): + name = 'test_bug009' + uri = 'file:' + name + + def test_reconciliation_prefix_compression(self): + # Configure 4KB pages with prefix compression enabled and support for + # large data items. + self.session.create(self.uri, + 'prefix_compression=1,' + + 'key_format=S,value_format=S,' + + 'internal_page_max=4KB,leaf_page_max=4KB,' + + 'leaf_value_max=3096') + + cursor = self.session.open_cursor(self.uri, None) + # Insert two items with keys that will be prefix compressed and data + # items sized so that the compression size difference tips the + # size over a page boundary. + cursor.set_key('fill_2__b_27') + cursor.set_value(2294 * '0') + cursor.insert() + + cursor.set_key('fill_2__b_28') + cursor.set_value(3022 * '0') + cursor.insert() + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_bug010.py b/test/suite/test_bug010.py new file mode 100644 index 00000000000..31e9777aa8e --- /dev/null +++ b/test/suite/test_bug010.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. 
We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_bug010.py +# check that checkpoints don't leave files marked clean when they +# did not write all updates out. +# + +import wiredtiger, wttest, wtthread +import threading, time + +class test_bug010(wttest.WiredTigerTestCase): + name = 'test_bug010' + uri = 'table:' + name + num_tables = 1000 + + # Overrides WiredTigerTestCase + def setUpConnectionOpen(self, dir): + self.home = dir + # Disable checkpoint sync, to make checkpoints faster and + # increase the likelihood of triggering the symptom + conn_params = ',create,checkpoint_sync=false' + conn = wiredtiger.wiredtiger_open(dir, conn_params) + return conn + + def test_checkpoint_dirty(self): + # Create a lot of tables + # insert the same item in each + # Start a checkpoint with some of the updates + # Create another checkpoint that should contain all data consistently + # Read from the checkpoint and make sure the data is consistent + for i in range(0, self.num_tables): + self.printVerbose(3, 'Creating table ' + str(i)) + self.session.create(self.uri + str(i), + 'key_format=S,value_format=i') + c = self.session.open_cursor(self.uri + str(i), None) + c.set_key('a') + c.set_value(0) + c.insert() + c.close() + + self.session.checkpoint() + + iterations = 1 + expected_val = 0 + for 
its in range(1, 10): + self.printVerbose(3, 'Doing iteration ' + str(its)) + + # Create a checkpoint thread + done = threading.Event() + ckpt = wtthread.checkpoint_thread(self.conn, done) + ckpt.start() + try: + expected_val += 1 + for i in range(0, self.num_tables): + c = self.session.open_cursor(self.uri + str(i), None) + c.set_key('a') + c.set_value(expected_val) + c.insert() + c.close() + finally: + done.set() + ckpt.join() + + # Execute another checkpoint, to make sure we have a consistent + # view of the data. + self.session.checkpoint() + for i in range(0, self.num_tables): + c = self.session.open_cursor( + self.uri + str(i), None, 'checkpoint=WiredTigerCheckpoint') + c.next() + self.assertEquals(c.get_value(), expected_val, + msg='Mismatch on iteration ' + str(its) +\ + ' for table ' + str(i)) + c.close() + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_dump.py b/test/suite/test_dump.py index 4c7e6f667e4..6d81c102028 100644 --- a/test/suite/test_dump.py +++ b/test/suite/test_dump.py @@ -67,6 +67,31 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): scenarios = number_scenarios( multiply_scenarios('.', types, keyfmt, dumpfmt)) + # Extract the values lines from the dump output. + def value_lines(self, fname): + # mode: + # 0 == we are in the header + # 1 == next line is key + # 2 == next line is value + mode = 0 + lines = [] + for line in open(fname).readlines(): + if mode == 0: + if line == 'Data\n': + mode = 1 + elif mode == 1: + mode = 2 + else: + # This is a value line, keep it. + lines.append(line) + mode = 1 + return sorted(lines) + + def compare_dump_values(self, f1, f2): + l1 = self.value_lines(f1) + l2 = self.value_lines(f2) + self.assertEqual(l1, l2) + # Dump, re-load and do a content comparison. def test_dump(self): # Create the object. 
@@ -105,5 +130,14 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): 'load', '-n', '-f', 'dump.out'], errfilename='errfile.out') self.check_non_empty_file('errfile.out') + # If there is are indices, dump one of them and check the output. + if self.populate == complex_populate: + indexuri = 'index:' + self.name + ':indx1' + hexopt = ['-x'] if self.hex == 1 else [] + self.runWt(['-h', self.dir, 'dump'] + hexopt + [indexuri], + outfilename='dumpidx.out') + self.check_non_empty_file('dumpidx.out') + self.compare_dump_values('dump.out', 'dumpidx.out') + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_shared_cache.py b/test/suite/test_shared_cache01.py index ff40d31e6df..e6d712e61bc 100644 --- a/test/suite/test_shared_cache.py +++ b/test/suite/test_shared_cache01.py @@ -33,12 +33,12 @@ import wiredtiger, wttest from wttest import unittest from helper import key_populate, simple_populate -# test_shared_cache.py +# test_shared_cache01.py # Checkpoint tests # Test shared cache shared amongst multiple connections. -class test_shared_cache(wttest.WiredTigerTestCase): +class test_shared_cache01(wttest.WiredTigerTestCase): - uri = 'table:test_shared_cache' + uri = 'table:test_shared_cache01' # Setup fairly large items to use up cache data_str = 'abcdefghijklmnopqrstuvwxyz' * 20 @@ -89,7 +89,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.sessions = [] # Implicitly closed when closing sessions. 
# Basic test of shared cache - def test_shared_cache01(self): + def test_shared_cache_basic(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2']) @@ -99,7 +99,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Test of shared cache with more connections - def test_shared_cache02(self): + def test_shared_cache_more_connections(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2', 'WT_TEST3', 'WT_TEST4']) @@ -109,7 +109,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Do enough work for the shared cache to be fully allocated. - def test_shared_cache03(self): + def test_shared_cache_full(self): nops = 10000 self.openConnections(['WT_TEST1', 'WT_TEST2']) for sess in self.sessions: @@ -121,7 +121,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Switch the work between connections, to test rebalancing. - def test_shared_cache04(self): + def test_shared_cache_rebalance(self): # About 100 MB of data with ~250 byte values. nops = 200000 self.openConnections(['WT_TEST1', 'WT_TEST2']) @@ -132,7 +132,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Add a new connection once the shared cache is already established. - def test_shared_cache05(self): + def test_shared_cache_late_join(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2']) @@ -147,7 +147,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Close a connection and keep using other connections. 
- def test_shared_cache06(self): + def test_shared_cache_leaving(self): nops = 10000 self.openConnections(['WT_TEST1', 'WT_TEST2', 'WT_TEST3']) @@ -163,7 +163,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): # Test verbose output @unittest.skip("Verbose output handling") - def test_shared_cache07(self): + def test_shared_cache_verbose(self): nops = 1000 self.openConnections( ['WT_TEST1', 'WT_TEST2'], extra_opts="verbose=[shared_cache]") @@ -174,7 +174,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Test opening a connection outside of the shared cache - def test_shared_cache08(self): + def test_shared_cache_mixed(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2']) @@ -185,7 +185,7 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.closeConnections() # Test default config values - def test_shared_cache09(self): + def test_shared_cache_defaults(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2'], pool_opts=',shared_cache=(name=pool,size=200M)') @@ -194,21 +194,8 @@ class test_shared_cache(wttest.WiredTigerTestCase): self.add_records(sess, 0, nops) self.closeConnections() - # Test reconfigure API - def test_shared_cache10(self): - nops = 1000 - self.openConnections(['WT_TEST1', 'WT_TEST2']) - - for sess in self.sessions: - sess.create(self.uri, "key_format=S,value_format=S") - self.add_records(sess, 0, nops) - - connection = self.conns[0] - connection.reconfigure("shared_cache=(name=pool,size=300M)") - self.closeConnections() - # Test default config values - def test_shared_cache11(self): + def test_shared_cache_defaults2(self): nops = 1000 self.openConnections(['WT_TEST1', 'WT_TEST2'], pool_opts=',shared_cache=(name=pool)') diff --git a/test/suite/test_shared_cache02.py b/test/suite/test_shared_cache02.py new file mode 100644 index 00000000000..3806e9d0cda --- /dev/null +++ b/test/suite/test_shared_cache02.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, 
Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# If unittest2 is available, use it in preference to (the old) unittest + +import os +import shutil +import wiredtiger, wttest +from wttest import unittest +from helper import key_populate, simple_populate + +# test_shared_cache02.py +# Shared cache tests +# Test shared cache shared amongst multiple connections. 
+class test_shared_cache02(wttest.WiredTigerTestCase): + + uri = 'table:test_shared_cache02' + # Setup fairly large items to use up cache + data_str = 'abcdefghijklmnopqrstuvwxyz' * 20 + + # Add a set of records + def add_records(self, session, start, stop): + cursor = session.open_cursor(self.uri, None, "overwrite") + for i in range(start, stop+1): + cursor.set_key("%010d KEY------" % i) + cursor.set_value("%010d VALUE "% i + self.data_str) + self.assertEqual(cursor.insert(), 0) + cursor.close() + + # Disable default setup/shutdown steps - connections are managed manually. + def setUpSessionOpen(self, conn): + return None + + def close_conn(self): + return None + + def setUpConnectionOpen(self, dir): + return None + + def openConnections( + self, + connections, + pool_opts = ',shared_cache=(name=pool,size=200M,chunk=10M,reserve=30M),', + extra_opts = '', + add=0): + if add == 0: + self.conns = [] + self.sessions = [] + # Open the set of connections. + for name in connections: + shutil.rmtree(name, True) + os.mkdir(name) + next_conn = wiredtiger.wiredtiger_open( + name, + 'create,error_prefix="' + self.shortid() + ': "' + + pool_opts + extra_opts) + self.conns.append(next_conn) + self.sessions.append(next_conn.open_session(None)) + return None + + def closeConnections(self): + for tmp_conn in self.conns: + tmp_conn.close() + self.conns = [] + self.sessions = [] # Implicitly closed when closing sessions. 
+ + # Test reconfigure API + def test_shared_cache_reconfig01(self): + nops = 1000 + self.openConnections(['WT_TEST1', 'WT_TEST2']) + + for sess in self.sessions: + sess.create(self.uri, "key_format=S,value_format=S") + self.add_records(sess, 0, nops) + + connection = self.conns[0] + connection.reconfigure("shared_cache=(name=pool,size=300M)") + self.closeConnections() + + # Test reconfigure that grows the usage over quota fails + def test_shared_cache_reconfig02(self): + nops = 1000 + self.openConnections(['WT_TEST1', 'WT_TEST2'], + pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),') + + for sess in self.sessions: + sess.create(self.uri, "key_format=S,value_format=S") + self.add_records(sess, 0, nops) + + connection = self.conns[0] + # Reconfigure to over-subscribe, call should fail with an error + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: connection.reconfigure("shared_cache=(name=pool,reserve=40M)"), + '/Shared cache unable to accommodate this configuration/') + # TODO: Ensure that the reserve size wasn't updated. + # cursor = self.sessions[0].open_cursor('config:', None, None) + # value = cursor['connection'] + # self.assertTrue(value.find('reserve') != -1) + + self.closeConnections() + + # Test reconfigure that would grow the usage over quota if the + # previous reserve size isn't taken into account + def test_shared_cache_reconfig03(self): + nops = 1000 + self.openConnections(['WT_TEST1', 'WT_TEST2'], + pool_opts = ',shared_cache=(name=pool,size=50M,reserve=20M),') + + for sess in self.sessions: + sess.create(self.uri, "key_format=S,value_format=S") + self.add_records(sess, 0, nops) + + connection = self.conns[0] + + connection.reconfigure("shared_cache=(name=pool,reserve=30M)"), + + # TODO: Ensure that the reserve size was updated. 
+ # cursor = self.sessions[0].open_cursor('config:', None, None) + # value = cursor['connection'] + # self.assertTrue(value.find('reserve') != -1) + + self.closeConnections() + + # Test reconfigure that switches to using a shared cache + # previous reserve size isn't taken into account + def test_shared_cache_reconfig03(self): + nops = 1000 + self.openConnections(['WT_TEST1', 'WT_TEST2'], pool_opts = ',') + + for sess in self.sessions: + sess.create(self.uri, "key_format=S,value_format=S") + self.add_records(sess, 0, nops) + + self.conns[0].reconfigure("shared_cache=(name=pool,reserve=20M)"), + self.conns[1].reconfigure("shared_cache=(name=pool,reserve=20M)"), + + # TODO: Ensure that the reserve size was updated. + # cursor = self.sessions[0].open_cursor('config:', None, None) + # value = cursor['connection'] + # self.assertTrue(value.find('reserve') != -1) + + self.closeConnections() + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_stat01.py b/test/suite/test_stat01.py index 0f072a7c473..0b778d63b9d 100644 --- a/test/suite/test_stat01.py +++ b/test/suite/test_stat01.py @@ -28,6 +28,8 @@ import helper, wiredtiger, wttest from wiredtiger import stat +from helper import key_populate, simple_populate +from wtscenario import multiply_scenarios, number_scenarios # test_stat01.py # Statistics operations @@ -36,17 +38,23 @@ class test_stat01(wttest.WiredTigerTestCase): Test statistics """ - tablename = 'test_stat01.wt' - uri = 'file:' + tablename - config = 'key_format=S,' +\ - 'allocation_size=512,internal_page_max=16K,leaf_page_max=128K' + config = 'internal_page_max=4K,leaf_page_max=8K' nentries = 25 + types = [ + ('file', dict(uri='file:test_stat01.wt')), + ('table', dict(uri='table:test_stat01.wt')) + ] + keyfmt = [ + ('recno', dict(keyfmt='r')), + ('string', dict(keyfmt='S')), + ] + scenarios = number_scenarios(multiply_scenarios('.', types, keyfmt)) + # Override WiredTigerTestCase, we have extensions. 
def setUpConnectionOpen(self, dir): conn = wiredtiger.wiredtiger_open(dir, - 'create,statistics=(fast),' + - 'error_prefix="%s: "' % self.shortid()) + 'create,statistics=(all),' + 'error_prefix="%s: "' % self.shortid()) return conn def statstr_to_int(self, str): @@ -57,17 +65,17 @@ class test_stat01(wttest.WiredTigerTestCase): parts = str.rpartition('(') return int(parts[2].rstrip(')')) - def check_stats(self, statcursor, mincount, lookfor): - """ - Do a quick check of the entries in the the stats cursor, - There should be at least 'mincount' entries, - and the 'lookfor' string should appear - """ + # Do a quick check of the entries in the the stats cursor, the "lookfor" + # string should appear with a minimum value of least "min". + def check_stats(self, statcursor, min, lookfor): stringclass = ''.__class__ intclass = (0).__class__ - # make sure statistics basically look right - count = 0 + + # Reset the cursor, we're called multiple times. + statcursor.reset() + found = False + foundval = 0 for id, desc, valstr, val in statcursor: self.assertEqual(type(desc), stringclass) self.assertEqual(type(valstr), stringclass) @@ -75,68 +83,76 @@ class test_stat01(wttest.WiredTigerTestCase): self.assertEqual(val, self.statstr_to_int(valstr)) self.printVerbose(2, ' stat: \'' + desc + '\', \'' + valstr + '\', ' + str(val)) - count += 1 if desc == lookfor: found = True - self.assertTrue(count > mincount) + foundval = val + self.assertTrue(found, 'in stats, did not see: ' + lookfor) + self.assertTrue(foundval >= min) + # Test simple connection statistics. def test_basic_conn_stats(self): - self.printVerbose(2, 'overall database stats:') + # Build an object and force some writes. + config = self.config + ',key_format=' + self.keyfmt + simple_populate(self, self.uri, config, 1000) + self.session.checkpoint(None) + + # See that we can get a specific stat value by its key and verify its + # entry is self-consistent. 
allstat_cursor = self.session.open_cursor('statistics:', None, None) self.check_stats(allstat_cursor, 10, 'block-manager: blocks written') - # See that we can get a specific stat value by its key, - # and verify that its entry is self-consistent values = allstat_cursor[stat.conn.block_write] self.assertEqual(values[0], 'block-manager: blocks written') val = self.statstr_to_int(values[1]) self.assertEqual(val, values[2]) allstat_cursor.close() + # Test simple object statistics. def test_basic_data_source_stats(self): - self.session.create(self.uri, self.config) + # Build an object. + config = self.config + ',key_format=' + self.keyfmt + self.session.create(self.uri, config) cursor = self.session.open_cursor(self.uri, None, None) value = "" - for i in range(0, self.nentries): - key = str(i) - value = value + key + value # size grows exponentially - cursor.set_key(key) + for i in range(1, self.nentries): + value = value + 1000 * "a" + cursor.set_key(key_populate(cursor, i)) cursor.set_value(value) cursor.insert() cursor.close() - self.printVerbose(2, 'data source specific stats:') - cursor = self.session.open_cursor( - 'statistics:' + self.uri, None, None) + # Force the object to disk, otherwise we can't check the overflow count. + self.reopen_conn() + + # See that we can get a specific stat value by its key and verify its + # entry is self-consistent. 
+ cursor = self.session.open_cursor('statistics:' + self.uri, None, None) + self.check_stats(cursor, 8192, 'btree: maximum leaf page size') + self.check_stats(cursor, 4096, 'btree: maximum internal page size') self.check_stats(cursor, 10, 'btree: overflow pages') - # See that we can get a specific stat value by its key, - # and verify that its entry is self-consistent values = cursor[stat.dsrc.btree_overflow] self.assertEqual(values[0], 'btree: overflow pages') val = self.statstr_to_int(values[1]) self.assertEqual(val, values[2]) cursor.close() - def test_missing_file_stats(self): - self.assertRaises(wiredtiger.WiredTigerError, lambda: - self.session.open_cursor('statistics:file:DoesNotExist')) - + # Test simple per-checkpoint statistics. def test_checkpoint_stats(self): - nentries = 0 - last_size = 0 for name in ('first', 'second', 'third'): - helper.simple_populate(self, self.uri, self.config, nentries) - nentries += self.nentries + config = self.config + ',key_format=' + self.keyfmt + helper.simple_populate(self, self.uri, config, self.nentries) self.session.checkpoint('name=' + name) cursor = self.session.open_cursor( 'statistics:' + self.uri, None, 'checkpoint=' + name) - size = cursor[stat.dsrc.btree_overflow][1] - self.assertTrue(size >= last_size) - last_size = size + self.assertEqual( + cursor[stat.dsrc.btree_entries][2], self.nentries + 1) cursor.close() - self.session.truncate(self.uri, None, None) + + def test_missing_file_stats(self): + self.assertRaises(wiredtiger.WiredTigerError, lambda: + self.session.open_cursor('statistics:file:DoesNotExist')) if __name__ == '__main__': wttest.run() |