summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Bostic <keith@wiredtiger.com>2014-11-01 12:59:29 -0400
committerKeith Bostic <keith@wiredtiger.com>2014-11-01 12:59:29 -0400
commit54b3d551f2f89c75c451520879b91701eeee7c06 (patch)
tree4ed6fc2b23506a866fca1fbad506361f085a7e73
parent5d9ddea0691679d7fad291c865b534fd55230c9d (diff)
downloadmongo-54b3d551f2f89c75c451520879b91701eeee7c06.tar.gz
When a workload forcibly evicts a large page, but most of the page is discarded during reconciliation and the result doesn't split into multiple chunks, reconciliation gave up (returning EBUSY) and left the page in place. Instead, instantiate the new single page in memory and swap it into place to replace the previous version. Reference #1317.
-rw-r--r--dist/stat_data.py3
-rw-r--r--src/btree/rec_split.c86
-rw-r--r--src/btree/rec_write.c109
-rw-r--r--src/include/stat.h1
-rw-r--r--src/include/wiredtiger.in31
-rw-r--r--src/support/stat.c3
6 files changed, 126 insertions, 107 deletions
diff --git a/dist/stat_data.py b/dist/stat_data.py
index a07dd9c81cb..d64d2c165e4 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -162,9 +162,6 @@ connection_stats = [
Stat('rec_pages', 'reconciliation: page reconciliation calls'),
Stat('rec_pages_eviction',
'reconciliation: page reconciliation calls for eviction'),
- Stat('rec_skipped_update',
- 'reconciliation: reconciliation failed because an update' +
- ' could not be included'),
Stat('rec_split_stashed_bytes',
'reconciliation: split bytes currently awaiting free',
'no_clear,no_scale'),
diff --git a/src/btree/rec_split.c b/src/btree/rec_split.c
index ae2f662a570..babec2cc295 100644
--- a/src/btree/rec_split.c
+++ b/src/btree/rec_split.c
@@ -654,23 +654,16 @@ __split_inmem_build(
/*
* We can find unresolved updates when attempting to evict a page, which
- * cannot be written. We could fail those evictions, but if the page is
- * never quiescent and is growing too large for the cache, we can only
- * avoid the problem for so long. The solution is to split those pages
- * into many on-disk chunks we write, plus some on-disk chunks we don't
- * write. This code deals with the latter: any chunk we didn't write is
- * re-created as an in-memory page, then we apply the unresolved updates
- * to that page.
- */
- WT_RET(__wt_page_inmem(
- session, ref, multi->skip_dsk, WT_PAGE_DISK_ALLOC, &page));
-
- /*
+ * can't be written. This code re-creates the in-memory page and applies
+ * the unresolved updates to that page.
+ *
* Clear the disk image and link the page into the passed-in WT_REF to
* simplify error handling: our caller will not discard the disk image
* when discarding the original page, and our caller will discard the
* allocated page on error, when discarding the allocated WT_REF.
*/
+ WT_RET(__wt_page_inmem(
+ session, ref, multi->skip_dsk, WT_PAGE_DISK_ALLOC, &page));
multi->skip_dsk = NULL;
if (orig->type == WT_PAGE_ROW_LEAF)
@@ -811,11 +804,11 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
}
/*
- * __wt_split_evict --
- * Resolve a page split, inserting new information into the parent.
+ * __split_evict_multi --
+ * Resolve a multi-page split, inserting new information into the parent.
*/
-int
-__wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+static int
+__split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
{
WT_DECL_RET;
WT_IKEY *ikey;
@@ -1065,3 +1058,64 @@ err: if (locked)
*/
return (ret == WT_PANIC || !complete ? ret : 0);
}
+
+/*
+ * __split_evict_single --
+ * Resolve a single page split, replacing a page with a new version.
+ *
+ * The page's first (index 0) replacement block is re-created as an
+ * in-memory page, the original page is discarded, and the new page is
+ * linked into the caller's WT_REF. Returns 0 on success, or the error
+ * returned while building the new page.
+ */
+static int
+__split_evict_single(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF new;
+
+ page = ref->page;
+ mod = page->modify;
+
+ /*
+ * Build the new page. The WT_REF is a zeroed, stack-allocated holder;
+ * the only field read back from it below is the page pointer.
+ *
+ * NOTE(review): if the build fails after a page has been linked into
+ * the holder, nothing in this function discards that page -- confirm
+ * the build routine cleans up after itself, otherwise this error path
+ * can leak the newly allocated page.
+ */
+ memset(&new, 0, sizeof(new));
+ WT_RET(__split_inmem_build(session, page, &new, &mod->mod_multi[0]));
+
+ /*
+ * Discard the original page. Pages with unresolved changes are not
+ * marked clean during reconciliation, do it now: reset the write
+ * generation and decrement the cache's dirty accounting before the
+ * page is released.
+ */
+ mod->write_gen = 0;
+ __wt_cache_dirty_decr(session, page);
+ __wt_page_out(session, &page);
+
+ /*
+ * Swap the new page into place. The page pointer is stored before the
+ * state is set to WT_REF_MEM; presumably WT_PUBLISH orders the two
+ * stores for other threads -- TODO confirm against its definition.
+ */
+ ref->page = new.page;
+ WT_PUBLISH(ref->state, WT_REF_MEM);
+
+ return (0);
+}
+
+/*
+ * __wt_split_evict --
+ * Resolve a page split.
+ *
+ * Dispatches on the number of replacement blocks recorded in the page's
+ * modify structure: exactly one block is rewritten in memory, any other
+ * count goes through the multi-page split path. The exclusive argument
+ * is only passed through to the multi-page path. Returns whatever the
+ * selected routine returns.
+ */
+int
+__wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ uint32_t split_entries;
+
+ /*
+ * There are two cases entering this code. First, an in-memory page that
+ * got too large, we forcibly evicted it, and there wasn't anything to
+ * write. (Imagine two threads updating a small set of keys on a leaf
+ * page. The page is too large so we try to evict it, but after
+ * reconciliation there's only a small amount of data (so it's a single
+ * page we can't split), and because there are two threads, there's some
+ * data we can't write (so we can't evict it). In that case, we take
+ * advantage of the fact we have exclusive access to the page and
+ * rewrite it in memory.)
+ *
+ * Second, a real split where we reconciled a page and it turned into a
+ * lot of pages.
+ */
+ split_entries = ref->page->modify->mod_multi_entries;
+ return (split_entries == 1 ?
+ __split_evict_single(session, ref) :
+ __split_evict_multi(session, ref, exclusive));
+}
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index 1dbe6ffc81f..bd691dffce2 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -2011,30 +2011,6 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * __rec_skipped_update_chk --
- * Return if a skipped update makes this a waste of time.
- */
-static inline int
-__rec_skipped_update_chk(WT_SESSION_IMPL *session, WT_RECONCILE *r)
-{
- /*
- * If we're doing an eviction, and we skipped an update, it only pays
- * off to continue if we're writing multiple blocks, that is, we'll be
- * able to evict something. This should be unlikely (why did eviction
- * choose a recently written, small block), but it's possible. Our
- * caller is responsible for calling us at the right moment, when all
- * of the rows have been reviewed and we're about to finalize a write.
- */
- if (F_ISSET(r, WT_SKIP_UPDATE_RESTORE) &&
- r->bnd_next == 0 && r->leave_dirty) {
- WT_STAT_FAST_CONN_INCR(session, rec_skipped_update);
- WT_STAT_FAST_DATA_INCR(session, rec_skipped_update);
- return (EBUSY);
- }
- return (0);
-}
-
-/*
* __rec_split_raw_worker --
* Handle the raw compression page reconciliation bookkeeping.
*/
@@ -2393,10 +2369,6 @@ no_slots:
return (0);
}
- /* Check if a skipped update makes this a waste of time. */
- if (last_block)
- WT_RET(__rec_skipped_update_chk(session, r));
-
/* We have a block, update the boundary counter. */
++r->bnd_next;
@@ -2501,20 +2473,21 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
WT_RET(__rec_split_bnd_grow(session, r));
break;
case SPLIT_TRACKING_RAW:
+ /*
+ * We were configured for raw compression, but never actually
+ * wrote anything.
+ */
+ break;
WT_ILLEGAL_VALUE(session);
}
- /* Check if a skipped update makes this a waste of time. */
- WT_RET(__rec_skipped_update_chk(session, r));
-
/*
* We only arrive here with no entries to write if the page was entirely
- * empty; if the page was empty, we merge it into its parent during the
- * parent's reconciliation. This check is done after checking skipped
- * updates, we could have a page that's empty only because we skipped
- * all of the updates.
+ * empty, and if the page is empty, we merge it into its parent during
+ * the parent's reconciliation. A page with skipped updates isn't truly
+ * empty, continue on.
*/
- if (r->entries == 0)
+ if (r->entries == 0 && r->skip == NULL)
return (0);
/* Set the boundary reference and increment the count. */
@@ -2534,34 +2507,21 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * __rec_split_finish_raw --
- * Finish processing page, raw compression version.
- */
-static inline int
-__rec_split_finish_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r)
-{
- /* Check if a skipped update makes this a waste of time. */
- if (r->entries == 0)
- WT_RET(__rec_skipped_update_chk(session, r));
-
- while (r->entries != 0)
- WT_RET(__rec_split_raw_worker(session, r, 1));
- return (0);
-}
-
-/*
* __rec_split_finish --
* Finish processing a page.
*/
-static inline int
+static int
__rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
/*
* We're done reconciling a page.
*/
- return (r->raw_compression ?
- __rec_split_finish_raw(session, r) :
- __rec_split_finish_std(session, r));
+ if (r->entries == 0 || !r->raw_compression)
+ return (__rec_split_finish_std(session, r));
+
+ while (r->entries != 0)
+ WT_RET(__rec_split_raw_worker(session, r, 1));
+ return (0);
}
/*
@@ -2674,9 +2634,10 @@ __rec_split_write(WT_SESSION_IMPL *session,
/* Set the zero-length value flag in the page header. */
if (dsk->type == WT_PAGE_ROW_LEAF) {
F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE);
- if (r->all_empty_value)
+
+ if (r->entries != 0 && r->all_empty_value)
F_SET(dsk, WT_PAGE_EMPTY_V_ALL);
- if (!r->any_empty_value)
+ if (r->entries != 0 && !r->any_empty_value)
F_SET(dsk, WT_PAGE_EMPTY_V_NONE);
}
@@ -4677,6 +4638,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_BM *bm;
WT_BOUNDARY *bnd;
WT_BTREE *btree;
+ WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
WT_REF *ref;
size_t addr_size;
@@ -4783,13 +4745,28 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* Because WiredTiger's pages grow without splitting, we're
* replacing a single page with another single page most of
* the time.
- *
- * We should not be saving/restoring changes for this page in
- * this case, we should have returned failure before writing
- * any blocks.
*/
bnd = &r->bnd[0];
- WT_ASSERT(session, bnd->skip == NULL);
+
+ /*
+ * If we're saving/restoring changes for this page, there's
+ * nothing to write. Allocate, then initialize the array of
+ * replacement blocks.
+ */
+ if (bnd->skip != NULL) {
+ WT_RET(__wt_calloc_def(
+ session, r->bnd_next, &mod->mod_multi));
+ multi = mod->mod_multi;
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ mod->mod_multi_entries = 1;
+
+ F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ break;
+ }
/*
* If this is a root page, then we don't have an address and we
@@ -4992,8 +4969,7 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
}
/* Allocate, then initialize the array of replacement blocks. */
- WT_RET(__wt_calloc(
- session, r->bnd_next, sizeof(WT_MULTI), &mod->mod_multi));
+ WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi));
for (multi = mod->mod_multi,
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
@@ -5034,8 +5010,7 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
mod = page->modify;
/* Allocate, then initialize the array of replacement blocks. */
- WT_RET(__wt_calloc(
- session, r->bnd_next, sizeof(WT_MULTI), &mod->mod_multi));
+ WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi));
for (multi = mod->mod_multi,
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
diff --git a/src/include/stat.h b/src/include/stat.h
index ac0d03e8a88..d34009516a2 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -221,7 +221,6 @@ struct __wt_connection_stats {
WT_STATS read_io;
WT_STATS rec_pages;
WT_STATS rec_pages_eviction;
- WT_STATS rec_skipped_update;
WT_STATS rec_split_stashed_bytes;
WT_STATS rec_split_stashed_objects;
WT_STATS rwlock_read;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index a5cd9467b29..244a034d615 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -3205,37 +3205,34 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_REC_PAGES 1092
/*! reconciliation: page reconciliation calls for eviction */
#define WT_STAT_CONN_REC_PAGES_EVICTION 1093
-/*! reconciliation: reconciliation failed because an update could not be
- * included */
-#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1094
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1095
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1094
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1096
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1095
/*! conn: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1097
+#define WT_STAT_CONN_RWLOCK_READ 1096
/*! conn: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1098
+#define WT_STAT_CONN_RWLOCK_WRITE 1097
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1099
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1098
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1100
+#define WT_STAT_CONN_SESSION_OPEN 1099
/*! txn: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1101
+#define WT_STAT_CONN_TXN_BEGIN 1100
/*! txn: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1102
+#define WT_STAT_CONN_TXN_CHECKPOINT 1101
/*! txn: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1103
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1102
/*! txn: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1104
+#define WT_STAT_CONN_TXN_COMMIT 1103
/*! txn: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1105
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1104
/*! txn: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1106
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1105
/*! txn: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1107
+#define WT_STAT_CONN_TXN_ROLLBACK 1106
/*! conn: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1108
+#define WT_STAT_CONN_WRITE_IO 1107
/*!
* @}
diff --git a/src/support/stat.c b/src/support/stat.c
index 411515e6b03..030c4c563a9 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -446,8 +446,6 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
stats->rec_pages.desc = "reconciliation: page reconciliation calls";
stats->rec_pages_eviction.desc =
"reconciliation: page reconciliation calls for eviction";
- stats->rec_skipped_update.desc =
- "reconciliation: reconciliation failed because an update could not be included";
stats->rec_split_stashed_bytes.desc =
"reconciliation: split bytes currently awaiting free";
stats->rec_split_stashed_objects.desc =
@@ -562,7 +560,6 @@ __wt_stat_refresh_connection_stats(void *stats_arg)
stats->read_io.v = 0;
stats->rec_pages.v = 0;
stats->rec_pages_eviction.v = 0;
- stats->rec_skipped_update.v = 0;
stats->rwlock_read.v = 0;
stats->rwlock_write.v = 0;
stats->txn_begin.v = 0;