summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Alex Gorrod <alexander.gorrod@mongodb.com> 2015-05-07 13:40:19 +1000
committer Michael Cahill <michael.cahill@mongodb.com> 2015-05-25 16:39:39 +1000
commit a89948f9239d3c45869214e1280f7835d3fccb88 (patch)
tree b0ef3fe2d5a55a070ce2be462c608a5a734b3a02
parent bf0408ec3310f5a81746169cb12781c931866bd1 (diff)
download mongo-a89948f9239d3c45869214e1280f7835d3fccb88.tar.gz
Merge pull request #1946 from wiredtiger/checkpoint-rewrite
Fix checkpoint visiting a page that was rewritten in memory.

(cherry picked from commit f3e73908a9e19a64648f509c5e0417b05853bca2)
-rw-r--r-- src/btree/bt_debug.c 3
-rw-r--r-- src/btree/bt_discard.c 1
-rw-r--r-- src/btree/bt_split.c 1
-rw-r--r-- src/btree/bt_sync.c 3
-rw-r--r-- src/evict/evict_page.c 35
-rw-r--r-- src/include/btmem.h 4
-rw-r--r-- src/reconcile/rec_write.c 12
7 files changed, 36 insertions, 23 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index fa7cff35e5f..dba2da223bd 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -648,6 +648,9 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
case WT_PM_REC_REPLACE:
__dmsg(ds, ", replaced");
break;
+ case WT_PM_REC_REWRITE:
+ __dmsg(ds, ", rewrite");
+ break;
case 0:
break;
WT_ILLEGAL_VALUE(session);
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index 05a54ad643e..2a0a5e37f98 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -150,6 +150,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
case WT_PM_REC_MULTIBLOCK:
+ case WT_PM_REC_REWRITE:
/* Free list of replacement blocks. */
for (multi = mod->mod_multi,
i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 9fc567f02c1..f5c3d5fa331 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -646,7 +646,6 @@ __split_multi_inmem(
* when discarding the original page, and our caller will discard the
* allocated page on error, when discarding the allocated WT_REF.
*/
-
WT_RET(__wt_page_inmem(session, ref,
multi->skip_dsk, ((WT_PAGE_HEADER *)multi->skip_dsk)->mem_size,
WT_PAGE_DISK_ALLOC, &page));
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index dae2dd8d480..71b0d0abdb3 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -150,7 +150,8 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
*/
if (!WT_PAGE_IS_INTERNAL(page) &&
F_ISSET(txn, TXN_HAS_SNAPSHOT) &&
- TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
+ TXNID_LT(txn->snap_max, mod->first_dirty_txn) &&
+ !F_ISSET(mod, WT_PM_REC_REWRITE)) {
__wt_page_modify_set(session, page);
continue;
}
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 37612bda7e6..e9d1616149e 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -195,25 +195,10 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
/*
- * There are two cases in this code.
- *
- * First, an in-memory page that got too large, we forcibly
- * evicted it, and there wasn't anything to write. (Imagine two
- * threads updating a small set keys on a leaf page. The page is
- * too large so we try to evict it, but after reconciliation
- * there's only a small amount of data (so it's a single page we
- * can't split), and because there are two threads, there's some
- * data we can't write (so we can't evict it). In that case, we
- * take advantage of the fact we have exclusive access to the
- * page and rewrite it in memory.)
- *
- * Second, a real split where we reconciled a page and it turned
- * into a lot of pages.
+ * A real split where we reconciled a page and it turned into a
+ * lot of pages.
*/
- if (mod->mod_multi_entries == 1)
- WT_RET(__wt_split_rewrite(session, ref));
- else
- WT_RET(__wt_split_multi(session, ref, exclusive));
+ WT_RET(__wt_split_multi(session, ref, exclusive));
break;
case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
@@ -236,6 +221,20 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
ref->addr = addr;
WT_PUBLISH(ref->state, WT_REF_DISK);
break;
+ case WT_PM_REC_REWRITE:
+ /*
+ * An in-memory page that got too large, we forcibly evicted
+ * it, and there wasn't anything to write. (Imagine two threads
+ * updating a small set keys on a leaf page. The page is too
+ * large so we try to evict it, but after reconciliation
+ * there's only a small amount of data (so it's a single page
+ * we can't split), and because there are two threads, there's
+ * some data we can't write (so we can't evict it). In that
+ * case, we take advantage of the fact we have exclusive access
+ * to the page and rewrite it in memory.)
+ */
+ WT_RET(__wt_split_rewrite(session, ref));
+ break;
WT_ILLEGAL_VALUE(session);
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index eecbbed176e..303162fcc93 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -356,8 +356,10 @@ struct __wt_page_modify {
#define WT_PM_REC_EMPTY 0x01 /* Reconciliation: no replacement */
#define WT_PM_REC_MULTIBLOCK 0x02 /* Reconciliation: multiple blocks */
#define WT_PM_REC_REPLACE 0x04 /* Reconciliation: single block */
+#define WT_PM_REC_REWRITE 0x08 /* Reconciliation: rewrite in place */
#define WT_PM_REC_MASK \
- (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | WT_PM_REC_REPLACE)
+ (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | \
+ WT_PM_REC_REPLACE | WT_PM_REC_REWRITE)
uint8_t flags; /* Page flags */
};
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index ba0479bf974..573ea8811f8 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -483,6 +483,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
case WT_PM_REC_EMPTY: /* Page is empty */
case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ case WT_PM_REC_REWRITE: /* Rewrite */
return (0);
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
break;
@@ -3269,6 +3270,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
case WT_PM_REC_REPLACE:
addr = &child->modify->mod_replace;
break;
+ case WT_PM_REC_REWRITE:
+ break;
WT_ILLEGAL_VALUE_ERR(session);
}
} else
@@ -4819,6 +4822,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
case WT_PM_REC_EMPTY: /* Page deleted */
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ case WT_PM_REC_REWRITE: /* Rewrite */
/*
* Discard the multiple replacement blocks.
*/
@@ -4897,7 +4901,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
bnd->dsk = NULL;
mod->mod_multi_entries = 1;
- F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ F_SET(mod, WT_PM_REC_REWRITE);
break;
}
@@ -5047,10 +5051,14 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* information (otherwise we might think the backing block is being
* reused on a subsequent reconciliation where we want to free it).
*/
- if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_MULTIBLOCK)
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_MULTIBLOCK:
+ case WT_PM_REC_REWRITE:
for (multi = mod->mod_multi,
i = 0; i < mod->mod_multi_entries; ++multi, ++i)
multi->addr.reuse = 0;
+ break;
+ }
/*
* On error, discard blocks we've written, they're unreferenced by the