diff options
| author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2015-05-07 13:40:19 +1000 |
|---|---|---|
| committer | Michael Cahill <michael.cahill@mongodb.com> | 2015-05-25 16:39:39 +1000 |
| commit | a89948f9239d3c45869214e1280f7835d3fccb88 (patch) | |
| tree | b0ef3fe2d5a55a070ce2be462c608a5a734b3a02 | |
| parent | bf0408ec3310f5a81746169cb12781c931866bd1 (diff) | |
| download | mongo-a89948f9239d3c45869214e1280f7835d3fccb88.tar.gz | |
Merge pull request #1946 from wiredtiger/checkpoint-rewrite
Fix checkpoint visiting a page that was rewritten in memory
(cherry picked from commit f3e73908a9e19a64648f509c5e0417b05853bca2)
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/btree/bt_debug.c | 3 |
| -rw-r--r-- | src/btree/bt_discard.c | 1 |
| -rw-r--r-- | src/btree/bt_split.c | 1 |
| -rw-r--r-- | src/btree/bt_sync.c | 3 |
| -rw-r--r-- | src/evict/evict_page.c | 35 |
| -rw-r--r-- | src/include/btmem.h | 4 |
| -rw-r--r-- | src/reconcile/rec_write.c | 12 |
7 files changed, 36 insertions, 23 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index fa7cff35e5f..dba2da223bd 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -648,6 +648,9 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) case WT_PM_REC_REPLACE: __dmsg(ds, ", replaced"); break; + case WT_PM_REC_REWRITE: + __dmsg(ds, ", rewrite"); + break; case 0: break; WT_ILLEGAL_VALUE(session); diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 05a54ad643e..2a0a5e37f98 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -150,6 +150,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) switch (F_ISSET(mod, WT_PM_REC_MASK)) { case WT_PM_REC_MULTIBLOCK: + case WT_PM_REC_REWRITE: /* Free list of replacement blocks. */ for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 9fc567f02c1..f5c3d5fa331 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -646,7 +646,6 @@ __split_multi_inmem( * when discarding the original page, and our caller will discard the * allocated page on error, when discarding the allocated WT_REF. 
*/ - WT_RET(__wt_page_inmem(session, ref, multi->skip_dsk, ((WT_PAGE_HEADER *)multi->skip_dsk)->mem_size, WT_PAGE_DISK_ALLOC, &page)); diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index dae2dd8d480..71b0d0abdb3 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -150,7 +150,8 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) */ if (!WT_PAGE_IS_INTERNAL(page) && F_ISSET(txn, TXN_HAS_SNAPSHOT) && - TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { + TXNID_LT(txn->snap_max, mod->first_dirty_txn) && + !F_ISSET(mod, WT_PM_REC_REWRITE)) { __wt_page_modify_set(session, page); continue; } diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 37612bda7e6..e9d1616149e 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -195,25 +195,10 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) break; case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ /* - * There are two cases in this code. - * - * First, an in-memory page that got too large, we forcibly - * evicted it, and there wasn't anything to write. (Imagine two - * threads updating a small set keys on a leaf page. The page is - * too large so we try to evict it, but after reconciliation - * there's only a small amount of data (so it's a single page we - * can't split), and because there are two threads, there's some - * data we can't write (so we can't evict it). In that case, we - * take advantage of the fact we have exclusive access to the - * page and rewrite it in memory.) - * - * Second, a real split where we reconciled a page and it turned - * into a lot of pages. + * A real split where we reconciled a page and it turned into a + * lot of pages. 
*/ - if (mod->mod_multi_entries == 1) - WT_RET(__wt_split_rewrite(session, ref)); - else - WT_RET(__wt_split_multi(session, ref, exclusive)); + WT_RET(__wt_split_multi(session, ref, exclusive)); break; case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) { @@ -236,6 +221,20 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) ref->addr = addr; WT_PUBLISH(ref->state, WT_REF_DISK); break; + case WT_PM_REC_REWRITE: + /* + * An in-memory page that got too large, we forcibly evicted + * it, and there wasn't anything to write. (Imagine two threads + * updating a small set keys on a leaf page. The page is too + * large so we try to evict it, but after reconciliation + * there's only a small amount of data (so it's a single page + * we can't split), and because there are two threads, there's + * some data we can't write (so we can't evict it). In that + * case, we take advantage of the fact we have exclusive access + * to the page and rewrite it in memory.) 
+ */ + WT_RET(__wt_split_rewrite(session, ref)); + break; WT_ILLEGAL_VALUE(session); } diff --git a/src/include/btmem.h b/src/include/btmem.h index eecbbed176e..303162fcc93 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -356,8 +356,10 @@ struct __wt_page_modify { #define WT_PM_REC_EMPTY 0x01 /* Reconciliation: no replacement */ #define WT_PM_REC_MULTIBLOCK 0x02 /* Reconciliation: multiple blocks */ #define WT_PM_REC_REPLACE 0x04 /* Reconciliation: single block */ +#define WT_PM_REC_REWRITE 0x08 /* Reconciliation: rewrite in place */ #define WT_PM_REC_MASK \ - (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | WT_PM_REC_REPLACE) + (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | \ + WT_PM_REC_REPLACE | WT_PM_REC_REWRITE) uint8_t flags; /* Page flags */ }; diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index ba0479bf974..573ea8811f8 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -483,6 +483,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) switch (F_ISSET(mod, WT_PM_REC_MASK)) { case WT_PM_REC_EMPTY: /* Page is empty */ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ + case WT_PM_REC_REWRITE: /* Rewrite */ return (0); case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ break; @@ -3269,6 +3270,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) case WT_PM_REC_REPLACE: addr = &child->modify->mod_replace; break; + case WT_PM_REC_REWRITE: + break; WT_ILLEGAL_VALUE_ERR(session); } } else @@ -4819,6 +4822,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) case WT_PM_REC_EMPTY: /* Page deleted */ break; case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ + case WT_PM_REC_REWRITE: /* Rewrite */ /* * Discard the multiple replacement blocks. 
*/ @@ -4897,7 +4901,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) bnd->dsk = NULL; mod->mod_multi_entries = 1; - F_SET(mod, WT_PM_REC_MULTIBLOCK); + F_SET(mod, WT_PM_REC_REWRITE); break; } @@ -5047,10 +5051,14 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * information (otherwise we might think the backing block is being * reused on a subsequent reconciliation where we want to free it). */ - if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_MULTIBLOCK) + switch (F_ISSET(mod, WT_PM_REC_MASK)) { + case WT_PM_REC_MULTIBLOCK: + case WT_PM_REC_REWRITE: for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) multi->addr.reuse = 0; + break; + } /* * On error, discard blocks we've written, they're unreferenced by the |