summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorKeith Bostic <keith@wiredtiger.com>2013-03-16 10:21:08 -0400
committerKeith Bostic <keith@wiredtiger.com>2013-03-16 10:21:08 -0400
commit4c0821092c721a0444e23c1c6ba048c2956c540c (patch)
tree99156383066b4b1533fcf9b3c1acc1d388e2d6a5 /src
parent463fb65908ab8cd097d1ab9839884131b2ca9e4e (diff)
downloadmongo-4c0821092c721a0444e23c1c6ba048c2956c540c.tar.gz
If a page is reconciled (causing its on-disk blocks to be freed and
potentially recycled), and a subsequent collapse of a stack of split-merge pages then replaces that page with a page that has not yet been reconciled, we can potentially free the same blocks twice. The fix is to clear the page's WT_REF.addr field at the time we free the blocks, so future reconciliations will ignore the original disk blocks. Also, explicitly free the disk blocks instead of copying them into the reconciliation tracking array and then freeing them. That gets rid of the __wt_rec_track_onpage_ref() function, will be faster for a common operation, and isn't logically different: the next step is to free the blocks anyway.
Diffstat (limited to 'src')
-rw-r--r--src/btree/rec_track.c28
-rw-r--r--src/btree/rec_write.c48
-rw-r--r--src/include/extern.h4
3 files changed, 40 insertions, 40 deletions
diff --git a/src/btree/rec_track.c b/src/btree/rec_track.c
index 92e98212b65..b28ff3d6321 100644
--- a/src/btree/rec_track.c
+++ b/src/btree/rec_track.c
@@ -237,17 +237,8 @@ __wt_rec_track_onpage_srch(
/*
* __wt_rec_track_onpage_addr --
- * Search for a permanently tracked object (based on an addr/size pair),
- * and add it if it isn't already tracked.
- *
- * __wt_rec_track_onpage_ref --
- * Search for a permanently tracked object (based on a page and ref),
- * and add it if it isn't already tracked.
- *
- * These functions are short-hand for "search the on-page records, and if the
- * address is not already listed as an object, add it". Note there is no
- * possibility of object re-use, the object is discarded when reconciliation
- * completes.
+ * Search the on-page records for a permanently tracked object (based on
+ * an addr/size pair), and add it if it isn't already tracked.
*/
int
__wt_rec_track_onpage_addr(WT_SESSION_IMPL *session,
@@ -256,21 +247,14 @@ __wt_rec_track_onpage_addr(WT_SESSION_IMPL *session,
if (__wt_rec_track_onpage_srch(page, addr, addr_size))
return (0);
+ /*
+ * Note there is no possibility of object re-use, the object is
+ * discarded when reconciliation completes.
+ */
return (__wt_rec_track(
session, page, addr, addr_size, NULL, 0, WT_TRK_ONPAGE));
}
-int
-__wt_rec_track_onpage_ref(
- WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE *refpage, WT_REF *ref)
-{
- uint32_t size;
- const uint8_t *addr;
-
- __wt_get_addr(refpage, ref, &addr, &size);
- return (__wt_rec_track_onpage_addr(session, page, addr, size));
-}
-
/*
* __wt_rec_track_ovfl_reuse --
* Search for a matching overflow record and reactivate it.
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index 12e3e0d706e..49b7503190b 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -814,10 +814,11 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
* If no such transactions exist, we can discard the leaf page to the
* block manager and no cell needs to be written at all. We do this
* outside of the underlying tracking routines because this action is
- * permanent and irrevocable. (Setting the WT_REF.addr value to NULL
- * means we've lost track of the disk address in a permanent way. If
- * we ever read into this chunk of the name space again, the cache read
- * function instantiates a new page.)
+ * permanent and irrevocable. (Clearing the address means we've lost
+ * track of the disk address in a permanent way. This is safe because
+ * there's no path to reading the leaf page again: if reconciliation
+ * fails, and we ever read into this part of the name space again, the
+ * cache read function instantiates a new page.)
*
* One final note: if the WT_REF transaction ID is set to WT_TXN_NONE,
* it means this WT_REF is the re-creation of a deleted node (we wrote
@@ -3728,8 +3729,10 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_BTREE *btree;
WT_BOUNDARY *bnd;
WT_PAGE_MODIFY *mod;
- uint32_t page_size;
+ WT_REF *ref;
+ uint32_t size;
int was_modified;
+ const uint8_t *addr;
btree = session->btree;
bm = btree->bm;
@@ -3743,17 +3746,34 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
case 0: /*
- * The page has never been reconciled before, track the original
- * address blocks (if any). The "if any" is for empty trees we
- * create when a new tree is opened, and for previously deleted
- * pages that are instantiated in memory.
+ * The page has never been reconciled before, free the original
+ * address blocks (if any). The "if any" is for empty trees
+ * created when a new tree is opened, previously deleted pages
+ * instantiated in memory, or pages reconciled into split-merge
+ * pages and then replaced by other pages because the tree grew
+ * too deep.
*
* The exception is root pages are never tracked or free'd, they
* are checkpoints, and must be explicitly dropped.
*/
- if (!WT_PAGE_IS_ROOT(page) && page->ref->addr != NULL)
- WT_RET(__wt_rec_track_onpage_ref(
- session, page, page->parent, page->ref));
+ if (WT_PAGE_IS_ROOT(page))
+ break;
+
+ ref = page->ref;
+ if (ref->addr != NULL) {
+ /*
+ * Free the page and clear the address (so we don't free
+ * it twice). Logically, this is the same as adding the
+ * address to the reconciliation tracking information
+ * and freeing it when reconciliation ends as part of
+ * cleaning up the track information, but that is going
+ * to happen right at the end of this switch statement,
+ * might as well save the work.
+ */
+ __wt_get_addr(page->parent, ref, &addr, &size);
+ WT_RET(bm->free(bm, session, addr, size));
+ ref->addr = NULL;
+ }
break;
case WT_PM_REC_EMPTY: /* Page deleted */
break;
@@ -3935,10 +3955,10 @@ err: __wt_scr_free(&tkey);
*/
if (!r->upd_skipped) {
was_modified = __wt_page_is_modified(page);
- WT_ORDERED_READ(page_size, page->memory_footprint);
+ WT_ORDERED_READ(size, page->memory_footprint);
mod->disk_gen = r->orig_write_gen;
if (was_modified && !__wt_page_is_modified(page))
- __wt_cache_dirty_decr(session, page_size);
+ __wt_cache_dirty_decr(session, size);
}
/* Record the most recent transaction ID we could have written. */
diff --git a/src/include/extern.h b/src/include/extern.h
index c0142791879..6da82c23fa4 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -380,10 +380,6 @@ extern int __wt_rec_track_onpage_addr(WT_SESSION_IMPL *session,
WT_PAGE *page,
const uint8_t *addr,
uint32_t addr_size);
-extern int __wt_rec_track_onpage_ref( WT_SESSION_IMPL *session,
- WT_PAGE *page,
- WT_PAGE *refpage,
- WT_REF *ref);
extern int __wt_rec_track_ovfl_reuse( WT_SESSION_IMPL *session,
WT_PAGE *page,
const void *data,