summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Bostic <keith@wiredtiger.com>2012-03-25 17:49:32 +0000
committerKeith Bostic <keith@wiredtiger.com>2012-03-25 17:52:09 +0000
commitb97813245af2ef6da2b761ee619a692f4d46f42b (patch)
tree692b12cd00b817bd81e7ed56103912a847eaddcb
parentc3cecf7f3aa67978ba0ded16a4f97896093f7e0e (diff)
downloadmongo-b97813245af2ef6da2b761ee619a692f4d46f42b.tar.gz
Second set of snapshot changes, ref #167:
Start using the alloc and discard lists instead of running everything through the single avail list. This primarily affects what happens when a snapshot is deleted. A snapshot's avail list is discarded; its alloc and discard lists are rolled forward into the live system's corresponding lists, and then the alloc and discard lists are checked for overlaps: the overlaps are the blocks that are newly available for re-use. Change extent writes to always append to the file (the problem is that we can't allocate the avail list blocks from the avail list, and it's simpler to always extend the file for extent lists rather than try to figure out which extent list writes are extending the file and which aren't). This implies that we can tell the allocation code to always extend the file, and the free code to simply put the freed blocks on the avail list, because extent blocks don't appear in the allocation list. Change page reconciliation to release all pages in the page-tracking information before writing a snapshot, because the underlying system has to be fully consistent. We can no longer retain currently unused overflow pages for re-use across reconciliation calls: if a reconciliation call results in a snapshot, any overflow page not used as part of the page reconciliation must be discarded, otherwise the underlying snapshot won't be completely consistent. This adds a (very!) few cases where we don't re-use overflow items we would have re-used in earlier versions of the system, but it shouldn't be common. This required an extensive re-work of the block/overflow page tracking code. Changes for salvage: load and unload snapshots during salvage; blocks freed during salvage do not appear on the allocation list, so they must be freed explicitly to the system's avail list. Rename block-manager page type WT_PAGE_FREELIST to WT_PAGE_BLOCK_MANAGER.
We still handle block-manager pages in the btree layer during salvage, but we could also handle them in the salvage code if we're willing to look at the page type. Set the snapshot field names when cracking the buffer cookie. Don't display the by-size extent lists by default; they're not very useful.
-rw-r--r--src/block/block_addr.c8
-rw-r--r--src/block/block_ext.c353
-rw-r--r--src/block/block_mgr.c4
-rw-r--r--src/block/block_open.c2
-rw-r--r--src/block/block_slvg.c20
-rw-r--r--src/block/block_snap.c131
-rw-r--r--src/block/block_write.c25
-rw-r--r--src/btree/bt_debug.c14
-rw-r--r--src/btree/bt_misc.c4
-rw-r--r--src/btree/bt_slvg.c4
-rw-r--r--src/btree/bt_vrfy_dsk.c6
-rw-r--r--src/btree/rec_evict.c19
-rw-r--r--src/btree/rec_track.c238
-rw-r--r--src/btree/rec_write.c70
-rw-r--r--src/include/block.h1
-rw-r--r--src/include/btmem.h22
-rw-r--r--src/include/extern.h37
-rw-r--r--src/include/misc.h4
18 files changed, 686 insertions, 276 deletions
diff --git a/src/block/block_addr.c b/src/block/block_addr.c
index f065aa7c3d2..47d1eff3c78 100644
--- a/src/block/block_addr.c
+++ b/src/block/block_addr.c
@@ -124,14 +124,22 @@ __wt_block_buffer_to_snapshot(WT_SESSION_IMPL *session,
pp = &p;
WT_RET(__block_buffer_to_addr(block, pp,
&si->root_offset, &si->root_size, &si->root_cksum));
+
+ si->alloc.name = "snapshot.alloc";
WT_RET(__block_buffer_to_addr(block, pp,
&si->alloc_offset, &si->alloc_size, &si->alloc_cksum));
+
+ si->avail.name = "snapshot.avail";
WT_RET(__block_buffer_to_addr(block, pp,
&si->avail_offset, &si->avail_size, &si->avail_cksum));
+
+ si->discard.name = "snapshot.discard";
WT_RET(__block_buffer_to_addr(block, pp,
&si->discard_offset, &si->discard_size, &si->discard_cksum));
+
WT_RET(__wt_vunpack_uint(pp, 0, &a));
si->file_size = (off_t)a;
+
WT_RET(__wt_vunpack_uint(pp, 0, &a));
si->write_gen = a;
diff --git a/src/block/block_ext.c b/src/block/block_ext.c
index 38abbeae999..9cae1ed7e8d 100644
--- a/src/block/block_ext.c
+++ b/src/block/block_ext.c
@@ -7,11 +7,13 @@
#include "wt_internal.h"
-static int __block_extend(WT_SESSION_IMPL *, WT_BLOCK *, off_t *, off_t);
static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, off_t, off_t);
+static int __block_overlap(WT_SESSION_IMPL *,
+ WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **);
#ifdef HAVE_VERBOSE
-static void __block_extlist_dump(WT_SESSION_IMPL *, WT_EXTLIST *);
+static void __block_extlist_dump(
+ WT_SESSION_IMPL *, const char *, WT_EXTLIST *, int);
#endif
/*
@@ -262,7 +264,7 @@ __wt_block_alloc(
__block_size_srch(block->live.avail.size, size, sstack);
szp = *sstack[0];
if (szp == NULL) {
- WT_ERR(__block_extend(session, block, offp, size));
+ WT_ERR(__wt_block_extend(session, block, offp, size));
goto done;
}
@@ -292,17 +294,20 @@ __wt_block_alloc(
__wt_free(session, ext);
}
-done: err:
+ /* Add the newly allocated extent to the list of allocations. */
+done: WT_RET(__block_merge(
+ session, &block->live.alloc, *offp, (off_t)size));
+err:
__wt_spin_unlock(session, &block->live_lock);
return (ret);
}
/*
- * __block_extend --
+ * __wt_block_extend --
* Extend the file to allocate space.
*/
-static int
-__block_extend(
+int
+__wt_block_extend(
WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offp, off_t size)
{
WT_FH *fh;
@@ -351,7 +356,7 @@ __wt_block_free_buf(WT_SESSION_IMPL *session,
/* Crack the cookie. */
WT_RET(__wt_block_buffer_to_addr(block, addr, &off, &size, NULL));
- WT_RET(__wt_block_free(session, block, off, size));
+ WT_RET(__wt_block_free(session, block, off, size, 0));
return (0);
}
@@ -361,9 +366,10 @@ __wt_block_free_buf(WT_SESSION_IMPL *session,
* Free a chunk of space to the underlying file.
*/
int
-__wt_block_free(
- WT_SESSION_IMPL *session, WT_BLOCK *block, off_t off, off_t size)
+__wt_block_free(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, off_t off, off_t size, int free_extent)
{
+ WT_EXTLIST *el;
int ret;
WT_BSTAT_INCR(session, free);
@@ -371,13 +377,304 @@ __wt_block_free(
"free %" PRIdMAX "/%" PRIdMAX, (intmax_t)off, (intmax_t)size);
__wt_spin_lock(session, &block->live_lock);
- ret = __block_merge(session, &block->live.avail, off, (off_t)size);
+
+ /*
+ * If performing salvage, snapshots no longer apply, and so blocks are
+ * immediately re-used where possible. Free to the live system's avail
+ * list.
+ *
+ * If freeing extent blocks, they were never entered onto an alloc list.
+ * They are being freed as part of snapshot delete, so they are freed to
+ * the live system's avail list as well.
+ */
+ el = free_extent || block->slvg ?
+ &block->live.avail : &block->live.discard;
+
+ ret = __block_merge(session, el, off, (off_t)size);
+
__wt_spin_unlock(session, &block->live_lock);
return (ret);
}
/*
+ * __wt_block_extlist_match --
+ *	Review a snapshot's alloc/discard extent lists, move overlaps into the
+ * live system's avail list.
+ *
+ *	The lists reached through si->alloc.off and si->discard.off are walked
+ * in parallel. When the current alloc and discard extents share bytes,
+ * __block_overlap is called to reconcile the pair; otherwise the extent with
+ * the lower offset is stepped forward. Returns 0 once either list has been
+ * exhausted, or (via WT_RET) the error reported by __block_overlap.
+ */
+int
+__wt_block_extlist_match(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si)
+{
+	WT_EXT *alloc, *discard;
+
+	alloc = si->alloc.off[0];
+	discard = si->discard.off[0];
+
+	/* Walk the lists in parallel, looking for overlaps. */
+	while (alloc != NULL && discard != NULL) {
+		/*
+		 * If there's no overlap, move the lower-offset entry to the
+		 * next entry in its list.
+		 *
+		 * Each range's end is computed from its own size. Ranges that
+		 * only touch (one ends exactly where the other starts) share
+		 * no bytes and are not an overlap: handing them on would yield
+		 * a zero-length overlap that changes neither extent, so this
+		 * loop would see the same pair again and never finish.
+		 */
+		if (alloc->off + alloc->size <= discard->off ||
+		    discard->off + discard->size <= alloc->off) {
+			if (alloc->off < discard->off)
+				alloc = alloc->next[0];
+			else
+				discard = discard->next[0];
+			continue;
+		}
+
+		WT_VERBOSE(session, block,
+		    "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
+
+		/* The live list dumped here is the avail list, label it so. */
+		WT_VERBOSE_CALL(session, block,
+		    __block_extlist_dump(
+		    session, "initial live avail", &block->live.avail, 0));
+		WT_VERBOSE_CALL(session,
+		    block, __block_extlist_dump(
+		    session, "initial snapshot alloc", &si->alloc, 0));
+		WT_VERBOSE_CALL(session,
+		    block, __block_extlist_dump(
+		    session, "initial snapshot discard", &si->discard, 0));
+
+		WT_VERBOSE(session, block,
+		    "overlap %" PRIuMAX "/%" PRIuMAX " vs. %" PRIuMAX "/%"
+		    PRIuMAX,
+		    (uintmax_t)alloc->off, (uintmax_t)alloc->size,
+		    (uintmax_t)discard->off, (uintmax_t)discard->size);
+
+		/*
+		 * Reconcile the overlap; __block_overlap updates alloc and
+		 * discard through the pointers passed here.
+		 */
+		WT_RET(__block_overlap(session, block,
+		    &si->alloc, &alloc, &si->discard, &discard));
+
+		WT_VERBOSE(session, block,
+		    "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
+		WT_VERBOSE_CALL(session, block,
+		    __block_extlist_dump(
+		    session, "result live avail", &block->live.avail, 0));
+		WT_VERBOSE_CALL(session,
+		    block, __block_extlist_dump(
+		    session, "result snapshot alloc", &si->alloc, 0));
+		WT_VERBOSE_CALL(session,
+		    block, __block_extlist_dump(
+		    session, "result snapshot discard", &si->discard, 0));
+
+		WT_VERBOSE(session, block,
+		    "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
+	}
+
+	return (0);
+}
+
+/*
+ * __block_overlap --
+ * Reconcile two overlapping ranges.
+ *
+ * *ap is an extent on list ael and *bp is an extent on list bel. The bytes
+ * the two extents have in common are merged into the live system's avail
+ * list (block->live.avail) and taken out of both extents. An extent that is
+ * used up entirely is removed from its list, and the caller's pointer to it
+ * is first moved to the next entry; an extent that is only trimmed is
+ * removed, adjusted and re-inserted on its list, and the caller's pointer
+ * is left alone.
+ *
+ * Returns 0 on success; the list helpers are wrapped in WT_RET, which
+ * presumably returns their error to the caller immediately.
+ */
+static int
+__block_overlap(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *ael, WT_EXT **ap, WT_EXTLIST *bel, WT_EXT **bp)
+{
+ WT_EXT *a, *b, **ext;
+ WT_EXTLIST *el, *live;
+ off_t off, size;
+
+ live = &block->live.avail;
+
+ /*
+ * The ranges overlap, choose the range we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB ranges are the same
+ * #2 BBBBBBBBBBBBB overlaps the beginning
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAAAAA same as #2
+ * #8 AAAAAAAAAAAAA same as #3
+ * #9 AAAAA A is a prefix of B
+ * #10 AAAAAA A is middle of B
+ * #11 AAAAAAAAAA A is a suffix of B
+ *
+ *
+ * By swapping the arguments so "A" is always the lower range, we can
+ * eliminate cases #2, #7, #10 and #11, and only handle 7 cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB ranges are the same
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #8 AAAAAAAAAAAAA same as #3
+ * #9 AAAAA A is a prefix of B
+ *
+ * The swap below exchanges the list and caller-pointer arguments along
+ * with the extents, so "ael"/"ap" always go with A and "bel"/"bp" with B.
+ */
+ a = *ap;
+ b = *bp;
+ if (a->off > b->off) { /* Swap */
+ b = *ap;
+ a = *bp;
+ ext = ap; ap = bp; bp = ext;
+ el = ael; ael = bel; bel = el;
+ }
+
+ if (a->off == b->off) { /* Case #1, #4, #9 */
+ if (a->size == b->size) { /* Case #1 */
+ /*
+ * Move caller's A and B to the next element
+ * Add that A and B range to the avail list
+ * Delete A and B
+ */
+ *ap = (*ap)->next[0];
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, live, b->off, b->size));
+ WT_RET(__block_off_remove(session, ael, a->off, NULL));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ }
+ else if (a->size > b->size) { /* Case #4 */
+ /*
+ * Remove A from its list
+ * Increment/Decrement A's offset/size by the size of B
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->off += b->size;
+ a->size -= b->size;
+ WT_RET(__block_off_insert(session, ael, a));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, live, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ } else { /* Case #9 */
+ /*
+ * Remove B from its list
+ * Increment/Decrement B's offset/size by the size of A
+ * Insert B on its list
+ */
+ WT_RET(__block_off_remove(session, bel, b->off, &b));
+ b->off += a->size;
+ b->size -= a->size;
+ WT_RET(__block_off_insert(session, bel, b));
+
+ /*
+ * Move caller's A to the next element
+ * Add A's range to the avail list
+ * Delete A
+ */
+ *ap = (*ap)->next[0];
+ WT_RET(__block_merge(session, live, a->off, a->size));
+ WT_RET(__block_off_remove(session, ael, a->off, NULL));
+ } /* Case #6 */
+ } else if (a->off + a->size == b->off + b->size) {
+ /*
+ * Remove A from its list
+ * Decrement A's size by the size of B
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size -= b->size;
+ WT_RET(__block_off_insert(session, ael, a));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, live, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ } else if /* Case #3, #8 */
+ (a->off + a->size < b->off + b->size) {
+ /*
+ * Add overlap to the avail list: the overlap runs from the
+ * start of B to the end of A. Neither caller pointer is moved
+ * in this case, both extents survive in trimmed form.
+ */
+ off = b->off;
+ size = (a->off + a->size) - b->off;
+ WT_RET(__block_merge(session, live, off, size));
+
+ /*
+ * Remove A from its list
+ * Decrement A's size by the overlap
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size -= size;
+ WT_RET(__block_off_insert(session, ael, a));
+
+ /*
+ * Remove B from its list
+ * Increment/Decrement B's offset/size by the overlap
+ * Insert B on its list
+ */
+ WT_RET(__block_off_remove(session, bel, b->off, &b));
+ b->off += size;
+ b->size -= size;
+ WT_RET(__block_off_insert(session, bel, b));
+ } else { /* Case #5 */
+ /* Calculate the offset/size of the trailing part of A. */
+ off = b->off + b->size;
+ size = (a->off + a->size) - off;
+
+ /*
+ * Remove A from its list
+ * Decrement A's size by trailing part of A plus B's size,
+ * leaving only the part of A that precedes B
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size = b->off - a->off;
+ WT_RET(__block_off_insert(session, ael, a));
+
+ /* Add trailing part of A to A's list as a new element. */
+ WT_RET(__block_merge(session, ael, off, size));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, live, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_block_extlist_merge --
+ * Merge one extent list into another.
+ *
+ * The direction is a into b: every extent visited by WT_EXT_FOREACH over
+ * a->off has its offset/size range passed to __block_merge on list b. This
+ * function only reads list a, it does not remove or free a's entries.
+ * Returns 0 when all ranges were merged; a failing __block_merge is wrapped
+ * in WT_RET, which presumably returns that error immediately.
+ */
+int
+__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b)
+{
+ WT_EXT *ext;
+
+ WT_VERBOSE(session, block, "merging %s into %s", a->name, b->name);
+
+ WT_EXT_FOREACH(ext, a->off)
+ WT_RET(__block_merge(session, b, ext->off, ext->size));
+
+ return (0);
+}
+
+/*
* __block_merge --
* Insert an extent into an extent list, merging if possible.
*/
@@ -418,6 +715,11 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
after = NULL;
}
if (before == NULL && after == NULL) {
+ WT_VERBOSE(session, block,
+ "%s: insert range %" PRIdMAX "-%" PRIdMAX,
+ el->name,
+ (intmax_t)off, (intmax_t)(off + size));
+
/* Allocate a new WT_EXT structure. */
skipdepth = __wt_skip_choose_depth();
WT_RET(__wt_calloc(session, 1,
@@ -527,7 +829,8 @@ corrupted: WT_ERR_MSG(session, WT_ERROR,
WT_ERR(__block_merge(session, el, loff, lsize));
}
- WT_VERBOSE_CALL(session, block, __block_extlist_dump(session, el));
+ WT_VERBOSE_CALL(session,
+ block, __block_extlist_dump(session, "read extlist", el, 0));
err: __wt_scr_free(&tmp);
return (ret);
@@ -552,7 +855,8 @@ __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
tmp = NULL;
ret = 0;
- WT_VERBOSE_CALL(session, block, __block_extlist_dump(session, el));
+ WT_VERBOSE_CALL(session,
+ block, __block_extlist_dump(session, "write extlist", el, 0));
/* If there aren't any entries, we're done. */
if (el->entries == 0) {
@@ -577,7 +881,7 @@ __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
dsk = tmp->mem;
memset(dsk, 0, WT_BLOCK_HEADER_BYTE_SIZE);
dsk->u.datalen = WT_STORE_SIZE(datasize);
- dsk->type = WT_PAGE_FREELIST;
+ dsk->type = WT_PAGE_BLOCK_MANAGER;
tmp->size = WT_STORE_SIZE(WT_BLOCK_HEADER_BYTE_SIZE + datasize);
#define WT_EXTLIST_WRITE(p, v) do { \
@@ -612,7 +916,7 @@ __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
el = NULL;
/* Write the extent list to disk. */
- WT_ERR(__wt_block_write(session, block, tmp, offp, sizep, cksump));
+ WT_ERR(__wt_block_write(session, block, tmp, offp, sizep, cksump, 1));
WT_VERBOSE(session, block,
"%s written %" PRIdMAX "/%" PRIu32, name, (intmax_t)*offp, *sizep);
@@ -681,23 +985,30 @@ __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el)
#ifdef HAVE_VERBOSE
static void
-__block_extlist_dump(WT_SESSION_IMPL *session, WT_EXTLIST *el)
+__block_extlist_dump(
+ WT_SESSION_IMPL *session, const char *tag, WT_EXTLIST *el, int show_size)
{
WT_EXT *ext;
WT_SIZE *szp;
- if (el->entries == 0) {
- WT_VERBOSE(session, block, "%s: [Empty]", el->name);
+ WT_VERBOSE(session, block, "%s: %s: by offset:%s",
+ tag, el->name, el->entries == 0 ? " [Empty]" : "");
+ if (el->entries == 0)
return;
- }
- WT_VERBOSE(session, block, "%s: list by offset:", el->name);
WT_EXT_FOREACH(ext, el->off)
WT_VERBOSE(session, block,
"\t{%" PRIuMAX "/%" PRIuMAX "}",
(uintmax_t)ext->off, (uintmax_t)ext->size);
- WT_VERBOSE(session, block, "%s: list by size:", el->name);
+ if (!show_size)
+ return;
+
+ WT_VERBOSE(session, block, "%s: %s: by size:%s",
+ tag, el->name, el->entries == 0 ? " [Empty]" : "");
+ if (el->entries == 0)
+ return;
+
WT_EXT_FOREACH(szp, el->size) {
WT_VERBOSE(session, block,
"\t{%" PRIuMAX "}",
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index f7e9f64a31c..2a731a7aae9 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -301,14 +301,14 @@ __wt_bm_salvage_next(WT_SESSION_IMPL *session, WT_ITEM *buf,
* End a block manager salvage.
*/
int
-__wt_bm_salvage_end(WT_SESSION_IMPL *session, int success)
+__wt_bm_salvage_end(WT_SESSION_IMPL *session)
{
WT_BLOCK *block;
if ((block = session->btree->block) == NULL)
return (__bm_invalid(session));
- return (__wt_block_salvage_end(session, block, success));
+ return (__wt_block_salvage_end(session, block));
}
/*
diff --git a/src/block/block_open.c b/src/block/block_open.c
index c8dc7871e08..aa637b417a7 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -157,6 +157,8 @@ __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
ret = 0;
+ WT_VERBOSE(session, block, "close");
+
if (block->live_load) {
__wt_errx(session, "snapshot never unloaded");
ret = EINVAL;
diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c
index 8b23547e7b6..baca1bb06a2 100644
--- a/src/block/block_slvg.c
+++ b/src/block/block_slvg.c
@@ -17,6 +17,9 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
off_t len;
uint32_t allocsize;
+ /* Reset the live snapshot information. */
+ WT_RET(__wt_block_snap_init(session, block, &block->live, 1));
+
/*
* Truncate the file to an initial sector plus N allocation size
* units (bytes trailing the last multiple of an allocation size
@@ -38,10 +41,11 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
block->slvg_off = WT_BLOCK_DESC_SECTOR;
/*
- * We don't currently need to do anything about the freelist because
- * we don't read it for salvage operations.
+ * We don't currently need to do anything about the snapshot extents
+ * because we don't read them for salvage operations.
*/
+ block->live_load = block->slvg = 1;
return (0);
}
@@ -50,11 +54,13 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
* End a file salvage.
*/
int
-__wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block, int success)
+__wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
- /* If not successful, discard the live snapshot we've created. */
- if (!success)
- (void)__wt_block_snap_unload(session, block);
+ /* Discard the live snapshot. */
+ WT_RET(__wt_block_snap_unload(session, block));
+
+ block->slvg = 0;
+
return (0);
}
@@ -122,7 +128,7 @@ skip: WT_VERBOSE(session, salvage,
* than once.
*/
WT_RET(__wt_block_free(
- session, block, offset, (off_t)allocsize));
+ session, block, offset, (off_t)allocsize, 0));
block->slvg_off = offset += allocsize;
continue;
}
diff --git a/src/block/block_snap.c b/src/block/block_snap.c
index cd9eea5e5e4..17238a595c2 100644
--- a/src/block/block_snap.c
+++ b/src/block/block_snap.c
@@ -12,6 +12,35 @@ static int __block_snap_extlists_write(
WT_SESSION_IMPL *, WT_BLOCK *, WT_BLOCK_SNAPSHOT *);
/*
+ * __wt_block_snap_init --
+ * Initialize a snapshot structure.
+ *
+ * Clears *si to zero, then gives the alloc, avail and discard extent lists
+ * their names and sets each list's on-disk offset to
+ * WT_BLOCK_INVALID_OFFSET. When is_live is non-zero and block->live_load is
+ * already set, the call is rejected through WT_RET_MSG with EINVAL
+ * ("snapshot already loaded") before *si is touched. Otherwise returns 0.
+ */
+int
+__wt_block_snap_init(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si, int is_live)
+{
+ /*
+ * If we're loading a new live snapshot, there shouldn't be one
+ * already loaded.
+ */
+ if (is_live && block->live_load)
+ WT_RET_MSG(session, EINVAL, "snapshot already loaded");
+
+ memset(si, 0, sizeof(*si));
+
+ si->alloc.name = "alloc";
+ si->alloc_offset = WT_BLOCK_INVALID_OFFSET;
+
+ si->avail.name = "avail";
+ si->avail_offset = WT_BLOCK_INVALID_OFFSET;
+
+ si->discard.name = "discard";
+ si->discard_offset = WT_BLOCK_INVALID_OFFSET;
+
+ return (0);
+}
+
+/*
* __wt_block_snap_load --
* Load a snapshot.
*/
@@ -26,29 +55,20 @@ __wt_block_snap_load(WT_SESSION_IMPL *session,
tmp = NULL;
ret = 0;
- WT_VERBOSE(session, block, "%s: load snapshot", block->name);
-
- /* Work on the "live" snapshot. */
- if (block->live_load)
- WT_RET_MSG(session, EINVAL, "snapshot already loaded");
+ if (addr == NULL)
+ WT_VERBOSE(session, block, "load-snapshot: [Empty]");
+ else
+ WT_VERBOSE_CALL_RET(session, block,
+ __wt_block_snapshot_string(
+ session, block, addr, "load-snapshot", NULL));
si = &block->live;
- memset(si, 0, sizeof(*si));
- block->live.alloc.name = "live: alloc";
- block->live.alloc_offset = WT_BLOCK_INVALID_OFFSET;
- block->live.avail.name = "live: avail";
- block->live.avail_offset = WT_BLOCK_INVALID_OFFSET;
- block->live.discard.name = "live: discard";
- block->live.discard_offset = WT_BLOCK_INVALID_OFFSET;
+ WT_RET(__wt_block_snap_init(session, block, si, 1));
/* If not loading a snapshot from disk, we're done. */
if (addr == NULL || addr_size == 0)
goto done;
- WT_VERBOSE_CALL_ERR(session, block,
- __wt_block_snapshot_string(
- session, block, addr, "load-snapshot", NULL));
-
/* Crack the snapshot cookie. */
WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si));
@@ -139,7 +159,7 @@ __wt_block_snap_unload(WT_SESSION_IMPL *session, WT_BLOCK *block)
__wt_block_extlist_free(session, &si->avail);
__wt_block_extlist_free(session, &si->discard);
- memset(si, 0, sizeof(*si));
+ WT_RET(__wt_block_snap_init(session, block, si, 0));
return (0);
}
@@ -174,7 +194,7 @@ __wt_block_write_buf_snapshot(
si->root_size = si->root_cksum = 0;
} else
WT_RET(__wt_block_write(session, block, buf,
- &si->root_offset, &si->root_size, &si->root_cksum));
+ &si->root_offset, &si->root_size, &si->root_cksum, 0));
#if 0
/*
@@ -278,32 +298,79 @@ static int
__block_snap_delete(
WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr)
{
- WT_BLOCK_SNAPSHOT *si, __si;
-
- si = &__si;
+ WT_BLOCK_SNAPSHOT *live, *si, __si;
WT_VERBOSE_CALL_RET(session, block,
__wt_block_snapshot_string(
session, block, addr, "delete-snapshot", NULL));
- /*
- * If there's a snapshot, crack the cookie and free the snapshot's
- * extent lists and the snapshot's root page.
- */
+ live = &block->live;
+ si = &__si;
+ WT_RET(__wt_block_snap_init(session, block, si, 0));
+
+ /* If there's a snapshot, crack the cookie. */
WT_RET(__wt_block_buffer_to_snapshot(session, block, addr, si));
- if (si->alloc_offset != WT_BLOCK_INVALID_OFFSET)
+ /*
+ * Free the root page: there's nothing special about this free, the root
+ * page is allocated using normal rules, that is, it may have been taken
+ * from the avail list, and was entered on the alloc list at that time.
+ */
+ if (si->root_offset != WT_BLOCK_INVALID_OFFSET)
WT_RET(__wt_block_free(
- session, block, si->alloc_offset, si->alloc_size));
+ session, block, si->root_offset, si->root_size, 0));
+
+ /*
+ * Discard the avail list: snapshot avail lists are only useful if we
+ * are rolling forward from the particular snapshot and they represent
+ * our best understanding of what blocks can be allocated. If we're
+ * not operating on the live snapshot, subsequent snapshots may have
+ * allocated those blocks, and the avail list is useless.
+ *
+ * The avail list is an extent: extent blocks must be freed directly to
+ * the live system's avail list, they were never on any alloc list.
+ */
if (si->avail_offset != WT_BLOCK_INVALID_OFFSET)
WT_RET(__wt_block_free(
- session, block, si->avail_offset, si->avail_size));
- if (si->discard_offset != WT_BLOCK_INVALID_OFFSET)
+ session, block, si->avail_offset, si->avail_size, 1));
+
+ /*
+ * Migrate the allocation and deletion lists forward, in this case,
+ * into the live system. This is done by (1) first reading the
+ * extent list, (2) merging into the corresponding live system's
+ * extent list, (3) deleting the free list and (4) discarding the
+ * blocks that made up the list.
+ *
+ * The alloc and discard lists are extents: extent blocks must be freed
+ * directly to the live system's avail list, they were never on any
+ * alloc list.
+ */
+ if (si->alloc_offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_RET(__wt_block_extlist_read(session, block, &si->alloc,
+ si->alloc_offset, si->alloc_size, si->alloc_cksum));
+ WT_RET(__wt_block_extlist_merge(
+ session, &si->alloc, &live->alloc));
+ __wt_block_extlist_free(session, &si->alloc);
WT_RET(__wt_block_free(
- session, block, si->discard_offset, si->discard_size));
- if (si->root_offset != WT_BLOCK_INVALID_OFFSET)
+ session, block, si->alloc_offset, si->alloc_size, 1));
+ }
+ if (si->discard_offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_RET(__wt_block_extlist_read(session, block, &si->discard,
+ si->discard_offset, si->discard_size, si->discard_cksum));
+ WT_RET(__wt_block_extlist_merge(
+ session, &si->discard, &live->discard));
+ __wt_block_extlist_free(session, &si->discard);
WT_RET(__wt_block_free(
- session, block, si->root_offset, si->root_size));
+ session, block, si->discard_offset, si->discard_size, 1));
+ }
+
+ /*
+ * Figure out which blocks we can re-use. This is done by checking
+ * the live system's allocate and discard lists for overlaps: if an
+ * extent appears on both lists, move it to the avail list, it can be
+ * re-used immediately.
+ */
+ WT_RET(__wt_block_extlist_match(session, block, live));
return (0);
}
diff --git a/src/block/block_write.c b/src/block/block_write.c
index 1daafd68f62..3ec2545c88b 100644
--- a/src/block/block_write.c
+++ b/src/block/block_write.c
@@ -47,7 +47,8 @@ __wt_block_write_buf(WT_SESSION_IMPL *session,
WT_UNUSED(addr_size);
- WT_RET(__wt_block_write(session, block, buf, &offset, &size, &cksum));
+ WT_RET(
+ __wt_block_write(session, block, buf, &offset, &size, &cksum, 0));
endp = addr;
WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
@@ -62,8 +63,8 @@ __wt_block_write_buf(WT_SESSION_IMPL *session,
* checksum.
*/
int
-__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
- WT_ITEM *buf, off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
+ off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int force_extend)
{
WT_BLOCK_HEADER *blk;
WT_PAGE_HEADER *dsk;
@@ -224,8 +225,22 @@ not_compressed: /*
} else
blk->cksum = WT_BLOCK_CHECKSUM_NOT_SET;
- /* Allocate space from the underlying file and write the block. */
- WT_ERR(__wt_block_alloc(session, block, &offset, (off_t)align_size));
+ /*
+ * Allocate space from the underlying file and write the block. When
+ * writing snapshot extents, we always extend the file, that's simpler
+ * than distinguishing between extents allocated from the live avail
+ * list, and those which can't be allocated from the live avail list
+ * such as blocks for writing the live avail list itself.
+ */
+ if (force_extend) {
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_extend(
+ session, block, &offset, (off_t)align_size);
+ __wt_spin_unlock(session, &block->live_lock);
+ WT_ERR(ret);
+ } else
+ WT_ERR(__wt_block_alloc(
+ session, block, &offset, (off_t)align_size));
WT_ERR(__wt_write(session, block->fh, offset, align_size, dsk));
WT_BSTAT_INCR(session, page_write);
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 430c6b6b504..e7a3441e7e4 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -509,17 +509,17 @@ __debug_page_modify(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, "\t" "tracking list:\n");
for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) {
switch (track->type) {
- case WT_PT_BLOCK:
- __dmsg(ds, "\t\t" "block");
+ case WT_PT_DISCARD:
+ __dmsg(ds, "\t\t" "discard");
break;
- case WT_PT_BLOCK_EVICT:
- __dmsg(ds, "\t\t" "block-evict");
+ case WT_PT_DISCARD_COMPLETE:
+ __dmsg(ds, "\t\t" "discard-complete");
break;
case WT_PT_OVFL:
- __dmsg(ds, "\t\t" "overflow (on)");
+ __dmsg(ds, "\t\t" "overflow");
break;
- case WT_PT_OVFL_DISCARD:
- __dmsg(ds, "\t\t" "overflow (off)");
+ case WT_PT_OVFL_ACTIVE:
+ __dmsg(ds, "\t\t" "overflow-active");
break;
case WT_PT_EMPTY:
continue;
diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c
index bc096bd768c..6952cbc4fb5 100644
--- a/src/btree/bt_misc.c
+++ b/src/btree/bt_misc.c
@@ -17,6 +17,8 @@ __wt_page_type_string(u_int type)
switch (type) {
case WT_PAGE_INVALID:
return ("invalid");
+ case WT_PAGE_BLOCK_MANAGER:
+ return ("block manager");
case WT_PAGE_COL_FIX:
return ("column-store fixed-length leaf");
case WT_PAGE_COL_INT:
@@ -29,8 +31,6 @@ __wt_page_type_string(u_int type)
return ("row-store internal");
case WT_PAGE_ROW_LEAF:
return ("row-store leaf");
- case WT_PAGE_FREELIST:
- return ("freelist");
default:
return ("unknown");
}
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 770e2584990..9e4cfd9e15b 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -297,7 +297,7 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
* Inform the underlying block manager that we're done.
*/
err: if (started)
- WT_TRET(__wt_bm_salvage_end(session, ret == 0 ? 1 : 0));
+ WT_TRET(__wt_bm_salvage_end(session));
/* Discard any root page we created. */
if (ss->root_page != NULL)
@@ -364,8 +364,8 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
* calls don't need them either.
*/
switch (dsk->type) {
+ case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_COL_INT:
- case WT_PAGE_FREELIST:
case WT_PAGE_ROW_INT:
WT_VERBOSE(session, salvage,
"%s page ignored %s",
diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c
index de36224624e..1164cf6bf4b 100644
--- a/src/btree/bt_vrfy_dsk.c
+++ b/src/btree/bt_vrfy_dsk.c
@@ -51,10 +51,10 @@ __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
/* Check the page type. */
switch (dsk->type) {
+ case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_INT:
case WT_PAGE_COL_VAR:
- case WT_PAGE_FREELIST:
case WT_PAGE_OVFL:
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
@@ -76,7 +76,7 @@ __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
WT_RET_VRFY(session,
"%s page at %s has a record number of zero",
__wt_page_type_string(dsk->type), addr);
- case WT_PAGE_FREELIST:
+ case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_OVFL:
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
@@ -112,7 +112,7 @@ __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
return (__verify_dsk_row(session, addr, dsk));
- case WT_PAGE_FREELIST:
+ case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_OVFL:
return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen));
WT_ILLEGAL_VALUE(session);
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index ee295dff0a8..26e0089ada1 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -267,7 +267,7 @@ __rec_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* __rec_discard_page --
- * Process the page's list of tracked objects, and discard it.
+ * Discard the page.
*/
static int
__rec_discard_page(WT_SESSION_IMPL *session, WT_PAGE *page)
@@ -276,30 +276,25 @@ __rec_discard_page(WT_SESSION_IMPL *session, WT_PAGE *page)
mod = page->modify;
- /*
- * or if the page was split and later merged, discard it.
- */
+ /* We should never evict the file's current eviction point. */
+ WT_ASSERT(session, session->btree->evict_page != page);
+
if (mod != NULL) {
/*
* If the page has been modified and was tracking objects,
- * resolve them.
+ * discard them.
*/
- WT_RET(__wt_rec_track_wrapup(session, page, 1));
+ __wt_rec_track_discard(session, page);
/*
* If the page was split and eventually merged into the parent,
- * discard the split page; if the split page was promoted into
- * a split-merge page, then the reference must be cleared before
- * the page is discarded.
+ * discard the split page.
*/
if (F_ISSET(page, WT_PAGE_REC_MASK) == WT_PAGE_REC_SPLIT &&
mod->u.split != NULL)
__wt_page_out(session, mod->u.split, 0);
}
- /* We should never evict the file's current eviction point. */
- WT_ASSERT(session, session->btree->evict_page != page);
-
/* Discard the page itself. */
__wt_page_out(session, page, 0);
diff --git a/src/btree/rec_track.c b/src/btree/rec_track.c
index 6336f3ffa0d..908e30e396a 100644
--- a/src/btree/rec_track.c
+++ b/src/btree/rec_track.c
@@ -12,8 +12,45 @@
* example, when an overflow item is modified, the original overflow blocks
* must be freed at some point. Or, when a page is split, then written again,
* the first split must be freed. This code tracks those objects: they are
- * generally called from the routines in rec_write.c, which update the objects
- * each time a page is reconciled.
+ * called from the routines in rec_write.c, which update the objects each time
+ * a page is reconciled.
+ *
+ * An object has one of 4 types, plus there's a "slot not in use" type.
+ *
+ * WT_PT_EMPTY:
+ * Empty slot.
+ *
+ * WT_PT_DISCARD:
+ * WT_PT_DISCARD_COMPLETE:
+ * The key fact about a discarded block or overflow record is it may be
+ * discarded multiple times. For example, an internal page with an overflow
+ * key referencing a page that's empty or split: each time a page is written,
+ * we'll figure out the key's overflow blocks are no longer useful, but we have
+ * no way to know we've figured that same thing out several times already.
+ * The type is initially set to WT_PT_DISCARD. After the page is written,
+ * WT_PT_DISCARD blocks are freed to the underlying block manager and the type
+ * is set to WT_PT_DISCARD_COMPLETE. That allows us to find the block on the
+ * page's list again, but only physically free it once.
+ *
+ * WT_PT_OVFL:
+ * WT_PT_OVFL_ACTIVE:
+ * The key facts about a created overflow record are first that it may be
+ * re-used during subsequent reconciliations, and second the blocks must be
+ * physically discarded if a reconciliation of the page does not re-use the
+ * previously created overflow records. (Note this is different from overflow
+ * records that appeared on the on-disk version of the page: they can only be
+ * deleted, not re-used, and so they are handled by the WT_PT_DISCARD type.)
+ * An example of re-use is an inserted key/value pair where the value is
+ * an overflow item. The overflow record will be re-created as part of each
+ * reconciliation. We don't want to physically write the overflow record every
+ * time, instead we track overflow records written on behalf of the page across
+ * reconciliations.
+ * However, if a created overflow record is not re-used in reconciliation,
+ * the physical blocks must be discarded to the block manager since they are no
+ * longer in use.
+ * The type is first set to WT_PT_OVFL_ACTIVE; after page reconciliation
+ * completes, any records with a type of WT_PT_OVFL are discarded, and records
+ * with a type of WT_PT_OVFL_ACTIVE are reset to WT_PT_OVFL.
*/
#ifdef HAVE_VERBOSE
@@ -69,36 +106,32 @@ __rec_track_extend(WT_SESSION_IMPL *session, WT_PAGE *page)
* Add an addr/size pair to the page's list of tracked objects.
*/
int
-__wt_rec_track_block(WT_SESSION_IMPL *session,
- __wt_pt_type_t type, WT_PAGE *page, const uint8_t *addr, uint32_t size)
+__wt_rec_track_block(
+ WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, uint32_t size)
{
WT_PAGE_MODIFY *mod;
WT_PAGE_TRACK *empty, *track;
uint32_t i;
- WT_ASSERT(session, addr != NULL);
-
mod = page->modify;
- /*
- * There may be multiple requests to track a single block. For example,
- * an internal page with an overflow key that references a page that's
- * split: every time the page is written, we'll figure out the key's
- * overflow pages are no longer useful because the underlying page has
- * split, but we have no way to know that we've figured that same thing
- * out several times already. Check for duplicates.
- */
empty = NULL;
- for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) {
- if (track->type == WT_PT_EMPTY) {
+ for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
+ switch (track->type) {
+ case WT_PT_EMPTY:
empty = track;
- continue;
+ break;
+ case WT_PT_DISCARD:
+ case WT_PT_DISCARD_COMPLETE:
+ /* Block is already tracked for discard, ignore. */
+ if (track->addr.size == size &&
+ memcmp(addr, track->addr.addr, size) == 0)
+ return (0);
+ break;
+ case WT_PT_OVFL:
+ case WT_PT_OVFL_ACTIVE:
+ break;
}
- if (track->type == type &&
- track->addr.size == size &&
- memcmp(addr, track->addr.addr, size) == 0)
- return (0);
- }
/* Reallocate space as necessary. */
if (empty == NULL) {
@@ -107,7 +140,7 @@ __wt_rec_track_block(WT_SESSION_IMPL *session,
}
track = empty;
- track->type = type;
+ track->type = WT_PT_DISCARD;
track->data = NULL;
track->size = 0;
WT_RET(__wt_strndup(session, (char *)addr, size, &track->addr.addr));
@@ -155,7 +188,7 @@ __wt_rec_track_ovfl(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_RET(__wt_calloc_def(session, addr_size + data_size, &p));
track = empty;
- track->type = WT_PT_OVFL;
+ track->type = WT_PT_OVFL_ACTIVE;
track->addr.addr = p;
track->addr.size = addr_size;
memcpy(track->addr.addr, addr, addr_size);
@@ -186,12 +219,12 @@ __wt_rec_track_ovfl_reuse(WT_SESSION_IMPL *session, WT_PAGE *page,
mod = page->modify;
for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) {
/* Check for a match. */
- if (track->type != WT_PT_OVFL_DISCARD ||
+ if (track->type != WT_PT_OVFL ||
size != track->size || memcmp(data, track->data, size) != 0)
continue;
/* Found a match, return the record to use. */
- track->type = WT_PT_OVFL;
+ track->type = WT_PT_OVFL_ACTIVE;
/* Return the block addr/size pair to our caller. */
*addrp = track->addr.addr;
@@ -205,134 +238,83 @@ __wt_rec_track_ovfl_reuse(WT_SESSION_IMPL *session, WT_PAGE *page,
}
/*
- * __wt_rec_track_init --
- * Initialize/Reset the tracking information when writing a page.
- */
-int
-__wt_rec_track_init(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- WT_PAGE_MODIFY *mod;
- WT_PAGE_TRACK *track;
- uint32_t i;
-
- WT_VERBOSE_CALL(
- session, reconcile, __track_dump(session, page, "reconcile init"));
-
- mod = page->modify;
-
- for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
- switch (track->type) {
- case WT_PT_BLOCK_EVICT:
- /*
- * We had a block we would have discarded, had the last
- * reconciliation been the final one used to evict the
- * page -- it wasn't, and we didn't. Clear the slot.
- */
- __rec_track_clear(track);
- break;
- case WT_PT_OVFL:
- /*
- * An overflow item associated with this page: mark it
- * "not in use", we'll reactivate any being re-used as
- * we process the page.
- */
- WT_VERBOSE_CALL(session, reconcile, __track_msg(
- session, page, "set overflow OFF", &track->addr));
- track->type = WT_PT_OVFL_DISCARD;
- break;
- case WT_PT_EMPTY:
- break;
- case WT_PT_BLOCK:
- case WT_PT_OVFL_DISCARD:
- /*
- * We shouldn't see WT_PT_BLOCK or WT_PT_OVFL_DISCARD,
- * those blocks were discarded at the end of the last
- * reconciliation of this page.
- */
- /* FALLTHROUGH */
- WT_ILLEGAL_VALUE(session);
- }
- return (0);
-}
-
-/*
* __wt_rec_track_wrapup --
- * Temporarily/Permanently resolve the page's list of tracked objects.
+ * Resolve the page's list of tracked objects after the page is written.
*/
int
-__wt_rec_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page, int final)
+__wt_rec_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_PAGE_TRACK *track;
uint32_t i;
WT_VERBOSE_CALL(session, reconcile,
- __track_dump(session,
- page, final ? "eviction wrapup" : "reconcile wrapup"));
+ __track_dump(session, page, "reconcile wrapup"));
/*
* After a sync of a page, some of the objects we're tracking are no
* longer needed -- free what we can free.
- *
- * WT_PT_EMPTY:
- * Empty slot.
- * WT_PT_BLOCK:
- * A discarded block, free when this reconciliation completes.
- * WT_PT_BLOCK_EVICT:
- * A discarded block based on this reconciliation; if the page is
- * evicted based on this reconciliation, discard the block. (For
- * example, an overflow key that references a deleted item will be
- * discarded, but a subsequent reconciliation might find the key
- * is once more in use because the item is no longer deleted.)
- * WT_PT_OVFL:
- * An overflow record that's in-use. Ignored after any particular
- * reconciliation, because we need to track it for re-use in future
- * reconciliations. When the page is evicted, discard its memory,
- * leaving the underlying blocks alone.
- * WT_PT_OVFL_DISCARD:
- * An overflow record that's no longer in-use. Discard the memory
- * and free the underlying blocks after reconciliation completes.
*/
for (track = page->modify->track,
- i = 0; i < page->modify->track_entries; ++track, ++i) {
+ i = 0; i < page->modify->track_entries; ++track, ++i)
switch (track->type) {
case WT_PT_EMPTY:
+ case WT_PT_DISCARD_COMPLETE:
continue;
- case WT_PT_BLOCK_EVICT:
- if (!final)
- continue;
- /* FALLTHROUGH */
- case WT_PT_BLOCK:
+ case WT_PT_DISCARD:
WT_VERBOSE_CALL(session, reconcile, __track_msg(
session, page, "discard block", &track->addr));
+
+ /* We no longer need these blocks. */
WT_RET(__wt_bm_free(
session, track->addr.addr, track->addr.size));
- __wt_free(session, track->addr.addr);
- break;
- case WT_PT_OVFL:
- WT_VERBOSE_CALL(session, reconcile, __track_msg(
- session, page, "retain overflow", &track->addr));
- if (!final)
- continue;
- /* Freeing WT_PAGE_TRACK->addr frees ->data, too. */
- __wt_free(session, track->addr.addr);
+ /*
+ * The blocks are freed, but we still need to remember
+ * them: reset the type rather than clearing the slot.
+ */
+ track->type = WT_PT_DISCARD_COMPLETE;
break;
- case WT_PT_OVFL_DISCARD:
+ case WT_PT_OVFL:
WT_VERBOSE_CALL(session, reconcile, __track_msg(
session, page, "discard overflow", &track->addr));
+
+ /* We no longer need these blocks. */
WT_RET(__wt_bm_free(
session, track->addr.addr, track->addr.size));
- /* Freeing WT_PAGE_TRACK->addr frees ->data, too. */
+ /*
+ * We no longer need to know about the blocks, clear the
+ * slot.
+ *
+ * Freeing WT_PAGE_TRACK->addr frees ->data, too.
+ */
__wt_free(session, track->addr.addr);
+ __rec_track_clear(track);
+
+ break;
+ case WT_PT_OVFL_ACTIVE:
+ /* Reset the type to prepare for the next reconcile. */
+ track->type = WT_PT_OVFL;
break;
}
-
- __rec_track_clear(track);
- }
return (0);
}
+/*
+ * __wt_rec_track_discard --
+ * Discard the page's list of tracked objects.
+ */
+void
+__wt_rec_track_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_PAGE_TRACK *track;
+ uint32_t i;
+
+ for (track = page->modify->track,
+ i = 0; i < page->modify->track_entries; ++track, ++i)
+ __wt_free(session, track->addr.addr);
+}
+
#ifdef HAVE_VERBOSE
/*
* __track_dump --
@@ -364,17 +346,17 @@ static void
__track_print(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_TRACK *track)
{
switch (track->type) {
- case WT_PT_BLOCK:
- __track_msg(session, page, "block", &track->addr);
+ case WT_PT_DISCARD:
+ __track_msg(session, page, "discard", &track->addr);
+ break;
+ case WT_PT_DISCARD_COMPLETE:
+ __track_msg(session, page, "discard-complete", &track->addr);
break;
- case WT_PT_BLOCK_EVICT:
- __track_msg(session, page, "block-evict", &track->addr);
- return;
case WT_PT_OVFL:
- __track_msg(session, page, "overflow (on)", &track->addr);
+ __track_msg(session, page, "overflow", &track->addr);
break;
- case WT_PT_OVFL_DISCARD:
- __track_msg(session, page, "overflow (off)", &track->addr);
+ case WT_PT_OVFL_ACTIVE:
+ __track_msg(session, page, "overflow-active", &track->addr);
break;
case WT_PT_EMPTY:
default: /* Not possible. */
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index d92f66a06d0..9ba13171554 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -175,14 +175,15 @@ static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_PAGE *);
/*
* __rec_track_cell --
- * If a cell references an overflow chunk, add it to the page's list.
+ * If a cell we're re-writing references an overflow chunk, add it to the
+ * page's tracking list to be discarded after the write completes.
*/
static inline int
__rec_track_cell(
WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
{
- return (unpack->ovfl ? __wt_rec_track_block(session,
- WT_PT_BLOCK_EVICT, page, unpack->data, unpack->size) : 0);
+ return (unpack->ovfl ? __wt_rec_track_block(
+ session, page, unpack->data, unpack->size) : 0);
}
/*
@@ -211,9 +212,6 @@ __wt_rec_write(
/* Initialize the reconciliation structures for each new run. */
WT_RET(__rec_write_init(session, page));
- /* Initialize the overflow tracking information for each new run. */
- WT_RET(__wt_rec_track_init(session, page));
-
/* Reconcile the page. */
switch (page->type) {
case WT_PAGE_COL_FIX:
@@ -240,9 +238,6 @@ __wt_rec_write(
/* Wrap up the page's reconciliation. */
WT_RET(__rec_write_wrapup(session, page));
- /* Wrap up overflow tracking, discarding what we can. */
- WT_RET(__wt_rec_track_wrapup(session, page, 0));
-
/*
* If this page has a parent, mark the parent dirty.
*
@@ -666,7 +661,7 @@ __rec_split(WT_SESSION_IMPL *session)
* boundaries, or the split size was the same as the page size,
* so we never bothered with saving split-point information.
*
- * Write the current disk image.
+ * Finalize the header information and write the page.
*/
dsk->recno = bnd->recno;
dsk->u.entries = r->entries;
@@ -753,7 +748,7 @@ __rec_split_finish(WT_SESSION_IMPL *session)
*/
snapshot = r->bnd_next == 1 && WT_PAGE_IS_ROOT(r->page);
- /* Write the remaining information. */
+ /* Finalize the header information and write the page. */
dsk = r->dsk.mem;
dsk->recno = bnd->recno;
dsk->u.entries = r->entries;
@@ -809,7 +804,7 @@ __rec_split_fixup(WT_SESSION_IMPL *session)
len = WT_PTRDIFF32((bnd + 1)->start, bnd->start);
memcpy(dsk_start, bnd->start, len);
- /* Write the page. */
+ /* Finalize the header information and write the page. */
dsk->recno = bnd->recno;
dsk->u.entries = bnd->entries;
tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len;
@@ -850,15 +845,13 @@ static int
__rec_split_write(
WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_ITEM *buf, int snapshot)
{
- WT_BTREE *btree;
WT_CELL *cell;
WT_PAGE_HEADER *dsk;
- WT_RECONCILE *r;
uint32_t size;
uint8_t addr[WT_BM_MAX_ADDR_COOKIE];
- r = session->reconcile;
- btree = session->btree;
+ dsk = buf->mem;
+ WT_VERBOSE(session, write, "%s", __wt_page_type_string(dsk->type));
/*
* We always write an additional byte on row-store leaf pages after the
@@ -873,7 +866,6 @@ __rec_split_write(
* see it.
*/
#define WT_TRAILING_KEY_CELL (sizeof(uint8_t))
- dsk = buf->mem;
if (dsk->type == WT_PAGE_ROW_LEAF) {
WT_ASSERT_RET(session, buf->size < buf->memsize);
@@ -882,10 +874,18 @@ __rec_split_write(
++buf->size;
}
- /* Write the chunk and save the location information. */
- WT_VERBOSE(session, write, "%s", __wt_page_type_string(dsk->type));
+ /*
+ * Write the chunk and save the location information. There is one
+ * complication: if this is a snapshot, we have to wrap up our tracking
+ * information (freeing blocks we no longer need) before we can create
+ * the snapshot. Snapshots write extent lists, so the whole system has
+ * to be consistent at that point. We have to handle empty tree
+ * snapshots elsewhere (because we don't write anything for empty tree
+ * snapshots, they don't come through this path). Given that, clear the
+ * boundary address as a reminder, and write the snapshot later, during
+ * wrapup.
+ */
if (snapshot) {
- WT_RET(__wt_bm_snap_write(session, buf, btree->snap));
bnd->addr.addr = NULL;
bnd->addr.size = 0;
} else {
@@ -2509,8 +2509,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
*/
if (!WT_PAGE_IS_ROOT(page) && page->ref->addr != NULL) {
__wt_get_addr(page->parent, page->ref, &addr, &size);
- WT_RET(__wt_rec_track_block(
- session, WT_PT_BLOCK, page, addr, size));
+ WT_RET(__wt_rec_track_block(session, page, addr, size));
}
break;
case WT_PAGE_REC_EMPTY: /* Page deleted */
@@ -2523,7 +2522,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
* are snapshots, and must be explicitly dropped.
*/
if (!WT_PAGE_IS_ROOT(page))
- WT_RET(__wt_rec_track_block(session, WT_PT_BLOCK,
+ WT_RET(__wt_rec_track_block(session,
page, mod->u.replace.addr, mod->u.replace.size));
/* Discard the replacement page's address. */
@@ -2534,8 +2533,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
case WT_PAGE_REC_SPLIT: /* Page split */
/* Discard the split page's leaf-page blocks. */
WT_REF_FOREACH(mod->u.split, ref, i)
- WT_RET(__wt_rec_track_block(
- session, WT_PT_BLOCK, page,
+ WT_RET(__wt_rec_track_block(session, page,
((WT_ADDR *)ref->addr)->addr,
((WT_ADDR *)ref->addr)->size));
@@ -2554,6 +2552,15 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
}
F_CLR(page, WT_PAGE_REC_MASK);
+ /*
+ * Wrap up discarded block and overflow tracking. If we are about to
+ * create a snapshot, the system must be entirely consistent at that
+ * point: the underlying block manager is presumably going to take some
+ * action to resolve the lists of allocated and freed blocks that are
+ * associated with the snapshot.
+ */
+ WT_RET(__wt_rec_track_wrapup(session, page));
+
switch (r->bnd_next) {
case 0: /* Page delete */
WT_VERBOSE(session, reconcile, "page %p empty", page);
@@ -2577,10 +2584,19 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
* Because WiredTiger's pages grow without splitting, we're
* replacing a single page with another single page most of
* the time.
+ *
+ * If this is a root page, then we don't have an address and we
+ * have to create a sync point. The address was cleared when
+ * we were about to write the buffer so we know what to do here.
*/
bnd = &r->bnd[0];
- mod->u.replace = bnd->addr;
- bnd->addr.addr = NULL;
+ if (bnd->addr.addr == NULL)
+ WT_RET(
+ __wt_bm_snap_write(session, &r->dsk, btree->snap));
+ else {
+ mod->u.replace = bnd->addr;
+ bnd->addr.addr = NULL;
+ }
F_SET(page, WT_PAGE_REC_REPLACE);
break;
diff --git a/src/include/block.h b/src/include/block.h
index 27548f9f34a..784cdd46e8c 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -149,6 +149,7 @@ struct __wt_block {
WT_COMPRESSOR *compressor; /* Page compressor */
/* Salvage support */
+ int slvg; /* If performing salvage. */
off_t slvg_off; /* Salvage file offset */
/* Verification support */
diff --git a/src/include/btmem.h b/src/include/btmem.h
index e8ec2c7e9f7..0481cc8de0e 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -74,10 +74,10 @@ struct __wt_addr {
*/
typedef enum {
WT_PT_EMPTY=0, /* Unused slot */
- WT_PT_BLOCK, /* Block: inactive */
- WT_PT_BLOCK_EVICT, /* Block: inactive on eviction */
- WT_PT_OVFL, /* Overflow: active */
- WT_PT_OVFL_DISCARD /* Overflow: inactive */
+ WT_PT_DISCARD, /* Block/overflow to discard */
+ WT_PT_DISCARD_COMPLETE, /* Block/overflow freed */
+ WT_PT_OVFL, /* Overflow record not yet in use */
+ WT_PT_OVFL_ACTIVE /* Overflow record in use */
} __wt_pt_type_t;
struct __wt_page_modify {
@@ -274,13 +274,13 @@ struct __wt_page {
uint32_t memory_footprint;
#define WT_PAGE_INVALID 0 /* Invalid page */
-#define WT_PAGE_COL_FIX 1 /* Col-store fixed-len leaf */
-#define WT_PAGE_COL_INT 2 /* Col-store internal page */
-#define WT_PAGE_COL_VAR 3 /* Col-store var-length leaf page */
-#define WT_PAGE_OVFL 4 /* Overflow page */
-#define WT_PAGE_ROW_INT 5 /* Row-store internal page */
-#define WT_PAGE_ROW_LEAF 6 /* Row-store leaf page */
-#define WT_PAGE_FREELIST 7 /* Free-list page */
+#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */
+#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */
+#define WT_PAGE_COL_INT 3 /* Col-store internal page */
+#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */
+#define WT_PAGE_OVFL 5 /* Overflow page */
+#define WT_PAGE_ROW_INT 6 /* Row-store internal page */
+#define WT_PAGE_ROW_LEAF 7 /* Row-store leaf page */
uint8_t type; /* Page type */
/*
diff --git a/src/include/extern.h b/src/include/extern.h
index c3457da5db2..fd5f15ee6ed 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -32,14 +32,25 @@ extern int __wt_block_alloc( WT_SESSION_IMPL *session,
WT_BLOCK *block,
off_t *offp,
off_t size);
+extern int __wt_block_extend( WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ off_t *offp,
+ off_t size);
extern int __wt_block_free_buf(WT_SESSION_IMPL *session,
WT_BLOCK *block,
const uint8_t *addr,
uint32_t addr_size);
-extern int __wt_block_free( WT_SESSION_IMPL *session,
+extern int __wt_block_free(WT_SESSION_IMPL *session,
WT_BLOCK *block,
off_t off,
- off_t size);
+ off_t size,
+ int free_extent);
+extern int __wt_block_extlist_match( WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ WT_BLOCK_SNAPSHOT *si);
+extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session,
+ WT_EXTLIST *a,
+ WT_EXTLIST *b);
extern int __wt_block_extlist_read(WT_SESSION_IMPL *session,
WT_BLOCK *block,
WT_EXTLIST *el,
@@ -101,7 +112,7 @@ extern int __wt_bm_salvage_next(WT_SESSION_IMPL *session,
uint32_t *addr_sizep,
uint64_t *write_genp,
int *eofp);
-extern int __wt_bm_salvage_end(WT_SESSION_IMPL *session, int success);
+extern int __wt_bm_salvage_end(WT_SESSION_IMPL *session);
extern int __wt_bm_verify_start(WT_SESSION_IMPL *session);
extern int __wt_bm_verify_end(WT_SESSION_IMPL *session);
extern int __wt_bm_verify_addr(WT_SESSION_IMPL *session,
@@ -129,9 +140,7 @@ extern int __wt_block_read(WT_SESSION_IMPL *session,
uint32_t size,
uint32_t cksum);
extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern int __wt_block_salvage_end(WT_SESSION_IMPL *session,
- WT_BLOCK *block,
- int success);
+extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_block_salvage_next( WT_SESSION_IMPL *session,
WT_BLOCK *block,
WT_ITEM *buf,
@@ -139,6 +148,10 @@ extern int __wt_block_salvage_next( WT_SESSION_IMPL *session,
uint32_t *addr_sizep,
uint64_t *write_genp,
int *eofp);
+extern int __wt_block_snap_init(WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ WT_BLOCK_SNAPSHOT *si,
+ int is_live);
extern int __wt_block_snap_load(WT_SESSION_IMPL *session,
WT_BLOCK *block,
WT_ITEM *dsk,
@@ -181,7 +194,8 @@ extern int __wt_block_write(WT_SESSION_IMPL *session,
WT_ITEM *buf,
off_t *offsetp,
uint32_t *sizep,
- uint32_t *cksump);
+ uint32_t *cksump,
+ int force_extend);
extern int __wt_bulk_init(WT_CURSOR_BULK *cbulk);
extern int __wt_bulk_insert(WT_CURSOR_BULK *cbulk);
extern int __wt_bulk_end(WT_CURSOR_BULK *cbulk);
@@ -298,8 +312,7 @@ extern int __wt_col_search(WT_SESSION_IMPL *session,
extern int __wt_rec_evict(WT_SESSION_IMPL *session,
WT_PAGE *page,
uint32_t flags);
-extern int __wt_rec_track_block(WT_SESSION_IMPL *session,
- __wt_pt_type_t type,
+extern int __wt_rec_track_block( WT_SESSION_IMPL *session,
WT_PAGE *page,
const uint8_t *addr,
uint32_t size);
@@ -315,10 +328,8 @@ extern int __wt_rec_track_ovfl_reuse(WT_SESSION_IMPL *session,
uint32_t size,
uint8_t **addrp,
uint32_t *sizep);
-extern int __wt_rec_track_init(WT_SESSION_IMPL *session, WT_PAGE *page);
-extern int __wt_rec_track_wrapup(WT_SESSION_IMPL *session,
- WT_PAGE *page,
- int final);
+extern int __wt_rec_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern void __wt_rec_track_discard(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_rec_write( WT_SESSION_IMPL *session,
WT_PAGE *page,
WT_SALVAGE_COOKIE *salvage);
diff --git a/src/include/misc.h b/src/include/misc.h
index d419912654f..45e8b808010 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -117,10 +117,6 @@
if (WT_VERBOSE_ISSET(session, f)) \
func; \
} while (0)
-#define WT_VERBOSE_CALL_ERR(session, f, func) do { \
- if (WT_VERBOSE_ISSET(session, f)) \
- WT_ERR(func); \
-} while (0)
#define WT_VERBOSE_CALL_RET(session, f, func) do { \
if (WT_VERBOSE_ISSET(session, f)) \
WT_RET(func); \