summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDon Anderson <dda@ddanderson.com>2012-03-16 14:24:38 -0400
committerDon Anderson <dda@ddanderson.com>2012-03-16 14:24:38 -0400
commit6f048f9116aec392898816e6c50669f64c1cc93e (patch)
tree1f328bdee2b7aef09f1c6be5a69040731b6beae3
parentfe21c095f8aa6f82347937739e42fb2b0b4eca3d (diff)
parentbd3a2cc352e427658140604ff96ef89945456351 (diff)
downloadmongo-6f048f9116aec392898816e6c50669f64c1cc93e.tar.gz
Merge branch 'master' of https://github.com/wiredtiger/wiredtiger
-rw-r--r--dist/filelist2
-rw-r--r--dist/s_string.ok2
-rw-r--r--src/block/block_ext.c (renamed from src/block/block_alloc.c)530
-rw-r--r--src/block/block_open.c14
-rw-r--r--src/block/block_vrfy.c12
-rw-r--r--src/include/block.h69
-rw-r--r--src/include/extern.h19
-rw-r--r--src/include/wt_internal.in6
8 files changed, 348 insertions, 306 deletions
diff --git a/dist/filelist b/dist/filelist
index 802d7a03723..264af59c511 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -5,8 +5,8 @@ src/api/api_event.c
src/api/api_strerror.c
src/api/api_version.c
src/block/block_addr.c
-src/block/block_alloc.c
src/block/block_cksum.c
+src/block/block_ext.c
src/block/block_mgr.c
src/block/block_open.c
src/block/block_read.c
diff --git a/dist/s_string.ok b/dist/s_string.ok
index df49454f9ad..d02605f04db 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -45,6 +45,7 @@ ENOMEM
ENOTSUP
ETIME
ETIMEDOUT
+EXTLIST
Enqueue
Env
Eron
@@ -249,6 +250,7 @@ eventv
evictserver
exactp
extern
+extlist
fcntl
ffc
ffs
diff --git a/src/block/block_alloc.c b/src/block/block_ext.c
index 28b604bc7ef..a03101c74a5 100644
--- a/src/block/block_alloc.c
+++ b/src/block/block_ext.c
@@ -7,17 +7,23 @@
#include "wt_internal.h"
-static int __block_extend(WT_SESSION_IMPL *, WT_BLOCK *, off_t *, off_t);
-static int __block_truncate(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __block_extend(WT_SESSION_IMPL *, WT_BLOCK *, off_t *, off_t);
+static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, off_t, off_t);
+static int __block_truncate(WT_SESSION_IMPL *, WT_BLOCK *);
+
+#ifdef HAVE_VERBOSE
+static void __block_extlist_dump(WT_SESSION_IMPL *, WT_EXTLIST *);
+#endif
/*
* __block_off_srch --
- * Search a by-offset skiplist.
+ * Search a by-offset skiplist (either the primary by-offset list, or the
+ * by-offset list referenced by a size entry), for the specified offset.
*/
static void
-__block_off_srch(WT_FREE **head, off_t off, WT_FREE ***stack, int skip_off)
+__block_off_srch(WT_EXT **head, off_t off, WT_EXT ***stack, int skip_off)
{
- WT_FREE **fep;
+ WT_EXT **extp;
int i;
/*
@@ -26,20 +32,21 @@ __block_off_srch(WT_FREE **head, off_t off, WT_FREE ***stack, int skip_off)
*
* Return a stack for an exact match or the next-largest item.
*
- * The WT_FREE structure contains two skiplists, the primary one and the
+ * The WT_EXT structure contains two skiplists, the primary one and the
* per-size bucket one: if the skip_off flag is set, offset the skiplist
* array by the depth specified in this particular structure.
*/
- for (i = WT_SKIP_MAXDEPTH - 1, fep = &head[i]; i >= 0;)
- if (*fep != NULL && (*fep)->off < off)
- fep = &(*fep)->next[i + (skip_off ? (*fep)->depth : 0)];
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+ if (*extp != NULL && (*extp)->off < off)
+ extp =
+ &(*extp)->next[i + (skip_off ? (*extp)->depth : 0)];
else
- stack[i--] = fep--;
+ stack[i--] = extp--;
}
/*
* __block_size_srch --
- * Search the by-size skiplist.
+ * Search the by-size skiplist for the specified size.
*/
static void
__block_size_srch(WT_SIZE **head, off_t size, WT_SIZE ***stack)
@@ -62,165 +69,165 @@ __block_size_srch(WT_SIZE **head, off_t size, WT_SIZE ***stack)
/*
* __block_off_pair_srch --
- * Search the primary by-offset skiplist for before/after records of the
- * specified offset.
+ * Search a by-offset skiplist for before/after records of the specified
+ * offset.
*/
static void
__block_off_pair_srch(
- WT_FREE **head, off_t off, WT_FREE **beforep, WT_FREE **afterp)
+ WT_EXTLIST *el, off_t off, WT_EXT **beforep, WT_EXT **afterp)
{
- WT_FREE **fep;
+ WT_EXT **head, **extp;
int i;
*beforep = *afterp = NULL;
+ head = el->off;
+
/*
* Start at the highest skip level, then go as far as possible at each
* level before stepping down to the next.
*/
- for (i = WT_SKIP_MAXDEPTH - 1, fep = &head[i]; i >= 0;) {
- if (*fep == NULL) {
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) {
+ if (*extp == NULL) {
--i;
- --fep;
+ --extp;
continue;
}
- if ((*fep)->off < off) { /* Keep going at this level */
- *beforep = *fep;
- fep = &(*fep)->next[i];
+ if ((*extp)->off < off) { /* Keep going at this level */
+ *beforep = *extp;
+ extp = &(*extp)->next[i];
} else { /* Drop down a level */
- *afterp = *fep;
+ *afterp = *extp;
--i;
- --fep;
+ --extp;
}
}
}
/*
- * __block_off_last --
- * Return the last free range in the offset skiplist.
+ * __block_extlist_last --
+ * Return the last extent range in the skiplist.
*/
-static WT_FREE *
-__block_off_last(WT_FREE **head)
+static WT_EXT *
+__block_extlist_last(WT_EXT **head)
{
- WT_FREE *fe, **fep;
+ WT_EXT *ext, **extp;
int i;
- fe = NULL;
+ ext = NULL;
- for (i = WT_SKIP_MAXDEPTH - 1, fep = &head[i]; i >= 0;) {
- if (*fep == NULL) {
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) {
+ if (*extp == NULL) {
--i;
- --fep;
+ --extp;
continue;
}
- fe = *fep;
- fep = &(*fep)->next[i];
+ ext = *extp;
+ extp = &(*extp)->next[i];
}
- return (fe);
+ return (ext);
}
/*
* __block_off_insert --
- * Insert a record into the system.
+ * Insert a record into an extent list.
*/
static int
-__block_off_insert(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_FREE *fe)
+__block_off_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext)
{
- WT_FREE **astack[WT_SKIP_MAXDEPTH];
+ WT_EXT **astack[WT_SKIP_MAXDEPTH];
WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
u_int i;
/*
* If we are inserting a new size onto the size skiplist, we'll need
- * a new WT_FREE structure for that skiplist.
+ * a new WT_EXT structure for that skiplist.
*/
- __block_size_srch(block->fsize, fe->size, sstack);
+ __block_size_srch(el->size, ext->size, sstack);
szp = *sstack[0];
- if (szp == NULL || szp->size != fe->size) {
+ if (szp == NULL || szp->size != ext->size) {
WT_RET(__wt_calloc(session, 1,
- sizeof(WT_SIZE) + fe->depth * sizeof(WT_SIZE *), &szp));
- szp->size = fe->size;
- szp->depth = fe->depth;
- for (i = 0; i < fe->depth; ++i) {
+ sizeof(WT_SIZE) + ext->depth * sizeof(WT_SIZE *), &szp));
+ szp->size = ext->size;
+ szp->depth = ext->depth;
+ for (i = 0; i < ext->depth; ++i) {
szp->next[i] = *sstack[i];
*sstack[i] = szp;
}
}
- /* Insert the new WT_FREE structure into the offset skiplist. */
- __block_off_srch(block->foff, fe->off, astack, 0);
- for (i = 0; i < fe->depth; ++i) {
- fe->next[i] = *astack[i];
- *astack[i] = fe;
+ /* Insert the new WT_EXT structure into the offset skiplist. */
+ __block_off_srch(el->off, ext->off, astack, 0);
+ for (i = 0; i < ext->depth; ++i) {
+ ext->next[i] = *astack[i];
+ *astack[i] = ext;
}
/*
- * Insert the new WT_FREE structure into the size element's offset
+ * Insert the new WT_EXT structure into the size element's offset
* skiplist.
*/
- __block_off_srch(szp->foff, fe->off, astack, 1);
- for (i = 0; i < fe->depth; ++i) {
- fe->next[i + fe->depth] = *astack[i];
- *astack[i] = fe;
+ __block_off_srch(szp->off, ext->off, astack, 1);
+ for (i = 0; i < ext->depth; ++i) {
+ ext->next[i + ext->depth] = *astack[i];
+ *astack[i] = ext;
}
- ++block->freelist_entries;
- block->freelist_bytes += (uint64_t)fe->size;
- block->freelist_dirty = 1;
+ ++el->entries;
+ el->bytes += (uint64_t)ext->size;
return (0);
}
/*
* __block_off_remove --
- * Remove a record from the system.
+ * Remove a record from an extent list.
*/
static int
__block_off_remove(
- WT_SESSION_IMPL *session, WT_BLOCK *block, off_t off, WT_FREE **fep)
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, WT_EXT **extp)
{
- WT_FREE *fe, **astack[WT_SKIP_MAXDEPTH];
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
u_int i;
/* Find and remove the record from the by-offset skiplist. */
- __block_off_srch(block->foff, off, astack, 0);
- fe = *astack[0];
- if (fe == NULL || fe->off != off)
+ __block_off_srch(el->off, off, astack, 0);
+ ext = *astack[0];
+ if (ext == NULL || ext->off != off)
return (EINVAL);
- for (i = 0; i < fe->depth; ++i)
- *astack[i] = fe->next[i];
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext->next[i];
/*
* Find and remove the record from the size's offset skiplist; if that
* empties the by-size skiplist entry, remove it as well.
*/
- __block_size_srch(block->fsize, fe->size, sstack);
+ __block_size_srch(el->size, ext->size, sstack);
szp = *sstack[0];
- if (szp == NULL || szp->size != fe->size)
+ if (szp == NULL || szp->size != ext->size)
return (EINVAL);
- __block_off_srch(szp->foff, off, astack, 1);
- fe = *astack[0];
- if (fe == NULL || fe->off != off)
+ __block_off_srch(szp->off, off, astack, 1);
+ ext = *astack[0];
+ if (ext == NULL || ext->off != off)
return (EINVAL);
- for (i = 0; i < fe->depth; ++i)
- *astack[i] = fe->next[i + fe->depth];
- if (szp->foff[0] == NULL) {
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext->next[i + ext->depth];
+ if (szp->off[0] == NULL) {
for (i = 0; i < szp->depth; ++i)
*sstack[i] = szp->next[i];
__wt_free(session, szp);
}
- --block->freelist_entries;
- block->freelist_bytes -= (uint64_t)fe->size;
- block->freelist_dirty = 1;
+ --el->entries;
+ el->bytes -= (uint64_t)ext->size;
/* Return the record if our caller wants it, otherwise free it. */
- if (fep == NULL)
- __wt_free(session, fe);
+ if (extp == NULL)
+ __wt_free(session, ext);
else
- *fep = fe;
+ *extp = ext;
return (0);
}
@@ -231,9 +238,9 @@ __block_off_remove(
*/
int
__wt_block_alloc(
- WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offsetp, off_t size)
+ WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offp, off_t size)
{
- WT_FREE *fe;
+ WT_EXT *ext;
WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
int ret;
@@ -253,37 +260,37 @@ __wt_block_alloc(
* requested size and take the first entry on the by-size offset list.
* If we don't have anything large enough, extend the file.
*/
- __block_size_srch(block->fsize, size, sstack);
+ __block_size_srch(block->free.size, size, sstack);
szp = *sstack[0];
if (szp == NULL) {
- WT_ERR(__block_extend(session, block, offsetp, size));
+ WT_ERR(__block_extend(session, block, offp, size));
goto done;
}
/* Remove the first record, and set the returned offset. */
- fe = szp->foff[0];
- WT_ERR(__block_off_remove(session, block, fe->off, &fe));
- *offsetp = fe->off;
+ ext = szp->off[0];
+ WT_ERR(__block_off_remove(session, &block->free, ext->off, &ext));
+ *offp = ext->off;
/* If doing a partial allocation, adjust the record and put it back. */
- if (fe->size > size) {
+ if (ext->size > size) {
WT_VERBOSE(session, block,
"allocate %" PRIdMAX " from range %" PRIdMAX "-%"
PRIdMAX ", range shrinks to %" PRIdMAX "-%" PRIdMAX,
(intmax_t)size,
- (intmax_t)fe->off, (intmax_t)(fe->off + fe->size),
- (intmax_t)(fe->off + size),
- (intmax_t)(fe->off + size + fe->size - size));
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)(ext->off + size),
+ (intmax_t)(ext->off + size + ext->size - size));
- fe->off += size;
- fe->size -= size;
- WT_ERR(__block_off_insert(session, block,fe));
+ ext->off += size;
+ ext->size -= size;
+ WT_ERR(__block_off_insert(session, &block->free, ext));
} else {
WT_VERBOSE(session, block,
"allocate range %" PRIdMAX "-%" PRIdMAX,
- (intmax_t)fe->off, (intmax_t)(fe->off + fe->size));
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size));
- __wt_free(session, fe);
+ __wt_free(session, ext);
}
done: err:
@@ -297,7 +304,7 @@ done: err:
*/
static int
__block_extend(
- WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offsetp, off_t size)
+ WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offp, off_t size)
{
WT_FH *fh;
@@ -328,13 +335,13 @@ __block_extend(
WT_RET_MSG(session, WT_ERROR,
"block allocation failed, file cannot grow further");
- *offsetp = fh->file_size;
+ *offp = fh->file_size;
fh->file_size += size;
WT_BSTAT_INCR(session, extend);
WT_VERBOSE(session, block,
"file extend %" PRIdMAX "B @ %" PRIdMAX,
- (intmax_t)size, (intmax_t)*offsetp);
+ (intmax_t)size, (intmax_t)*offp);
return (0);
}
@@ -347,15 +354,14 @@ int
__wt_block_free_buf(WT_SESSION_IMPL *session,
WT_BLOCK *block, const uint8_t *addr, uint32_t addr_size)
{
- off_t offset;
+ off_t off;
uint32_t size;
WT_UNUSED(addr_size);
/* Crack the cookie. */
- WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, NULL));
-
- WT_RET(__wt_block_free(session, block, offset, (off_t)size));
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &off, &size, NULL));
+ WT_RET(__wt_block_free(session, block, off, size));
return (0);
}
@@ -368,56 +374,68 @@ int
__wt_block_free(
WT_SESSION_IMPL *session, WT_BLOCK *block, off_t off, off_t size)
{
- WT_FREE *fe, *after, *before;
- u_int skipdepth;
int ret;
- ret = 0;
-
WT_BSTAT_INCR(session, free);
WT_VERBOSE(session, block,
"free %" PRIdMAX "/%" PRIdMAX, (intmax_t)off, (intmax_t)size);
__wt_spin_lock(session, &block->freelist_lock);
+ ret = __block_merge(session, &block->free, off, (off_t)size);
+ __wt_spin_unlock(session, &block->freelist_lock);
+
+ return (ret);
+}
+
+/*
+ * __block_merge --
+ * Insert an extent into an extent list, merging if possible.
+ */
+static int
+__block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
+{
+ WT_EXT *ext, *after, *before;
+ u_int skipdepth;
/*
* Retrieve the records preceding/following the offset. If the records
* are contiguous with the free'd offset, combine records.
*/
- __block_off_pair_srch(block->foff, off, &before, &after);
+ __block_off_pair_srch(el, off, &before, &after);
if (before != NULL) {
if (before->off + before->size > off)
- WT_ERR_MSG(session, EINVAL,
- "existing range %" PRIdMAX "-%" PRIdMAX " overlaps "
- "with free'd range %" PRIdMAX "-%" PRIdMAX,
+ WT_RET_MSG(session, EINVAL,
+ "%s: existing range %" PRIdMAX "-%" PRIdMAX
+ " overlaps with merge range %" PRIdMAX "-%" PRIdMAX,
+ el->name,
(intmax_t)before->off,
(intmax_t)(before->off + before->size),
- (intmax_t)off,
- (intmax_t)(off + size));
+ (intmax_t)off, (intmax_t)(off + size));
if (before->off + before->size != off)
before = NULL;
}
if (after != NULL) {
if (off + size > after->off)
- WT_ERR_MSG(session, EINVAL,
- "free'd range %" PRIdMAX "-%" PRIdMAX " overlaps "
- "with existing range %" PRIdMAX "-%" PRIdMAX,
- (intmax_t)off,
- (intmax_t)(off + size),
+ WT_RET_MSG(session, EINVAL,
+ "%s: merge range %" PRIdMAX "-%" PRIdMAX
+ " overlaps with existing range %" PRIdMAX
+ "-%" PRIdMAX,
+ el->name,
+ (intmax_t)off, (intmax_t)(off + size),
(intmax_t)after->off,
(intmax_t)(after->off + after->size));
if (off + size != after->off)
after = NULL;
}
if (before == NULL && after == NULL) {
- /* Allocate a new WT_FREE structure. */
+ /* Allocate a new WT_EXT structure. */
skipdepth = __wt_skip_choose_depth();
- WT_ERR(__wt_calloc(session, 1,
- sizeof(WT_FREE) + skipdepth * 2 * sizeof(WT_FREE *), &fe));
- fe->off = off;
- fe->size = size;
- fe->depth = (uint8_t)skipdepth;
- goto done;
+ WT_RET(__wt_calloc(session, 1,
+ sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), &ext));
+ ext->off = off;
+ ext->size = size;
+ ext->depth = (uint8_t)skipdepth;
+ return (__block_off_insert(session, el, ext));
}
/*
@@ -428,126 +446,120 @@ __wt_block_free(
* the record we're going to use, adjust it and re-insert it.
*/
if (before == NULL) {
- WT_ERR(__block_off_remove(session, block, after->off, &fe));
+ WT_RET(__block_off_remove(session, el, after->off, &ext));
WT_VERBOSE(session, block,
- "free range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
+ "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
PRIdMAX "-%" PRIdMAX,
- (intmax_t)fe->off, (intmax_t)(fe->off + fe->size),
- (intmax_t)off, (intmax_t)(off + fe->size + size));
+ el->name,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)off, (intmax_t)(off + ext->size + size));
- fe->off = off;
- fe->size += size;
+ ext->off = off;
+ ext->size += size;
} else {
if (after != NULL) {
size += after->size;
- WT_ERR(__block_off_remove(
- session, block, after->off, NULL));
+ WT_RET(
+ __block_off_remove(session, el, after->off, NULL));
}
- WT_ERR(__block_off_remove(session, block, before->off, &fe));
+ WT_RET(__block_off_remove(session, el, before->off, &ext));
WT_VERBOSE(session, block,
- "free range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
+ "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
PRIdMAX "-%" PRIdMAX,
- (intmax_t)fe->off, (intmax_t)(fe->off + fe->size),
- (intmax_t)fe->off, (intmax_t)(fe->off + fe->size + size));
+ el->name,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)ext->off,
+ (intmax_t)(ext->off + ext->size + size));
- fe->size += size;
+ ext->size += size;
}
-
-done: WT_ERR(__block_off_insert(session, block,fe));
-
-err: __wt_spin_unlock(session, &block->freelist_lock);
- return (ret);
+ return (__block_off_insert(session, el, ext));
}
/*
- * __wt_block_freelist_open --
- * Initialize the free-list structures.
- */
-void
-__wt_block_freelist_open(WT_SESSION_IMPL *session, WT_BLOCK *block)
-{
- __wt_spin_init(session, &block->freelist_lock);
-
- block->freelist_bytes = 0;
- block->freelist_entries = 0;
- block->freelist_dirty = 0;
-
- block->free_offset = WT_BLOCK_INVALID_OFFSET;
- block->free_size = block->free_cksum = 0;
-}
-
-/*
- * __wt_block_freelist_read --
- * Read the free-list.
+ * __wt_block_extlist_read --
+ * Read an extent list.
*/
int
-__wt_block_freelist_read(WT_SESSION_IMPL *session, WT_BLOCK *block)
+__wt_block_extlist_read(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, off_t off, uint32_t size, uint32_t cksum)
{
WT_ITEM *tmp;
- off_t offset, size;
+ off_t loff, lsize;
uint8_t *p;
int ret;
tmp = NULL;
ret = 0;
- /* See if there's a free-list to read. */
- if (block->free_offset == WT_BLOCK_INVALID_OFFSET)
- return (0);
-
- WT_RET(__wt_scr_alloc(session, block->free_size, &tmp));
- WT_ERR(__wt_block_read(session, block,
- tmp, block->free_offset, block->free_size, block->free_cksum));
-
- /*
- * The free-list is read before the file is verified, which means we
- * need to be a little paranoid. We know the free-list chunk itself
- * is entirely in the file because we checked when we first read the
- * file's description structure. Confirm the offset/size pairs are
- * valid, then insert them into the linked list.
- */
- p = WT_BLOCK_HEADER_BYTE(tmp->mem);
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+ WT_ERR(__wt_block_read(session, block, tmp, off, size, cksum));
-#define WT_FREELIST_READ(p, v) do { \
+#define WT_EXTLIST_READ(p, v) do { \
(v) = *(off_t *)(p); \
(p) += sizeof(off_t); \
} while (0)
- WT_FREELIST_READ(p, offset);
- WT_FREELIST_READ(p, size);
- if (offset != WT_BLOCK_FREELIST_MAGIC || size != 0)
+ p = WT_BLOCK_HEADER_BYTE(tmp->mem);
+ WT_EXTLIST_READ(p, loff);
+ WT_EXTLIST_READ(p, lsize);
+ if (loff != WT_BLOCK_EXTLIST_MAGIC || lsize != 0)
goto corrupted;
for (;;) {
- WT_FREELIST_READ(p, offset);
- WT_FREELIST_READ(p, size);
- if (offset == WT_BLOCK_INVALID_OFFSET)
+ WT_EXTLIST_READ(p, loff);
+ WT_EXTLIST_READ(p, lsize);
+ if (loff == WT_BLOCK_INVALID_OFFSET)
break;
- if ((offset - WT_BLOCK_DESC_SECTOR) % block->allocsize != 0 ||
- size % block->allocsize != 0 ||
- offset + size > block->fh->file_size)
+ /*
+ * We check the offset/size pairs represent valid file ranges,
+ * then insert them into the list. We don't necessarily have
+ * to check for offsets past the end-of-file, but it's a cheap
+ * and easy test to do here and we'd have to do the check as
+ * part of file verification, regardless.
+ */
+ if ((loff - WT_BLOCK_DESC_SECTOR) % block->allocsize != 0 ||
+ lsize % block->allocsize != 0 ||
+ loff + lsize > block->fh->file_size)
corrupted: WT_ERR_MSG(session, WT_ERROR,
- "file contains a corrupted free-list, range %"
+ "file contains a corrupted %s extent list, range %"
PRIdMAX "-%" PRIdMAX " past end-of-file",
- (intmax_t)offset, (intmax_t)(offset + size));
-
- WT_ERR(__wt_block_free(session, block, offset, size));
+ el->name,
+ (intmax_t)loff, (intmax_t)(loff + lsize));
+
+ /*
+ * We could insert instead of merge, because ranges shouldn't
+ * overlap, but merge knows how to allocate WT_EXT structures,
+ * and a little paranoia is a good thing.
+ */
+ WT_ERR(__block_merge(session, el, loff, lsize));
}
- /*
- * Insert the free-list itself into the linked list, but don't clear
- * the values, if the free-list is never modified, we don't write it.
- */
- WT_ERR(__wt_block_free(
- session, block, block->free_offset, (off_t)block->free_size));
+ WT_VERBOSE_CALL(session, block, __block_extlist_dump(session, el));
err: __wt_scr_free(&tmp);
return (ret);
}
/*
+ * __wt_block_freelist_open --
+ * Initialize the free-list structures.
+ */
+void
+__wt_block_freelist_open(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ __wt_spin_init(session, &block->freelist_lock);
+
+ memset(&block->free, 0, sizeof(block->free));
+ block->free.name = "freelist";
+
+ block->free_offset = WT_BLOCK_INVALID_OFFSET;
+ block->free_size = block->free_cksum = 0;
+}
+
+/*
* __wt_block_freelist_close --
* Write the free-list at the tail of the file.
*/
@@ -558,38 +570,36 @@ __wt_block_freelist_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
}
/*
- * __wt_block_freelist_write --
- * Write the free-list at the tail of the file.
+ * __wt_block_extlist_write --
+ * Write an extent list at the tail of the file.
*/
int
-__wt_block_freelist_write(WT_SESSION_IMPL *session, WT_BLOCK *block)
+__wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_EXTLIST *el, off_t *offp, uint32_t *sizep, uint32_t *cksump)
{
+ WT_EXT *ext;
WT_ITEM *tmp;
- WT_FREE *fe;
WT_PAGE_HEADER *dsk;
uint32_t datasize, size;
uint8_t *p;
int ret;
+ const char *name;
tmp = NULL;
ret = 0;
- /* If the free-list hasn't changed, there's nothing to write. */
- if (block->freelist_dirty == 0)
- return (0);
+ WT_VERBOSE_CALL(session, block, __block_extlist_dump(session, el));
/* If there aren't any free-list entries, we're done. */
- if (block->freelist_entries == 0) {
- block->free_offset = WT_BLOCK_INVALID_OFFSET;
- block->free_size = block->free_cksum = 0;
+ if (el->entries == 0) {
+ *offp = WT_BLOCK_INVALID_OFFSET;
+ *sizep = *cksump = 0;
return (0);
}
/* Truncate the file if possible. */
WT_RET(__block_truncate(session, block));
- WT_VERBOSE_CALL(session, block, __wt_block_dump(session, block));
-
/*
* Get a scratch buffer, clear the page's header and data, initialize
* the header. Allocate an allocation-sized aligned buffer so the
@@ -597,11 +607,10 @@ __wt_block_freelist_write(WT_SESSION_IMPL *session, WT_BLOCK *block)
* copying to something larger.
*
* Allocate room for the free-list entries, plus 2 additional entries:
- * the initial WT_BLOCK_FREELIST_MAGIC/0 pair and the list-terminating
+ * the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-terminating
* WT_BLOCK_INVALID_OFFSET/0 pair.
*/
- datasize = size =
- (block->freelist_entries + 2) * WT_STORE_SIZE(sizeof(off_t) * 2);
+ datasize = size = (el->entries + 2) * WT_STORE_SIZE(sizeof(off_t) * 2);
WT_RET(__wt_block_write_size(session, block, &size));
WT_RET(__wt_scr_alloc(session, size, &tmp));
dsk = tmp->mem;
@@ -610,24 +619,24 @@ __wt_block_freelist_write(WT_SESSION_IMPL *session, WT_BLOCK *block)
dsk->type = WT_PAGE_FREELIST;
tmp->size = WT_STORE_SIZE(WT_BLOCK_HEADER_BYTE_SIZE + datasize);
- /* Fill the page's data. */
- p = WT_BLOCK_HEADER_BYTE(dsk);
-
-#define WT_FREELIST_WRITE(p, v) do { \
+#define WT_EXTLIST_WRITE(p, v) do { \
*(off_t *)(p) = (v); \
(p) += sizeof(off_t); \
} while (0)
- WT_FREELIST_WRITE(p, WT_BLOCK_FREELIST_MAGIC); /* Initial value */
- WT_FREELIST_WRITE(p, 0);
- WT_FREE_FOREACH(fe, block->foff) { /* Free ranges */
- WT_FREELIST_WRITE(p, fe->off);
- WT_FREELIST_WRITE(p, fe->size);
+ /* Fill the page's data. */
+ p = WT_BLOCK_HEADER_BYTE(dsk);
+ WT_EXTLIST_WRITE(p, WT_BLOCK_EXTLIST_MAGIC); /* Initial value */
+ WT_EXTLIST_WRITE(p, 0);
+ WT_EXT_FOREACH(ext, el->off) { /* Free ranges */
+ WT_EXTLIST_WRITE(p, ext->off);
+ WT_EXTLIST_WRITE(p, ext->size);
}
- WT_FREELIST_WRITE(p, WT_BLOCK_INVALID_OFFSET); /* Ending value */
- WT_FREELIST_WRITE(p, 0);
+ WT_EXTLIST_WRITE(p, WT_BLOCK_INVALID_OFFSET); /* Ending value */
+ WT_EXTLIST_WRITE(p, 0);
/*
+ * XXX
* Discard the in-memory free-list: this has to happen before writing
* the free-list because the underlying block write function is going
* to allocate file space for the free-list block(s), and allocating
@@ -637,15 +646,15 @@ __wt_block_freelist_write(WT_SESSION_IMPL *session, WT_BLOCK *block)
* complex than ordering these operations so the eventual allocation
* in the write code always extends the file.
*/
+ name = el->name;
+ el = NULL;
__wt_block_discard(session, block);
- /* Write the free list to disk and update the file's meta-data. */
- WT_ERR(__wt_block_write(session, block, tmp,
- &block->free_offset, &block->free_size, &block->free_cksum));
+ /* Write the extent list to disk. */
+ WT_ERR(__wt_block_write(session, block, tmp, offp, sizep, cksump));
WT_VERBOSE(session, block,
- "free-list written %" PRIdMAX "/%" PRIu32,
- (intmax_t)block->free_offset, block->free_size);
+ "%s written %" PRIdMAX "/%" PRIu32, name, (intmax_t)*offp, *sizep);
err: __wt_scr_free(&tmp);
return (ret);
@@ -658,8 +667,8 @@ err: __wt_scr_free(&tmp);
static int
__block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
+ WT_EXT *ext;
WT_FH *fh;
- WT_FREE *fe;
fh = block->fh;
@@ -667,47 +676,43 @@ __block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block)
* Check if the last free range is at the end of the file, and if so,
* truncate the file and discard the range.
*/
- if ((fe = __block_off_last(block->foff)) == NULL)
+ if ((ext = __block_extlist_last(block->free.off)) == NULL)
return (0);
- if (fe->off + fe->size != fh->file_size)
+ if (ext->off + ext->size != fh->file_size)
return (0);
WT_VERBOSE(session, block,
"truncate file from %" PRIdMAX " to %" PRIdMAX,
- (intmax_t)fh->file_size, (intmax_t)fe->off);
+ (intmax_t)fh->file_size, (intmax_t)ext->off);
- fh->file_size = fe->off;
+ fh->file_size = ext->off;
WT_RET(__wt_ftruncate(session, fh, fh->file_size));
- WT_RET(__block_off_remove(session, block, fe->off, NULL));
+ WT_RET(__block_off_remove(session, &block->free, ext->off, NULL));
return (0);
}
/*
* __wt_block_discard --
- * Discard any free-list entries.
+ * Discard any extent-list entries.
*/
void
__wt_block_discard(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
- WT_FREE *fe, *nfe;
+ WT_EXT *ext, *next;
WT_SIZE *szp, *nszp;
- for (fe = block->foff[0]; fe != NULL; fe = nfe) {
- nfe = fe->next[0];
- __wt_free(session, fe);
+ for (ext = block->free.off[0]; ext != NULL; ext = next) {
+ next = ext->next[0];
+ __wt_free(session, ext);
}
- memset(block->foff, 0, sizeof(block->foff));
- for (szp = block->fsize[0]; szp != NULL; szp = nszp) {
+ for (szp = block->free.size[0]; szp != NULL; szp = nszp) {
nszp = szp->next[0];
__wt_free(session, szp);
}
- memset(block->fsize, 0, sizeof(block->fsize));
- block->freelist_bytes = 0;
- block->freelist_entries = 0;
- block->freelist_dirty = 0;
+ memset(&block->free, 0, sizeof(block->free));
}
/*
@@ -717,8 +722,8 @@ __wt_block_discard(WT_SESSION_IMPL *session, WT_BLOCK *block)
void
__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
- WT_BSTAT_SET(session, file_freelist_bytes, block->freelist_bytes);
- WT_BSTAT_SET(session, file_freelist_entries, block->freelist_entries);
+ WT_BSTAT_SET(session, file_freelist_bytes, block->free.bytes);
+ WT_BSTAT_SET(session, file_freelist_entries, block->free.entries);
WT_BSTAT_SET(session, file_size, block->fh->file_size);
WT_BSTAT_SET(session, file_magic, WT_BLOCK_MAGIC);
WT_BSTAT_SET(session, file_major, WT_BLOCK_MAJOR_VERSION);
@@ -726,27 +731,32 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block)
}
#ifdef HAVE_VERBOSE
-void
-__wt_block_dump(WT_SESSION_IMPL *session, WT_BLOCK *block)
+static void
+__block_extlist_dump(WT_SESSION_IMPL *session, WT_EXTLIST *el)
{
- WT_FREE *fe;
+ WT_EXT *ext;
WT_SIZE *szp;
- WT_VERBOSE(session, block, "freelist by offset:");
- WT_FREE_FOREACH(fe, block->foff)
+ if (el->entries == 0) {
+ WT_VERBOSE(session, block, "%s: [Empty]", el->name);
+ return;
+ }
+
+ WT_VERBOSE(session, block, "%s: list by offset:", el->name);
+ WT_EXT_FOREACH(ext, el->off)
WT_VERBOSE(session, block,
"\t{%" PRIuMAX "/%" PRIuMAX "}",
- (uintmax_t)fe->off, (uintmax_t)fe->size);
+ (uintmax_t)ext->off, (uintmax_t)ext->size);
- WT_VERBOSE(session, block, "freelist by size:");
- WT_FREE_FOREACH(szp, block->fsize) {
+ WT_VERBOSE(session, block, "%s: list by size:", el->name);
+ WT_EXT_FOREACH(szp, el->size) {
WT_VERBOSE(session, block,
"\t{%" PRIuMAX "}",
(uintmax_t)szp->size);
- WT_FREE_FOREACH_OFF(fe, szp->foff)
+ WT_EXT_FOREACH_OFF(ext, szp->off)
WT_VERBOSE(session, block,
"\t\t{%" PRIuMAX "/%" PRIuMAX "}",
- (uintmax_t)fe->off, (uintmax_t)fe->size);
+ (uintmax_t)ext->off, (uintmax_t)ext->size);
}
}
#endif
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 23dd2e475c8..6bac8c72986 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -136,8 +136,14 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename,
WT_ERR(__desc_read(session, block, salvage));
/* If not an open for a salvage operation, read the freelist. */
- if (!salvage)
- WT_ERR(__wt_block_freelist_read(session, block));
+ if (!salvage && block->free_offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_ERR(__wt_block_extlist_read(session, block, &block->free,
+ block->free_offset, block->free_size, block->free_cksum));
+
+ /* Insert the free-list itself into the linked list. */
+ WT_ERR(__wt_block_free(session,
+ block, block->free_offset, (off_t)block->free_size));
+ }
F_SET(block, WT_BLOCK_OK);
*(void **)retp = block;
@@ -163,7 +169,9 @@ __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
* file's description.
*/
if (F_ISSET(block, WT_BLOCK_OK)) {
- WT_TRET(__wt_block_freelist_write(session, block));
+ WT_TRET(__wt_block_extlist_write(session, block,
+ &block->free, &block->free_offset,
+ &block->free_size, &block->free_cksum));
WT_TRET(__desc_update(session, block));
}
diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c
index e6a4cf52410..62186fcf2fc 100644
--- a/src/block/block_vrfy.c
+++ b/src/block/block_vrfy.c
@@ -117,23 +117,23 @@ __wt_block_verify_addr(WT_SESSION_IMPL *session,
static int
__verify_freelist(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
- WT_FREE *fe;
+ WT_EXT *ext;
int ret;
ret = 0;
- WT_FREE_FOREACH(fe, block->foff) {
- if (fe->off + (off_t)fe->size > block->fh->file_size)
+ WT_EXT_FOREACH(ext, block->free.off) {
+ if (ext->off + (off_t)ext->size > block->fh->file_size)
WT_RET_MSG(session, WT_ERROR,
"free-list entry offset %" PRIuMAX "references "
"non-existent file pages",
- (uintmax_t)fe->off);
+ (uintmax_t)ext->off);
WT_VERBOSE(session, verify,
"free-list range %" PRIdMAX "-%" PRIdMAX,
- (intmax_t)fe->off, (intmax_t)(fe->off + fe->size));
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size));
- WT_TRET(__verify_addfrag(session, block, fe->off, fe->size));
+ WT_TRET(__verify_addfrag(session, block, ext->off, ext->size));
}
return (ret);
diff --git a/src/include/block.h b/src/include/block.h
index d217a5ff496..6b4b8c88830 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -19,36 +19,53 @@
/*
* The block allocator maintains two primary skiplists: first, the by-offset
- * list linking WT_FREE elements and sorted by file offset (low-to-high):
+ * list linking WT_EXT elements and sorted by file offset (low-to-high):
* this list has an entry for every free chunk in the file. The second primary
* skiplist is the by-size list linking WT_SIZE elements and sorted by chunk
* size (low-to-high). This list has an entry for every free chunk size seen
* since the list was created.
* Additionally, each WT_SIZE element has a skiplist of its own, linking
- * WT_FREE elements and sorted by file offset (low-to-high). This list has an
+ * WT_EXT elements and sorted by file offset (low-to-high). This list has an
* entry for every free chunk in the file of a particular size.
- * The trickiness is that each individual WT_FREE element appears on two
+ * The trickiness is that each individual WT_EXT element appears on two
* skiplists. In order to minimize allocation calls, we allocate a single
- * array of WT_FREE pointers at the end of the WT_FREE structure, for both
- * skiplists, and store the depth of the skiplist in the WT_FREE structure.
- * The skiplist entries for the offset skiplist start at WT_FREE.next[0] and
- * the entries for the size skiplist start at WT_FREE.next[WT_FREE.depth].
- *
- * WT_FREE --
- * Encapsulation of a free chunk of space.
+ * array of WT_EXT pointers at the end of the WT_EXT structure, for both
+ * skiplists, and store the depth of the skiplist in the WT_EXT structure.
+ * The skiplist entries for the offset skiplist start at WT_EXT.next[0] and
+ * the entries for the size skiplist start at WT_EXT.next[WT_EXT.depth].
*/
-struct __wt_free {
- off_t off; /* File offset */
- off_t size; /* Size */
+
+/*
+ * WT_EXTLIST --
+ * An extent list.
+ */
+struct __wt_extlist {
+ const char *name; /* Name */
+
+ uint64_t bytes; /* Byte count */
+ uint32_t entries; /* Entry count */
+
+ WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Size/offset skiplists */
+ WT_SIZE *size[WT_SKIP_MAXDEPTH];
+};
+
+/*
+ * WT_EXT --
+ * Encapsulation of an extent, either allocated or freed within the
+ * snapshot.
+ */
+struct __wt_ext {
+ off_t off; /* Extent's file offset */
+ off_t size; /* Extent's Size */
uint8_t depth; /* Skip list depth */
/*
* Variable-length array, sized by the number of skiplist elements.
- * The first depth array entries are the offset skiplist elements,
+ * The first depth array entries are the address skiplist elements,
* the second depth array entries are the size skiplist.
*/
- WT_FREE *next[0]; /* Offset, size skiplists */
+ WT_EXT *next[0]; /* Offset, size skiplists */
};
/*
@@ -60,23 +77,23 @@ struct __wt_size {
uint8_t depth; /* Skip list depth */
- WT_FREE *foff[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */
+ WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */
/* Variable-length array, sized by the number of skiplist elements. */
WT_SIZE *next[0]; /* Size skiplist */
};
/*
- * WT_FREE_FOREACH --
+ * WT_EXT_FOREACH --
* Walk a block manager skiplist.
- * WT_FREE_FOREACH_OFF --
- * Walk a block manager skiplist where the WT_FREE.next entries are offset
+ * WT_EXT_FOREACH_OFF --
+ * Walk a block manager skiplist where the WT_EXT.next entries are offset
* by the depth.
*/
-#define WT_FREE_FOREACH(skip, head) \
+#define WT_EXT_FOREACH(skip, head) \
for ((skip) = (head)[0]; \
(skip) != NULL; (skip) = (skip)->next[0])
-#define WT_FREE_FOREACH_OFF(skip, head) \
+#define WT_EXT_FOREACH_OFF(skip, head) \
for ((skip) = (head)[0]; \
(skip) != NULL; (skip) = (skip)->next[(skip)->depth])
@@ -99,13 +116,7 @@ struct __wt_block {
/* Freelist support */
WT_SPINLOCK freelist_lock; /* Lock to protect the freelist. */
- uint64_t freelist_bytes; /* Freelist byte count */
- uint32_t freelist_entries; /* Freelist entry count */
- int freelist_dirty; /* Freelist has been modified */
-
- /* Freelist offset/size skiplists */
- WT_FREE *foff[WT_SKIP_MAXDEPTH];
- WT_SIZE *fsize[WT_SKIP_MAXDEPTH];
+ WT_EXTLIST free; /* Freelist offset/size skiplists */
off_t free_offset; /* Freelist file location */
uint32_t free_size;
@@ -137,7 +148,7 @@ struct __wt_block_desc {
uint32_t cksum; /* 08-11: Description block checksum */
uint32_t unused; /* 12-15: Padding */
-#define WT_BLOCK_FREELIST_MAGIC 071002
+#define WT_BLOCK_EXTLIST_MAGIC 071002
uint64_t free_offset; /* 16-23: Free list page offset */
uint32_t free_size; /* 24-27: Free list page length */
uint32_t free_cksum; /* 28-31: Free list page checksum */
diff --git a/src/include/extern.h b/src/include/extern.h
index 13b39c43d85..e262dbb5a66 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -19,9 +19,10 @@ extern int __wt_block_addr_string(WT_SESSION_IMPL *session,
WT_ITEM *buf,
const uint8_t *addr,
uint32_t addr_size);
+extern uint32_t __wt_cksum(const void *chunk, size_t len);
extern int __wt_block_alloc( WT_SESSION_IMPL *session,
WT_BLOCK *block,
- off_t *offsetp,
+ off_t *offp,
off_t size);
extern int __wt_block_free_buf(WT_SESSION_IMPL *session,
WT_BLOCK *block,
@@ -31,15 +32,23 @@ extern int __wt_block_free( WT_SESSION_IMPL *session,
WT_BLOCK *block,
off_t off,
off_t size);
+extern int __wt_block_extlist_read(WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ WT_EXTLIST *el,
+ off_t off,
+ uint32_t size,
+ uint32_t cksum);
extern void __wt_block_freelist_open(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern int __wt_block_freelist_read(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern void __wt_block_freelist_close(WT_SESSION_IMPL *session,
WT_BLOCK *block);
-extern int __wt_block_freelist_write(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_extlist_write(WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ WT_EXTLIST *el,
+ off_t *offp,
+ uint32_t *sizep,
+ uint32_t *cksump);
extern void __wt_block_discard(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern void __wt_block_dump(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern uint32_t __wt_cksum(const void *chunk, size_t len);
extern int __wt_bm_addr_valid( WT_SESSION_IMPL *session,
const uint8_t *addr,
uint32_t addr_size);
diff --git a/src/include/wt_internal.in b/src/include/wt_internal.in
index d2b6aa7f96f..04d9080fb5e 100644
--- a/src/include/wt_internal.in
+++ b/src/include/wt_internal.in
@@ -97,10 +97,12 @@ struct __wt_evict_list;
typedef struct __wt_evict_list WT_EVICT_LIST;
struct __wt_evict_req;
typedef struct __wt_evict_req WT_EVICT_REQ;
+struct __wt_ext;
+ typedef struct __wt_ext WT_EXT;
+struct __wt_extlist;
+ typedef struct __wt_extlist WT_EXTLIST;
struct __wt_fh;
typedef struct __wt_fh WT_FH;
-struct __wt_free;
- typedef struct __wt_free WT_FREE;
struct __wt_hazard;
typedef struct __wt_hazard WT_HAZARD;
struct __wt_ikey;