diff options
author | Don Anderson <dda@ddanderson.com> | 2012-03-16 14:24:38 -0400 |
---|---|---|
committer | Don Anderson <dda@ddanderson.com> | 2012-03-16 14:24:38 -0400 |
commit | 6f048f9116aec392898816e6c50669f64c1cc93e (patch) | |
tree | 1f328bdee2b7aef09f1c6be5a69040731b6beae3 | |
parent | fe21c095f8aa6f82347937739e42fb2b0b4eca3d (diff) | |
parent | bd3a2cc352e427658140604ff96ef89945456351 (diff) | |
download | mongo-6f048f9116aec392898816e6c50669f64c1cc93e.tar.gz |
Merge branch 'master' of https://github.com/wiredtiger/wiredtiger
-rw-r--r-- | dist/filelist | 2 | ||||
-rw-r--r-- | dist/s_string.ok | 2 | ||||
-rw-r--r-- | src/block/block_ext.c (renamed from src/block/block_alloc.c) | 530 | ||||
-rw-r--r-- | src/block/block_open.c | 14 | ||||
-rw-r--r-- | src/block/block_vrfy.c | 12 | ||||
-rw-r--r-- | src/include/block.h | 69 | ||||
-rw-r--r-- | src/include/extern.h | 19 | ||||
-rw-r--r-- | src/include/wt_internal.in | 6 |
8 files changed, 348 insertions, 306 deletions
diff --git a/dist/filelist b/dist/filelist index 802d7a03723..264af59c511 100644 --- a/dist/filelist +++ b/dist/filelist @@ -5,8 +5,8 @@ src/api/api_event.c src/api/api_strerror.c src/api/api_version.c src/block/block_addr.c -src/block/block_alloc.c src/block/block_cksum.c +src/block/block_ext.c src/block/block_mgr.c src/block/block_open.c src/block/block_read.c diff --git a/dist/s_string.ok b/dist/s_string.ok index df49454f9ad..d02605f04db 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -45,6 +45,7 @@ ENOMEM ENOTSUP ETIME ETIMEDOUT +EXTLIST Enqueue Env Eron @@ -249,6 +250,7 @@ eventv evictserver exactp extern +extlist fcntl ffc ffs diff --git a/src/block/block_alloc.c b/src/block/block_ext.c index 28b604bc7ef..a03101c74a5 100644 --- a/src/block/block_alloc.c +++ b/src/block/block_ext.c @@ -7,17 +7,23 @@ #include "wt_internal.h" -static int __block_extend(WT_SESSION_IMPL *, WT_BLOCK *, off_t *, off_t); -static int __block_truncate(WT_SESSION_IMPL *, WT_BLOCK *); +static int __block_extend(WT_SESSION_IMPL *, WT_BLOCK *, off_t *, off_t); +static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, off_t, off_t); +static int __block_truncate(WT_SESSION_IMPL *, WT_BLOCK *); + +#ifdef HAVE_VERBOSE +static void __block_extlist_dump(WT_SESSION_IMPL *, WT_EXTLIST *); +#endif /* * __block_off_srch -- - * Search a by-offset skiplist. + * Search a by-offset skiplist (either the primary by-offset list, or the + * by-offset list referenced by a size entry), for the specified offset. */ static void -__block_off_srch(WT_FREE **head, off_t off, WT_FREE ***stack, int skip_off) +__block_off_srch(WT_EXT **head, off_t off, WT_EXT ***stack, int skip_off) { - WT_FREE **fep; + WT_EXT **extp; int i; /* @@ -26,20 +32,21 @@ __block_off_srch(WT_FREE **head, off_t off, WT_FREE ***stack, int skip_off) * * Return a stack for an exact match or the next-largest item. * - * The WT_FREE structure contains two skiplists, the primary one and the + * The WT_EXT structure contains two skiplists, the primary one and the * per-size bucket one: if the skip_off flag is set, offset the skiplist * array by the depth specified in this particular structure. */ - for (i = WT_SKIP_MAXDEPTH - 1, fep = &head[i]; i >= 0;) - if (*fep != NULL && (*fep)->off < off) - fep = &(*fep)->next[i + (skip_off ? (*fep)->depth : 0)]; + for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) + if (*extp != NULL && (*extp)->off < off) + extp = + &(*extp)->next[i + (skip_off ? (*extp)->depth : 0)]; else - stack[i--] = fep--; + stack[i--] = extp--; } /* * __block_size_srch -- - * Search the by-size skiplist. + * Search the by-size skiplist for the specified size. */ static void __block_size_srch(WT_SIZE **head, off_t size, WT_SIZE ***stack) @@ -62,165 +69,165 @@ __block_size_srch(WT_SIZE **head, off_t size, WT_SIZE ***stack) /* * __block_off_pair_srch -- - * Search the primary by-offset skiplist for before/after records of the - * specified offset. + * Search a by-offset skiplist for before/after records of the specified + * offset. */ static void __block_off_pair_srch( - WT_FREE **head, off_t off, WT_FREE **beforep, WT_FREE **afterp) + WT_EXTLIST *el, off_t off, WT_EXT **beforep, WT_EXT **afterp) { - WT_FREE **fep; + WT_EXT **head, **extp; int i; *beforep = *afterp = NULL; + head = el->off; + /* * Start at the highest skip level, then go as far as possible at each * level before stepping down to the next. */ - for (i = WT_SKIP_MAXDEPTH - 1, fep = &head[i]; i >= 0;) { - if (*fep == NULL) { + for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) { + if (*extp == NULL) { --i; - --fep; + --extp; continue; } - if ((*fep)->off < off) { /* Keep going at this level */ - *beforep = *fep; - fep = &(*fep)->next[i]; + if ((*extp)->off < off) { /* Keep going at this level */ + *beforep = *extp; + extp = &(*extp)->next[i]; } else { /* Drop down a level */ - *afterp = *fep; + *afterp = *extp; --i; - --fep; + --extp; } } } /* - * __block_off_last -- - * Return the last free range in the offset skiplist. + * __block_extlist_last -- + * Return the last extent range in the skiplist. */ -static WT_FREE * -__block_off_last(WT_FREE **head) +static WT_EXT * +__block_extlist_last(WT_EXT **head) { - WT_FREE *fe, **fep; + WT_EXT *ext, **extp; int i; - fe = NULL; + ext = NULL; - for (i = WT_SKIP_MAXDEPTH - 1, fep = &head[i]; i >= 0;) { - if (*fep == NULL) { + for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) { + if (*extp == NULL) { --i; - --fep; + --extp; continue; } - fe = *fep; - fep = &(*fep)->next[i]; + ext = *extp; + extp = &(*extp)->next[i]; } - return (fe); + return (ext); } /* * __block_off_insert -- - * Insert a record into the system. + * Insert a record into an extent list. */ static int -__block_off_insert(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_FREE *fe) +__block_off_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext) { - WT_FREE **astack[WT_SKIP_MAXDEPTH]; + WT_EXT **astack[WT_SKIP_MAXDEPTH]; WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH]; u_int i; /* * If we are inserting a new size onto the size skiplist, we'll need - * a new WT_FREE structure for that skiplist. + * a new WT_EXT structure for that skiplist. */ - __block_size_srch(block->fsize, fe->size, sstack); + __block_size_srch(el->size, ext->size, sstack); szp = *sstack[0]; - if (szp == NULL || szp->size != fe->size) { + if (szp == NULL || szp->size != ext->size) { WT_RET(__wt_calloc(session, 1, - sizeof(WT_SIZE) + fe->depth * sizeof(WT_SIZE *), &szp)); - szp->size = fe->size; - szp->depth = fe->depth; - for (i = 0; i < fe->depth; ++i) { + sizeof(WT_SIZE) + ext->depth * sizeof(WT_SIZE *), &szp)); + szp->size = ext->size; + szp->depth = ext->depth; + for (i = 0; i < ext->depth; ++i) { szp->next[i] = *sstack[i]; *sstack[i] = szp; } } - /* Insert the new WT_FREE structure into the offset skiplist. */ - __block_off_srch(block->foff, fe->off, astack, 0); - for (i = 0; i < fe->depth; ++i) { - fe->next[i] = *astack[i]; - *astack[i] = fe; + /* Insert the new WT_EXT structure into the offset skiplist. */ + __block_off_srch(el->off, ext->off, astack, 0); + for (i = 0; i < ext->depth; ++i) { + ext->next[i] = *astack[i]; + *astack[i] = ext; } /* - * Insert the new WT_FREE structure into the size element's offset + * Insert the new WT_EXT structure into the size element's offset * skiplist. */ - __block_off_srch(szp->foff, fe->off, astack, 1); - for (i = 0; i < fe->depth; ++i) { - fe->next[i + fe->depth] = *astack[i]; - *astack[i] = fe; + __block_off_srch(szp->off, ext->off, astack, 1); + for (i = 0; i < ext->depth; ++i) { + ext->next[i + ext->depth] = *astack[i]; + *astack[i] = ext; } - ++block->freelist_entries; - block->freelist_bytes += (uint64_t)fe->size; - block->freelist_dirty = 1; + ++el->entries; + el->bytes += (uint64_t)ext->size; return (0); } /* * __block_off_remove -- - * Remove a record from the system. + * Remove a record from an extent list. */ static int __block_off_remove( - WT_SESSION_IMPL *session, WT_BLOCK *block, off_t off, WT_FREE **fep) + WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, WT_EXT **extp) { - WT_FREE *fe, **astack[WT_SKIP_MAXDEPTH]; + WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH]; WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH]; u_int i; /* Find and remove the record from the by-offset skiplist. */ - __block_off_srch(block->foff, off, astack, 0); - fe = *astack[0]; - if (fe == NULL || fe->off != off) + __block_off_srch(el->off, off, astack, 0); + ext = *astack[0]; + if (ext == NULL || ext->off != off) return (EINVAL); - for (i = 0; i < fe->depth; ++i) - *astack[i] = fe->next[i]; + for (i = 0; i < ext->depth; ++i) + *astack[i] = ext->next[i]; /* * Find and remove the record from the size's offset skiplist; if that * empties the by-size skiplist entry, remove it as well. */ - __block_size_srch(block->fsize, fe->size, sstack); + __block_size_srch(el->size, ext->size, sstack); szp = *sstack[0]; - if (szp == NULL || szp->size != fe->size) + if (szp == NULL || szp->size != ext->size) return (EINVAL); - __block_off_srch(szp->foff, off, astack, 1); - fe = *astack[0]; - if (fe == NULL || fe->off != off) + __block_off_srch(szp->off, off, astack, 1); + ext = *astack[0]; + if (ext == NULL || ext->off != off) return (EINVAL); - for (i = 0; i < fe->depth; ++i) - *astack[i] = fe->next[i + fe->depth]; - if (szp->foff[0] == NULL) { + for (i = 0; i < ext->depth; ++i) + *astack[i] = ext->next[i + ext->depth]; + if (szp->off[0] == NULL) { for (i = 0; i < szp->depth; ++i) *sstack[i] = szp->next[i]; __wt_free(session, szp); } - --block->freelist_entries; - block->freelist_bytes -= (uint64_t)fe->size; - block->freelist_dirty = 1; + --el->entries; + el->bytes -= (uint64_t)ext->size; /* Return the record if our caller wants it, otherwise free it. */ - if (fep == NULL) - __wt_free(session, fe); + if (extp == NULL) + __wt_free(session, ext); else - *fep = fe; + *extp = ext; return (0); } @@ -231,9 +238,9 @@ __block_off_remove( */ int __wt_block_alloc( - WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offsetp, off_t size) + WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offp, off_t size) { - WT_FREE *fe; + WT_EXT *ext; WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH]; int ret; @@ -253,37 +260,37 @@ __wt_block_alloc( * requested size and take the first entry on the by-size offset list. * If we don't have anything large enough, extend the file. */ - __block_size_srch(block->fsize, size, sstack); + __block_size_srch(block->free.size, size, sstack); szp = *sstack[0]; if (szp == NULL) { - WT_ERR(__block_extend(session, block, offsetp, size)); + WT_ERR(__block_extend(session, block, offp, size)); goto done; } /* Remove the first record, and set the returned offset. */ - fe = szp->foff[0]; - WT_ERR(__block_off_remove(session, block, fe->off, &fe)); - *offsetp = fe->off; + ext = szp->off[0]; + WT_ERR(__block_off_remove(session, &block->free, ext->off, &ext)); + *offp = ext->off; /* If doing a partial allocation, adjust the record and put it back. */ - if (fe->size > size) { + if (ext->size > size) { WT_VERBOSE(session, block, "allocate %" PRIdMAX " from range %" PRIdMAX "-%" PRIdMAX ", range shrinks to %" PRIdMAX "-%" PRIdMAX, (intmax_t)size, - (intmax_t)fe->off, (intmax_t)(fe->off + fe->size), - (intmax_t)(fe->off + size), - (intmax_t)(fe->off + size + fe->size - size)); + (intmax_t)ext->off, (intmax_t)(ext->off + ext->size), + (intmax_t)(ext->off + size), + (intmax_t)(ext->off + size + ext->size - size)); - fe->off += size; - fe->size -= size; - WT_ERR(__block_off_insert(session, block,fe)); + ext->off += size; + ext->size -= size; + WT_ERR(__block_off_insert(session, &block->free, ext)); } else { WT_VERBOSE(session, block, "allocate range %" PRIdMAX "-%" PRIdMAX, - (intmax_t)fe->off, (intmax_t)(fe->off + fe->size)); + (intmax_t)ext->off, (intmax_t)(ext->off + ext->size)); - __wt_free(session, fe); + __wt_free(session, ext); } done: err: @@ -297,7 +304,7 @@ done: err: */ static int __block_extend( - WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offsetp, off_t size) + WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offp, off_t size) { WT_FH *fh; @@ -328,13 +335,13 @@ __block_extend( WT_RET_MSG(session, WT_ERROR, "block allocation failed, file cannot grow further"); - *offsetp = fh->file_size; + *offp = fh->file_size; fh->file_size += size; WT_BSTAT_INCR(session, extend); WT_VERBOSE(session, block, "file extend %" PRIdMAX "B @ %" PRIdMAX, - (intmax_t)size, (intmax_t)*offsetp); + (intmax_t)size, (intmax_t)*offp); return (0); } @@ -347,15 +354,14 @@ int __wt_block_free_buf(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, uint32_t addr_size) { - off_t offset; + off_t off; uint32_t size; WT_UNUSED(addr_size); /* Crack the cookie. */ - WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, NULL)); - - WT_RET(__wt_block_free(session, block, offset, (off_t)size)); + WT_RET(__wt_block_buffer_to_addr(block, addr, &off, &size, NULL)); + WT_RET(__wt_block_free(session, block, off, size)); return (0); } @@ -368,56 +374,68 @@ int __wt_block_free( WT_SESSION_IMPL *session, WT_BLOCK *block, off_t off, off_t size) { - WT_FREE *fe, *after, *before; - u_int skipdepth; int ret; - ret = 0; - WT_BSTAT_INCR(session, free); WT_VERBOSE(session, block, "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)off, (intmax_t)size); __wt_spin_lock(session, &block->freelist_lock); + ret = __block_merge(session, &block->free, off, (off_t)size); + __wt_spin_unlock(session, &block->freelist_lock); + + return (ret); +} + +/* + * __block_merge -- + * Insert an extent into an extent list, merging if possible. + */ +static int +__block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size) +{ + WT_EXT *ext, *after, *before; + u_int skipdepth; /* * Retrieve the records preceding/following the offset. If the records * are contiguous with the free'd offset, combine records. */ - __block_off_pair_srch(block->foff, off, &before, &after); + __block_off_pair_srch(el, off, &before, &after); if (before != NULL) { if (before->off + before->size > off) - WT_ERR_MSG(session, EINVAL, - "existing range %" PRIdMAX "-%" PRIdMAX " overlaps " - "with free'd range %" PRIdMAX "-%" PRIdMAX, + WT_RET_MSG(session, EINVAL, + "%s: existing range %" PRIdMAX "-%" PRIdMAX + " overlaps with merge range %" PRIdMAX "-%" PRIdMAX, + el->name, (intmax_t)before->off, (intmax_t)(before->off + before->size), - (intmax_t)off, - (intmax_t)(off + size)); + (intmax_t)off, (intmax_t)(off + size)); if (before->off + before->size != off) before = NULL; } if (after != NULL) { if (off + size > after->off) - WT_ERR_MSG(session, EINVAL, - "free'd range %" PRIdMAX "-%" PRIdMAX " overlaps " - "with existing range %" PRIdMAX "-%" PRIdMAX, - (intmax_t)off, - (intmax_t)(off + size), + WT_RET_MSG(session, EINVAL, + "%s: merge range %" PRIdMAX "-%" PRIdMAX + " overlaps with existing range %" PRIdMAX + "-%" PRIdMAX, + el->name, + (intmax_t)off, (intmax_t)(off + size), (intmax_t)after->off, (intmax_t)(after->off + after->size)); if (off + size != after->off) after = NULL; } if (before == NULL && after == NULL) { - /* Allocate a new WT_FREE structure. */ + /* Allocate a new WT_EXT structure. */ skipdepth = __wt_skip_choose_depth(); - WT_ERR(__wt_calloc(session, 1, - sizeof(WT_FREE) + skipdepth * 2 * sizeof(WT_FREE *), &fe)); - fe->off = off; - fe->size = size; - fe->depth = (uint8_t)skipdepth; - goto done; + WT_RET(__wt_calloc(session, 1, + sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), &ext)); + ext->off = off; + ext->size = size; + ext->depth = (uint8_t)skipdepth; + return (__block_off_insert(session, el, ext)); } /* @@ -428,126 +446,120 @@ __wt_block_free( * the record we're going to use, adjust it and re-insert it. */ if (before == NULL) { - WT_ERR(__block_off_remove(session, block, after->off, &fe)); + WT_RET(__block_off_remove(session, el, after->off, &ext)); WT_VERBOSE(session, block, - "free range grows from %" PRIdMAX "-%" PRIdMAX ", to %" + "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %" PRIdMAX "-%" PRIdMAX, - (intmax_t)fe->off, (intmax_t)(fe->off + fe->size), - (intmax_t)off, (intmax_t)(off + fe->size + size)); + el->name, + (intmax_t)ext->off, (intmax_t)(ext->off + ext->size), + (intmax_t)off, (intmax_t)(off + ext->size + size)); - fe->off = off; - fe->size += size; + ext->off = off; + ext->size += size; } else { if (after != NULL) { size += after->size; - WT_ERR(__block_off_remove( - session, block, after->off, NULL)); + WT_RET( + __block_off_remove(session, el, after->off, NULL)); } - WT_ERR(__block_off_remove(session, block, before->off, &fe)); + WT_RET(__block_off_remove(session, el, before->off, &ext)); WT_VERBOSE(session, block, - "free range grows from %" PRIdMAX "-%" PRIdMAX ", to %" + "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %" PRIdMAX "-%" PRIdMAX, - (intmax_t)fe->off, (intmax_t)(fe->off + fe->size), - (intmax_t)fe->off, (intmax_t)(fe->off + fe->size + size)); + el->name, + (intmax_t)ext->off, (intmax_t)(ext->off + ext->size), + (intmax_t)ext->off, + (intmax_t)(ext->off + ext->size + size)); - fe->size += size; + ext->size += size; } - -done: WT_ERR(__block_off_insert(session, block,fe)); - -err: __wt_spin_unlock(session, &block->freelist_lock); - return (ret); + return (__block_off_insert(session, el, ext)); } /* - * __wt_block_freelist_open -- - * Initialize the free-list structures. - */ -void -__wt_block_freelist_open(WT_SESSION_IMPL *session, WT_BLOCK *block) -{ - __wt_spin_init(session, &block->freelist_lock); - - block->freelist_bytes = 0; - block->freelist_entries = 0; - block->freelist_dirty = 0; - - block->free_offset = WT_BLOCK_INVALID_OFFSET; - block->free_size = block->free_cksum = 0; -} - -/* - * __wt_block_freelist_read -- - * Read the free-list. + * __wt_block_extlist_read -- + * Read an extent list. */ int -__wt_block_freelist_read(WT_SESSION_IMPL *session, WT_BLOCK *block) +__wt_block_extlist_read(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_EXTLIST *el, off_t off, uint32_t size, uint32_t cksum) { WT_ITEM *tmp; - off_t offset, size; + off_t loff, lsize; uint8_t *p; int ret; tmp = NULL; ret = 0; - /* See if there's a free-list to read. */ - if (block->free_offset == WT_BLOCK_INVALID_OFFSET) - return (0); - - WT_RET(__wt_scr_alloc(session, block->free_size, &tmp)); - WT_ERR(__wt_block_read(session, block, - tmp, block->free_offset, block->free_size, block->free_cksum)); - - /* - * The free-list is read before the file is verified, which means we - * need to be a little paranoid. We know the free-list chunk itself - * is entirely in the file because we checked when we first read the - * file's description structure. Confirm the offset/size pairs are - * valid, then insert them into the linked list. - */ - p = WT_BLOCK_HEADER_BYTE(tmp->mem); + WT_RET(__wt_scr_alloc(session, size, &tmp)); + WT_ERR(__wt_block_read(session, block, tmp, off, size, cksum)); -#define WT_FREELIST_READ(p, v) do { \ +#define WT_EXTLIST_READ(p, v) do { \ (v) = *(off_t *)(p); \ (p) += sizeof(off_t); \ } while (0) - WT_FREELIST_READ(p, offset); - WT_FREELIST_READ(p, size); - if (offset != WT_BLOCK_FREELIST_MAGIC || size != 0) + p = WT_BLOCK_HEADER_BYTE(tmp->mem); + WT_EXTLIST_READ(p, loff); + WT_EXTLIST_READ(p, lsize); + if (loff != WT_BLOCK_EXTLIST_MAGIC || lsize != 0) goto corrupted; for (;;) { - WT_FREELIST_READ(p, offset); - WT_FREELIST_READ(p, size); - if (offset == WT_BLOCK_INVALID_OFFSET) + WT_EXTLIST_READ(p, loff); + WT_EXTLIST_READ(p, lsize); + if (loff == WT_BLOCK_INVALID_OFFSET) break; - if ((offset - WT_BLOCK_DESC_SECTOR) % block->allocsize != 0 || - size % block->allocsize != 0 || - offset + size > block->fh->file_size) + /* + * We check the offset/size pairs represent valid file ranges, + * then insert them into the list. We don't necessarily have + * to check for offsets past the end-of-file, but it's a cheap + * and easy test to do here and we'd have to do the check as + * part of file verification, regardless. + */ + if ((loff - WT_BLOCK_DESC_SECTOR) % block->allocsize != 0 || + lsize % block->allocsize != 0 || + loff + lsize > block->fh->file_size) corrupted: WT_ERR_MSG(session, WT_ERROR, - "file contains a corrupted free-list, range %" + "file contains a corrupted %s extent list, range %" PRIdMAX "-%" PRIdMAX " past end-of-file", - (intmax_t)offset, (intmax_t)(offset + size)); - - WT_ERR(__wt_block_free(session, block, offset, size)); + el->name, + (intmax_t)loff, (intmax_t)(loff + lsize)); + + /* + * We could insert instead of merge, because ranges shouldn't + * overlap, but merge knows how to allocate WT_EXT structures, + * and a little paranoia is a good thing. + */ + WT_ERR(__block_merge(session, el, loff, lsize)); } - /* - * Insert the free-list itself into the linked list, but don't clear - * the values, if the free-list is never modified, we don't write it. - */ - WT_ERR(__wt_block_free( - session, block, block->free_offset, (off_t)block->free_size)); + WT_VERBOSE_CALL(session, block, __block_extlist_dump(session, el)); err: __wt_scr_free(&tmp); return (ret); } /* + * __wt_block_freelist_open -- + * Initialize the free-list structures. + */ +void +__wt_block_freelist_open(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + __wt_spin_init(session, &block->freelist_lock); + + memset(&block->free, 0, sizeof(block->free)); + block->free.name = "freelist"; + + block->free_offset = WT_BLOCK_INVALID_OFFSET; + block->free_size = block->free_cksum = 0; +} + +/* * __wt_block_freelist_close -- * Write the free-list at the tail of the file. */ @@ -558,38 +570,36 @@ __wt_block_freelist_close(WT_SESSION_IMPL *session, WT_BLOCK *block) } /* - * __wt_block_freelist_write -- - * Write the free-list at the tail of the file. + * __wt_block_extlist_write -- + * Write an extent list at the tail of the file. */ int -__wt_block_freelist_write(WT_SESSION_IMPL *session, WT_BLOCK *block) +__wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, off_t *offp, uint32_t *sizep, uint32_t *cksump) { + WT_EXT *ext; WT_ITEM *tmp; - WT_FREE *fe; WT_PAGE_HEADER *dsk; uint32_t datasize, size; uint8_t *p; int ret; + const char *name; tmp = NULL; ret = 0; - /* If the free-list hasn't changed, there's nothing to write. */ - if (block->freelist_dirty == 0) - return (0); + WT_VERBOSE_CALL(session, block, __block_extlist_dump(session, el)); /* If there aren't any free-list entries, we're done. */ - if (block->freelist_entries == 0) { - block->free_offset = WT_BLOCK_INVALID_OFFSET; - block->free_size = block->free_cksum = 0; + if (el->entries == 0) { + *offp = WT_BLOCK_INVALID_OFFSET; + *sizep = *cksump = 0; return (0); } /* Truncate the file if possible. */ WT_RET(__block_truncate(session, block)); - WT_VERBOSE_CALL(session, block, __wt_block_dump(session, block)); - /* * Get a scratch buffer, clear the page's header and data, initialize * the header. Allocate an allocation-sized aligned buffer so the @@ -597,11 +607,10 @@ __wt_block_freelist_write(WT_SESSION_IMPL *session, WT_BLOCK *block) * copying to something larger. * * Allocate room for the free-list entries, plus 2 additional entries: - * the initial WT_BLOCK_FREELIST_MAGIC/0 pair and the list-terminating + * the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-terminating * WT_BLOCK_INVALID_OFFSET/0 pair. */ - datasize = size = - (block->freelist_entries + 2) * WT_STORE_SIZE(sizeof(off_t) * 2); + datasize = size = (el->entries + 2) * WT_STORE_SIZE(sizeof(off_t) * 2); WT_RET(__wt_block_write_size(session, block, &size)); WT_RET(__wt_scr_alloc(session, size, &tmp)); dsk = tmp->mem; @@ -610,24 +619,24 @@ __wt_block_freelist_write(WT_SESSION_IMPL *session, WT_BLOCK *block) dsk->type = WT_PAGE_FREELIST; tmp->size = WT_STORE_SIZE(WT_BLOCK_HEADER_BYTE_SIZE + datasize); - /* Fill the page's data. */ - p = WT_BLOCK_HEADER_BYTE(dsk); - -#define WT_FREELIST_WRITE(p, v) do { \ +#define WT_EXTLIST_WRITE(p, v) do { \ *(off_t *)(p) = (v); \ (p) += sizeof(off_t); \ } while (0) - WT_FREELIST_WRITE(p, WT_BLOCK_FREELIST_MAGIC); /* Initial value */ - WT_FREELIST_WRITE(p, 0); - WT_FREE_FOREACH(fe, block->foff) { /* Free ranges */ - WT_FREELIST_WRITE(p, fe->off); - WT_FREELIST_WRITE(p, fe->size); + /* Fill the page's data. */ + p = WT_BLOCK_HEADER_BYTE(dsk); + WT_EXTLIST_WRITE(p, WT_BLOCK_EXTLIST_MAGIC); /* Initial value */ + WT_EXTLIST_WRITE(p, 0); + WT_EXT_FOREACH(ext, el->off) { /* Free ranges */ + WT_EXTLIST_WRITE(p, ext->off); + WT_EXTLIST_WRITE(p, ext->size); } - WT_FREELIST_WRITE(p, WT_BLOCK_INVALID_OFFSET); /* Ending value */ - WT_FREELIST_WRITE(p, 0); + WT_EXTLIST_WRITE(p, WT_BLOCK_INVALID_OFFSET); /* Ending value */ + WT_EXTLIST_WRITE(p, 0); /* + * XXX * Discard the in-memory free-list: this has to happen before writing * the free-list because the underlying block write function is going * to allocate file space for the free-list block(s), and allocating @@ -637,15 +646,15 @@ __wt_block_freelist_write(WT_SESSION_IMPL *session, WT_BLOCK *block) * complex than ordering these operations so the eventual allocation * in the write code always extends the file. */ + name = el->name; + el = NULL; __wt_block_discard(session, block); - /* Write the free list to disk and update the file's meta-data. */ - WT_ERR(__wt_block_write(session, block, tmp, - &block->free_offset, &block->free_size, &block->free_cksum)); + /* Write the extent list to disk. */ + WT_ERR(__wt_block_write(session, block, tmp, offp, sizep, cksump)); WT_VERBOSE(session, block, - "free-list written %" PRIdMAX "/%" PRIu32, - (intmax_t)block->free_offset, block->free_size); + "%s written %" PRIdMAX "/%" PRIu32, name, (intmax_t)*offp, *sizep); err: __wt_scr_free(&tmp); return (ret); @@ -658,8 +667,8 @@ err: __wt_scr_free(&tmp); static int __block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block) { + WT_EXT *ext; WT_FH *fh; - WT_FREE *fe; fh = block->fh; @@ -667,47 +676,43 @@ __block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block) * Check if the last free range is at the end of the file, and if so, * truncate the file and discard the range. */ - if ((fe = __block_off_last(block->foff)) == NULL) + if ((ext = __block_extlist_last(block->free.off)) == NULL) return (0); - if (fe->off + fe->size != fh->file_size) + if (ext->off + ext->size != fh->file_size) return (0); WT_VERBOSE(session, block, "truncate file from %" PRIdMAX " to %" PRIdMAX, - (intmax_t)fh->file_size, (intmax_t)fe->off); + (intmax_t)fh->file_size, (intmax_t)ext->off); - fh->file_size = fe->off; + fh->file_size = ext->off; WT_RET(__wt_ftruncate(session, fh, fh->file_size)); - WT_RET(__block_off_remove(session, block, fe->off, NULL)); + WT_RET(__block_off_remove(session, &block->free, ext->off, NULL)); return (0); } /* * __wt_block_discard -- - * Discard any free-list entries. + * Discard any extent-list entries. */ void __wt_block_discard(WT_SESSION_IMPL *session, WT_BLOCK *block) { - WT_FREE *fe, *nfe; + WT_EXT *ext, *next; WT_SIZE *szp, *nszp; - for (fe = block->foff[0]; fe != NULL; fe = nfe) { - nfe = fe->next[0]; - __wt_free(session, fe); + for (ext = block->free.off[0]; ext != NULL; ext = next) { + next = ext->next[0]; + __wt_free(session, ext); } - memset(block->foff, 0, sizeof(block->foff)); - for (szp = block->fsize[0]; szp != NULL; szp = nszp) { + for (szp = block->free.size[0]; szp != NULL; szp = nszp) { nszp = szp->next[0]; __wt_free(session, szp); } - memset(block->fsize, 0, sizeof(block->fsize)); - block->freelist_bytes = 0; - block->freelist_entries = 0; - block->freelist_dirty = 0; + memset(&block->free, 0, sizeof(block->free)); } /* @@ -717,8 +722,8 @@ __wt_block_discard(WT_SESSION_IMPL *session, WT_BLOCK *block) void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block) { - WT_BSTAT_SET(session, file_freelist_bytes, block->freelist_bytes); - WT_BSTAT_SET(session, file_freelist_entries, block->freelist_entries); + WT_BSTAT_SET(session, file_freelist_bytes, block->free.bytes); + WT_BSTAT_SET(session, file_freelist_entries, block->free.entries); WT_BSTAT_SET(session, file_size, block->fh->file_size); WT_BSTAT_SET(session, file_magic, WT_BLOCK_MAGIC); WT_BSTAT_SET(session, file_major, WT_BLOCK_MAJOR_VERSION); @@ -726,27 +731,32 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block) } #ifdef HAVE_VERBOSE -void -__wt_block_dump(WT_SESSION_IMPL *session, WT_BLOCK *block) +static void +__block_extlist_dump(WT_SESSION_IMPL *session, WT_EXTLIST *el) { - WT_FREE *fe; + WT_EXT *ext; WT_SIZE *szp; - WT_VERBOSE(session, block, "freelist by offset:"); - WT_FREE_FOREACH(fe, block->foff) + if (el->entries == 0) { + WT_VERBOSE(session, block, "%s: [Empty]", el->name); + return; + } + + WT_VERBOSE(session, block, "%s: list by offset:", el->name); + WT_EXT_FOREACH(ext, el->off) WT_VERBOSE(session, block, "\t{%" PRIuMAX "/%" PRIuMAX "}", - (uintmax_t)fe->off, (uintmax_t)fe->size); + (uintmax_t)ext->off, (uintmax_t)ext->size); - WT_VERBOSE(session, block, "freelist by size:"); - WT_FREE_FOREACH(szp, block->fsize) { + WT_VERBOSE(session, block, "%s: list by size:", el->name); + WT_EXT_FOREACH(szp, el->size) { WT_VERBOSE(session, block, "\t{%" PRIuMAX "}", (uintmax_t)szp->size); - WT_FREE_FOREACH_OFF(fe, szp->foff) + WT_EXT_FOREACH_OFF(ext, szp->off) WT_VERBOSE(session, block, "\t\t{%" PRIuMAX "/%" PRIuMAX "}", - (uintmax_t)fe->off, (uintmax_t)fe->size); + (uintmax_t)ext->off, (uintmax_t)ext->size); } } #endif diff --git a/src/block/block_open.c b/src/block/block_open.c index 23dd2e475c8..6bac8c72986 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -136,8 +136,14 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, WT_ERR(__desc_read(session, block, salvage)); /* If not an open for a salvage operation, read the freelist. */ - if (!salvage) - WT_ERR(__wt_block_freelist_read(session, block)); + if (!salvage && block->free_offset != WT_BLOCK_INVALID_OFFSET) { + WT_ERR(__wt_block_extlist_read(session, block, &block->free, + block->free_offset, block->free_size, block->free_cksum)); + + /* Insert the free-list itself into the linked list. */ + WT_ERR(__wt_block_free(session, + block, block->free_offset, (off_t)block->free_size)); + } F_SET(block, WT_BLOCK_OK); *(void **)retp = block; @@ -163,7 +169,9 @@ __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block) * file's description. */ if (F_ISSET(block, WT_BLOCK_OK)) { - WT_TRET(__wt_block_freelist_write(session, block)); + WT_TRET(__wt_block_extlist_write(session, block, + &block->free, &block->free_offset, + &block->free_size, &block->free_cksum)); WT_TRET(__desc_update(session, block)); } diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c index e6a4cf52410..62186fcf2fc 100644 --- a/src/block/block_vrfy.c +++ b/src/block/block_vrfy.c @@ -117,23 +117,23 @@ __wt_block_verify_addr(WT_SESSION_IMPL *session, static int __verify_freelist(WT_SESSION_IMPL *session, WT_BLOCK *block) { - WT_FREE *fe; + WT_EXT *ext; int ret; ret = 0; - WT_FREE_FOREACH(fe, block->foff) { - if (fe->off + (off_t)fe->size > block->fh->file_size) + WT_EXT_FOREACH(ext, block->free.off) { + if (ext->off + (off_t)ext->size > block->fh->file_size) WT_RET_MSG(session, WT_ERROR, "free-list entry offset %" PRIuMAX "references " "non-existent file pages", - (uintmax_t)fe->off); + (uintmax_t)ext->off); WT_VERBOSE(session, verify, "free-list range %" PRIdMAX "-%" PRIdMAX, - (intmax_t)fe->off, (intmax_t)(fe->off + fe->size)); + (intmax_t)ext->off, (intmax_t)(ext->off + ext->size)); - WT_TRET(__verify_addfrag(session, block, fe->off, fe->size)); + WT_TRET(__verify_addfrag(session, block, ext->off, ext->size)); } return (ret); diff --git a/src/include/block.h b/src/include/block.h index d217a5ff496..6b4b8c88830 100644 --- a/src/include/block.h +++ b/src/include/block.h @@ -19,36 +19,53 @@ /* * The block allocator maintains two primary skiplists: first, the by-offset - * list linking WT_FREE elements and sorted by file offset (low-to-high): + * list linking WT_EXT elements and sorted by file offset (low-to-high): * this list has an entry for every free chunk in the file. The second primary * skiplist is the by-size list linking WT_SIZE elements and sorted by chunk * size (low-to-high). This list has an entry for every free chunk size seen * since the list was created. * Additionally, each WT_SIZE element has a skiplist of its own, linking - * WT_FREE elements and sorted by file offset (low-to-high). This list has an + * WT_EXT elements and sorted by file offset (low-to-high). This list has an * entry for every free chunk in the file of a particular size. - * The trickiness is that each individual WT_FREE element appears on two + * The trickiness is that each individual WT_EXT element appears on two * skiplists. In order to minimize allocation calls, we allocate a single - * array of WT_FREE pointers at the end of the WT_FREE structure, for both - * skiplists, and store the depth of the skiplist in the WT_FREE structure. - * The skiplist entries for the offset skiplist start at WT_FREE.next[0] and - * the entries for the size skiplist start at WT_FREE.next[WT_FREE.depth]. - * - * WT_FREE -- - * Encapsulation of a free chunk of space. + * array of WT_EXT pointers at the end of the WT_EXT structure, for both + * skiplists, and store the depth of the skiplist in the WT_EXT structure. + * The skiplist entries for the offset skiplist start at WT_EXT.next[0] and + * the entries for the size skiplist start at WT_EXT.next[WT_EXT.depth]. */ -struct __wt_free { - off_t off; /* File offset */ - off_t size; /* Size */ + +/* + * WT_EXTLIST -- + * An extent list. + */ +struct __wt_extlist { + const char *name; /* Name */ + + uint64_t bytes; /* Byte count */ + uint32_t entries; /* Entry count */ + + WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Size/offset skiplists */ + WT_SIZE *size[WT_SKIP_MAXDEPTH]; +}; + +/* + * WT_EXT -- + * Encapsulation of an extent, either allocated or freed within the + * snapshot. + */ +struct __wt_ext { + off_t off; /* Extent's file offset */ + off_t size; /* Extent's Size */ uint8_t depth; /* Skip list depth */ /* * Variable-length array, sized by the number of skiplist elements. - * The first depth array entries are the offset skiplist elements, + * The first depth array entries are the address skiplist elements, * the second depth array entries are the size skiplist. */ - WT_FREE *next[0]; /* Offset, size skiplists */ + WT_EXT *next[0]; /* Offset, size skiplists */ }; /* @@ -60,23 +77,23 @@ struct __wt_size { uint8_t depth; /* Skip list depth */ - WT_FREE *foff[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */ + WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */ /* Variable-length array, sized by the number of skiplist elements. */ WT_SIZE *next[0]; /* Size skiplist */ }; /* - * WT_FREE_FOREACH -- + * WT_EXT_FOREACH -- * Walk a block manager skiplist. - * WT_FREE_FOREACH_OFF -- - * Walk a block manager skiplist where the WT_FREE.next entries are offset + * WT_EXT_FOREACH_OFF -- + * Walk a block manager skiplist where the WT_EXT.next entries are offset * by the depth. */ -#define WT_FREE_FOREACH(skip, head) \ +#define WT_EXT_FOREACH(skip, head) \ for ((skip) = (head)[0]; \ (skip) != NULL; (skip) = (skip)->next[0]) -#define WT_FREE_FOREACH_OFF(skip, head) \ +#define WT_EXT_FOREACH_OFF(skip, head) \ for ((skip) = (head)[0]; \ (skip) != NULL; (skip) = (skip)->next[(skip)->depth]) @@ -99,13 +116,7 @@ struct __wt_block { /* Freelist support */ WT_SPINLOCK freelist_lock; /* Lock to protect the freelist. */ - uint64_t freelist_bytes; /* Freelist byte count */ - uint32_t freelist_entries; /* Freelist entry count */ - int freelist_dirty; /* Freelist has been modified */ - - /* Freelist offset/size skiplists */ - WT_FREE *foff[WT_SKIP_MAXDEPTH]; - WT_SIZE *fsize[WT_SKIP_MAXDEPTH]; + WT_EXTLIST free; /* Freelist offset/size skiplists */ off_t free_offset; /* Freelist file location */ uint32_t free_size; @@ -137,7 +148,7 @@ struct __wt_block_desc { uint32_t cksum; /* 08-11: Description block checksum */ uint32_t unused; /* 12-15: Padding */ -#define WT_BLOCK_FREELIST_MAGIC 071002 +#define WT_BLOCK_EXTLIST_MAGIC 071002 uint64_t free_offset; /* 16-23: Free list page offset */ uint32_t free_size; /* 24-27: Free list page length */ uint32_t free_cksum; /* 28-31: Free list page checksum */ diff --git a/src/include/extern.h b/src/include/extern.h index 13b39c43d85..e262dbb5a66 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -19,9 +19,10 @@ extern int __wt_block_addr_string(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size); +extern uint32_t __wt_cksum(const void *chunk, size_t len); extern int __wt_block_alloc( WT_SESSION_IMPL *session, WT_BLOCK *block, - off_t *offsetp, + off_t *offp, off_t size); extern int __wt_block_free_buf(WT_SESSION_IMPL *session, WT_BLOCK *block, @@ -31,15 +32,23 @@ extern int __wt_block_free( WT_SESSION_IMPL *session, WT_BLOCK *block, off_t off, off_t size); +extern int __wt_block_extlist_read(WT_SESSION_IMPL *session, + WT_BLOCK *block, + WT_EXTLIST *el, + off_t off, + uint32_t size, + uint32_t cksum); extern void __wt_block_freelist_open(WT_SESSION_IMPL *session, WT_BLOCK *block); -extern int __wt_block_freelist_read(WT_SESSION_IMPL *session, WT_BLOCK *block); extern void __wt_block_freelist_close(WT_SESSION_IMPL *session, WT_BLOCK *block); -extern int __wt_block_freelist_write(WT_SESSION_IMPL *session, WT_BLOCK *block); +extern int __wt_block_extlist_write(WT_SESSION_IMPL *session, + WT_BLOCK *block, + WT_EXTLIST *el, + off_t *offp, + uint32_t *sizep, + uint32_t *cksump); extern void __wt_block_discard(WT_SESSION_IMPL *session, WT_BLOCK *block); extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block); -extern void __wt_block_dump(WT_SESSION_IMPL *session, WT_BLOCK *block); -extern uint32_t __wt_cksum(const void *chunk, size_t len); extern int __wt_bm_addr_valid( WT_SESSION_IMPL *session, const uint8_t *addr, uint32_t addr_size); diff --git a/src/include/wt_internal.in b/src/include/wt_internal.in index d2b6aa7f96f..04d9080fb5e 100644 --- a/src/include/wt_internal.in +++ b/src/include/wt_internal.in @@ -97,10 +97,12 @@ struct __wt_evict_list; typedef struct __wt_evict_list WT_EVICT_LIST; struct __wt_evict_req; typedef struct __wt_evict_req WT_EVICT_REQ; +struct __wt_ext; + typedef struct __wt_ext WT_EXT; +struct __wt_extlist; + typedef struct __wt_extlist WT_EXTLIST; struct __wt_fh; typedef struct __wt_fh WT_FH; -struct __wt_free; - typedef struct __wt_free WT_FREE; struct __wt_hazard; typedef struct __wt_hazard WT_HAZARD; struct __wt_ikey; |