Diffstat (limited to 'src/third_party/wiredtiger/src/block/block_ext.c')
-rw-r--r--   src/third_party/wiredtiger/src/block/block_ext.c   2301
1 files changed, 1102 insertions, 1199 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
index 82e85658e22..ac8ef950868 100644
--- a/src/third_party/wiredtiger/src/block/block_ext.c
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -13,1462 +13,1365 @@
* Handle extension list errors that would normally panic the system but
* which should fail gracefully when verifying.
*/
-#define WT_BLOCK_RET(session, block, v, ...) do { \
- int __ret = (v); \
- __wt_err(session, __ret, __VA_ARGS__); \
- return ((block)->verify ? __ret : __wt_panic(session)); \
-} while (0)
-
-static int __block_append(WT_SESSION_IMPL *,
- WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t);
-static int __block_ext_overlap(WT_SESSION_IMPL *,
- WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **);
-static int __block_extlist_dump(
- WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, const char *);
-static int __block_merge(WT_SESSION_IMPL *,
- WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t);
+#define WT_BLOCK_RET(session, block, v, ...) \
+ do { \
+ int __ret = (v); \
+ __wt_err(session, __ret, __VA_ARGS__); \
+ return ((block)->verify ? __ret : __wt_panic(session)); \
+ } while (0)
+
+static int __block_append(WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t);
+static int __block_ext_overlap(
+ WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **);
+static int __block_extlist_dump(WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, const char *);
+static int __block_merge(WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t);
/*
* __block_off_srch_last --
- * Return the last element in the list, along with a stack for appending.
+ * Return the last element in the list, along with a stack for appending.
*/
static inline WT_EXT *
__block_off_srch_last(WT_EXT **head, WT_EXT ***stack)
{
- WT_EXT **extp, *last;
- int i;
-
- last = NULL; /* The list may be empty */
-
- /*
- * Start at the highest skip level, then go as far as possible at each
- * level before stepping down to the next.
- */
- for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
- if (*extp != NULL) {
- last = *extp;
- extp = &(*extp)->next[i];
- } else
- stack[i--] = extp--;
- return (last);
+ WT_EXT **extp, *last;
+ int i;
+
+ last = NULL; /* The list may be empty */
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each level before stepping
+ * down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+ if (*extp != NULL) {
+ last = *extp;
+ extp = &(*extp)->next[i];
+ } else
+ stack[i--] = extp--;
+ return (last);
}
/*
* __block_off_srch --
- * Search a by-offset skiplist (either the primary by-offset list, or the
- * by-offset list referenced by a size entry), for the specified offset.
+ * Search a by-offset skiplist (either the primary by-offset list, or the by-offset list
+ * referenced by a size entry), for the specified offset.
*/
static inline void
__block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, bool skip_off)
{
- WT_EXT **extp;
- int i;
-
- /*
- * Start at the highest skip level, then go as far as possible at each
- * level before stepping down to the next.
- *
- * Return a stack for an exact match or the next-largest item.
- *
- * The WT_EXT structure contains two skiplists, the primary one and the
- * per-size bucket one: if the skip_off flag is set, offset the skiplist
- * array by the depth specified in this particular structure.
- */
- for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
- if (*extp != NULL && (*extp)->off < off)
- extp =
- &(*extp)->next[i + (skip_off ? (*extp)->depth : 0)];
- else
- stack[i--] = extp--;
+ WT_EXT **extp;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ *
+ * Return a stack for an exact match or the next-largest item.
+ *
+ * The WT_EXT structure contains two skiplists, the primary one and the
+ * per-size bucket one: if the skip_off flag is set, offset the skiplist
+ * array by the depth specified in this particular structure.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+ if (*extp != NULL && (*extp)->off < off)
+ extp = &(*extp)->next[i + (skip_off ? (*extp)->depth : 0)];
+ else
+ stack[i--] = extp--;
}
/*
* __block_first_srch --
- * Search the skiplist for the first available slot.
+ * Search the skiplist for the first available slot.
*/
static inline bool
__block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
{
- WT_EXT *ext;
-
- /*
- * Linear walk of the available chunks in offset order; take the first
- * one that's large enough.
- */
- WT_EXT_FOREACH(ext, head)
- if (ext->size >= size)
- break;
- if (ext == NULL)
- return (false);
-
- /* Build a stack for the offset we want. */
- __block_off_srch(head, ext->off, stack, false);
- return (true);
+ WT_EXT *ext;
+
+ /*
+ * Linear walk of the available chunks in offset order; take the first one that's large enough.
+ */
+ WT_EXT_FOREACH (ext, head)
+ if (ext->size >= size)
+ break;
+ if (ext == NULL)
+ return (false);
+
+ /* Build a stack for the offset we want. */
+ __block_off_srch(head, ext->off, stack, false);
+ return (true);
}
/*
* __block_size_srch --
- * Search the by-size skiplist for the specified size.
+ * Search the by-size skiplist for the specified size.
*/
static inline void
__block_size_srch(WT_SIZE **head, wt_off_t size, WT_SIZE ***stack)
{
- WT_SIZE **szp;
- int i;
-
- /*
- * Start at the highest skip level, then go as far as possible at each
- * level before stepping down to the next.
- *
- * Return a stack for an exact match or the next-largest item.
- */
- for (i = WT_SKIP_MAXDEPTH - 1, szp = &head[i]; i >= 0;)
- if (*szp != NULL && (*szp)->size < size)
- szp = &(*szp)->next[i];
- else
- stack[i--] = szp--;
+ WT_SIZE **szp;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ *
+ * Return a stack for an exact match or the next-largest item.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, szp = &head[i]; i >= 0;)
+ if (*szp != NULL && (*szp)->size < size)
+ szp = &(*szp)->next[i];
+ else
+ stack[i--] = szp--;
}
/*
* __block_off_srch_pair --
- * Search a by-offset skiplist for before/after records of the specified
- * offset.
+ * Search a by-offset skiplist for before/after records of the specified offset.
*/
static inline void
-__block_off_srch_pair(
- WT_EXTLIST *el, wt_off_t off, WT_EXT **beforep, WT_EXT **afterp)
+__block_off_srch_pair(WT_EXTLIST *el, wt_off_t off, WT_EXT **beforep, WT_EXT **afterp)
{
- WT_EXT **head, **extp;
- int i;
-
- *beforep = *afterp = NULL;
-
- head = el->off;
-
- /*
- * Start at the highest skip level, then go as far as possible at each
- * level before stepping down to the next.
- */
- for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) {
- if (*extp == NULL) {
- --i;
- --extp;
- continue;
- }
-
- if ((*extp)->off < off) { /* Keep going at this level */
- *beforep = *extp;
- extp = &(*extp)->next[i];
- } else { /* Drop down a level */
- *afterp = *extp;
- --i;
- --extp;
- }
- }
+ WT_EXT **head, **extp;
+ int i;
+
+ *beforep = *afterp = NULL;
+
+ head = el->off;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each level before stepping
+ * down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) {
+ if (*extp == NULL) {
+ --i;
+ --extp;
+ continue;
+ }
+
+ if ((*extp)->off < off) { /* Keep going at this level */
+ *beforep = *extp;
+ extp = &(*extp)->next[i];
+ } else { /* Drop down a level */
+ *afterp = *extp;
+ --i;
+ --extp;
+ }
+ }
}
/*
* __block_ext_insert --
- * Insert an extent into an extent list.
+ * Insert an extent into an extent list.
*/
static int
__block_ext_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext)
{
- WT_EXT **astack[WT_SKIP_MAXDEPTH];
- WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
- u_int i;
-
- /*
- * If we are inserting a new size onto the size skiplist, we'll need a
- * new WT_SIZE structure for that skiplist.
- */
- if (el->track_size) {
- __block_size_srch(el->sz, ext->size, sstack);
- szp = *sstack[0];
- if (szp == NULL || szp->size != ext->size) {
- WT_RET(__wt_block_size_alloc(session, &szp));
- szp->size = ext->size;
- szp->depth = ext->depth;
- for (i = 0; i < ext->depth; ++i) {
- szp->next[i] = *sstack[i];
- *sstack[i] = szp;
- }
- }
-
- /*
- * Insert the new WT_EXT structure into the size element's
- * offset skiplist.
- */
- __block_off_srch(szp->off, ext->off, astack, true);
- for (i = 0; i < ext->depth; ++i) {
- ext->next[i + ext->depth] = *astack[i];
- *astack[i] = ext;
- }
- }
+ WT_EXT **astack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /*
+ * If we are inserting a new size onto the size skiplist, we'll need a new WT_SIZE structure for
+ * that skiplist.
+ */
+ if (el->track_size) {
+ __block_size_srch(el->sz, ext->size, sstack);
+ szp = *sstack[0];
+ if (szp == NULL || szp->size != ext->size) {
+ WT_RET(__wt_block_size_alloc(session, &szp));
+ szp->size = ext->size;
+ szp->depth = ext->depth;
+ for (i = 0; i < ext->depth; ++i) {
+ szp->next[i] = *sstack[i];
+ *sstack[i] = szp;
+ }
+ }
+
+ /*
+ * Insert the new WT_EXT structure into the size element's offset skiplist.
+ */
+ __block_off_srch(szp->off, ext->off, astack, true);
+ for (i = 0; i < ext->depth; ++i) {
+ ext->next[i + ext->depth] = *astack[i];
+ *astack[i] = ext;
+ }
+ }
#ifdef HAVE_DIAGNOSTIC
- if (!el->track_size)
- for (i = 0; i < ext->depth; ++i)
- ext->next[i + ext->depth] = NULL;
+ if (!el->track_size)
+ for (i = 0; i < ext->depth; ++i)
+ ext->next[i + ext->depth] = NULL;
#endif
- /* Insert the new WT_EXT structure into the offset skiplist. */
- __block_off_srch(el->off, ext->off, astack, false);
- for (i = 0; i < ext->depth; ++i) {
- ext->next[i] = *astack[i];
- *astack[i] = ext;
- }
+ /* Insert the new WT_EXT structure into the offset skiplist. */
+ __block_off_srch(el->off, ext->off, astack, false);
+ for (i = 0; i < ext->depth; ++i) {
+ ext->next[i] = *astack[i];
+ *astack[i] = ext;
+ }
- ++el->entries;
- el->bytes += (uint64_t)ext->size;
+ ++el->entries;
+ el->bytes += (uint64_t)ext->size;
- /* Update the cached end-of-list. */
- if (ext->next[0] == NULL)
- el->last = ext;
+ /* Update the cached end-of-list. */
+ if (ext->next[0] == NULL)
+ el->last = ext;
- return (0);
+ return (0);
}
/*
* __block_off_insert --
- * Insert a file range into an extent list.
+ * Insert a file range into an extent list.
*/
static int
-__block_off_insert(
- WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+__block_off_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
- WT_EXT *ext;
+ WT_EXT *ext;
- WT_RET(__wt_block_ext_alloc(session, &ext));
- ext->off = off;
- ext->size = size;
+ WT_RET(__wt_block_ext_alloc(session, &ext));
+ ext->off = off;
+ ext->size = size;
- return (__block_ext_insert(session, el, ext));
+ return (__block_ext_insert(session, el, ext));
}
#ifdef HAVE_DIAGNOSTIC
/*
* __block_off_match --
- * Return if any part of a specified range appears on a specified extent
- * list.
+ * Return if any part of a specified range appears on a specified extent list.
*/
static bool
__block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
- WT_EXT *before, *after;
+ WT_EXT *before, *after;
- /* Search for before and after entries for the offset. */
- __block_off_srch_pair(el, off, &before, &after);
+ /* Search for before and after entries for the offset. */
+ __block_off_srch_pair(el, off, &before, &after);
- /* If "before" or "after" overlaps, we have a winner. */
- if (before != NULL && before->off + before->size > off)
- return (true);
- if (after != NULL && off + size > after->off)
- return (true);
- return (false);
+ /* If "before" or "after" overlaps, we have a winner. */
+ if (before != NULL && before->off + before->size > off)
+ return (true);
+ if (after != NULL && off + size > after->off)
+ return (true);
+ return (false);
}
/*
* __wt_block_misplaced --
- * Complain if a block appears on the available or discard lists.
+ * Complain if a block appears on the available or discard lists.
*/
int
-__wt_block_misplaced(
- WT_SESSION_IMPL *session, WT_BLOCK *block, const char *list,
- wt_off_t offset, uint32_t size, bool live, const char *func, int line)
+__wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *list, wt_off_t offset,
+ uint32_t size, bool live, const char *func, int line)
{
- const char *name;
-
- name = NULL;
-
- /*
- * Don't check during the salvage read phase, we might be reading an
- * already freed overflow page.
- */
- if (F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
- return (0);
-
- /*
- * Verify a block the btree engine thinks it "owns" doesn't appear on
- * the available or discard lists (it might reasonably be on the alloc
- * list, if it was allocated since the last checkpoint). The engine
- * "owns" a block if it's trying to read or free the block, and those
- * functions make this check.
- *
- * Any block being read or freed should not be "available".
- *
- * Any block being read or freed in the live system should not be on the
- * discard list. (A checkpoint handle might be reading a block which is
- * on the live system's discard list; any attempt to free a block from a
- * checkpoint handle has already failed.)
- */
- __wt_spin_lock(session, &block->live_lock);
- if (__block_off_match(&block->live.avail, offset, size))
- name = "available";
- else if (live && __block_off_match(&block->live.discard, offset, size))
- name = "discard";
- __wt_spin_unlock(session, &block->live_lock);
- if (name != NULL) {
- __wt_errx(session,
- "%s failed: %" PRIuMAX "/%" PRIu32 " is on the %s list "
- "(%s, %d)",
- list, (uintmax_t)offset, size, name, func, line);
- return (__wt_panic(session));
- }
- return (0);
+ const char *name;
+
+ name = NULL;
+
+ /*
+ * Don't check during the salvage read phase, we might be reading an already freed overflow
+ * page.
+ */
+ if (F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
+ return (0);
+
+ /*
+ * Verify a block the btree engine thinks it "owns" doesn't appear on
+ * the available or discard lists (it might reasonably be on the alloc
+ * list, if it was allocated since the last checkpoint). The engine
+ * "owns" a block if it's trying to read or free the block, and those
+ * functions make this check.
+ *
+ * Any block being read or freed should not be "available".
+ *
+ * Any block being read or freed in the live system should not be on the
+ * discard list. (A checkpoint handle might be reading a block which is
+ * on the live system's discard list; any attempt to free a block from a
+ * checkpoint handle has already failed.)
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ if (__block_off_match(&block->live.avail, offset, size))
+ name = "available";
+ else if (live && __block_off_match(&block->live.discard, offset, size))
+ name = "discard";
+ __wt_spin_unlock(session, &block->live_lock);
+ if (name != NULL) {
+ __wt_errx(session, "%s failed: %" PRIuMAX "/%" PRIu32
+ " is on the %s list "
+ "(%s, %d)",
+ list, (uintmax_t)offset, size, name, func, line);
+ return (__wt_panic(session));
+ }
+ return (0);
}
#endif
/*
* __block_off_remove --
- * Remove a record from an extent list.
+ * Remove a record from an extent list.
*/
static int
-__block_off_remove(WT_SESSION_IMPL *session, WT_BLOCK *block,
- WT_EXTLIST *el, wt_off_t off, WT_EXT **extp)
+__block_off_remove(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, WT_EXT **extp)
{
- WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
- WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
- u_int i;
-
- /* Find and remove the record from the by-offset skiplist. */
- __block_off_srch(el->off, off, astack, false);
- ext = *astack[0];
- if (ext == NULL || ext->off != off)
- goto corrupt;
- for (i = 0; i < ext->depth; ++i)
- *astack[i] = ext->next[i];
-
- /*
- * Find and remove the record from the size's offset skiplist; if that
- * empties the by-size skiplist entry, remove it as well.
- */
- if (el->track_size) {
- __block_size_srch(el->sz, ext->size, sstack);
- szp = *sstack[0];
- if (szp == NULL || szp->size != ext->size)
- WT_PANIC_RET(session, EINVAL,
- "extent not found in by-size list during remove");
- __block_off_srch(szp->off, off, astack, true);
- ext = *astack[0];
- if (ext == NULL || ext->off != off)
- goto corrupt;
- for (i = 0; i < ext->depth; ++i)
- *astack[i] = ext->next[i + ext->depth];
- if (szp->off[0] == NULL) {
- for (i = 0; i < szp->depth; ++i)
- *sstack[i] = szp->next[i];
- __wt_block_size_free(session, szp);
- }
- }
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /* Find and remove the record from the by-offset skiplist. */
+ __block_off_srch(el->off, off, astack, false);
+ ext = *astack[0];
+ if (ext == NULL || ext->off != off)
+ goto corrupt;
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext->next[i];
+
+ /*
+ * Find and remove the record from the size's offset skiplist; if that empties the by-size
+ * skiplist entry, remove it as well.
+ */
+ if (el->track_size) {
+ __block_size_srch(el->sz, ext->size, sstack);
+ szp = *sstack[0];
+ if (szp == NULL || szp->size != ext->size)
+ WT_PANIC_RET(session, EINVAL, "extent not found in by-size list during remove");
+ __block_off_srch(szp->off, off, astack, true);
+ ext = *astack[0];
+ if (ext == NULL || ext->off != off)
+ goto corrupt;
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext->next[i + ext->depth];
+ if (szp->off[0] == NULL) {
+ for (i = 0; i < szp->depth; ++i)
+ *sstack[i] = szp->next[i];
+ __wt_block_size_free(session, szp);
+ }
+ }
#ifdef HAVE_DIAGNOSTIC
- if (!el->track_size) {
- bool not_null;
- for (i = 0, not_null = false; i < ext->depth; ++i)
- if (ext->next[i + ext->depth] != NULL)
- not_null = true;
- WT_ASSERT(session, not_null == false);
- }
+ if (!el->track_size) {
+ bool not_null;
+ for (i = 0, not_null = false; i < ext->depth; ++i)
+ if (ext->next[i + ext->depth] != NULL)
+ not_null = true;
+ WT_ASSERT(session, not_null == false);
+ }
#endif
- --el->entries;
- el->bytes -= (uint64_t)ext->size;
+ --el->entries;
+ el->bytes -= (uint64_t)ext->size;
- /* Return the record if our caller wants it, otherwise free it. */
- if (extp == NULL)
- __wt_block_ext_free(session, ext);
- else
- *extp = ext;
+ /* Return the record if our caller wants it, otherwise free it. */
+ if (extp == NULL)
+ __wt_block_ext_free(session, ext);
+ else
+ *extp = ext;
- /* Update the cached end-of-list. */
- if (el->last == ext)
- el->last = NULL;
+ /* Update the cached end-of-list. */
+ if (el->last == ext)
+ el->last = NULL;
- return (0);
+ return (0);
corrupt:
- WT_BLOCK_RET(session, block, EINVAL,
- "attempt to remove non-existent offset from an extent list");
+ WT_BLOCK_RET(
+ session, block, EINVAL, "attempt to remove non-existent offset from an extent list");
}
/*
* __wt_block_off_remove_overlap --
- * Remove a range from an extent list, where the range may be part of a
- * overlapping entry.
+ * Remove a range from an extent list, where the range may be part of an overlapping entry.
*/
int
-__wt_block_off_remove_overlap(WT_SESSION_IMPL *session, WT_BLOCK *block,
- WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+__wt_block_off_remove_overlap(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
- WT_EXT *before, *after, *ext;
- wt_off_t a_off, a_size, b_off, b_size;
-
- WT_ASSERT(session, off != WT_BLOCK_INVALID_OFFSET);
-
- /* Search for before and after entries for the offset. */
- __block_off_srch_pair(el, off, &before, &after);
-
- /* If "before" or "after" overlaps, retrieve the overlapping entry. */
- if (before != NULL && before->off + before->size > off) {
- WT_RET(__block_off_remove(
- session, block, el, before->off, &ext));
-
- /* Calculate overlapping extents. */
- a_off = ext->off;
- a_size = off - ext->off;
- b_off = off + size;
- b_size = ext->size - (a_size + size);
- } else if (after != NULL && off + size > after->off) {
- WT_RET(__block_off_remove(
- session, block, el, after->off, &ext));
-
- /*
- * Calculate overlapping extents. There's no initial overlap
- * since the after extent presumably cannot begin before "off".
- */
- a_off = WT_BLOCK_INVALID_OFFSET;
- a_size = 0;
- b_off = off + size;
- b_size = ext->size - (b_off - ext->off);
- } else
- return (WT_NOTFOUND);
-
- /*
- * If there are overlaps, insert the item; re-use the extent structure
- * and save the allocation (we know there's no need to merge).
- */
- if (a_size != 0) {
- ext->off = a_off;
- ext->size = a_size;
- WT_RET(__block_ext_insert(session, el, ext));
- ext = NULL;
- }
- if (b_size != 0) {
- if (ext == NULL)
- WT_RET(__block_off_insert(session, el, b_off, b_size));
- else {
- ext->off = b_off;
- ext->size = b_size;
- WT_RET(__block_ext_insert(session, el, ext));
- ext = NULL;
- }
- }
- if (ext != NULL)
- __wt_block_ext_free(session, ext);
- return (0);
+ WT_EXT *before, *after, *ext;
+ wt_off_t a_off, a_size, b_off, b_size;
+
+ WT_ASSERT(session, off != WT_BLOCK_INVALID_OFFSET);
+
+ /* Search for before and after entries for the offset. */
+ __block_off_srch_pair(el, off, &before, &after);
+
+ /* If "before" or "after" overlaps, retrieve the overlapping entry. */
+ if (before != NULL && before->off + before->size > off) {
+ WT_RET(__block_off_remove(session, block, el, before->off, &ext));
+
+ /* Calculate overlapping extents. */
+ a_off = ext->off;
+ a_size = off - ext->off;
+ b_off = off + size;
+ b_size = ext->size - (a_size + size);
+ } else if (after != NULL && off + size > after->off) {
+ WT_RET(__block_off_remove(session, block, el, after->off, &ext));
+
+ /*
+ * Calculate overlapping extents. There's no initial overlap since the after extent
+ * presumably cannot begin before "off".
+ */
+ a_off = WT_BLOCK_INVALID_OFFSET;
+ a_size = 0;
+ b_off = off + size;
+ b_size = ext->size - (b_off - ext->off);
+ } else
+ return (WT_NOTFOUND);
+
+ /*
+ * If there are overlaps, insert the item; re-use the extent structure and save the allocation
+ * (we know there's no need to merge).
+ */
+ if (a_size != 0) {
+ ext->off = a_off;
+ ext->size = a_size;
+ WT_RET(__block_ext_insert(session, el, ext));
+ ext = NULL;
+ }
+ if (b_size != 0) {
+ if (ext == NULL)
+ WT_RET(__block_off_insert(session, el, b_off, b_size));
+ else {
+ ext->off = b_off;
+ ext->size = b_size;
+ WT_RET(__block_ext_insert(session, el, ext));
+ ext = NULL;
+ }
+ }
+ if (ext != NULL)
+ __wt_block_ext_free(session, ext);
+ return (0);
}
/*
* __block_extend --
- * Extend the file to allocate space.
+ * Extend the file to allocate space.
*/
static inline int
-__block_extend(
- WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
+__block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
{
- /*
- * Callers of this function are expected to have already acquired any
- * locks required to extend the file.
- *
- * We should never be allocating from an empty file.
- */
- if (block->size < block->allocsize)
- WT_RET_MSG(session, EINVAL,
- "file has no description information");
-
- /*
- * Make sure we don't allocate past the maximum file size. There's no
- * easy way to know the maximum wt_off_t on a system, limit growth to
- * 8B bits (we currently check an wt_off_t is 8B in verify_build.h). I
- * don't think we're likely to see anything bigger for awhile.
- */
- if (block->size > (wt_off_t)INT64_MAX - size)
- WT_RET_MSG(session, WT_ERROR,
- "block allocation failed, file cannot grow further");
-
- *offp = block->size;
- block->size += size;
-
- WT_STAT_DATA_INCR(session, block_extension);
- __wt_verbose(session, WT_VERB_BLOCK,
- "file extend %" PRIdMAX "B @ %" PRIdMAX,
- (intmax_t)size, (intmax_t)*offp);
-
- return (0);
+ /*
+ * Callers of this function are expected to have already acquired any
+ * locks required to extend the file.
+ *
+ * We should never be allocating from an empty file.
+ */
+ if (block->size < block->allocsize)
+ WT_RET_MSG(session, EINVAL, "file has no description information");
+
+ /*
+ * Make sure we don't allocate past the maximum file size. There's no
+ * easy way to know the maximum wt_off_t on a system, limit growth to
+ * 8B bits (we currently check an wt_off_t is 8B in verify_build.h). I
+ * don't think we're likely to see anything bigger for awhile.
+ */
+ if (block->size > (wt_off_t)INT64_MAX - size)
+ WT_RET_MSG(session, WT_ERROR, "block allocation failed, file cannot grow further");
+
+ *offp = block->size;
+ block->size += size;
+
+ WT_STAT_DATA_INCR(session, block_extension);
+ __wt_verbose(session, WT_VERB_BLOCK, "file extend %" PRIdMAX "B @ %" PRIdMAX, (intmax_t)size,
+ (intmax_t)*offp);
+
+ return (0);
}
/*
* __wt_block_alloc --
- * Alloc a chunk of space from the underlying file.
+ * Alloc a chunk of space from the underlying file.
*/
int
-__wt_block_alloc(
- WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
+__wt_block_alloc(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
{
- WT_EXT *ext, **estack[WT_SKIP_MAXDEPTH];
- WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
-
- /* If a sync is running, no other sessions can allocate blocks. */
- WT_ASSERT(session, WT_SESSION_BTREE_SYNC_SAFE(session, S2BT(session)));
-
- /* Assert we're maintaining the by-size skiplist. */
- WT_ASSERT(session, block->live.avail.track_size != 0);
-
- WT_STAT_DATA_INCR(session, block_alloc);
- if (size % block->allocsize != 0)
- WT_RET_MSG(session, EINVAL,
- "cannot allocate a block size %" PRIdMAX " that is not "
- "a multiple of the allocation size %" PRIu32,
- (intmax_t)size, block->allocsize);
-
- /*
- * Allocation is either first-fit (lowest offset), or best-fit (best
- * size). If it's first-fit, walk the offset list linearly until we
- * find an entry that will work.
- *
- * If it's best-fit by size, search the by-size skiplist for the size
- * and take the first entry on the by-size offset list. This means we
- * prefer best-fit over lower offset, but within a size we'll prefer an
- * offset appearing earlier in the file.
- *
- * If we don't have anything big enough, extend the file.
- */
- if (block->live.avail.bytes < (uint64_t)size)
- goto append;
- if (block->allocfirst) {
- if (!__block_first_srch(block->live.avail.off, size, estack))
- goto append;
- ext = *estack[0];
- } else {
- __block_size_srch(block->live.avail.sz, size, sstack);
- if ((szp = *sstack[0]) == NULL) {
-append: WT_RET(__block_extend(session, block, offp, size));
- WT_RET(__block_append(session, block,
- &block->live.alloc, *offp, (wt_off_t)size));
- return (0);
- }
-
- /* Take the first record. */
- ext = szp->off[0];
- }
-
- /* Remove the record, and set the returned offset. */
- WT_RET(__block_off_remove(
- session, block, &block->live.avail, ext->off, &ext));
- *offp = ext->off;
-
- /* If doing a partial allocation, adjust the record and put it back. */
- if (ext->size > size) {
- __wt_verbose(session, WT_VERB_BLOCK,
- "allocate %" PRIdMAX " from range %" PRIdMAX "-%"
- PRIdMAX ", range shrinks to %" PRIdMAX "-%" PRIdMAX,
- (intmax_t)size,
- (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
- (intmax_t)(ext->off + size),
- (intmax_t)(ext->off + size + ext->size - size));
-
- ext->off += size;
- ext->size -= size;
- WT_RET(__block_ext_insert(session, &block->live.avail, ext));
- } else {
- __wt_verbose(session, WT_VERB_BLOCK,
- "allocate range %" PRIdMAX "-%" PRIdMAX,
- (intmax_t)ext->off, (intmax_t)(ext->off + ext->size));
-
- __wt_block_ext_free(session, ext);
- }
-
- /* Add the newly allocated extent to the list of allocations. */
- WT_RET(__block_merge(
- session, block, &block->live.alloc, *offp, (wt_off_t)size));
- return (0);
+ WT_EXT *ext, **estack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+
+ /* If a sync is running, no other sessions can allocate blocks. */
+ WT_ASSERT(session, WT_SESSION_BTREE_SYNC_SAFE(session, S2BT(session)));
+
+ /* Assert we're maintaining the by-size skiplist. */
+ WT_ASSERT(session, block->live.avail.track_size != 0);
+
+ WT_STAT_DATA_INCR(session, block_alloc);
+ if (size % block->allocsize != 0)
+ WT_RET_MSG(session, EINVAL, "cannot allocate a block size %" PRIdMAX
+ " that is not "
+ "a multiple of the allocation size %" PRIu32,
+ (intmax_t)size, block->allocsize);
+
+ /*
+ * Allocation is either first-fit (lowest offset), or best-fit (best
+ * size). If it's first-fit, walk the offset list linearly until we
+ * find an entry that will work.
+ *
+ * If it's best-fit by size, search the by-size skiplist for the size
+ * and take the first entry on the by-size offset list. This means we
+ * prefer best-fit over lower offset, but within a size we'll prefer an
+ * offset appearing earlier in the file.
+ *
+ * If we don't have anything big enough, extend the file.
+ */
+ if (block->live.avail.bytes < (uint64_t)size)
+ goto append;
+ if (block->allocfirst) {
+ if (!__block_first_srch(block->live.avail.off, size, estack))
+ goto append;
+ ext = *estack[0];
+ } else {
+ __block_size_srch(block->live.avail.sz, size, sstack);
+ if ((szp = *sstack[0]) == NULL) {
+append:
+ WT_RET(__block_extend(session, block, offp, size));
+ WT_RET(__block_append(session, block, &block->live.alloc, *offp, (wt_off_t)size));
+ return (0);
+ }
+
+ /* Take the first record. */
+ ext = szp->off[0];
+ }
+
+ /* Remove the record, and set the returned offset. */
+ WT_RET(__block_off_remove(session, block, &block->live.avail, ext->off, &ext));
+ *offp = ext->off;
+
+ /* If doing a partial allocation, adjust the record and put it back. */
+ if (ext->size > size) {
+ __wt_verbose(session, WT_VERB_BLOCK,
+ "allocate %" PRIdMAX " from range %" PRIdMAX "-%" PRIdMAX ", range shrinks to %" PRIdMAX
+ "-%" PRIdMAX,
+ (intmax_t)size, (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)(ext->off + size), (intmax_t)(ext->off + size + ext->size - size));
+
+ ext->off += size;
+ ext->size -= size;
+ WT_RET(__block_ext_insert(session, &block->live.avail, ext));
+ } else {
+ __wt_verbose(session, WT_VERB_BLOCK, "allocate range %" PRIdMAX "-%" PRIdMAX,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size));
+
+ __wt_block_ext_free(session, ext);
+ }
+
+ /* Add the newly allocated extent to the list of allocations. */
+ WT_RET(__block_merge(session, block, &block->live.alloc, *offp, (wt_off_t)size));
+ return (0);
}
/*
* __wt_block_free --
- * Free a cookie-referenced chunk of space to the underlying file.
+ * Free a cookie-referenced chunk of space to the underlying file.
*/
int
-__wt_block_free(WT_SESSION_IMPL *session,
- WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
+__wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
{
- WT_DECL_RET;
- wt_off_t offset;
- uint32_t checksum, size;
+ WT_DECL_RET;
+ wt_off_t offset;
+ uint32_t checksum, size;
- WT_UNUSED(addr_size);
- WT_STAT_DATA_INCR(session, block_free);
+ WT_UNUSED(addr_size);
+ WT_STAT_DATA_INCR(session, block_free);
- /* Crack the cookie. */
- WT_RET(
- __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
- __wt_verbose(session, WT_VERB_BLOCK,
- "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)offset, (intmax_t)size);
+ __wt_verbose(
+ session, WT_VERB_BLOCK, "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)offset, (intmax_t)size);
#ifdef HAVE_DIAGNOSTIC
- WT_RET(__wt_block_misplaced(
- session, block, "free", offset, size, true, __func__, __LINE__));
+ WT_RET(__wt_block_misplaced(session, block, "free", offset, size, true, __func__, __LINE__));
#endif
- WT_RET(__wt_block_ext_prealloc(session, 5));
- __wt_spin_lock(session, &block->live_lock);
- ret = __wt_block_off_free(session, block, offset, (wt_off_t)size);
- __wt_spin_unlock(session, &block->live_lock);
+ WT_RET(__wt_block_ext_prealloc(session, 5));
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_off_free(session, block, offset, (wt_off_t)size);
+ __wt_spin_unlock(session, &block->live_lock);
- return (ret);
+ return (ret);
}
/*
* __wt_block_off_free --
- * Free a file range to the underlying file.
+ * Free a file range to the underlying file.
*/
int
-__wt_block_off_free(
- WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size)
+__wt_block_off_free(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size)
{
- WT_DECL_RET;
-
- /* If a sync is running, no other sessions can free blocks. */
- WT_ASSERT(session, WT_SESSION_BTREE_SYNC_SAFE(session, S2BT(session)));
-
- /*
- * Callers of this function are expected to have already acquired any
- * locks required to manipulate the extent lists.
- *
- * We can reuse this extent immediately if it was allocated during this
- * checkpoint, merge it into the avail list (which slows file growth in
- * workloads including repeated overflow record modification). If this
- * extent is referenced in a previous checkpoint, merge into the discard
- * list.
- */
- if ((ret = __wt_block_off_remove_overlap(
- session, block, &block->live.alloc, offset, size)) == 0)
- ret = __block_merge(
- session, block, &block->live.avail, offset, size);
- else if (ret == WT_NOTFOUND)
- ret = __block_merge(
- session, block, &block->live.discard, offset, size);
- return (ret);
+ WT_DECL_RET;
+
+ /* If a sync is running, no other sessions can free blocks. */
+ WT_ASSERT(session, WT_SESSION_BTREE_SYNC_SAFE(session, S2BT(session)));
+
+ /*
+ * Callers of this function are expected to have already acquired any
+ * locks required to manipulate the extent lists.
+ *
+ * We can reuse this extent immediately if it was allocated during this
+ * checkpoint, merge it into the avail list (which slows file growth in
+ * workloads including repeated overflow record modification). If this
+ * extent is referenced in a previous checkpoint, merge into the discard
+ * list.
+ */
+ if ((ret = __wt_block_off_remove_overlap(session, block, &block->live.alloc, offset, size)) ==
+ 0)
+ ret = __block_merge(session, block, &block->live.avail, offset, size);
+ else if (ret == WT_NOTFOUND)
+ ret = __block_merge(session, block, &block->live.discard, offset, size);
+ return (ret);
}
#ifdef HAVE_DIAGNOSTIC
/*
* __wt_block_extlist_check --
- * Return if the extent lists overlap.
+ * Return if the extent lists overlap.
*/
int
-__wt_block_extlist_check(
- WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl)
+__wt_block_extlist_check(WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl)
{
- WT_EXT *a, *b;
-
- a = al->off[0];
- b = bl->off[0];
-
- /* Walk the lists in parallel, looking for overlaps. */
- while (a != NULL && b != NULL) {
- /*
- * If there's no overlap, move the lower-offset entry to the
- * next entry in its list.
- */
- if (a->off + a->size <= b->off) {
- a = a->next[0];
- continue;
- }
- if (b->off + b->size <= a->off) {
- b = b->next[0];
- continue;
- }
- WT_PANIC_RET(session, EINVAL,
- "checkpoint merge check: %s list overlaps the %s list",
- al->name, bl->name);
- }
- return (0);
+ WT_EXT *a, *b;
+
+ a = al->off[0];
+ b = bl->off[0];
+
+ /* Walk the lists in parallel, looking for overlaps. */
+ while (a != NULL && b != NULL) {
+ /*
+ * If there's no overlap, move the lower-offset entry to the next entry in its list.
+ */
+ if (a->off + a->size <= b->off) {
+ a = a->next[0];
+ continue;
+ }
+ if (b->off + b->size <= a->off) {
+ b = b->next[0];
+ continue;
+ }
+ WT_PANIC_RET(session, EINVAL, "checkpoint merge check: %s list overlaps the %s list",
+ al->name, bl->name);
+ }
+ return (0);
}
#endif
/*
* __wt_block_extlist_overlap --
- * Review a checkpoint's alloc/discard extent lists, move overlaps into the
- * live system's checkpoint-avail list.
+ * Review a checkpoint's alloc/discard extent lists, move overlaps into the live system's
+ * checkpoint-avail list.
*/
int
-__wt_block_extlist_overlap(
- WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci)
+__wt_block_extlist_overlap(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci)
{
- WT_EXT *alloc, *discard;
-
- alloc = ci->alloc.off[0];
- discard = ci->discard.off[0];
-
- /* Walk the lists in parallel, looking for overlaps. */
- while (alloc != NULL && discard != NULL) {
- /*
- * If there's no overlap, move the lower-offset entry to the
- * next entry in its list.
- */
- if (alloc->off + alloc->size <= discard->off) {
- alloc = alloc->next[0];
- continue;
- }
- if (discard->off + discard->size <= alloc->off) {
- discard = discard->next[0];
- continue;
- }
-
- /* Reconcile the overlap. */
- WT_RET(__block_ext_overlap(session, block,
- &ci->alloc, &alloc, &ci->discard, &discard));
- }
- return (0);
+ WT_EXT *alloc, *discard;
+
+ alloc = ci->alloc.off[0];
+ discard = ci->discard.off[0];
+
+ /* Walk the lists in parallel, looking for overlaps. */
+ while (alloc != NULL && discard != NULL) {
+ /*
+ * If there's no overlap, move the lower-offset entry to the next entry in its list.
+ */
+ if (alloc->off + alloc->size <= discard->off) {
+ alloc = alloc->next[0];
+ continue;
+ }
+ if (discard->off + discard->size <= alloc->off) {
+ discard = discard->next[0];
+ continue;
+ }
+
+ /* Reconcile the overlap. */
+ WT_RET(__block_ext_overlap(session, block, &ci->alloc, &alloc, &ci->discard, &discard));
+ }
+ return (0);
}
/*
* __block_ext_overlap --
- * Reconcile two overlapping ranges.
+ * Reconcile two overlapping ranges.
*/
static int
-__block_ext_overlap(WT_SESSION_IMPL *session,
- WT_BLOCK *block, WT_EXTLIST *ael, WT_EXT **ap, WT_EXTLIST *bel, WT_EXT **bp)
+__block_ext_overlap(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *ael, WT_EXT **ap,
+ WT_EXTLIST *bel, WT_EXT **bp)
{
- WT_EXT *a, *b, **ext;
- WT_EXTLIST *avail, *el;
- wt_off_t off, size;
-
- avail = &block->live.ckpt_avail;
-
- /*
- * The ranges overlap, choose the range we're going to take from each.
- *
- * We can think of the overlap possibilities as 11 different cases:
- *
- * AAAAAAAAAAAAAAAAAA
- * #1 BBBBBBBBBBBBBBBBBB ranges are the same
- * #2 BBBBBBBBBBBBB overlaps the beginning
- * #3 BBBBBBBBBBBBBBBB overlaps the end
- * #4 BBBBB B is a prefix of A
- * #5 BBBBBB B is middle of A
- * #6 BBBBBBBBBB B is a suffix of A
- *
- * and:
- *
- * BBBBBBBBBBBBBBBBBB
- * #7 AAAAAAAAAAAAA same as #3
- * #8 AAAAAAAAAAAAAAAA same as #2
- * #9 AAAAA A is a prefix of B
- * #10 AAAAAA A is middle of B
- * #11 AAAAAAAAAA A is a suffix of B
- *
- *
- * By swapping the arguments so "A" is always the lower range, we can
- * eliminate cases #2, #8, #10 and #11, and only handle 7 cases:
- *
- * AAAAAAAAAAAAAAAAAA
- * #1 BBBBBBBBBBBBBBBBBB ranges are the same
- * #3 BBBBBBBBBBBBBBBB overlaps the end
- * #4 BBBBB B is a prefix of A
- * #5 BBBBBB B is middle of A
- * #6 BBBBBBBBBB B is a suffix of A
- *
- * and:
- *
- * BBBBBBBBBBBBBBBBBB
- * #7 AAAAAAAAAAAAA same as #3
- * #9 AAAAA A is a prefix of B
- */
- a = *ap;
- b = *bp;
- if (a->off > b->off) { /* Swap */
- b = *ap;
- a = *bp;
- ext = ap; ap = bp; bp = ext;
- el = ael; ael = bel; bel = el;
- }
-
- if (a->off == b->off) { /* Case #1, #4, #9 */
- if (a->size == b->size) { /* Case #1 */
- /*
- * Move caller's A and B to the next element
- * Add that A and B range to the avail list
- * Delete A and B
- */
- *ap = (*ap)->next[0];
- *bp = (*bp)->next[0];
- WT_RET(__block_merge(
- session, block, avail, b->off, b->size));
- WT_RET(__block_off_remove(
- session, block, ael, a->off, NULL));
- WT_RET(__block_off_remove(
- session, block, bel, b->off, NULL));
- }
- else if (a->size > b->size) { /* Case #4 */
- /*
- * Remove A from its list
- * Increment/Decrement A's offset/size by the size of B
- * Insert A on its list
- */
- WT_RET(__block_off_remove(
- session, block, ael, a->off, &a));
- a->off += b->size;
- a->size -= b->size;
- WT_RET(__block_ext_insert(session, ael, a));
-
- /*
- * Move caller's B to the next element
- * Add B's range to the avail list
- * Delete B
- */
- *bp = (*bp)->next[0];
- WT_RET(__block_merge(
- session, block, avail, b->off, b->size));
- WT_RET(__block_off_remove(
- session, block, bel, b->off, NULL));
- } else { /* Case #9 */
- /*
- * Remove B from its list
- * Increment/Decrement B's offset/size by the size of A
- * Insert B on its list
- */
- WT_RET(__block_off_remove(
- session, block, bel, b->off, &b));
- b->off += a->size;
- b->size -= a->size;
- WT_RET(__block_ext_insert(session, bel, b));
-
- /*
- * Move caller's A to the next element
- * Add A's range to the avail list
- * Delete A
- */
- *ap = (*ap)->next[0];
- WT_RET(__block_merge(
- session, block, avail, a->off, a->size));
- WT_RET(__block_off_remove(
- session, block, ael, a->off, NULL));
- } /* Case #6 */
- } else if (a->off + a->size == b->off + b->size) {
- /*
- * Remove A from its list
- * Decrement A's size by the size of B
- * Insert A on its list
- */
- WT_RET(__block_off_remove(session, block, ael, a->off, &a));
- a->size -= b->size;
- WT_RET(__block_ext_insert(session, ael, a));
-
- /*
- * Move caller's B to the next element
- * Add B's range to the avail list
- * Delete B
- */
- *bp = (*bp)->next[0];
- WT_RET(__block_merge(session, block, avail, b->off, b->size));
- WT_RET(__block_off_remove(session, block, bel, b->off, NULL));
- } else if /* Case #3, #7 */
- (a->off + a->size < b->off + b->size) {
- /*
- * Add overlap to the avail list
- */
- off = b->off;
- size = (a->off + a->size) - b->off;
- WT_RET(__block_merge(session, block, avail, off, size));
-
- /*
- * Remove A from its list
- * Decrement A's size by the overlap
- * Insert A on its list
- */
- WT_RET(__block_off_remove(session, block, ael, a->off, &a));
- a->size -= size;
- WT_RET(__block_ext_insert(session, ael, a));
-
- /*
- * Remove B from its list
- * Increment/Decrement B's offset/size by the overlap
- * Insert B on its list
- */
- WT_RET(__block_off_remove(session, block, bel, b->off, &b));
- b->off += size;
- b->size -= size;
- WT_RET(__block_ext_insert(session, bel, b));
- } else { /* Case #5 */
- /* Calculate the offset/size of the trailing part of A. */
- off = b->off + b->size;
- size = (a->off + a->size) - off;
-
- /*
- * Remove A from its list
- * Decrement A's size by trailing part of A plus B's size
- * Insert A on its list
- */
- WT_RET(__block_off_remove(session, block, ael, a->off, &a));
- a->size = b->off - a->off;
- WT_RET(__block_ext_insert(session, ael, a));
-
- /* Add trailing part of A to A's list as a new element. */
- WT_RET(__block_merge(session, block, ael, off, size));
-
- /*
- * Move caller's B to the next element
- * Add B's range to the avail list
- * Delete B
- */
- *bp = (*bp)->next[0];
- WT_RET(__block_merge(session, block, avail, b->off, b->size));
- WT_RET(__block_off_remove(session, block, bel, b->off, NULL));
- }
-
- return (0);
+ WT_EXT *a, *b, **ext;
+ WT_EXTLIST *avail, *el;
+ wt_off_t off, size;
+
+ avail = &block->live.ckpt_avail;
+
+ /*
+ * The ranges overlap, choose the range we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB ranges are the same
+ * #2 BBBBBBBBBBBBB overlaps the beginning
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #8 AAAAAAAAAAAAAAAA same as #2
+ * #9 AAAAA A is a prefix of B
+ * #10 AAAAAA A is middle of B
+ * #11 AAAAAAAAAA A is a suffix of B
+ *
+ *
+ * By swapping the arguments so "A" is always the lower range, we can
+ * eliminate cases #2, #8, #10 and #11, and only handle 7 cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB ranges are the same
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #9 AAAAA A is a prefix of B
+ */
+ a = *ap;
+ b = *bp;
+ if (a->off > b->off) { /* Swap */
+ b = *ap;
+ a = *bp;
+ ext = ap;
+ ap = bp;
+ bp = ext;
+ el = ael;
+ ael = bel;
+ bel = el;
+ }
+
+ if (a->off == b->off) { /* Case #1, #4, #9 */
+ if (a->size == b->size) { /* Case #1 */
+ /*
+ * Move caller's A and B to the next element Add that A and B
+ * range to the avail list Delete A and B
+ */
+ *ap = (*ap)->next[0];
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, block, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, block, ael, a->off, NULL));
+ WT_RET(__block_off_remove(session, block, bel, b->off, NULL));
+ } else if (a->size > b->size) { /* Case #4 */
+ /*
+ * Remove A from its list Increment/Decrement A's
+ * offset/size by the size of B Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, block, ael, a->off, &a));
+ a->off += b->size;
+ a->size -= b->size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Move caller's B to the next element Add B's range to the avail list Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, block, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, block, bel, b->off, NULL));
+ } else { /* Case #9 */
+ /*
+ * Remove B from its list Increment/Decrement B's offset/size by the size of A
+ * Insert B on its list
+ */
+ WT_RET(__block_off_remove(session, block, bel, b->off, &b));
+ b->off += a->size;
+ b->size -= a->size;
+ WT_RET(__block_ext_insert(session, bel, b));
+
+ /*
+ * Move caller's A to the next element Add A's range to the avail list Delete A
+ */
+ *ap = (*ap)->next[0];
+ WT_RET(__block_merge(session, block, avail, a->off, a->size));
+ WT_RET(__block_off_remove(session, block, ael, a->off, NULL));
+ } /* Case #6 */
+ } else if (a->off + a->size == b->off + b->size) {
+ /*
+ * Remove A from its list Decrement A's size by the size of B Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, block, ael, a->off, &a));
+ a->size -= b->size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Move caller's B to the next element Add B's range to the avail list Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, block, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, block, bel, b->off, NULL));
+ } else if /* Case #3, #7 */
+ (a->off + a->size < b->off + b->size) {
+ /*
+ * Add overlap to the avail list
+ */
+ off = b->off;
+ size = (a->off + a->size) - b->off;
+ WT_RET(__block_merge(session, block, avail, off, size));
+
+ /*
+ * Remove A from its list Decrement A's size by the overlap Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, block, ael, a->off, &a));
+ a->size -= size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Remove B from its list Increment/Decrement B's offset/size by the overlap Insert B on its
+ * list
+ */
+ WT_RET(__block_off_remove(session, block, bel, b->off, &b));
+ b->off += size;
+ b->size -= size;
+ WT_RET(__block_ext_insert(session, bel, b));
+ } else { /* Case #5 */
+ /* Calculate the offset/size of the trailing part of A. */
+ off = b->off + b->size;
+ size = (a->off + a->size) - off;
+
+ /*
+ * Remove A from its list Decrement A's size by trailing part of A plus B's size Insert A on
+ * its list
+ */
+ WT_RET(__block_off_remove(session, block, ael, a->off, &a));
+ a->size = b->off - a->off;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /* Add trailing part of A to A's list as a new element. */
+ WT_RET(__block_merge(session, block, ael, off, size));
+
+ /*
+ * Move caller's B to the next element Add B's range to the avail list Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, block, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, block, bel, b->off, NULL));
+ }
+
+ return (0);
}
/*
* __wt_block_extlist_merge --
- * Merge one extent list into another.
+ * Merge one extent list into another.
*/
int
-__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_BLOCK *block,
- WT_EXTLIST *a, WT_EXTLIST *b)
+__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *a, WT_EXTLIST *b)
{
- WT_EXT *ext;
- WT_EXTLIST tmp;
- u_int i;
-
- __wt_verbose(
- session, WT_VERB_BLOCK, "merging %s into %s", a->name, b->name);
-
- /*
- * Sometimes the list we are merging is much bigger than the other: if
- * so, swap the lists around to reduce the amount of work we need to do
- * during the merge. The size lists have to match as well, so this is
- * only possible if both lists are tracking sizes, or neither are.
- */
- if (a->track_size == b->track_size && a->entries > b->entries) {
- tmp = *a;
- a->bytes = b->bytes;
- b->bytes = tmp.bytes;
- a->entries = b->entries;
- b->entries = tmp.entries;
- for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
- a->off[i] = b->off[i];
- b->off[i] = tmp.off[i];
- a->sz[i] = b->sz[i];
- b->sz[i] = tmp.sz[i];
- }
- }
-
- WT_EXT_FOREACH(ext, a->off)
- WT_RET(__block_merge(session, block, b, ext->off, ext->size));
-
- return (0);
+ WT_EXT *ext;
+ WT_EXTLIST tmp;
+ u_int i;
+
+ __wt_verbose(session, WT_VERB_BLOCK, "merging %s into %s", a->name, b->name);
+
+ /*
+ * Sometimes the list we are merging is much bigger than the other: if so, swap the lists around
+ * to reduce the amount of work we need to do during the merge. The size lists have to match as
+ * well, so this is only possible if both lists are tracking sizes, or neither are.
+ */
+ if (a->track_size == b->track_size && a->entries > b->entries) {
+ tmp = *a;
+ a->bytes = b->bytes;
+ b->bytes = tmp.bytes;
+ a->entries = b->entries;
+ b->entries = tmp.entries;
+ for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
+ a->off[i] = b->off[i];
+ b->off[i] = tmp.off[i];
+ a->sz[i] = b->sz[i];
+ b->sz[i] = tmp.sz[i];
+ }
+ }
+
+ WT_EXT_FOREACH (ext, a->off)
+ WT_RET(__block_merge(session, block, b, ext->off, ext->size));
+
+ return (0);
}
/*
* __block_append --
- * Append a new entry to the allocation list.
+ * Append a new entry to the allocation list.
*/
static int
-__block_append(WT_SESSION_IMPL *session, WT_BLOCK *block,
- WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+__block_append(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
- WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
- u_int i;
-
- WT_UNUSED(block);
- WT_ASSERT(session, el->track_size == 0);
-
- /*
- * Identical to __block_merge, when we know the file is being extended,
- * that is, the information is either going to be used to extend the
- * last object on the list, or become a new object ending the list.
- *
- * The terminating element of the list is cached, check it; otherwise,
- * get a stack for the last object in the skiplist, check for a simple
- * extension, and otherwise append a new structure.
- */
- if ((ext = el->last) != NULL && ext->off + ext->size == off)
- ext->size += size;
- else {
- ext = __block_off_srch_last(el->off, astack);
- if (ext != NULL && ext->off + ext->size == off)
- ext->size += size;
- else {
- WT_RET(__wt_block_ext_alloc(session, &ext));
- ext->off = off;
- ext->size = size;
-
- for (i = 0; i < ext->depth; ++i)
- *astack[i] = ext;
- ++el->entries;
- }
-
- /* Update the cached end-of-list */
- el->last = ext;
- }
- el->bytes += (uint64_t)size;
-
- return (0);
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ WT_UNUSED(block);
+ WT_ASSERT(session, el->track_size == 0);
+
+ /*
+ * Identical to __block_merge, when we know the file is being extended,
+ * that is, the information is either going to be used to extend the
+ * last object on the list, or become a new object ending the list.
+ *
+ * The terminating element of the list is cached, check it; otherwise,
+ * get a stack for the last object in the skiplist, check for a simple
+ * extension, and otherwise append a new structure.
+ */
+ if ((ext = el->last) != NULL && ext->off + ext->size == off)
+ ext->size += size;
+ else {
+ ext = __block_off_srch_last(el->off, astack);
+ if (ext != NULL && ext->off + ext->size == off)
+ ext->size += size;
+ else {
+ WT_RET(__wt_block_ext_alloc(session, &ext));
+ ext->off = off;
+ ext->size = size;
+
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext;
+ ++el->entries;
+ }
+
+ /* Update the cached end-of-list */
+ el->last = ext;
+ }
+ el->bytes += (uint64_t)size;
+
+ return (0);
}
/*
* __wt_block_insert_ext --
- * Insert an extent into an extent list, merging if possible.
+ * Insert an extent into an extent list, merging if possible.
*/
int
-__wt_block_insert_ext(WT_SESSION_IMPL *session, WT_BLOCK *block,
- WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+__wt_block_insert_ext(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
- /*
- * There are currently two copies of this function (this code is a one-
- * liner that calls the internal version of the function, which means
- * the compiler should compress out the function call). It's that way
- * because the interface is still fluid, I'm not convinced there won't
- * be a need for a functional split between the internal and external
- * versions in the future.
- *
- * Callers of this function are expected to have already acquired any
- * locks required to manipulate the extent list.
- */
- return (__block_merge(session, block, el, off, size));
+ /*
+ * There are currently two copies of this function (this code is a one-
+ * liner that calls the internal version of the function, which means
+ * the compiler should compress out the function call). It's that way
+ * because the interface is still fluid, I'm not convinced there won't
+ * be a need for a functional split between the internal and external
+ * versions in the future.
+ *
+ * Callers of this function are expected to have already acquired any
+ * locks required to manipulate the extent list.
+ */
+ return (__block_merge(session, block, el, off, size));
}
/*
* __block_merge --
- * Insert an extent into an extent list, merging if possible (internal
- * version).
+ * Insert an extent into an extent list, merging if possible (internal version).
*/
static int
-__block_merge(WT_SESSION_IMPL *session, WT_BLOCK *block,
- WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+__block_merge(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
- WT_EXT *ext, *after, *before;
-
- /*
- * Retrieve the records preceding/following the offset. If the records
- * are contiguous with the free'd offset, combine records.
- */
- __block_off_srch_pair(el, off, &before, &after);
- if (before != NULL) {
- if (before->off + before->size > off)
- WT_BLOCK_RET(session, block, EINVAL,
- "%s: existing range %" PRIdMAX "-%" PRIdMAX
- " overlaps with merge range %" PRIdMAX "-%" PRIdMAX,
- el->name,
- (intmax_t)before->off,
- (intmax_t)(before->off + before->size),
- (intmax_t)off, (intmax_t)(off + size));
- if (before->off + before->size != off)
- before = NULL;
- }
- if (after != NULL) {
- if (off + size > after->off) {
- WT_BLOCK_RET(session, block, EINVAL,
- "%s: merge range %" PRIdMAX "-%" PRIdMAX
- " overlaps with existing range %" PRIdMAX
- "-%" PRIdMAX,
- el->name,
- (intmax_t)off, (intmax_t)(off + size),
- (intmax_t)after->off,
- (intmax_t)(after->off + after->size));
- }
- if (off + size != after->off)
- after = NULL;
- }
- if (before == NULL && after == NULL) {
- __wt_verbose(session, WT_VERB_BLOCK,
- "%s: insert range %" PRIdMAX "-%" PRIdMAX,
- el->name, (intmax_t)off, (intmax_t)(off + size));
-
- return (__block_off_insert(session, el, off, size));
- }
-
- /*
- * If the "before" offset range abuts, we'll use it as our new record;
- * if the "after" offset range also abuts, include its size and remove
- * it from the system. Else, only the "after" offset range abuts, use
- * the "after" offset range as our new record. In either case, remove
- * the record we're going to use, adjust it and re-insert it.
- */
- if (before == NULL) {
- WT_RET(__block_off_remove(
- session, block, el, after->off, &ext));
-
- __wt_verbose(session, WT_VERB_BLOCK,
- "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
- PRIdMAX "-%" PRIdMAX,
- el->name,
- (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
- (intmax_t)off, (intmax_t)(off + ext->size + size));
-
- ext->off = off;
- ext->size += size;
- } else {
- if (after != NULL) {
- size += after->size;
- WT_RET(__block_off_remove(
- session, block, el, after->off, NULL));
- }
- WT_RET(__block_off_remove(
- session, block, el, before->off, &ext));
-
- __wt_verbose(session, WT_VERB_BLOCK,
- "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
- PRIdMAX "-%" PRIdMAX,
- el->name,
- (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
- (intmax_t)ext->off,
- (intmax_t)(ext->off + ext->size + size));
-
- ext->size += size;
- }
- return (__block_ext_insert(session, el, ext));
+ WT_EXT *ext, *after, *before;
+
+ /*
+ * Retrieve the records preceding/following the offset. If the records are contiguous with the
+ * free'd offset, combine records.
+ */
+ __block_off_srch_pair(el, off, &before, &after);
+ if (before != NULL) {
+ if (before->off + before->size > off)
+ WT_BLOCK_RET(session, block, EINVAL,
+ "%s: existing range %" PRIdMAX "-%" PRIdMAX " overlaps with merge range %" PRIdMAX
+ "-%" PRIdMAX,
+ el->name, (intmax_t)before->off, (intmax_t)(before->off + before->size),
+ (intmax_t)off, (intmax_t)(off + size));
+ if (before->off + before->size != off)
+ before = NULL;
+ }
+ if (after != NULL) {
+ if (off + size > after->off) {
+ WT_BLOCK_RET(session, block, EINVAL,
+ "%s: merge range %" PRIdMAX "-%" PRIdMAX " overlaps with existing range %" PRIdMAX
+ "-%" PRIdMAX,
+ el->name, (intmax_t)off, (intmax_t)(off + size), (intmax_t)after->off,
+ (intmax_t)(after->off + after->size));
+ }
+ if (off + size != after->off)
+ after = NULL;
+ }
+ if (before == NULL && after == NULL) {
+ __wt_verbose(session, WT_VERB_BLOCK, "%s: insert range %" PRIdMAX "-%" PRIdMAX, el->name,
+ (intmax_t)off, (intmax_t)(off + size));
+
+ return (__block_off_insert(session, el, off, size));
+ }
+
+    /*
+     * If the "before" offset range abuts, we'll use it as our new record; if the "after" offset
+     * range also abuts, include its size and remove it from the system. Otherwise, only the
+     * "after" offset range abuts; use it as our new record. In either case, remove the record
+     * we're going to use, adjust it and re-insert it.
+     */
+ if (before == NULL) {
+ WT_RET(__block_off_remove(session, block, el, after->off, &ext));
+
+ __wt_verbose(session, WT_VERB_BLOCK,
+ "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %" PRIdMAX "-%" PRIdMAX, el->name,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size), (intmax_t)off,
+ (intmax_t)(off + ext->size + size));
+
+ ext->off = off;
+ ext->size += size;
+ } else {
+ if (after != NULL) {
+ size += after->size;
+ WT_RET(__block_off_remove(session, block, el, after->off, NULL));
+ }
+ WT_RET(__block_off_remove(session, block, el, before->off, &ext));
+
+ __wt_verbose(session, WT_VERB_BLOCK,
+ "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %" PRIdMAX "-%" PRIdMAX, el->name,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size), (intmax_t)ext->off,
+ (intmax_t)(ext->off + ext->size + size));
+
+ ext->size += size;
+ }
+ return (__block_ext_insert(session, el, ext));
}
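
To make the merge rule above concrete, here is a minimal standalone sketch. It is not WiredTiger code: the range type and helper are invented for illustration, and the skiplist bookkeeping (remove, re-insert) is omitted so only the coalescing arithmetic shows.

#include <inttypes.h>
#include <stdio.h>

struct range {
    int64_t off;
    int64_t size;
};

/* Coalesce [off, off + size) with optional abutting neighbors; return the merged range. */
static struct range
merge_range(const struct range *before, const struct range *after, int64_t off, int64_t size)
{
    struct range r = {off, size};

    if (before != NULL && before->off + before->size == off) {
        r.off = before->off; /* Grow left: start at the predecessor's offset. */
        r.size += before->size;
    }
    if (after != NULL && r.off + r.size == after->off)
        r.size += after->size; /* Grow right: absorb the successor. */
    return (r);
}

int
main(void)
{
    struct range before = {0, 4096}, after = {8192, 4096};
    struct range r = merge_range(&before, &after, 4096, 4096);

    /* Prints 0-12288: three abutting 4KB ranges collapse into a single extent. */
    printf("%" PRId64 "-%" PRId64 "\n", r.off, r.off + r.size);
    return (0);
}
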
/*
* __wt_block_extlist_read_avail --
- * Read an avail extent list, includes minor special handling.
+ * Read an avail extent list, includes minor special handling.
*/
int
-__wt_block_extlist_read_avail(WT_SESSION_IMPL *session,
- WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
+__wt_block_extlist_read_avail(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
{
- WT_DECL_RET;
+ WT_DECL_RET;
- /* If there isn't a list, we're done. */
- if (el->offset == WT_BLOCK_INVALID_OFFSET)
- return (0);
+ /* If there isn't a list, we're done. */
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
#ifdef HAVE_DIAGNOSTIC
- /*
- * In diagnostic mode, reads are checked against the available and
- * discard lists (a block being read should never appear on either).
- * Checkpoint threads may be running in the file, don't race with
- * them.
- */
- __wt_spin_lock(session, &block->live_lock);
+ /*
+ * In diagnostic mode, reads are checked against the available and discard lists (a block being
+ * read should never appear on either). Checkpoint threads may be running in the file, don't
+ * race with them.
+ */
+ __wt_spin_lock(session, &block->live_lock);
#endif
- WT_ERR(__wt_block_extlist_read(session, block, el, ckpt_size));
+ WT_ERR(__wt_block_extlist_read(session, block, el, ckpt_size));
- /*
- * Extent blocks are allocated from the available list: if reading the
- * avail list, the extent blocks might be included, remove them.
- */
- WT_ERR_NOTFOUND_OK(__wt_block_off_remove_overlap(
- session, block, el, el->offset, el->size));
+    /*
+     * Extent blocks are allocated from the available list: if reading the avail list, the extent
+     * blocks might be included in it, so remove them.
+     */
+ WT_ERR_NOTFOUND_OK(__wt_block_off_remove_overlap(session, block, el, el->offset, el->size));
err:
#ifdef HAVE_DIAGNOSTIC
- __wt_spin_unlock(session, &block->live_lock);
+ __wt_spin_unlock(session, &block->live_lock);
#endif
- return (ret);
+ return (ret);
}
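
The overlap removal above can be pictured as interval subtraction. A minimal sketch follows, with an invented range type and made-up offsets, assuming the freshly read avail list happens to cover the blocks that store the extent list itself; the real code does this through its skiplists rather than on a single range.

#include <inttypes.h>
#include <stdio.h>

struct range {
    int64_t off, size;
};

/* Remove [off, off + size) from a single covering range, yielding zero to two pieces. */
static int
range_subtract(struct range r, int64_t off, int64_t size, struct range out[2])
{
    int n = 0;

    if (r.off < off)
        out[n++] = (struct range){r.off, off - r.off};
    if (off + size < r.off + r.size)
        out[n++] = (struct range){off + size, (r.off + r.size) - (off + size)};
    return (n);
}

int
main(void)
{
    /* The avail list covers 0-64KB; the extent list itself lives at 16KB-20KB. */
    struct range avail = {0, 64 * 1024}, out[2];
    int i, n = range_subtract(avail, 16 * 1024, 4 * 1024, out);

    for (i = 0; i < n; ++i) /* prints 0-16384 and 20480-65536 */
        printf("%" PRId64 "-%" PRId64 "\n", out[i].off, out[i].off + out[i].size);
    return (0);
}
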
/*
* __wt_block_extlist_read --
- * Read an extent list.
+ * Read an extent list.
*/
int
-__wt_block_extlist_read(WT_SESSION_IMPL *session,
- WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
+__wt_block_extlist_read(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
{
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- wt_off_t off, size;
- int (*func)(
- WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t);
- const uint8_t *p;
-
- /* If there isn't a list, we're done. */
- if (el->offset == WT_BLOCK_INVALID_OFFSET)
- return (0);
-
- WT_RET(__wt_scr_alloc(session, el->size, &tmp));
- WT_ERR(__wt_block_read_off(
- session, block, tmp, el->offset, el->size, el->checksum));
-
- p = WT_BLOCK_HEADER_BYTE(tmp->mem);
- WT_ERR(__wt_extlist_read_pair(&p, &off, &size));
- if (off != WT_BLOCK_EXTLIST_MAGIC || size != 0)
- goto corrupted;
-
- /*
- * If we're not creating both offset and size skiplists, use the simpler
- * append API, otherwise do a full merge. There are two reasons for the
- * test: first, checkpoint "available" lists are NOT sorted (checkpoints
- * write two separate lists, both of which are sorted but they're not
- * merged). Second, the "available" list is sorted by size as well as
- * by offset, and the fast-path append code doesn't support that, it's
- * limited to offset. The test of "track size" is short-hand for "are
- * we reading the available-blocks list".
- */
- func = el->track_size == 0 ? __block_append : __block_merge;
- for (;;) {
- WT_ERR(__wt_extlist_read_pair(&p, &off, &size));
- if (off == WT_BLOCK_INVALID_OFFSET)
- break;
-
- /*
- * We check the offset/size pairs represent valid file ranges,
- * then insert them into the list. We don't necessarily have
- * to check for offsets past the end of the checkpoint, but it's
- * a cheap test to do here and we'd have to do the check as part
- * of file verification, regardless.
- */
- if (off < block->allocsize ||
- off % block->allocsize != 0 ||
- size % block->allocsize != 0 ||
- off + size > ckpt_size) {
-corrupted: __wt_scr_free(session, &tmp);
- WT_BLOCK_RET(session, block, WT_ERROR,
- "file contains a corrupted %s extent list, range %"
- PRIdMAX "-%" PRIdMAX " past end-of-file",
- el->name,
- (intmax_t)off, (intmax_t)(off + size));
- }
-
- WT_ERR(func(session, block, el, off, size));
- }
-
- WT_ERR(__block_extlist_dump(session, block, el, "read"));
-
-err: __wt_scr_free(session, &tmp);
- return (ret);
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ wt_off_t off, size;
+ const uint8_t *p;
+ int (*func)(WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t);
+
+ /* If there isn't a list, we're done. */
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+
+ WT_RET(__wt_scr_alloc(session, el->size, &tmp));
+ WT_ERR(__wt_block_read_off(session, block, tmp, el->offset, el->size, el->checksum));
+
+ p = WT_BLOCK_HEADER_BYTE(tmp->mem);
+ WT_ERR(__wt_extlist_read_pair(&p, &off, &size));
+ if (off != WT_BLOCK_EXTLIST_MAGIC || size != 0)
+ goto corrupted;
+
+    /*
+     * If we're not creating both offset and size skiplists, use the simpler append API; otherwise
+     * do a full merge. There are two reasons for the test: first, checkpoint "available" lists are
+     * NOT sorted (checkpoints write two separate lists, both of which are sorted but they're not
+     * merged). Second, the "available" list is sorted by size as well as by offset, and the
+     * fast-path append code doesn't support that; it's limited to offset. The test of "track size"
+     * is shorthand for "are we reading the available-blocks list".
+     */
+ func = el->track_size == 0 ? __block_append : __block_merge;
+ for (;;) {
+ WT_ERR(__wt_extlist_read_pair(&p, &off, &size));
+ if (off == WT_BLOCK_INVALID_OFFSET)
+ break;
+
+        /*
+         * We check that the offset/size pairs represent valid file ranges, then insert them into
+         * the list. We don't necessarily have to check for offsets past the end of the checkpoint,
+         * but it's a cheap test to do here, and we'd have to do the check as part of file
+         * verification regardless.
+         */
+ if (off < block->allocsize || off % block->allocsize != 0 || size % block->allocsize != 0 ||
+ off + size > ckpt_size) {
+corrupted:
+ __wt_scr_free(session, &tmp);
+ WT_BLOCK_RET(session, block, WT_ERROR,
+ "file contains a corrupted %s extent list, range %" PRIdMAX "-%" PRIdMAX
+ " past end-of-file",
+ el->name, (intmax_t)off, (intmax_t)(off + size));
+ }
+
+ WT_ERR(func(session, block, el, off, size));
+ }
+
+ WT_ERR(__block_extlist_dump(session, block, el, "read"));
+
+err:
+ __wt_scr_free(session, &tmp);
+ return (ret);
}
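
For orientation, the loop above walks a sequence of (offset, size) pairs. The sketch below shows only the logical layout and the per-pair validation; it is not the real decoder. The actual list is packed with WiredTiger's variable-length integer encoding, and the magic and invalid-offset constants here are stand-ins, not the real values.

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

#define EXTLIST_MAGIC 71002 /* stand-in for WT_BLOCK_EXTLIST_MAGIC */
#define INVALID_OFFSET (-1) /* stand-in for WT_BLOCK_INVALID_OFFSET */

/* Validate one (offset, size) pair against the allocation size and checkpoint size. */
static bool
pair_ok(int64_t off, int64_t size, int64_t allocsize, int64_t ckpt_size)
{
    return (off >= allocsize && off % allocsize == 0 && size % allocsize == 0 &&
      off + size <= ckpt_size);
}

int
main(void)
{
    /* Leading magic pair, two extents, terminating pair. */
    int64_t list[][2] = {{EXTLIST_MAGIC, 0}, {4096, 8192}, {20480, 4096}, {INVALID_OFFSET, 0}};
    int i;

    if (list[0][0] != EXTLIST_MAGIC || list[0][1] != 0)
        return (1); /* corrupted list */
    for (i = 1; list[i][0] != INVALID_OFFSET; ++i) {
        if (!pair_ok(list[i][0], list[i][1], 4096, 1 << 20))
            return (1); /* corrupted list */
        printf("extent %" PRId64 "-%" PRId64 "\n", list[i][0], list[i][0] + list[i][1]);
    }
    return (0);
}
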
/*
* __wt_block_extlist_write --
- * Write an extent list at the tail of the file.
+ * Write an extent list at the tail of the file.
*/
int
-__wt_block_extlist_write(WT_SESSION_IMPL *session,
- WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional)
+__wt_block_extlist_write(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional)
{
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- WT_EXT *ext;
- WT_PAGE_HEADER *dsk;
- size_t size;
- uint32_t entries;
- uint8_t *p;
-
- WT_RET(__block_extlist_dump(session, block, el, "write"));
-
- /*
- * Figure out how many entries we're writing -- if there aren't any
- * entries, there's nothing to write, unless we still have to write
- * the extent list to include the checkpoint recovery information.
- */
- entries = el->entries + (additional == NULL ? 0 : additional->entries);
- if (entries == 0 && block->final_ckpt == NULL) {
- el->offset = WT_BLOCK_INVALID_OFFSET;
- el->checksum = el->size = 0;
- return (0);
- }
-
- /*
- * Get a scratch buffer, clear the page's header and data, initialize
- * the header.
- *
- * Allocate memory for the extent list entries plus two additional
- * entries: the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-
- * terminating WT_BLOCK_INVALID_OFFSET/0 pair.
- */
- size = ((size_t)entries + 2) * 2 * WT_INTPACK64_MAXSIZE;
- WT_RET(__wt_block_write_size(session, block, &size));
- WT_RET(__wt_scr_alloc(session, size, &tmp));
- dsk = tmp->mem;
- memset(dsk, 0, WT_BLOCK_HEADER_BYTE_SIZE);
- dsk->type = WT_PAGE_BLOCK_MANAGER;
- dsk->version = WT_PAGE_VERSION_TS;
-
- /* Fill the page's data. */
- p = WT_BLOCK_HEADER_BYTE(dsk);
- /* Extent list starts */
- WT_ERR(__wt_extlist_write_pair(&p, WT_BLOCK_EXTLIST_MAGIC, 0));
- WT_EXT_FOREACH(ext, el->off) /* Free ranges */
- WT_ERR(__wt_extlist_write_pair(&p, ext->off, ext->size));
- if (additional != NULL)
- WT_EXT_FOREACH(ext, additional->off) /* Free ranges */
- WT_ERR(
- __wt_extlist_write_pair(&p, ext->off, ext->size));
- /* Extent list stops */
- WT_ERR(__wt_extlist_write_pair(&p, WT_BLOCK_INVALID_OFFSET,
- block->final_ckpt == NULL ? 0 : WT_BLOCK_EXTLIST_VERSION_CKPT));
-
- dsk->u.datalen = WT_PTRDIFF32(p, WT_BLOCK_HEADER_BYTE(dsk));
- tmp->size = dsk->mem_size = WT_PTRDIFF32(p, dsk);
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_EXT *ext;
+ WT_PAGE_HEADER *dsk;
+ size_t size;
+ uint32_t entries;
+ uint8_t *p;
+
+ WT_RET(__block_extlist_dump(session, block, el, "write"));
+
+ /*
+ * Figure out how many entries we're writing -- if there aren't any entries, there's nothing to
+ * write, unless we still have to write the extent list to include the checkpoint recovery
+ * information.
+ */
+ entries = el->entries + (additional == NULL ? 0 : additional->entries);
+ if (entries == 0 && block->final_ckpt == NULL) {
+ el->offset = WT_BLOCK_INVALID_OFFSET;
+ el->checksum = el->size = 0;
+ return (0);
+ }
+
+    /*
+     * Get a scratch buffer, clear the page's header and data, initialize the header.
+     *
+     * Allocate memory for the extent list entries plus two additional entries: the initial
+     * WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-terminating WT_BLOCK_INVALID_OFFSET/0 pair.
+     */
+ size = ((size_t)entries + 2) * 2 * WT_INTPACK64_MAXSIZE;
+ WT_RET(__wt_block_write_size(session, block, &size));
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+ dsk = tmp->mem;
+ memset(dsk, 0, WT_BLOCK_HEADER_BYTE_SIZE);
+ dsk->type = WT_PAGE_BLOCK_MANAGER;
+ dsk->version = WT_PAGE_VERSION_TS;
+
+ /* Fill the page's data. */
+ p = WT_BLOCK_HEADER_BYTE(dsk);
+ /* Extent list starts */
+ WT_ERR(__wt_extlist_write_pair(&p, WT_BLOCK_EXTLIST_MAGIC, 0));
+ WT_EXT_FOREACH (ext, el->off) /* Free ranges */
+ WT_ERR(__wt_extlist_write_pair(&p, ext->off, ext->size));
+ if (additional != NULL)
+ WT_EXT_FOREACH (ext, additional->off) /* Free ranges */
+ WT_ERR(__wt_extlist_write_pair(&p, ext->off, ext->size));
+ /* Extent list stops */
+ WT_ERR(__wt_extlist_write_pair(
+ &p, WT_BLOCK_INVALID_OFFSET, block->final_ckpt == NULL ? 0 : WT_BLOCK_EXTLIST_VERSION_CKPT));
+
+ dsk->u.datalen = WT_PTRDIFF32(p, WT_BLOCK_HEADER_BYTE(dsk));
+ tmp->size = dsk->mem_size = WT_PTRDIFF32(p, dsk);
#ifdef HAVE_DIAGNOSTIC
- /*
- * The extent list is written as a valid btree page because the salvage
- * functionality might move into the btree layer some day, besides, we
- * don't need another format and this way the page format can be easily
- * verified.
- */
- WT_ERR(__wt_verify_dsk(session, "[extent list check]", tmp));
+    /*
+     * The extent list is written as a valid btree page because the salvage functionality might
+     * move into the btree layer some day; besides, we don't need another format and this way the
+     * page format can be easily verified.
+     */
+ WT_ERR(__wt_verify_dsk(session, "[extent list check]", tmp));
#endif
- /* Write the extent list to disk. */
- WT_ERR(__wt_block_write_off(session, block,
- tmp, &el->offset, &el->size, &el->checksum, true, true, true));
+ /* Write the extent list to disk. */
+ WT_ERR(__wt_block_write_off(
+ session, block, tmp, &el->offset, &el->size, &el->checksum, true, true, true));
- /*
- * Remove the allocated blocks from the system's allocation list, extent
- * blocks never appear on any allocation list.
- */
- WT_TRET(__wt_block_off_remove_overlap(
- session, block, &block->live.alloc, el->offset, el->size));
+ /*
+ * Remove the allocated blocks from the system's allocation list, extent blocks never appear on
+ * any allocation list.
+ */
+ WT_TRET(
+ __wt_block_off_remove_overlap(session, block, &block->live.alloc, el->offset, el->size));
- __wt_verbose(session, WT_VERB_BLOCK,
- "%s written %" PRIdMAX "/%" PRIu32,
- el->name, (intmax_t)el->offset, el->size);
+ __wt_verbose(session, WT_VERB_BLOCK, "%s written %" PRIdMAX "/%" PRIu32, el->name,
+ (intmax_t)el->offset, el->size);
-err: __wt_scr_free(session, &tmp);
- return (ret);
+err:
+ __wt_scr_free(session, &tmp);
+ return (ret);
}
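
The buffer sizing above can be checked with a quick worked example. The numbers below are invented, the per-integer bound is an assumed stand-in for WT_INTPACK64_MAXSIZE, and the rounding step only approximates what __wt_block_write_size does (the real function also accounts for block headers).

#include <stdio.h>

#define INTPACK64_MAXSIZE 9 /* assumed upper bound on one packed 64-bit integer */

int
main(void)
{
    size_t entries = 1000, allocsize = 4096;

    /* (entries + 2) pairs: the magic pair, the extents, the terminating pair. */
    size_t size = (entries + 2) * 2 * INTPACK64_MAXSIZE; /* 18,036 bytes */

    /* Round up to a multiple of the allocation size: 20,480 bytes, i.e. five 4KB blocks. */
    size = ((size + allocsize - 1) / allocsize) * allocsize;
    printf("%zu\n", size);
    return (0);
}
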
/*
* __wt_block_extlist_truncate --
- * Truncate the file based on the last available extent in the list.
+ * Truncate the file based on the last available extent in the list.
*/
int
-__wt_block_extlist_truncate(
- WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+__wt_block_extlist_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
{
- WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
- wt_off_t size;
-
- /*
- * Check if the last available extent is at the end of the file, and if
- * so, truncate the file and discard the extent.
- */
- if ((ext = __block_off_srch_last(el->off, astack)) == NULL)
- return (0);
- WT_ASSERT(session, ext->off + ext->size <= block->size);
- if (ext->off + ext->size < block->size)
- return (0);
-
- /*
- * Remove the extent list entry. (Save the value, we need it to reset
- * the cached file size, and that can't happen until after the extent
- * list removal succeeds.)
- */
- size = ext->off;
- WT_RET(__block_off_remove(session, block, el, size, NULL));
-
- /* Truncate the file. */
- return (__wt_block_truncate(session, block, size));
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ wt_off_t size;
+
+ /*
+ * Check if the last available extent is at the end of the file, and if so, truncate the file
+ * and discard the extent.
+ */
+ if ((ext = __block_off_srch_last(el->off, astack)) == NULL)
+ return (0);
+ WT_ASSERT(session, ext->off + ext->size <= block->size);
+ if (ext->off + ext->size < block->size)
+ return (0);
+
+    /*
+     * Remove the extent list entry. (Save the value first; we need it to reset the cached file
+     * size, and that can't happen until after the extent list removal succeeds.)
+     */
+ size = ext->off;
+ WT_RET(__block_off_remove(session, block, el, size, NULL));
+
+ /* Truncate the file. */
+ return (__wt_block_truncate(session, block, size));
}
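
The truncate test above only fires when the last available extent runs all the way to the current end of the file. A tiny sketch with invented numbers:

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    int64_t file_size = 1 << 20;              /* a 1MB file */
    int64_t last_off = 768 * 1024;            /* last available extent starts at 768KB... */
    int64_t last_size = file_size - last_off; /* ...and runs to end-of-file */

    /* The extent ends exactly at EOF, so the file can shrink to the extent's start. */
    if (last_off + last_size == file_size)
        file_size = last_off;
    printf("new size %" PRId64 "\n", file_size); /* new size 786432 */
    return (0);
}
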
/*
* __wt_block_extlist_init --
- * Initialize an extent list.
+ * Initialize an extent list.
*/
int
-__wt_block_extlist_init(WT_SESSION_IMPL *session,
- WT_EXTLIST *el, const char *name, const char *extname, bool track_size)
+__wt_block_extlist_init(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, const char *name, const char *extname, bool track_size)
{
- size_t size;
+ size_t size;
- WT_CLEAR(*el);
+ WT_CLEAR(*el);
- size = (name == NULL ? 0 : strlen(name)) +
- strlen(".") + (extname == NULL ? 0 : strlen(extname) + 1);
- WT_RET(__wt_calloc_def(session, size, &el->name));
- WT_RET(__wt_snprintf(el->name, size, "%s.%s",
- name == NULL ? "" : name, extname == NULL ? "" : extname));
+ size =
+ (name == NULL ? 0 : strlen(name)) + strlen(".") + (extname == NULL ? 0 : strlen(extname) + 1);
+ WT_RET(__wt_calloc_def(session, size, &el->name));
+ WT_RET(__wt_snprintf(
+ el->name, size, "%s.%s", name == NULL ? "" : name, extname == NULL ? "" : extname));
- el->offset = WT_BLOCK_INVALID_OFFSET;
- el->track_size = track_size;
- return (0);
+ el->offset = WT_BLOCK_INVALID_OFFSET;
+ el->track_size = track_size;
+ return (0);
}
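
The size arithmetic above reserves one byte for the "." separator and folds the terminating NUL into the extname term. A small standalone illustration with made-up names (real callers pass checkpoint and extent-list names):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
    const char *name = "live", *extname = "avail";

    /* strlen(name) + 1 for "." + strlen(extname) + 1 for the NUL: 11 bytes for "live.avail". */
    size_t size = strlen(name) + strlen(".") + strlen(extname) + 1;
    char *buf = calloc(size, 1);

    if (buf == NULL)
        return (1);
    snprintf(buf, size, "%s.%s", name, extname);
    printf("%s (%zu bytes)\n", buf, size);
    free(buf);
    return (0);
}
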
/*
* __wt_block_extlist_free --
- * Discard an extent list.
+ * Discard an extent list.
*/
void
__wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el)
{
- WT_EXT *ext, *next;
- WT_SIZE *szp, *nszp;
-
- __wt_free(session, el->name);
-
- for (ext = el->off[0]; ext != NULL; ext = next) {
- next = ext->next[0];
- __wt_free(session, ext);
- }
- for (szp = el->sz[0]; szp != NULL; szp = nszp) {
- nszp = szp->next[0];
- __wt_free(session, szp);
- }
-
- /* Extent lists are re-used, clear them. */
- WT_CLEAR(*el);
+ WT_EXT *ext, *next;
+ WT_SIZE *szp, *nszp;
+
+ __wt_free(session, el->name);
+
+ for (ext = el->off[0]; ext != NULL; ext = next) {
+ next = ext->next[0];
+ __wt_free(session, ext);
+ }
+ for (szp = el->sz[0]; szp != NULL; szp = nszp) {
+ nszp = szp->next[0];
+ __wt_free(session, szp);
+ }
+
+ /* Extent lists are re-used, clear them. */
+ WT_CLEAR(*el);
}
/*
* __block_extlist_dump --
- * Dump an extent list as verbose messages.
+ * Dump an extent list as verbose messages.
*/
static int
-__block_extlist_dump(
- WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, const char *tag)
+__block_extlist_dump(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, const char *tag)
{
- WT_DECL_ITEM(t1);
- WT_DECL_ITEM(t2);
- WT_DECL_RET;
- WT_EXT *ext;
- uint64_t pow, sizes[64];
- u_int i;
- const char *sep;
-
- if (!block->verify_layout && !WT_VERBOSE_ISSET(session, WT_VERB_BLOCK))
- return (0);
-
- WT_ERR(__wt_scr_alloc(session, 0, &t1));
- if (block->verify_layout)
- WT_ERR(__wt_msg(session,
- "%s extent list %s, %" PRIu32 " entries, %s bytes",
- tag, el->name, el->entries,
- __wt_buf_set_size(session, el->bytes, true, t1)));
- else
- __wt_verbose(session, WT_VERB_BLOCK,
- "%s extent list %s, %" PRIu32 " entries, %s bytes",
- tag, el->name, el->entries,
- __wt_buf_set_size(session, el->bytes, true, t1));
-
- if (el->entries == 0)
- goto done;
-
- memset(sizes, 0, sizeof(sizes));
- WT_EXT_FOREACH(ext, el->off)
- for (i = 9, pow = 512;; ++i, pow *= 2)
- if (ext->size <= (wt_off_t)pow) {
- ++sizes[i];
- break;
- }
- sep = "extents by bucket:";
- t1->size = 0;
- WT_ERR(__wt_scr_alloc(session, 0, &t2));
- for (i = 9, pow = 512; i < WT_ELEMENTS(sizes); ++i, pow *= 2)
- if (sizes[i] != 0) {
- WT_ERR(__wt_buf_catfmt(session, t1,
- "%s {%s: %" PRIu64 "}",
- sep,
- __wt_buf_set_size(session, pow, false, t2),
- sizes[i]));
- sep = ",";
- }
-
- if (block->verify_layout)
- WT_ERR(__wt_msg(session, "%s", (char *)t1->data));
- else
- __wt_verbose(session, WT_VERB_BLOCK, "%s", (char *)t1->data);
-
-done: err:
- __wt_scr_free(session, &t1);
- __wt_scr_free(session, &t2);
- return (ret);
+ WT_DECL_ITEM(t1);
+ WT_DECL_ITEM(t2);
+ WT_DECL_RET;
+ WT_EXT *ext;
+ uint64_t pow, sizes[64];
+ u_int i;
+ const char *sep;
+
+ if (!block->verify_layout && !WT_VERBOSE_ISSET(session, WT_VERB_BLOCK))
+ return (0);
+
+ WT_ERR(__wt_scr_alloc(session, 0, &t1));
+ if (block->verify_layout)
+ WT_ERR(__wt_msg(session, "%s extent list %s, %" PRIu32 " entries, %s bytes", tag, el->name,
+ el->entries, __wt_buf_set_size(session, el->bytes, true, t1)));
+ else
+ __wt_verbose(session, WT_VERB_BLOCK, "%s extent list %s, %" PRIu32 " entries, %s bytes",
+ tag, el->name, el->entries, __wt_buf_set_size(session, el->bytes, true, t1));
+
+ if (el->entries == 0)
+ goto done;
+
+ memset(sizes, 0, sizeof(sizes));
+ WT_EXT_FOREACH (ext, el->off)
+ for (i = 9, pow = 512;; ++i, pow *= 2)
+ if (ext->size <= (wt_off_t)pow) {
+ ++sizes[i];
+ break;
+ }
+ sep = "extents by bucket:";
+ t1->size = 0;
+ WT_ERR(__wt_scr_alloc(session, 0, &t2));
+ for (i = 9, pow = 512; i < WT_ELEMENTS(sizes); ++i, pow *= 2)
+ if (sizes[i] != 0) {
+ WT_ERR(__wt_buf_catfmt(session, t1, "%s {%s: %" PRIu64 "}", sep,
+ __wt_buf_set_size(session, pow, false, t2), sizes[i]));
+ sep = ",";
+ }
+
+ if (block->verify_layout)
+ WT_ERR(__wt_msg(session, "%s", (char *)t1->data));
+ else
+ __wt_verbose(session, WT_VERB_BLOCK, "%s", (char *)t1->data);
+
+done:
+err:
+ __wt_scr_free(session, &t1);
+ __wt_scr_free(session, &t2);
+ return (ret);
}
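
The bucketing above counts each extent in the smallest power-of-two bucket that holds it, starting at 512 bytes (2^9). A standalone sketch with invented extent sizes:

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    uint64_t sizes[64] = {0}, pow;
    int64_t extents[] = {512, 4096, 5000, 131072};
    size_t i, j;

    /* Each extent lands in the first bucket whose size is at least the extent size. */
    for (j = 0; j < sizeof(extents) / sizeof(extents[0]); ++j)
        for (i = 9, pow = 512;; ++i, pow *= 2)
            if (extents[j] <= (int64_t)pow) {
                ++sizes[i];
                break;
            }

    /* Prints {512: 1}, {4096: 1}, {8192: 1} (the 5000-byte extent), {131072: 1}. */
    for (i = 9, pow = 512; i < 64; ++i, pow *= 2)
        if (sizes[i] != 0)
            printf("{%" PRIu64 ": %" PRIu64 "}\n", pow, sizes[i]);
    return (0);
}
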