diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_slvg.c')
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_slvg.c | 4171 |
1 files changed, 1979 insertions, 2192 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 5b0f2a5569a..fea2da9ae03 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -8,36 +8,39 @@ #include "wt_internal.h" -struct __wt_stuff; typedef struct __wt_stuff WT_STUFF; -struct __wt_track; typedef struct __wt_track WT_TRACK; -struct __wt_track_shared; typedef struct __wt_track_shared WT_TRACK_SHARED; +struct __wt_stuff; +typedef struct __wt_stuff WT_STUFF; +struct __wt_track; +typedef struct __wt_track WT_TRACK; +struct __wt_track_shared; +typedef struct __wt_track_shared WT_TRACK_SHARED; /* - * There's a bunch of stuff we pass around during salvage, group it together - * to make the code prettier. + * There's a bunch of stuff we pass around during salvage, group it together to make the code + * prettier. */ struct __wt_stuff { - WT_SESSION_IMPL *session; /* Salvage session */ + WT_SESSION_IMPL *session; /* Salvage session */ - WT_TRACK **pages; /* Pages */ - uint32_t pages_next; /* Next empty slot */ - size_t pages_allocated; /* Bytes allocated */ + WT_TRACK **pages; /* Pages */ + uint32_t pages_next; /* Next empty slot */ + size_t pages_allocated; /* Bytes allocated */ - WT_TRACK **ovfl; /* Overflow pages */ - uint32_t ovfl_next; /* Next empty slot */ - size_t ovfl_allocated; /* Bytes allocated */ + WT_TRACK **ovfl; /* Overflow pages */ + uint32_t ovfl_next; /* Next empty slot */ + size_t ovfl_allocated; /* Bytes allocated */ - WT_REF root_ref; /* Created root page */ + WT_REF root_ref; /* Created root page */ - uint8_t page_type; /* Page type */ + uint8_t page_type; /* Page type */ - /* If need to free blocks backing merged page ranges. */ - bool merge_free; + /* If need to free blocks backing merged page ranges. 
*/ + bool merge_free; - WT_ITEM *tmp1; /* Verbose print buffer */ - WT_ITEM *tmp2; /* Verbose print buffer */ + WT_ITEM *tmp1; /* Verbose print buffer */ + WT_ITEM *tmp2; /* Verbose print buffer */ - uint64_t fcnt; /* Progress counter */ + uint64_t fcnt; /* Progress counter */ }; /* @@ -45,24 +48,23 @@ struct __wt_stuff { * Information shared between pages being merged. */ struct __wt_track_shared { - uint32_t ref; /* Reference count */ - - /* - * Physical information about the file block. - */ - WT_ADDR addr; /* Page address */ - uint32_t size; /* Page size */ - uint64_t gen; /* Page generation */ - - /* - * Pages that reference overflow pages contain a list of the overflow - * pages they reference. We start out with a list of addresses, and - * convert to overflow array slots during the reconciliation of page - * references to overflow records. - */ - WT_ADDR *ovfl_addr; /* Overflow pages by address */ - uint32_t *ovfl_slot; /* Overflow pages by slot */ - uint32_t ovfl_cnt; /* Overflow reference count */ + uint32_t ref; /* Reference count */ + + /* + * Physical information about the file block. + */ + WT_ADDR addr; /* Page address */ + uint32_t size; /* Page size */ + uint64_t gen; /* Page generation */ + + /* + * Pages that reference overflow pages contain a list of the overflow pages they reference. We + * start out with a list of addresses, and convert to overflow array slots during the + * reconciliation of page references to overflow records. + */ + WT_ADDR *ovfl_addr; /* Overflow pages by address */ + uint32_t *ovfl_slot; /* Overflow pages by slot */ + uint32_t ovfl_cnt; /* Overflow reference count */ }; /* @@ -72,2453 +74,2238 @@ struct __wt_track_shared { * split the leaf page chunks up, one chunk for each unique key range. 
*/ struct __wt_track { -#define trk_addr shared->addr.addr -#define trk_addr_size shared->addr.size -#define trk_gen shared->gen -#define trk_ovfl_addr shared->ovfl_addr -#define trk_ovfl_cnt shared->ovfl_cnt -#define trk_ovfl_slot shared->ovfl_slot -#define trk_size shared->size - WT_TRACK_SHARED *shared; /* Shared information */ - - WT_STUFF *ss; /* Enclosing stuff */ - - union { - struct { -#undef row_start -#define row_start u.row._row_start - WT_ITEM _row_start; /* Row-store start range */ -#undef row_stop -#define row_stop u.row._row_stop - WT_ITEM _row_stop; /* Row-store stop range */ - } row; - - struct { -#undef col_start -#define col_start u.col._col_start - uint64_t _col_start; /* Col-store start range */ -#undef col_stop -#define col_stop u.col._col_stop - uint64_t _col_stop; /* Col-store stop range */ -#undef col_missing -#define col_missing u.col._col_missing - uint64_t _col_missing; /* Col-store missing range */ - } col; - } u; +#define trk_addr shared->addr.addr +#define trk_addr_size shared->addr.size +#define trk_gen shared->gen +#define trk_ovfl_addr shared->ovfl_addr +#define trk_ovfl_cnt shared->ovfl_cnt +#define trk_ovfl_slot shared->ovfl_slot +#define trk_size shared->size + WT_TRACK_SHARED *shared; /* Shared information */ + + WT_STUFF *ss; /* Enclosing stuff */ + + union { + struct { +#undef row_start +#define row_start u.row._row_start + WT_ITEM _row_start; /* Row-store start range */ +#undef row_stop +#define row_stop u.row._row_stop + WT_ITEM _row_stop; /* Row-store stop range */ + } row; + + struct { +#undef col_start +#define col_start u.col._col_start + uint64_t _col_start; /* Col-store start range */ +#undef col_stop +#define col_stop u.col._col_stop + uint64_t _col_stop; /* Col-store stop range */ +#undef col_missing +#define col_missing u.col._col_missing + uint64_t _col_missing; /* Col-store missing range */ + } col; + } u; /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_TRACK_CHECK_START 0x1u /* Row: initial key updated 
*/ -#define WT_TRACK_CHECK_STOP 0x2u /* Row: last key updated */ -#define WT_TRACK_MERGE 0x4u /* Page requires merging */ -#define WT_TRACK_OVFL_REFD 0x8u /* Overflow page referenced */ -/* AUTOMATIC FLAG VALUE GENERATION STOP */ - u_int flags; +#define WT_TRACK_CHECK_START 0x1u /* Row: initial key updated */ +#define WT_TRACK_CHECK_STOP 0x2u /* Row: last key updated */ +#define WT_TRACK_MERGE 0x4u /* Page requires merging */ +#define WT_TRACK_OVFL_REFD 0x8u /* Overflow page referenced */ + /* AUTOMATIC FLAG VALUE GENERATION STOP */ + u_int flags; }; -static int __slvg_cleanup(WT_SESSION_IMPL *, WT_STUFF *); -static int __slvg_col_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *); -static int __slvg_col_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *); -static int __slvg_col_ovfl(WT_SESSION_IMPL *, - WT_TRACK *, WT_PAGE *, uint64_t, uint64_t, uint64_t); -static int __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_cleanup(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_col_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *); +static int __slvg_col_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *); +static int __slvg_col_ovfl(WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint64_t, uint64_t, uint64_t); +static int __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *); static void __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *); -static int __slvg_col_range_overlap( - WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *); +static int __slvg_col_range_overlap(WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *); static void __slvg_col_trk_update_start(uint32_t, WT_STUFF *); -static int __slvg_merge_block_free(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_merge_block_free(WT_SESSION_IMPL *, WT_STUFF *); static int WT_CDECL __slvg_ovfl_compare(const void *, const void *); -static int __slvg_ovfl_discard(WT_SESSION_IMPL *, WT_STUFF *); -static int __slvg_ovfl_reconcile(WT_SESSION_IMPL *, WT_STUFF *); -static int 
__slvg_ovfl_ref(WT_SESSION_IMPL *, WT_TRACK *, bool); -static int __slvg_ovfl_ref_all(WT_SESSION_IMPL *, WT_TRACK *); -static int __slvg_read(WT_SESSION_IMPL *, WT_STUFF *); -static int __slvg_row_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *); -static int __slvg_row_build_leaf( - WT_SESSION_IMPL *, WT_TRACK *, WT_REF *, WT_STUFF *); -static int __slvg_row_ovfl( - WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint32_t, uint32_t); -static int __slvg_row_range(WT_SESSION_IMPL *, WT_STUFF *); -static int __slvg_row_range_overlap( - WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *); -static int __slvg_row_trk_update_start( - WT_SESSION_IMPL *, WT_ITEM *, uint32_t, WT_STUFF *); -static int WT_CDECL __slvg_trk_compare_addr(const void *, const void *); -static int WT_CDECL __slvg_trk_compare_gen(const void *, const void *); -static int WT_CDECL __slvg_trk_compare_key(const void *, const void *); -static int __slvg_trk_free(WT_SESSION_IMPL *, WT_TRACK **, bool); +static int __slvg_ovfl_discard(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_ovfl_reconcile(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_ovfl_ref(WT_SESSION_IMPL *, WT_TRACK *, bool); +static int __slvg_ovfl_ref_all(WT_SESSION_IMPL *, WT_TRACK *); +static int __slvg_read(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_row_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *); +static int __slvg_row_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *, WT_STUFF *); +static int __slvg_row_ovfl(WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint32_t, uint32_t); +static int __slvg_row_range(WT_SESSION_IMPL *, WT_STUFF *); +static int __slvg_row_range_overlap(WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *); +static int __slvg_row_trk_update_start(WT_SESSION_IMPL *, WT_ITEM *, uint32_t, WT_STUFF *); +static int WT_CDECL __slvg_trk_compare_addr(const void *, const void *); +static int WT_CDECL __slvg_trk_compare_gen(const void *, const void *); +static int WT_CDECL __slvg_trk_compare_key(const void *, 
const void *); +static int __slvg_trk_free(WT_SESSION_IMPL *, WT_TRACK **, bool); static void __slvg_trk_free_addr(WT_SESSION_IMPL *, WT_TRACK *); -static int __slvg_trk_init(WT_SESSION_IMPL *, uint8_t *, - size_t, uint32_t, uint64_t, WT_STUFF *, WT_TRACK **); -static int __slvg_trk_leaf(WT_SESSION_IMPL *, - const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *); -static int __slvg_trk_leaf_ovfl( - WT_SESSION_IMPL *, const WT_PAGE_HEADER *, WT_TRACK *); -static int __slvg_trk_ovfl(WT_SESSION_IMPL *, - const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *); +static int __slvg_trk_init( + WT_SESSION_IMPL *, uint8_t *, size_t, uint32_t, uint64_t, WT_STUFF *, WT_TRACK **); +static int __slvg_trk_leaf( + WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *); +static int __slvg_trk_leaf_ovfl(WT_SESSION_IMPL *, const WT_PAGE_HEADER *, WT_TRACK *); +static int __slvg_trk_ovfl( + WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *); /* * __wt_bt_salvage -- - * Salvage a Btree. + * Salvage a Btree. */ int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) { - WT_BM *bm; - WT_BTREE *btree; - WT_DECL_RET; - WT_STUFF *ss, stuff; - uint32_t i, leaf_cnt; - - WT_UNUSED(cfg); - - btree = S2BT(session); - bm = btree->bm; - - WT_CLEAR(stuff); - ss = &stuff; - ss->session = session; - ss->page_type = WT_PAGE_INVALID; - - /* Allocate temporary buffers. */ - WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp1)); - WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2)); - - /* - * Step 1: - * Inform the underlying block manager that we're salvaging the file. - */ - WT_ERR(bm->salvage_start(bm, session)); - - /* - * Step 2: - * Read the file and build in-memory structures that reference any leaf - * or overflow page. Any pages other than leaf or overflow pages are - * added to the free list. - * - * Turn off read checksum and verification error messages while we're - * reading the file, we expect to see corrupted blocks. 
- */ - F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); - ret = __slvg_read(session, ss); - F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE); - WT_ERR(ret); - - /* - * Step 3: - * Discard any page referencing a non-existent overflow page. We do - * this before checking overlapping key ranges on the grounds that a - * bad key range we can use is better than a terrific key range that - * references pages we don't have. On the other hand, we subsequently - * discard key ranges where there are better overlapping ranges, and - * it would be better if we let the availability of an overflow value - * inform our choices as to the key ranges we select, ideally on a - * per-key basis. - * - * A complicating problem is found in variable-length column-store - * objects, where we potentially split key ranges within RLE units. - * For example, if there's a page with rows 15-20 and we later find - * row 17 with a larger LSN, the range splits into 3 chunks, 15-16, - * 17, and 18-20. If rows 15-20 were originally a single value (an - * RLE of 6), and that record is an overflow record, we end up with - * two chunks, both of which want to reference the same overflow value. - * - * Instead of the approach just described, we're first discarding any - * pages referencing non-existent overflow pages, then we're reviewing - * our key ranges and discarding any that overlap. We're doing it that - * way for a few reasons: absent corruption, missing overflow items are - * strong arguments the page was replaced (on the other hand, some kind - * of file corruption is probably why we're here); it's a significant - * amount of additional complexity to simultaneously juggle overlapping - * ranges and missing overflow items; finally, real-world applications - * usually don't have a lot of overflow items, as WiredTiger supports - * very large page sizes, overflow items shouldn't be common. - * - * Step 4: - * Add unreferenced overflow page blocks to the free list so they are - * reused immediately. 
- */ - WT_ERR(__slvg_ovfl_reconcile(session, ss)); - WT_ERR(__slvg_ovfl_discard(session, ss)); - - /* - * Step 5: - * Walk the list of pages looking for overlapping ranges to resolve. - * If we find a range that needs to be resolved, set a global flag - * and a per WT_TRACK flag on the pages requiring modification. - * - * This requires sorting the page list by key, and secondarily by LSN. - * - * !!! - * It's vanishingly unlikely and probably impossible for fixed-length - * column-store files to have overlapping key ranges. It's possible - * for an entire key range to go missing (if a page is corrupted and - * lost), but because pages can't split, it shouldn't be possible to - * find pages where the key ranges overlap. That said, we check for - * it and clean up after it in reconciliation because it doesn't cost - * much and future column-store formats or operations might allow for - * fixed-length format ranges to overlap during salvage, and I don't - * want to have to retrofit the code later. - */ - __wt_qsort(ss->pages, - (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_key); - if (ss->page_type == WT_PAGE_ROW_LEAF) - WT_ERR(__slvg_row_range(session, ss)); - else - WT_ERR(__slvg_col_range(session, ss)); - - /* - * Step 6: - * We may have lost key ranges in column-store databases, that is, some - * part of the record number space is gone; look for missing ranges. - */ - switch (ss->page_type) { - case WT_PAGE_COL_FIX: - case WT_PAGE_COL_VAR: - __slvg_col_range_missing(session, ss); - break; - case WT_PAGE_ROW_LEAF: - break; - } - - /* - * Step 7: - * Build an internal page that references all of the leaf pages, - * and write it, as well as any merged pages, to the file. - * - * Count how many leaf pages we have (we could track this during the - * array shuffling/splitting, but that's a lot harder). 
- */ - for (leaf_cnt = i = 0; i < ss->pages_next; ++i) - if (ss->pages[i] != NULL) - ++leaf_cnt; - if (leaf_cnt != 0) - switch (ss->page_type) { - case WT_PAGE_COL_FIX: - case WT_PAGE_COL_VAR: - WT_WITH_PAGE_INDEX(session, - ret = __slvg_col_build_internal( - session, leaf_cnt, ss)); - WT_ERR(ret); - break; - case WT_PAGE_ROW_LEAF: - WT_WITH_PAGE_INDEX(session, - ret = __slvg_row_build_internal( - session, leaf_cnt, ss)); - WT_ERR(ret); - break; - } - - /* - * Step 8: - * If we had to merge key ranges, we have to do a final pass through - * the leaf page array and discard file pages used during key merges. - * We can't do it earlier: if we free'd the leaf pages we're merging as - * we merged them, the write of subsequent leaf pages or the internal - * page might allocate those free'd file blocks, and if the salvage run - * subsequently fails, we'd have overwritten pages used to construct the - * final key range. In other words, if the salvage run fails, we don't - * want to overwrite data the next salvage run might need. - */ - if (ss->merge_free) - WT_ERR(__slvg_merge_block_free(session, ss)); - - /* - * Step 9: - * Evict the newly created root page, creating a checkpoint. - */ - if (ss->root_ref.page != NULL) { - btree->ckpt = ckptbase; - ret = __wt_evict(session, &ss->root_ref, WT_REF_MEM, - WT_EVICT_CALL_CLOSING); - ss->root_ref.page = NULL; - btree->ckpt = NULL; - } - - /* - * Step 10: - * Inform the underlying block manager that we're done. - */ -err: WT_TRET(bm->salvage_end(bm, session)); - - /* Discard any root page we created. */ - if (ss->root_ref.page != NULL) - __wt_ref_out(session, &ss->root_ref); - - /* Discard the leaf and overflow page memory. */ - WT_TRET(__slvg_cleanup(session, ss)); - - /* Discard temporary buffers. 
*/ - __wt_scr_free(session, &ss->tmp1); - __wt_scr_free(session, &ss->tmp2); - - return (ret); + WT_BM *bm; + WT_BTREE *btree; + WT_DECL_RET; + WT_STUFF *ss, stuff; + uint32_t i, leaf_cnt; + + WT_UNUSED(cfg); + + btree = S2BT(session); + bm = btree->bm; + + WT_CLEAR(stuff); + ss = &stuff; + ss->session = session; + ss->page_type = WT_PAGE_INVALID; + + /* Allocate temporary buffers. */ + WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp1)); + WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2)); + + /* + * Step 1: Inform the underlying block manager that we're salvaging the file. + */ + WT_ERR(bm->salvage_start(bm, session)); + + /* + * Step 2: + * Read the file and build in-memory structures that reference any leaf + * or overflow page. Any pages other than leaf or overflow pages are + * added to the free list. + * + * Turn off read checksum and verification error messages while we're + * reading the file, we expect to see corrupted blocks. + */ + F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); + ret = __slvg_read(session, ss); + F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE); + WT_ERR(ret); + + /* + * Step 3: + * Discard any page referencing a non-existent overflow page. We do + * this before checking overlapping key ranges on the grounds that a + * bad key range we can use is better than a terrific key range that + * references pages we don't have. On the other hand, we subsequently + * discard key ranges where there are better overlapping ranges, and + * it would be better if we let the availability of an overflow value + * inform our choices as to the key ranges we select, ideally on a + * per-key basis. + * + * A complicating problem is found in variable-length column-store + * objects, where we potentially split key ranges within RLE units. + * For example, if there's a page with rows 15-20 and we later find + * row 17 with a larger LSN, the range splits into 3 chunks, 15-16, + * 17, and 18-20. 
If rows 15-20 were originally a single value (an + * RLE of 6), and that record is an overflow record, we end up with + * two chunks, both of which want to reference the same overflow value. + * + * Instead of the approach just described, we're first discarding any + * pages referencing non-existent overflow pages, then we're reviewing + * our key ranges and discarding any that overlap. We're doing it that + * way for a few reasons: absent corruption, missing overflow items are + * strong arguments the page was replaced (on the other hand, some kind + * of file corruption is probably why we're here); it's a significant + * amount of additional complexity to simultaneously juggle overlapping + * ranges and missing overflow items; finally, real-world applications + * usually don't have a lot of overflow items, as WiredTiger supports + * very large page sizes, overflow items shouldn't be common. + * + * Step 4: + * Add unreferenced overflow page blocks to the free list so they are + * reused immediately. + */ + WT_ERR(__slvg_ovfl_reconcile(session, ss)); + WT_ERR(__slvg_ovfl_discard(session, ss)); + + /* + * Step 5: + * Walk the list of pages looking for overlapping ranges to resolve. + * If we find a range that needs to be resolved, set a global flag + * and a per WT_TRACK flag on the pages requiring modification. + * + * This requires sorting the page list by key, and secondarily by LSN. + * + * !!! + * It's vanishingly unlikely and probably impossible for fixed-length + * column-store files to have overlapping key ranges. It's possible + * for an entire key range to go missing (if a page is corrupted and + * lost), but because pages can't split, it shouldn't be possible to + * find pages where the key ranges overlap. 
That said, we check for + * it and clean up after it in reconciliation because it doesn't cost + * much and future column-store formats or operations might allow for + * fixed-length format ranges to overlap during salvage, and I don't + * want to have to retrofit the code later. + */ + __wt_qsort(ss->pages, (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_key); + if (ss->page_type == WT_PAGE_ROW_LEAF) + WT_ERR(__slvg_row_range(session, ss)); + else + WT_ERR(__slvg_col_range(session, ss)); + + /* + * Step 6: We may have lost key ranges in column-store databases, that is, some part of the + * record number space is gone; look for missing ranges. + */ + switch (ss->page_type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + __slvg_col_range_missing(session, ss); + break; + case WT_PAGE_ROW_LEAF: + break; + } + + /* + * Step 7: + * Build an internal page that references all of the leaf pages, + * and write it, as well as any merged pages, to the file. + * + * Count how many leaf pages we have (we could track this during the + * array shuffling/splitting, but that's a lot harder). + */ + for (leaf_cnt = i = 0; i < ss->pages_next; ++i) + if (ss->pages[i] != NULL) + ++leaf_cnt; + if (leaf_cnt != 0) + switch (ss->page_type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + WT_WITH_PAGE_INDEX(session, ret = __slvg_col_build_internal(session, leaf_cnt, ss)); + WT_ERR(ret); + break; + case WT_PAGE_ROW_LEAF: + WT_WITH_PAGE_INDEX(session, ret = __slvg_row_build_internal(session, leaf_cnt, ss)); + WT_ERR(ret); + break; + } + + /* + * Step 8: If we had to merge key ranges, we have to do a final pass through the leaf page array + * and discard file pages used during key merges. 
We can't do it earlier: if we free'd the leaf + * pages we're merging as we merged them, the write of subsequent leaf pages or the internal + * page might allocate those free'd file blocks, and if the salvage run subsequently fails, we'd + * have overwritten pages used to construct the final key range. In other words, if the salvage + * run fails, we don't want to overwrite data the next salvage run might need. + */ + if (ss->merge_free) + WT_ERR(__slvg_merge_block_free(session, ss)); + + /* + * Step 9: Evict the newly created root page, creating a checkpoint. + */ + if (ss->root_ref.page != NULL) { + btree->ckpt = ckptbase; + ret = __wt_evict(session, &ss->root_ref, WT_REF_MEM, WT_EVICT_CALL_CLOSING); + ss->root_ref.page = NULL; + btree->ckpt = NULL; + } + +/* + * Step 10: Inform the underlying block manager that we're done. + */ +err: + WT_TRET(bm->salvage_end(bm, session)); + + /* Discard any root page we created. */ + if (ss->root_ref.page != NULL) + __wt_ref_out(session, &ss->root_ref); + + /* Discard the leaf and overflow page memory. */ + WT_TRET(__slvg_cleanup(session, ss)); + + /* Discard temporary buffers. */ + __wt_scr_free(session, &ss->tmp1); + __wt_scr_free(session, &ss->tmp2); + + return (ret); } /* * __slvg_read -- - * Read the file and build a table of the pages we can use. + * Read the file and build a table of the pages we can use. */ static int __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) { - WT_BM *bm; - WT_DECL_ITEM(as); - WT_DECL_ITEM(buf); - WT_DECL_RET; - const WT_PAGE_HEADER *dsk; - size_t addr_size; - uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE]; - bool eof, valid; - - bm = S2BT(session)->bm; - WT_ERR(__wt_scr_alloc(session, 0, &as)); - WT_ERR(__wt_scr_alloc(session, 0, &buf)); - - for (;;) { - /* Get the next block address from the block manager. */ - WT_ERR(bm->salvage_next(bm, session, addr, &addr_size, &eof)); - if (eof) - break; - - /* Report progress occasionally. 
*/ -#define WT_SALVAGE_PROGRESS_INTERVAL 100 - if (++ss->fcnt % WT_SALVAGE_PROGRESS_INTERVAL == 0) - WT_ERR(__wt_progress(session, NULL, ss->fcnt)); - - /* - * Read (and potentially decompress) the block; the underlying - * block manager might return only good blocks if checksums are - * configured, or both good and bad blocks if we're relying on - * compression. - * - * Report the block's status to the block manager. - */ - if ((ret = __wt_bt_read(session, buf, addr, addr_size)) == 0) - valid = true; - else { - valid = false; - if (ret == WT_ERROR) - ret = 0; - WT_ERR(ret); - } - WT_ERR(bm->salvage_valid(bm, session, addr, addr_size, valid)); - if (!valid) - continue; - - /* Create a printable version of the address. */ - WT_ERR(bm->addr_string(bm, session, as, addr, addr_size)); - - /* - * Make sure it's an expected page type for the file. - * - * We only care about leaf and overflow pages from here on out; - * discard all of the others. We put them on the free list now, - * because we might as well overwrite them, we want the file to - * grow as little as possible, or shrink, and future salvage - * calls don't need them either. - */ - dsk = buf->data; - switch (dsk->type) { - case WT_PAGE_BLOCK_MANAGER: - case WT_PAGE_COL_INT: - case WT_PAGE_ROW_INT: - __wt_verbose(session, WT_VERB_SALVAGE, - "%s page ignored %s", - __wt_page_type_string(dsk->type), - (const char *)as->data); - WT_ERR(bm->free(bm, session, addr, addr_size)); - continue; - } - - /* - * Verify the page. It's unlikely a page could have a valid - * checksum and still be broken, but paranoia is healthy in - * salvage. Regardless, verify does return failure because - * it detects failures we'd expect to see in a corrupted file, - * like overflow references past the end of the file or - * overflow references to non-existent pages, might as well - * discard these pages now. 
- */ - if (__wt_verify_dsk(session, as->data, buf) != 0) { - __wt_verbose(session, WT_VERB_SALVAGE, - "%s page failed verify %s", - __wt_page_type_string(dsk->type), - (const char *)as->data); - WT_ERR(bm->free(bm, session, addr, addr_size)); - continue; - } - - __wt_verbose(session, WT_VERB_SALVAGE, - "tracking %s page, generation %" PRIu64 " %s", - __wt_page_type_string(dsk->type), dsk->write_gen, - (const char *)as->data); - - switch (dsk->type) { - case WT_PAGE_COL_FIX: - case WT_PAGE_COL_VAR: - case WT_PAGE_ROW_LEAF: - if (ss->page_type == WT_PAGE_INVALID) - ss->page_type = dsk->type; - if (ss->page_type != dsk->type) - WT_ERR_MSG(session, WT_ERROR, - "file contains multiple file formats (both " - "%s and %s), and cannot be salvaged", - __wt_page_type_string(ss->page_type), - __wt_page_type_string(dsk->type)); - - WT_ERR(__slvg_trk_leaf( - session, dsk, addr, addr_size, ss)); - break; - case WT_PAGE_OVFL: - WT_ERR(__slvg_trk_ovfl( - session, dsk, addr, addr_size, ss)); - break; - } - } - -err: __wt_scr_free(session, &as); - __wt_scr_free(session, &buf); - - return (ret); + WT_BM *bm; + WT_DECL_ITEM(as); + WT_DECL_ITEM(buf); + WT_DECL_RET; + const WT_PAGE_HEADER *dsk; + size_t addr_size; + uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE]; + bool eof, valid; + + bm = S2BT(session)->bm; + WT_ERR(__wt_scr_alloc(session, 0, &as)); + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + + for (;;) { + /* Get the next block address from the block manager. */ + WT_ERR(bm->salvage_next(bm, session, addr, &addr_size, &eof)); + if (eof) + break; + +/* Report progress occasionally. */ +#define WT_SALVAGE_PROGRESS_INTERVAL 100 + if (++ss->fcnt % WT_SALVAGE_PROGRESS_INTERVAL == 0) + WT_ERR(__wt_progress(session, NULL, ss->fcnt)); + + /* + * Read (and potentially decompress) the block; the underlying + * block manager might return only good blocks if checksums are + * configured, or both good and bad blocks if we're relying on + * compression. 
+ * + * Report the block's status to the block manager. + */ + if ((ret = __wt_bt_read(session, buf, addr, addr_size)) == 0) + valid = true; + else { + valid = false; + if (ret == WT_ERROR) + ret = 0; + WT_ERR(ret); + } + WT_ERR(bm->salvage_valid(bm, session, addr, addr_size, valid)); + if (!valid) + continue; + + /* Create a printable version of the address. */ + WT_ERR(bm->addr_string(bm, session, as, addr, addr_size)); + + /* + * Make sure it's an expected page type for the file. + * + * We only care about leaf and overflow pages from here on out; + * discard all of the others. We put them on the free list now, + * because we might as well overwrite them, we want the file to + * grow as little as possible, or shrink, and future salvage + * calls don't need them either. + */ + dsk = buf->data; + switch (dsk->type) { + case WT_PAGE_BLOCK_MANAGER: + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + __wt_verbose(session, WT_VERB_SALVAGE, "%s page ignored %s", + __wt_page_type_string(dsk->type), (const char *)as->data); + WT_ERR(bm->free(bm, session, addr, addr_size)); + continue; + } + + /* + * Verify the page. It's unlikely a page could have a valid checksum and still be broken, + * but paranoia is healthy in salvage. Regardless, verify does return failure because it + * detects failures we'd expect to see in a corrupted file, like overflow references past + * the end of the file or overflow references to non-existent pages, might as well discard + * these pages now. 
+ */ + if (__wt_verify_dsk(session, as->data, buf) != 0) { + __wt_verbose(session, WT_VERB_SALVAGE, "%s page failed verify %s", + __wt_page_type_string(dsk->type), (const char *)as->data); + WT_ERR(bm->free(bm, session, addr, addr_size)); + continue; + } + + __wt_verbose(session, WT_VERB_SALVAGE, "tracking %s page, generation %" PRIu64 " %s", + __wt_page_type_string(dsk->type), dsk->write_gen, (const char *)as->data); + + switch (dsk->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + case WT_PAGE_ROW_LEAF: + if (ss->page_type == WT_PAGE_INVALID) + ss->page_type = dsk->type; + if (ss->page_type != dsk->type) + WT_ERR_MSG(session, WT_ERROR, + "file contains multiple file formats (both " + "%s and %s), and cannot be salvaged", + __wt_page_type_string(ss->page_type), __wt_page_type_string(dsk->type)); + + WT_ERR(__slvg_trk_leaf(session, dsk, addr, addr_size, ss)); + break; + case WT_PAGE_OVFL: + WT_ERR(__slvg_trk_ovfl(session, dsk, addr, addr_size, ss)); + break; + } + } + +err: + __wt_scr_free(session, &as); + __wt_scr_free(session, &buf); + + return (ret); } /* * __slvg_trk_init -- - * Initialize tracking information for a page. + * Initialize tracking information for a page. 
*/ static int -__slvg_trk_init(WT_SESSION_IMPL *session, - uint8_t *addr, size_t addr_size, - uint32_t size, uint64_t gen, WT_STUFF *ss, WT_TRACK **retp) +__slvg_trk_init(WT_SESSION_IMPL *session, uint8_t *addr, size_t addr_size, uint32_t size, + uint64_t gen, WT_STUFF *ss, WT_TRACK **retp) { - WT_DECL_RET; - WT_TRACK *trk; - - WT_RET(__wt_calloc_one(session, &trk)); - WT_ERR(__wt_calloc_one(session, &trk->shared)); - trk->shared->ref = 1; - - trk->ss = ss; - WT_ERR(__wt_memdup(session, addr, addr_size, &trk->trk_addr)); - trk->trk_addr_size = (uint8_t)addr_size; - trk->trk_size = size; - trk->trk_gen = gen; - - *retp = trk; - return (0); - -err: __wt_free(session, trk->trk_addr); - __wt_free(session, trk->shared); - __wt_free(session, trk); - return (ret); + WT_DECL_RET; + WT_TRACK *trk; + + WT_RET(__wt_calloc_one(session, &trk)); + WT_ERR(__wt_calloc_one(session, &trk->shared)); + trk->shared->ref = 1; + + trk->ss = ss; + WT_ERR(__wt_memdup(session, addr, addr_size, &trk->trk_addr)); + trk->trk_addr_size = (uint8_t)addr_size; + trk->trk_size = size; + trk->trk_gen = gen; + + *retp = trk; + return (0); + +err: + __wt_free(session, trk->trk_addr); + __wt_free(session, trk->shared); + __wt_free(session, trk); + return (ret); } /* * __slvg_trk_leaf -- - * Track a leaf page. + * Track a leaf page. */ static int -__slvg_trk_leaf(WT_SESSION_IMPL *session, - const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss) +__slvg_trk_leaf(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint8_t *addr, + size_t addr_size, WT_STUFF *ss) { - WT_BTREE *btree; - WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; - WT_DECL_RET; - WT_PAGE *page; - WT_TRACK *trk; - uint64_t stop_recno; - uint32_t i; - - btree = S2BT(session); - unpack = &_unpack; - page = NULL; - trk = NULL; - - /* Re-allocate the array of pages, as necessary. 
*/ - WT_RET(__wt_realloc_def( - session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages)); - - /* Allocate a WT_TRACK entry for this new page and fill it in. */ - WT_RET(__slvg_trk_init( - session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk)); - - switch (dsk->type) { - case WT_PAGE_COL_FIX: - /* - * Column-store fixed-sized format: start and stop keys can be - * taken from the block's header, and doesn't contain overflow - * items. - */ - trk->col_start = dsk->recno; - trk->col_stop = dsk->recno + (dsk->u.entries - 1); - - __wt_verbose(session, WT_VERB_SALVAGE, - "%s records %" PRIu64 "-%" PRIu64, - __wt_addr_string( - session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), - trk->col_start, trk->col_stop); - break; - case WT_PAGE_COL_VAR: - /* - * Column-store variable-length format: the start key can be - * taken from the block's header, stop key requires walking - * the page. - */ - stop_recno = dsk->recno; - WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { - __wt_cell_unpack(cell, unpack); - stop_recno += __wt_cell_rle(unpack); - } - - trk->col_start = dsk->recno; - trk->col_stop = stop_recno - 1; - - __wt_verbose(session, WT_VERB_SALVAGE, - "%s records %" PRIu64 "-%" PRIu64, - __wt_addr_string( - session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), - trk->col_start, trk->col_stop); - - /* Column-store pages can contain overflow items. */ - WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk)); - break; - case WT_PAGE_ROW_LEAF: - /* - * Row-store format: copy the first and last keys on the page. - * Keys are prefix-compressed, the simplest and slowest thing - * to do is instantiate the in-memory page, then instantiate - * and copy the full keys, then free the page. We do this on - * every leaf page, and if you need to speed up the salvage, - * it's probably a great place to start. - * - * Page flags are 0 because we aren't releasing the memory used - * to read the page into memory and we don't want page discard - * to free it. 
- */ - WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, &page)); - WT_ERR(__wt_row_leaf_key_copy(session, - page, &page->pg_row[0], &trk->row_start)); - WT_ERR(__wt_row_leaf_key_copy(session, - page, &page->pg_row[page->entries - 1], &trk->row_stop)); - - __wt_verbose(session, WT_VERB_SALVAGE, - "%s start key %s", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, ss->tmp1), - __wt_buf_set_printable(session, - trk->row_start.data, trk->row_start.size, ss->tmp2)); - __wt_verbose(session, WT_VERB_SALVAGE, - "%s stop key %s", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, ss->tmp1), - __wt_buf_set_printable(session, - trk->row_stop.data, trk->row_stop.size, ss->tmp2)); - - /* Row-store pages can contain overflow items. */ - WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk)); - break; - } - ss->pages[ss->pages_next++] = trk; - - if (0) { -err: __wt_free(session, trk); - } - if (page != NULL) - __wt_page_out(session, &page); - return (ret); + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + WT_DECL_RET; + WT_PAGE *page; + WT_TRACK *trk; + uint64_t stop_recno; + uint32_t i; + + btree = S2BT(session); + unpack = &_unpack; + page = NULL; + trk = NULL; + + /* Re-allocate the array of pages, as necessary. */ + WT_RET(__wt_realloc_def(session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages)); + + /* Allocate a WT_TRACK entry for this new page and fill it in. */ + WT_RET(__slvg_trk_init(session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk)); + + switch (dsk->type) { + case WT_PAGE_COL_FIX: + /* + * Column-store fixed-sized format: start and stop keys can be taken from the block's + * header, and doesn't contain overflow items. 
+ */ + trk->col_start = dsk->recno; + trk->col_stop = dsk->recno + (dsk->u.entries - 1); + + __wt_verbose(session, WT_VERB_SALVAGE, "%s records %" PRIu64 "-%" PRIu64, + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), trk->col_start, + trk->col_stop); + break; + case WT_PAGE_COL_VAR: + /* + * Column-store variable-length format: the start key can be taken from the block's header, + * stop key requires walking the page. + */ + stop_recno = dsk->recno; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) + { + __wt_cell_unpack(cell, unpack); + stop_recno += __wt_cell_rle(unpack); + } + + trk->col_start = dsk->recno; + trk->col_stop = stop_recno - 1; + + __wt_verbose(session, WT_VERB_SALVAGE, "%s records %" PRIu64 "-%" PRIu64, + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), trk->col_start, + trk->col_stop); + + /* Column-store pages can contain overflow items. */ + WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk)); + break; + case WT_PAGE_ROW_LEAF: + /* + * Row-store format: copy the first and last keys on the page. + * Keys are prefix-compressed, the simplest and slowest thing + * to do is instantiate the in-memory page, then instantiate + * and copy the full keys, then free the page. We do this on + * every leaf page, and if you need to speed up the salvage, + * it's probably a great place to start. + * + * Page flags are 0 because we aren't releasing the memory used + * to read the page into memory and we don't want page discard + * to free it. 
+ */ + WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, &page)); + WT_ERR(__wt_row_leaf_key_copy(session, page, &page->pg_row[0], &trk->row_start)); + WT_ERR( + __wt_row_leaf_key_copy(session, page, &page->pg_row[page->entries - 1], &trk->row_stop)); + + __wt_verbose(session, WT_VERB_SALVAGE, "%s start key %s", + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), + __wt_buf_set_printable(session, trk->row_start.data, trk->row_start.size, ss->tmp2)); + __wt_verbose(session, WT_VERB_SALVAGE, "%s stop key %s", + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), + __wt_buf_set_printable(session, trk->row_stop.data, trk->row_stop.size, ss->tmp2)); + + /* Row-store pages can contain overflow items. */ + WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk)); + break; + } + ss->pages[ss->pages_next++] = trk; + + if (0) { +err: + __wt_free(session, trk); + } + if (page != NULL) + __wt_page_out(session, &page); + return (ret); } /* * __slvg_trk_ovfl -- - * Track an overflow page. + * Track an overflow page. */ static int -__slvg_trk_ovfl(WT_SESSION_IMPL *session, - const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss) +__slvg_trk_ovfl(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint8_t *addr, + size_t addr_size, WT_STUFF *ss) { - WT_TRACK *trk; + WT_TRACK *trk; - /* - * Reallocate the overflow page array as necessary, then save the - * page's location information. - */ - WT_RET(__wt_realloc_def( - session, &ss->ovfl_allocated, ss->ovfl_next + 1, &ss->ovfl)); + /* + * Reallocate the overflow page array as necessary, then save the page's location information. 
+ */ + WT_RET(__wt_realloc_def(session, &ss->ovfl_allocated, ss->ovfl_next + 1, &ss->ovfl)); - WT_RET(__slvg_trk_init( - session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk)); - ss->ovfl[ss->ovfl_next++] = trk; + WT_RET(__slvg_trk_init(session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk)); + ss->ovfl[ss->ovfl_next++] = trk; - return (0); + return (0); } /* * __slvg_trk_leaf_ovfl -- - * Search a leaf page for overflow items. + * Search a leaf page for overflow items. */ static int -__slvg_trk_leaf_ovfl( - WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_TRACK *trk) +__slvg_trk_leaf_ovfl(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_TRACK *trk) { - WT_BTREE *btree; - WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; - uint32_t i, ovfl_cnt; - - btree = S2BT(session); - unpack = &_unpack; - - /* - * Two passes: count the overflow items, then copy them into an - * allocated array. - */ - ovfl_cnt = 0; - WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { - __wt_cell_unpack(cell, unpack); - if (unpack->ovfl) - ++ovfl_cnt; - } - if (ovfl_cnt == 0) - return (0); - - /* Allocate room for the array of overflow addresses and fill it in. 
*/ - WT_RET(__wt_calloc_def(session, ovfl_cnt, &trk->trk_ovfl_addr)); - trk->trk_ovfl_cnt = ovfl_cnt; - - ovfl_cnt = 0; - WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { - __wt_cell_unpack(cell, unpack); - if (unpack->ovfl) { - WT_RET(__wt_memdup(session, unpack->data, - unpack->size, &trk->trk_ovfl_addr[ovfl_cnt].addr)); - trk->trk_ovfl_addr[ovfl_cnt].size = - (uint8_t)unpack->size; - - __wt_verbose(session, WT_VERB_SALVAGE, - "%s overflow reference %s", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), - __wt_addr_string(session, - unpack->data, unpack->size, trk->ss->tmp2)); - - if (++ovfl_cnt == trk->trk_ovfl_cnt) - break; - } - } - - return (0); + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + uint32_t i, ovfl_cnt; + + btree = S2BT(session); + unpack = &_unpack; + + /* + * Two passes: count the overflow items, then copy them into an allocated array. + */ + ovfl_cnt = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) + { + __wt_cell_unpack(cell, unpack); + if (unpack->ovfl) + ++ovfl_cnt; + } + if (ovfl_cnt == 0) + return (0); + + /* Allocate room for the array of overflow addresses and fill it in. */ + WT_RET(__wt_calloc_def(session, ovfl_cnt, &trk->trk_ovfl_addr)); + trk->trk_ovfl_cnt = ovfl_cnt; + + ovfl_cnt = 0; + WT_CELL_FOREACH(btree, dsk, cell, unpack, i) + { + __wt_cell_unpack(cell, unpack); + if (unpack->ovfl) { + WT_RET( + __wt_memdup(session, unpack->data, unpack->size, &trk->trk_ovfl_addr[ovfl_cnt].addr)); + trk->trk_ovfl_addr[ovfl_cnt].size = (uint8_t)unpack->size; + + __wt_verbose(session, WT_VERB_SALVAGE, "%s overflow reference %s", + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), + __wt_addr_string(session, unpack->data, unpack->size, trk->ss->tmp2)); + + if (++ovfl_cnt == trk->trk_ovfl_cnt) + break; + } + } + + return (0); } /* * __slvg_col_range -- - * Figure out the leaf pages we need and free the leaf pages we don't. 
- * - * When pages split, the key range is split across multiple pages. If not all - * of the old versions of the page are overwritten, or not all of the new pages - * are written, or some of the pages are corrupted, salvage will read different - * pages with overlapping key ranges, at different LSNs. - * - * We salvage all of the key ranges we find, at the latest LSN value: this means - * we may resurrect pages of deleted items, as page deletion doesn't write leaf - * pages and salvage will read and instantiate the contents of an old version of - * the deleted page. - * - * The leaf page array is sorted in key order, and secondarily on LSN: what this - * means is that for each new key range, the first page we find is the best page - * for that key. The process is to walk forward from each page until we reach a - * page with a starting key after the current page's stopping key. - * - * For each of page, check to see if they overlap the current page's key range. - * If they do, resolve the overlap. Because WiredTiger rarely splits pages, - * overlap resolution usually means discarding a page because the key ranges - * are the same, and one of the pages is simply an old version of the other. - * - * However, it's possible more complex resolution is necessary. For example, - * here's an improbably complex list of page ranges and LSNs: - * - * Page Range LSN - * 30 A-G 3 - * 31 C-D 4 - * 32 B-C 5 - * 33 C-F 6 - * 34 C-D 7 - * 35 F-M 8 - * 36 H-O 9 - * - * We walk forward from each page reviewing all other pages in the array that - * overlap the range. For each overlap, the current or the overlapping - * page is updated so the page with the most recent information for any range - * "owns" that range. Here's an example for page 30. - * - * Review page 31: because page 31 has the range C-D and a higher LSN than page - * 30, page 30 would "split" into two ranges, A-C and E-G, conceding the C-D - * range to page 31. 
The new track element would be inserted into array with - * the following result: - * - * Page Range LSN - * 30 A-C 3 << Changed WT_TRACK element - * 31 C-D 4 - * 32 B-C 5 - * 33 C-F 6 - * 34 C-D 7 - * 30 E-G 3 << New WT_TRACK element - * 35 F-M 8 - * 36 H-O 9 - * - * Continue the review of the first element, using its new values. - * - * Review page 32: because page 31 has the range B-C and a higher LSN than page - * 30, page 30's A-C range would be truncated, conceding the B-C range to page - * 32. - * 30 A-B 3 - * E-G 3 - * 31 C-D 4 - * 32 B-C 5 - * 33 C-F 6 - * 34 C-D 7 - * - * Review page 33: because page 33 has a starting key (C) past page 30's ending - * key (B), we stop evaluating page 30's A-B range, as there can be no further - * overlaps. - * - * This process is repeated for each page in the array. - * - * When page 33 is processed, we'd discover that page 33's C-F range overlaps - * page 30's E-G range, and page 30's E-G range would be updated, conceding the - * E-F range to page 33. - * - * This is not computationally expensive because we don't walk far forward in - * the leaf array because it's sorted by starting key, and because WiredTiger - * splits are rare, the chance of finding the kind of range overlap requiring - * re-sorting the array is small. + * Figure out the leaf pages we need and free the leaf pages we don't. When pages split, the key + * range is split across multiple pages. If not all of the old versions of the page are + * overwritten, or not all of the new pages are written, or some of the pages are corrupted, + * salvage will read different pages with overlapping key ranges, at different LSNs. We salvage + * all of the key ranges we find, at the latest LSN value: this means we may resurrect pages of + * deleted items, as page deletion doesn't write leaf pages and salvage will read and + * instantiate the contents of an old version of the deleted page. 
The leaf page array is sorted + * in key order, and secondarily on LSN: what this means is that for each new key range, the + * first page we find is the best page for that key. The process is to walk forward from each + * page until we reach a page with a starting key after the current page's stopping key. For + * each of page, check to see if they overlap the current page's key range. If they do, resolve + * the overlap. Because WiredTiger rarely splits pages, overlap resolution usually means + * discarding a page because the key ranges are the same, and one of the pages is simply an old + * version of the other. However, it's possible more complex resolution is necessary. For + * example, here's an improbably complex list of page ranges and LSNs: Page Range LSN 30 A-G 3 + * 31 C-D 4 32 B-C 5 33 C-F 6 34 C-D 7 35 F-M 8 36 H-O 9 We walk forward from each page + * reviewing all other pages in the array that overlap the range. For each overlap, the current + * or the overlapping page is updated so the page with the most recent information for any range + * "owns" that range. Here's an example for page 30. Review page 31: because page 31 has the + * range C-D and a higher LSN than page 30, page 30 would "split" into two ranges, A-C and E-G, + * conceding the C-D range to page 31. The new track element would be inserted into array with + * the following result: Page Range LSN 30 A-C 3 << Changed WT_TRACK element 31 C-D 4 32 B-C 5 + * 33 C-F 6 34 C-D 7 30 E-G 3 << New WT_TRACK element 35 F-M 8 36 H-O 9 Continue the review of + * the first element, using its new values. Review page 32: because page 31 has the range B-C + * and a higher LSN than page 30, page 30's A-C range would be truncated, conceding the B-C + * range to page 32. 30 A-B 3 E-G 3 31 C-D 4 32 B-C 5 33 C-F 6 34 C-D 7 Review page 33: because + * page 33 has a starting key (C) past page 30's ending key (B), we stop evaluating page 30's + * A-B range, as there can be no further overlaps. 
This process is repeated for each page in the + * array. When page 33 is processed, we'd discover that page 33's C-F range overlaps page 30's + * E-G range, and page 30's E-G range would be updated, conceding the E-F range to page 33. This + * is not computationally expensive because we don't walk far forward in the leaf array because + * it's sorted by starting key, and because WiredTiger splits are rare, the chance of finding + * the kind of range overlap requiring re-sorting the array is small. */ static int __slvg_col_range(WT_SESSION_IMPL *session, WT_STUFF *ss) { - WT_TRACK *jtrk; - uint32_t i, j; - - /* - * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR - * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE - * BEING HANDLED. - * - * Walk the page array looking for overlapping key ranges, adjusting - * the ranges based on the LSN until there are no overlaps. - * - * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE - * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE - * PLUS OFFSET. - */ - for (i = 0; i < ss->pages_next; ++i) { - if (ss->pages[i] == NULL) - continue; - - /* Check for pages that overlap our page. */ - for (j = i + 1; j < ss->pages_next; ++j) { - if (ss->pages[j] == NULL) - continue; - /* - * We're done if this page starts after our stop, no - * subsequent pages can overlap our page. - */ - if (ss->pages[j]->col_start > - ss->pages[i]->col_stop) - break; - - /* There's an overlap, fix it up. */ - jtrk = ss->pages[j]; - WT_RET(__slvg_col_range_overlap(session, i, j, ss)); - - /* - * If the overlap resolution changed the entry's start - * key, the entry might have moved and the page array - * re-sorted, and pages[j] would reference a different - * page. We don't move forward if that happened, we - * re-process the slot again (by decrementing j before - * the loop's increment). 
- */ - if (ss->pages[j] != NULL && jtrk != ss->pages[j]) - --j; - } - } - return (0); + WT_TRACK *jtrk; + uint32_t i, j; + + /* + * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR + * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE + * BEING HANDLED. + * + * Walk the page array looking for overlapping key ranges, adjusting + * the ranges based on the LSN until there are no overlaps. + * + * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE + * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE + * PLUS OFFSET. + */ + for (i = 0; i < ss->pages_next; ++i) { + if (ss->pages[i] == NULL) + continue; + + /* Check for pages that overlap our page. */ + for (j = i + 1; j < ss->pages_next; ++j) { + if (ss->pages[j] == NULL) + continue; + /* + * We're done if this page starts after our stop, no subsequent pages can overlap our + * page. + */ + if (ss->pages[j]->col_start > ss->pages[i]->col_stop) + break; + + /* There's an overlap, fix it up. */ + jtrk = ss->pages[j]; + WT_RET(__slvg_col_range_overlap(session, i, j, ss)); + + /* + * If the overlap resolution changed the entry's start key, the entry might have moved + * and the page array re-sorted, and pages[j] would reference a different page. We don't + * move forward if that happened, we re-process the slot again (by decrementing j before + * the loop's increment). + */ + if (ss->pages[j] != NULL && jtrk != ss->pages[j]) + --j; + } + } + return (0); } /* * __slvg_col_range_overlap -- - * Two column-store key ranges overlap, deal with it. + * Two column-store key ranges overlap, deal with it. 
*/ static int -__slvg_col_range_overlap( - WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss) +__slvg_col_range_overlap(WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss) { - WT_DECL_RET; - WT_TRACK *a_trk, *b_trk, *new; - uint32_t i; - - /* - * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR - * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE - * BEING HANDLED. - */ - a_trk = ss->pages[a_slot]; - b_trk = ss->pages[b_slot]; - - __wt_verbose(session, WT_VERB_SALVAGE, - "%s and %s range overlap", - __wt_addr_string( - session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1), - __wt_addr_string( - session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)); - - /* - * The key ranges of two WT_TRACK pages in the array overlap -- choose - * the ranges we're going to take from each. - * - * We can think of the overlap possibilities as 11 different cases: - * - * AAAAAAAAAAAAAAAAAA - * #1 BBBBBBBBBBBBBBBBBB pages are the same - * #2 BBBBBBBBBBBBB overlaps the beginning - * #3 BBBBBBBBBBBBBBBB overlaps the end - * #4 BBBBB B is a prefix of A - * #5 BBBBBB B is middle of A - * #6 BBBBBBBBBB B is a suffix of A - * - * and: - * - * BBBBBBBBBBBBBBBBBB - * #7 AAAAAAAAAAAAA same as #3 - * #8 AAAAAAAAAAAAAAAA same as #2 - * #9 AAAAA A is a prefix of B - * #10 AAAAAA A is middle of B - * #11 AAAAAAAAAA A is a suffix of B - * - * Note the leaf page array was sorted by key and a_trk appears earlier - * in the array than b_trk, so cases #2/8, #10 and #11 are impossible. - * - * Finally, there's one additional complicating factor -- final ranges - * are assigned based on the page's LSN. 
- */ - /* Case #2/8, #10, #11 */ - if (a_trk->col_start > b_trk->col_start) - WT_PANIC_RET( - session, EINVAL, "unexpected merge array sort order"); - - if (a_trk->col_start == b_trk->col_start) { /* Case #1, #4 and #9 */ - /* - * The secondary sort of the leaf page array was the page's LSN, - * in high-to-low order, which means a_trk has a higher LSN, and - * is more desirable, than b_trk. In cases #1 and #4 and #9, - * where the start of the range is the same for the two pages, - * this simplifies things, it guarantees a_trk has a higher LSN - * than b_trk. - */ - if (a_trk->col_stop >= b_trk->col_stop) - /* - * Case #1, #4: a_trk is a superset of b_trk, and a_trk - * is more desirable -- discard b_trk. - */ - goto delete_b; - - /* - * Case #9: b_trk is a superset of a_trk, but a_trk is more - * desirable: keep both but delete a_trk's key range from - * b_trk. - */ - b_trk->col_start = a_trk->col_stop + 1; - __slvg_col_trk_update_start(b_slot, ss); - F_SET(b_trk, WT_TRACK_MERGE); - goto merge; - } - - if (a_trk->col_stop == b_trk->col_stop) { /* Case #6 */ - if (a_trk->trk_gen > b_trk->trk_gen) - /* - * Case #6: a_trk is a superset of b_trk and a_trk is - * more desirable -- discard b_trk. - */ - goto delete_b; - - /* - * Case #6: a_trk is a superset of b_trk, but b_trk is more - * desirable: keep both but delete b_trk's key range from a_trk. 
- */ - a_trk->col_stop = b_trk->col_start - 1; - F_SET(a_trk, WT_TRACK_MERGE); - goto merge; - } - - if (a_trk->col_stop < b_trk->col_stop) { /* Case #3/7 */ - if (a_trk->trk_gen > b_trk->trk_gen) { - /* - * Case #3/7: a_trk is more desirable, delete a_trk's - * key range from b_trk; - */ - b_trk->col_start = a_trk->col_stop + 1; - __slvg_col_trk_update_start(b_slot, ss); - F_SET(b_trk, WT_TRACK_MERGE); - } else { - /* - * Case #3/7: b_trk is more desirable, delete b_trk's - * key range from a_trk; - */ - a_trk->col_stop = b_trk->col_start - 1; - F_SET(a_trk, WT_TRACK_MERGE); - } - goto merge; - } - - /* - * Case #5: a_trk is a superset of b_trk and a_trk is more desirable -- - * discard b_trk. - */ - if (a_trk->trk_gen > b_trk->trk_gen) { -delete_b: /* - * After page and overflow reconciliation, one (and only one) - * page can reference an overflow record. But, if we split a - * page into multiple chunks, any of the chunks might own any - * of the backing overflow records, so overflow records won't - * normally be discarded until after the merge phase completes. - * (The merge phase is where the final pages are written, and - * we figure out which overflow records are actually used.) - * If freeing a chunk and there are no other references to the - * underlying shared information, the overflow records must be - * useless, discard them to keep the final file size small. - */ - if (b_trk->shared->ref == 1) - for (i = 0; i < b_trk->trk_ovfl_cnt; ++i) - WT_RET(__slvg_trk_free(session, - &ss->ovfl[b_trk->trk_ovfl_slot[i]], true)); - return (__slvg_trk_free(session, &ss->pages[b_slot], true)); - } - - /* - * Case #5: b_trk is more desirable and is a middle chunk of a_trk. - * Split a_trk into two parts, the key range before b_trk and the - * key range after b_trk. - * - * Allocate a new WT_TRACK object, and extend the array of pages as - * necessary. 
- */ - WT_RET(__wt_calloc_one(session, &new)); - if ((ret = __wt_realloc_def(session, - &ss->pages_allocated, ss->pages_next + 1, &ss->pages)) != 0) { - __wt_free(session, new); - return (ret); - } - - /* - * First, set up the track share (we do this after the allocation to - * ensure the shared reference count is never incorrect). - */ - new->shared = a_trk->shared; - new->ss = a_trk->ss; - ++new->shared->ref; - - /* - * Second, insert the new element into the array after the existing - * element (that's probably wrong, but we'll fix it up in a second). - */ - memmove(ss->pages + a_slot + 1, ss->pages + a_slot, - (ss->pages_next - a_slot) * sizeof(*ss->pages)); - ss->pages[a_slot + 1] = new; - ++ss->pages_next; - - /* - * Third, set its start key to be the first key after the stop key of - * the middle chunk (that's b_trk), and its stop key to be the stop key - * of the original chunk, and call __slvg_col_trk_update_start. That - * function will re-sort the WT_TRACK array as necessary to move our - * new entry into the right sorted location. - */ - new->col_start = b_trk->col_stop + 1; - new->col_stop = a_trk->col_stop; - __slvg_col_trk_update_start(a_slot + 1, ss); - - /* - * Fourth, set the original WT_TRACK information to reference only - * the initial key space in the page, that is, everything up to the - * starting key of the middle chunk (that's b_trk). - */ - a_trk->col_stop = b_trk->col_start - 1; - - F_SET(new, WT_TRACK_MERGE); - F_SET(a_trk, WT_TRACK_MERGE); - -merge: __wt_verbose(session, WT_VERB_SALVAGE, - "%s and %s require merge", - __wt_addr_string( - session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1), - __wt_addr_string( - session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)); - return (0); + WT_DECL_RET; + WT_TRACK *a_trk, *b_trk, *new; + uint32_t i; + + /* + * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR COLUMN-STORE CODE: THEY + * ARE IDENTICAL OTHER THAN THE PAGES THAT ARE BEING HANDLED. 
+ */ + a_trk = ss->pages[a_slot]; + b_trk = ss->pages[b_slot]; + + __wt_verbose(session, WT_VERB_SALVAGE, "%s and %s range overlap", + __wt_addr_string(session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1), + __wt_addr_string(session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)); + + /* + * The key ranges of two WT_TRACK pages in the array overlap -- choose + * the ranges we're going to take from each. + * + * We can think of the overlap possibilities as 11 different cases: + * + * AAAAAAAAAAAAAAAAAA + * #1 BBBBBBBBBBBBBBBBBB pages are the same + * #2 BBBBBBBBBBBBB overlaps the beginning + * #3 BBBBBBBBBBBBBBBB overlaps the end + * #4 BBBBB B is a prefix of A + * #5 BBBBBB B is middle of A + * #6 BBBBBBBBBB B is a suffix of A + * + * and: + * + * BBBBBBBBBBBBBBBBBB + * #7 AAAAAAAAAAAAA same as #3 + * #8 AAAAAAAAAAAAAAAA same as #2 + * #9 AAAAA A is a prefix of B + * #10 AAAAAA A is middle of B + * #11 AAAAAAAAAA A is a suffix of B + * + * Note the leaf page array was sorted by key and a_trk appears earlier + * in the array than b_trk, so cases #2/8, #10 and #11 are impossible. + * + * Finally, there's one additional complicating factor -- final ranges + * are assigned based on the page's LSN. + */ + /* Case #2/8, #10, #11 */ + if (a_trk->col_start > b_trk->col_start) + WT_PANIC_RET(session, EINVAL, "unexpected merge array sort order"); + + if (a_trk->col_start == b_trk->col_start) { /* Case #1, #4 and #9 */ + /* + * The secondary sort of the leaf page array was the + * page's LSN, in high-to-low order, which means + * a_trk has a higher LSN, and is more desirable, + * than b_trk. In cases #1 and #4 and #9, where the + * start of the range is the same for the two pages, + * this simplifies things, it guarantees a_trk has a + * higher LSN than b_trk. + */ + if (a_trk->col_stop >= b_trk->col_stop) + /* + * Case #1, #4: a_trk is a superset of b_trk, and a_trk is more desirable -- discard + * b_trk. 
+ */ + goto delete_b; + + /* + * Case #9: b_trk is a superset of a_trk, but a_trk is more desirable: keep both but delete + * a_trk's key range from b_trk. + */ + b_trk->col_start = a_trk->col_stop + 1; + __slvg_col_trk_update_start(b_slot, ss); + F_SET(b_trk, WT_TRACK_MERGE); + goto merge; + } + + if (a_trk->col_stop == b_trk->col_stop) { /* Case #6 */ + if (a_trk->trk_gen > b_trk->trk_gen) + /* + * Case #6: a_trk is a superset of b_trk and a_trk is more desirable -- discard b_trk. + */ + goto delete_b; + + /* + * Case #6: a_trk is a superset of b_trk, but b_trk is more desirable: keep both but delete + * b_trk's key range from a_trk. + */ + a_trk->col_stop = b_trk->col_start - 1; + F_SET(a_trk, WT_TRACK_MERGE); + goto merge; + } + + if (a_trk->col_stop < b_trk->col_stop) { /* Case #3/7 */ + if (a_trk->trk_gen > b_trk->trk_gen) { + /* + * Case #3/7: a_trk is more desirable, delete a_trk's key range from b_trk; + */ + b_trk->col_start = a_trk->col_stop + 1; + __slvg_col_trk_update_start(b_slot, ss); + F_SET(b_trk, WT_TRACK_MERGE); + } else { + /* + * Case #3/7: b_trk is more desirable, delete b_trk's key range from a_trk; + */ + a_trk->col_stop = b_trk->col_start - 1; + F_SET(a_trk, WT_TRACK_MERGE); + } + goto merge; + } + + /* + * Case #5: a_trk is a superset of b_trk and a_trk is more desirable -- discard b_trk. + */ + if (a_trk->trk_gen > b_trk->trk_gen) { + delete_b: + /* + * After page and overflow reconciliation, one (and only one) + * page can reference an overflow record. But, if we split a + * page into multiple chunks, any of the chunks might own any + * of the backing overflow records, so overflow records won't + * normally be discarded until after the merge phase completes. + * (The merge phase is where the final pages are written, and + * we figure out which overflow records are actually used.) 
+ * If freeing a chunk and there are no other references to the + * underlying shared information, the overflow records must be + * useless, discard them to keep the final file size small. + */ + if (b_trk->shared->ref == 1) + for (i = 0; i < b_trk->trk_ovfl_cnt; ++i) + WT_RET(__slvg_trk_free(session, &ss->ovfl[b_trk->trk_ovfl_slot[i]], true)); + return (__slvg_trk_free(session, &ss->pages[b_slot], true)); + } + + /* + * Case #5: b_trk is more desirable and is a middle chunk of a_trk. + * Split a_trk into two parts, the key range before b_trk and the + * key range after b_trk. + * + * Allocate a new WT_TRACK object, and extend the array of pages as + * necessary. + */ + WT_RET(__wt_calloc_one(session, &new)); + if ((ret = __wt_realloc_def(session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages)) != + 0) { + __wt_free(session, new); + return (ret); + } + + /* + * First, set up the track share (we do this after the allocation to ensure the shared reference + * count is never incorrect). + */ + new->shared = a_trk->shared; + new->ss = a_trk->ss; + ++new->shared->ref; + + /* + * Second, insert the new element into the array after the existing element (that's probably + * wrong, but we'll fix it up in a second). + */ + memmove( + ss->pages + a_slot + 1, ss->pages + a_slot, (ss->pages_next - a_slot) * sizeof(*ss->pages)); + ss->pages[a_slot + 1] = new; + ++ss->pages_next; + + /* + * Third, set its start key to be the first key after the stop key of the middle chunk (that's + * b_trk), and its stop key to be the stop key of the original chunk, and call + * __slvg_col_trk_update_start. That function will re-sort the WT_TRACK array as necessary to + * move our new entry into the right sorted location. 
+ */ + new->col_start = b_trk->col_stop + 1; + new->col_stop = a_trk->col_stop; + __slvg_col_trk_update_start(a_slot + 1, ss); + + /* + * Fourth, set the original WT_TRACK information to reference only the initial key space in the + * page, that is, everything up to the starting key of the middle chunk (that's b_trk). + */ + a_trk->col_stop = b_trk->col_start - 1; + + F_SET(new, WT_TRACK_MERGE); + F_SET(a_trk, WT_TRACK_MERGE); + +merge: + __wt_verbose(session, WT_VERB_SALVAGE, "%s and %s require merge", + __wt_addr_string(session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1), + __wt_addr_string(session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)); + return (0); } /* * __slvg_col_trk_update_start -- - * Update a column-store page's start key after an overlap. + * Update a column-store page's start key after an overlap. */ static void __slvg_col_trk_update_start(uint32_t slot, WT_STUFF *ss) { - WT_TRACK *trk; - uint32_t i; - - trk = ss->pages[slot]; - - /* - * If we deleted an initial piece of the WT_TRACK name space, it may no - * longer be in the right location. - * - * For example, imagine page #1 has the key range 30-50, it split, and - * we wrote page #2 with key range 30-40, and page #3 key range with - * 40-50, where pages #2 and #3 have larger LSNs than page #1. When the - * key ranges were sorted, page #2 came first, then page #1 (because of - * their earlier start keys than page #3), and page #2 came before page - * #1 because of its LSN. When we resolve the overlap between page #2 - * and page #1, we truncate the initial key range of page #1, and it now - * sorts after page #3, because it has the same starting key of 40, and - * a lower LSN. - * - * We have already updated b_trk's start key; what we may have to do is - * re-sort some number of elements in the list. 
- */ - for (i = slot + 1; i < ss->pages_next; ++i) { - if (ss->pages[i] == NULL) - continue; - if (ss->pages[i]->col_start > trk->col_stop) - break; - } - i -= slot; - if (i > 1) - __wt_qsort(ss->pages + slot, (size_t)i, - sizeof(WT_TRACK *), __slvg_trk_compare_key); + WT_TRACK *trk; + uint32_t i; + + trk = ss->pages[slot]; + + /* + * If we deleted an initial piece of the WT_TRACK name space, it may no + * longer be in the right location. + * + * For example, imagine page #1 has the key range 30-50, it split, and + * we wrote page #2 with key range 30-40, and page #3 key range with + * 40-50, where pages #2 and #3 have larger LSNs than page #1. When the + * key ranges were sorted, page #2 came first, then page #1 (because of + * their earlier start keys than page #3), and page #2 came before page + * #1 because of its LSN. When we resolve the overlap between page #2 + * and page #1, we truncate the initial key range of page #1, and it now + * sorts after page #3, because it has the same starting key of 40, and + * a lower LSN. + * + * We have already updated b_trk's start key; what we may have to do is + * re-sort some number of elements in the list. + */ + for (i = slot + 1; i < ss->pages_next; ++i) { + if (ss->pages[i] == NULL) + continue; + if (ss->pages[i]->col_start > trk->col_stop) + break; + } + i -= slot; + if (i > 1) + __wt_qsort(ss->pages + slot, (size_t)i, sizeof(WT_TRACK *), __slvg_trk_compare_key); } /* * __slvg_col_range_missing -- - * Detect missing ranges from column-store files. + * Detect missing ranges from column-store files. 
*/ static void __slvg_col_range_missing(WT_SESSION_IMPL *session, WT_STUFF *ss) { - WT_TRACK *trk; - uint64_t r; - uint32_t i; - - for (i = 0, r = 0; i < ss->pages_next; ++i) { - if ((trk = ss->pages[i]) == NULL) - continue; - if (trk->col_start != r + 1) { - __wt_verbose(session, WT_VERB_SALVAGE, - "%s column-store missing range from %" - PRIu64 " to %" PRIu64 " inclusive", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, ss->tmp1), - r + 1, trk->col_start - 1); - - /* - * We need to instantiate deleted items for the missing - * record range. - */ - trk->col_missing = r + 1; - F_SET(trk, WT_TRACK_MERGE); - } - r = trk->col_stop; - } + WT_TRACK *trk; + uint64_t r; + uint32_t i; + + for (i = 0, r = 0; i < ss->pages_next; ++i) { + if ((trk = ss->pages[i]) == NULL) + continue; + if (trk->col_start != r + 1) { + __wt_verbose(session, WT_VERB_SALVAGE, + "%s column-store missing range from %" PRIu64 " to %" PRIu64 " inclusive", + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), r + 1, + trk->col_start - 1); + + /* + * We need to instantiate deleted items for the missing record range. + */ + trk->col_missing = r + 1; + F_SET(trk, WT_TRACK_MERGE); + } + r = trk->col_stop; + } } /* * __slvg_modify_init -- - * Initialize a salvage page's modification information. + * Initialize a salvage page's modification information. */ static int __slvg_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page) { - WT_RET(__wt_page_modify_init(session, page)); - __wt_page_modify_set(session, page); + WT_RET(__wt_page_modify_init(session, page)); + __wt_page_modify_set(session, page); - return (0); + return (0); } /* * __slvg_col_build_internal -- - * Build a column-store in-memory page that references all of the leaf - * pages we've found. + * Build a column-store in-memory page that references all of the leaf pages we've found. 
*/ static int -__slvg_col_build_internal( - WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) +__slvg_col_build_internal(WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) { - WT_ADDR *addr; - WT_DECL_RET; - WT_PAGE *page; - WT_PAGE_INDEX *pindex; - WT_REF *ref, **refp; - WT_TRACK *trk; - uint32_t i; - - addr = NULL; - - /* Allocate a column-store root (internal) page and fill it in. */ - WT_RET(__wt_page_alloc( - session, WT_PAGE_COL_INT, leaf_cnt, true, &page)); - WT_ERR(__slvg_modify_init(session, page)); - - pindex = WT_INTL_INDEX_GET_SAFE(page); - for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) { - if ((trk = ss->pages[i]) == NULL) - continue; - - ref = *refp++; - ref->home = page; - ref->page = NULL; - - WT_ERR(__wt_calloc_one(session, &addr)); - WT_ERR(__wt_memdup( - session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); - addr->size = trk->trk_addr_size; - addr->type = - trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF; - ref->addr = addr; - addr = NULL; - - ref->ref_recno = trk->col_start; - WT_REF_SET_STATE(ref, WT_REF_DISK); - - /* - * If the page's key range is unmodified from when we read it - * (in other words, we didn't merge part of this page with - * another page), we can use the page without change, and the - * only thing we need to do is mark all overflow records the - * page references as in-use. - * - * If we did merge with another page, we have to build a page - * reflecting the updated key range. Note, that requires an - * additional pass to free the merge page's backing blocks. 
- */ - if (F_ISSET(trk, WT_TRACK_MERGE)) { - ss->merge_free = true; - - WT_ERR(__slvg_col_build_leaf(session, trk, ref)); - } else - WT_ERR(__slvg_ovfl_ref_all(session, trk)); - ++ref; - } - - __wt_root_ref_init(session, &ss->root_ref, page, true); - - if (0) { -err: __wt_free(session, addr); - __wt_page_out(session, &page); - } - return (ret); + WT_ADDR *addr; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF *ref, **refp; + WT_TRACK *trk; + uint32_t i; + + addr = NULL; + + /* Allocate a column-store root (internal) page and fill it in. */ + WT_RET(__wt_page_alloc(session, WT_PAGE_COL_INT, leaf_cnt, true, &page)); + WT_ERR(__slvg_modify_init(session, page)); + + pindex = WT_INTL_INDEX_GET_SAFE(page); + for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) { + if ((trk = ss->pages[i]) == NULL) + continue; + + ref = *refp++; + ref->home = page; + ref->page = NULL; + + WT_ERR(__wt_calloc_one(session, &addr)); + WT_ERR(__wt_memdup(session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); + addr->size = trk->trk_addr_size; + addr->type = trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF; + ref->addr = addr; + addr = NULL; + + ref->ref_recno = trk->col_start; + WT_REF_SET_STATE(ref, WT_REF_DISK); + + /* + * If the page's key range is unmodified from when we read it + * (in other words, we didn't merge part of this page with + * another page), we can use the page without change, and the + * only thing we need to do is mark all overflow records the + * page references as in-use. + * + * If we did merge with another page, we have to build a page + * reflecting the updated key range. Note, that requires an + * additional pass to free the merge page's backing blocks. 
+ */ + if (F_ISSET(trk, WT_TRACK_MERGE)) { + ss->merge_free = true; + + WT_ERR(__slvg_col_build_leaf(session, trk, ref)); + } else + WT_ERR(__slvg_ovfl_ref_all(session, trk)); + ++ref; + } + + __wt_root_ref_init(session, &ss->root_ref, page, true); + + if (0) { +err: + __wt_free(session, addr); + __wt_page_out(session, &page); + } + return (ret); } /* * __slvg_col_build_leaf -- - * Build a column-store leaf page for a merged page. + * Build a column-store leaf page for a merged page. */ static int __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) { - WT_COL *save_col_var; - WT_DECL_RET; - WT_PAGE *page; - WT_SALVAGE_COOKIE *cookie, _cookie; - uint64_t recno, skip, take; - uint32_t save_entries; - - cookie = &_cookie; - WT_CLEAR(*cookie); - - /* Get the original page, including the full in-memory setup. */ - WT_RET(__wt_page_in(session, ref, 0)); - page = ref->page; - - save_col_var = page->pg_var; - save_entries = page->entries; - - /* - * Calculate the number of K/V entries we are going to skip, and - * the total number of K/V entries we'll take from this page. - */ - recno = page->dsk->recno; - cookie->skip = skip = trk->col_start - recno; - cookie->take = take = (trk->col_stop - trk->col_start) + 1; - - __wt_verbose(session, WT_VERB_SALVAGE, - "%s merge discarding first %" PRIu64 " records, " - "then taking %" PRIu64 " records", - __wt_addr_string( - session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), - skip, take); - - /* Set the referenced flag on overflow pages we're using. */ - if (page->type == WT_PAGE_COL_VAR && trk->trk_ovfl_cnt != 0) - WT_ERR(__slvg_col_ovfl(session, trk, page, recno, skip, take)); - - /* - * If we're missing some part of the range, the real start range is in - * trk->col_missing, else, it's in trk->col_start. Update the parent's - * reference as well as the page itself. 
- */ - if (trk->col_missing == 0) - ref->ref_recno = trk->col_start; - else { - ref->ref_recno = trk->col_missing; - cookie->missing = trk->col_start - trk->col_missing; - - __wt_verbose(session, WT_VERB_SALVAGE, - "%s merge inserting %" PRIu64 " missing records", - __wt_addr_string( - session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), - cookie->missing); - } - - /* - * We can't discard the original blocks associated with this page now. - * (The problem is we don't want to overwrite any original information - * until the salvage run succeeds -- if we free the blocks now, the next - * merge page we write might allocate those blocks and overwrite them, - * and should the salvage run eventually fail, the original information - * would have been lost.) Clear the reference addr so eviction doesn't - * free the underlying blocks. - */ - __wt_ref_addr_free(session, ref); - - /* Write the new version of the leaf page to disk. */ - WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile( - session, ref, cookie, WT_REC_VISIBILITY_ERR, NULL)); - - /* Reset the page. */ - page->pg_var = save_col_var; - page->entries = save_entries; - - ret = __wt_page_release(session, ref, 0); - if (ret == 0) - ret = __wt_evict(session, ref, WT_REF_MEM, - WT_EVICT_CALL_CLOSING); - - if (0) { -err: WT_TRET(__wt_page_release(session, ref, 0)); - } - - return (ret); + WT_COL *save_col_var; + WT_DECL_RET; + WT_PAGE *page; + WT_SALVAGE_COOKIE *cookie, _cookie; + uint64_t recno, skip, take; + uint32_t save_entries; + + cookie = &_cookie; + WT_CLEAR(*cookie); + + /* Get the original page, including the full in-memory setup. */ + WT_RET(__wt_page_in(session, ref, 0)); + page = ref->page; + + save_col_var = page->pg_var; + save_entries = page->entries; + + /* + * Calculate the number of K/V entries we are going to skip, and the total number of K/V entries + * we'll take from this page. 
+ */ + recno = page->dsk->recno; + cookie->skip = skip = trk->col_start - recno; + cookie->take = take = (trk->col_stop - trk->col_start) + 1; + + __wt_verbose(session, WT_VERB_SALVAGE, "%s merge discarding first %" PRIu64 + " records, " + "then taking %" PRIu64 " records", + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), skip, take); + + /* Set the referenced flag on overflow pages we're using. */ + if (page->type == WT_PAGE_COL_VAR && trk->trk_ovfl_cnt != 0) + WT_ERR(__slvg_col_ovfl(session, trk, page, recno, skip, take)); + + /* + * If we're missing some part of the range, the real start range is in trk->col_missing, else, + * it's in trk->col_start. Update the parent's reference as well as the page itself. + */ + if (trk->col_missing == 0) + ref->ref_recno = trk->col_start; + else { + ref->ref_recno = trk->col_missing; + cookie->missing = trk->col_start - trk->col_missing; + + __wt_verbose(session, WT_VERB_SALVAGE, "%s merge inserting %" PRIu64 " missing records", + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), + cookie->missing); + } + + /* + * We can't discard the original blocks associated with this page now. + * (The problem is we don't want to overwrite any original information + * until the salvage run succeeds -- if we free the blocks now, the next + * merge page we write might allocate those blocks and overwrite them, + * and should the salvage run eventually fail, the original information + * would have been lost.) Clear the reference addr so eviction doesn't + * free the underlying blocks. + */ + __wt_ref_addr_free(session, ref); + + /* Write the new version of the leaf page to disk. */ + WT_ERR(__slvg_modify_init(session, page)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_REC_VISIBILITY_ERR, NULL)); + + /* Reset the page. 
*/ + page->pg_var = save_col_var; + page->entries = save_entries; + + ret = __wt_page_release(session, ref, 0); + if (ret == 0) + ret = __wt_evict(session, ref, WT_REF_MEM, WT_EVICT_CALL_CLOSING); + + if (0) { +err: + WT_TRET(__wt_page_release(session, ref, 0)); + } + + return (ret); } /* * __slvg_col_ovfl_single -- - * Find a single overflow record in the merge page's list, and mark it as - * referenced. + * Find a single overflow record in the merge page's list, and mark it as referenced. */ static int -__slvg_col_ovfl_single( - WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack) +__slvg_col_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack) { - WT_TRACK *ovfl; - uint32_t i; - - /* - * Search the list of overflow records for this page -- we should find - * exactly one match, and we mark it as referenced. - */ - for (i = 0; i < trk->trk_ovfl_cnt; ++i) { - ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]]; - if (unpack->size == ovfl->trk_addr_size && - memcmp(unpack->data, ovfl->trk_addr, unpack->size) == 0) - return (__slvg_ovfl_ref(session, ovfl, false)); - } - - WT_PANIC_RET(session, - EINVAL, "overflow record at column-store page merge not found"); + WT_TRACK *ovfl; + uint32_t i; + + /* + * Search the list of overflow records for this page -- we should find exactly one match, and we + * mark it as referenced. + */ + for (i = 0; i < trk->trk_ovfl_cnt; ++i) { + ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]]; + if (unpack->size == ovfl->trk_addr_size && + memcmp(unpack->data, ovfl->trk_addr, unpack->size) == 0) + return (__slvg_ovfl_ref(session, ovfl, false)); + } + + WT_PANIC_RET(session, EINVAL, "overflow record at column-store page merge not found"); } /* * __slvg_col_ovfl -- - * Mark overflow items referenced by the merged page. + * Mark overflow items referenced by the merged page. 
*/ static int -__slvg_col_ovfl(WT_SESSION_IMPL *session, WT_TRACK *trk, - WT_PAGE *page, uint64_t recno, uint64_t skip, uint64_t take) +__slvg_col_ovfl(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_PAGE *page, uint64_t recno, + uint64_t skip, uint64_t take) { - WT_CELL *cell; - WT_CELL_UNPACK unpack; - WT_COL *cip; - WT_DECL_RET; - uint64_t start, stop; - uint32_t i; - - /* - * Merging a variable-length column-store page, and we took some number - * of records, figure out which (if any) overflow records we used. - */ - start = recno + skip; - stop = (recno + skip + take) - 1; - - WT_COL_FOREACH(page, cip, i) { - cell = WT_COL_PTR(page, cip); - __wt_cell_unpack(cell, &unpack); - recno += __wt_cell_rle(&unpack); - - /* - * I keep getting this calculation wrong, so here's the logic. - * Start is the first record we want, stop is the last record - * we want. The record number has already been incremented one - * past the maximum record number for this page entry, that is, - * it's set to the first record number for the next page entry. - * The test of start should be greater-than (not greater-than- - * or-equal), because of that increment, if the record number - * equals start, we want the next record, not this one. The - * test against stop is greater-than, not greater-than-or-equal - * because stop is the last record wanted, if the record number - * equals stop, we want the next record. - */ - if (recno > start && unpack.type == WT_CELL_VALUE_OVFL) { - ret = __slvg_col_ovfl_single(session, trk, &unpack); - - /* - * When handling overlapping ranges on variable-length - * column-store leaf pages, we split ranges without - * considering if we were splitting RLE units. (See - * note at the beginning of this file for explanation - * of the overall process.) If the RLE unit was on-page, - * we can simply write it again. 
If the RLE unit was an
- * overflow value that's already been used by another
- * row (from some other page created by a range split),
- * there's not much to do, this row can't reference an
- * overflow record we don't have: delete the row.
- */
- if (ret == EBUSY) {
- __wt_cell_type_reset(session,
- cell, WT_CELL_VALUE_OVFL, WT_CELL_DEL);
- ret = 0;
- }
- WT_RET(ret);
- }
- if (recno > stop)
- break;
- }
- return (0);
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_COL *cip;
+ WT_DECL_RET;
+ uint64_t start, stop;
+ uint32_t i;
+
+ /*
+ * Merging a variable-length column-store page, and we took some number of records, figure out
+ * which (if any) overflow records we used.
+ */
+ start = recno + skip;
+ stop = (recno + skip + take) - 1;
+
+ WT_COL_FOREACH (page, cip, i) {
+ cell = WT_COL_PTR(page, cip);
+ __wt_cell_unpack(cell, &unpack);
+ recno += __wt_cell_rle(&unpack);
+
+ /*
+ * I keep getting this calculation wrong, so here's the logic. Start is the first record we
+ * want, stop is the last record we want. The record number has already been incremented one
+ * past the maximum record number for this page entry, that is, it's set to the first record
+ * number for the next page entry. The test of start should be greater-than (not
+ * greater-than-or-equal), because of that increment, if the record number equals start, we
+ * want the next record, not this one. The test against stop is greater-than, not
+ * greater-than-or-equal because stop is the last record wanted, if the record number equals
+ * stop, we want the next record.
+ */
+ if (recno > start && unpack.type == WT_CELL_VALUE_OVFL) {
+ ret = __slvg_col_ovfl_single(session, trk, &unpack);
+
+ /*
+ * When handling overlapping ranges on variable-length column-store leaf pages, we split
+ * ranges without considering if we were splitting RLE units. (See note at the beginning
+ * of this file for explanation of the overall process.) If the RLE unit was on-page, we
+ * can simply write it again. 
If the RLE unit was an overflow value that's already been + * used by another row (from some other page created by a range split), there's not much + * to do, this row can't reference an overflow record we don't have: delete the row. + */ + if (ret == EBUSY) { + __wt_cell_type_reset(session, cell, WT_CELL_VALUE_OVFL, WT_CELL_DEL); + ret = 0; + } + WT_RET(ret); + } + if (recno > stop) + break; + } + return (0); } /* * __slvg_row_range -- - * Figure out the leaf pages we need and discard everything else. At the - * same time, tag the overflow pages they reference. + * Figure out the leaf pages we need and discard everything else. At the same time, tag the + * overflow pages they reference. */ static int __slvg_row_range(WT_SESSION_IMPL *session, WT_STUFF *ss) { - WT_BTREE *btree; - WT_TRACK *jtrk; - uint32_t i, j; - int cmp; - - btree = S2BT(session); - - /* - * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR - * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE - * BEING HANDLED. - * - * Walk the page array looking for overlapping key ranges, adjusting - * the ranges based on the LSN until there are no overlaps. - * - * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE - * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE - * PLUS OFFSET. - */ - for (i = 0; i < ss->pages_next; ++i) { - if (ss->pages[i] == NULL) - continue; - - /* Check for pages that overlap our page. */ - for (j = i + 1; j < ss->pages_next; ++j) { - if (ss->pages[j] == NULL) - continue; - /* - * We're done if this page starts after our stop, no - * subsequent pages can overlap our page. - */ - WT_RET(__wt_compare(session, btree->collator, - &ss->pages[j]->row_start, &ss->pages[i]->row_stop, - &cmp)); - if (cmp > 0) - break; - - /* There's an overlap, fix it up. 
*/ - jtrk = ss->pages[j]; - WT_RET(__slvg_row_range_overlap(session, i, j, ss)); - - /* - * If the overlap resolution changed the entry's start - * key, the entry might have moved and the page array - * re-sorted, and pages[j] would reference a different - * page. We don't move forward if that happened, we - * re-process the slot again (by decrementing j before - * the loop's increment). - */ - if (ss->pages[j] != NULL && jtrk != ss->pages[j]) - --j; - } - } - return (0); + WT_BTREE *btree; + WT_TRACK *jtrk; + uint32_t i, j; + int cmp; + + btree = S2BT(session); + + /* + * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR + * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE + * BEING HANDLED. + * + * Walk the page array looking for overlapping key ranges, adjusting + * the ranges based on the LSN until there are no overlaps. + * + * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE + * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE + * PLUS OFFSET. + */ + for (i = 0; i < ss->pages_next; ++i) { + if (ss->pages[i] == NULL) + continue; + + /* Check for pages that overlap our page. */ + for (j = i + 1; j < ss->pages_next; ++j) { + if (ss->pages[j] == NULL) + continue; + /* + * We're done if this page starts after our stop, no subsequent pages can overlap our + * page. + */ + WT_RET(__wt_compare( + session, btree->collator, &ss->pages[j]->row_start, &ss->pages[i]->row_stop, &cmp)); + if (cmp > 0) + break; + + /* There's an overlap, fix it up. */ + jtrk = ss->pages[j]; + WT_RET(__slvg_row_range_overlap(session, i, j, ss)); + + /* + * If the overlap resolution changed the entry's start key, the entry might have moved + * and the page array re-sorted, and pages[j] would reference a different page. We don't + * move forward if that happened, we re-process the slot again (by decrementing j before + * the loop's increment). 
+ */ + if (ss->pages[j] != NULL && jtrk != ss->pages[j]) + --j; + } + } + return (0); } /* * __slvg_row_range_overlap -- - * Two row-store key ranges overlap, deal with it. + * Two row-store key ranges overlap, deal with it. */ static int -__slvg_row_range_overlap( - WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss) +__slvg_row_range_overlap(WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss) { - WT_BTREE *btree; - WT_DECL_RET; - WT_TRACK *a_trk, *b_trk, *new; - uint32_t i; - int start_cmp, stop_cmp; - - /* - * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR - * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE - * BEING HANDLED. - */ - btree = S2BT(session); - - a_trk = ss->pages[a_slot]; - b_trk = ss->pages[b_slot]; - - __wt_verbose(session, WT_VERB_SALVAGE, - "%s and %s range overlap", - __wt_addr_string( - session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1), - __wt_addr_string( - session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)); - - /* - * The key ranges of two WT_TRACK pages in the array overlap -- choose - * the ranges we're going to take from each. - * - * We can think of the overlap possibilities as 11 different cases: - * - * AAAAAAAAAAAAAAAAAA - * #1 BBBBBBBBBBBBBBBBBB pages are the same - * #2 BBBBBBBBBBBBB overlaps the beginning - * #3 BBBBBBBBBBBBBBBB overlaps the end - * #4 BBBBB B is a prefix of A - * #5 BBBBBB B is middle of A - * #6 BBBBBBBBBB B is a suffix of A - * - * and: - * - * BBBBBBBBBBBBBBBBBB - * #7 AAAAAAAAAAAAA same as #3 - * #8 AAAAAAAAAAAAAAAA same as #2 - * #9 AAAAA A is a prefix of B - * #10 AAAAAA A is middle of B - * #11 AAAAAAAAAA A is a suffix of B - * - * Note the leaf page array was sorted by key and a_trk appears earlier - * in the array than b_trk, so cases #2/8, #10 and #11 are impossible. - * - * Finally, there's one additional complicating factor -- final ranges - * are assigned based on the page's LSN. 
- */ -#define A_TRK_START (&a_trk->row_start) -#define A_TRK_STOP (&a_trk->row_stop) -#define B_TRK_START (&b_trk->row_start) -#define B_TRK_STOP (&b_trk->row_stop) -#define SLOT_START(i) (&ss->pages[i]->row_start) -#define __slvg_key_copy(session, dst, src) \ - __wt_buf_set(session, dst, (src)->data, (src)->size) - - WT_RET(__wt_compare( - session, btree->collator, A_TRK_START, B_TRK_START, &start_cmp)); - WT_RET(__wt_compare( - session, btree->collator, A_TRK_STOP, B_TRK_STOP, &stop_cmp)); - - if (start_cmp > 0) /* Case #2/8, #10, #11 */ - WT_PANIC_RET( - session, EINVAL, "unexpected merge array sort order"); - - if (start_cmp == 0) { /* Case #1, #4, #9 */ - /* - * The secondary sort of the leaf page array was the page's LSN, - * in high-to-low order, which means a_trk has a higher LSN, and - * is more desirable, than b_trk. In cases #1 and #4 and #9, - * where the start of the range is the same for the two pages, - * this simplifies things, it guarantees a_trk has a higher LSN - * than b_trk. - */ - if (stop_cmp >= 0) - /* - * Case #1, #4: a_trk is a superset of b_trk, and a_trk - * is more desirable -- discard b_trk. - */ - goto delete_b; - - /* - * Case #9: b_trk is a superset of a_trk, but a_trk is more - * desirable: keep both but delete a_trk's key range from - * b_trk. - */ - WT_RET(__slvg_row_trk_update_start( - session, A_TRK_STOP, b_slot, ss)); - F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE); - goto merge; - } - - if (stop_cmp == 0) { /* Case #6 */ - if (a_trk->trk_gen > b_trk->trk_gen) - /* - * Case #6: a_trk is a superset of b_trk and a_trk is - * more desirable -- discard b_trk. - */ - goto delete_b; - - /* - * Case #6: a_trk is a superset of b_trk, but b_trk is more - * desirable: keep both but delete b_trk's key range from a_trk. 
- */ - WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START)); - F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE); - goto merge; - } - - if (stop_cmp < 0) { /* Case #3/7 */ - if (a_trk->trk_gen > b_trk->trk_gen) { - /* - * Case #3/7: a_trk is more desirable, delete a_trk's - * key range from b_trk; - */ - WT_RET(__slvg_row_trk_update_start( - session, A_TRK_STOP, b_slot, ss)); - F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE); - } else { - /* - * Case #3/7: b_trk is more desirable, delete b_trk's - * key range from a_trk; - */ - WT_RET(__slvg_key_copy( - session, A_TRK_STOP, B_TRK_START)); - F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE); - } - goto merge; - } - - /* - * Case #5: a_trk is a superset of b_trk and a_trk is more desirable -- - * discard b_trk. - */ - if (a_trk->trk_gen > b_trk->trk_gen) { -delete_b: /* - * After page and overflow reconciliation, one (and only one) - * page can reference an overflow record. But, if we split a - * page into multiple chunks, any of the chunks might own any - * of the backing overflow records, so overflow records won't - * normally be discarded until after the merge phase completes. - * (The merge phase is where the final pages are written, and - * we figure out which overflow records are actually used.) - * If freeing a chunk and there are no other references to the - * underlying shared information, the overflow records must be - * useless, discard them to keep the final file size small. - */ - if (b_trk->shared->ref == 1) - for (i = 0; i < b_trk->trk_ovfl_cnt; ++i) - WT_RET(__slvg_trk_free(session, - &ss->ovfl[b_trk->trk_ovfl_slot[i]], true)); - return (__slvg_trk_free(session, &ss->pages[b_slot], true)); - } - - /* - * Case #5: b_trk is more desirable and is a middle chunk of a_trk. - * Split a_trk into two parts, the key range before b_trk and the - * key range after b_trk. - * - * Allocate a new WT_TRACK object, and extend the array of pages as - * necessary. 
- */ - WT_RET(__wt_calloc_one(session, &new)); - if ((ret = __wt_realloc_def(session, - &ss->pages_allocated, ss->pages_next + 1, &ss->pages)) != 0) { - __wt_free(session, new); - return (ret); - } - - /* - * First, set up the track share (we do this after the allocation to - * ensure the shared reference count is never incorrect). - */ - new->shared = a_trk->shared; - new->ss = a_trk->ss; - ++new->shared->ref; - - /* - * Second, insert the new element into the array after the existing - * element (that's probably wrong, but we'll fix it up in a second). - */ - memmove(ss->pages + a_slot + 1, ss->pages + a_slot, - (ss->pages_next - a_slot) * sizeof(*ss->pages)); - ss->pages[a_slot + 1] = new; - ++ss->pages_next; - - /* - * Third, set its its stop key to be the stop key of the original chunk, - * and call __slvg_row_trk_update_start. That function will both set - * the start key to be the first key after the stop key of the middle - * chunk (that's b_trk), and re-sort the WT_TRACK array as necessary to - * move our new entry into the right sorted location. - */ - WT_RET(__slvg_key_copy(session, &new->row_stop, A_TRK_STOP)); - WT_RET( - __slvg_row_trk_update_start(session, B_TRK_STOP, a_slot + 1, ss)); - - /* - * Fourth, set the original WT_TRACK information to reference only - * the initial key space in the page, that is, everything up to the - * starting key of the middle chunk (that's b_trk). 
- */ - WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START)); - F_SET(new, WT_TRACK_CHECK_START); - F_SET(a_trk, WT_TRACK_CHECK_STOP); - - F_SET(new, WT_TRACK_MERGE); - F_SET(a_trk, WT_TRACK_MERGE); - -merge: __wt_verbose(session, WT_VERB_SALVAGE, - "%s and %s require merge", - __wt_addr_string( - session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1), - __wt_addr_string( - session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)); - return (0); + WT_BTREE *btree; + WT_DECL_RET; + WT_TRACK *a_trk, *b_trk, *new; + uint32_t i; + int start_cmp, stop_cmp; + + /* + * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR COLUMN-STORE CODE: THEY + * ARE IDENTICAL OTHER THAN THE PAGES THAT ARE BEING HANDLED. + */ + btree = S2BT(session); + + a_trk = ss->pages[a_slot]; + b_trk = ss->pages[b_slot]; + + __wt_verbose(session, WT_VERB_SALVAGE, "%s and %s range overlap", + __wt_addr_string(session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1), + __wt_addr_string(session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)); + +/* + * The key ranges of two WT_TRACK pages in the array overlap -- choose + * the ranges we're going to take from each. + * + * We can think of the overlap possibilities as 11 different cases: + * + * AAAAAAAAAAAAAAAAAA + * #1 BBBBBBBBBBBBBBBBBB pages are the same + * #2 BBBBBBBBBBBBB overlaps the beginning + * #3 BBBBBBBBBBBBBBBB overlaps the end + * #4 BBBBB B is a prefix of A + * #5 BBBBBB B is middle of A + * #6 BBBBBBBBBB B is a suffix of A + * + * and: + * + * BBBBBBBBBBBBBBBBBB + * #7 AAAAAAAAAAAAA same as #3 + * #8 AAAAAAAAAAAAAAAA same as #2 + * #9 AAAAA A is a prefix of B + * #10 AAAAAA A is middle of B + * #11 AAAAAAAAAA A is a suffix of B + * + * Note the leaf page array was sorted by key and a_trk appears earlier + * in the array than b_trk, so cases #2/8, #10 and #11 are impossible. + * + * Finally, there's one additional complicating factor -- final ranges + * are assigned based on the page's LSN. 
+ */ +#define A_TRK_START (&a_trk->row_start) +#define A_TRK_STOP (&a_trk->row_stop) +#define B_TRK_START (&b_trk->row_start) +#define B_TRK_STOP (&b_trk->row_stop) +#define SLOT_START(i) (&ss->pages[i]->row_start) +#define __slvg_key_copy(session, dst, src) __wt_buf_set(session, dst, (src)->data, (src)->size) + + WT_RET(__wt_compare(session, btree->collator, A_TRK_START, B_TRK_START, &start_cmp)); + WT_RET(__wt_compare(session, btree->collator, A_TRK_STOP, B_TRK_STOP, &stop_cmp)); + + if (start_cmp > 0) /* Case #2/8, #10, #11 */ + WT_PANIC_RET(session, EINVAL, "unexpected merge array sort order"); + + if (start_cmp == 0) { /* Case #1, #4, #9 */ + /* + * The secondary sort of the leaf page array was the page's LSN, in + * high-to-low order, which means a_trk has a higher LSN, and is more + * desirable, than b_trk. In cases #1 and #4 and #9, where the start of + * the range is the same for the two pages, this simplifies things, it + * guarantees a_trk has a higher LSN than b_trk. + */ + if (stop_cmp >= 0) + /* + * Case #1, #4: a_trk is a superset of b_trk, and a_trk is more desirable -- discard + * b_trk. + */ + goto delete_b; + + /* + * Case #9: b_trk is a superset of a_trk, but a_trk is more desirable: keep both but delete + * a_trk's key range from b_trk. + */ + WT_RET(__slvg_row_trk_update_start(session, A_TRK_STOP, b_slot, ss)); + F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE); + goto merge; + } + + if (stop_cmp == 0) { /* Case #6 */ + if (a_trk->trk_gen > b_trk->trk_gen) + /* + * Case #6: a_trk is a superset of b_trk and a_trk is more desirable -- discard b_trk. + */ + goto delete_b; + + /* + * Case #6: a_trk is a superset of b_trk, but b_trk is more desirable: keep both but delete + * b_trk's key range from a_trk. 
+ */ + WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START)); + F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE); + goto merge; + } + + if (stop_cmp < 0) { /* Case #3/7 */ + if (a_trk->trk_gen > b_trk->trk_gen) { + /* + * Case #3/7: a_trk is more desirable, delete a_trk's key range from b_trk; + */ + WT_RET(__slvg_row_trk_update_start(session, A_TRK_STOP, b_slot, ss)); + F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE); + } else { + /* + * Case #3/7: b_trk is more desirable, delete b_trk's key range from a_trk; + */ + WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START)); + F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE); + } + goto merge; + } + + /* + * Case #5: a_trk is a superset of b_trk and a_trk is more desirable -- discard b_trk. + */ + if (a_trk->trk_gen > b_trk->trk_gen) { + delete_b: + /* + * After page and overflow reconciliation, one (and only one) + * page can reference an overflow record. But, if we split a + * page into multiple chunks, any of the chunks might own any + * of the backing overflow records, so overflow records won't + * normally be discarded until after the merge phase completes. + * (The merge phase is where the final pages are written, and + * we figure out which overflow records are actually used.) + * If freeing a chunk and there are no other references to the + * underlying shared information, the overflow records must be + * useless, discard them to keep the final file size small. + */ + if (b_trk->shared->ref == 1) + for (i = 0; i < b_trk->trk_ovfl_cnt; ++i) + WT_RET(__slvg_trk_free(session, &ss->ovfl[b_trk->trk_ovfl_slot[i]], true)); + return (__slvg_trk_free(session, &ss->pages[b_slot], true)); + } + + /* + * Case #5: b_trk is more desirable and is a middle chunk of a_trk. + * Split a_trk into two parts, the key range before b_trk and the + * key range after b_trk. + * + * Allocate a new WT_TRACK object, and extend the array of pages as + * necessary. 
+ */ + WT_RET(__wt_calloc_one(session, &new)); + if ((ret = __wt_realloc_def(session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages)) != + 0) { + __wt_free(session, new); + return (ret); + } + + /* + * First, set up the track share (we do this after the allocation to ensure the shared reference + * count is never incorrect). + */ + new->shared = a_trk->shared; + new->ss = a_trk->ss; + ++new->shared->ref; + + /* + * Second, insert the new element into the array after the existing element (that's probably + * wrong, but we'll fix it up in a second). + */ + memmove( + ss->pages + a_slot + 1, ss->pages + a_slot, (ss->pages_next - a_slot) * sizeof(*ss->pages)); + ss->pages[a_slot + 1] = new; + ++ss->pages_next; + + /* + * Third, set its its stop key to be the stop key of the original chunk, and call + * __slvg_row_trk_update_start. That function will both set the start key to be the first key + * after the stop key of the middle chunk (that's b_trk), and re-sort the WT_TRACK array as + * necessary to move our new entry into the right sorted location. + */ + WT_RET(__slvg_key_copy(session, &new->row_stop, A_TRK_STOP)); + WT_RET(__slvg_row_trk_update_start(session, B_TRK_STOP, a_slot + 1, ss)); + + /* + * Fourth, set the original WT_TRACK information to reference only the initial key space in the + * page, that is, everything up to the starting key of the middle chunk (that's b_trk). + */ + WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START)); + F_SET(new, WT_TRACK_CHECK_START); + F_SET(a_trk, WT_TRACK_CHECK_STOP); + + F_SET(new, WT_TRACK_MERGE); + F_SET(a_trk, WT_TRACK_MERGE); + +merge: + __wt_verbose(session, WT_VERB_SALVAGE, "%s and %s require merge", + __wt_addr_string(session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1), + __wt_addr_string(session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)); + return (0); } /* * __slvg_row_trk_update_start -- - * Update a row-store page's start key after an overlap. 
+ * Update a row-store page's start key after an overlap. */ static int -__slvg_row_trk_update_start( - WT_SESSION_IMPL *session, WT_ITEM *stop, uint32_t slot, WT_STUFF *ss) +__slvg_row_trk_update_start(WT_SESSION_IMPL *session, WT_ITEM *stop, uint32_t slot, WT_STUFF *ss) { - WT_BTREE *btree; - WT_DECL_ITEM(dsk); - WT_DECL_ITEM(key); - WT_DECL_RET; - WT_PAGE *page; - WT_ROW *rip; - WT_TRACK *trk; - uint32_t i; - int cmp; - bool found; - - btree = S2BT(session); - page = NULL; - found = false; - - trk = ss->pages[slot]; - - /* - * If we deleted an initial piece of the WT_TRACK name space, it may no - * longer be in the right location. - * - * For example, imagine page #1 has the key range 30-50, it split, and - * we wrote page #2 with key range 30-40, and page #3 key range with - * 40-50, where pages #2 and #3 have larger LSNs than page #1. When the - * key ranges were sorted, page #2 came first, then page #1 (because of - * their earlier start keys than page #3), and page #2 came before page - * #1 because of its LSN. When we resolve the overlap between page #2 - * and page #1, we truncate the initial key range of page #1, and it now - * sorts after page #3, because it has the same starting key of 40, and - * a lower LSN. - * - * First, update the WT_TRACK start key based on the specified stop key. - * - * Read and instantiate the WT_TRACK page (we don't have to verify the - * page, nor do we have to be quiet on error, we've already read this - * page successfully). - * - * Page flags are 0 because we aren't releasing the memory used to read - * the page into memory and we don't want page discard to free it. - */ - WT_RET(__wt_scr_alloc(session, trk->trk_size, &dsk)); - WT_ERR(__wt_bt_read(session, dsk, trk->trk_addr, trk->trk_addr_size)); - WT_ERR(__wt_page_inmem(session, NULL, dsk->data, 0, &page)); - - /* - * Walk the page, looking for a key sorting greater than the specified - * stop key -- that's our new start key. 
- */ - WT_ERR(__wt_scr_alloc(session, 0, &key)); - WT_ROW_FOREACH(page, rip, i) { - WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); - WT_ERR(__wt_compare(session, btree->collator, key, stop, &cmp)); - if (cmp > 0) { - found = true; - break; - } - } - - /* - * We know that at least one key on the page sorts after the specified - * stop key, otherwise the page would have entirely overlapped and we - * would have discarded it, we wouldn't be here. Therefore, this test - * is safe. (But, it never hurts to check.) - */ - if (!found) - WT_ERR_MSG(session, WT_ERROR, "expected on-page key not found"); - WT_ERR(__slvg_key_copy(session, &trk->row_start, key)); - - /* - * We may need to re-sort some number of elements in the list. Walk - * forward in the list until reaching an entry which cannot overlap - * the adjusted entry. If it's more than a single slot, re-sort the - * entries. - */ - for (i = slot + 1; i < ss->pages_next; ++i) { - if (ss->pages[i] == NULL) - continue; - WT_ERR(__wt_compare(session, - btree->collator, SLOT_START(i), &trk->row_stop, &cmp)); - if (cmp > 0) - break; - } - i -= slot; - if (i > 1) - __wt_qsort(ss->pages + slot, (size_t)i, - sizeof(WT_TRACK *), __slvg_trk_compare_key); - -err: if (page != NULL) - __wt_page_out(session, &page); - __wt_scr_free(session, &dsk); - __wt_scr_free(session, &key); - - return (ret); + WT_BTREE *btree; + WT_DECL_ITEM(dsk); + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_PAGE *page; + WT_ROW *rip; + WT_TRACK *trk; + uint32_t i; + int cmp; + bool found; + + btree = S2BT(session); + page = NULL; + found = false; + + trk = ss->pages[slot]; + + /* + * If we deleted an initial piece of the WT_TRACK name space, it may no + * longer be in the right location. + * + * For example, imagine page #1 has the key range 30-50, it split, and + * we wrote page #2 with key range 30-40, and page #3 key range with + * 40-50, where pages #2 and #3 have larger LSNs than page #1. 
When the + * key ranges were sorted, page #2 came first, then page #1 (because of + * their earlier start keys than page #3), and page #2 came before page + * #1 because of its LSN. When we resolve the overlap between page #2 + * and page #1, we truncate the initial key range of page #1, and it now + * sorts after page #3, because it has the same starting key of 40, and + * a lower LSN. + * + * First, update the WT_TRACK start key based on the specified stop key. + * + * Read and instantiate the WT_TRACK page (we don't have to verify the + * page, nor do we have to be quiet on error, we've already read this + * page successfully). + * + * Page flags are 0 because we aren't releasing the memory used to read + * the page into memory and we don't want page discard to free it. + */ + WT_RET(__wt_scr_alloc(session, trk->trk_size, &dsk)); + WT_ERR(__wt_bt_read(session, dsk, trk->trk_addr, trk->trk_addr_size)); + WT_ERR(__wt_page_inmem(session, NULL, dsk->data, 0, &page)); + + /* + * Walk the page, looking for a key sorting greater than the specified stop key -- that's our + * new start key. + */ + WT_ERR(__wt_scr_alloc(session, 0, &key)); + WT_ROW_FOREACH (page, rip, i) { + WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); + WT_ERR(__wt_compare(session, btree->collator, key, stop, &cmp)); + if (cmp > 0) { + found = true; + break; + } + } + + /* + * We know that at least one key on the page sorts after the specified stop key, otherwise the + * page would have entirely overlapped and we would have discarded it, we wouldn't be here. + * Therefore, this test is safe. (But, it never hurts to check.) + */ + if (!found) + WT_ERR_MSG(session, WT_ERROR, "expected on-page key not found"); + WT_ERR(__slvg_key_copy(session, &trk->row_start, key)); + + /* + * We may need to re-sort some number of elements in the list. Walk forward in the list until + * reaching an entry which cannot overlap the adjusted entry. If it's more than a single slot, + * re-sort the entries. 
+ */ + for (i = slot + 1; i < ss->pages_next; ++i) { + if (ss->pages[i] == NULL) + continue; + WT_ERR(__wt_compare(session, btree->collator, SLOT_START(i), &trk->row_stop, &cmp)); + if (cmp > 0) + break; + } + i -= slot; + if (i > 1) + __wt_qsort(ss->pages + slot, (size_t)i, sizeof(WT_TRACK *), __slvg_trk_compare_key); + +err: + if (page != NULL) + __wt_page_out(session, &page); + __wt_scr_free(session, &dsk); + __wt_scr_free(session, &key); + + return (ret); } /* * __slvg_row_build_internal -- - * Build a row-store in-memory page that references all of the leaf - * pages we've found. + * Build a row-store in-memory page that references all of the leaf pages we've found. */ static int -__slvg_row_build_internal( - WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) +__slvg_row_build_internal(WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) { - WT_ADDR *addr; - WT_DECL_RET; - WT_PAGE *page; - WT_PAGE_INDEX *pindex; - WT_REF *ref, **refp; - WT_TRACK *trk; - uint32_t i; - - addr = NULL; - - /* Allocate a row-store root (internal) page and fill it in. */ - WT_RET(__wt_page_alloc( - session, WT_PAGE_ROW_INT, leaf_cnt, true, &page)); - WT_ERR(__slvg_modify_init(session, page)); - - pindex = WT_INTL_INDEX_GET_SAFE(page); - for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) { - if ((trk = ss->pages[i]) == NULL) - continue; - - ref = *refp++; - ref->home = page; - ref->page = NULL; - - WT_ERR(__wt_calloc_one(session, &addr)); - WT_ERR(__wt_memdup( - session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); - addr->size = trk->trk_addr_size; - addr->type = - trk->trk_ovfl_cnt == 0 ? 
WT_ADDR_LEAF_NO : WT_ADDR_LEAF; - ref->addr = addr; - addr = NULL; - - __wt_ref_key_clear(ref); - WT_REF_SET_STATE(ref, WT_REF_DISK); - - /* - * If the page's key range is unmodified from when we read it - * (in other words, we didn't merge part of this page with - * another page), we can use the page without change, and the - * only thing we need to do is mark all overflow records the - * page references as in-use. - * - * If we did merge with another page, we have to build a page - * reflecting the updated key range. Note, that requires an - * additional pass to free the merge page's backing blocks. - */ - if (F_ISSET(trk, WT_TRACK_MERGE)) { - ss->merge_free = true; - - WT_ERR(__slvg_row_build_leaf(session, trk, ref, ss)); - } else { - WT_ERR(__wt_row_ikey_incr(session, page, 0, - trk->row_start.data, trk->row_start.size, ref)); - - WT_ERR(__slvg_ovfl_ref_all(session, trk)); - } - ++ref; - } - - __wt_root_ref_init(session, &ss->root_ref, page, false); - - if (0) { -err: __wt_free(session, addr); - __wt_page_out(session, &page); - } - return (ret); + WT_ADDR *addr; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF *ref, **refp; + WT_TRACK *trk; + uint32_t i; + + addr = NULL; + + /* Allocate a row-store root (internal) page and fill it in. */ + WT_RET(__wt_page_alloc(session, WT_PAGE_ROW_INT, leaf_cnt, true, &page)); + WT_ERR(__slvg_modify_init(session, page)); + + pindex = WT_INTL_INDEX_GET_SAFE(page); + for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) { + if ((trk = ss->pages[i]) == NULL) + continue; + + ref = *refp++; + ref->home = page; + ref->page = NULL; + + WT_ERR(__wt_calloc_one(session, &addr)); + WT_ERR(__wt_memdup(session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); + addr->size = trk->trk_addr_size; + addr->type = trk->trk_ovfl_cnt == 0 ? 
WT_ADDR_LEAF_NO : WT_ADDR_LEAF; + ref->addr = addr; + addr = NULL; + + __wt_ref_key_clear(ref); + WT_REF_SET_STATE(ref, WT_REF_DISK); + + /* + * If the page's key range is unmodified from when we read it + * (in other words, we didn't merge part of this page with + * another page), we can use the page without change, and the + * only thing we need to do is mark all overflow records the + * page references as in-use. + * + * If we did merge with another page, we have to build a page + * reflecting the updated key range. Note, that requires an + * additional pass to free the merge page's backing blocks. + */ + if (F_ISSET(trk, WT_TRACK_MERGE)) { + ss->merge_free = true; + + WT_ERR(__slvg_row_build_leaf(session, trk, ref, ss)); + } else { + WT_ERR( + __wt_row_ikey_incr(session, page, 0, trk->row_start.data, trk->row_start.size, ref)); + + WT_ERR(__slvg_ovfl_ref_all(session, trk)); + } + ++ref; + } + + __wt_root_ref_init(session, &ss->root_ref, page, false); + + if (0) { +err: + __wt_free(session, addr); + __wt_page_out(session, &page); + } + return (ret); } /* * __slvg_row_build_leaf -- - * Build a row-store leaf page for a merged page. + * Build a row-store leaf page for a merged page. */ static int -__slvg_row_build_leaf( - WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref, WT_STUFF *ss) +__slvg_row_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref, WT_STUFF *ss) { - WT_BTREE *btree; - WT_DECL_ITEM(key); - WT_DECL_RET; - WT_PAGE *page; - WT_ROW *rip; - WT_SALVAGE_COOKIE *cookie, _cookie; - uint32_t i, skip_start, skip_stop; - int cmp; - - btree = S2BT(session); - page = NULL; - - cookie = &_cookie; - WT_CLEAR(*cookie); - - /* Allocate temporary space in which to instantiate the keys. */ - WT_RET(__wt_scr_alloc(session, 0, &key)); - - /* Get the original page, including the full in-memory setup. */ - WT_ERR(__wt_page_in(session, ref, 0)); - page = ref->page; - - /* - * Figure out how many page keys we want to take and how many we want - * to skip. 
- * - * If checking the starting range key, the key we're searching for will - * be equal to the starting range key. This is because we figured out - * the true merged-page start key as part of discarding initial keys - * from the page (see the __slvg_row_range_overlap function, and its - * calls to __slvg_row_trk_update_start for more information). - * - * If checking the stopping range key, we want the keys on the page that - * are less-than the stopping range key. This is because we copied a - * key from another page to define this page's stop range: that page is - * the page that owns the "equal to" range space. - */ - skip_start = skip_stop = 0; - if (F_ISSET(trk, WT_TRACK_CHECK_START)) - WT_ROW_FOREACH(page, rip, i) { - WT_ERR( - __wt_row_leaf_key(session, page, rip, key, false)); - - /* - * >= is correct: see the comment above. - */ - WT_ERR(__wt_compare(session, - btree->collator, key, &trk->row_start, &cmp)); - if (cmp >= 0) - break; - __wt_verbose(session, WT_VERB_SALVAGE, - "%s merge discarding leading key %s", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, ss->tmp1), - __wt_buf_set_printable( - session, key->data, key->size, ss->tmp2)); - ++skip_start; - } - if (F_ISSET(trk, WT_TRACK_CHECK_STOP)) - WT_ROW_FOREACH_REVERSE(page, rip, i) { - WT_ERR( - __wt_row_leaf_key(session, page, rip, key, false)); - - /* - * < is correct: see the comment above. - */ - WT_ERR(__wt_compare(session, - btree->collator, key, &trk->row_stop, &cmp)); - if (cmp < 0) - break; - __wt_verbose(session, WT_VERB_SALVAGE, - "%s merge discarding trailing key %s", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, ss->tmp1), - __wt_buf_set_printable( - session, key->data, key->size, ss->tmp2)); - ++skip_stop; - } - - /* We should have selected some entries, but not the entire page. 
*/ - WT_ASSERT(session, - skip_start + skip_stop > 0 && - skip_start + skip_stop < page->entries); - - /* - * Take a copy of this page's first key to define the start of - * its range. The key may require processing, otherwise, it's - * a copy from the page. - */ - rip = page->pg_row + skip_start; - WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); - WT_ERR(__wt_row_ikey_incr( - session, ref->home, 0, key->data, key->size, ref)); - - /* Set the referenced flag on overflow pages we're using. */ - if (trk->trk_ovfl_cnt != 0) - WT_ERR(__slvg_row_ovfl(session, - trk, page, skip_start, page->entries - skip_stop)); - - /* - * Change the page to reflect the correct record count: there is no - * need to copy anything on the page itself, the entries value limits - * the number of page items. - */ - page->entries -= skip_stop; - cookie->skip = skip_start; - - /* - * We can't discard the original blocks associated with this page now. - * (The problem is we don't want to overwrite any original information - * until the salvage run succeeds -- if we free the blocks now, the next - * merge page we write might allocate those blocks and overwrite them, - * and should the salvage run eventually fail, the original information - * would have been lost.) Clear the reference addr so eviction doesn't - * free the underlying blocks. - */ - __wt_ref_addr_free(session, ref); - - /* Write the new version of the leaf page to disk. */ - WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile( - session, ref, cookie, WT_REC_VISIBILITY_ERR, NULL)); - - /* Reset the page. */ - page->entries += skip_stop; - - /* - * Discard our hazard pointer and evict the page, updating the - * parent's reference. 
- */ - ret = __wt_page_release(session, ref, 0); - if (ret == 0) - ret = __wt_evict(session, ref, WT_REF_MEM, - WT_EVICT_CALL_CLOSING); - - if (0) { -err: WT_TRET(__wt_page_release(session, ref, 0)); - } - __wt_scr_free(session, &key); - - return (ret); + WT_BTREE *btree; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_PAGE *page; + WT_ROW *rip; + WT_SALVAGE_COOKIE *cookie, _cookie; + uint32_t i, skip_start, skip_stop; + int cmp; + + btree = S2BT(session); + page = NULL; + + cookie = &_cookie; + WT_CLEAR(*cookie); + + /* Allocate temporary space in which to instantiate the keys. */ + WT_RET(__wt_scr_alloc(session, 0, &key)); + + /* Get the original page, including the full in-memory setup. */ + WT_ERR(__wt_page_in(session, ref, 0)); + page = ref->page; + + /* + * Figure out how many page keys we want to take and how many we want + * to skip. + * + * If checking the starting range key, the key we're searching for will + * be equal to the starting range key. This is because we figured out + * the true merged-page start key as part of discarding initial keys + * from the page (see the __slvg_row_range_overlap function, and its + * calls to __slvg_row_trk_update_start for more information). + * + * If checking the stopping range key, we want the keys on the page that + * are less-than the stopping range key. This is because we copied a + * key from another page to define this page's stop range: that page is + * the page that owns the "equal to" range space. + */ + skip_start = skip_stop = 0; + if (F_ISSET(trk, WT_TRACK_CHECK_START)) + WT_ROW_FOREACH (page, rip, i) { + WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); + + /* + * >= is correct: see the comment above. 
+ */ + WT_ERR(__wt_compare(session, btree->collator, key, &trk->row_start, &cmp)); + if (cmp >= 0) + break; + __wt_verbose(session, WT_VERB_SALVAGE, "%s merge discarding leading key %s", + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), + __wt_buf_set_printable(session, key->data, key->size, ss->tmp2)); + ++skip_start; + } + if (F_ISSET(trk, WT_TRACK_CHECK_STOP)) + WT_ROW_FOREACH_REVERSE(page, rip, i) + { + WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); + + /* + * < is correct: see the comment above. + */ + WT_ERR(__wt_compare(session, btree->collator, key, &trk->row_stop, &cmp)); + if (cmp < 0) + break; + __wt_verbose(session, WT_VERB_SALVAGE, "%s merge discarding trailing key %s", + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), + __wt_buf_set_printable(session, key->data, key->size, ss->tmp2)); + ++skip_stop; + } + + /* We should have selected some entries, but not the entire page. */ + WT_ASSERT(session, skip_start + skip_stop > 0 && skip_start + skip_stop < page->entries); + + /* + * Take a copy of this page's first key to define the start of its range. The key may require + * processing, otherwise, it's a copy from the page. + */ + rip = page->pg_row + skip_start; + WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); + WT_ERR(__wt_row_ikey_incr(session, ref->home, 0, key->data, key->size, ref)); + + /* Set the referenced flag on overflow pages we're using. */ + if (trk->trk_ovfl_cnt != 0) + WT_ERR(__slvg_row_ovfl(session, trk, page, skip_start, page->entries - skip_stop)); + + /* + * Change the page to reflect the correct record count: there is no need to copy anything on the + * page itself, the entries value limits the number of page items. + */ + page->entries -= skip_stop; + cookie->skip = skip_start; + + /* + * We can't discard the original blocks associated with this page now. 
+ * (The problem is we don't want to overwrite any original information + * until the salvage run succeeds -- if we free the blocks now, the next + * merge page we write might allocate those blocks and overwrite them, + * and should the salvage run eventually fail, the original information + * would have been lost.) Clear the reference addr so eviction doesn't + * free the underlying blocks. + */ + __wt_ref_addr_free(session, ref); + + /* Write the new version of the leaf page to disk. */ + WT_ERR(__slvg_modify_init(session, page)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_REC_VISIBILITY_ERR, NULL)); + + /* Reset the page. */ + page->entries += skip_stop; + + /* + * Discard our hazard pointer and evict the page, updating the parent's reference. + */ + ret = __wt_page_release(session, ref, 0); + if (ret == 0) + ret = __wt_evict(session, ref, WT_REF_MEM, WT_EVICT_CALL_CLOSING); + + if (0) { +err: + WT_TRET(__wt_page_release(session, ref, 0)); + } + __wt_scr_free(session, &key); + + return (ret); } /* * __slvg_row_ovfl_single -- - * Find a single overflow record in the merge page's list, and mark it as - * referenced. + * Find a single overflow record in the merge page's list, and mark it as referenced. */ static int -__slvg_row_ovfl_single( - WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack) +__slvg_row_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack) { - WT_TRACK *ovfl; - uint32_t i; - - /* Check if it's an overflow record. */ - if (unpack->type != WT_CELL_KEY_OVFL && - unpack->type != WT_CELL_VALUE_OVFL) - return (0); - - /* - * Search the list of overflow records for this page -- we should find - * exactly one match, and we mark it as referenced. 
- */ - for (i = 0; i < trk->trk_ovfl_cnt; ++i) { - ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]]; - if (unpack->size == ovfl->trk_addr_size && - memcmp(unpack->data, ovfl->trk_addr, unpack->size) == 0) - return (__slvg_ovfl_ref(session, ovfl, true)); - } - - WT_PANIC_RET(session, - EINVAL, "overflow record at row-store page merge not found"); + WT_TRACK *ovfl; + uint32_t i; + + /* Check if it's an overflow record. */ + if (unpack->type != WT_CELL_KEY_OVFL && unpack->type != WT_CELL_VALUE_OVFL) + return (0); + + /* + * Search the list of overflow records for this page -- we should find exactly one match, and we + * mark it as referenced. + */ + for (i = 0; i < trk->trk_ovfl_cnt; ++i) { + ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]]; + if (unpack->size == ovfl->trk_addr_size && + memcmp(unpack->data, ovfl->trk_addr, unpack->size) == 0) + return (__slvg_ovfl_ref(session, ovfl, true)); + } + + WT_PANIC_RET(session, EINVAL, "overflow record at row-store page merge not found"); } /* * __slvg_row_ovfl -- - * Mark overflow items referenced by the merged page. + * Mark overflow items referenced by the merged page. */ static int -__slvg_row_ovfl(WT_SESSION_IMPL *session, - WT_TRACK *trk, WT_PAGE *page, uint32_t start, uint32_t stop) +__slvg_row_ovfl( + WT_SESSION_IMPL *session, WT_TRACK *trk, WT_PAGE *page, uint32_t start, uint32_t stop) { - WT_CELL *cell; - WT_CELL_UNPACK unpack; - WT_ROW *rip; - void *copy; - - /* - * We're merging a row-store page, and we took some number of records, - * figure out which (if any) overflow records we used. 
- */ - for (rip = page->pg_row + start; start < stop; ++start, ++rip) { - copy = WT_ROW_KEY_COPY(rip); - (void)__wt_row_leaf_key_info( - page, copy, NULL, &cell, NULL, NULL); - if (cell != NULL) { - __wt_cell_unpack(cell, &unpack); - WT_RET(__slvg_row_ovfl_single(session, trk, &unpack)); - } - __wt_row_leaf_value_cell(page, rip, NULL, &unpack); - WT_RET(__slvg_row_ovfl_single(session, trk, &unpack)); - } - return (0); + WT_CELL *cell; + WT_CELL_UNPACK unpack; + WT_ROW *rip; + void *copy; + + /* + * We're merging a row-store page, and we took some number of records, figure out which (if any) + * overflow records we used. + */ + for (rip = page->pg_row + start; start < stop; ++start, ++rip) { + copy = WT_ROW_KEY_COPY(rip); + (void)__wt_row_leaf_key_info(page, copy, NULL, &cell, NULL, NULL); + if (cell != NULL) { + __wt_cell_unpack(cell, &unpack); + WT_RET(__slvg_row_ovfl_single(session, trk, &unpack)); + } + __wt_row_leaf_value_cell(page, rip, NULL, &unpack); + WT_RET(__slvg_row_ovfl_single(session, trk, &unpack)); + } + return (0); } /* * __slvg_trk_compare_addr -- - * Compare two WT_TRACK array entries by address cookie. + * Compare two WT_TRACK array entries by address cookie. */ static int WT_CDECL __slvg_trk_compare_addr(const void *a, const void *b) { - WT_DECL_RET; - WT_TRACK *a_trk, *b_trk; - size_t len; - - a_trk = *(WT_TRACK **)a; - b_trk = *(WT_TRACK **)b; - - /* - * We don't care about the order because these are opaque cookies -- - * we're just sorting them so we can binary search instead of linear - * search. - */ - len = WT_MIN(a_trk->trk_addr_size, b_trk->trk_addr_size); - ret = memcmp(a_trk->trk_addr, b_trk->trk_addr, len); - if (ret == 0) - ret = a_trk->trk_addr_size > b_trk->trk_addr_size ? 
-1 : 1; - return (ret); + WT_DECL_RET; + WT_TRACK *a_trk, *b_trk; + size_t len; + + a_trk = *(WT_TRACK **)a; + b_trk = *(WT_TRACK **)b; + + /* + * We don't care about the order because these are opaque cookies -- we're just sorting them so + * we can binary search instead of linear search. + */ + len = WT_MIN(a_trk->trk_addr_size, b_trk->trk_addr_size); + ret = memcmp(a_trk->trk_addr, b_trk->trk_addr, len); + if (ret == 0) + ret = a_trk->trk_addr_size > b_trk->trk_addr_size ? -1 : 1; + return (ret); } /* * __slvg_ovfl_compare -- - * Bsearch comparison routine for the overflow array. + * Bsearch comparison routine for the overflow array. */ static int WT_CDECL __slvg_ovfl_compare(const void *a, const void *b) { - WT_ADDR *addr; - WT_DECL_RET; - WT_TRACK *trk; - size_t len; - - addr = (WT_ADDR *)a; - trk = *(WT_TRACK **)b; - - len = WT_MIN(trk->trk_addr_size, addr->size); - ret = memcmp(addr->addr, trk->trk_addr, len); - if (ret == 0 && addr->size != trk->trk_addr_size) - ret = addr->size < trk->trk_addr_size ? -1 : 1; - return (ret); + WT_ADDR *addr; + WT_DECL_RET; + WT_TRACK *trk; + size_t len; + + addr = (WT_ADDR *)a; + trk = *(WT_TRACK **)b; + + len = WT_MIN(trk->trk_addr_size, addr->size); + ret = memcmp(addr->addr, trk->trk_addr, len); + if (ret == 0 && addr->size != trk->trk_addr_size) + ret = addr->size < trk->trk_addr_size ? -1 : 1; + return (ret); } /* * __slvg_ovfl_reconcile -- - * Review relationships between leaf pages and the overflow pages, delete - * leaf pages until there's a one-to-one relationship between leaf and overflow - * pages. + * Review relationships between leaf pages and the overflow pages, delete leaf pages until + * there's a one-to-one relationship between leaf and overflow pages. 
*/ static int __slvg_ovfl_reconcile(WT_SESSION_IMPL *session, WT_STUFF *ss) { - WT_ADDR *addr; - WT_DECL_RET; - WT_TRACK **searchp, *trk; - uint32_t i, j, *slot; - - slot = NULL; - - /* - * If an overflow page is referenced more than once, discard leaf pages - * with the lowest LSNs until overflow pages are only referenced once. - * - * This requires sorting the page list by LSN, and the overflow array by - * address cookie. - */ - __wt_qsort(ss->pages, - (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_gen); - __wt_qsort(ss->ovfl, - (size_t)ss->ovfl_next, sizeof(WT_TRACK *), __slvg_trk_compare_addr); - - /* - * Walk the list of pages and discard any pages referencing non-existent - * overflow pages or referencing overflow pages also referenced by pages - * with higher LSNs. Our caller sorted the page list by LSN, high to - * low, so we don't have to do explicit testing of the page LSNs, the - * first page to reference an overflow page is the best page to own it. - */ - for (i = 0; i < ss->pages_next; ++i) { - if ((trk = ss->pages[i]) == NULL || trk->trk_ovfl_cnt == 0) - continue; - - WT_ERR(__wt_calloc_def(session, trk->trk_ovfl_cnt, &slot)); - for (j = 0; j < trk->trk_ovfl_cnt; ++j) { - addr = &trk->trk_ovfl_addr[j]; - searchp = bsearch(addr, ss->ovfl, ss->ovfl_next, - sizeof(WT_TRACK *), __slvg_ovfl_compare); - - /* - * If the overflow page doesn't exist or if another page - * has already claimed it, this leaf page isn't usable. - */ - if (searchp != NULL && - !F_ISSET(*searchp, WT_TRACK_OVFL_REFD)) { - /* - * Convert each block address into a slot in the - * list of overflow pages as we go. 
- */ - slot[j] = (uint32_t)(searchp - ss->ovfl); - F_SET(*searchp, WT_TRACK_OVFL_REFD); - continue; - } - - __wt_verbose(session, WT_VERB_SALVAGE, - "%s references unavailable overflow page %s", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, ss->tmp1), - __wt_addr_string(session, - addr->addr, addr->size, ss->tmp2)); - - /* - * Clear the "referenced" flag for any overflow pages - * already claimed by this leaf page some other page - * might claim them. - */ - while (j > 0) - F_CLR(ss->ovfl[slot[--j]], WT_TRACK_OVFL_REFD); - trk = NULL; - WT_ERR(__slvg_trk_free(session, &ss->pages[i], true)); - break; - } - - /* - * We now have a reference to the overflow WT_TRACK, and so no - * longer need the page's address array, discard it. Note, we - * potentially freed the WT_TRACK in the loop above, check it's - * still valid. - */ - if (trk == NULL) - __wt_free(session, slot); - else { - __slvg_trk_free_addr(session, trk); - - trk->trk_ovfl_slot = slot; - slot = NULL; - } - } - return (0); - -err: __wt_free(session, slot); - return (ret); + WT_ADDR *addr; + WT_DECL_RET; + WT_TRACK **searchp, *trk; + uint32_t i, j, *slot; + + slot = NULL; + + /* + * If an overflow page is referenced more than once, discard leaf pages + * with the lowest LSNs until overflow pages are only referenced once. + * + * This requires sorting the page list by LSN, and the overflow array by + * address cookie. + */ + __wt_qsort(ss->pages, (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_gen); + __wt_qsort(ss->ovfl, (size_t)ss->ovfl_next, sizeof(WT_TRACK *), __slvg_trk_compare_addr); + + /* + * Walk the list of pages and discard any pages referencing non-existent overflow pages or + * referencing overflow pages also referenced by pages with higher LSNs. Our caller sorted the + * page list by LSN, high to low, so we don't have to do explicit testing of the page LSNs, the + * first page to reference an overflow page is the best page to own it. 
+ */ + for (i = 0; i < ss->pages_next; ++i) { + if ((trk = ss->pages[i]) == NULL || trk->trk_ovfl_cnt == 0) + continue; + + WT_ERR(__wt_calloc_def(session, trk->trk_ovfl_cnt, &slot)); + for (j = 0; j < trk->trk_ovfl_cnt; ++j) { + addr = &trk->trk_ovfl_addr[j]; + searchp = + bsearch(addr, ss->ovfl, ss->ovfl_next, sizeof(WT_TRACK *), __slvg_ovfl_compare); + + /* + * If the overflow page doesn't exist or if another page has already claimed it, this + * leaf page isn't usable. + */ + if (searchp != NULL && !F_ISSET(*searchp, WT_TRACK_OVFL_REFD)) { + /* + * Convert each block address into a slot in the list of overflow pages as we go. + */ + slot[j] = (uint32_t)(searchp - ss->ovfl); + F_SET(*searchp, WT_TRACK_OVFL_REFD); + continue; + } + + __wt_verbose(session, WT_VERB_SALVAGE, "%s references unavailable overflow page %s", + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, ss->tmp1), + __wt_addr_string(session, addr->addr, addr->size, ss->tmp2)); + + /* + * Clear the "referenced" flag for any overflow pages already claimed by this leaf page + * some other page might claim them. + */ + while (j > 0) + F_CLR(ss->ovfl[slot[--j]], WT_TRACK_OVFL_REFD); + trk = NULL; + WT_ERR(__slvg_trk_free(session, &ss->pages[i], true)); + break; + } + + /* + * We now have a reference to the overflow WT_TRACK, and so no longer need the page's + * address array, discard it. Note, we potentially freed the WT_TRACK in the loop above, + * check it's still valid. + */ + if (trk == NULL) + __wt_free(session, slot); + else { + __slvg_trk_free_addr(session, trk); + + trk->trk_ovfl_slot = slot; + slot = NULL; + } + } + return (0); + +err: + __wt_free(session, slot); + return (ret); } /* * __slvg_trk_compare_key -- - * Compare two WT_TRACK array entries by key, and secondarily, by LSN. + * Compare two WT_TRACK array entries by key, and secondarily, by LSN. 
*/ static int WT_CDECL __slvg_trk_compare_key(const void *a, const void *b) { - WT_SESSION_IMPL *session; - WT_TRACK *a_trk, *b_trk; - uint64_t a_gen, a_recno, b_gen, b_recno; - int cmp; - - a_trk = *(WT_TRACK **)a; - b_trk = *(WT_TRACK **)b; - - if (a_trk == NULL) - return (b_trk == NULL ? 0 : 1); - if (b_trk == NULL) - return (-1); - - switch (a_trk->ss->page_type) { - case WT_PAGE_COL_FIX: - case WT_PAGE_COL_VAR: - a_recno = a_trk->col_start; - b_recno = b_trk->col_start; - if (a_recno == b_recno) - break; - if (a_recno > b_recno) - return (1); - if (a_recno < b_recno) - return (-1); - break; - case WT_PAGE_ROW_LEAF: - /* - * XXX - * __wt_compare can potentially fail, and we're ignoring that - * error because this routine is called as an underlying qsort - * routine. - */ - session = a_trk->ss->session; - (void)__wt_compare(session, S2BT(session)->collator, - &a_trk->row_start, &b_trk->row_start, &cmp); - if (cmp != 0) - return (cmp); - break; - } - - /* - * If the primary keys compare equally, differentiate based on LSN. - * Sort from highest LSN to lowest, that is, the earlier pages in - * the array are more desirable. - */ - a_gen = a_trk->trk_gen; - b_gen = b_trk->trk_gen; - return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0)); + WT_SESSION_IMPL *session; + WT_TRACK *a_trk, *b_trk; + uint64_t a_gen, a_recno, b_gen, b_recno; + int cmp; + + a_trk = *(WT_TRACK **)a; + b_trk = *(WT_TRACK **)b; + + if (a_trk == NULL) + return (b_trk == NULL ? 0 : 1); + if (b_trk == NULL) + return (-1); + + switch (a_trk->ss->page_type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + a_recno = a_trk->col_start; + b_recno = b_trk->col_start; + if (a_recno == b_recno) + break; + if (a_recno > b_recno) + return (1); + if (a_recno < b_recno) + return (-1); + break; + case WT_PAGE_ROW_LEAF: + /* + * XXX + * __wt_compare can potentially fail, and we're ignoring that + * error because this routine is called as an underlying qsort + * routine. 
+ */ + session = a_trk->ss->session; + (void)__wt_compare( + session, S2BT(session)->collator, &a_trk->row_start, &b_trk->row_start, &cmp); + if (cmp != 0) + return (cmp); + break; + } + + /* + * If the primary keys compare equally, differentiate based on LSN. Sort from highest LSN to + * lowest, that is, the earlier pages in the array are more desirable. + */ + a_gen = a_trk->trk_gen; + b_gen = b_trk->trk_gen; + return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0)); } /* * __slvg_trk_compare_gen -- - * Compare two WT_TRACK array entries by LSN. + * Compare two WT_TRACK array entries by LSN. */ static int WT_CDECL __slvg_trk_compare_gen(const void *a, const void *b) { - WT_TRACK *a_trk, *b_trk; - uint64_t a_gen, b_gen; - - a_trk = *(WT_TRACK **)a; - b_trk = *(WT_TRACK **)b; - - /* - * Sort from highest LSN to lowest, that is, the earlier pages in the - * array are more desirable. - */ - a_gen = a_trk->trk_gen; - b_gen = b_trk->trk_gen; - return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0)); + WT_TRACK *a_trk, *b_trk; + uint64_t a_gen, b_gen; + + a_trk = *(WT_TRACK **)a; + b_trk = *(WT_TRACK **)b; + + /* + * Sort from highest LSN to lowest, that is, the earlier pages in the array are more desirable. + */ + a_gen = a_trk->trk_gen; + b_gen = b_trk->trk_gen; + return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0)); } /* * __slvg_merge_block_free -- - * Clean up backing file and overflow blocks after the merge phase. + * Clean up backing file and overflow blocks after the merge phase. */ static int __slvg_merge_block_free(WT_SESSION_IMPL *session, WT_STUFF *ss) { - WT_TRACK *trk; - uint32_t i; - - /* Free any underlying file blocks for merged pages. */ - for (i = 0; i < ss->pages_next; ++i) { - if ((trk = ss->pages[i]) == NULL) - continue; - if (F_ISSET(trk, WT_TRACK_MERGE)) - WT_RET(__slvg_trk_free(session, &ss->pages[i], true)); - } - - /* Free any unused overflow records. 
*/ - return (__slvg_ovfl_discard(session, ss)); + WT_TRACK *trk; + uint32_t i; + + /* Free any underlying file blocks for merged pages. */ + for (i = 0; i < ss->pages_next; ++i) { + if ((trk = ss->pages[i]) == NULL) + continue; + if (F_ISSET(trk, WT_TRACK_MERGE)) + WT_RET(__slvg_trk_free(session, &ss->pages[i], true)); + } + + /* Free any unused overflow records. */ + return (__slvg_ovfl_discard(session, ss)); } /* * __slvg_ovfl_ref -- - * Reference an overflow page, checking for multiple references. + * Reference an overflow page, checking for multiple references. */ static int __slvg_ovfl_ref(WT_SESSION_IMPL *session, WT_TRACK *trk, bool multi_panic) { - if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) { - if (!multi_panic) - return (__wt_set_return(session, EBUSY)); - WT_PANIC_RET(session, EINVAL, - "overflow record unexpectedly referenced multiple times " - "during leaf page merge"); - } - - F_SET(trk, WT_TRACK_OVFL_REFD); - return (0); + if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) { + if (!multi_panic) + return (__wt_set_return(session, EBUSY)); + WT_PANIC_RET(session, EINVAL, + "overflow record unexpectedly referenced multiple times " + "during leaf page merge"); + } + + F_SET(trk, WT_TRACK_OVFL_REFD); + return (0); } /* * __slvg_ovfl_ref_all -- - * Reference all of the page's overflow pages. + * Reference all of the page's overflow pages. */ static int __slvg_ovfl_ref_all(WT_SESSION_IMPL *session, WT_TRACK *trk) { - uint32_t i; + uint32_t i; - for (i = 0; i < trk->trk_ovfl_cnt; ++i) - WT_RET(__slvg_ovfl_ref( - session, trk->ss->ovfl[trk->trk_ovfl_slot[i]], 1)); + for (i = 0; i < trk->trk_ovfl_cnt; ++i) + WT_RET(__slvg_ovfl_ref(session, trk->ss->ovfl[trk->trk_ovfl_slot[i]], 1)); - return (0); + return (0); } /* * __slvg_ovfl_discard -- - * Discard unused overflow pages. + * Discard unused overflow pages. 
*/ static int __slvg_ovfl_discard(WT_SESSION_IMPL *session, WT_STUFF *ss) { - WT_TRACK *trk; - uint32_t i; - - /* - * Walk the overflow page array: if an overflow page isn't referenced, - * add its file blocks to the free list. - * - * Clear the reference flag (it's reused to figure out if the overflow - * record is referenced, but never used, by merged pages). - */ - for (i = 0; i < ss->ovfl_next; ++i) { - if ((trk = ss->ovfl[i]) == NULL) - continue; - - if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) { - F_CLR(trk, WT_TRACK_OVFL_REFD); - continue; - } - __wt_verbose(session, WT_VERB_SALVAGE, - "%s unused overflow page", - __wt_addr_string( - session, trk->trk_addr, trk->trk_addr_size, ss->tmp1)); - WT_RET(__slvg_trk_free(session, &ss->ovfl[i], true)); - } - - return (0); + WT_TRACK *trk; + uint32_t i; + + /* + * Walk the overflow page array: if an overflow page isn't referenced, + * add its file blocks to the free list. + * + * Clear the reference flag (it's reused to figure out if the overflow + * record is referenced, but never used, by merged pages). + */ + for (i = 0; i < ss->ovfl_next; ++i) { + if ((trk = ss->ovfl[i]) == NULL) + continue; + + if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) { + F_CLR(trk, WT_TRACK_OVFL_REFD); + continue; + } + __wt_verbose(session, WT_VERB_SALVAGE, "%s unused overflow page", + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, ss->tmp1)); + WT_RET(__slvg_trk_free(session, &ss->ovfl[i], true)); + } + + return (0); } /* * __slvg_cleanup -- - * Discard memory allocated to the page and overflow arrays. + * Discard memory allocated to the page and overflow arrays. */ static int __slvg_cleanup(WT_SESSION_IMPL *session, WT_STUFF *ss) { - uint32_t i; + uint32_t i; - /* Discard the leaf page array. */ - for (i = 0; i < ss->pages_next; ++i) - if (ss->pages[i] != NULL) - WT_RET(__slvg_trk_free(session, &ss->pages[i], false)); - __wt_free(session, ss->pages); + /* Discard the leaf page array. 
*/ + for (i = 0; i < ss->pages_next; ++i) + if (ss->pages[i] != NULL) + WT_RET(__slvg_trk_free(session, &ss->pages[i], false)); + __wt_free(session, ss->pages); - /* Discard the ovfl page array. */ - for (i = 0; i < ss->ovfl_next; ++i) - if (ss->ovfl[i] != NULL) - WT_RET(__slvg_trk_free(session, &ss->ovfl[i], false)); - __wt_free(session, ss->ovfl); + /* Discard the ovfl page array. */ + for (i = 0; i < ss->ovfl_next; ++i) + if (ss->ovfl[i] != NULL) + WT_RET(__slvg_trk_free(session, &ss->ovfl[i], false)); + __wt_free(session, ss->ovfl); - return (0); + return (0); } /* * __slvg_trk_free_addr -- - * Discard address information. + * Discard address information. */ static void __slvg_trk_free_addr(WT_SESSION_IMPL *session, WT_TRACK *trk) { - uint32_t i; + uint32_t i; - if (trk->trk_ovfl_addr != NULL) { - for (i = 0; i < trk->trk_ovfl_cnt; ++i) - __wt_free(session, trk->trk_ovfl_addr[i].addr); - __wt_free(session, trk->trk_ovfl_addr); - } + if (trk->trk_ovfl_addr != NULL) { + for (i = 0; i < trk->trk_ovfl_cnt; ++i) + __wt_free(session, trk->trk_ovfl_addr[i].addr); + __wt_free(session, trk->trk_ovfl_addr); + } } /* * __slvg_trk_free_block -- - * Discard underlying blocks. + * Discard underlying blocks. */ static int __slvg_trk_free_block(WT_SESSION_IMPL *session, WT_TRACK *trk) { - WT_BM *bm; + WT_BM *bm; - bm = S2BT(session)->bm; + bm = S2BT(session)->bm; - /* - * If freeing underlying file blocks or overflow pages, this is a page - * we were tracking but eventually decided not to use. - */ - __wt_verbose(session, WT_VERB_SALVAGE, - "%s blocks discarded: discard freed file bytes %" PRIu32, - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), trk->trk_size); + /* + * If freeing underlying file blocks or overflow pages, this is a page we were tracking but + * eventually decided not to use. 
+ */ + __wt_verbose(session, WT_VERB_SALVAGE, "%s blocks discarded: discard freed file bytes %" PRIu32, + __wt_addr_string(session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), trk->trk_size); - return (bm->free(bm, session, trk->trk_addr, trk->trk_addr_size)); + return (bm->free(bm, session, trk->trk_addr, trk->trk_addr_size)); } /* * __slvg_trk_free -- - * Discard a WT_TRACK structure and (optionally) its underlying blocks. + * Discard a WT_TRACK structure and (optionally) its underlying blocks. */ static int -__slvg_trk_free( - WT_SESSION_IMPL *session, WT_TRACK **trkp, bool free_on_last_ref) +__slvg_trk_free(WT_SESSION_IMPL *session, WT_TRACK **trkp, bool free_on_last_ref) { - WT_TRACK *trk; + WT_TRACK *trk; - trk = *trkp; - *trkp = NULL; + trk = *trkp; + *trkp = NULL; - /* - * If we're the last user of shared information, clean up. - */ - WT_ASSERT(session, trk->shared->ref > 0); - if (--trk->shared->ref == 0) { - /* - * If the free-on-last-ref flag is set, this chunk isn't going - * to use the backing physical blocks. As we're the last user - * of those blocks, nobody is going to use them and they can be - * discarded. - */ - if (free_on_last_ref) - WT_RET(__slvg_trk_free_block(session, trk)); + /* + * If we're the last user of shared information, clean up. + */ + WT_ASSERT(session, trk->shared->ref > 0); + if (--trk->shared->ref == 0) { + /* + * If the free-on-last-ref flag is set, this chunk isn't going to use the backing physical + * blocks. As we're the last user of those blocks, nobody is going to use them and they can + * be discarded. 
+ */ + if (free_on_last_ref) + WT_RET(__slvg_trk_free_block(session, trk)); - __wt_free(session, trk->trk_addr); + __wt_free(session, trk->trk_addr); - __slvg_trk_free_addr(session, trk); + __slvg_trk_free_addr(session, trk); - __wt_free(session, trk->trk_ovfl_slot); + __wt_free(session, trk->trk_ovfl_slot); - __wt_free(session, trk->shared); - } + __wt_free(session, trk->shared); + } - if (trk->ss->page_type == WT_PAGE_ROW_LEAF) { - __wt_buf_free(session, &trk->row_start); - __wt_buf_free(session, &trk->row_stop); - } + if (trk->ss->page_type == WT_PAGE_ROW_LEAF) { + __wt_buf_free(session, &trk->row_start); + __wt_buf_free(session, &trk->row_stop); + } - __wt_free(session, trk); + __wt_free(session, trk); - return (0); + return (0); } |