diff options
author | Michael Cahill <mjc@wiredtiger.com> | 2012-11-29 17:55:20 -0800 |
---|---|---|
committer | Michael Cahill <mjc@wiredtiger.com> | 2012-11-29 17:55:20 -0800 |
commit | 467f354bd61b188c6e8aaee19a710a98b659796b (patch) | |
tree | 7e59e60e19789bb681d46755439ed582bb81163a | |
parent | b0099cb3e9402b99b50889f11f46c14ba424227a (diff) | |
parent | 923e2881df5ce70afe495f2ef7db39ce86d66747 (diff) | |
download | mongo-467f354bd61b188c6e8aaee19a710a98b659796b.tar.gz |
Merge pull request #384 from wiredtiger/generation
Move the write-generation from the block layer into the btree layer.
-rw-r--r-- | src/block/block_addr.c | 4 | ||||
-rw-r--r-- | src/block/block_ckpt.c | 5 | ||||
-rw-r--r-- | src/block/block_mgr.c | 8 | ||||
-rw-r--r-- | src/block/block_slvg.c | 16 | ||||
-rw-r--r-- | src/block/block_write.c | 17 | ||||
-rw-r--r-- | src/btree/bt_handle.c | 77 | ||||
-rw-r--r-- | src/btree/bt_io.c | 22 | ||||
-rw-r--r-- | src/btree/bt_slvg.c | 29 | ||||
-rw-r--r-- | src/conn/conn_btree.c | 9 | ||||
-rw-r--r-- | src/include/block.h | 35 | ||||
-rw-r--r-- | src/include/btmem.h | 21 | ||||
-rw-r--r-- | src/include/btree.h | 2 | ||||
-rw-r--r-- | src/include/extern.h | 17 | ||||
-rw-r--r-- | src/include/meta.h | 2 | ||||
-rw-r--r-- | src/meta/meta_ckpt.c | 206 | ||||
-rw-r--r-- | src/txn/txn_ckpt.c | 8 | ||||
-rw-r--r-- | test/salvage/salvage.c | 2 |
17 files changed, 245 insertions, 235 deletions
diff --git a/src/block/block_addr.c b/src/block/block_addr.c index 68e350e1f12..2e18e73ab46 100644 --- a/src/block/block_addr.c +++ b/src/block/block_addr.c @@ -156,8 +156,6 @@ __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, ci->file_size = (off_t)a; WT_RET(__wt_vunpack_uint(pp, 0, &a)); ci->ckpt_size = a; - WT_RET(__wt_vunpack_uint(pp, 0, &a)); - ci->write_gen = a; return (0); } @@ -190,8 +188,6 @@ __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_RET(__wt_vpack_uint(pp, 0, a)); a = (uint64_t)ci->ckpt_size; WT_RET(__wt_vpack_uint(pp, 0, a)); - a = ci->write_gen; - WT_RET(__wt_vpack_uint(pp, 0, a)); return (0); } diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index 778426bb839..6bf04f08b13 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -745,10 +745,7 @@ __ckpt_string(WT_SESSION_IMPL *session, (uintmax_t)(ci->discard.offset + ci->discard.size), ci->discard.size, ci->discard.cksum)); WT_RET(__wt_buf_catfmt(session, buf, - ", file size=%" PRIuMAX - ", write generation=%" PRIu64, - (uintmax_t)ci->file_size, - ci->write_gen)); + ", file size=%" PRIuMAX, (uintmax_t)ci->file_size)); __wt_block_ckpt_destroy(session, ci); diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index e3104fcaf3d..8f65dcc3ab6 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -337,16 +337,16 @@ __wt_bm_salvage_start(WT_SESSION_IMPL *session) * Return the next block from the file. */ int -__wt_bm_salvage_next(WT_SESSION_IMPL *session, - uint8_t *addr, uint32_t *addr_sizep, uint64_t *write_genp, int *eofp) +__wt_bm_salvage_next( + WT_SESSION_IMPL *session, uint8_t *addr, uint32_t *addr_sizep, int *eofp) { WT_BLOCK *block; if ((block = session->btree->block) == NULL) return (__bm_invalid(session)); - return (__wt_block_salvage_next( - session, block, addr, addr_sizep, write_genp, eofp)); + return ( + __wt_block_salvage_next(session, block, addr, addr_sizep, eofp)); } /* diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c index 56a37806547..326df3c38cc 100644 --- a/src/block/block_slvg.c +++ b/src/block/block_slvg.c @@ -74,9 +74,8 @@ __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block) * Return the address for the next potential block from the file. */ int -__wt_block_salvage_next( - WT_SESSION_IMPL *session, WT_BLOCK *block, - uint8_t *addr, uint32_t *addr_sizep, uint64_t *write_genp, int *eofp) +__wt_block_salvage_next(WT_SESSION_IMPL *session, + WT_BLOCK *block, uint8_t *addr, uint32_t *addr_sizep, int *eofp) { WT_BLOCK_HEADER *blk; WT_DECL_ITEM(tmp); @@ -140,17 +139,6 @@ skip: WT_VERBOSE_ERR(session, salvage, session, block, offset, (off_t)allocsize)); } - /* - * Track the largest write-generation we've seen in the file so future - * writes, done after salvage completes, are preferred to these blocks. - * - * The read may have grown the buffer, reset our reference. - */ - blk = WT_BLOCK_HEADER_REF(tmp->mem); - *write_genp = blk->write_gen; - if (block->live.write_gen < blk->write_gen) - block->live.write_gen = blk->write_gen; - /* Re-create the address cookie that should reference this block. */ endp = addr; WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum)); diff --git a/src/block/block_write.c b/src/block/block_write.c index 5b7e5b4450b..7f1c401c882 100644 --- a/src/block/block_write.c +++ b/src/block/block_write.c @@ -99,23 +99,6 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size); /* - * We increment the block's write generation so it's easy to identify - * newer versions of blocks during salvage: it's common in WiredTiger - * for multiple blocks to be internally consistent with identical - * first and last keys, so we need a way to know the most recent state - * of the block. (We could check to see which leaf is referenced by - * by the internal page, which implies salvaging internal pages (which - * I don't want to do), and it's not quite as good anyway, because the - * internal page may not have been written to disk after the leaf page - * was updated. So, write generations it is. - * - * Nothing is locked at this point but two versions of a page with the - * same generation is pretty unlikely, and if we did, they're going to - * be roughly identical for the purposes of salvage, anyway. - */ - blk->write_gen = ++block->live.write_gen; - - /* * Set the disk size so we don't have to incrementally read blocks * during salvage. */ diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index f63661bb3a2..0675d2cf987 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -7,7 +7,7 @@ #include "wt_internal.h" -static int __btree_conf(WT_SESSION_IMPL *, const char *[]); +static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt, const char *[]); static int __btree_get_last_recno(WT_SESSION_IMPL *); static int __btree_page_sizes(WT_SESSION_IMPL *, const char *); static int __btree_tree_open_empty(WT_SESSION_IMPL *, int); @@ -40,28 +40,31 @@ __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename) * Open a Btree. */ int -__wt_btree_open(WT_SESSION_IMPL *session, - const uint8_t *addr, uint32_t addr_size, const char *cfg[], int readonly) +__wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BTREE *btree; + WT_CKPT ckpt; WT_CONFIG_ITEM cval; WT_DECL_RET; uint32_t root_addr_size; - int created, forced_salvage; - const char *filename; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; + int creation, forced_salvage, readonly; + const char *filename; btree = session->btree; + readonly = btree->checkpoint == NULL ? 0 : 1; - /* Initialize and configure the WT_BTREE structure. */ - WT_ERR(__btree_conf(session, cfg)); + /* Get the checkpoint information for this name/checkpoint pair. */ + WT_CLEAR(ckpt); + WT_RET(__wt_meta_checkpoint( + session, btree->name, btree->checkpoint, &ckpt)); /* * Bulk-load is only permitted on newly created files, not any empty * file -- see the checkpoint code for a discussion. */ - created = addr == NULL || addr_size == 0; - if (!created && F_ISSET(btree, WT_BTREE_BULK)) + creation = ckpt.raw.size == 0; + if (!creation && F_ISSET(btree, WT_BTREE_BULK)) WT_ERR_MSG(session, EINVAL, "bulk-load is only supported on newly created objects"); @@ -75,11 +78,13 @@ __wt_btree_open(WT_SESSION_IMPL *session, forced_salvage = 1; } + /* Initialize and configure the WT_BTREE structure. */ + WT_ERR(__btree_conf(session, &ckpt, cfg)); + /* Connect to the underlying block manager. */ filename = btree->name; if (!WT_PREFIX_SKIP(filename, "file:")) WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI"); - WT_ERR(__wt_bm_open( session, filename, btree->config, cfg, forced_salvage)); @@ -87,31 +92,33 @@ __wt_btree_open(WT_SESSION_IMPL *session, * Open the specified checkpoint unless it's a special command (special * commands are responsible for loading their own checkpoints, if any). */ - if (F_ISSET(btree, - WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) - return (0); - - /* - * There are two reasons to load an empty tree rather than a checkpoint: - * either there is no checkpoint (the file is being created), or the - * load call returns no root page (the checkpoint is for an empty file). - */ - WT_ERR(__wt_bm_checkpoint_load( - session, addr, addr_size, root_addr, &root_addr_size, readonly)); - if (created || root_addr_size == 0) - WT_ERR(__btree_tree_open_empty(session, created)); - else { - WT_ERR( - __wt_btree_tree_open(session, root_addr, root_addr_size)); - - /* Get the last record number in a column-store file. */ - if (btree->type != BTREE_ROW) - WT_ERR(__btree_get_last_recno(session)); + if (!F_ISSET(btree, + WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { + /* + * There are two reasons to load an empty tree rather than a + * checkpoint: either there is no checkpoint (the file is + * being created), or the load call returns no root page (the + * checkpoint is for an empty file). + */ + WT_ERR(__wt_bm_checkpoint_load(session, + ckpt.raw.data, ckpt.raw.size, + root_addr, &root_addr_size, readonly)); + if (creation || root_addr_size == 0) + WT_ERR(__btree_tree_open_empty(session, creation)); + else { + WT_ERR(__wt_btree_tree_open( + session, root_addr, root_addr_size)); + + /* Get the last record number in a column-store file. */ + if (btree->type != BTREE_ROW) + WT_ERR(__btree_get_last_recno(session)); + } } if (0) { err: (void)__wt_btree_close(session); } + __wt_meta_checkpoint_free(session, &ckpt); return (ret); } @@ -157,7 +164,7 @@ __wt_btree_close(WT_SESSION_IMPL *session) * Configure a WT_BTREE structure. */ static int -__btree_conf(WT_SESSION_IMPL *session, const char *cfg[]) +__btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt, const char *cfg[]) { WT_BTREE *btree; WT_CONFIG_ITEM cval; @@ -304,8 +311,8 @@ __btree_conf(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_stat_alloc_dsrc_stats(session, &btree->stats)); - /* The tree has not been modified. */ - btree->modified = 0; + btree->write_gen = ckpt->write_gen; /* Write generation */ + btree->modified = 0; /* Clean */ return (0); } @@ -347,7 +354,7 @@ err: __wt_buf_free(session, &dsk); * Create an empty in-memory tree. */ static int -__btree_tree_open_empty(WT_SESSION_IMPL *session, int created) +__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) { WT_BTREE *btree; WT_DECL_RET; @@ -362,7 +369,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int created) * loads; set a flag that's cleared when a row is inserted into the * tree. */ - if (created) + if (creation) btree->bulk_load_ok = 1; /* diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c index 174f7c6a994..6d8fb5b6149 100644 --- a/src/btree/bt_io.c +++ b/src/btree/bt_io.c @@ -233,12 +233,28 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, ip = tmp; } } + dsk = ip->mem; /* If the buffer is compressed, set the flag. */ - if (compressed) { - dsk = ip->mem; + if (compressed) F_SET(dsk, WT_PAGE_COMPRESSED); - } + + /* + * We increment the block's write generation so it's easy to identify + * newer versions of blocks during salvage. (It's common in WiredTiger, + * at least for the default block manager, for multiple blocks to be + * internally consistent with identical first and last keys, so we need + * a way to know the most recent state of the block. We could check + * which leaf is referenced by a valid internal page, but that implies + * salvaging internal pages, which I don't want to do, and it's not + * as good anyway, because the internal page may not have been written + * after the leaf page was updated. So, write generations it is. + * + * Nothing is locked at this point but two versions of a page with the + * same generation is pretty unlikely, and if we did, they're going to + * be roughly identical for the purposes of salvage, anyway. + */ + dsk->write_gen = ++btree->write_gen; /* * Checksum the data if the buffer isn't compressed or checksums are diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 45728934802..49d365e787c 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -125,11 +125,11 @@ static int __slvg_trk_free(WT_SESSION_IMPL *, WT_TRACK **, uint32_t); static int __slvg_trk_init(WT_SESSION_IMPL *, uint8_t *, uint32_t, uint32_t, uint64_t, WT_STUFF *, WT_TRACK **); static int __slvg_trk_leaf(WT_SESSION_IMPL *, - WT_PAGE_HEADER *, uint8_t *, uint32_t, uint64_t, WT_STUFF *); + WT_PAGE_HEADER *, uint8_t *, uint32_t, WT_STUFF *); static int __slvg_trk_leaf_ovfl( WT_SESSION_IMPL *, WT_PAGE_HEADER *, WT_TRACK *); static int __slvg_trk_ovfl(WT_SESSION_IMPL *, - WT_PAGE_HEADER *, uint8_t *, uint32_t, uint64_t, WT_STUFF *); + WT_PAGE_HEADER *, uint8_t *, uint32_t, WT_STUFF *); /* * __wt_bt_salvage -- @@ -312,7 +312,6 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) WT_DECL_ITEM(buf); WT_DECL_RET; WT_PAGE_HEADER *dsk; - uint64_t gen; uint32_t addrbuf_size; uint8_t addrbuf[WT_BTREE_MAX_ADDR_COOKIE]; int eof; @@ -323,7 +322,7 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) for (;;) { /* Get the next block address from the block manager. */ WT_ERR(__wt_bm_salvage_next( - session, addrbuf, &addrbuf_size, &gen, &eof)); + session, addrbuf, &addrbuf_size, &eof)); if (eof) break; @@ -388,7 +387,7 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) WT_VERBOSE_ERR(session, salvage, "tracking %s page, generation %" PRIu64 " %s", - __wt_page_type_string(dsk->type), gen, + __wt_page_type_string(dsk->type), dsk->write_gen, (const char *)as->data); switch (dsk->type) { @@ -405,11 +404,11 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) __wt_page_type_string(dsk->type)); WT_ERR(__slvg_trk_leaf( - session, dsk, addrbuf, addrbuf_size, gen, ss)); + session, dsk, addrbuf, addrbuf_size, ss)); break; case WT_PAGE_OVFL: WT_ERR(__slvg_trk_ovfl( - session, dsk, addrbuf, addrbuf_size, gen, ss)); + session, dsk, addrbuf, addrbuf_size, ss)); break; } } @@ -455,8 +454,8 @@ err: if (trk->addr.addr != NULL) * Track a leaf page. */ static int -__slvg_trk_leaf(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, - uint8_t *addr, uint32_t size, uint64_t gen, WT_STUFF *ss) +__slvg_trk_leaf(WT_SESSION_IMPL *session, + WT_PAGE_HEADER *dsk, uint8_t *addr, uint32_t size, WT_STUFF *ss) { WT_BTREE *btree; WT_CELL *cell; @@ -478,8 +477,8 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, (ss->pages_next + 1000) * sizeof(WT_TRACK *), &ss->pages)); /* Allocate a WT_TRACK entry for this new page and fill it in. */ - WT_RET( - __slvg_trk_init(session, addr, size, dsk->mem_size, gen, ss, &trk)); + WT_RET(__slvg_trk_init( + session, addr, size, dsk->mem_size, dsk->write_gen, ss, &trk)); switch (dsk->type) { case WT_PAGE_COL_FIX: @@ -573,8 +572,8 @@ err: __wt_free(session, trk); * Track an overflow page. */ static int -__slvg_trk_ovfl(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, - uint8_t *addr, uint32_t size, uint64_t gen, WT_STUFF *ss) +__slvg_trk_ovfl(WT_SESSION_IMPL *session, + WT_PAGE_HEADER *dsk, uint8_t *addr, uint32_t size, WT_STUFF *ss) { WT_TRACK *trk; @@ -586,8 +585,8 @@ __slvg_trk_ovfl(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, WT_RET(__wt_realloc(session, &ss->ovfl_allocated, (ss->ovfl_next + 1000) * sizeof(WT_TRACK *), &ss->ovfl)); - WT_RET( - __slvg_trk_init(session, addr, size, dsk->mem_size, gen, ss, &trk)); + WT_RET(__slvg_trk_init( + session, addr, size, dsk->mem_size, dsk->write_gen, ss, &trk)); ss->ovfl[ss->ovfl_next++] = trk; return (0); diff --git a/src/conn/conn_btree.c b/src/conn/conn_btree.c index 625a2822c0a..ea1e2d494e3 100644 --- a/src/conn/conn_btree.c +++ b/src/conn/conn_btree.c @@ -204,7 +204,6 @@ __conn_btree_open(WT_SESSION_IMPL *session, const char *config, const char *cfg[], uint32_t flags) { WT_BTREE *btree; - WT_DECL_ITEM(addr); WT_DECL_RET; btree = session->btree; @@ -226,16 +225,11 @@ __conn_btree_open(WT_SESSION_IMPL *session, if (F_ISSET(btree, WT_BTREE_OPEN)) WT_RET(__wt_conn_btree_sync_and_close(session)); - WT_RET(__wt_scr_alloc(session, WT_BTREE_MAX_ADDR_COOKIE, &addr)); - /* Set any special flags on the handle. */ F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS)); do { - WT_ERR(__wt_meta_checkpoint_addr( - session, btree->name, btree->checkpoint, addr)); - WT_ERR(__wt_btree_open(session, addr->data, addr->size, cfg, - btree->checkpoint == NULL ? 0 : 1)); + WT_ERR(__wt_btree_open(session, cfg)); F_SET(btree, WT_BTREE_OPEN); /* * Checkpoint handles are read only, so eviction calculations @@ -257,7 +251,6 @@ err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); (void)__wt_conn_btree_close(session, 1); } - __wt_scr_free(&addr); return (ret); } diff --git a/src/include/block.h b/src/include/block.h index 80a1c0026b5..327e922c2ec 100644 --- a/src/include/block.h +++ b/src/include/block.h @@ -128,8 +128,6 @@ struct __wt_block_ckpt { off_t file_size; /* Checkpoint file size */ uint64_t ckpt_size; /* Checkpoint byte count */ WT_EXTLIST ckpt_avail; /* Checkpoint free'd extents */ - - uint64_t write_gen; /* Write generation */ }; /* @@ -190,53 +188,40 @@ struct __wt_block_desc { */ struct __wt_block_header { /* - * We maintain page write-generations in the non-transactional case - * (where, instead of a transactional LSN, the value is a counter), - * as that's how salvage can determine the most recent page between - * pages overlapping the same key range. - * - * !!! - * The write-generation is "owned" by the btree layer, but it's easier - * to set (when physically writing blocks) and restore (during salvage), - * in the block-manager layer. - */ - uint64_t write_gen; /* 00-07: write generation */ - - /* * We write the page size in the on-disk page header because it makes * salvage easier. (If we don't know the expected page length, we'd * have to read increasingly larger chunks from the file until we find * one that checksums, and that's going to be harsh given WiredTiger's * potentially large page sizes.) */ - uint32_t disk_size; /* 08-11: on-disk page size */ + uint32_t disk_size; /* 00-03: on-disk page size */ /* - * Page checksums are stored in two places. First, a page's checksum - * is in the internal page that references a page as part of the - * address cookie. This is done to improve the chances of detecting - * not only disk corruption but software bugs (for example, overwriting - * a page with another valid page image). Second, a page's checksum is + * Page checksums are stored in two places. First, the page checksum + * is written within the internal page that references it as part of + * the address cookie. This is done to improve the chances of detecting + * not only disk corruption but other bugs (for example, overwriting a + * page with another valid page image). Second, a page's checksum is * stored in the disk header. This is for salvage, so salvage knows it * has found a page that may be useful. */ - uint32_t cksum; /* 12-15: checksum */ + uint32_t cksum; /* 04-07: checksum */ #define WT_BLOCK_DATA_CKSUM 0x01 /* Block data is part of the checksum */ - uint8_t flags; /* 16: flags */ + uint8_t flags; /* 08: flags */ /* * End the structure with 3 bytes of padding: it wastes space, but it * leaves the structure 32-bit aligned and having a few bytes to play * with in the future can't hurt. */ - uint8_t unused[3]; /* 17-19: unused padding */ + uint8_t unused[3]; /* 09-11: unused padding */ }; /* * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if * the compiler inserts padding it will break the world. */ -#define WT_BLOCK_HEADER_SIZE 20 +#define WT_BLOCK_HEADER_SIZE 12 /* * WT_BLOCK_HEADER_BYTE diff --git a/src/include/btmem.h b/src/include/btmem.h index 8d976f00cec..bd89c533efc 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -19,33 +19,40 @@ struct __wt_page_header { uint64_t recno; /* 00-07: column-store starting recno */ /* + * We maintain page write-generations in the non-transactional case + * as that's how salvage can determine the most recent page between + * pages overlapping the same key range. + */ + uint64_t write_gen; /* 08-15: write generation */ + + /* * The page's in-memory size isn't rounded or aligned, it's the actual * number of bytes the disk-image consumes when instantiated in memory. */ - uint32_t mem_size; /* 08-11: in-memory page size */ + uint32_t mem_size; /* 16-19: in-memory page size */ union { - uint32_t entries; /* 12-15: number of cells on page */ - uint32_t datalen; /* 12-15: overflow data length */ + uint32_t entries; /* 20-23: number of cells on page */ + uint32_t datalen; /* 20-23: overflow data length */ } u; - uint8_t type; /* 16: page type */ + uint8_t type; /* 24: page type */ #define WT_PAGE_COMPRESSED 0x01 /* Page is compressed on disk */ - uint8_t flags; /* 17: flags */ + uint8_t flags; /* 25: flags */ /* * End the structure with 2 bytes of padding: it wastes space, but it * leaves the structure 32-bit aligned and having a few bytes to play * with in the future can't hurt. */ - uint8_t unused[2]; /* 18-19: unused padding */ + uint8_t unused[2]; /* 26-27: unused padding */ }; /* * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if * the compiler inserts padding it will break the world. */ -#define WT_PAGE_HEADER_SIZE 20 +#define WT_PAGE_HEADER_SIZE 28 /* * The block-manager specific information immediately follows the WT_PAGE_DISK diff --git a/src/include/btree.h b/src/include/btree.h index a59e5da8f52..c8722b9f91c 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -123,6 +123,8 @@ struct __wt_btree { void *block; /* Block manager */ u_int block_header; /* Block manager header length */ + uint64_t write_gen; /* Write generation */ + WT_PAGE *evict_page; /* Eviction thread's location */ uint64_t evict_priority; /* Relative priority of cached pages. */ volatile uint32_t lru_count; /* Count of threads in LRU eviction */ diff --git a/src/include/extern.h b/src/include/extern.h index 0f4c1c550b6..1c1a53f07c3 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -173,10 +173,9 @@ extern int __wt_bm_write(WT_SESSION_IMPL *session, int data_cksum); extern int __wt_bm_stat(WT_SESSION_IMPL *session); extern int __wt_bm_salvage_start(WT_SESSION_IMPL *session); -extern int __wt_bm_salvage_next(WT_SESSION_IMPL *session, +extern int __wt_bm_salvage_next( WT_SESSION_IMPL *session, uint8_t *addr, uint32_t *addr_sizep, - uint64_t *write_genp, int *eofp); extern int __wt_bm_salvage_valid( WT_SESSION_IMPL *session, uint8_t *addr, @@ -211,11 +210,10 @@ extern int __wt_block_read_off(WT_SESSION_IMPL *session, uint32_t cksum); extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block); -extern int __wt_block_salvage_next( WT_SESSION_IMPL *session, +extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, uint32_t *addr_sizep, - uint64_t *write_genp, int *eofp); extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, @@ -323,11 +321,7 @@ extern void *__wt_cache_evict_server(void *arg); extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app); extern int __wt_btree_create(WT_SESSION_IMPL *session, const char *filename); extern int __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename); -extern int __wt_btree_open(WT_SESSION_IMPL *session, - const uint8_t *addr, - uint32_t addr_size, - const char *cfg[], - int readonly); +extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_btree_close(WT_SESSION_IMPL *session); extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, @@ -812,10 +806,10 @@ extern int __wt_meta_btree_apply(WT_SESSION_IMPL *session, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); -extern int __wt_meta_checkpoint_addr(WT_SESSION_IMPL *session, +extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint, - WT_ITEM *addr); + WT_CKPT *ckpt); extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session, const char *fname, const char **namep); @@ -829,6 +823,7 @@ extern int __wt_meta_ckptlist_set( WT_SESSION_IMPL *session, WT_CKPT *ckptbase); extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase); +extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt); extern int __wt_metadata_open(WT_SESSION_IMPL *session); extern int __wt_metadata_load_backup(WT_SESSION_IMPL *session); extern int __wt_metadata_cursor( WT_SESSION_IMPL *session, diff --git a/src/include/meta.h b/src/include/meta.h index 6e0c9d01111..b2a9541d4c0 100644 --- a/src/include/meta.h +++ b/src/include/meta.h @@ -38,6 +38,8 @@ struct __wt_ckpt { uint64_t ckpt_size; /* Checkpoint size */ + uint64_t write_gen; /* Write generation */ + void *bpriv; /* Block manager private */ #define WT_CKPT_ADD 0x01 /* Checkpoint to be added */ diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c index 1588711e0ec..844d625c985 100644 --- a/src/meta/meta_ckpt.c +++ b/src/meta/meta_ckpt.c @@ -7,20 +7,22 @@ #include "wt_internal.h" -static int __ckpt_last_addr(WT_SESSION_IMPL *, const char *, WT_ITEM *); -static int __ckpt_last_name(WT_SESSION_IMPL *, const char *, const char **); -static int __ckpt_named_addr( - WT_SESSION_IMPL *, const char *, const char *, WT_ITEM *); -static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *); -static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *); +static int __ckpt_last(WT_SESSION_IMPL *, const char *, WT_CKPT *); +static int __ckpt_last_name(WT_SESSION_IMPL *, const char *, const char **); +static int __ckpt_load(WT_SESSION_IMPL *, + WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, WT_CKPT *); +static int __ckpt_named( + WT_SESSION_IMPL *, const char *, const char *, WT_CKPT *); +static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *); +static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *); /* - * __wt_meta_checkpoint_addr -- - * Return a file's checkpoint address. + * __wt_meta_checkpoint -- + * Return a file's checkpoint information. */ int -__wt_meta_checkpoint_addr(WT_SESSION_IMPL *session, - const char *fname, const char *checkpoint, WT_ITEM *addr) +__wt_meta_checkpoint(WT_SESSION_IMPL *session, + const char *fname, const char *checkpoint, WT_CKPT *ckpt) { WT_DECL_RET; const char *config; @@ -41,14 +43,13 @@ __wt_meta_checkpoint_addr(WT_SESSION_IMPL *session, * data" and let our caller handle it. */ if (checkpoint == NULL) { - if ((ret = - __ckpt_last_addr(session, config, addr)) == WT_NOTFOUND) { + if ((ret = __ckpt_last(session, config, ckpt)) == WT_NOTFOUND) { ret = 0; - addr->data = NULL; - addr->size = 0; + ckpt->addr.data = ckpt->raw.data = NULL; + ckpt->addr.size = ckpt->raw.size = 0; } } else - WT_ERR(__ckpt_named_addr(session, checkpoint, config, addr)); + WT_ERR(__ckpt_named(session, checkpoint, config, ckpt)); err: __wt_free(session, config); return (ret); @@ -125,15 +126,15 @@ err: __wt_free(session, config); } /* - * __ckpt_named_addr -- - * Return the cookie associated with a file's named checkpoint. + * __ckpt_named -- + * Return the information associated with a file's named checkpoint. */ static int -__ckpt_named_addr(WT_SESSION_IMPL *session, - const char *checkpoint, const char *config, WT_ITEM *addr) +__ckpt_named(WT_SESSION_IMPL *session, + const char *checkpoint, const char *config, WT_CKPT *ckpt) { WT_CONFIG ckptconf; - WT_CONFIG_ITEM a, k, v; + WT_CONFIG_ITEM k, v; WT_RET(__wt_config_getones(session, config, "checkpoint", &v)); WT_RET(__wt_config_subinit(session, &ckptconf, &v)); @@ -143,23 +144,18 @@ __ckpt_named_addr(WT_SESSION_IMPL *session, * checkpoint of any name. */ while (__wt_config_next(&ckptconf, &k, &v) == 0) - if (WT_STRING_MATCH(checkpoint, k.str, k.len)) { - WT_RET(__wt_config_subgets(session, &v, "addr", &a)); - if (a.len != 0) - WT_RET(__wt_nhex_to_raw( - session, a.str, a.len, addr)); - return (0); - } + if (WT_STRING_MATCH(checkpoint, k.str, k.len)) + return (__ckpt_load(session, &k, &v, ckpt)); + return (WT_NOTFOUND); } /* - * __ckpt_last_addr -- - * Return the cookie associated with the file's last checkpoint. + * __ckpt_last -- + * Return the information associated with the file's last checkpoint. */ static int -__ckpt_last_addr( - WT_SESSION_IMPL *session, const char *config, WT_ITEM *addr) +__ckpt_last(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt) { WT_CONFIG ckptconf; WT_CONFIG_ITEM a, k, v; @@ -170,17 +166,13 @@ __ckpt_last_addr( for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) { /* Ignore checkpoints before the ones we've already seen. */ WT_RET(__wt_config_subgets(session, &v, "order", &a)); - if (found && a.val < found) - continue; + if (found) { + if (a.val < found) + continue; + __wt_meta_checkpoint_free(session, ckpt); + } found = a.val; - - /* - * Copy out the address; our caller wants the raw cookie, not - * the hex. - */ - WT_RET(__wt_config_subgets(session, &v, "addr", &a)); - if (a.len != 0) - WT_RET(__wt_nhex_to_raw(session, a.str, a.len, addr)); + WT_RET(__ckpt_load(session, &k, &v, ckpt)); } return (found ? 0 : WT_NOTFOUND); @@ -257,12 +249,11 @@ __wt_meta_ckptlist_get( { WT_CKPT *ckpt, *ckptbase; WT_CONFIG ckptconf; - WT_CONFIG_ITEM a, k, v; + WT_CONFIG_ITEM k, v; WT_DECL_RET; WT_ITEM *buf; size_t allocated, slot; const char *config; - char timebuf[64]; *ckptbasep = NULL; @@ -284,39 +275,7 @@ __wt_meta_ckptlist_get( (slot + 50) * sizeof(WT_CKPT), &ckptbase)); ckpt = &ckptbase[slot]; - /* - * Copy the name, address (raw and hex), order and time - * into the slot. If there's no address, it's a fake. - */ - WT_ERR( - __wt_strndup(session, k.str, k.len, &ckpt->name)); - - WT_ERR(__wt_config_subgets(session, &v, "addr", &a)); - WT_ERR( - __wt_buf_set(session, &ckpt->addr, a.str, a.len)); - if (a.len == 0) - F_SET(ckpt, WT_CKPT_FAKE); - else - WT_ERR(__wt_nhex_to_raw( - session, a.str, a.len, &ckpt->raw)); - - WT_ERR(__wt_config_subgets(session, &v, "order", &a)); - if (a.val == 0) - goto format; - ckpt->order = a.val; - - WT_ERR(__wt_config_subgets(session, &v, "time", &a)); - if (a.len == 0) - goto format; - if (a.len > sizeof(timebuf) - 1) - goto format; - memcpy(timebuf, a.str, a.len); - timebuf[a.len] = '\0'; - if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1) - goto format; - - WT_ERR(__wt_config_subgets(session, &v, "size", &a)); - ckpt->ckpt_size = (uint64_t)a.val; + WT_ERR(__ckpt_load(session, &k, &v, ckpt)); } /* @@ -340,7 +299,6 @@ __wt_meta_ckptlist_get( *ckptbasep = ckptbase; if (0) { -format: WT_ERR_MSG(session, WT_ERROR, "corrupted checkpoint list"); err: __wt_meta_ckptlist_free(session, ckptbase); } __wt_free(session, config); @@ -350,6 +308,62 @@ err: __wt_meta_ckptlist_free(session, ckptbase); } /* + * __ckpt_load -- + * Load a single checkpoint's information into a WT_CKPT structure. + */ +static int +__ckpt_load(WT_SESSION_IMPL *session, + WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v, WT_CKPT *ckpt) +{ + WT_CONFIG_ITEM a; + char timebuf[64]; + + /* + * Copy the name, address (raw and hex), order and time into the slot. + * If there's no address, it's a fake. + */ + WT_RET(__wt_strndup(session, k->str, k->len, &ckpt->name)); + + WT_RET(__wt_config_subgets(session, v, "addr", &a)); + WT_RET(__wt_buf_set(session, &ckpt->addr, a.str, a.len)); + if (a.len == 0) + F_SET(ckpt, WT_CKPT_FAKE); + else + WT_RET(__wt_nhex_to_raw(session, a.str, a.len, &ckpt->raw)); + + WT_RET(__wt_config_subgets(session, v, "order", &a)); + if (a.len == 0) + goto format; + ckpt->order = a.val; + + WT_RET(__wt_config_subgets(session, v, "time", &a)); + if (a.len == 0 || a.len > sizeof(timebuf) - 1) + goto format; + memcpy(timebuf, a.str, a.len); + timebuf[a.len] = '\0'; + if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1) + goto format; + + WT_RET(__wt_config_subgets(session, v, "size", &a)); + ckpt->ckpt_size = (uint64_t)a.val; + + WT_RET(__wt_config_subgets(session, v, "write_gen", &a)); + if (a.len == 0) + goto format; + /* + * The largest value a WT_CONFIG_ITEM can handle is signed: this value + * appears on disk and I don't want to sign it there, so I'm casting it + * here instead. + */ + ckpt->write_gen = (uint64_t)a.val; + + return (0); + +format: + WT_RET_MSG(session, WT_ERROR, "corrupted checkpoint list"); +} + +/* * __wt_meta_ckptlist_set -- * Set a file's checkpoint value from the WT_CKPT list. */ @@ -414,17 +428,21 @@ __wt_meta_ckptlist_set( if (strcmp(ckpt->name, WT_CHECKPOINT) == 0) WT_ERR(__wt_buf_catfmt(session, buf, "%s%s.%" PRId64 "=(addr=\"%.*s\",order=%" PRIu64 - ",time=%" PRIuMAX ",size=%" PRIu64 ")", + ",time=%" PRIuMAX ",size=%" PRIu64 + ",write_gen=%" PRIu64 ")", sep, ckpt->name, ckpt->order, (int)ckpt->addr.size, (char *)ckpt->addr.data, - ckpt->order, ckpt->sec, ckpt->ckpt_size)); + ckpt->order, ckpt->sec, ckpt->ckpt_size, + ckpt->write_gen)); else WT_ERR(__wt_buf_catfmt(session, buf, "%s%s=(addr=\"%.*s\",order=%" PRIu64 - ",time=%" PRIuMAX ",size=%" PRIu64 ")", + ",time=%" PRIuMAX ",size=%" PRIu64 + ",write_gen=%" PRIu64 ")", sep, ckpt->name, (int)ckpt->addr.size, (char *)ckpt->addr.data, - ckpt->order, ckpt->sec, ckpt->ckpt_size)); + ckpt->order, ckpt->sec, ckpt->ckpt_size, + ckpt->write_gen)); sep = ","; } WT_ERR(__wt_buf_catfmt(session, buf, ")")); @@ -447,16 +465,30 @@ __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) if (ckptbase == NULL) return; - WT_CKPT_FOREACH(ckptbase, ckpt) { - __wt_free(session, ckpt->name); - __wt_buf_free(session, &ckpt->addr); - __wt_buf_free(session, &ckpt->raw); - __wt_free(session, ckpt->bpriv); - } + WT_CKPT_FOREACH(ckptbase, ckpt) + __wt_meta_checkpoint_free(session, ckpt); __wt_free(session, ckptbase); } /* + * __wt_meta_checkpoint_free -- + * Clean up a single checkpoint structure. + */ +void +__wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt) +{ + if (ckpt == NULL) + return; + + __wt_free(session, ckpt->name); + __wt_buf_free(session, &ckpt->addr); + __wt_buf_free(session, &ckpt->raw); + __wt_free(session, ckpt->bpriv); + + WT_CLEAR(*ckpt); /* Clear to prepare for re-use. */ +} + +/* * __ckpt_version_chk -- * Check the version major/minor numbers. */ diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 34eaf68ffd4..cde9a491943 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -580,6 +580,14 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_bt_cache_flush(session, ckptbase, is_checkpoint ? WT_SYNC : WT_SYNC_DISCARD)); + /* + * All blocks being written have been written; set the object's write + * generation. + */ + WT_CKPT_FOREACH(ckptbase, ckpt) + if (F_ISSET(ckpt, WT_CKPT_ADD)) + ckpt->write_gen = btree->write_gen; + fake: /* Update the object's metadata. */ txn->isolation = TXN_ISO_READ_UNCOMMITTED; diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c index dffb672a871..437716970b9 100644 --- a/test/salvage/salvage.c +++ b/test/salvage/salvage.c @@ -574,8 +574,8 @@ copy(u_int gen, u_int recno) dsk = (void *)buf; if (page_type != WT_PAGE_ROW_LEAF) dsk->recno = recno; + dsk->write_gen = gen; blk = WT_BLOCK_HEADER_REF(buf); - blk->write_gen = gen; blk->cksum = 0; blk->cksum = __wt_cksum(dsk, PSIZE); assert(fwrite(buf, 1, PSIZE, ofp) == PSIZE); |