summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Michael Cahill <mjc@wiredtiger.com> 2012-11-29 17:55:20 -0800
committer: Michael Cahill <mjc@wiredtiger.com> 2012-11-29 17:55:20 -0800
commit: 467f354bd61b188c6e8aaee19a710a98b659796b (patch)
tree: 7e59e60e19789bb681d46755439ed582bb81163a
parent: b0099cb3e9402b99b50889f11f46c14ba424227a (diff)
parent: 923e2881df5ce70afe495f2ef7db39ce86d66747 (diff)
download: mongo-467f354bd61b188c6e8aaee19a710a98b659796b.tar.gz
Merge pull request #384 from wiredtiger/generation
Move the write-generation from the block layer into the btree layer.
-rw-r--r-- src/block/block_addr.c | 4
-rw-r--r-- src/block/block_ckpt.c | 5
-rw-r--r-- src/block/block_mgr.c | 8
-rw-r--r-- src/block/block_slvg.c | 16
-rw-r--r-- src/block/block_write.c | 17
-rw-r--r-- src/btree/bt_handle.c | 77
-rw-r--r-- src/btree/bt_io.c | 22
-rw-r--r-- src/btree/bt_slvg.c | 29
-rw-r--r-- src/conn/conn_btree.c | 9
-rw-r--r-- src/include/block.h | 35
-rw-r--r-- src/include/btmem.h | 21
-rw-r--r-- src/include/btree.h | 2
-rw-r--r-- src/include/extern.h | 17
-rw-r--r-- src/include/meta.h | 2
-rw-r--r-- src/meta/meta_ckpt.c | 206
-rw-r--r-- src/txn/txn_ckpt.c | 8
-rw-r--r-- test/salvage/salvage.c | 2
17 files changed, 245 insertions, 235 deletions
diff --git a/src/block/block_addr.c b/src/block/block_addr.c
index 68e350e1f12..2e18e73ab46 100644
--- a/src/block/block_addr.c
+++ b/src/block/block_addr.c
@@ -156,8 +156,6 @@ __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
ci->file_size = (off_t)a;
WT_RET(__wt_vunpack_uint(pp, 0, &a));
ci->ckpt_size = a;
- WT_RET(__wt_vunpack_uint(pp, 0, &a));
- ci->write_gen = a;
return (0);
}
@@ -190,8 +188,6 @@ __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session,
WT_RET(__wt_vpack_uint(pp, 0, a));
a = (uint64_t)ci->ckpt_size;
WT_RET(__wt_vpack_uint(pp, 0, a));
- a = ci->write_gen;
- WT_RET(__wt_vpack_uint(pp, 0, a));
return (0);
}
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
index 778426bb839..6bf04f08b13 100644
--- a/src/block/block_ckpt.c
+++ b/src/block/block_ckpt.c
@@ -745,10 +745,7 @@ __ckpt_string(WT_SESSION_IMPL *session,
(uintmax_t)(ci->discard.offset + ci->discard.size),
ci->discard.size, ci->discard.cksum));
WT_RET(__wt_buf_catfmt(session, buf,
- ", file size=%" PRIuMAX
- ", write generation=%" PRIu64,
- (uintmax_t)ci->file_size,
- ci->write_gen));
+ ", file size=%" PRIuMAX, (uintmax_t)ci->file_size));
__wt_block_ckpt_destroy(session, ci);
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index e3104fcaf3d..8f65dcc3ab6 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -337,16 +337,16 @@ __wt_bm_salvage_start(WT_SESSION_IMPL *session)
* Return the next block from the file.
*/
int
-__wt_bm_salvage_next(WT_SESSION_IMPL *session,
- uint8_t *addr, uint32_t *addr_sizep, uint64_t *write_genp, int *eofp)
+__wt_bm_salvage_next(
+ WT_SESSION_IMPL *session, uint8_t *addr, uint32_t *addr_sizep, int *eofp)
{
WT_BLOCK *block;
if ((block = session->btree->block) == NULL)
return (__bm_invalid(session));
- return (__wt_block_salvage_next(
- session, block, addr, addr_sizep, write_genp, eofp));
+ return (
+ __wt_block_salvage_next(session, block, addr, addr_sizep, eofp));
}
/*
diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c
index 56a37806547..326df3c38cc 100644
--- a/src/block/block_slvg.c
+++ b/src/block/block_slvg.c
@@ -74,9 +74,8 @@ __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
* Return the address for the next potential block from the file.
*/
int
-__wt_block_salvage_next(
- WT_SESSION_IMPL *session, WT_BLOCK *block,
- uint8_t *addr, uint32_t *addr_sizep, uint64_t *write_genp, int *eofp)
+__wt_block_salvage_next(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, uint8_t *addr, uint32_t *addr_sizep, int *eofp)
{
WT_BLOCK_HEADER *blk;
WT_DECL_ITEM(tmp);
@@ -140,17 +139,6 @@ skip: WT_VERBOSE_ERR(session, salvage,
session, block, offset, (off_t)allocsize));
}
- /*
- * Track the largest write-generation we've seen in the file so future
- * writes, done after salvage completes, are preferred to these blocks.
- *
- * The read may have grown the buffer, reset our reference.
- */
- blk = WT_BLOCK_HEADER_REF(tmp->mem);
- *write_genp = blk->write_gen;
- if (block->live.write_gen < blk->write_gen)
- block->live.write_gen = blk->write_gen;
-
/* Re-create the address cookie that should reference this block. */
endp = addr;
WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
diff --git a/src/block/block_write.c b/src/block/block_write.c
index 5b7e5b4450b..7f1c401c882 100644
--- a/src/block/block_write.c
+++ b/src/block/block_write.c
@@ -99,23 +99,6 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);
/*
- * We increment the block's write generation so it's easy to identify
- * newer versions of blocks during salvage: it's common in WiredTiger
- * for multiple blocks to be internally consistent with identical
- * first and last keys, so we need a way to know the most recent state
- * of the block. (We could check to see which leaf is referenced by
- * by the internal page, which implies salvaging internal pages (which
- * I don't want to do), and it's not quite as good anyway, because the
- * internal page may not have been written to disk after the leaf page
- * was updated. So, write generations it is.
- *
- * Nothing is locked at this point but two versions of a page with the
- * same generation is pretty unlikely, and if we did, they're going to
- * be roughly identical for the purposes of salvage, anyway.
- */
- blk->write_gen = ++block->live.write_gen;
-
- /*
* Set the disk size so we don't have to incrementally read blocks
* during salvage.
*/
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index f63661bb3a2..0675d2cf987 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -7,7 +7,7 @@
#include "wt_internal.h"
-static int __btree_conf(WT_SESSION_IMPL *, const char *[]);
+static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt, const char *[]);
static int __btree_get_last_recno(WT_SESSION_IMPL *);
static int __btree_page_sizes(WT_SESSION_IMPL *, const char *);
static int __btree_tree_open_empty(WT_SESSION_IMPL *, int);
@@ -40,28 +40,31 @@ __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename)
* Open a Btree.
*/
int
-__wt_btree_open(WT_SESSION_IMPL *session,
- const uint8_t *addr, uint32_t addr_size, const char *cfg[], int readonly)
+__wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_BTREE *btree;
+ WT_CKPT ckpt;
WT_CONFIG_ITEM cval;
WT_DECL_RET;
uint32_t root_addr_size;
- int created, forced_salvage;
- const char *filename;
uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
+ int creation, forced_salvage, readonly;
+ const char *filename;
btree = session->btree;
+ readonly = btree->checkpoint == NULL ? 0 : 1;
- /* Initialize and configure the WT_BTREE structure. */
- WT_ERR(__btree_conf(session, cfg));
+ /* Get the checkpoint information for this name/checkpoint pair. */
+ WT_CLEAR(ckpt);
+ WT_RET(__wt_meta_checkpoint(
+ session, btree->name, btree->checkpoint, &ckpt));
/*
* Bulk-load is only permitted on newly created files, not any empty
* file -- see the checkpoint code for a discussion.
*/
- created = addr == NULL || addr_size == 0;
- if (!created && F_ISSET(btree, WT_BTREE_BULK))
+ creation = ckpt.raw.size == 0;
+ if (!creation && F_ISSET(btree, WT_BTREE_BULK))
WT_ERR_MSG(session, EINVAL,
"bulk-load is only supported on newly created objects");
@@ -75,11 +78,13 @@ __wt_btree_open(WT_SESSION_IMPL *session,
forced_salvage = 1;
}
+ /* Initialize and configure the WT_BTREE structure. */
+ WT_ERR(__btree_conf(session, &ckpt, cfg));
+
/* Connect to the underlying block manager. */
filename = btree->name;
if (!WT_PREFIX_SKIP(filename, "file:"))
WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI");
-
WT_ERR(__wt_bm_open(
session, filename, btree->config, cfg, forced_salvage));
@@ -87,31 +92,33 @@ __wt_btree_open(WT_SESSION_IMPL *session,
* Open the specified checkpoint unless it's a special command (special
* commands are responsible for loading their own checkpoints, if any).
*/
- if (F_ISSET(btree,
- WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
- return (0);
-
- /*
- * There are two reasons to load an empty tree rather than a checkpoint:
- * either there is no checkpoint (the file is being created), or the
- * load call returns no root page (the checkpoint is for an empty file).
- */
- WT_ERR(__wt_bm_checkpoint_load(
- session, addr, addr_size, root_addr, &root_addr_size, readonly));
- if (created || root_addr_size == 0)
- WT_ERR(__btree_tree_open_empty(session, created));
- else {
- WT_ERR(
- __wt_btree_tree_open(session, root_addr, root_addr_size));
-
- /* Get the last record number in a column-store file. */
- if (btree->type != BTREE_ROW)
- WT_ERR(__btree_get_last_recno(session));
+ if (!F_ISSET(btree,
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
+ /*
+ * There are two reasons to load an empty tree rather than a
+ * checkpoint: either there is no checkpoint (the file is
+ * being created), or the load call returns no root page (the
+ * checkpoint is for an empty file).
+ */
+ WT_ERR(__wt_bm_checkpoint_load(session,
+ ckpt.raw.data, ckpt.raw.size,
+ root_addr, &root_addr_size, readonly));
+ if (creation || root_addr_size == 0)
+ WT_ERR(__btree_tree_open_empty(session, creation));
+ else {
+ WT_ERR(__wt_btree_tree_open(
+ session, root_addr, root_addr_size));
+
+ /* Get the last record number in a column-store file. */
+ if (btree->type != BTREE_ROW)
+ WT_ERR(__btree_get_last_recno(session));
+ }
}
if (0) {
err: (void)__wt_btree_close(session);
}
+ __wt_meta_checkpoint_free(session, &ckpt);
return (ret);
}
@@ -157,7 +164,7 @@ __wt_btree_close(WT_SESSION_IMPL *session)
* Configure a WT_BTREE structure.
*/
static int
-__btree_conf(WT_SESSION_IMPL *session, const char *cfg[])
+__btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt, const char *cfg[])
{
WT_BTREE *btree;
WT_CONFIG_ITEM cval;
@@ -304,8 +311,8 @@ __btree_conf(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_stat_alloc_dsrc_stats(session, &btree->stats));
- /* The tree has not been modified. */
- btree->modified = 0;
+ btree->write_gen = ckpt->write_gen; /* Write generation */
+ btree->modified = 0; /* Clean */
return (0);
}
@@ -347,7 +354,7 @@ err: __wt_buf_free(session, &dsk);
* Create an empty in-memory tree.
*/
static int
-__btree_tree_open_empty(WT_SESSION_IMPL *session, int created)
+__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation)
{
WT_BTREE *btree;
WT_DECL_RET;
@@ -362,7 +369,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int created)
* loads; set a flag that's cleared when a row is inserted into the
* tree.
*/
- if (created)
+ if (creation)
btree->bulk_load_ok = 1;
/*
diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c
index 174f7c6a994..6d8fb5b6149 100644
--- a/src/btree/bt_io.c
+++ b/src/btree/bt_io.c
@@ -233,12 +233,28 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
ip = tmp;
}
}
+ dsk = ip->mem;
/* If the buffer is compressed, set the flag. */
- if (compressed) {
- dsk = ip->mem;
+ if (compressed)
F_SET(dsk, WT_PAGE_COMPRESSED);
- }
+
+ /*
+ * We increment the block's write generation so it's easy to identify
+ * newer versions of blocks during salvage. (It's common in WiredTiger,
+ * at least for the default block manager, for multiple blocks to be
+ * internally consistent with identical first and last keys, so we need
+ * a way to know the most recent state of the block. We could check
+ * which leaf is referenced by a valid internal page, but that implies
+ * salvaging internal pages, which I don't want to do, and it's not
+ * as good anyway, because the internal page may not have been written
+ * after the leaf page was updated. So, write generations it is.
+ *
+ * Nothing is locked at this point but two versions of a page with the
+ * same generation is pretty unlikely, and if we did, they're going to
+ * be roughly identical for the purposes of salvage, anyway.
+ */
+ dsk->write_gen = ++btree->write_gen;
/*
* Checksum the data if the buffer isn't compressed or checksums are
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 45728934802..49d365e787c 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -125,11 +125,11 @@ static int __slvg_trk_free(WT_SESSION_IMPL *, WT_TRACK **, uint32_t);
static int __slvg_trk_init(WT_SESSION_IMPL *, uint8_t *,
uint32_t, uint32_t, uint64_t, WT_STUFF *, WT_TRACK **);
static int __slvg_trk_leaf(WT_SESSION_IMPL *,
- WT_PAGE_HEADER *, uint8_t *, uint32_t, uint64_t, WT_STUFF *);
+ WT_PAGE_HEADER *, uint8_t *, uint32_t, WT_STUFF *);
static int __slvg_trk_leaf_ovfl(
WT_SESSION_IMPL *, WT_PAGE_HEADER *, WT_TRACK *);
static int __slvg_trk_ovfl(WT_SESSION_IMPL *,
- WT_PAGE_HEADER *, uint8_t *, uint32_t, uint64_t, WT_STUFF *);
+ WT_PAGE_HEADER *, uint8_t *, uint32_t, WT_STUFF *);
/*
* __wt_bt_salvage --
@@ -312,7 +312,6 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
WT_DECL_ITEM(buf);
WT_DECL_RET;
WT_PAGE_HEADER *dsk;
- uint64_t gen;
uint32_t addrbuf_size;
uint8_t addrbuf[WT_BTREE_MAX_ADDR_COOKIE];
int eof;
@@ -323,7 +322,7 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
for (;;) {
/* Get the next block address from the block manager. */
WT_ERR(__wt_bm_salvage_next(
- session, addrbuf, &addrbuf_size, &gen, &eof));
+ session, addrbuf, &addrbuf_size, &eof));
if (eof)
break;
@@ -388,7 +387,7 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
WT_VERBOSE_ERR(session, salvage,
"tracking %s page, generation %" PRIu64 " %s",
- __wt_page_type_string(dsk->type), gen,
+ __wt_page_type_string(dsk->type), dsk->write_gen,
(const char *)as->data);
switch (dsk->type) {
@@ -405,11 +404,11 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
__wt_page_type_string(dsk->type));
WT_ERR(__slvg_trk_leaf(
- session, dsk, addrbuf, addrbuf_size, gen, ss));
+ session, dsk, addrbuf, addrbuf_size, ss));
break;
case WT_PAGE_OVFL:
WT_ERR(__slvg_trk_ovfl(
- session, dsk, addrbuf, addrbuf_size, gen, ss));
+ session, dsk, addrbuf, addrbuf_size, ss));
break;
}
}
@@ -455,8 +454,8 @@ err: if (trk->addr.addr != NULL)
* Track a leaf page.
*/
static int
-__slvg_trk_leaf(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk,
- uint8_t *addr, uint32_t size, uint64_t gen, WT_STUFF *ss)
+__slvg_trk_leaf(WT_SESSION_IMPL *session,
+ WT_PAGE_HEADER *dsk, uint8_t *addr, uint32_t size, WT_STUFF *ss)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -478,8 +477,8 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk,
(ss->pages_next + 1000) * sizeof(WT_TRACK *), &ss->pages));
/* Allocate a WT_TRACK entry for this new page and fill it in. */
- WT_RET(
- __slvg_trk_init(session, addr, size, dsk->mem_size, gen, ss, &trk));
+ WT_RET(__slvg_trk_init(
+ session, addr, size, dsk->mem_size, dsk->write_gen, ss, &trk));
switch (dsk->type) {
case WT_PAGE_COL_FIX:
@@ -573,8 +572,8 @@ err: __wt_free(session, trk);
* Track an overflow page.
*/
static int
-__slvg_trk_ovfl(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk,
- uint8_t *addr, uint32_t size, uint64_t gen, WT_STUFF *ss)
+__slvg_trk_ovfl(WT_SESSION_IMPL *session,
+ WT_PAGE_HEADER *dsk, uint8_t *addr, uint32_t size, WT_STUFF *ss)
{
WT_TRACK *trk;
@@ -586,8 +585,8 @@ __slvg_trk_ovfl(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk,
WT_RET(__wt_realloc(session, &ss->ovfl_allocated,
(ss->ovfl_next + 1000) * sizeof(WT_TRACK *), &ss->ovfl));
- WT_RET(
- __slvg_trk_init(session, addr, size, dsk->mem_size, gen, ss, &trk));
+ WT_RET(__slvg_trk_init(
+ session, addr, size, dsk->mem_size, dsk->write_gen, ss, &trk));
ss->ovfl[ss->ovfl_next++] = trk;
return (0);
diff --git a/src/conn/conn_btree.c b/src/conn/conn_btree.c
index 625a2822c0a..ea1e2d494e3 100644
--- a/src/conn/conn_btree.c
+++ b/src/conn/conn_btree.c
@@ -204,7 +204,6 @@ __conn_btree_open(WT_SESSION_IMPL *session,
const char *config, const char *cfg[], uint32_t flags)
{
WT_BTREE *btree;
- WT_DECL_ITEM(addr);
WT_DECL_RET;
btree = session->btree;
@@ -226,16 +225,11 @@ __conn_btree_open(WT_SESSION_IMPL *session,
if (F_ISSET(btree, WT_BTREE_OPEN))
WT_RET(__wt_conn_btree_sync_and_close(session));
- WT_RET(__wt_scr_alloc(session, WT_BTREE_MAX_ADDR_COOKIE, &addr));
-
/* Set any special flags on the handle. */
F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS));
do {
- WT_ERR(__wt_meta_checkpoint_addr(
- session, btree->name, btree->checkpoint, addr));
- WT_ERR(__wt_btree_open(session, addr->data, addr->size, cfg,
- btree->checkpoint == NULL ? 0 : 1));
+ WT_ERR(__wt_btree_open(session, cfg));
F_SET(btree, WT_BTREE_OPEN);
/*
* Checkpoint handles are read only, so eviction calculations
@@ -257,7 +251,6 @@ err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
(void)__wt_conn_btree_close(session, 1);
}
- __wt_scr_free(&addr);
return (ret);
}
diff --git a/src/include/block.h b/src/include/block.h
index 80a1c0026b5..327e922c2ec 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -128,8 +128,6 @@ struct __wt_block_ckpt {
off_t file_size; /* Checkpoint file size */
uint64_t ckpt_size; /* Checkpoint byte count */
WT_EXTLIST ckpt_avail; /* Checkpoint free'd extents */
-
- uint64_t write_gen; /* Write generation */
};
/*
@@ -190,53 +188,40 @@ struct __wt_block_desc {
*/
struct __wt_block_header {
/*
- * We maintain page write-generations in the non-transactional case
- * (where, instead of a transactional LSN, the value is a counter),
- * as that's how salvage can determine the most recent page between
- * pages overlapping the same key range.
- *
- * !!!
- * The write-generation is "owned" by the btree layer, but it's easier
- * to set (when physically writing blocks) and restore (during salvage),
- * in the block-manager layer.
- */
- uint64_t write_gen; /* 00-07: write generation */
-
- /*
* We write the page size in the on-disk page header because it makes
* salvage easier. (If we don't know the expected page length, we'd
* have to read increasingly larger chunks from the file until we find
* one that checksums, and that's going to be harsh given WiredTiger's
* potentially large page sizes.)
*/
- uint32_t disk_size; /* 08-11: on-disk page size */
+ uint32_t disk_size; /* 00-03: on-disk page size */
/*
- * Page checksums are stored in two places. First, a page's checksum
- * is in the internal page that references a page as part of the
- * address cookie. This is done to improve the chances of detecting
- * not only disk corruption but software bugs (for example, overwriting
- * a page with another valid page image). Second, a page's checksum is
+ * Page checksums are stored in two places. First, the page checksum
+ * is written within the internal page that references it as part of
+ * the address cookie. This is done to improve the chances of detecting
+ * not only disk corruption but other bugs (for example, overwriting a
+ * page with another valid page image). Second, a page's checksum is
* stored in the disk header. This is for salvage, so salvage knows it
* has found a page that may be useful.
*/
- uint32_t cksum; /* 12-15: checksum */
+ uint32_t cksum; /* 04-07: checksum */
#define WT_BLOCK_DATA_CKSUM 0x01 /* Block data is part of the checksum */
- uint8_t flags; /* 16: flags */
+ uint8_t flags; /* 08: flags */
/*
* End the structure with 3 bytes of padding: it wastes space, but it
* leaves the structure 32-bit aligned and having a few bytes to play
* with in the future can't hurt.
*/
- uint8_t unused[3]; /* 17-19: unused padding */
+ uint8_t unused[3]; /* 09-11: unused padding */
};
/*
* WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if
* the compiler inserts padding it will break the world.
*/
-#define WT_BLOCK_HEADER_SIZE 20
+#define WT_BLOCK_HEADER_SIZE 12
/*
* WT_BLOCK_HEADER_BYTE
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 8d976f00cec..bd89c533efc 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -19,33 +19,40 @@ struct __wt_page_header {
uint64_t recno; /* 00-07: column-store starting recno */
/*
+ * We maintain page write-generations in the non-transactional case
+ * as that's how salvage can determine the most recent page between
+ * pages overlapping the same key range.
+ */
+ uint64_t write_gen; /* 08-15: write generation */
+
+ /*
* The page's in-memory size isn't rounded or aligned, it's the actual
* number of bytes the disk-image consumes when instantiated in memory.
*/
- uint32_t mem_size; /* 08-11: in-memory page size */
+ uint32_t mem_size; /* 16-19: in-memory page size */
union {
- uint32_t entries; /* 12-15: number of cells on page */
- uint32_t datalen; /* 12-15: overflow data length */
+ uint32_t entries; /* 20-23: number of cells on page */
+ uint32_t datalen; /* 20-23: overflow data length */
} u;
- uint8_t type; /* 16: page type */
+ uint8_t type; /* 24: page type */
#define WT_PAGE_COMPRESSED 0x01 /* Page is compressed on disk */
- uint8_t flags; /* 17: flags */
+ uint8_t flags; /* 25: flags */
/*
* End the structure with 2 bytes of padding: it wastes space, but it
* leaves the structure 32-bit aligned and having a few bytes to play
* with in the future can't hurt.
*/
- uint8_t unused[2]; /* 18-19: unused padding */
+ uint8_t unused[2]; /* 26-27: unused padding */
};
/*
* WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if
* the compiler inserts padding it will break the world.
*/
-#define WT_PAGE_HEADER_SIZE 20
+#define WT_PAGE_HEADER_SIZE 28
/*
* The block-manager specific information immediately follows the WT_PAGE_DISK
diff --git a/src/include/btree.h b/src/include/btree.h
index a59e5da8f52..c8722b9f91c 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -123,6 +123,8 @@ struct __wt_btree {
void *block; /* Block manager */
u_int block_header; /* Block manager header length */
+ uint64_t write_gen; /* Write generation */
+
WT_PAGE *evict_page; /* Eviction thread's location */
uint64_t evict_priority; /* Relative priority of cached pages. */
volatile uint32_t lru_count; /* Count of threads in LRU eviction */
diff --git a/src/include/extern.h b/src/include/extern.h
index 0f4c1c550b6..1c1a53f07c3 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -173,10 +173,9 @@ extern int __wt_bm_write(WT_SESSION_IMPL *session,
int data_cksum);
extern int __wt_bm_stat(WT_SESSION_IMPL *session);
extern int __wt_bm_salvage_start(WT_SESSION_IMPL *session);
-extern int __wt_bm_salvage_next(WT_SESSION_IMPL *session,
+extern int __wt_bm_salvage_next( WT_SESSION_IMPL *session,
uint8_t *addr,
uint32_t *addr_sizep,
- uint64_t *write_genp,
int *eofp);
extern int __wt_bm_salvage_valid( WT_SESSION_IMPL *session,
uint8_t *addr,
@@ -211,11 +210,10 @@ extern int __wt_block_read_off(WT_SESSION_IMPL *session,
uint32_t cksum);
extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern int __wt_block_salvage_next( WT_SESSION_IMPL *session,
+extern int __wt_block_salvage_next(WT_SESSION_IMPL *session,
WT_BLOCK *block,
uint8_t *addr,
uint32_t *addr_sizep,
- uint64_t *write_genp,
int *eofp);
extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session,
WT_BLOCK *block,
@@ -323,11 +321,7 @@ extern void *__wt_cache_evict_server(void *arg);
extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app);
extern int __wt_btree_create(WT_SESSION_IMPL *session, const char *filename);
extern int __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename);
-extern int __wt_btree_open(WT_SESSION_IMPL *session,
- const uint8_t *addr,
- uint32_t addr_size,
- const char *cfg[],
- int readonly);
+extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_btree_close(WT_SESSION_IMPL *session);
extern int __wt_btree_tree_open( WT_SESSION_IMPL *session,
const uint8_t *addr,
@@ -812,10 +806,10 @@ extern int __wt_meta_btree_apply(WT_SESSION_IMPL *session,
int (*func)(WT_SESSION_IMPL *,
const char *[]),
const char *cfg[]);
-extern int __wt_meta_checkpoint_addr(WT_SESSION_IMPL *session,
+extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session,
const char *fname,
const char *checkpoint,
- WT_ITEM *addr);
+ WT_CKPT *ckpt);
extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session,
const char *fname,
const char **namep);
@@ -829,6 +823,7 @@ extern int __wt_meta_ckptlist_set( WT_SESSION_IMPL *session,
WT_CKPT *ckptbase);
extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session,
WT_CKPT *ckptbase);
+extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt);
extern int __wt_metadata_open(WT_SESSION_IMPL *session);
extern int __wt_metadata_load_backup(WT_SESSION_IMPL *session);
extern int __wt_metadata_cursor( WT_SESSION_IMPL *session,
diff --git a/src/include/meta.h b/src/include/meta.h
index 6e0c9d01111..b2a9541d4c0 100644
--- a/src/include/meta.h
+++ b/src/include/meta.h
@@ -38,6 +38,8 @@ struct __wt_ckpt {
uint64_t ckpt_size; /* Checkpoint size */
+ uint64_t write_gen; /* Write generation */
+
void *bpriv; /* Block manager private */
#define WT_CKPT_ADD 0x01 /* Checkpoint to be added */
diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c
index 1588711e0ec..844d625c985 100644
--- a/src/meta/meta_ckpt.c
+++ b/src/meta/meta_ckpt.c
@@ -7,20 +7,22 @@
#include "wt_internal.h"
-static int __ckpt_last_addr(WT_SESSION_IMPL *, const char *, WT_ITEM *);
-static int __ckpt_last_name(WT_SESSION_IMPL *, const char *, const char **);
-static int __ckpt_named_addr(
- WT_SESSION_IMPL *, const char *, const char *, WT_ITEM *);
-static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *);
-static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *);
+static int __ckpt_last(WT_SESSION_IMPL *, const char *, WT_CKPT *);
+static int __ckpt_last_name(WT_SESSION_IMPL *, const char *, const char **);
+static int __ckpt_load(WT_SESSION_IMPL *,
+ WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, WT_CKPT *);
+static int __ckpt_named(
+ WT_SESSION_IMPL *, const char *, const char *, WT_CKPT *);
+static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *);
+static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *);
/*
- * __wt_meta_checkpoint_addr --
- * Return a file's checkpoint address.
+ * __wt_meta_checkpoint --
+ * Return a file's checkpoint information.
*/
int
-__wt_meta_checkpoint_addr(WT_SESSION_IMPL *session,
- const char *fname, const char *checkpoint, WT_ITEM *addr)
+__wt_meta_checkpoint(WT_SESSION_IMPL *session,
+ const char *fname, const char *checkpoint, WT_CKPT *ckpt)
{
WT_DECL_RET;
const char *config;
@@ -41,14 +43,13 @@ __wt_meta_checkpoint_addr(WT_SESSION_IMPL *session,
* data" and let our caller handle it.
*/
if (checkpoint == NULL) {
- if ((ret =
- __ckpt_last_addr(session, config, addr)) == WT_NOTFOUND) {
+ if ((ret = __ckpt_last(session, config, ckpt)) == WT_NOTFOUND) {
ret = 0;
- addr->data = NULL;
- addr->size = 0;
+ ckpt->addr.data = ckpt->raw.data = NULL;
+ ckpt->addr.size = ckpt->raw.size = 0;
}
} else
- WT_ERR(__ckpt_named_addr(session, checkpoint, config, addr));
+ WT_ERR(__ckpt_named(session, checkpoint, config, ckpt));
err: __wt_free(session, config);
return (ret);
@@ -125,15 +126,15 @@ err: __wt_free(session, config);
}
/*
- * __ckpt_named_addr --
- * Return the cookie associated with a file's named checkpoint.
+ * __ckpt_named --
+ * Return the information associated with a file's named checkpoint.
*/
static int
-__ckpt_named_addr(WT_SESSION_IMPL *session,
- const char *checkpoint, const char *config, WT_ITEM *addr)
+__ckpt_named(WT_SESSION_IMPL *session,
+ const char *checkpoint, const char *config, WT_CKPT *ckpt)
{
WT_CONFIG ckptconf;
- WT_CONFIG_ITEM a, k, v;
+ WT_CONFIG_ITEM k, v;
WT_RET(__wt_config_getones(session, config, "checkpoint", &v));
WT_RET(__wt_config_subinit(session, &ckptconf, &v));
@@ -143,23 +144,18 @@ __ckpt_named_addr(WT_SESSION_IMPL *session,
* checkpoint of any name.
*/
while (__wt_config_next(&ckptconf, &k, &v) == 0)
- if (WT_STRING_MATCH(checkpoint, k.str, k.len)) {
- WT_RET(__wt_config_subgets(session, &v, "addr", &a));
- if (a.len != 0)
- WT_RET(__wt_nhex_to_raw(
- session, a.str, a.len, addr));
- return (0);
- }
+ if (WT_STRING_MATCH(checkpoint, k.str, k.len))
+ return (__ckpt_load(session, &k, &v, ckpt));
+
return (WT_NOTFOUND);
}
/*
- * __ckpt_last_addr --
- * Return the cookie associated with the file's last checkpoint.
+ * __ckpt_last --
+ * Return the information associated with the file's last checkpoint.
*/
static int
-__ckpt_last_addr(
- WT_SESSION_IMPL *session, const char *config, WT_ITEM *addr)
+__ckpt_last(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt)
{
WT_CONFIG ckptconf;
WT_CONFIG_ITEM a, k, v;
@@ -170,17 +166,13 @@ __ckpt_last_addr(
for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) {
/* Ignore checkpoints before the ones we've already seen. */
WT_RET(__wt_config_subgets(session, &v, "order", &a));
- if (found && a.val < found)
- continue;
+ if (found) {
+ if (a.val < found)
+ continue;
+ __wt_meta_checkpoint_free(session, ckpt);
+ }
found = a.val;
-
- /*
- * Copy out the address; our caller wants the raw cookie, not
- * the hex.
- */
- WT_RET(__wt_config_subgets(session, &v, "addr", &a));
- if (a.len != 0)
- WT_RET(__wt_nhex_to_raw(session, a.str, a.len, addr));
+ WT_RET(__ckpt_load(session, &k, &v, ckpt));
}
return (found ? 0 : WT_NOTFOUND);
@@ -257,12 +249,11 @@ __wt_meta_ckptlist_get(
{
WT_CKPT *ckpt, *ckptbase;
WT_CONFIG ckptconf;
- WT_CONFIG_ITEM a, k, v;
+ WT_CONFIG_ITEM k, v;
WT_DECL_RET;
WT_ITEM *buf;
size_t allocated, slot;
const char *config;
- char timebuf[64];
*ckptbasep = NULL;
@@ -284,39 +275,7 @@ __wt_meta_ckptlist_get(
(slot + 50) * sizeof(WT_CKPT), &ckptbase));
ckpt = &ckptbase[slot];
- /*
- * Copy the name, address (raw and hex), order and time
- * into the slot. If there's no address, it's a fake.
- */
- WT_ERR(
- __wt_strndup(session, k.str, k.len, &ckpt->name));
-
- WT_ERR(__wt_config_subgets(session, &v, "addr", &a));
- WT_ERR(
- __wt_buf_set(session, &ckpt->addr, a.str, a.len));
- if (a.len == 0)
- F_SET(ckpt, WT_CKPT_FAKE);
- else
- WT_ERR(__wt_nhex_to_raw(
- session, a.str, a.len, &ckpt->raw));
-
- WT_ERR(__wt_config_subgets(session, &v, "order", &a));
- if (a.val == 0)
- goto format;
- ckpt->order = a.val;
-
- WT_ERR(__wt_config_subgets(session, &v, "time", &a));
- if (a.len == 0)
- goto format;
- if (a.len > sizeof(timebuf) - 1)
- goto format;
- memcpy(timebuf, a.str, a.len);
- timebuf[a.len] = '\0';
- if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1)
- goto format;
-
- WT_ERR(__wt_config_subgets(session, &v, "size", &a));
- ckpt->ckpt_size = (uint64_t)a.val;
+ WT_ERR(__ckpt_load(session, &k, &v, ckpt));
}
/*
@@ -340,7 +299,6 @@ __wt_meta_ckptlist_get(
*ckptbasep = ckptbase;
if (0) {
-format: WT_ERR_MSG(session, WT_ERROR, "corrupted checkpoint list");
err: __wt_meta_ckptlist_free(session, ckptbase);
}
__wt_free(session, config);
@@ -350,6 +308,62 @@ err: __wt_meta_ckptlist_free(session, ckptbase);
}
/*
+ * __ckpt_load --
+ * Load a single checkpoint's information into a WT_CKPT structure.
+ */
+static int
+__ckpt_load(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v, WT_CKPT *ckpt)
+{
+ WT_CONFIG_ITEM a;
+ char timebuf[64];
+
+ /*
+ * Copy the name, address (raw and hex), order and time into the slot.
+ * If there's no address, it's a fake.
+ */
+ WT_RET(__wt_strndup(session, k->str, k->len, &ckpt->name));
+
+ WT_RET(__wt_config_subgets(session, v, "addr", &a));
+ WT_RET(__wt_buf_set(session, &ckpt->addr, a.str, a.len));
+ if (a.len == 0)
+ F_SET(ckpt, WT_CKPT_FAKE);
+ else
+ WT_RET(__wt_nhex_to_raw(session, a.str, a.len, &ckpt->raw));
+
+ WT_RET(__wt_config_subgets(session, v, "order", &a));
+ if (a.len == 0)
+ goto format;
+ ckpt->order = a.val;
+
+ WT_RET(__wt_config_subgets(session, v, "time", &a));
+ if (a.len == 0 || a.len > sizeof(timebuf) - 1)
+ goto format;
+ memcpy(timebuf, a.str, a.len);
+ timebuf[a.len] = '\0';
+ if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1)
+ goto format;
+
+ WT_RET(__wt_config_subgets(session, v, "size", &a));
+ ckpt->ckpt_size = (uint64_t)a.val;
+
+ WT_RET(__wt_config_subgets(session, v, "write_gen", &a));
+ if (a.len == 0)
+ goto format;
+ /*
+ * The largest value a WT_CONFIG_ITEM can handle is signed: this value
+ * appears on disk and I don't want to sign it there, so I'm casting it
+ * here instead.
+ */
+ ckpt->write_gen = (uint64_t)a.val;
+
+ return (0);
+
+format:
+ WT_RET_MSG(session, WT_ERROR, "corrupted checkpoint list");
+}
+
+/*
* __wt_meta_ckptlist_set --
* Set a file's checkpoint value from the WT_CKPT list.
*/
@@ -414,17 +428,21 @@ __wt_meta_ckptlist_set(
if (strcmp(ckpt->name, WT_CHECKPOINT) == 0)
WT_ERR(__wt_buf_catfmt(session, buf,
"%s%s.%" PRId64 "=(addr=\"%.*s\",order=%" PRIu64
- ",time=%" PRIuMAX ",size=%" PRIu64 ")",
+ ",time=%" PRIuMAX ",size=%" PRIu64
+ ",write_gen=%" PRIu64 ")",
sep, ckpt->name, ckpt->order,
(int)ckpt->addr.size, (char *)ckpt->addr.data,
- ckpt->order, ckpt->sec, ckpt->ckpt_size));
+ ckpt->order, ckpt->sec, ckpt->ckpt_size,
+ ckpt->write_gen));
else
WT_ERR(__wt_buf_catfmt(session, buf,
"%s%s=(addr=\"%.*s\",order=%" PRIu64
- ",time=%" PRIuMAX ",size=%" PRIu64 ")",
+ ",time=%" PRIuMAX ",size=%" PRIu64
+ ",write_gen=%" PRIu64 ")",
sep, ckpt->name,
(int)ckpt->addr.size, (char *)ckpt->addr.data,
- ckpt->order, ckpt->sec, ckpt->ckpt_size));
+ ckpt->order, ckpt->sec, ckpt->ckpt_size,
+ ckpt->write_gen));
sep = ",";
}
WT_ERR(__wt_buf_catfmt(session, buf, ")"));
@@ -447,16 +465,30 @@ __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
if (ckptbase == NULL)
return;
- WT_CKPT_FOREACH(ckptbase, ckpt) {
- __wt_free(session, ckpt->name);
- __wt_buf_free(session, &ckpt->addr);
- __wt_buf_free(session, &ckpt->raw);
- __wt_free(session, ckpt->bpriv);
- }
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ __wt_meta_checkpoint_free(session, ckpt);
__wt_free(session, ckptbase);
}
/*
+ * __wt_meta_checkpoint_free --
+ * Clean up a single checkpoint structure.
+ */
+void
+__wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
+{
+ if (ckpt == NULL)
+ return;
+
+ __wt_free(session, ckpt->name);
+ __wt_buf_free(session, &ckpt->addr);
+ __wt_buf_free(session, &ckpt->raw);
+ __wt_free(session, ckpt->bpriv);
+
+ WT_CLEAR(*ckpt); /* Clear to prepare for re-use. */
+}
+
+/*
* __ckpt_version_chk --
* Check the version major/minor numbers.
*/
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 34eaf68ffd4..cde9a491943 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -580,6 +580,14 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_bt_cache_flush(session,
ckptbase, is_checkpoint ? WT_SYNC : WT_SYNC_DISCARD));
+ /*
+ * All blocks being written have been written; set the object's write
+ * generation.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_ADD))
+ ckpt->write_gen = btree->write_gen;
+
fake:
/* Update the object's metadata. */
txn->isolation = TXN_ISO_READ_UNCOMMITTED;
diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c
index dffb672a871..437716970b9 100644
--- a/test/salvage/salvage.c
+++ b/test/salvage/salvage.c
@@ -574,8 +574,8 @@ copy(u_int gen, u_int recno)
dsk = (void *)buf;
if (page_type != WT_PAGE_ROW_LEAF)
dsk->recno = recno;
+ dsk->write_gen = gen;
blk = WT_BLOCK_HEADER_REF(buf);
- blk->write_gen = gen;
blk->cksum = 0;
blk->cksum = __wt_cksum(dsk, PSIZE);
assert(fwrite(buf, 1, PSIZE, ofp) == PSIZE);