diff options
42 files changed, 1938 insertions, 1934 deletions
diff --git a/dist/api_data.py b/dist/api_data.py index b1793cf3eac..3adea9ca157 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -158,8 +158,8 @@ file_config = format_meta + [ # File metadata, including both configurable and non-configurable (internal) file_meta = file_config + [ - Config('snapshot', '', r''' - the file snapshot entries'''), + Config('checkpoint', '', r''' + the file checkpoint entries'''), Config('version', '(major=0,minor=0)', r''' the file version'''), ] @@ -376,6 +376,7 @@ methods = { list, such as <code>"verbose=[evictserver,read]"</code>''', type='list', choices=[ 'block', + 'ckpt', 'evict', 'evictserver', 'fileops', @@ -385,7 +386,6 @@ methods = { 'readserver', 'reconcile', 'salvage', - 'snapshot', 'verify', 'write']), ]), @@ -400,6 +400,7 @@ flags = { 'rec_evict' : [ 'REC_SINGLE' ], 'verbose' : [ 'VERB_block', + 'VERB_ckpt', 'VERB_evict', 'VERB_evictserver', 'VERB_fileops', @@ -409,7 +410,6 @@ flags = { 'VERB_readserver', 'VERB_reconcile', 'VERB_salvage', - 'VERB_snapshot', 'VERB_verify', 'VERB_write' ], diff --git a/dist/filelist b/dist/filelist index 70cd5819540..d6baf04c90a 100644 --- a/dist/filelist +++ b/dist/filelist @@ -4,13 +4,13 @@ src/api/api_strerror.c src/api/api_version.c src/block/block_addr.c +src/block/block_ckpt.c src/block/block_cksum.c src/block/block_ext.c src/block/block_mgr.c src/block/block_open.c src/block/block_read.c src/block/block_slvg.c -src/block/block_snap.c src/block/block_vrfy.c src/block/block_write.c src/btree/bt_bulk.c @@ -66,7 +66,7 @@ src/log/log.c src/log/log_desc.c src/meta/meta_api.c src/meta/meta_apply.c -src/meta/meta_snapshot.c +src/meta/meta_ckpt.c src/meta/meta_table.c src/meta/meta_track.c src/meta/meta_turtle.c @@ -116,4 +116,4 @@ src/support/scratch.c src/support/sess_dump.c src/support/stat.c src/txn/txn.c -src/txn/txn_snapshot.c +src/txn/txn_ckpt.c diff --git a/dist/s_string.ok b/dist/s_string.ok index 59659e7641a..dbb617fe885 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -25,6 +25,7 @@ Btree CAS CELL's CELLs +CKPT CLR COL's CONCAT @@ -150,7 +151,7 @@ WIREDTIGER WinNT WiredTiger WiredTiger's -WiredTigerInternalSnapshot +WiredTigerInternalCheckpoint Wuninitialized XP __wt_epoch @@ -197,6 +198,9 @@ checksum checksums chk cip +ckpt +ckptfrag +ckptlist cksum clr cmp @@ -447,11 +451,6 @@ sizev skiplist skiplists slvg -snapall -snapfrag -snapfrom -snaplist -snapto snprintf sp spinlock diff --git a/src/block/block_addr.c b/src/block/block_addr.c index 2b5d10ef7b6..68e350e1f12 100644 --- a/src/block/block_addr.c +++ b/src/block/block_addr.c @@ -129,68 +129,68 @@ __wt_block_addr_string(WT_SESSION_IMPL *session, } /* - * __wt_block_buffer_to_snapshot -- - * Convert a filesystem snapshot cookie into its components. + * __wt_block_buffer_to_ckpt -- + * Convert a checkpoint cookie into its components. */ int -__wt_block_buffer_to_snapshot(WT_SESSION_IMPL *session, - WT_BLOCK *block, const uint8_t *p, WT_BLOCK_SNAPSHOT *si) +__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, + WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci) { uint64_t a; const uint8_t **pp; - si->version = *p++; - if (si->version != WT_BM_SNAPSHOT_VERSION) - WT_RET_MSG(session, WT_ERROR, "illegal snapshot address"); + ci->version = *p++; + if (ci->version != WT_BM_CHECKPOINT_VERSION) + WT_RET_MSG(session, WT_ERROR, "illegal checkpoint address"); pp = &p; WT_RET(__block_buffer_to_addr(block, pp, - &si->root_offset, &si->root_size, &si->root_cksum)); + &ci->root_offset, &ci->root_size, &ci->root_cksum)); WT_RET(__block_buffer_to_addr(block, pp, - &si->alloc.offset, &si->alloc.size, &si->alloc.cksum)); + &ci->alloc.offset, &ci->alloc.size, &ci->alloc.cksum)); WT_RET(__block_buffer_to_addr(block, pp, - &si->avail.offset, &si->avail.size, &si->avail.cksum)); + &ci->avail.offset, &ci->avail.size, &ci->avail.cksum)); WT_RET(__block_buffer_to_addr(block, pp, - &si->discard.offset, &si->discard.size, &si->discard.cksum)); + &ci->discard.offset, &ci->discard.size, &ci->discard.cksum)); WT_RET(__wt_vunpack_uint(pp, 0, &a)); - si->file_size = (off_t)a; + ci->file_size = (off_t)a; WT_RET(__wt_vunpack_uint(pp, 0, &a)); - si->snapshot_size = a; + ci->ckpt_size = a; WT_RET(__wt_vunpack_uint(pp, 0, &a)); - si->write_gen = a; + ci->write_gen = a; return (0); } /* - * __wt_block_snapshot_to_buffer -- - * Convert the filesystem components into its snapshot cookie. + * __wt_block_ckpt_to_buffer -- + * Convert the components into its checkpoint cookie. */ int -__wt_block_snapshot_to_buffer(WT_SESSION_IMPL *session, - WT_BLOCK *block, uint8_t **pp, WT_BLOCK_SNAPSHOT *si) +__wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, + WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci) { uint64_t a; - if (si->version != WT_BM_SNAPSHOT_VERSION) - WT_RET_MSG(session, WT_ERROR, "illegal snapshot address"); + if (ci->version != WT_BM_CHECKPOINT_VERSION) + WT_RET_MSG(session, WT_ERROR, "illegal checkpoint address"); - (*pp)[0] = si->version; + (*pp)[0] = ci->version; (*pp)++; WT_RET(__wt_block_addr_to_buffer(block, pp, - si->root_offset, si->root_size, si->root_cksum)); + ci->root_offset, ci->root_size, ci->root_cksum)); WT_RET(__wt_block_addr_to_buffer(block, pp, - si->alloc.offset, si->alloc.size, si->alloc.cksum)); + ci->alloc.offset, ci->alloc.size, ci->alloc.cksum)); WT_RET(__wt_block_addr_to_buffer(block, pp, - si->avail.offset, si->avail.size, si->avail.cksum)); + ci->avail.offset, ci->avail.size, ci->avail.cksum)); WT_RET(__wt_block_addr_to_buffer(block, pp, - si->discard.offset, si->discard.size, si->discard.cksum)); - a = (uint64_t)si->file_size; + ci->discard.offset, ci->discard.size, ci->discard.cksum)); + a = (uint64_t)ci->file_size; WT_RET(__wt_vpack_uint(pp, 0, a)); - a = (uint64_t)si->snapshot_size; + a = (uint64_t)ci->ckpt_size; WT_RET(__wt_vpack_uint(pp, 0, a)); - a = si->write_gen; + a = ci->write_gen; WT_RET(__wt_vpack_uint(pp, 0, a)); return (0); diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c new file mode 100644 index 00000000000..08e3856facd --- /dev/null +++ b/src/block/block_ckpt.c @@ -0,0 +1,708 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *); +static int __ckpt_string( + WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *); +static int __ckpt_update(WT_SESSION_IMPL *, + WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, uint64_t, int); + +/* + * __wt_block_ckpt_init -- + * Initialize a checkpoint structure. + */ +int +__wt_block_ckpt_init(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_BLOCK_CKPT *ci, const char *name, int is_live) +{ + WT_DECL_RET; + + /* + * If we're loading a new live checkpoint, there shouldn't be one + * already loaded. The btree engine should prevent this from ever + * happening, but paranoia is a healthy thing. + */ + if (is_live) { + __wt_spin_lock(session, &block->live_lock); + if (block->live_load) + ret = EINVAL; + else + block->live_load = 1; + __wt_spin_unlock(session, &block->live_lock); + if (ret) + WT_RET_MSG( + session, EINVAL, "checkpoint already loaded"); + } + + memset(ci, 0, sizeof(*ci)); + + ci->root_offset = WT_BLOCK_INVALID_OFFSET; + + WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc")); + WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail")); + WT_RET(__wt_block_extlist_init(session, &ci->discard, name, "discard")); + + ci->file_size = WT_BLOCK_DESC_SECTOR; + WT_RET(__wt_block_extlist_init( + session, &ci->ckpt_avail, name, "ckpt_avail")); + + return (0); +} + +/* + * __wt_block_checkpoint_load -- + * Load a checkpoint. + */ +int +__wt_block_checkpoint_load(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size, + int readonly) +{ + WT_BLOCK_CKPT *ci; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + + WT_UNUSED(addr_size); + + /* + * Sometimes we don't find a root page (we weren't given a checkpoint, + * or the referenced checkpoint was empty). In that case we return a + * root page size of 0. Set that up now. + */ + dsk->size = 0; + + ci = &block->live; + WT_RET(__wt_block_ckpt_init(session, block, ci, "live", 1)); + + if (WT_VERBOSE_ISSET(session, ckpt)) { + if (addr != NULL) { + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__ckpt_string(session, block, addr, tmp)); + } + WT_VERBOSE_ERR(session, ckpt, + "%s: load-checkpoint: %s", block->name, + addr == NULL ? "[Empty]" : (char *)tmp->data); + } + + /* If not loading a checkpoint from disk, we're done. */ + if (addr == NULL || addr_size == 0) + return (0); + + /* Crack the checkpoint cookie. */ + if (addr != NULL) + WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci)); + + /* Verify sets up next. */ + if (block->verify) + WT_ERR(__wt_verify_ckpt_load(session, block, ci)); + + /* Read, and optionally verify, any root page. */ + if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) { + WT_ERR(__wt_block_read_off(session, block, + dsk, ci->root_offset, ci->root_size, ci->root_cksum)); + if (block->verify) { + if (tmp == NULL) { + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__ckpt_string( + session, block, addr, tmp)); + } + WT_ERR( + __wt_verify_dsk(session, (char *)tmp->data, dsk)); + } + } + + /* + * Rolling a checkpoint forward requires the avail list, the blocks from + * which we can allocate. + */ + if (!readonly) + WT_ERR(__wt_block_extlist_read(session, block, &ci->avail)); + + /* + * If the checkpoint can be written, that means anything written after + * the checkpoint is no longer interesting. Truncate the file. + */ + if (!readonly) { + WT_VERBOSE_ERR(session, ckpt, + "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size); + WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size)); + } + + if (0) { +err: (void)__wt_block_checkpoint_unload(session, block); + } + + __wt_scr_free(&tmp); + return (ret); +} + +/* + * __wt_block_checkpoint_unload -- + * Unload a checkpoint. + */ +int +__wt_block_checkpoint_unload(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_BLOCK_CKPT *ci; + WT_DECL_RET; + + WT_VERBOSE_RETVAL( + session, ckpt, ret, "%s: unload checkpoint", block->name); + + ci = &block->live; + + /* Verify cleanup. */ + if (block->verify) + WT_TRET(__wt_verify_ckpt_unload(session, block, ci)); + + __wt_block_ckpt_destroy(session, ci); + + block->live_load = 0; + + return (ret); +} + +/* + * __wt_block_ckpt_destroy -- + * Clear a checkpoint structure. + */ +void +__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci) +{ + /* Discard the extent lists. */ + __wt_block_extlist_free(session, &ci->alloc); + __wt_block_extlist_free(session, &ci->avail); + __wt_block_extlist_free(session, &ci->discard); + __wt_block_extlist_free(session, &ci->ckpt_avail); +} + +/* + * __wt_block_checkpoint -- + * Create a new checkpoint. + */ +int +__wt_block_checkpoint(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase) +{ + WT_BLOCK_CKPT *ci; + + ci = &block->live; + ci->version = WT_BM_CHECKPOINT_VERSION; + + /* + * Write the root page: it's possible for there to be a checkpoint of + * an empty tree, in which case, we store an illegal root offset. + * + * XXX + * We happen to know that checkpoints are single-threaded above us in + * the btree engine. That's probably something we want to guarantee + * for any WiredTiger block manager. + */ + if (buf == NULL) { + ci->root_offset = WT_BLOCK_INVALID_OFFSET; + ci->root_size = ci->root_cksum = 0; + } else + WT_RET(__wt_block_write_off(session, block, buf, + &ci->root_offset, &ci->root_size, &ci->root_cksum, 0)); + + /* Process the checkpoint list, deleting and updating as required. */ + WT_RET(__ckpt_process(session, block, ckptbase)); + + /* + * Checkpoints have to hit disk (it would be reasonable to configure for + * lazy checkpoints, but we don't support them yet). Regardless, we're + * not holding any locks, other writers can proceed while we wait. + */ + if (!F_ISSET(S2C(session), WT_CONN_NOSYNC)) + WT_RET(__wt_fsync(session, block->fh)); + + return (0); +} + +/* + * __ckpt_extlist_fblocks -- + * If an extent list was read from disk, free its space to the live avail + * list. + */ +static inline int +__ckpt_extlist_fblocks( + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el) +{ + if (el->offset == WT_BLOCK_INVALID_OFFSET) + return (0); + return (__wt_block_insert_ext( + session, &block->live.avail, el->offset, el->size)); +} + +/* + * __ckpt_process -- + * Process the list of checkpoints. + */ +static int +__ckpt_process( + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) +{ + WT_BLOCK_CKPT *a, *b, *ci; + WT_CKPT *ckpt; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + uint64_t ckpt_size; + int deleting, locked; + + ci = &block->live; + locked = 0; + + /* + * We've allocated our last page, update the checkpoint size. We need + * to calculate the live system's checkpoint size before reading and + * merging checkpoint allocation and discard information from the + * checkpoints we're deleting, those operations change the underlying + * byte counts. + */ + ckpt_size = ci->ckpt_size; + ckpt_size += ci->alloc.bytes; + ckpt_size -= ci->discard.bytes; + + /* + * Extents newly available as a result of deleting previous checkpoints + * are added to a list of extents. The list should be empty, but there + * is no explicit "free the checkpoint information" call into the block + * manager; if there was an error in an upper level resulting in some + * previous checkpoint never being resolved, the list may not be empty. + * + * XXX + * This isn't sufficient, actually: we're going to leak all the blocks + * written as part of the last checkpoint because it was never resolved. + */ + __wt_block_extlist_free(session, &ci->ckpt_avail); + WT_RET(__wt_block_extlist_init( + session, &ci->ckpt_avail, "live", "ckpt_avail")); + + /* + * To delete a checkpoint, we'll need extent list for it, and we have to + * read that from the disk. + */ + deleting = 0; + WT_CKPT_FOREACH(ckptbase, ckpt) { + /* + * To delete a checkpoint, we'll need checkpoint information for + * it and the subsequent checkpoint. The test is tricky, load + * the current checkpoint's information if it's marked for + * deletion, or if it follows a checkpoint marked for deletion, + * where the boundary cases are the first checkpoint in the list + * and the last checkpoint in the list: if we're deleting the + * last checkpoint in the list, there's no next checkpoint, the + * checkpoint will be merged into the live tree. + */ + if (!F_ISSET(ckpt, WT_CKPT_DELETE) && + (ckpt == ckptbase || + F_ISSET(ckpt, WT_CKPT_ADD) || + !F_ISSET(ckpt - 1, WT_CKPT_DELETE))) + continue; + deleting = 1; + + /* + * Allocate a checkpoint structure, crack the cookie and read + * the checkpoint's extent lists. + * + * Ignore the avail list: checkpoint avail lists are only useful + * if we are rolling forward from the particular checkpoint and + * they represent our best understanding of what blocks can be + * allocated. If we are not operating on the live checkpoint, + * subsequent checkpoints might have allocated those blocks, and + * the avail list is useless. We don't discard it, because it + * is useful as part of verification, but we don't re-write it + * either. + */ + WT_ERR(__wt_calloc( + session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv)); + ci = ckpt->bpriv; + WT_ERR(__wt_block_ckpt_init(session, block, ci, ckpt->name, 0)); + WT_ERR(__wt_block_buffer_to_ckpt( + session, block, ckpt->raw.data, ci)); + WT_ERR(__wt_block_extlist_read(session, block, &ci->alloc)); + WT_ERR(__wt_block_extlist_read(session, block, &ci->discard)); + } + + /* + * Hold a lock so the live extent lists and the file size can't change + * underneath us. I suspect we'll tighten this if checkpoints take too + * much time away from real work: we read the historic checkpoint + * information without a lock, but we could also merge and re-write the + * delete checkpoint information without a lock, except for ranges + * merged into the live tree. + */ + __wt_spin_lock(session, &block->live_lock); + locked = 1; + + /* Skip the additional processing if we aren't deleting checkpoints. */ + if (!deleting) + goto live_update; + + /* + * Delete any no-longer-needed checkpoints: we do this first as it frees + * blocks to the live lists, and the freed blocks will then be included + * when writing the live extent lists. + */ + WT_CKPT_FOREACH(ckptbase, ckpt) { + if (!F_ISSET(ckpt, WT_CKPT_DELETE)) + continue; + + if (WT_VERBOSE_ISSET(session, ckpt)) { + if (tmp == NULL) + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__ckpt_string( + session, block, ckpt->raw.data, tmp)); + WT_VERBOSE_ERR(session, ckpt, + "%s: delete-checkpoint: %s: %s", + block->name, ckpt->name, (char *)tmp->data); + } + + /* + * Set the from/to checkpoint structures, where the "to" value + * may be the live tree. + */ + a = ckpt->bpriv; + if (F_ISSET(ckpt + 1, WT_CKPT_ADD)) + b = &block->live; + else + b = (ckpt + 1)->bpriv; + + /* + * Free the root page: there's nothing special about this free, + * the root page is allocated using normal rules, that is, it + * may have been taken from the avail list, and was entered on + * the live system's alloc list at that time. We free it into + * the checkpoint's discard list, however, not the live system's + * list because it appears on the checkpoint's alloc list and so + * must be paired in the checkpoint. + */ + if (a->root_offset != WT_BLOCK_INVALID_OFFSET) + WT_ERR(__wt_block_insert_ext(session, + &a->discard, a->root_offset, a->root_size)); + + /* + * Free the blocks used to hold the "from" checkpoint's extent + * lists directly to the live system's avail list, they were + * never on any alloc list. Include the "from" checkpoint's + * avail list, it's going away. + */ + WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc)); + WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail)); + WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard)); + + /* + * Roll the "from" alloc and discard extent lists into the "to" + * checkpoint's lists. + */ + if (a->alloc.entries != 0) + WT_ERR(__wt_block_extlist_merge( + session, &a->alloc, &b->alloc)); + if (a->discard.entries != 0) + WT_ERR(__wt_block_extlist_merge( + session, &a->discard, &b->discard)); + + /* + * If the "to" checkpoint is also being deleted, we're done with + * it, it's merged into some other checkpoint in the next loop. + * This means the extent lists may aggregate over a number of + * checkpoints, but that's OK, they're disjoint sets of ranges. + */ + if (F_ISSET(ckpt + 1, WT_CKPT_DELETE)) + continue; + + /* + * Find blocks for re-use: wherever the "to" checkpoint's + * allocate and discard lists overlap is fair game, move ranges + * appearing on both lists to the live checkpoint's newly + * available list. + */ + WT_ERR(__wt_block_extlist_overlap(session, block, b)); + + /* + * If we're updating the live system's information, we're done. + */ + if (F_ISSET(ckpt + 1, WT_CKPT_ADD)) + continue; + + /* + * We have to write the "to" checkpoint's extent lists out in + * new blocks, and update its cookie. + * + * Free the blocks used to hold the "to" checkpoint's extent + * lists directly to the live system's avail list, they were + * never on any alloc list. Don't include the "to" checkpoint's + * avail list, it's not changing. + */ + WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc)); + WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard)); + + F_SET(ckpt + 1, WT_CKPT_UPDATE); + } + + /* Update checkpoints marked for update. */ + WT_CKPT_FOREACH(ckptbase, ckpt) + if (F_ISSET(ckpt, WT_CKPT_UPDATE)) { + WT_ASSERT(session, !F_ISSET(ckpt, WT_CKPT_ADD)); + WT_ERR(__ckpt_update( + session, block, ckpt, ckpt->bpriv, 0, 0)); + } + +live_update: + ci = &block->live; + + /* Truncate the file if that's possible. */ + WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail)); + + /* Update the final, added checkpoint based on the live system. */ + WT_CKPT_FOREACH(ckptbase, ckpt) + if (F_ISSET(ckpt, WT_CKPT_ADD)) { + WT_ERR(__ckpt_update( + session, block, ckpt, ci, ckpt_size, 1)); + + /* + * XXX + * Our caller wants two pieces of information: the time + * the checkpoint was written and the final checkpoint + * size. This violates layering but the alternative is + * a call for the btree layer to crack the checkpoint + * cookie into its components, and that's a fair amount + * of work. (We could just read the system time in the + * session layer when updating the metadata file, but + * that won't work for the checkpoint size, and so we + * do both here.) + */ + ckpt->ckpt_size = ci->ckpt_size; + WT_ERR(__wt_epoch(session, &ckpt->sec, NULL)); + } + + /* + * Reset the live system's alloc and discard extent lists, leave the + * avail list alone. + */ + __wt_block_extlist_free(session, &ci->alloc); + WT_ERR(__wt_block_extlist_init(session, &ci->alloc, "live", "alloc")); + __wt_block_extlist_free(session, &ci->discard); + WT_ERR( + __wt_block_extlist_init(session, &ci->discard, "live", "discard")); + +#ifdef HAVE_DIAGNOSTIC + /* + * The first checkpoint in the system should always have an empty + * discard list. If we've read that checkpoint and/or created it, + * check. + */ + WT_CKPT_FOREACH(ckptbase, ckpt) + if (!F_ISSET(ckpt, WT_CKPT_DELETE)) + break; + if ((a = ckpt->bpriv) == NULL) + a = &block->live; + if (a->discard.entries != 0) { + __wt_errx(session, + "checkpoint incorrectly has blocks on the discard list"); + WT_ERR(WT_ERROR); + } +#endif + +err: if (locked) + __wt_spin_unlock(session, &block->live_lock); + + /* Discard any checkpoint information we loaded. */ + WT_CKPT_FOREACH(ckptbase, ckpt) + if ((ci = ckpt->bpriv) != NULL) { + __wt_block_extlist_free(session, &ci->alloc); + __wt_block_extlist_free(session, &ci->avail); + __wt_block_extlist_free(session, &ci->discard); + } + + __wt_scr_free(&tmp); + return (ret); +} + +/* + * __ckpt_update -- + * Update a checkpoint. + */ +static int +__ckpt_update( + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt, + WT_BLOCK_CKPT *ci, uint64_t ckpt_size, int is_live) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + uint8_t *endp; + +#ifdef HAVE_DIAGNOSTIC + /* Check the extent list combinations for overlaps. */ + WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail)); + WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail)); + WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard)); +#endif + /* + * Write the checkpoint's extent lists; we only write an avail list for + * the live system, other checkpoint's avail lists are static and never + * change. When we do write the avail list for the live system it's + * two lists: the current avail list plus the list of blocks that are + * being made available as of the new checkpoint. We can't merge that + * second list into the real list yet, it's not truly available until + * the new checkpoint location has been saved to the metadata. + */ + WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL)); + if (is_live) + WT_RET(__wt_block_extlist_write( + session, block, &ci->avail, &ci->ckpt_avail)); + WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL)); + + /* + * Set the file size for the live system. + * + * XXX + * We do NOT set the file size when re-writing checkpoints because we + * want to test the checkpoint's blocks against a reasonable maximum + * file size during verification. This is bad: imagine a checkpoint + * appearing early in the file, re-written, and then the checkpoint + * requires blocks at the end of the file, blocks after the listed file + * size. If the application opens that checkpoint for writing + * (discarding subsequent checkpoints), we would truncate the file to + * the early chunk, discarding the re-written checkpoint information. + * The alternative, updating the file size has its own problems, in + * that case we'd work correctly, but we'd lose all of the blocks + * between the original checkpoint and the re-written checkpoint. + * Currently, there's no API to roll-forward intermediate checkpoints, + * if there ever is, this will need to be fixed. + */ + if (is_live) + WT_RET(__wt_filesize(session, block->fh, &ci->file_size)); + + /* Set the checkpoint size for the live system. */ + if (is_live) + ci->ckpt_size = ckpt_size; + + /* + * Copy the checkpoint information into the checkpoint array's address + * cookie. + */ + WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE)); + endp = ckpt->raw.mem; + WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci)); + ckpt->raw.size = WT_PTRDIFF32(endp, ckpt->raw.mem); + + if (WT_VERBOSE_ISSET(session, ckpt)) { + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp)); + WT_VERBOSE_ERR(session, ckpt, + "%s: create-checkpoint: %s: %s", + block->name, ckpt->name, (char *)tmp->data); + } + +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __wt_block_checkpoint_resolve -- + * Resolve a checkpoint. + */ +int +__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_BLOCK_CKPT *ci; + WT_DECL_RET; + + ci = &block->live; + + /* + * Checkpoints are a two-step process: first, write a new checkpoint to + * disk (including all the new extent lists for modified checkpoints + * and the live system). As part of this, create a list of file blocks + * newly available for reallocation, based on checkpoints being deleted. + * We then return the locations of the new checkpoint information to our + * caller. Our caller has to write that information into some kind of + * stable storage, and once that's done, we can actually allocate from + * that list of newly available file blocks. (We can't allocate from + * that list immediately because the allocation might happen before our + * caller saves the new checkpoint information, and if we crashed before + * the new checkpoint location was saved, we'd have overwritten blocks + * still referenced by checkpoints in the system.) In summary, there is + * a second step: after our caller saves the checkpoint information, we + * are called to add the newly available blocks into the live system's + * available list. + */ + __wt_spin_lock(session, &block->live_lock); + ret = __wt_block_extlist_merge(session, &ci->ckpt_avail, &ci->avail); + __wt_spin_unlock(session, &block->live_lock); + + /* Discard the list. */ + __wt_block_extlist_free(session, &ci->ckpt_avail); + + return (ret); +} + +/* + * __ckpt_string -- + * Return a printable string representation of a checkpoint address cookie. + */ +static int +__ckpt_string(WT_SESSION_IMPL *session, + WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf) +{ + WT_BLOCK_CKPT *ci, _ci; + + /* Initialize the checkpoint, crack the cookie. */ + ci = &_ci; + WT_RET(__wt_block_ckpt_init(session, block, ci, "string", 0)); + WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci)); + + WT_RET(__wt_buf_fmt(session, buf, + "version=%d", + ci->version)); + if (ci->root_offset == WT_BLOCK_INVALID_OFFSET) + WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]")); + else + WT_RET(__wt_buf_catfmt(session, buf, + ", root=[%" + PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", + (uintmax_t)ci->root_offset, + (uintmax_t)(ci->root_offset + ci->root_size), + ci->root_size, ci->root_cksum)); + if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET) + WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]")); + else + WT_RET(__wt_buf_catfmt(session, buf, + ", alloc=[%" + PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", + (uintmax_t)ci->alloc.offset, + (uintmax_t)(ci->alloc.offset + ci->alloc.size), + ci->alloc.size, ci->alloc.cksum)); + if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET) + WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]")); + else + WT_RET(__wt_buf_catfmt(session, buf, + ", avail=[%" + PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", + (uintmax_t)ci->avail.offset, + (uintmax_t)(ci->avail.offset + ci->avail.size), + ci->avail.size, ci->avail.cksum)); + if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET) + WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]")); + else + WT_RET(__wt_buf_catfmt(session, buf, + ", discard=[%" + PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", + (uintmax_t)ci->discard.offset, + (uintmax_t)(ci->discard.offset + ci->discard.size), + ci->discard.size, ci->discard.cksum)); + WT_RET(__wt_buf_catfmt(session, buf, + ", file size=%" PRIuMAX + ", write generation=%" PRIu64, + (uintmax_t)ci->file_size, + ci->write_gen)); + + return (0); +} diff --git a/src/block/block_ext.c b/src/block/block_ext.c index b102bc54899..7ab2a6fa590 100644 --- a/src/block/block_ext.c +++ b/src/block/block_ext.c @@ -471,9 +471,9 @@ __wt_block_off_free( * locks required to manipulate the extent lists. * * We can reuse this extent immediately if it was allocated during this - * snapshot, merge it into the avail list (which slows file growth in + * checkpoint, merge it into the avail list (which slows file growth in * workloads including repeated overflow record modification). If this - * extent is referenced in a previous snapshot, merge into the discard + * extent is referenced in a previous checkpoint, merge into the discard * list. */ if ((ret = __wt_block_off_remove_overlap( @@ -515,7 +515,7 @@ __wt_block_extlist_check( continue; } WT_RET_MSG(session, EINVAL, - "snapshot merge check: %s list overlaps the %s list", + "checkpoint merge check: %s list overlaps the %s list", al->name, bl->name); } return (0); @@ -524,17 +524,17 @@ __wt_block_extlist_check( /* * __wt_block_extlist_overlap -- - * Review a snapshot's alloc/discard extent lists, move overlaps into the - * live system's snapshot-avail list. + * Review a checkpoint's alloc/discard extent lists, move overlaps into the + * live system's checkpoint-avail list. */ int __wt_block_extlist_overlap( - WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si) + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci) { WT_EXT *alloc, *discard; - alloc = si->alloc.off[0]; - discard = si->discard.off[0]; + alloc = ci->alloc.off[0]; + discard = ci->discard.off[0]; /* Walk the lists in parallel, looking for overlaps. */ while (alloc != NULL && discard != NULL) { @@ -553,7 +553,7 @@ __wt_block_extlist_overlap( /* Reconcile the overlap. */ WT_RET(__block_ext_overlap(session, block, - &si->alloc, &alloc, &si->discard, &discard)); + &ci->alloc, &alloc, &ci->discard, &discard)); } return (0); } @@ -570,7 +570,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session, WT_EXTLIST *avail, *el; off_t off, size; - avail = &block->live.snapshot_avail; + avail = &block->live.ckpt_avail; /* * The ranges overlap, choose the range we're going to take from each. @@ -926,7 +926,7 @@ corrupted: WT_ERR_MSG(session, WT_ERROR, * We could insert instead of merge, because ranges shouldn't * overlap, but merge knows how to allocate WT_EXT structures, * and a little paranoia is a good thing (if we corrupted the - * list and crashed, and rolled back to a corrupted snapshot, + * list and crashed, and rolled back to a corrupted checkpoint, * this might save us?) */ WT_ERR(__block_merge(session, el, off, size)); diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index b1bc28766f6..e54e01190d4 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -132,41 +132,41 @@ __wt_bm_close(WT_SESSION_IMPL *session) } /* - * __wt_bm_snapshot -- - * Write a buffer into a block, creating a snapshot. + * __wt_bm_checkpoint -- + * Write a buffer into a block, creating a checkpoint. */ int -__wt_bm_snapshot(WT_SESSION_IMPL *session, WT_ITEM *buf, WT_SNAPSHOT *snapbase) +__wt_bm_checkpoint(WT_SESSION_IMPL *session, WT_ITEM *buf, WT_CKPT *ckptbase) { WT_BLOCK *block; if ((block = session->btree->block) == NULL) return (__bm_invalid(session)); - return (__wt_block_snapshot(session, block, buf, snapbase)); + return (__wt_block_checkpoint(session, block, buf, ckptbase)); } /* - * __wt_bm_snapshot_resolve -- - * Resolve the snapshot. + * __wt_bm_checkpoint_resolve -- + * Resolve the checkpoint. */ int -__wt_bm_snapshot_resolve(WT_SESSION_IMPL *session, WT_SNAPSHOT *snapbase) +__wt_bm_checkpoint_resolve(WT_SESSION_IMPL *session) { WT_BLOCK *block; if ((block = session->btree->block) == NULL) return (__bm_invalid(session)); - return (__wt_block_snapshot_resolve(session, block, snapbase)); + return (__wt_block_checkpoint_resolve(session, block)); } /* - * __wt_bm_snapshot_load -- - * Load a snapshot point. + * __wt_bm_checkpoint_load -- + * Load a checkpoint point. */ int -__wt_bm_snapshot_load(WT_SESSION_IMPL *session, +__wt_bm_checkpoint_load(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size, int readonly) { WT_BLOCK *block; @@ -174,23 +174,23 @@ __wt_bm_snapshot_load(WT_SESSION_IMPL *session, if ((block = session->btree->block) == NULL) return (__bm_invalid(session)); - return (__wt_block_snapshot_load( + return (__wt_block_checkpoint_load( session, block, buf, addr, addr_size, readonly)); } /* - * __wt_bm_snapshot_unload -- - * Unload a snapshot point. + * __wt_bm_checkpoint_unload -- + * Unload a checkpoint point. */ int -__wt_bm_snapshot_unload(WT_SESSION_IMPL *session) +__wt_bm_checkpoint_unload(WT_SESSION_IMPL *session) { WT_BLOCK *block; if ((block = session->btree->block) == NULL) return (__bm_invalid(session)); - return (__wt_block_snapshot_unload(session, block)); + return (__wt_block_checkpoint_unload(session, block)); } /* @@ -333,14 +333,14 @@ __wt_bm_salvage_end(WT_SESSION_IMPL *session) * Start a block manager salvage. */ int -__wt_bm_verify_start(WT_SESSION_IMPL *session, WT_SNAPSHOT *snapbase) +__wt_bm_verify_start(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) { WT_BLOCK *block; if ((block = session->btree->block) == NULL) return (__bm_invalid(session)); - return (__wt_block_verify_start(session, block, snapbase)); + return (__wt_block_verify_start(session, block, ckptbase)); } /* diff --git a/src/block/block_open.c b/src/block/block_open.c index f00e333b735..af4015ed3bb 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -112,7 +112,7 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, /* Open the underlying file handle. */ WT_ERR(__wt_open(session, filename, 0, 0, 1, &block->fh)); - /* Initialize the live snapshot lock. */ + /* Initialize the live checkpoint's lock. */ __wt_spin_init(session, &block->live_lock); /* @@ -142,7 +142,7 @@ __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_VERBOSE_RETVAL(session, block, ret, "close"); - ret = __wt_block_snapshot_unload(session, block); + ret = __wt_block_checkpoint_unload(session, block); if (block->name != NULL) __wt_free(session, block->name); diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c index 0a44731d5aa..0d87232d15e 100644 --- a/src/block/block_slvg.c +++ b/src/block/block_slvg.c @@ -21,10 +21,10 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_RET(__wt_desc_init(session, block->fh)); /* - * Salvage creates a new snapshot when it's finished, set up for + * Salvage creates a new checkpoint when it's finished, set up for * rolling an empty file forward. */ - WT_RET(__wt_block_snap_init(session, block, &block->live, "live", 1)); + WT_RET(__wt_block_ckpt_init(session, block, &block->live, "live", 1)); /* * Truncate the file to an initial sector plus N allocation size @@ -48,9 +48,9 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block) block->slvg_off = WT_BLOCK_DESC_SECTOR; /* - * The only snapshot extent we care about is the allocation list. Start - * with the entire file on the allocation list, we'll "free" any blocks - * we don't want as we process the file. + * The only checkpoint extent we care about is the allocation list. + * Start with the entire file on the allocation list, we'll "free" + * any blocks we don't want as we process the file. */ WT_RET(__wt_block_insert_ext(session, &block->live.alloc, WT_BLOCK_DESC_SECTOR, len - WT_BLOCK_DESC_SECTOR)); @@ -70,8 +70,8 @@ __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block) block->slvg = 0; - /* Discard the snapshot. */ - return (__wt_block_snapshot_unload(session, block)); + /* Discard the checkpoint. */ + return (__wt_block_checkpoint_unload(session, block)); } /* diff --git a/src/block/block_snap.c b/src/block/block_snap.c deleted file mode 100644 index 6f2b1fd9224..00000000000 --- a/src/block/block_snap.c +++ /dev/null @@ -1,706 +0,0 @@ -/*- - * Copyright (c) 2008-2012 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "wt_internal.h" - -static int __snapshot_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_SNAPSHOT *); -static int __snapshot_string( - WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *); -static int __snapshot_update(WT_SESSION_IMPL *, - WT_BLOCK *, WT_SNAPSHOT *, WT_BLOCK_SNAPSHOT *, uint64_t, int); - -/* - * __wt_block_snap_init -- - * Initialize a snapshot structure. - */ -int -__wt_block_snap_init(WT_SESSION_IMPL *session, - WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si, const char *name, int is_live) -{ - WT_DECL_RET; - - /* - * If we're loading a new live snapshot, there shouldn't be one already - * loaded. The btree engine should prevent this from ever happening, - * but paranoia is a healthy thing. - */ - if (is_live) { - __wt_spin_lock(session, &block->live_lock); - if (block->live_load) - ret = EINVAL; - else - block->live_load = 1; - __wt_spin_unlock(session, &block->live_lock); - if (ret) - WT_RET_MSG(session, EINVAL, "snapshot already loaded"); - } - - memset(si, 0, sizeof(*si)); - - si->root_offset = WT_BLOCK_INVALID_OFFSET; - - WT_RET(__wt_block_extlist_init(session, &si->alloc, name, "alloc")); - WT_RET(__wt_block_extlist_init(session, &si->avail, name, "avail")); - WT_RET(__wt_block_extlist_init(session, &si->discard, name, "discard")); - - si->file_size = WT_BLOCK_DESC_SECTOR; - WT_RET(__wt_block_extlist_init( - session, &si->snapshot_avail, name, "snapshot_avail")); - - return (0); -} - -/* - * __wt_block_snapshot_load -- - * Load a snapshot. - */ -int -__wt_block_snapshot_load(WT_SESSION_IMPL *session, - WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size, - int readonly) -{ - WT_BLOCK_SNAPSHOT *si; - WT_DECL_ITEM(tmp); - WT_DECL_RET; - - WT_UNUSED(addr_size); - - /* - * Sometimes we don't find a root page (we weren't given a snapshot, - * or the referenced snapshot was empty). In that case we return a - * root page size of 0. Set that up now. - */ - dsk->size = 0; - - si = &block->live; - WT_RET(__wt_block_snap_init(session, block, si, "live", 1)); - - if (WT_VERBOSE_ISSET(session, snapshot)) { - if (addr != NULL) { - WT_ERR(__wt_scr_alloc(session, 0, &tmp)); - WT_ERR(__snapshot_string(session, block, addr, tmp)); - } - WT_VERBOSE_ERR(session, snapshot, - "%s: load-snapshot: %s", block->name, - addr == NULL ? "[Empty]" : (char *)tmp->data); - } - - /* If not loading a snapshot from disk, we're done. */ - if (addr == NULL || addr_size == 0) - return (0); - - /* Crack the snapshot cookie. */ - if (addr != NULL) - WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si)); - - /* Verify sets up next. */ - if (block->verify) - WT_ERR(__wt_verify_snap_load(session, block, si)); - - /* Read, and optionally verify, any root page. */ - if (si->root_offset != WT_BLOCK_INVALID_OFFSET) { - WT_ERR(__wt_block_read_off(session, block, - dsk, si->root_offset, si->root_size, si->root_cksum)); - if (block->verify) { - if (tmp == NULL) { - WT_ERR(__wt_scr_alloc(session, 0, &tmp)); - WT_ERR(__snapshot_string( - session, block, addr, tmp)); - } - WT_ERR( - __wt_verify_dsk(session, (char *)tmp->data, dsk)); - } - } - - /* - * Rolling a snapshot forward requires the avail list, the blocks from - * which we can allocate. - */ - if (!readonly) - WT_ERR(__wt_block_extlist_read(session, block, &si->avail)); - - /* - * If the snapshot can be written, that means anything written after - * the snapshot is no longer interesting. Truncate the file. - */ - if (!readonly) { - WT_VERBOSE_ERR(session, snapshot, - "truncate file to %" PRIuMAX, (uintmax_t)si->file_size); - WT_ERR(__wt_ftruncate(session, block->fh, si->file_size)); - } - - if (0) { -err: (void)__wt_block_snapshot_unload(session, block); - } - - __wt_scr_free(&tmp); - return (ret); -} - -/* - * __wt_block_snapshot_unload -- - * Unload a snapshot. - */ -int -__wt_block_snapshot_unload(WT_SESSION_IMPL *session, WT_BLOCK *block) -{ - WT_BLOCK_SNAPSHOT *si; - WT_DECL_RET; - - WT_VERBOSE_RETVAL( - session, snapshot, ret, "%s: unload snapshot", block->name); - - si = &block->live; - - /* Verify cleanup. */ - if (block->verify) - WT_TRET(__wt_verify_snap_unload(session, block, si)); - - __wt_block_snap_destroy(session, si); - - block->live_load = 0; - - return (ret); -} - -/* - * __wt_block_snap_destroy -- - * Clear a snapshot structure. - */ -void -__wt_block_snap_destroy(WT_SESSION_IMPL *session, WT_BLOCK_SNAPSHOT *si) -{ - /* Discard the extent lists. */ - __wt_block_extlist_free(session, &si->alloc); - __wt_block_extlist_free(session, &si->avail); - __wt_block_extlist_free(session, &si->discard); - __wt_block_extlist_free(session, &si->snapshot_avail); -} - -/* - * __wt_block_snapshot -- - * Create a new snapshot. - */ -int -__wt_block_snapshot(WT_SESSION_IMPL *session, - WT_BLOCK *block, WT_ITEM *buf, WT_SNAPSHOT *snapbase) -{ - WT_BLOCK_SNAPSHOT *si; - - si = &block->live; - si->version = WT_BM_SNAPSHOT_VERSION; - - /* - * Write the root page: it's possible for there to be a snapshot of - * an empty tree, in which case, we store an illegal root offset. - * - * XXX - * We happen to know that snapshots are single-threaded above us in - * the btree engine. That's probably something we want to guarantee - * for any WiredTiger block manager. - */ - if (buf == NULL) { - si->root_offset = WT_BLOCK_INVALID_OFFSET; - si->root_size = si->root_cksum = 0; - } else - WT_RET(__wt_block_write_off(session, block, buf, - &si->root_offset, &si->root_size, &si->root_cksum, 0)); - - /* Process the list of snapshots, deleting and updating as required. */ - WT_RET(__snapshot_process(session, block, snapbase)); - - /* - * Snapshots have to hit disk (it would be reasonable to configure for - * lazy snapshots, but we don't support them yet). Regardless, we're - * not holding any locks, other writers can proceed while we wait. - */ - if (!F_ISSET(S2C(session), WT_CONN_NOSYNC)) - WT_RET(__wt_fsync(session, block->fh)); - - return (0); -} - -/* - * __snapshot_extlist_fblocks -- - * If an extent list was read from disk, free its space to the live avail - * list. - */ -static inline int -__snapshot_extlist_fblocks( - WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el) -{ - if (el->offset == WT_BLOCK_INVALID_OFFSET) - return (0); - return (__wt_block_insert_ext( - session, &block->live.avail, el->offset, el->size)); -} - -/* - * __snapshot_process -- - * Process the list of snapshots. - */ -static int -__snapshot_process( - WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase) -{ - WT_BLOCK_SNAPSHOT *a, *b, *si; - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_SNAPSHOT *snap; - uint64_t snapshot_size; - int deleting, locked; - - si = &block->live; - locked = 0; - - /* - * We've allocated our last page, update the snapshot size. We need to - * calculate the live system's snapshot size before reading and merging - * snapshot allocation and discard information from the snapshots we're - * deleting, those operations will change the underlying byte counts. - */ - snapshot_size = si->snapshot_size; - snapshot_size += si->alloc.bytes; - snapshot_size -= si->discard.bytes; - - /* - * Extents that become newly available as a result of deleting previous - * snapshots are added to a list of extents. The list should be empty, - * but there's no explicit "free the snapshot information" call into the - * block manager; if there was an error in an upper level resulting in - * the snapshot never being "resolved", the list might not be empty. - * - * XXX - * This isn't sufficient, actually: we're going to leak all the blocks - * that were written as part of the last snapshot because it was never - * resolved. - */ - __wt_block_extlist_free(session, &si->snapshot_avail); - WT_RET(__wt_block_extlist_init( - session, &si->snapshot_avail, "live", "snapshot_avail")); - - /* - * To delete a snapshot, we'll need snapshot information for it, and we - * have to read that from the disk. - */ - deleting = 0; - WT_SNAPSHOT_FOREACH(snapbase, snap) { - /* - * To delete a snapshot, we'll need snapshot information for it - * and the subsequent snapshot. The test is tricky, we have to - * load the current snapshot's information if it's marked for - * deletion, or if it follows a snapshot marked for deletion, - * where the boundary cases are the first snapshot in the list - * and the last snapshot in the list: if we're deleting the last - * snapshot in the list, there's no next snapshot, the snapshot - * will be merged into the live tree. - */ - if (!F_ISSET(snap, WT_SNAP_DELETE) && - (snap == snapbase || - F_ISSET(snap, WT_SNAP_ADD) || - !F_ISSET(snap - 1, WT_SNAP_DELETE))) - continue; - deleting = 1; - - /* - * Allocate a snapshot structure, crack the cookie and read the - * snapshot's extent lists. - * - * Ignore the avail list: snapshot avail lists are only useful - * if we are rolling forward from the particular snapshot and - * they represent our best understanding of what blocks can be - * allocated. If we are not operating on the live snapshot, - * subsequent snapshots might have allocated those blocks, and - * the avail list is useless. We don't discard it, because it - * is useful as part of verification, but we don't re-write it - * either. - */ - WT_ERR(__wt_calloc( - session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv)); - si = snap->bpriv; - WT_ERR(__wt_block_snap_init(session, block, si, snap->name, 0)); - WT_ERR(__wt_block_buffer_to_snapshot( - session, block, snap->raw.data, si)); - WT_ERR(__wt_block_extlist_read(session, block, &si->alloc)); - WT_ERR(__wt_block_extlist_read(session, block, &si->discard)); - } - - /* - * Hold a lock so the live extent lists and the file size can't change - * underneath us. I suspect we'll tighten this if snapshots take too - * much time away from real work: we read historic snapshot information - * without a lock, but we could also merge and re-write the delete - * snapshot information without a lock, except for ranges merged into - * the live tree. - */ - __wt_spin_lock(session, &block->live_lock); - locked = 1; - - /* Skip the additional processing if we aren't deleting snapshots. */ - if (!deleting) - goto live_update; - - /* - * Delete any no-longer-needed snapshots: we do this first as it frees - * blocks to the live lists, and the freed blocks will then be included - * when writing the live extent lists. - */ - WT_SNAPSHOT_FOREACH(snapbase, snap) { - if (!F_ISSET(snap, WT_SNAP_DELETE)) - continue; - - if (WT_VERBOSE_ISSET(session, snapshot)) { - if (tmp == NULL) - WT_ERR(__wt_scr_alloc(session, 0, &tmp)); - WT_ERR(__snapshot_string( - session, block, snap->raw.data, tmp)); - WT_VERBOSE_ERR(session, snapshot, - "%s: delete-snapshot: %s: %s", - block->name, snap->name, (char *)tmp->data); - } - - /* - * Set the from/to snapshot structures, where the "to" value - * may be the live tree. - */ - a = snap->bpriv; - if (F_ISSET(snap + 1, WT_SNAP_ADD)) - b = &block->live; - else - b = (snap + 1)->bpriv; - - /* - * Free the root page: there's nothing special about this free, - * the root page is allocated using normal rules, that is, it - * may have been taken from the avail list, and was entered on - * the live system's alloc list at that time. We free it into - * the snapshot's discard list, however, not the live system's - * list because it appears on the snapshot's alloc list and so - * must be paired in the snapshot. - */ - if (a->root_offset != WT_BLOCK_INVALID_OFFSET) - WT_ERR(__wt_block_insert_ext(session, - &a->discard, a->root_offset, a->root_size)); - - /* - * Free the blocks used to hold the "from" snapshot's extent - * lists directly to the live system's avail list, they were - * never on any alloc list. Include the "from" snapshot's - * avail list, it's going away. - */ - WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc)); - WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail)); - WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard)); - - /* - * Roll the "from" alloc and discard extent lists into the "to" - * snapshot's lists. - */ - if (a->alloc.entries != 0) - WT_ERR(__wt_block_extlist_merge( - session, &a->alloc, &b->alloc)); - if (a->discard.entries != 0) - WT_ERR(__wt_block_extlist_merge( - session, &a->discard, &b->discard)); - - /* - * If the "to" snapshot is also being deleted, we're done with - * it, it's merged into some other snapshot in the next loop. - * This means the extent lists may aggregate over a number of - * snapshots, but that's OK, they're disjoint sets of ranges. - */ - if (F_ISSET(snap + 1, WT_SNAP_DELETE)) - continue; - - /* - * Find blocks for re-use: wherever the "to" snapshot's allocate - * and discard lists overlap is fair game, move ranges appearing - * on both lists to the live snapshot's newly available list. - */ - WT_ERR(__wt_block_extlist_overlap(session, block, b)); - - /* - * If we're updating the live system's information, we're done. - */ - if (F_ISSET(snap + 1, WT_SNAP_ADD)) - continue; - - /* - * We have to write the "to" snapshot's extent lists out in new - * blocks, and update its cookie. - * - * Free the blocks used to hold the "to" snapshot's extent lists - * directly to the live system's avail list, they were never on - * any alloc list. Do not include the "to" snapshot's avail - * list, it's not changing. - */ - WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc)); - WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard)); - - F_SET(snap + 1, WT_SNAP_UPDATE); - } - - /* Update snapshots marked for update. */ - WT_SNAPSHOT_FOREACH(snapbase, snap) - if (F_ISSET(snap, WT_SNAP_UPDATE)) { - WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD)); - WT_ERR(__snapshot_update( - session, block, snap, snap->bpriv, 0, 0)); - } - -live_update: - si = &block->live; - - /* Truncate the file if that's possible. */ - WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail)); - - /* Update the final, added snapshot based on the live system. */ - WT_SNAPSHOT_FOREACH(snapbase, snap) - if (F_ISSET(snap, WT_SNAP_ADD)) { - WT_ERR(__snapshot_update( - session, block, snap, si, snapshot_size, 1)); - - /* - * XXX - * Our caller wants two pieces of information: the time - * the snapshot was taken and the final snapshot size. - * This violates layering but the alternative is a call - * for the btree layer to crack the snapshot cookie into - * its components, and that's a fair amount of work. - * (We could just read the system time in the session - * layer when updating the metadata file, but that won't - * work for the snapshot size, and so we do both here.) - */ - snap->snapshot_size = si->snapshot_size; - WT_ERR(__wt_epoch(session, &snap->sec, NULL)); - } - - /* - * Reset the live system's alloc and discard extent lists, leave the - * avail list alone. - */ - __wt_block_extlist_free(session, &si->alloc); - WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc")); - __wt_block_extlist_free(session, &si->discard); - WT_ERR( - __wt_block_extlist_init(session, &si->discard, "live", "discard")); - -#ifdef HAVE_DIAGNOSTIC - /* - * The first snapshot in the system should always have an empty discard - * list. If we've read that snapshot and/or created it, check. - */ - WT_SNAPSHOT_FOREACH(snapbase, snap) - if (!F_ISSET(snap, WT_SNAP_DELETE)) - break; - if ((a = snap->bpriv) == NULL) - a = &block->live; - if (a->discard.entries != 0) { - __wt_errx(session, - "snapshot incorrectly has blocks on the discard list"); - WT_ERR(WT_ERROR); - } -#endif - -err: if (locked) - __wt_spin_unlock(session, &block->live_lock); - - /* Discard any snapshot information we loaded, we no longer need it. */ - WT_SNAPSHOT_FOREACH(snapbase, snap) - if ((si = snap->bpriv) != NULL) { - __wt_block_extlist_free(session, &si->alloc); - __wt_block_extlist_free(session, &si->avail); - __wt_block_extlist_free(session, &si->discard); - } - - __wt_scr_free(&tmp); - return (ret); -} - -/* - * __snapshot_update -- - * Update a snapshot. - */ -static int -__snapshot_update( - WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snap, - WT_BLOCK_SNAPSHOT *si, uint64_t snapshot_size, int is_live) -{ - WT_DECL_ITEM(tmp); - WT_DECL_RET; - uint8_t *endp; - -#ifdef HAVE_DIAGNOSTIC - /* Check the extent list combinations for overlaps. */ - WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->avail)); - WT_RET(__wt_block_extlist_check(session, &si->discard, &si->avail)); - WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->discard)); -#endif - /* - * Write the snapshot's extent lists; we only write an avail list for - * the live system, other snapshot's avail lists are static and never - * change. When we do write the avail list for the live system it's - * two lists: the current avail list plus the list of blocks that are - * being made available as of the new snapshot. We can't merge that - * second list into the real list yet, it's not truly available until - * the new snapshot location has been saved to the metadata. - */ - WT_RET(__wt_block_extlist_write(session, block, &si->alloc, NULL)); - if (is_live) - WT_RET(__wt_block_extlist_write( - session, block, &si->avail, &si->snapshot_avail)); - WT_RET(__wt_block_extlist_write(session, block, &si->discard, NULL)); - - /* - * Set the file size for the live system. - * - * XXX - * We do NOT set the file size when re-writing snapshots because we want - * to test the snapshot's blocks against a reasonable maximum file size - * during verification. This is not good: imagine a snapshot appearing - * early in the file, re-written, and then the snapshot requires blocks - * at the end of the file, blocks after the listed file size. If the - * application opens that snapshot for writing (discarding subsequent - * snapshots), we would truncate the file to the early chunk, discarding - * the re-written snapshot information. The alternative, updating the - * file size has its own problems, in that case we'd work correctly, but - * we'd lose all of the blocks between the original snapshot and the - * re-written snapshot. Currently, there's no API to roll-forward - * intermediate snapshots, if there ever is, this will need to be fixed. - */ - if (is_live) - WT_RET(__wt_filesize(session, block->fh, &si->file_size)); - - /* Set the snapshot size for the live system. */ - if (is_live) - si->snapshot_size = snapshot_size; - - /* - * Copy the snapshot information into the snapshot array's address - * cookie. - */ - WT_RET(__wt_buf_init(session, &snap->raw, WT_BTREE_MAX_ADDR_COOKIE)); - endp = snap->raw.mem; - WT_RET(__wt_block_snapshot_to_buffer(session, block, &endp, si)); - snap->raw.size = WT_PTRDIFF32(endp, snap->raw.mem); - - if (WT_VERBOSE_ISSET(session, snapshot)) { - WT_RET(__wt_scr_alloc(session, 0, &tmp)); - WT_ERR(__snapshot_string(session, block, snap->raw.data, tmp)); - WT_VERBOSE_ERR(session, snapshot, - "%s: create-snapshot: %s: %s", - block->name, snap->name, (char *)tmp->data); - } - -err: __wt_scr_free(&tmp); - return (ret); -} - -/* - * __wt_block_snapshot_resolve -- - * Resolve a snapshot. - */ -int -__wt_block_snapshot_resolve( - WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase) -{ - WT_BLOCK_SNAPSHOT *si; - WT_DECL_RET; - - si = &block->live; - - /* - * Snapshots are a two-step process: first, we write a new snapshot to - * disk (including all the new extent lists for modified snapshots and - * the live system). As part of this we create a list of file blocks - * newly available for re-allocation, based on snapshots being deleted. - * We then return the locations of the new snapshot information to our - * caller. Our caller has to write that information into some kind of - * stable storage, and once that's done, we can actually allocate from - * that list of newly available file blocks. (We can't allocate from - * that list immediately because the allocation might happen before our - * caller saves the new snapshot information, and if we crashed before - * the new snapshot information was saved, we'd have overwritten blocks - * still referenced by snapshots in the system.) In summary, there is - * a second step, after our caller saves the snapshot information, we - * are called to add the newly available blocks into the live system's - * available list. - */ - __wt_spin_lock(session, &block->live_lock); - ret = - __wt_block_extlist_merge(session, &si->snapshot_avail, &si->avail); - __wt_spin_unlock(session, &block->live_lock); - - /* Discard the list. */ - __wt_block_extlist_free(session, &si->snapshot_avail); - - WT_UNUSED(snapbase); - return (ret); -} - -/* - * __snapshot_string -- - * Return a printable string representation of a snapshot address cookie. - */ -static int -__snapshot_string(WT_SESSION_IMPL *session, - WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf) -{ - WT_BLOCK_SNAPSHOT *si, _si; - - /* Initialize the snapshot, crack the cookie. */ - si = &_si; - WT_RET(__wt_block_snap_init(session, block, si, "string", 0)); - WT_RET(__wt_block_buffer_to_snapshot(session, block, addr, si)); - - WT_RET(__wt_buf_fmt(session, buf, - "version=%d", - si->version)); - if (si->root_offset == WT_BLOCK_INVALID_OFFSET) - WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]")); - else - WT_RET(__wt_buf_catfmt(session, buf, - ", root=[%" - PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", - (uintmax_t)si->root_offset, - (uintmax_t)(si->root_offset + si->root_size), - si->root_size, si->root_cksum)); - if (si->alloc.offset == WT_BLOCK_INVALID_OFFSET) - WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]")); - else - WT_RET(__wt_buf_catfmt(session, buf, - ", alloc=[%" - PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", - (uintmax_t)si->alloc.offset, - (uintmax_t)(si->alloc.offset + si->alloc.size), - si->alloc.size, si->alloc.cksum)); - if (si->avail.offset == WT_BLOCK_INVALID_OFFSET) - WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]")); - else - WT_RET(__wt_buf_catfmt(session, buf, - ", avail=[%" - PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", - (uintmax_t)si->avail.offset, - (uintmax_t)(si->avail.offset + si->avail.size), - si->avail.size, si->avail.cksum)); - if (si->discard.offset == WT_BLOCK_INVALID_OFFSET) - WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]")); - else - WT_RET(__wt_buf_catfmt(session, buf, - ", discard=[%" - PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]", - (uintmax_t)si->discard.offset, - (uintmax_t)(si->discard.offset + si->discard.size), - si->discard.size, si->discard.cksum)); - WT_RET(__wt_buf_catfmt(session, buf, - ", file size=%" PRIuMAX - ", write generation=%" PRIu64, - (uintmax_t)si->file_size, - si->write_gen)); - - return (0); -} diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c index 711e982b66d..5373e5d2b30 100644 --- a/src/block/block_vrfy.c +++ b/src/block/block_vrfy.c @@ -7,14 +7,14 @@ #include "wt_internal.h" +static int __verify_ckptfrag_add(WT_SESSION_IMPL *, WT_BLOCK *, off_t, off_t); +static int __verify_ckptfrag_chk(WT_SESSION_IMPL *, WT_BLOCK *); static int __verify_filefrag_add( WT_SESSION_IMPL *, WT_BLOCK *, off_t, off_t, int); static int __verify_filefrag_chk(WT_SESSION_IMPL *, WT_BLOCK *); -static int __verify_snapfrag_add(WT_SESSION_IMPL *, WT_BLOCK *, off_t, off_t); -static int __verify_snapfrag_chk(WT_SESSION_IMPL *, WT_BLOCK *); -static int __verify_start_avail(WT_SESSION_IMPL *, WT_BLOCK *, WT_SNAPSHOT *); +static int __verify_start_avail(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *); static int __verify_start_filesize( - WT_SESSION_IMPL *, WT_BLOCK *, WT_SNAPSHOT *, off_t *); + WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, off_t *); /* The bit list ignores the first sector: convert to/from a frag/offset. */ #define WT_OFF_TO_FRAG(block, off) \ @@ -28,7 +28,7 @@ static int __verify_start_filesize( */ int __wt_block_verify_start( - WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase) + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) { off_t file_size; @@ -40,12 +40,12 @@ __wt_block_verify_start( return (0); /* - * Opening a WiredTiger file truncates it back to the snapshot we are + * Opening a WiredTiger file truncates it back to the checkpoint we are * rolling forward, which means it's OK if there are blocks written - * after that snapshot, they'll be ignored. Find the largest file size - * referenced by any snapshot. + * after that checkpoint, they'll be ignored. Find the largest file + * size referenced by any checkpoint. */ - WT_RET(__verify_start_filesize(session, block, snapbase, &file_size)); + WT_RET(__verify_start_filesize(session, block, ckptbase, &file_size)); /* * Allocate a bit array, where each bit represents a single allocation @@ -78,16 +78,16 @@ __wt_block_verify_start( /* * We maintain an allocation list that is rolled forward through the - * set of snapshots. + * set of checkpoints. */ WT_RET(__wt_block_extlist_init( session, &block->verify_alloc, "verify", "alloc")); /* - * The only snapshot avail list we care about is the last one written; + * The only checkpoint avail list we care about is the last one written; * get it now and initialize the list of file fragments. */ - WT_RET(__verify_start_avail(session, block, snapbase)); + WT_RET(__verify_start_avail(session, block, ckptbase)); block->verify = 1; return (0); @@ -95,36 +95,36 @@ __wt_block_verify_start( /* * __verify_start_filesize -- - * Set the file size for the last snapshot. + * Set the file size for the last checkpoint. */ static int __verify_start_filesize(WT_SESSION_IMPL *session, - WT_BLOCK *block, WT_SNAPSHOT *snapbase, off_t *file_sizep) + WT_BLOCK *block, WT_CKPT *ckptbase, off_t *file_sizep) { - WT_BLOCK_SNAPSHOT *si, _si; - WT_SNAPSHOT *snap; + WT_BLOCK_CKPT *ci, _ci; + WT_CKPT *ckpt; off_t file_size; - si = &_si; + ci = &_ci; /* - * Find the largest file size referenced by any snapshot -- that should - * be the last snapshot taken, but out of sheer, raving paranoia, look - * through the list, future changes to snapshots might break this code + * Find the largest file size referenced by any checkpoint: that should + * be the last checkpoint taken, but out of sheer, raving paranoia, look + * through the list, future changes to checkpoints might break this code * if we make that assumption. */ file_size = 0; - WT_SNAPSHOT_FOREACH(snapbase, snap) { - WT_RET(__wt_block_buffer_to_snapshot( - session, block, snap->raw.data, si)); - if (si->file_size > file_size) - file_size = si->file_size; + WT_CKPT_FOREACH(ckptbase, ckpt) { + WT_RET(__wt_block_buffer_to_ckpt( + session, block, ckpt->raw.data, ci)); + if (ci->file_size > file_size) + file_size = ci->file_size; } - /* Verify doesn't make any sense if we don't have a snapshot. */ + /* Verify doesn't make any sense if we don't have a checkpoint. */ if (file_size <= WT_BLOCK_DESC_SECTOR) WT_RET_MSG(session, WT_ERROR, - "%s has no snapshots to verify", block->name); + "%s has no checkpoints to verify", block->name); /* * The file size should be a multiple of the allocsize, offset by the @@ -133,7 +133,7 @@ __verify_start_filesize(WT_SESSION_IMPL *session, file_size -= WT_BLOCK_DESC_SECTOR; if (file_size % block->allocsize != 0) WT_RET_MSG(session, WT_ERROR, - "the snapshot file size is not a multiple of the " + "the checkpoint file size is not a multiple of the " "allocation size"); *file_sizep = file_size; @@ -142,32 +142,31 @@ __verify_start_filesize(WT_SESSION_IMPL *session, /* * __verify_start_avail -- - * Get the last snapshot's avail list and load it into the list of file + * Get the last checkpoint's avail list and load it into the list of file * fragments. */ static int __verify_start_avail( - WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase) + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) { - WT_BLOCK_SNAPSHOT *si, _si; + WT_BLOCK_CKPT *ci, _ci; + WT_CKPT *ckpt; WT_DECL_RET; WT_EXT *ext; WT_EXTLIST *el; - WT_SNAPSHOT *snap; - /* Get the last on-disk snapshot, if one exists. */ - WT_SNAPSHOT_FOREACH(snapbase, snap) + /* Get the last on-disk checkpoint, if one exists. */ + WT_CKPT_FOREACH(ckptbase, ckpt) ; - if (snap == snapbase) + if (ckpt == ckptbase) return (0); - --snap; + --ckpt; - si = &_si; - WT_RET(__wt_block_snap_init(session, block, si, snap->name, 0)); - WT_ERR( - __wt_block_buffer_to_snapshot(session, block, snap->raw.data, si)); + ci = &_ci; + WT_RET(__wt_block_ckpt_init(session, block, ci, ckpt->name, 0)); + WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci)); - el = &si->avail; + el = &ci->avail; if (el->offset != WT_BLOCK_INVALID_OFFSET) { WT_ERR(__wt_block_extlist_read(session, block, el)); WT_EXT_FOREACH(ext, el->off) @@ -176,7 +175,7 @@ __verify_start_avail( break; } -err: __wt_block_snap_destroy(session, si); +err: __wt_block_ckpt_destroy(session, ci); return (ret); } @@ -197,59 +196,59 @@ __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block) /* Discard the fragment tracking lists. */ __wt_free(session, block->fragfile); - __wt_free(session, block->fragsnap); + __wt_free(session, block->fragckpt); block->verify = 0; return (ret); } /* - * __wt_verify_snap_load -- - * Verify work done when a snapshot is loaded. + * __wt_verify_ckpt_load -- + * Verify work done when a checkpoint is loaded. */ int -__wt_verify_snap_load( - WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si) +__wt_verify_ckpt_load( + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci) { WT_EXTLIST *el; WT_EXT *ext; uint32_t frag, frags; - /* Set the maximum file size for this snapshot. */ - block->verify_size = si->file_size; + /* Set the maximum file size for this checkpoint. */ + block->verify_size = ci->file_size; /* * Add the root page and disk blocks used to store the extent lists to * the list of blocks we've "seen" from the file. */ - if (si->root_offset != WT_BLOCK_INVALID_OFFSET) + if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) WT_RET(__verify_filefrag_add(session, - block, si->root_offset, (off_t)si->root_size, 1)); - if (si->alloc.offset != WT_BLOCK_INVALID_OFFSET) + block, ci->root_offset, (off_t)ci->root_size, 1)); + if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET) WT_RET(__verify_filefrag_add(session, - block, si->alloc.offset, (off_t)si->alloc.size, 1)); - if (si->avail.offset != WT_BLOCK_INVALID_OFFSET) + block, ci->alloc.offset, (off_t)ci->alloc.size, 1)); + if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET) WT_RET(__verify_filefrag_add(session, - block, si->avail.offset, (off_t)si->avail.size, 1)); - if (si->discard.offset != WT_BLOCK_INVALID_OFFSET) + block, ci->avail.offset, (off_t)ci->avail.size, 1)); + if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET) WT_RET(__verify_filefrag_add(session, - block, si->discard.offset, (off_t)si->discard.size, 1)); + block, ci->discard.offset, (off_t)ci->discard.size, 1)); /* - * Snapshot verification is similar to deleting snapshots. As we read - * each new snapshot, we merge the allocation lists (accumulating all - * allocated pages as we move through the system), and then remove any - * pages found in the discard list. The result should be a one-to-one - * mapping to the pages we find in this particular snapshot. + * Checkpoint verification is similar to deleting checkpoints. As we + * read each new checkpoint, we merge the allocation lists (accumulating + * all allocated pages as we move through the system), and then remove + * any pages found in the discard list. The result should be a + * one-to-one mapping to the pages we find in this specific checkpoint. */ - el = &si->alloc; + el = &ci->alloc; if (el->offset != WT_BLOCK_INVALID_OFFSET) { WT_RET(__wt_block_extlist_read(session, block, el)); WT_RET(__wt_block_extlist_merge( session, el, &block->verify_alloc)); __wt_block_extlist_free(session, el); } - el = &si->discard; + el = &ci->discard; if (el->offset != WT_BLOCK_INVALID_OFFSET) { WT_RET(__wt_block_extlist_read(session, block, el)); WT_EXT_FOREACH(ext, el->off) @@ -259,50 +258,51 @@ __wt_verify_snap_load( } /* - * The root page of the snapshot appears on the alloc list, but not, at - * least until the snapshot is deleted, on a discard list. To handle - * this case, remove the root page from the accumulated list of snapshot - * pages, so it doesn't add a new requirement for subsequent snapshots. + * The root page of the checkpoint appears on the alloc list, but not, + * at least until the checkpoint is deleted, on a discard list. To + * handle this case, remove the root page from the accumulated list of + * checkpoint pages, so it doesn't add a new requirement for subsequent + * checkpoints. */ - if (si->root_offset != WT_BLOCK_INVALID_OFFSET) + if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) WT_RET(__wt_block_off_remove_overlap(session, - &block->verify_alloc, si->root_offset, si->root_size)); + &block->verify_alloc, ci->root_offset, ci->root_size)); /* - * Allocate the per-snapshot bit map. The per-snapshot bit map is the - * opposite of the per-file bit map, that is, we set all the bits that - * we expect to be set based on the snapshot's allocation and discard - * lists, then clear bits as we verify blocks. When finished verifying - * the snapshot, the bit list should be empty. + * Allocate the per-checkpoint bit map. The per-checkpoint bit map is + * the opposite of the per-file bit map, that is, we set all the bits + * that we expect to be set based on the checkpoint's allocation and + * discard lists, then clear bits as we verify blocks. When finished + * verifying the checkpoint, the bit list should be empty. */ - WT_RET(__bit_alloc(session, block->frags, &block->fragsnap)); + WT_RET(__bit_alloc(session, block->frags, &block->fragckpt)); el = &block->verify_alloc; WT_EXT_FOREACH(ext, el->off) { frag = (uint32_t)WT_OFF_TO_FRAG(block, ext->off); frags = (uint32_t)(ext->size / block->allocsize); - __bit_nset(block->fragsnap, frag, frag + (frags - 1)); + __bit_nset(block->fragckpt, frag, frag + (frags - 1)); } return (0); } /* - * __wt_verify_snap_unload -- - * Verify work done when a snapshot is unloaded. + * __wt_verify_ckpt_unload -- + * Verify work done when a checkpoint is unloaded. */ int -__wt_verify_snap_unload( - WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si) +__wt_verify_ckpt_unload( + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci) { WT_DECL_RET; - WT_UNUSED(si); + WT_UNUSED(ci); - /* Confirm we verified every snapshot block. */ - ret = __verify_snapfrag_chk(session, block); + /* Confirm we verified every checkpoint block. */ + ret = __verify_ckptfrag_chk(session, block); - /* Discard the per-snapshot fragment list. */ - __wt_free(session, block->fragsnap); + /* Discard the per-checkpoint fragment list. */ + __wt_free(session, block->fragckpt); return (ret); } @@ -354,7 +354,7 @@ err: __wt_scr_free(&tmp); /* * __wt_block_verify_addr -- - * Update an address in a snapshot as verified. + * Update an address in a checkpoint as verified. */ int __wt_block_verify_addr(WT_SESSION_IMPL *session, @@ -374,18 +374,19 @@ __wt_block_verify_addr(WT_SESSION_IMPL *session, /* * It's tempting to try and flag a page as "verified" when we read it. * That doesn't work because we may visit a page multiple times when - * verifying a single snapshot (for example, when verifying the physical - * image of a row-store leaf page with overflow keys, the overflow keys - * are read when checking for key sort issues, and read again when more - * general overflow item checking is done). This function is called by - * the btree verification code, once per logical visit in a snapshot, so - * we can detect if a page is referenced multiple times within a single - * snapshot. This doesn't apply to the per-file list, because it is - * expected for the same btree blocks to appear in multiple snapshots. + * verifying a single checkpoint (for example, when verifying the + * physical image of a row-store leaf page with overflow keys, the + * overflow keys are read when checking for key sort issues, and read + * again when more general overflow item checking is done). This + * function is called by the btree verification code, once per logical + * visit in a checkpoint, so we can detect if a page is referenced + * multiple times within a single checkpoint. This doesn't apply to + * the per-file list, because it is expected for the same btree blocks + * to appear in multiple checkpoints. * - * Add the block to the per-snapshot list. + * Add the block to the per-checkpoint list. */ - WT_RET(__verify_snapfrag_add(session, block, offset, size)); + WT_RET(__verify_ckptfrag_add(session, block, offset, size)); return (0); } @@ -457,7 +458,7 @@ __verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block) } __wt_errx(session, - "file range %" PRIuMAX "-%" PRIuMAX " was never verified", + "file range %" PRIuMAX "-%" PRIuMAX " never verified", (uintmax_t)WT_FRAG_TO_OFF(block, first), (uintmax_t)WT_FRAG_TO_OFF(block, last)); ret = WT_ERROR; @@ -466,28 +467,28 @@ __verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block) } /* - * __verify_snapfrag_add -- - * Clear the fragments in the per-snapshot fragment list, and complain if - * we've already verified this chunk of the snapshot. + * __verify_ckptfrag_add -- + * Clear the fragments in the per-checkpoint fragment list, and complain if + * we've already verified this chunk of the checkpoint. */ static int -__verify_snapfrag_add( +__verify_ckptfrag_add( WT_SESSION_IMPL *session, WT_BLOCK *block, off_t offset, off_t size) { uint32_t f, frag, frags, i; WT_VERBOSE_RET(session, verify, - "adding snapshot block at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")", + "add checkpoint block at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")", (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size); /* - * Check each chunk against the snapshot's size, a snapshot should never - * reference a block outside of the snapshot's stored size. + * Check each chunk against the checkpoint's size, a checkpoint should + * never reference a block outside of the checkpoint's stored size. */ if (offset + size > block->verify_size) WT_RET_MSG(session, WT_ERROR, "fragment %" PRIuMAX "-%" PRIuMAX " references " - "file blocks outside the snapshot", + "file blocks outside the checkpoint", (uintmax_t)offset, (uintmax_t)(offset + size)); frag = (uint32_t)WT_OFF_TO_FRAG(block, offset); @@ -495,49 +496,48 @@ __verify_snapfrag_add( /* It is illegal to reference a particular chunk more than once. */ for (f = frag, i = 0; i < frags; ++f, ++i) - if (!__bit_test(block->fragsnap, f)) + if (!__bit_test(block->fragckpt, f)) WT_RET_MSG(session, WT_ERROR, - "snapshot fragment at %" PRIuMAX " referenced " - "multiple times in a single snapshot or found in " - "the snapshot but not listed in the snapshot's " + "checkpoint fragment at %" PRIuMAX " referenced " + "multiple times in a single checkpoint or found in " + "the checkpoint but not listed in the checkpoint's " "allocation list", (uintmax_t)offset); - /* Remove fragments from the snapshot's allocation list. */ - __bit_nclr(block->fragsnap, frag, frag + (frags - 1)); + /* Remove fragments from the checkpoint's allocation list. */ + __bit_nclr(block->fragckpt, frag, frag + (frags - 1)); return (0); } /* - * __verify_snapfrag_chk -- - * Verify we've checked all the fragments in the snapshot. + * __verify_ckptfrag_chk -- + * Verify we've checked all the fragments in the checkpoint. */ static int -__verify_snapfrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block) +__verify_ckptfrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block) { WT_DECL_RET; uint32_t first, last; /* - * Check for snapshot fragments we haven't verified -- every time we + * Check for checkpoint fragments we haven't verified -- every time we * find a bit that's set, complain. We re-start the search each time * after clearing the set bit(s) we found: it's simpler and this isn't * supposed to happen a lot. */ for (;;) { - if (__bit_ffs(block->fragsnap, block->frags, &first) != 0) + if (__bit_ffs(block->fragckpt, block->frags, &first) != 0) break; - __bit_clear(block->fragsnap, first); + __bit_clear(block->fragckpt, first); for (last = first + 1; last < block->frags; ++last) { - if (!__bit_test(block->fragsnap, last)) + if (!__bit_test(block->fragckpt, last)) break; - __bit_clear(block->fragsnap, last); + __bit_clear(block->fragckpt, last); } __wt_errx(session, - "snapshot range %" PRIuMAX "-%" PRIuMAX " was never " - "verified", + "checkpoint range %" PRIuMAX "-%" PRIuMAX " never verified", (uintmax_t)WT_FRAG_TO_OFF(block, first), (uintmax_t)WT_FRAG_TO_OFF(block, last)); ret = WT_ERROR; diff --git a/src/block/block_write.c b/src/block/block_write.c index 504a1ab18d9..0d99f02245b 100644 --- a/src/block/block_write.c +++ b/src/block/block_write.c @@ -225,7 +225,7 @@ not_compressed: /* /* * Allocate space from the underlying file and write the block. Always - * extend the file when writing snapshot extents, that's easier than + * extend the file when writing checkpoint extents, that's easier than * distinguishing between extents allocated from the live avail list, * and those which can't be allocated from the live avail list such as * blocks for writing the live avail list itself. diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index a040d9f8065..9c238cd1da6 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -74,19 +74,20 @@ __wt_btree_open(WT_SESSION_IMPL *session, session, filename, btree->config, cfg, forced_salvage)); /* - * Open the specified snapshot unless it's a special command (special - * commands are responsible for loading their own snapshots, if any). + * Open the specified checkpoint unless it's a special command (special + * commands are responsible for loading their own checkpoints, if any). */ if (F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) return (0); /* - * There are two reasons to load an empty tree rather than a snapshot: - * either there is no snapshot (the file is being created), or the load - * call returns no root page (the snapshot is empty). + * There are two reasons to load an empty tree rather than a checkpoint: + * either there is no checkpoint (the file is being created), or the + * load call returns no root page (the checkpoint is for an empty file). */ - WT_ERR(__wt_bm_snapshot_load(session, &dsk, addr, addr_size, readonly)); + WT_ERR( + __wt_bm_checkpoint_load(session, &dsk, addr, addr_size, readonly)); if (addr == NULL || addr_size == 0 || dsk.size == 0) WT_ERR(__btree_tree_open_empty(session)); else { @@ -117,14 +118,11 @@ __wt_btree_close(WT_SESSION_IMPL *session) btree = session->btree; - /* - * Discard the tree and, if the tree is modified, create a new snapshot - * for the underlying object, unless it's a special command. - */ + /* Unload the checkpoint, unless it's a special command. */ if (F_ISSET(btree, WT_BTREE_OPEN) && !F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) - WT_TRET(__wt_bm_snapshot_unload(session)); + WT_TRET(__wt_bm_checkpoint_unload(session)); /* Close the underlying block manager reference. */ WT_TRET(__wt_bm_close(session)); @@ -132,9 +130,9 @@ __wt_btree_close(WT_SESSION_IMPL *session) /* Close the Huffman tree. */ __wt_btree_huffman_close(session); - /* Snapshot lock. */ - if (btree->snaplock != NULL) - __wt_rwlock_destroy(session, &btree->snaplock); + /* Checkpoint lock. */ + if (btree->ckptlock != NULL) + __wt_rwlock_destroy(session, &btree->ckptlock); /* Free allocated memory. */ __wt_free(session, btree->key_format); @@ -212,8 +210,9 @@ __btree_conf(WT_SESSION_IMPL *session) } } - /* Snapshot lock. */ - WT_RET(__wt_rwlock_alloc(session, "btree snapshot", &btree->snaplock)); + /* Checkpoint lock. */ + WT_RET( + __wt_rwlock_alloc(session, "btree checkpoint", &btree->ckptlock)); /* Page sizes */ WT_RET(__btree_page_sizes(session, config)); @@ -327,8 +326,8 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session) /* * Mark the child page empty so that if it is evicted, the tree ends up - * sane. The page should not be dirty, or we will always write empty - * trees on close, including empty snapshots. + * sane. The page should not be dirty, else we would write empty trees + * on close, including empty checkpoints. */ WT_ERR(__wt_page_modify_init(session, leaf)); F_SET(leaf->modify, WT_PM_REC_EMPTY); diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index f7559c9d200..7bec54b86e3 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -137,7 +137,7 @@ static int __slvg_trk_ovfl(WT_SESSION_IMPL *, */ int __wt_bt_salvage( - WT_SESSION_IMPL *session, WT_SNAPSHOT *snapbase, const char *cfg[]) + WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) { WT_BTREE *btree; WT_DECL_RET; @@ -270,12 +270,12 @@ __wt_bt_salvage( /* * Step 9: - * Evict the newly created root page, creating a snapshot. + * Evict the newly created root page, creating a checkpoint. */ if (ss->root_page != NULL) { - btree->snap = snapbase; + btree->ckpt = ckptbase; ret = __wt_rec_evict(session, ss->root_page, WT_REC_SINGLE); - btree->snap = NULL; + btree->ckpt = NULL; ss->root_page = NULL; } diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 7d37d30c155..ab391458451 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -13,7 +13,7 @@ */ int __wt_bt_cache_flush( - WT_SESSION_IMPL *session, WT_SNAPSHOT *snapbase, int op, int force) + WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op, int force) { WT_DECL_RET; WT_BTREE *btree; @@ -21,7 +21,7 @@ __wt_bt_cache_flush( btree = session->btree; /* - * If we need a new snapshot, mark the root page dirty to ensure a + * If we need a new checkpoint, mark the root page dirty to ensure a * write. */ if (force) { @@ -51,9 +51,9 @@ __wt_bt_cache_flush( * already works that way. None of these problems can't be fixed, but * I don't see a reason to change at this time, either. */ - btree->snap = snapbase; + btree->ckpt = ckptbase; ret = __wt_sync_file_serial(session, op); - btree->snap = NULL; + btree->ckpt = NULL; WT_RET(ret); switch (op) { diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index 929e9269651..60ecdd7e3fa 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -25,6 +25,7 @@ typedef struct { WT_ITEM *tmp2; /* Temporary buffer */ } WT_VSTUFF; +static void __verify_checkpoint_reset(WT_VSTUFF *); static int __verify_int(WT_SESSION_IMPL *, int); static int __verify_overflow( WT_SESSION_IMPL *, const uint8_t *, uint32_t, WT_VSTUFF *); @@ -33,7 +34,6 @@ static int __verify_row_int_key_order( WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, uint32_t, WT_VSTUFF *); static int __verify_row_leaf_key_order( WT_SESSION_IMPL *, WT_PAGE *, WT_VSTUFF *); -static void __verify_snapshot_reset(WT_VSTUFF *); static int __verify_tree(WT_SESSION_IMPL *, WT_PAGE *, uint64_t, WT_VSTUFF *); /* @@ -79,13 +79,13 @@ static int __verify_int(WT_SESSION_IMPL *session, int dumpfile) { WT_BTREE *btree; + WT_CKPT *ckptbase, *ckpt; WT_DECL_RET; WT_ITEM dsk; - WT_SNAPSHOT *snapbase, *snap; WT_VSTUFF *vs, _vstuff; btree = session->btree; - snapbase = NULL; + ckptbase = NULL; WT_CLEAR(_vstuff); vs = &_vstuff; @@ -95,22 +95,22 @@ __verify_int(WT_SESSION_IMPL *session, int dumpfile) WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2)); - /* Get a list of the snapshots for this file. */ - WT_ERR(__wt_meta_snaplist_get(session, btree->name, &snapbase)); + /* Get a list of the checkpoints for this file. */ + WT_ERR(__wt_meta_ckptlist_get(session, btree->name, &ckptbase)); /* Inform the underlying block manager we're verifying. */ - WT_ERR(__wt_bm_verify_start(session, snapbase)); + WT_ERR(__wt_bm_verify_start(session, ckptbase)); - /* Loop through the file's snapshots, verifying each one. */ - WT_SNAPSHOT_FOREACH(snapbase, snap) { + /* Loop through the file's checkpoints, verifying each one. */ + WT_CKPT_FOREACH(ckptbase, ckpt) { WT_VERBOSE_ERR(session, verify, - "%s: snapshot %s", btree->name, snap->name); + "%s: checkpoint %s", btree->name, ckpt->name); - /* House-keeping between snapshots. */ - __verify_snapshot_reset(vs); + /* House-keeping between checkpoints. */ + __verify_checkpoint_reset(vs); /* - * Load the snapshot -- if the size of the root page is 0, the + * Load the checkpoint -- if the size of the root page is 0, the * file is empty. * * Clearing the root page reference here is not an error: any @@ -120,10 +120,10 @@ __verify_int(WT_SESSION_IMPL *session, int dumpfile) * we can't ever use it again. */ WT_CLEAR(dsk); - WT_ERR(__wt_bm_snapshot_load( - session, &dsk, snap->raw.data, snap->raw.size, 1)); + WT_ERR(__wt_bm_checkpoint_load( + session, &dsk, ckpt->raw.data, ckpt->raw.size, 1)); if (dsk.size != 0) { - /* Verify, then discard the snapshot from the cache. */ + /* Verify then discard the checkpoint from the cache. */ if ((ret = __wt_btree_tree_open(session, &dsk)) == 0) { ret = __verify_tree( session, btree->root_page, (uint64_t)1, vs); @@ -132,13 +132,13 @@ __verify_int(WT_SESSION_IMPL *session, int dumpfile) } } - /* Unload the snapshot. */ - WT_TRET(__wt_bm_snapshot_unload(session)); + /* Unload the checkpoint. */ + WT_TRET(__wt_bm_checkpoint_unload(session)); WT_ERR(ret); } - /* Discard the list of snapshots. */ -err: __wt_meta_snaplist_free(session, snapbase); + /* Discard the list of checkpoints. */ +err: __wt_meta_ckptlist_free(session, ckptbase); /* Inform the underlying block manager we're done. */ WT_TRET(__wt_bm_verify_end(session)); @@ -158,19 +158,19 @@ err: __wt_meta_snaplist_free(session, snapbase); } /* - * __verify_snapshot_reset -- - * Reset anything needing to be reset for each new snapshot verification. + * __verify_checkpoint_reset -- + * Reset anything needing to be reset for each new checkpoint verification. */ static void -__verify_snapshot_reset(WT_VSTUFF *vs) +__verify_checkpoint_reset(WT_VSTUFF *vs) { /* - * Key order is per snapshot, reset the data length that serves as a + * Key order is per checkpoint, reset the data length that serves as a * flag value. */ vs->max_addr->size = 0; - /* Record total is per snapshot, reset the record count. */ + /* Record total is per checkpoint, reset the record count. */ vs->record_total = 0; } diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c index 633bd7e04db..58c13a9a282 100644 --- a/src/btree/rec_write.c +++ b/src/btree/rec_write.c @@ -305,7 +305,7 @@ __wt_rec_write( /* * Root pages are trickier. First, if the page is empty or we performed * a 1-for-1 page swap, we're done, we've written the root (and done the - * snapshot). + * checkpoint). */ switch (F_ISSET(page->modify, WT_PM_REC_MASK)) { case WT_PM_REC_EMPTY: /* Page is empty */ @@ -337,10 +337,10 @@ __wt_rec_write( * root page, pointing to a chain of pages, each of which are flagged as * "split" pages, up to a final replacement page. We don't use those * pages again, they are discarded in the next root page reconciliation. - * We could discard them immediately (because the snapshot is complete, - * any pages we discard go on the next snapshot's free list, it's safe - * to do), but the code is simpler this way, and this operation should - * not be common. + * We could discard them immediately (as the checkpoint is complete, any + * pages we discard go on the next checkpoint's free list, it's safe to + * do), but the code is simpler this way, and this operation should not + * be common. */ WT_VERBOSE_RET(session, reconcile, "root page split %p -> %p", page, page->modify->u.split); @@ -822,7 +822,7 @@ __rec_split_finish(WT_SESSION_IMPL *session) WT_BOUNDARY *bnd; WT_PAGE_HEADER *dsk; WT_RECONCILE *r; - int snapshot; + int checkpoint; r = session->reconcile; @@ -861,18 +861,18 @@ __rec_split_finish(WT_SESSION_IMPL *session) } /* - * Third, check to see if we're creating a snapshot: any time we write + * Third, check to see if we're creating a checkpoint: any time we write * the root page of the tree, we tell the underlying block manager so it - * can write and return the additional information a snapshot requires. + * can write and return any additional information checkpoints require. */ - snapshot = r->bnd_next == 1 && WT_PAGE_IS_ROOT(r->page); + checkpoint = r->bnd_next == 1 && WT_PAGE_IS_ROOT(r->page); /* Finalize the header information and write the page. */ dsk = r->dsk.mem; dsk->recno = bnd->recno; dsk->u.entries = r->entries; r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); - return (__rec_split_write(session, bnd, &r->dsk, snapshot)); + return (__rec_split_write(session, bnd, &r->dsk, checkpoint)); } /* @@ -960,7 +960,7 @@ err: __wt_scr_free(&tmp); */ static int __rec_split_write( - WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_ITEM *buf, int snapshot) + WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_ITEM *buf, int checkpoint) { WT_CELL *cell; WT_PAGE_HEADER *dsk; @@ -992,16 +992,16 @@ __rec_split_write( /* * Write the chunk and save the location information. There is one big - * question: if this is a snapshot, then we're going to have to wrap up + * question: if this is a checkpoint, we're going to have to wrap up * our tracking information (freeing blocks we no longer need) before we - * can create the snapshot, because snapshots write extent lists, that - * is, the whole system has to be consistent. We have to handle empty - * tree snapshots elsewhere (because we don't write anything for empty - * tree snapshots, they don't come through this path). Given that fact, - * clear the boundary information as a reminder, and do the snapshot at - * a later time, during wrapup. + * can create the checkpoint, because checkpoints may write additional + * information. We have to handle empty tree checkpoints elsewhere + * (because we don't write anything for empty tree checkpoints, they + * don't come through this path). Given that fact, clear the boundary + * information as a reminder, and do the checkpoint at a later time, + * during wrapup. */ - if (snapshot) { + if (checkpoint) { bnd->addr.addr = NULL; bnd->addr.size = 0; } else { @@ -2866,7 +2866,7 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) /* * Root page split: the last entry on the list. There * won't be a page to discard because writing the page - * created a snapshot, not a replacement page. + * created a checkpoint, not a replacement page. */ WT_ASSERT(session, mod->u.replace.addr == NULL); break; @@ -2906,7 +2906,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) * address blocks (if any). * * The exception is root pages are never tracked or free'd, they - * are snapshots, and must be explicitly dropped. + * are checkpoints, and must be explicitly dropped. */ if (!WT_PAGE_IS_ROOT(page) && page->ref->addr != NULL) { __wt_get_addr(page->parent, page->ref, &addr, &size); @@ -2921,7 +2921,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) * Discard the replacement leaf page's blocks. * * The exception is root pages are never tracked or free'd, they - * are snapshots, and must be explicitly dropped. + * are checkpoints, and must be explicitly dropped. */ if (!WT_PAGE_IS_ROOT(page)) WT_RET(__wt_rec_track(session, page, @@ -2952,10 +2952,10 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) /* * Wrap up discarded block and overflow tracking. If we are about to - * create a snapshot, the system must be entirely consistent at that + * create a checkpoint, the system must be entirely consistent at that * point, the underlying block manager is presumably going to do some * action to resolve the list of allocated/free/whatever blocks that - * are associated with the snapshot. + * are associated with the checkpoint. */ WT_RET(__wt_rec_track_wrapup(session, page)); @@ -2966,7 +2966,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) /* If this is the root page, we need to create a sync point. */ if (WT_PAGE_IS_ROOT(page)) - WT_RET(__wt_bm_snapshot(session, NULL, btree->snap)); + WT_RET(__wt_bm_checkpoint(session, NULL, btree->ckpt)); /* * If the page was empty, we want to discard it from the tree @@ -2989,7 +2989,8 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) */ bnd = &r->bnd[0]; if (bnd->addr.addr == NULL) - WT_RET(__wt_bm_snapshot(session, &r->dsk, btree->snap)); + WT_RET( + __wt_bm_checkpoint(session, &r->dsk, btree->ckpt)); else { mod->u.replace = bnd->addr; bnd->addr.addr = NULL; diff --git a/src/config/config_def.c b/src/config/config_def.c index 34c22c0ce43..88d1388eaa5 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -76,23 +76,24 @@ __wt_confchk_cursor_close = const char * __wt_confdfl_file_meta = - "allocation_size=512B,block_compressor="",checksum=true,collator=""," - "columns=(),huffman_key="",huffman_value="",internal_item_max=0," - "internal_key_truncate=true,internal_page_max=2KB,key_format=u,key_gap=10" - ",leaf_item_max=0,leaf_page_max=1MB,prefix_compression=true,snapshot=""," - "split_pct=75,type=btree,value_format=u,version=(major=0,minor=0)"; + "allocation_size=512B,block_compressor="",checkpoint="",checksum=true," + "collator="",columns=(),huffman_key="",huffman_value=""," + "internal_item_max=0,internal_key_truncate=true,internal_page_max=2KB," + "key_format=u,key_gap=10,leaf_item_max=0,leaf_page_max=1MB," + "prefix_compression=true,split_pct=75,type=btree,value_format=u," + "version=(major=0,minor=0)"; const char * __wt_confchk_file_meta = "allocation_size=(type=int,min=512B,max=128MB),block_compressor=()," - "checksum=(type=boolean),collator=(),columns=(type=list),huffman_key=()," - "huffman_value=(),internal_item_max=(type=int,min=0)," + "checkpoint=(),checksum=(type=boolean),collator=(),columns=(type=list)," + "huffman_key=(),huffman_value=(),internal_item_max=(type=int,min=0)," "internal_key_truncate=(type=boolean),internal_page_max=(type=int," "min=512B,max=512MB),key_format=(type=format),key_gap=(type=int,min=0)," "leaf_item_max=(type=int,min=0),leaf_page_max=(type=int,min=512B," - "max=512MB),prefix_compression=(type=boolean),snapshot=()," - "split_pct=(type=int,min=25,max=100),type=(choices=[\"btree\"])," - "value_format=(type=format),version=()"; + "max=512MB),prefix_compression=(type=boolean),split_pct=(type=int,min=25," + "max=100),type=(choices=[\"btree\"]),value_format=(type=format)," + "version=()"; const char * __wt_confdfl_index_meta = @@ -271,6 +272,6 @@ __wt_confchk_wiredtiger_open = "multiprocess=(type=boolean),session_max=(type=int,min=1)," "sync=(type=boolean),transactional=(type=boolean)," "use_environment_priv=(type=boolean),verbose=(type=list," - "choices=[\"block\",\"evict\",\"evictserver\",\"fileops\",\"hazard\"," - "\"mutex\",\"read\",\"readserver\",\"reconcile\",\"salvage\",\"snapshot\"" - ",\"verify\",\"write\"])"; + "choices=[\"block\",\"ckpt\",\"evict\",\"evictserver\",\"fileops\"," + "\"hazard\",\"mutex\",\"read\",\"readserver\",\"reconcile\",\"salvage\"," + "\"verify\",\"write\"])"; diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 2680c995477..63fe1ef68ce 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -705,6 +705,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, uint32_t flag; } *ft, verbtypes[] = { { "block", WT_VERB_block }, + { "ckpt", WT_VERB_ckpt }, { "evict", WT_VERB_evict }, { "evictserver",WT_VERB_evictserver }, { "fileops", WT_VERB_fileops }, @@ -715,7 +716,6 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, { "reconcile", WT_VERB_reconcile }, { "salvage", WT_VERB_salvage }, { "verify", WT_VERB_verify }, - { "snapshot", WT_VERB_snapshot }, { "write", WT_VERB_write }, { NULL, 0 } }, directio_types[] = { diff --git a/src/conn/conn_btree.c b/src/conn/conn_btree.c index 236858a9c9f..0951bd635b9 100644 --- a/src/conn/conn_btree.c +++ b/src/conn/conn_btree.c @@ -89,9 +89,9 @@ __conn_btree_get(WT_SESSION_IMPL *session, __wt_spin_lock(session, &conn->spinlock); TAILQ_FOREACH(btree, &conn->btqh, q) { if (strcmp(name, btree->name) == 0 && - ((ckpt == NULL && btree->ckpt == NULL) || - (ckpt != NULL && btree->ckpt != NULL && - strcmp(ckpt, btree->ckpt) == 0))) { + ((ckpt == NULL && btree->checkpoint == NULL) || + (ckpt != NULL && btree->checkpoint != NULL && + strcmp(ckpt, btree->checkpoint) == 0))) { ++btree->refcnt; session->btree = btree; matched = 1; @@ -114,7 +114,7 @@ __conn_btree_get(WT_SESSION_IMPL *session, session, "btree handle", &btree->rwlock)) == 0 && (ret = __wt_strdup(session, name, &btree->name)) == 0 && (ckpt == NULL || - (ret = __wt_strdup(session, ckpt, &btree->ckpt)) == 0)) { + (ret = __wt_strdup(session, ckpt, &btree->checkpoint)) == 0)) { /* Lock the handle before it is inserted in the list. */ __wt_writelock(session, btree->rwlock); F_SET(btree, WT_BTREE_EXCLUSIVE); @@ -132,7 +132,7 @@ __conn_btree_get(WT_SESSION_IMPL *session, if (btree->rwlock != NULL) __wt_rwlock_destroy(session, &btree->rwlock); __wt_free(session, btree->name); - __wt_free(session, btree->ckpt); + __wt_free(session, btree->checkpoint); __wt_overwrite_and_free(session, btree); } @@ -156,7 +156,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session) if (!F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) - ret = __wt_snapshot(session, NULL); + ret = __wt_checkpoint(session, NULL); WT_TRET(__wt_btree_close(session)); @@ -207,10 +207,10 @@ __wt_conn_btree_open(WT_SESSION_IMPL *session, F_SET(btree, WT_BTREE_NO_EVICTION); do { - WT_ERR(__wt_meta_snapshot_get( - session, btree->name, btree->ckpt, addr)); + WT_ERR(__wt_meta_checkpoint_get( + session, btree->name, btree->checkpoint, addr)); WT_ERR(__wt_btree_open(session, addr->data, addr->size, cfg, - btree->ckpt == NULL ? 0 : 1)); + btree->checkpoint == NULL ? 0 : 1)); F_SET(btree, WT_BTREE_OPEN); /* Drop back to a readlock if that is all that was needed. */ @@ -299,7 +299,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, __wt_spin_lock(session, &conn->spinlock); TAILQ_FOREACH(btree, &conn->btqh, q) if (F_ISSET(btree, WT_BTREE_OPEN) && - btree->ckpt == NULL && + btree->checkpoint == NULL && strcmp(btree->name, WT_METADATA_URI) != 0) { /* * We have the connection spinlock, which prevents @@ -453,7 +453,7 @@ __conn_btree_discard(WT_SESSION_IMPL *session, WT_BTREE *btree) __wt_rwlock_destroy(session, &btree->rwlock); __wt_free(session, btree->config); __wt_free(session, btree->name); - __wt_free(session, btree->ckpt); + __wt_free(session, btree->checkpoint); __wt_overwrite_and_free(session, btree); return (ret); diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index 293c95d6467..c6ec0e486a5 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -426,7 +426,7 @@ __wt_cursor_init(WT_CURSOR *cursor, if (cval.val != 0) F_SET(cursor, WT_CURSTD_RAW); - /* Snapshot cursors are read-only. */ + /* Checkpoint cursors are read-only. */ WT_RET(__wt_config_gets(session, cfg, "checkpoint", &cval)); if (cval.len != 0) { cursor->insert = (int (*)(WT_CURSOR *))__wt_cursor_notsup; diff --git a/src/docs/checkpoints.dox b/src/docs/checkpoints.dox index 07b4dfc3900..ac89c0991b3 100644 --- a/src/docs/checkpoints.dox +++ b/src/docs/checkpoints.dox @@ -39,6 +39,6 @@ discards any other checkpoint with the same name (unless they are currently open in a cursor). Unnamed checkpoints managed by WiredTiger are given the name -"WiredTigerInternal". +"WiredTigerInternalCheckpoint". */ diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 7c4f5480cc0..111726693e1 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -30,7 +30,7 @@ URIs Vv WiredTiger WiredTiger's -WiredTigerInternal +WiredTigerInternalCheckpoint aR ack'ed alloc diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index ca5a408d640..f8b05b7ac07 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -1,6 +1,6 @@ /*! @page upgrading Upgrading WiredTiger applications -@section version13 Version 1.3 +@section version13api Version 1.3 API changes The checkpoint functionality supported by WT_SESSION::checkpoint and the snapshot functionality supported by WT_SESSION::sync have been merged @@ -34,4 +34,9 @@ from the command line. The \c -s options to the \c dump and \c list commands for the \c wt command line utility have been renamed to be \c -c. +@section version13file Version 1.3 file format changes + +The underlying file formats changed in the 1.3 release; tables and files +should be dumped and re-loaded into a new database. + */ diff --git a/src/include/api.h b/src/include/api.h index 3e2483961ce..46e1db3f62f 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -340,16 +340,16 @@ extern WT_PROCESS __wt_process; #define WT_SESSION_INTERNAL 0x00000002 #define WT_SESSION_SALVAGE_QUIET_ERR 0x00000001 #define WT_VERB_block 0x00001000 -#define WT_VERB_evict 0x00000800 -#define WT_VERB_evictserver 0x00000400 -#define WT_VERB_fileops 0x00000200 -#define WT_VERB_hazard 0x00000100 -#define WT_VERB_mutex 0x00000080 -#define WT_VERB_read 0x00000040 -#define WT_VERB_readserver 0x00000020 -#define WT_VERB_reconcile 0x00000010 -#define WT_VERB_salvage 0x00000008 -#define WT_VERB_snapshot 0x00000004 +#define WT_VERB_ckpt 0x00000800 +#define WT_VERB_evict 0x00000400 +#define WT_VERB_evictserver 0x00000200 +#define WT_VERB_fileops 0x00000100 +#define WT_VERB_hazard 0x00000080 +#define WT_VERB_mutex 0x00000040 +#define WT_VERB_read 0x00000020 +#define WT_VERB_readserver 0x00000010 +#define WT_VERB_reconcile 0x00000008 +#define WT_VERB_salvage 0x00000004 #define WT_VERB_verify 0x00000002 #define WT_VERB_write 0x00000001 /* diff --git a/src/include/block.h b/src/include/block.h index 26dde46b715..0dffb8f46a4 100644 --- a/src/include/block.h +++ b/src/include/block.h @@ -17,10 +17,10 @@ #define WT_BLOCK_INVALID_OFFSET 0 /* - * The block manager maintains three per-snapshot extent lists: - * alloc: the extents allocated in this snapshot + * The block manager maintains three per-checkpoint extent lists: + * alloc: the extents allocated in this checkpoint * avail: the extents available for allocation - * discard: the extents freed in this snapshot + * discard: the extents freed in this checkpoint * Each of the extent lists is based on two skiplists: first, a by-offset list * linking WT_EXT elements and sorted by file offset (low-to-high), second, a * by-size list linking WT_SIZE elements and sorted by chunk size (low-to-high). @@ -60,7 +60,7 @@ struct __wt_extlist { /* * WT_EXT -- * Encapsulation of an extent, either allocated or freed within the - * snapshot. + * checkpoint. */ struct __wt_ext { off_t off; /* Extent's file offset */ @@ -106,16 +106,16 @@ struct __wt_size { (skip) != NULL; (skip) = (skip)->next[(skip)->depth]) /* - * Snapshot cookie: carries a version number as I don't want to rev the schema - * file version should the default block manager snapshot format change. + * Checkpoint cookie: carries a version number as I don't want to rev the schema + * file version should the default block manager checkpoint format change. * - * Version #1 snapshot cookie format: + * Version #1 checkpoint cookie format: * [1] [root addr] [alloc addr] [avail addr] [discard addr] - * [file size] [snapshot size] [write generation] + * [file size] [checkpoint size] [write generation] */ -#define WT_BM_SNAPSHOT_VERSION 1 /* Snapshot format version */ +#define WT_BM_CHECKPOINT_VERSION 1 /* Checkpoint format version */ #define WT_BLOCK_EXTLIST_MAGIC 71002 /* Identify a list */ -struct __wt_block_snapshot { +struct __wt_block_ckpt { uint8_t version; /* Version */ off_t root_offset; /* The root */ @@ -125,9 +125,9 @@ struct __wt_block_snapshot { WT_EXTLIST avail; /* Extents available */ WT_EXTLIST discard; /* Extents discarded */ - off_t file_size; /* Snapshot file size */ - uint64_t snapshot_size; /* Snapshot byte count */ - WT_EXTLIST snapshot_avail; /* Snapshot free'd extents */ + off_t file_size; /* Checkpoint file size */ + uint64_t ckpt_size; /* Checkpoint byte count */ + WT_EXTLIST ckpt_avail; /* Checkpoint free'd extents */ uint64_t write_gen; /* Write generation */ }; @@ -144,9 +144,9 @@ struct __wt_block { uint32_t allocsize; /* Allocation size */ int checksum; /* If checksums configured */ - WT_SPINLOCK live_lock; /* Lock to protect the live snapshot */ - WT_BLOCK_SNAPSHOT live; /* Live snapshot */ - int live_load; /* Live snapshot loaded */ + WT_SPINLOCK live_lock; /* Live checkpoint lock */ + WT_BLOCK_CKPT live; /* Live checkpoint */ + int live_load; /* Live checkpoint loaded */ WT_COMPRESSOR *compressor; /* Page compressor */ @@ -156,11 +156,11 @@ struct __wt_block { /* Verification support */ int verify; /* If performing verification */ - off_t verify_size; /* Snapshot's file size */ + off_t verify_size; /* Checkpoint's file size */ WT_EXTLIST verify_alloc; /* Verification allocation list */ uint32_t frags; /* Maximum frags in the file */ uint8_t *fragfile; /* Per-file frag tracking list */ - uint8_t *fragsnap; /* Per-snapshot frag tracking list */ + uint8_t *fragckpt; /* Per-checkpoint frag tracking list */ }; /* diff --git a/src/include/btree.h b/src/include/btree.h index e713107e10e..50ca84dc194 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -68,15 +68,13 @@ struct __wt_btree { uint32_t refcnt; /* Sessions using this tree. */ TAILQ_ENTRY(__wt_btree) q; /* Linked list of handles */ - volatile uint32_t lru_count; /* Count of threads in LRU eviction. */ - const char *name; /* Object name as a URI */ + const char *checkpoint; /* Checkpoint name (or NULL) */ const char *config; /* Configuration string */ - const char *ckpt; /* Checkpoint name (or NULL) */ /* XXX Should move into the session-level handle information. */ - WT_RWLOCK *snaplock; /* Lock for snapshot creation */ - WT_SNAPSHOT *snap; /* Snapshot information */ + WT_RWLOCK *ckptlock; /* Lock for checkpoint creation */ + WT_CKPT *ckpt; /* Checkpoint information */ enum { BTREE_COL_FIX=1, /* Fixed-length column store */ BTREE_COL_VAR=2, /* Variable-length column store */ @@ -112,6 +110,7 @@ struct __wt_btree { u_int block_header; /* Block manager header length */ WT_PAGE *evict_page; /* Eviction thread's location */ + volatile uint32_t lru_count; /* Count of threads in LRU eviction. */ WT_BTREE_STATS *stats; /* Btree statistics */ diff --git a/src/include/extern.h b/src/include/extern.h index 323dbbcf712..51307283fcc 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -19,14 +19,35 @@ extern int __wt_block_addr_string(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size); -extern int __wt_block_buffer_to_snapshot(WT_SESSION_IMPL *session, +extern int __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, - WT_BLOCK_SNAPSHOT *si); -extern int __wt_block_snapshot_to_buffer(WT_SESSION_IMPL *session, + WT_BLOCK_CKPT *ci); +extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, - WT_BLOCK_SNAPSHOT *si); + WT_BLOCK_CKPT *ci); +extern int __wt_block_ckpt_init(WT_SESSION_IMPL *session, + WT_BLOCK *block, + WT_BLOCK_CKPT *ci, + const char *name, + int is_live); +extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, + WT_BLOCK *block, + WT_ITEM *dsk, + const uint8_t *addr, + uint32_t addr_size, + int readonly); +extern int __wt_block_checkpoint_unload(WT_SESSION_IMPL *session, + WT_BLOCK *block); +extern void __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, + WT_BLOCK_CKPT *ci); +extern int __wt_block_checkpoint(WT_SESSION_IMPL *session, + WT_BLOCK *block, + WT_ITEM *buf, + WT_CKPT *ckptbase); +extern int __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, + WT_BLOCK *block); extern uint32_t __wt_cksum(const void *chunk, size_t len); extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session, WT_EXTLIST *el, @@ -53,7 +74,7 @@ extern int __wt_block_extlist_check( WT_SESSION_IMPL *session, WT_EXTLIST *bl); extern int __wt_block_extlist_overlap( WT_SESSION_IMPL *session, WT_BLOCK *block, - WT_BLOCK_SNAPSHOT *si); + WT_BLOCK_CKPT *ci); extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b); @@ -97,17 +118,16 @@ extern int __wt_bm_open(WT_SESSION_IMPL *session, const char *cfg[], int forced_salvage); extern int __wt_bm_close(WT_SESSION_IMPL *session); -extern int __wt_bm_snapshot(WT_SESSION_IMPL *session, +extern int __wt_bm_checkpoint(WT_SESSION_IMPL *session, WT_ITEM *buf, - WT_SNAPSHOT *snapbase); -extern int __wt_bm_snapshot_resolve(WT_SESSION_IMPL *session, - WT_SNAPSHOT *snapbase); -extern int __wt_bm_snapshot_load(WT_SESSION_IMPL *session, + WT_CKPT *ckptbase); +extern int __wt_bm_checkpoint_resolve(WT_SESSION_IMPL *session); +extern int __wt_bm_checkpoint_load(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size, int readonly); -extern int __wt_bm_snapshot_unload(WT_SESSION_IMPL *session); +extern int __wt_bm_checkpoint_unload(WT_SESSION_IMPL *session); extern int __wt_bm_truncate(WT_SESSION_IMPL *session, const char *filename); extern int __wt_bm_free(WT_SESSION_IMPL *session, const uint8_t *addr, @@ -130,8 +150,7 @@ extern int __wt_bm_salvage_next(WT_SESSION_IMPL *session, uint64_t *write_genp, int *eofp); extern int __wt_bm_salvage_end(WT_SESSION_IMPL *session); -extern int __wt_bm_verify_start(WT_SESSION_IMPL *session, - WT_SNAPSHOT *snapbase); +extern int __wt_bm_verify_start(WT_SESSION_IMPL *session, WT_CKPT *ckptbase); extern int __wt_bm_verify_end(WT_SESSION_IMPL *session); extern int __wt_bm_verify_addr(WT_SESSION_IMPL *session, const uint8_t *addr, @@ -167,38 +186,16 @@ extern int __wt_block_salvage_next( WT_SESSION_IMPL *session, uint32_t *addr_sizep, uint64_t *write_genp, int *eofp); -extern int __wt_block_snap_init(WT_SESSION_IMPL *session, - WT_BLOCK *block, - WT_BLOCK_SNAPSHOT *si, - const char *name, - int is_live); -extern int __wt_block_snapshot_load(WT_SESSION_IMPL *session, - WT_BLOCK *block, - WT_ITEM *dsk, - const uint8_t *addr, - uint32_t addr_size, - int readonly); -extern int __wt_block_snapshot_unload(WT_SESSION_IMPL *session, - WT_BLOCK *block); -extern void __wt_block_snap_destroy(WT_SESSION_IMPL *session, - WT_BLOCK_SNAPSHOT *si); -extern int __wt_block_snapshot(WT_SESSION_IMPL *session, - WT_BLOCK *block, - WT_ITEM *buf, - WT_SNAPSHOT *snapbase); -extern int __wt_block_snapshot_resolve( WT_SESSION_IMPL *session, - WT_BLOCK *block, - WT_SNAPSHOT *snapbase); extern int __wt_block_verify_start( WT_SESSION_IMPL *session, WT_BLOCK *block, - WT_SNAPSHOT *snapbase); + WT_CKPT *ckptbase); extern int __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block); -extern int __wt_verify_snap_load( WT_SESSION_IMPL *session, +extern int __wt_verify_ckpt_load( WT_SESSION_IMPL *session, WT_BLOCK *block, - WT_BLOCK_SNAPSHOT *si); -extern int __wt_verify_snap_unload( WT_SESSION_IMPL *session, + WT_BLOCK_CKPT *ci); +extern int __wt_verify_ckpt_unload( WT_SESSION_IMPL *session, WT_BLOCK *block, - WT_BLOCK_SNAPSHOT *si); + WT_BLOCK_CKPT *ci); extern int __wt_block_verify(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, @@ -322,11 +319,11 @@ extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int key_ret); extern int __wt_bt_salvage( WT_SESSION_IMPL *session, - WT_SNAPSHOT *snapbase, + WT_CKPT *ckptbase, const char *cfg[]); extern int __wt_btree_stat_init(WT_SESSION_IMPL *session); extern int __wt_bt_cache_flush( WT_SESSION_IMPL *session, - WT_SNAPSHOT *snapbase, + WT_CKPT *ckptbase, int op, int force); extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]); @@ -616,29 +613,29 @@ extern WT_LOGREC_DESC __wt_logdesc_debug; extern int __wt_metadata_get(WT_SESSION *session, const char *uri, const char **valuep); -extern int __wt_metadata_get_snaplist( WT_SESSION *session, +extern int __wt_metadata_get_ckptlist( WT_SESSION *session, const char *name, - WT_SNAPSHOT **snapbasep); -extern void __wt_metadata_free_snaplist(WT_SESSION *session, - WT_SNAPSHOT *snapbase); + WT_CKPT **ckptbasep); +extern void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase); extern int __wt_meta_btree_apply(WT_SESSION_IMPL *session, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[], uint32_t flags); -extern int __wt_meta_snapshot_get(WT_SESSION_IMPL *session, +extern int __wt_meta_checkpoint_get(WT_SESSION_IMPL *session, const char *name, - const char *snapshot, + const char *checkpoint, WT_ITEM *addr); -extern int __wt_meta_snapshot_clear(WT_SESSION_IMPL *session, const char *name); -extern int __wt_meta_snaplist_get( WT_SESSION_IMPL *session, +extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, + const char *name); +extern int __wt_meta_ckptlist_get( WT_SESSION_IMPL *session, const char *name, - WT_SNAPSHOT **snapbasep); -extern int __wt_meta_snaplist_set( WT_SESSION_IMPL *session, + WT_CKPT **ckptbasep); +extern int __wt_meta_ckptlist_set( WT_SESSION_IMPL *session, const char *name, - WT_SNAPSHOT *snapbase); -extern void __wt_meta_snaplist_free(WT_SESSION_IMPL *session, - WT_SNAPSHOT *snapbase); + WT_CKPT *ckptbase); +extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, + WT_CKPT *ckptbase); extern int __wt_metadata_open(WT_SESSION_IMPL *session); extern int __wt_metadata_cursor( WT_SESSION_IMPL *session, const char *config, @@ -902,8 +899,8 @@ extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], uint32_t flags); -extern int __wt_session_lock_snapshot( WT_SESSION_IMPL *session, - const char *snapshot, +extern int __wt_session_lock_checkpoint( WT_SESSION_IMPL *session, + const char *checkpoint, uint32_t flags); extern int __wt_session_discard_btree( WT_SESSION_IMPL *session, WT_BTREE_SESSION *btree_session); @@ -1048,11 +1045,12 @@ extern void __wt_stat_clear_connection_stats(WT_STATS *stats_arg); extern int __wt_txnid_cmp(const void *v1, const void *v2); extern int __wt_txn_get_snapshot(WT_SESSION_IMPL *session, wt_txnid_t max_id); extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_txn_release(WT_SESSION_IMPL *session); extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]); -extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_init(WT_SESSION_IMPL *session); extern void __wt_txn_destroy(WT_SESSION_IMPL *session); extern int __wt_txn_global_init(WT_CONNECTION_IMPL *conn, const char *cfg[]); extern void __wt_txn_global_destroy(WT_CONNECTION_IMPL *conn); -extern int __wt_snapshot(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); diff --git a/src/include/meta.h b/src/include/meta.h index 7e696cf0aef..ecef945c379 100644 --- a/src/include/meta.h +++ b/src/include/meta.h @@ -14,30 +14,30 @@ #define WT_METADATA_VERSION_STR "WiredTiger version string" /* - * WT_SNAPSHOT -- - * Encapsulation of snapshot information, shared by the metadata, the + * WT_CKPT -- + * Encapsulation of checkpoint information, shared by the metadata, the * btree engine, and the block manager. */ -#define WT_INTERNAL_SNAPSHOT "WiredTigerInternalSnapshot" -#define WT_SNAPSHOT_FOREACH(snapbase, snap) \ - for ((snap) = (snapbase); (snap)->name != NULL; ++(snap)) +#define WT_INTERNAL_CHKPT "WiredTigerInternalCheckpoint" +#define WT_CKPT_FOREACH(ckptbase, ckpt) \ + for ((ckpt) = (ckptbase); (ckpt)->name != NULL; ++(ckpt)) -struct __wt_snapshot { +struct __wt_ckpt { char *name; /* Name or NULL */ - WT_ITEM addr; /* Snapshot cookie string */ - WT_ITEM raw; /* Snapshot cookie raw */ + WT_ITEM addr; /* Checkpoint cookie string */ + WT_ITEM raw; /* Checkpoint cookie raw */ - int64_t order; /* Snapshot order */ + int64_t order; /* Checkpoint order */ uintmax_t sec; /* Timestamp */ - uint64_t snapshot_size; /* Snapshot size */ + uint64_t ckpt_size; /* Checkpoint size */ void *bpriv; /* Block manager private */ -#define WT_SNAP_ADD 0x01 /* Snapshot to be added */ -#define WT_SNAP_DELETE 0x02 /* Snapshot to be deleted */ -#define WT_SNAP_UPDATE 0x04 /* Snapshot requires update */ +#define WT_CKPT_ADD 0x01 /* Checkpoint to be added */ +#define WT_CKPT_DELETE 0x02 /* Checkpoint to be deleted */ +#define WT_CKPT_UPDATE 0x04 /* Checkpoint requires update */ uint32_t flags; }; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 676e0dd31d9..2027a47a0aa 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -711,7 +711,7 @@ struct __wt_session { /*! Upgrade a file or table. * - * Upgrade upgrades a file, or the files of which a table is comprised. + * Upgrade upgrades a file or table, if upgrade is required. * * @snippet ex_all.c session upgrade * @@ -1058,10 +1058,10 @@ struct __wt_connection { * information.,a boolean flag; default \c false.} * @config{verbose, enable messages for various events. Options are given as a * list\, such as <code>"verbose=[evictserver\,read]"</code>.,a list\, with - * values chosen from the following options: \c "block"\, \c "evict"\, \c - * "evictserver"\, \c "fileops"\, \c "hazard"\, \c "mutex"\, \c "read"\, \c - * "readserver"\, \c "reconcile"\, \c "salvage"\, \c "snapshot"\, \c "verify"\, - * \c "write"; default empty.} + * values chosen from the following options: \c "block"\, \c "ckpt"\, \c + * "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c "mutex"\, \c + * "read"\, \c "readserver"\, \c "reconcile"\, \c "salvage"\, \c "verify"\, \c + * "write"; default empty.} * @configend * Additionally, if a file named \c WiredTiger.config appears in the WiredTiger * home directory, it is read for configuration values (see @ref config_file diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index 81c6fe7b053..ffc619f5d9b 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -50,12 +50,12 @@ struct __wt_addr; typedef struct __wt_addr WT_ADDR; struct __wt_block; typedef struct __wt_block WT_BLOCK; +struct __wt_block_ckpt; + typedef struct __wt_block_ckpt WT_BLOCK_CKPT; struct __wt_block_desc; typedef struct __wt_block_desc WT_BLOCK_DESC; struct __wt_block_header; typedef struct __wt_block_header WT_BLOCK_HEADER; -struct __wt_block_snapshot; - typedef struct __wt_block_snapshot WT_BLOCK_SNAPSHOT; struct __wt_btree; typedef struct __wt_btree WT_BTREE; struct __wt_btree_session; @@ -68,6 +68,8 @@ struct __wt_cell; typedef struct __wt_cell WT_CELL; struct __wt_cell_unpack; typedef struct __wt_cell_unpack WT_CELL_UNPACK; +struct __wt_ckpt; + typedef struct __wt_ckpt WT_CKPT; struct __wt_col; typedef struct __wt_col WT_COL; struct __wt_col_rle; @@ -144,8 +146,6 @@ struct __wt_session_impl; typedef struct __wt_session_impl WT_SESSION_IMPL; struct __wt_size; typedef struct __wt_size WT_SIZE; -struct __wt_snapshot; - typedef struct __wt_snapshot WT_SNAPSHOT; struct __wt_stats; typedef struct __wt_stats WT_STATS; struct __wt_table; diff --git a/src/meta/meta_api.c b/src/meta/meta_api.c index 74ea9ea91e5..dee40585f90 100644 --- a/src/meta/meta_api.c +++ b/src/meta/meta_api.c @@ -18,23 +18,23 @@ __wt_metadata_get(WT_SESSION *session, const char *uri, const char **valuep) } /* - * __wt_snaplist_get -- - * Public entry point to __wt_meta_snaplist_get (for wt list). + * __wt_metadata_get_ckptlist -- + * Public entry point to __wt_meta_ckptlist_get (for wt list). */ int -__wt_metadata_get_snaplist( - WT_SESSION *session, const char *name, WT_SNAPSHOT **snapbasep) +__wt_metadata_get_ckptlist( + WT_SESSION *session, const char *name, WT_CKPT **ckptbasep) { - return (__wt_meta_snaplist_get( - (WT_SESSION_IMPL *)session, name, snapbasep)); + return (__wt_meta_ckptlist_get( + (WT_SESSION_IMPL *)session, name, ckptbasep)); } /* - * __wt_snaplist_free -- - * Public entry point to __wt_snapshot_list_free (for wt list). + * __wt_metadata_free_ckptlist -- + * Public entry point to __wt_meta_ckptlist_free (for wt list). */ void -__wt_metadata_free_snaplist(WT_SESSION *session, WT_SNAPSHOT *snapbase) +__wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase) { - __wt_meta_snaplist_free((WT_SESSION_IMPL *)session, snapbase); + __wt_meta_ckptlist_free((WT_SESSION_IMPL *)session, ckptbase); } diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c new file mode 100644 index 00000000000..da40ab5742e --- /dev/null +++ b/src/meta/meta_ckpt.c @@ -0,0 +1,411 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static int __ckpt_get( + WT_SESSION_IMPL *, const char *, const char *, WT_ITEM *); +static int __ckpt_get_last(WT_SESSION_IMPL *, const char *, WT_ITEM *); +static int __ckpt_get_name( + WT_SESSION_IMPL *, const char *, const char *, WT_ITEM *); +static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *); +static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *); + +/* + * __wt_meta_checkpoint_get -- + * Return a file's checkpoint address. + */ +int +__wt_meta_checkpoint_get(WT_SESSION_IMPL *session, + const char *name, const char *checkpoint, WT_ITEM *addr) +{ + WT_DECL_RET; + + /* Get the checkpoint address. */ + ret = __ckpt_get(session, name, checkpoint, addr); + + /* + * If we find a checkpoint, check the version and return the address. + * If we don't find a named checkpoint, we're done, they're read-only. + * If we don't find a default checkpoint, it's creation, return "no + * data" and let our caller handle it. + */ + if (ret == WT_NOTFOUND) { + /* + * If the caller didn't give us a specific checkpoint name, we + * assume it's a creation and there isn't a checkpoint to find. + * Let the caller deal with the failure. + */ + if (checkpoint != NULL) + WT_RET_MSG(session, WT_NOTFOUND, + "no \"%s\" checkpoint found in %s", + checkpoint, name); + + addr->data = NULL; + addr->size = 0; + } + return (0); +} + +/* + * __wt_meta_checkpoint_clear -- + * Clear a file's checkpoint. + */ +int +__wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *name) +{ + WT_DECL_RET; + + ret = __ckpt_set(session, name, NULL); + + /* + * If we are unrolling a failed create, we may have already removed the + * metadata entry. If no entry is found to update and we're trying to + * clear the checkpoint, just ignore it. + */ + if (ret == WT_NOTFOUND) + ret = 0; + return (ret); +} + +/* + * __ckpt_get -- + * Get a file's checkpoint. + */ +static int +__ckpt_get(WT_SESSION_IMPL *session, + const char *name, const char *checkpoint, WT_ITEM *addr) +{ + WT_DECL_RET; + const char *config; + + config = NULL; + + /* Retrieve the metadata entry for the file. */ + WT_ERR(__wt_metadata_read(session, name, &config)); + + /* Check the major/minor version numbers. */ + WT_ERR(__ckpt_version_chk(session, name, config)); + + /* Retrieve the named checkpoint or the last checkpoint. */ + if (checkpoint == NULL) + WT_ERR(__ckpt_get_last(session, config, addr)); + else + WT_ERR(__ckpt_get_name(session, checkpoint, config, addr)); + +err: __wt_free(session, config); + return (ret); +} + +/* + * __ckpt_set -- + * Set a file's checkpoint. + */ +static int +__ckpt_set(WT_SESSION_IMPL *session, const char *name, const char *v) +{ + WT_DECL_RET; + const char *config, *cfg[3], *newcfg; + + config = newcfg = NULL; + + /* Retrieve the metadata for this file. */ + WT_ERR(__wt_metadata_read(session, name, &config)); + + /* Replace the checkpoint entry. */ + cfg[0] = config; + cfg[1] = v == NULL ? "checkpoint=()" : v; + cfg[2] = NULL; + WT_ERR(__wt_config_collapse(session, cfg, &newcfg)); + WT_ERR(__wt_metadata_update(session, name, newcfg)); + +err: __wt_free(session, config); + __wt_free(session, newcfg); + return (ret); +} + +/* + * __ckpt_get_name -- + * Return the cookie associated with a file's named checkpoint. + */ +static int +__ckpt_get_name(WT_SESSION_IMPL *session, + const char *name, const char *config, WT_ITEM *addr) +{ + WT_CONFIG ckptconf; + WT_CONFIG_ITEM a, k, v; + + WT_RET(__wt_config_getones(session, config, "checkpoint", &v)); + WT_RET(__wt_config_subinit(session, &ckptconf, &v)); + while (__wt_config_next(&ckptconf, &k, &v) == 0) + if (strlen(name) == k.len && strncmp(name, k.str, k.len) == 0) { + WT_RET(__wt_config_subgets(session, &v, "addr", &a)); + WT_RET(__wt_nhex_to_raw(session, a.str, a.len, addr)); + return (0); + } + return (WT_NOTFOUND); +} + +/* + * __ckpt_get_last -- + * Return the cookie associated with the file's last checkpoint. + */ +static int +__ckpt_get_last( + WT_SESSION_IMPL *session, const char *config, WT_ITEM *addr) +{ + WT_CONFIG ckptconf; + WT_CONFIG_ITEM a, k, v; + int64_t found; + + WT_RET(__wt_config_getones(session, config, "checkpoint", &v)); + WT_RET(__wt_config_subinit(session, &ckptconf, &v)); + for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) { + if (found) { + WT_RET(__wt_config_subgets(session, &v, "order", &a)); + if (a.val < found) + continue; + } + + WT_RET(__wt_config_subgets(session, &v, "addr", &a)); + if (a.len == 0) + WT_RET(EINVAL); + + /* Our caller wants the raw cookie, not the hex. */ + WT_RET(__wt_nhex_to_raw(session, a.str, a.len, addr)); + WT_RET(__wt_config_subgets(session, &v, "order", &a)); + found = a.val; + } + + return (found ? 0 : WT_NOTFOUND); +} + +/* + * __ckpt_compare_order -- + * Qsort comparison routine for the checkpoint list. + */ +static int +__ckpt_compare_order(const void *a, const void *b) +{ + WT_CKPT *ackpt, *bckpt; + + ackpt = (WT_CKPT *)a; + bckpt = (WT_CKPT *)b; + + return (ackpt->order > bckpt->order ? 1 : -1); +} + +/* + * __wt_meta_ckptlist_get -- + * Load all available checkpoint information for a file. + */ +int +__wt_meta_ckptlist_get( + WT_SESSION_IMPL *session, const char *name, WT_CKPT **ckptbasep) +{ + WT_CKPT *ckpt, *ckptbase; + WT_CONFIG ckptconf; + WT_CONFIG_ITEM a, k, v; + WT_DECL_RET; + WT_ITEM *buf; + size_t allocated, slot; + const char *config; + char timebuf[64]; + + *ckptbasep = NULL; + + buf = NULL; + ckptbase = NULL; + allocated = slot = 0; + config = NULL; + + /* Retrieve the metadata information for the file. */ + WT_RET(__wt_metadata_read(session, name, &config)); + + /* Load any existing checkpoints into the array. */ + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + if (__wt_config_getones(session, config, "checkpoint", &v) == 0 && + __wt_config_subinit(session, &ckptconf, &v) == 0) + for (; __wt_config_next(&ckptconf, &k, &v) == 0; ++slot) { + if (slot * sizeof(WT_CKPT) == allocated) + WT_ERR(__wt_realloc(session, &allocated, + (slot + 50) * sizeof(WT_CKPT), &ckptbase)); + ckpt = &ckptbase[slot]; + + /* + * Copy the name, address (raw and hex), order and time + * into the slot. + */ + WT_ERR( + __wt_strndup(session, k.str, k.len, &ckpt->name)); + + WT_ERR(__wt_config_subgets(session, &v, "addr", &a)); + if (a.len == 0) + goto format; + WT_ERR(__wt_buf_set( + session, &ckpt->addr, a.str, a.len)); + WT_ERR(__wt_nhex_to_raw( + session, a.str, a.len, &ckpt->raw)); + + WT_ERR(__wt_config_subgets(session, &v, "order", &a)); + if (a.val == 0) + goto format; + ckpt->order = a.val; + + WT_ERR(__wt_config_subgets(session, &v, "time", &a)); + if (a.len == 0) + goto format; + if (a.len > sizeof(timebuf) - 1) + goto format; + memcpy(timebuf, a.str, a.len); + timebuf[a.len] = '\0'; + if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1) + goto format; + + WT_ERR(__wt_config_subgets(session, &v, "size", &a)); + ckpt->ckpt_size = (uint64_t)a.val; + } + + /* + * Allocate an extra slot for a new value, plus a slot to mark the end. + * + * This isn't very clean, but there's necessary cooperation between the + * schema layer (that maintains the list of checkpoints), the btree + * layer (that knows when the root page is written, creating a new + * checkpoint), and the block manager (which actually creates the + * checkpoint). All of that cooperation is handled in the WT_CKPT + * structure referenced from the WT_BTREE structure. + */ + if ((slot + 2) * sizeof(WT_CKPT) >= allocated) + WT_ERR(__wt_realloc(session, &allocated, + (slot + 2) * sizeof(WT_CKPT), &ckptbase)); + + /* Sort in creation-order. */ + qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order); + + /* Return the array to our caller. */ + *ckptbasep = ckptbase; + + if (0) { +format: WT_ERR_MSG(session, WT_ERROR, "corrupted checkpoint list"); +err: __wt_meta_ckptlist_free(session, ckptbase); + } + __wt_free(session, config); + __wt_scr_free(&buf); + + return (ret); +} + +/* + * __wt_meta_ckptlist_set -- + * Set a file's checkpoint value from the WT_CKPT list. + */ +int +__wt_meta_ckptlist_set( + WT_SESSION_IMPL *session, const char *name, WT_CKPT *ckptbase) +{ + WT_CKPT *ckpt; + WT_DECL_RET; + WT_ITEM *buf; + int64_t order; + const char *sep; + + buf = NULL; + + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + order = 0; + sep = ""; + WT_ERR(__wt_buf_fmt(session, buf, "checkpoint=(")); + WT_CKPT_FOREACH(ckptbase, ckpt) { + /* Skip deleted checkpoints. */ + if (F_ISSET(ckpt, WT_CKPT_DELETE)) + continue; + + /* + * Track the largest active checkpoint counter: it's not really + * a generational number or an ID because we reset it to 1 if + * the checkpoint we're writing is the only checkpoint the file + * has. The problem we're solving is when two checkpoints are + * taken quickly, the timer may not be unique and/or we can even + * see time travel on the second checkpoint if we read the time + * in-between nanoseconds rolling over. All we need to know + * is the real checkpoint order so we don't accidentally take + * the wrong "last" checkpoint. + */ + if (ckpt->order > order) + order = ckpt->order; + + if (F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_UPDATE)) { + /* Convert the raw cookie to a hex string. */ + WT_ERR(__wt_raw_to_hex(session, + ckpt->raw.data, ckpt->raw.size, &ckpt->addr)); + + if (F_ISSET(ckpt, WT_CKPT_ADD)) + ckpt->order = order + 1; + } + WT_ERR(__wt_buf_catfmt(session, buf, + "%s%s=(addr=\"%.*s\",order=%" PRIu64 + ",time=%" PRIuMAX ",size=%" PRIu64 ")", + sep, ckpt->name, + (int)ckpt->addr.size, (char *)ckpt->addr.data, + ckpt->order, ckpt->sec, ckpt->ckpt_size)); + sep = ","; + } + WT_ERR(__wt_buf_catfmt(session, buf, ")")); + WT_ERR(__ckpt_set(session, name, buf->mem)); + +err: __wt_scr_free(&buf); + + return (ret); +} + +/* + * __wt_meta_ckptlist_free -- + * Discard the checkpoint array. + */ +void +__wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) +{ + WT_CKPT *ckpt; + + if (ckptbase == NULL) + return; + + WT_CKPT_FOREACH(ckptbase, ckpt) { + __wt_free(session, ckpt->name); + __wt_buf_free(session, &ckpt->addr); + __wt_buf_free(session, &ckpt->raw); + __wt_free(session, ckpt->bpriv); + } + __wt_free(session, ckptbase); +} + +/* + * __ckpt_version_chk -- + * Check the version major/minor numbers. + */ +static int +__ckpt_version_chk( + WT_SESSION_IMPL *session, const char *name, const char *config) +{ + WT_CONFIG_ITEM a, v; + int majorv, minorv; + + WT_RET(__wt_config_getones(session, config, "version", &v)); + WT_RET(__wt_config_subgets(session, &v, "major", &a)); + majorv = (int)a.val; + WT_RET(__wt_config_subgets(session, &v, "minor", &a)); + minorv = (int)a.val; + + if (majorv > WT_BTREE_MAJOR_VERSION || + (majorv == WT_BTREE_MAJOR_VERSION && + minorv > WT_BTREE_MINOR_VERSION)) + WT_RET_MSG(session, EACCES, + "%s is an unsupported version of a WiredTiger file", + name); + return (0); +} diff --git a/src/meta/meta_snapshot.c b/src/meta/meta_snapshot.c deleted file mode 100644 index 60642e7f53c..00000000000 --- a/src/meta/meta_snapshot.c +++ /dev/null @@ -1,410 +0,0 @@ -/*- - * Copyright (c) 2008-2012 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "wt_internal.h" - -static int __snap_get( - WT_SESSION_IMPL *, const char *, const char *, WT_ITEM *); -static int __snap_get_last(WT_SESSION_IMPL *, const char *, WT_ITEM *); -static int __snap_get_name( - WT_SESSION_IMPL *, const char *, const char *, WT_ITEM *); -static int __snap_set(WT_SESSION_IMPL *, const char *, const char *); -static int __snap_version_chk(WT_SESSION_IMPL *, const char *, const char *); - -/* - * __wt_meta_snapshot_get -- - * Get the file's most recent snapshot address. - */ -int -__wt_meta_snapshot_get(WT_SESSION_IMPL *session, - const char *name, const char *snapshot, WT_ITEM *addr) -{ - WT_DECL_RET; - - /* Get the snapshot address. */ - ret = __snap_get(session, name, snapshot, addr); - - /* - * If we find a snapshot, check the version and return the address. - * If we don't find a named snapshot, we're done, they're read-only. - * If we don't find a default snapshot, it's creation, return "no - * data" and let our caller handle it. - */ - if (ret == WT_NOTFOUND) { - /* - * If the caller didn't give us a specific snapshot name, we - * assume it's a creation and there isn't a snapshot to find. - * Let the caller deal with the failure. - */ - if (snapshot != NULL) - WT_RET_MSG(session, WT_NOTFOUND, - "no \"%s\" snapshot found in %s", - snapshot, name); - - addr->data = NULL; - addr->size = 0; - } - return (0); -} - -/* - * __wt_meta_snapshot_clear -- - * Clear a file's snapshot. - */ -int -__wt_meta_snapshot_clear(WT_SESSION_IMPL *session, const char *name) -{ - WT_DECL_RET; - - ret = __snap_set(session, name, NULL); - - /* - * If we are unrolling a failed create, we may have already removed the - * metadata entry. If no entry is found to update and we're trying to - * clear the snapshot, just ignore it. - */ - if (ret == WT_NOTFOUND) - ret = 0; - return (ret); -} - -/* - * __snap_get -- - * Get a file's snapshot. - */ -static int -__snap_get(WT_SESSION_IMPL *session, - const char *name, const char *snapshot, WT_ITEM *addr) -{ - WT_DECL_RET; - const char *config; - - config = NULL; - - /* Retrieve the metadata entry for the file. */ - WT_ERR(__wt_metadata_read(session, name, &config)); - - /* Check the major/minor version numbers. */ - WT_ERR(__snap_version_chk(session, name, config)); - - /* Retrieve the named snapshot or the last snapshot. */ - if (snapshot == NULL) - WT_ERR(__snap_get_last(session, config, addr)); - else - WT_ERR(__snap_get_name(session, snapshot, config, addr)); - -err: __wt_free(session, config); - return (ret); -} - -/* - * __snap_set -- - * Set a file's snapshot. - */ -static int -__snap_set(WT_SESSION_IMPL *session, const char *name, const char *v) -{ - WT_DECL_RET; - const char *config, *cfg[3], *newcfg; - - config = newcfg = NULL; - - /* Retrieve the metadata for this file. */ - WT_ERR(__wt_metadata_read(session, name, &config)); - - /* Replace the snapshot entry. */ - cfg[0] = config; - cfg[1] = v == NULL ? "snapshot=()" : v; - cfg[2] = NULL; - WT_ERR(__wt_config_collapse(session, cfg, &newcfg)); - WT_ERR(__wt_metadata_update(session, name, newcfg)); - -err: __wt_free(session, config); - __wt_free(session, newcfg); - return (ret); -} - -/* - * __snap_get_name -- - * Return the cookie associated with a file's named snapshot. - */ -static int -__snap_get_name(WT_SESSION_IMPL *session, - const char *name, const char *config, WT_ITEM *addr) -{ - WT_CONFIG snapconf; - WT_CONFIG_ITEM a, k, v; - - WT_RET(__wt_config_getones(session, config, "snapshot", &v)); - WT_RET(__wt_config_subinit(session, &snapconf, &v)); - while (__wt_config_next(&snapconf, &k, &v) == 0) - if (strlen(name) == k.len && strncmp(name, k.str, k.len) == 0) { - WT_RET(__wt_config_subgets(session, &v, "addr", &a)); - WT_RET(__wt_nhex_to_raw(session, a.str, a.len, addr)); - return (0); - } - return (WT_NOTFOUND); -} - -/* - * __snap_get_last -- - * Return the cookie associated with the file's last snapshot. - */ -static int -__snap_get_last( - WT_SESSION_IMPL *session, const char *config, WT_ITEM *addr) -{ - WT_CONFIG snapconf; - WT_CONFIG_ITEM a, k, v; - int64_t found; - - WT_RET(__wt_config_getones(session, config, "snapshot", &v)); - WT_RET(__wt_config_subinit(session, &snapconf, &v)); - for (found = 0; __wt_config_next(&snapconf, &k, &v) == 0;) { - if (found) { - WT_RET(__wt_config_subgets(session, &v, "order", &a)); - if (a.val < found) - continue; - } - - WT_RET(__wt_config_subgets(session, &v, "addr", &a)); - if (a.len == 0) - WT_RET(EINVAL); - - /* Our caller wants the raw cookie, not the hex. */ - WT_RET(__wt_nhex_to_raw(session, a.str, a.len, addr)); - WT_RET(__wt_config_subgets(session, &v, "order", &a)); - found = a.val; - } - - return (found ? 0 : WT_NOTFOUND); -} - -/* - * __snap_compare_order -- - * Qsort comparison routine for the snapshot list. - */ -static int -__snap_compare_order(const void *a, const void *b) -{ - WT_SNAPSHOT *asnap, *bsnap; - - asnap = (WT_SNAPSHOT *)a; - bsnap = (WT_SNAPSHOT *)b; - - return (asnap->order > bsnap->order ? 1 : -1); -} - -/* - * __wt_meta_snaplist_get -- - * Load all available snapshot information for a file. - */ -int -__wt_meta_snaplist_get( - WT_SESSION_IMPL *session, const char *name, WT_SNAPSHOT **snapbasep) -{ - WT_CONFIG snapconf; - WT_CONFIG_ITEM a, k, v; - WT_DECL_RET; - WT_ITEM *buf; - WT_SNAPSHOT *snap, *snapbase; - size_t allocated, slot; - const char *config; - char timebuf[64]; - - *snapbasep = NULL; - - buf = NULL; - snapbase = NULL; - allocated = slot = 0; - config = NULL; - - /* Retrieve the metadata information for the file. */ - WT_RET(__wt_metadata_read(session, name, &config)); - - /* Load any existing snapshots into the array. */ - WT_ERR(__wt_scr_alloc(session, 0, &buf)); - if (__wt_config_getones(session, config, "snapshot", &v) == 0 && - __wt_config_subinit(session, &snapconf, &v) == 0) - for (; __wt_config_next(&snapconf, &k, &v) == 0; ++slot) { - if (slot * sizeof(WT_SNAPSHOT) == allocated) - WT_ERR(__wt_realloc(session, &allocated, - (slot + 50) * sizeof(WT_SNAPSHOT), - &snapbase)); - snap = &snapbase[slot]; - - /* - * Copy the name, address (raw and hex), order and time - * into the slot. - */ - WT_ERR( - __wt_strndup(session, k.str, k.len, &snap->name)); - - WT_ERR(__wt_config_subgets(session, &v, "addr", &a)); - if (a.len == 0) - goto format; - WT_ERR(__wt_buf_set( - session, &snap->addr, a.str, a.len)); - WT_ERR(__wt_nhex_to_raw( - session, a.str, a.len, &snap->raw)); - - WT_ERR(__wt_config_subgets(session, &v, "order", &a)); - if (a.val == 0) - goto format; - snap->order = a.val; - - WT_ERR(__wt_config_subgets(session, &v, "time", &a)); - if (a.len == 0) - goto format; - if (a.len > sizeof(timebuf) - 1) - goto format; - memcpy(timebuf, a.str, a.len); - timebuf[a.len] = '\0'; - if (sscanf(timebuf, "%" SCNuMAX, &snap->sec) != 1) - goto format; - - WT_ERR(__wt_config_subgets(session, &v, "size", &a)); - snap->snapshot_size = (uint64_t)a.val; - } - - /* - * Allocate an extra slot for a new value, plus a slot to mark the end. - * - * This isn't very clean, but there's necessary cooperation between the - * schema layer (that maintains the list of snapshots), the btree layer - * (that knows when the root page is written, creating a new snapshot), - * and the block manager (which actually creates the snapshot). All of - * that cooperation is handled in the WT_SNAPSHOT structure referenced - * from the WT_BTREE structure. - */ - if ((slot + 2) * sizeof(WT_SNAPSHOT) >= allocated) - WT_ERR(__wt_realloc(session, &allocated, - (slot + 2) * sizeof(WT_SNAPSHOT), &snapbase)); - - /* Sort in creation-order. */ - qsort(snapbase, slot, sizeof(WT_SNAPSHOT), __snap_compare_order); - - /* Return the array to our caller. */ - *snapbasep = snapbase; - - if (0) { -format: WT_ERR_MSG(session, WT_ERROR, "corrupted snapshot list"); -err: __wt_meta_snaplist_free(session, snapbase); - } - __wt_free(session, config); - __wt_scr_free(&buf); - - return (ret); -} - -/* - * __wt_meta_snaplist_set -- - * Set a file's snapshot value from the WT_SNAPSHOT list. - */ -int -__wt_meta_snaplist_set( - WT_SESSION_IMPL *session, const char *name, WT_SNAPSHOT *snapbase) -{ - WT_DECL_RET; - WT_ITEM *buf; - WT_SNAPSHOT *snap; - int64_t order; - const char *sep; - - buf = NULL; - - WT_ERR(__wt_scr_alloc(session, 0, &buf)); - order = 0; - sep = ""; - WT_ERR(__wt_buf_fmt(session, buf, "snapshot=(")); - WT_SNAPSHOT_FOREACH(snapbase, snap) { - /* Skip deleted snapshots. */ - if (F_ISSET(snap, WT_SNAP_DELETE)) - continue; - - /* - * Track the largest active snapshot counter: it's not really - * a generational number or an ID because we reset it to 1 if - * the snapshot we're writing is the only snapshot the file has. - * The problem we're solving is when two snapshots are taken - * quickly, the timer may not be unique and/or we can even see - * time travel on the second snapshot if we read the time - * in-between nanoseconds rolling over. All we need to know - * is the real snapshot order so we don't accidentally take the - * wrong "last" snapshot. - */ - if (snap->order > order) - order = snap->order; - - if (F_ISSET(snap, WT_SNAP_ADD | WT_SNAP_UPDATE)) { - /* Convert the raw cookie to a hex string. */ - WT_ERR(__wt_raw_to_hex(session, - snap->raw.data, snap->raw.size, &snap->addr)); - - if (F_ISSET(snap, WT_SNAP_ADD)) - snap->order = order + 1; - } - WT_ERR(__wt_buf_catfmt(session, buf, - "%s%s=(addr=\"%.*s\",order=%" PRIu64 - ",time=%" PRIuMAX ",size=%" PRIu64 ")", - sep, snap->name, - (int)snap->addr.size, (char *)snap->addr.data, - snap->order, snap->sec, snap->snapshot_size)); - sep = ","; - } - WT_ERR(__wt_buf_catfmt(session, buf, ")")); - WT_ERR(__snap_set(session, name, buf->mem)); - -err: __wt_scr_free(&buf); - - return (ret); -} - -/* - * __wt_meta_snaplist_free -- - * Discard the snapshot array. - */ -void -__wt_meta_snaplist_free(WT_SESSION_IMPL *session, WT_SNAPSHOT *snapbase) -{ - WT_SNAPSHOT *snap; - if (snapbase == NULL) - return; - WT_SNAPSHOT_FOREACH(snapbase, snap) { - __wt_free(session, snap->name); - __wt_buf_free(session, &snap->addr); - __wt_buf_free(session, &snap->raw); - __wt_free(session, snap->bpriv); - } - __wt_free(session, snapbase); -} - -/* - * __snap_version_chk -- - * Check the version major/minor numbers. - */ -static int -__snap_version_chk( - WT_SESSION_IMPL *session, const char *name, const char *config) -{ - WT_CONFIG_ITEM a, v; - int majorv, minorv; - - WT_RET(__wt_config_getones(session, config, "version", &v)); - WT_RET(__wt_config_subgets(session, &v, "major", &a)); - majorv = (int)a.val; - WT_RET(__wt_config_subgets(session, &v, "minor", &a)); - minorv = (int)a.val; - - if (majorv > WT_BTREE_MAJOR_VERSION || - (majorv == WT_BTREE_MAJOR_VERSION && - minorv > WT_BTREE_MINOR_VERSION)) - WT_RET_MSG(session, EACCES, - "%s is an unsupported version of a WiredTiger file", - name); - return (0); -} diff --git a/src/meta/meta_track.c b/src/meta/meta_track.c index 9f17789a1cc..03cc8150e1b 100644 --- a/src/meta/meta_track.c +++ b/src/meta/meta_track.c @@ -106,9 +106,9 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) saved_btree = session->btree; session->btree = trk->btree; if (!unroll) - WT_TRET(__wt_bm_snapshot_resolve(session, NULL)); - /* Release the snapshot lock */ - __wt_rwunlock(session, session->btree->snaplock); + WT_TRET(__wt_bm_checkpoint_resolve(session)); + /* Release the checkpoint lock */ + __wt_rwunlock(session, session->btree->ckptlock); session->btree = saved_btree; break; case WT_ST_LOCK: /* Handle lock, see above */ diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c index bd8a64736b4..6bb83642be7 100644 --- a/src/schema/schema_truncate.c +++ b/src/schema/schema_truncate.c @@ -24,7 +24,7 @@ __truncate_file(WT_SESSION_IMPL *session, const char *name) WT_RET(__wt_conn_btree_close_all(session, name)); /* Delete the root address and truncate the file. */ - WT_RET(__wt_meta_snapshot_clear(session, name)); + WT_RET(__wt_meta_checkpoint_clear(session, name)); WT_RET(__wt_btree_truncate(session, filename)); return (0); diff --git a/src/session/session_btree.c b/src/session/session_btree.c index abcc3efa16e..83d74201905 100644 --- a/src/session/session_btree.c +++ b/src/session/session_btree.c @@ -159,10 +159,10 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, btree = btree_session->btree; if (strcmp(uri, btree->name) != 0) continue; - if ((ckpt == NULL && btree->ckpt == NULL) || - (ckpt != NULL && btree->ckpt != NULL && - (strncmp(ckpt, btree->ckpt, ckptlen) == 0 && - btree->ckpt[ckptlen] == '\0'))) + if ((ckpt == NULL && btree->checkpoint == NULL) || + (ckpt != NULL && btree->checkpoint != NULL && + (strncmp(ckpt, btree->checkpoint, ckptlen) == 0 && + btree->checkpoint[ckptlen] == '\0'))) break; } @@ -201,12 +201,12 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, } /* - * __wt_session_lock_snapshot -- - * Lock the btree handle for the given snapshot name. + * __wt_session_lock_checkpoint -- + * Lock the btree handle for the given checkpoint name. */ int -__wt_session_lock_snapshot( - WT_SESSION_IMPL *session, const char *snapshot, uint32_t flags) +__wt_session_lock_checkpoint( + WT_SESSION_IMPL *session, const char *checkpoint, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; @@ -217,7 +217,7 @@ __wt_session_lock_snapshot( btree = session->btree; WT_ERR(__wt_scr_alloc(session, 0, &buf)); - WT_ERR(__wt_buf_fmt(session, buf, "checkpoint=\"%s\"", snapshot)); + WT_ERR(__wt_buf_fmt(session, buf, "checkpoint=\"%s\"", checkpoint)); cfg[0] = buf->data; LF_SET(WT_BTREE_LOCK_ONLY); diff --git a/src/session/session_salvage.c b/src/session/session_salvage.c index 87fb2d36854..4f45115423b 100644 --- a/src/session/session_salvage.c +++ b/src/session/session_salvage.c @@ -15,43 +15,43 @@ int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BTREE *btree; + WT_CKPT *ckptbase; WT_DECL_RET; - WT_SNAPSHOT *snapbase; btree = session->btree; /* * XXX - * The salvage process reads and discards previous snapshot blocks, so - * the underlying block manager has to ignore any previous snapshot - * entries when creating a new snapshot, in other words, we can't use - * the metadata snapshot list, it has all of those snapshots listed and - * we don't care about them. Build a clean snapshot array and use it - * instead. + * The salvage process reads and discards previous checkpoints, so the + * underlying block manager has to ignore any previous checkpoint + * entries when creating a new checkpoint, in other words, we can't use + * the metadata checkpoint list, it has all of those checkpoint listed + * and we don't care about them. Build a clean checkpoint list and use + * it instead. * - * Don't first clear the metadata snapshot list and call the snapshot - * get routine: a crash between clearing the metadata snapshot list and - * creating a new snapshot list would look like a create or open of a - * file without a snapshot from which to roll-forward, and the contents - * of the file would be discarded. + * Don't first clear the metadata checkpoint list and call the function + * to get a list of checkpoints: a crash between clearing the metadata + * checkpoint list and creating a new checkpoint list would look like a + * create or open of a file without a checkpoint to roll-forward from, + * and the contents of the file would be discarded. */ - WT_RET(__wt_calloc_def(session, 2, &snapbase)); - WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snapbase[0].name)); - F_SET(&snapbase[0], WT_SNAP_ADD); + WT_RET(__wt_calloc_def(session, 2, &ckptbase)); + WT_ERR(__wt_strdup(session, WT_INTERNAL_CHKPT, &ckptbase[0].name)); + F_SET(&ckptbase[0], WT_CKPT_ADD); - WT_ERR(__wt_bt_salvage(session, snapbase, cfg)); + WT_ERR(__wt_bt_salvage(session, ckptbase, cfg)); /* - * If no snapshot was created, well, it's probably bad news, but there - * is nothing to do but clear any recorded snapshots for the file. If - * a snapshot was created, life is good, replace any recorded snapshots - * with the new one. + * If no checkpoint was created, well, it's probably bad news, but there + * is nothing to do but clear any recorded checkpoints for the file. If + * a checkpoint was created, life is good, replace any existing list of + * checkpoints with the single new one. */ - if (snapbase[0].raw.data == NULL) - WT_ERR(__wt_meta_snapshot_clear(session, btree->name)); + if (ckptbase[0].raw.data == NULL) + WT_ERR(__wt_meta_checkpoint_clear(session, btree->name)); else - WT_ERR(__wt_meta_snaplist_set(session, btree->name, snapbase)); + WT_ERR(__wt_meta_ckptlist_set(session, btree->name, ckptbase)); -err: __wt_meta_snaplist_free(session, snapbase); +err: __wt_meta_ckptlist_free(session, ckptbase); return (ret); } diff --git a/src/txn/txn.c b/src/txn/txn.c index 09f9d8fe42c..b71a4e6cf84 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -129,11 +129,11 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) } /* - * __txn_release -- + * __wt_txn_release -- * Release the resources associated with the current transaction. */ -static int -__txn_release(WT_SESSION_IMPL *session) +int +__wt_txn_release(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; @@ -162,7 +162,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) { WT_UNUSED(cfg); - return (__txn_release(session)); + return (__wt_txn_release(session)); } /* @@ -182,107 +182,7 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) for (i = 0, m = txn->mod; i < txn->mod_count; i++, m++) **m = WT_TXN_ABORTED; - return (__txn_release(session)); -} - -/* - * __wt_txn_checkpoint -- - * Checkpoint a database or a list of objects in the database. - */ -int -__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) -{ - WT_CONFIG targetconf; - WT_CONFIG_ITEM cval, k, v; - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_TXN_GLOBAL *txn_global; - int target_list, tracking; - const char *txn_cfg[] = { "isolation=snapshot", NULL }; - - target_list = tracking = 0; - txn_global = &S2C(session)->txn_global; - - /* Only one checkpoint can be active at a time. */ - __wt_writelock(session, S2C(session)->ckpt_rwlock); - WT_ERR(__wt_txn_begin(session, txn_cfg)); - - /* Prevent eviction from evicting anything newer than this. */ - txn_global->ckpt_txnid = session->txn.snap_min; - - WT_ERR(__wt_meta_track_on(session)); - tracking = 1; - - /* Step through the list of targets and snapshot each one. */ - cval.len = 0; - WT_ERR(__wt_config_gets(session, cfg, "target", &cval)); - if (cval.len != 0) { - WT_ERR(__wt_scr_alloc(session, 512, &tmp)); - WT_ERR(__wt_config_subinit(session, &targetconf, &cval)); - while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) { - target_list = 1; - WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", - (int)k.len, k.str)); - - if (v.len != 0) - WT_ERR_MSG(session, EINVAL, - "invalid checkpoint target \"%s\": " - "URIs may require quoting", - (const char *)tmp->data); - - __wt_spin_lock(session, &S2C(session)->schema_lock); - ret = __wt_schema_worker( - session, tmp->data, __wt_snapshot, cfg, 0); - __wt_spin_unlock(session, &S2C(session)->schema_lock); - - if (ret != 0) - WT_ERR_MSG(session, ret, "%s", - (const char *)tmp->data); - } - if (ret == WT_NOTFOUND) - ret = 0; - } - - if (!target_list) { - /* - * Possible checkpoint snapshot name. If snapshots are named, - * we must snapshot both open and closed files; if snapshots - * are not named, we only snapshot open files. - * - * XXX - * We don't optimize unnamed checkpoints of a list of targets, - * we open the targets and snapshot them even if they are - * quiescent and don't need a snapshot, believing applications - * unlikely to checkpoint a list of closed targets. - */ - cval.len = 0; - WT_ERR(__wt_config_gets(session, cfg, "name", &cval)); - WT_ERR(cval.len == 0 ? - __wt_conn_btree_apply(session, __wt_snapshot, cfg) : - __wt_meta_btree_apply(session, __wt_snapshot, cfg, 0)); - } - -err: /* - * XXX Rolling back the changes here is problematic. - * - * If we unroll here, we need a way to roll back changes to the avail - * list for each tree that was successfully synced before the error - * occurred. Otherwise, the next time we try this operation, we will - * try to free an old snapshot again. - * - * OTOH, if we commit the changes after a failure, we have partially - * overwritten the checkpoint, so what ends up on disk is not - * consistent. - */ - if (tracking) - WT_TRET(__wt_meta_track_off(session, ret != 0)); - - txn_global->ckpt_txnid = WT_TXN_NONE; - if (F_ISSET(&session->txn, TXN_RUNNING)) - WT_TRET(__txn_release(session)); - __wt_rwunlock(session, S2C(session)->ckpt_rwlock); - __wt_scr_free(&tmp); - return (ret); + return (__wt_txn_release(session)); } /* diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c new file mode 100644 index 00000000000..d60a1938609 --- /dev/null +++ b/src/txn/txn_ckpt.c @@ -0,0 +1,333 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_txn_checkpoint -- + * Checkpoint a database or a list of objects in the database. + */ +int +__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONFIG targetconf; + WT_CONFIG_ITEM cval, k, v; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_TXN_GLOBAL *txn_global; + int target_list, tracking; + const char *txn_cfg[] = { "isolation=snapshot", NULL }; + + target_list = tracking = 0; + txn_global = &S2C(session)->txn_global; + + /* Only one checkpoint can be active at a time. */ + __wt_writelock(session, S2C(session)->ckpt_rwlock); + WT_ERR(__wt_txn_begin(session, txn_cfg)); + + /* Prevent eviction from evicting anything newer than this. */ + txn_global->ckpt_txnid = session->txn.snap_min; + + WT_ERR(__wt_meta_track_on(session)); + tracking = 1; + + /* Step through the list of targets and checkpoint each one. */ + cval.len = 0; + WT_ERR(__wt_config_gets(session, cfg, "target", &cval)); + if (cval.len != 0) { + WT_ERR(__wt_scr_alloc(session, 512, &tmp)); + WT_ERR(__wt_config_subinit(session, &targetconf, &cval)); + while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) { + target_list = 1; + WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", + (int)k.len, k.str)); + + if (v.len != 0) + WT_ERR_MSG(session, EINVAL, + "invalid checkpoint target \"%s\": " + "URIs may require quoting", + (const char *)tmp->data); + + __wt_spin_lock(session, &S2C(session)->schema_lock); + ret = __wt_schema_worker( + session, tmp->data, __wt_checkpoint, cfg, 0); + __wt_spin_unlock(session, &S2C(session)->schema_lock); + + if (ret != 0) + WT_ERR_MSG(session, ret, "%s", + (const char *)tmp->data); + } + if (ret == WT_NOTFOUND) + ret = 0; + } + + if (!target_list) { + /* + * Possible checkpoint name. If checkpoints are named, we must + * checkpoint both open and closed files; if checkpoints are not + * named, we only checkpoint open files. + * + * XXX + * We don't optimize unnamed checkpoints of a list of targets, + * we open the targets and checkpoint them even if they are + * quiescent and don't need a checkpoint, believing applications + * unlikely to checkpoint a list of closed targets. + */ + cval.len = 0; + WT_ERR(__wt_config_gets(session, cfg, "name", &cval)); + WT_ERR(cval.len == 0 ? + __wt_conn_btree_apply(session, __wt_checkpoint, cfg) : + __wt_meta_btree_apply(session, __wt_checkpoint, cfg, 0)); + } + +err: /* + * XXX Rolling back the changes here is problematic. + * + * If we unroll here, we need a way to roll back changes to the avail + * list for each tree that was successfully synced before the error + * occurred. Otherwise, the next time we try this operation, we will + * try to free an old checkpoint again. + * + * OTOH, if we commit the changes after a failure, we have partially + * overwritten the checkpoint, so what ends up on disk is not + * consistent. + */ + if (tracking) + WT_TRET(__wt_meta_track_off(session, ret != 0)); + + txn_global->ckpt_txnid = WT_TXN_NONE; + if (F_ISSET(&session->txn, TXN_RUNNING)) + WT_TRET(__wt_txn_release(session)); + __wt_rwunlock(session, S2C(session)->ckpt_rwlock); + __wt_scr_free(&tmp); + return (ret); +} + +/* + * __drop -- + * Drop all checkpoints with a specific name. + */ +static void +__drop(WT_CKPT *ckptbase, const char *name, size_t len) +{ + WT_CKPT *ckpt; + + WT_CKPT_FOREACH(ckptbase, ckpt) + if (strlen(ckpt->name) == len && + strncmp(ckpt->name, name, len) == 0) + F_SET(ckpt, WT_CKPT_DELETE); +} + +/* + * __drop_from -- + * Drop all checkpoints after, and including, the named checkpoint. + */ +static void +__drop_from(WT_CKPT *ckptbase, const char *name, size_t len) +{ + WT_CKPT *ckpt; + int matched; + + /* + * There's a special case -- if the name is "all", then we delete all + * of the checkpoints. + */ + if (len == strlen("all") && strncmp(name, "all", len) == 0) { + WT_CKPT_FOREACH(ckptbase, ckpt) + F_SET(ckpt, WT_CKPT_DELETE); + return; + } + + /* + * We use the first checkpoint we can find, that is, if there are two + * checkpoints with the same name in the list, we'll delete from the + * first match to the end. + */ + matched = 0; + WT_CKPT_FOREACH(ckptbase, ckpt) { + if (!matched && + (strlen(ckpt->name) != len || + strncmp(ckpt->name, name, len) != 0)) + continue; + + matched = 1; + F_SET(ckpt, WT_CKPT_DELETE); + } +} + +/* + * __drop_to -- + * Drop all checkpoints before, and including, the named checkpoint. + */ +static void +__drop_to(WT_CKPT *ckptbase, const char *name, size_t len) +{ + WT_CKPT *ckpt, *mark; + + /* + * We use the last checkpoint we can find, that is, if there are two + * checkpoints with the same name in the list, we'll delete from the + * beginning to the second match, not the first. + */ + mark = NULL; + WT_CKPT_FOREACH(ckptbase, ckpt) + if (strlen(ckpt->name) == len && + strncmp(ckpt->name, name, len) == 0) + mark = ckpt; + + if (mark == NULL) + return; + + WT_CKPT_FOREACH(ckptbase, ckpt) { + F_SET(ckpt, WT_CKPT_DELETE); + + if (ckpt == mark) + break; + } +} + +/* + * __wt_checkpoint -- + * Checkpoint a tree. + */ +int +__wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_BTREE *btree; + WT_CKPT *ckpt, *ckptbase, *deleted; + WT_CONFIG dropconf; + WT_CONFIG_ITEM cval, k, v; + WT_DECL_RET; + const char *name; + char *name_alloc; + int force, tracked; + + btree = session->btree; + force = tracked = 0; + ckpt = ckptbase = NULL; + name_alloc = NULL; + + /* Checkpoints are single-threaded. */ + __wt_writelock(session, btree->ckptlock); + + /* + * Get the list of checkpoints for this file. If there's no reference, + * this file is dead. Discard it from the cache without bothering to + * write any dirty pages. + */ + if ((ret = + __wt_meta_ckptlist_get(session, btree->name, &ckptbase)) != 0) { + if (ret == WT_NOTFOUND) + ret = __wt_bt_cache_flush( + session, NULL, WT_SYNC_DISCARD_NOWRITE, 0); + goto err; + } + + /* + * This may be a named checkpoint, check the configuration. If it's a + * named checkpoint, set force, we have to create the checkpoint even if + * the tree is clean. + */ + cval.len = 0; + if (cfg != NULL) + WT_ERR(__wt_config_gets(session, cfg, "name", &cval)); + if (cval.len == 0) + name = WT_INTERNAL_CHKPT; + else { + force = 1; + WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc)); + name = name_alloc; + } + + /* + * We may be dropping checkpoints, check the configuration. If we're + * dropping checkpoints, set force, we have to create the checkpoint + * even if the tree is clean. + */ + if (cfg != NULL) { + cval.len = 0; + WT_ERR(__wt_config_gets(session, cfg, "drop", &cval)); + if (cval.len != 0) { + WT_ERR(__wt_config_subinit(session, &dropconf, &cval)); + while ((ret = + __wt_config_next(&dropconf, &k, &v)) == 0) { + force = 1; + + if (v.len == 0) + __drop(ckptbase, k.str, k.len); + else if (k.len == strlen("from") && + strncmp(k.str, "from", k.len) == 0) + __drop_from(ckptbase, v.str, v.len); + else if (k.len == strlen("to") && + strncmp(k.str, "to", k.len) == 0) + __drop_to(ckptbase, v.str, v.len); + else + WT_ERR_MSG(session, EINVAL, + "unexpected value for checkpoint " + "key: %.*s", + (int)k.len, k.str); + } + WT_ERR_NOTFOUND_OK(ret); + } + } + + /* Discard checkpoints with the same name as the new checkpoint. */ + __drop(ckptbase, name, strlen(name)); + + /* Add a new checkpoint entry at the end of the list. */ + WT_CKPT_FOREACH(ckptbase, ckpt) + ; + WT_ERR(__wt_strdup(session, name, &ckpt->name)); + F_SET(ckpt, WT_CKPT_ADD); + + /* + * Lock the checkpoints that will be deleted. + * + * Checkpoints are only locked when tracking is enabled, which covers + * sync and drop operations, but not close. The reasoning is that + * there should be no access to a checkpoint during close, because any + * thread accessing a checkpoint will also have the current file handle + * open. + */ + if (WT_META_TRACKING(session)) + WT_CKPT_FOREACH(ckptbase, deleted) + if (F_ISSET(deleted, WT_CKPT_DELETE)) + WT_ERR(__wt_session_lock_checkpoint(session, + deleted->name, WT_BTREE_EXCLUSIVE)); + + /* Flush the file from the cache, creating the checkpoint. */ + WT_ERR(__wt_bt_cache_flush( + session, ckptbase, cfg == NULL ? WT_SYNC_DISCARD : WT_SYNC, force)); + + /* If there was a checkpoint, update the metadata and resolve it. */ + if (ckpt->raw.data == NULL) { + if (force) + WT_ERR_MSG(session, EINVAL, + "cache flush failed to create a checkpoint"); + } else { + WT_ERR(__wt_meta_ckptlist_set(session, btree->name, ckptbase)); + /* + * If tracking is enabled, defer making pages available until + * the end of the transaction. The exception is if the handle + * is being discarded: in that case, it will be gone by the + * time we try to apply or unroll the meta tracking event. + */ + if (WT_META_TRACKING(session) && cfg != NULL) { + WT_ERR(__wt_meta_track_checkpoint(session)); + tracked = 1; + } else + WT_ERR(__wt_bm_checkpoint_resolve(session)); + } + +err: __wt_meta_ckptlist_free(session, ckptbase); + if (!tracked) + __wt_rwunlock(session, btree->ckptlock); + + __wt_free(session, name_alloc); + + return (ret); +} diff --git a/src/txn/txn_snapshot.c b/src/txn/txn_snapshot.c deleted file mode 100644 index 85ac650c21d..00000000000 --- a/src/txn/txn_snapshot.c +++ /dev/null @@ -1,233 +0,0 @@ -/*- - * Copyright (c) 2008-2012 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "wt_internal.h" - -/* - * __drop -- - * Drop all snapshots with a specific name. - */ -static void -__drop(WT_SNAPSHOT *snapbase, const char *name, size_t len) -{ - WT_SNAPSHOT *snap; - - WT_SNAPSHOT_FOREACH(snapbase, snap) - if (strlen(snap->name) == len && - strncmp(snap->name, name, len) == 0) - F_SET(snap, WT_SNAP_DELETE); -} - -/* - * __drop_from -- - * Drop all snapshots after, and including, the named snapshot. - */ -static void -__drop_from(WT_SNAPSHOT *snapbase, const char *name, size_t len) -{ - WT_SNAPSHOT *snap; - int matched; - - /* - * There's a special case -- if the name is "all", then we delete all - * of the snapshots. - */ - if (len == strlen("all") && strncmp(name, "all", len) == 0) { - WT_SNAPSHOT_FOREACH(snapbase, snap) - F_SET(snap, WT_SNAP_DELETE); - return; - } - - /* - * We use the first snapshot we can find, that is, if there are two - * snapshots with the same name in the list, we'll delete from the - * first match to the end. - */ - matched = 0; - WT_SNAPSHOT_FOREACH(snapbase, snap) { - if (!matched && - (strlen(snap->name) != len || - strncmp(snap->name, name, len) != 0)) - continue; - - matched = 1; - F_SET(snap, WT_SNAP_DELETE); - } -} - -/* - * __drop_to -- - * Drop all snapshots before, and including, the named snapshot. - */ -static void -__drop_to(WT_SNAPSHOT *snapbase, const char *name, size_t len) -{ - WT_SNAPSHOT *mark, *snap; - - /* - * We use the last snapshot we can find, that is, if there are two - * snapshots with the same name in the list, we'll delete from the - * beginning to the second match, not the first. - */ - mark = NULL; - WT_SNAPSHOT_FOREACH(snapbase, snap) - if (strlen(snap->name) == len && - strncmp(snap->name, name, len) == 0) - mark = snap; - - if (mark == NULL) - return; - - WT_SNAPSHOT_FOREACH(snapbase, snap) { - F_SET(snap, WT_SNAP_DELETE); - - if (snap == mark) - break; - } -} - -/* - * __wt_snapshot -- - * Snapshot a tree. - */ -int -__wt_snapshot(WT_SESSION_IMPL *session, const char *cfg[]) -{ - WT_BTREE *btree; - WT_CONFIG dropconf; - WT_CONFIG_ITEM cval, k, v; - WT_DECL_RET; - WT_SNAPSHOT *deleted, *snap, *snapbase; - const char *name; - char *name_alloc; - int force, tracked; - - btree = session->btree; - force = tracked = 0; - snap = snapbase = NULL; - name_alloc = NULL; - - /* Snapshots are single-threaded. */ - __wt_writelock(session, btree->snaplock); - - /* - * Get the list of snapshots for this file. If there's no reference, - * this file is dead. Discard it from the cache without bothering to - * write any dirty pages. - */ - if ((ret = - __wt_meta_snaplist_get(session, btree->name, &snapbase)) != 0) { - if (ret == WT_NOTFOUND) - ret = __wt_bt_cache_flush( - session, NULL, WT_SYNC_DISCARD_NOWRITE, 0); - goto err; - } - - /* - * This may be a named snapshot, check the configuration. If it's a - * named snapshot, set force, we have to create the snapshot even if - * the tree is clean. - */ - cval.len = 0; - if (cfg != NULL) - WT_ERR(__wt_config_gets(session, cfg, "name", &cval)); - if (cval.len == 0) - name = WT_INTERNAL_SNAPSHOT; - else { - force = 1; - WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc)); - name = name_alloc; - } - - /* - * We may be dropping snapshots, check the configuration. If we're - * dropping snapshots, set force, we have to create the snapshot even - * if the tree is clean. - */ - if (cfg != NULL) { - cval.len = 0; - WT_ERR(__wt_config_gets(session, cfg, "drop", &cval)); - if (cval.len != 0) { - WT_ERR(__wt_config_subinit(session, &dropconf, &cval)); - while ((ret = - __wt_config_next(&dropconf, &k, &v)) == 0) { - force = 1; - - if (v.len == 0) - __drop(snapbase, k.str, k.len); - else if (k.len == strlen("from") && - strncmp(k.str, "from", k.len) == 0) - __drop_from(snapbase, v.str, v.len); - else if (k.len == strlen("to") && - strncmp(k.str, "to", k.len) == 0) - __drop_to(snapbase, v.str, v.len); - else - WT_ERR_MSG(session, EINVAL, - "unexpected value for snapshot " - "key: %.*s", - (int)k.len, k.str); - } - WT_ERR_NOTFOUND_OK(ret); - } - } - - /* Discard snapshots named the same as the snapshot being created. */ - __drop(snapbase, name, strlen(name)); - - /* Add a new snapshot entry at the end of the list. */ - WT_SNAPSHOT_FOREACH(snapbase, snap) - ; - WT_ERR(__wt_strdup(session, name, &snap->name)); - F_SET(snap, WT_SNAP_ADD); - - /* - * Lock the snapshots that will be deleted. - * - * Snapshots are only locked when tracking is enabled, which covers - * sync and drop operations, but not close. The reasoning is that - * there should be no access to a snapshot during close, because any - * thread accessing a snapshot will also have the current file handle - * open. - */ - if (WT_META_TRACKING(session)) - WT_SNAPSHOT_FOREACH(snapbase, deleted) - if (F_ISSET(deleted, WT_SNAP_DELETE)) - WT_ERR(__wt_session_lock_snapshot(session, - deleted->name, WT_BTREE_EXCLUSIVE)); - - /* Flush the file from the cache, creating the snapshot. */ - WT_ERR(__wt_bt_cache_flush( - session, snapbase, cfg == NULL ? WT_SYNC_DISCARD : WT_SYNC, force)); - - /* If there was a snapshot, update the metadata and resolve it. */ - if (snap->raw.data == NULL) { - if (force) - WT_ERR_MSG(session, - EINVAL, "cache flush failed to create a snapshot"); - } else { - WT_ERR(__wt_meta_snaplist_set(session, btree->name, snapbase)); - /* - * If tracking is enabled, defer making pages available until - * the end of the transaction. The exception is if the handle - * is being discarded: in that case, it will be gone by the - * time we try to apply or unroll the meta tracking event. - */ - if (WT_META_TRACKING(session) && cfg != NULL) { - WT_ERR(__wt_meta_track_checkpoint(session)); - tracked = 1; - } else - WT_ERR(__wt_bm_snapshot_resolve(session, snapbase)); - } - -err: __wt_meta_snaplist_free(session, snapbase); - if (!tracked) - __wt_rwunlock(session, btree->snaplock); - - __wt_free(session, name_alloc); - - return (ret); -} diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c index 19db1677638..53b3a0d87d1 100644 --- a/src/utilities/util_list.c +++ b/src/utilities/util_list.c @@ -154,7 +154,7 @@ static int list_print_checkpoint(WT_SESSION *session, const char *key) { WT_DECL_RET; - WT_SNAPSHOT *snap, *snapbase; + WT_CKPT *ckpt, *ckptbase; size_t len; time_t t; uint64_t v; @@ -163,24 +163,23 @@ list_print_checkpoint(WT_SESSION *session, const char *key) /* * We may not find any checkpoints for this file, in which case we don't * report an error, and continue our caller's loop. Otherwise, read the - * list of snapshots (which is the same as the list of checkpoints), and - * print each snapshot's name and time. + * list of checkpoints and print each checkpoint's name and time. */ - if ((ret = __wt_metadata_get_snaplist(session, key, &snapbase)) != 0) + if ((ret = __wt_metadata_get_ckptlist(session, key, &ckptbase)) != 0) return (ret == WT_NOTFOUND ? 0 : ret); /* Find the longest name, so we can pretty-print. */ len = 0; - WT_SNAPSHOT_FOREACH(snapbase, snap) - if (strlen(snap->name) > len) - len = strlen(snap->name); + WT_CKPT_FOREACH(ckptbase, ckpt) + if (strlen(ckpt->name) > len) + len = strlen(ckpt->name); ++len; - WT_SNAPSHOT_FOREACH(snapbase, snap) { - t = (time_t)snap->sec; - printf("\t%*s: %.24s", (int)len, snap->name, ctime_r(&t, buf)); + WT_CKPT_FOREACH(ckptbase, ckpt) { + t = (time_t)ckpt->sec; + printf("\t%*s: %.24s", (int)len, ckpt->name, ctime_r(&t, buf)); - v = snap->snapshot_size; + v = ckpt->ckpt_size; if (v >= WT_PETABYTE) printf(" (%" PRIu64 " PB)\n", v / WT_PETABYTE); else if (v >= WT_TERABYTE) @@ -195,7 +194,7 @@ list_print_checkpoint(WT_SESSION *session, const char *key) printf(" (%" PRIu64 " B)\n", v); } - __wt_metadata_free_snaplist(session, snapbase); + __wt_metadata_free_ckptlist(session, ckptbase); return (0); } |