author     Michael Cahill <michael.cahill@mongodb.com>    2016-03-22 13:15:02 +1100
committer  Michael Cahill <michael.cahill@mongodb.com>    2016-03-22 13:15:02 +1100
commit     a060ab97aed904e3fa49a0351c4100c18063097b (patch)
tree       528a59d5515b3e0aefa0334db089b0cd58b9f5b2 /src
parent     bad1f3a25baf4fe80358ce2304388fba402fc479 (diff)
parent     2371e490429ba3780e75e2910a2e686ae77ad040 (diff)
download   mongo-a060ab97aed904e3fa49a0351c4100c18063097b.tar.gz
Merge branch 'develop' into wt-2501-lsm-drop
Diffstat (limited to 'src')
-rw-r--r--   src/btree/bt_sync.c              37
-rw-r--r--   src/btree/bt_vrfy.c               2
-rw-r--r--   src/cursor/cur_backup.c           5
-rw-r--r--   src/include/extern.h              4
-rw-r--r--   src/include/lsm.h                19
-rw-r--r--   src/lsm/lsm_manager.c             4
-rw-r--r--   src/lsm/lsm_merge.c               2
-rw-r--r--   src/lsm/lsm_tree.c               30
-rw-r--r--   src/lsm/lsm_work_unit.c           6
-rw-r--r--   src/schema/schema_worker.c        2
-rw-r--r--   src/session/session_dhandle.c     2
-rw-r--r--   src/txn/txn_ckpt.c              316
12 files changed, 236 insertions, 193 deletions
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 8b54087794f..57056eb5c99 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -269,24 +269,18 @@ err:	/* On error, clear any left-over tree walk. */
  *	Cache operations.
  */
 int
-__wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op)
+__wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op)
 {
-    WT_DECL_RET;
-    WT_BTREE *btree;
-
-    btree = S2BT(session);
-
     switch (op) {
     case WT_SYNC_CHECKPOINT:
     case WT_SYNC_CLOSE:
         /*
-         * Set the checkpoint reference for reconciliation; it's ugly,
-         * but drilling a function parameter path from our callers to
-         * the reconciliation of the tree's root page is going to be
-         * worse.
+         * Make sure the checkpoint reference is set for
+         * reconciliation; it's ugly, but drilling a function parameter
+         * path from our callers to the reconciliation of the tree's
+         * root page is going to be worse.
          */
-        WT_ASSERT(session, btree->ckpt == NULL);
-        btree->ckpt = ckptbase;
+        WT_ASSERT(session, S2BT(session)->ckpt != NULL);
         break;
     case WT_SYNC_DISCARD:
     case WT_SYNC_WRITE_LEAVES:
@@ -296,23 +290,10 @@ __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op)
     switch (op) {
     case WT_SYNC_CHECKPOINT:
     case WT_SYNC_WRITE_LEAVES:
-        WT_ERR(__sync_file(session, op));
-        break;
+        return (__sync_file(session, op));
     case WT_SYNC_CLOSE:
     case WT_SYNC_DISCARD:
-        WT_ERR(__wt_evict_file(session, op));
-        break;
+        return (__wt_evict_file(session, op));
+    WT_ILLEGAL_VALUE(session);
     }
-
-err:	switch (op) {
-    case WT_SYNC_CHECKPOINT:
-    case WT_SYNC_CLOSE:
-        btree->ckpt = NULL;
-        break;
-    case WT_SYNC_DISCARD:
-    case WT_SYNC_WRITE_LEAVES:
-        break;
-    }
-
-    return (ret);
 }
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index ae2c20be1b6..952298f2456 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -226,7 +226,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
         WT_WITH_PAGE_INDEX(session,
             ret = __verify_tree(session, &btree->root, vs));
 
-        WT_TRET(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+        WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD));
     }
 
     /* Unload the checkpoint. */
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index b097a8c08aa..2fb0c464a76 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -140,8 +140,9 @@ __wt_curbackup_open(WT_SESSION_IMPL *session,
      * Start the backup and fill in the cursor's list. Acquire the schema
      * lock, we need a consistent view when creating a copy.
      */
-    WT_WITH_SCHEMA_LOCK(session, ret,
-        ret = __backup_start(session, cb, cfg));
+    WT_WITH_CHECKPOINT_LOCK(session, ret,
+        WT_WITH_SCHEMA_LOCK(session, ret,
+            ret = __backup_start(session, cb, cfg)));
     WT_ERR(ret);
 
     /*
      * __wt_cursor_init is last so we don't have to clean up on error.
      */
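The bt_sync.c hunk above changes the ownership of the checkpoint reference: __wt_cache_op no longer takes a ckptbase argument and installs/clears btree->ckpt itself, it only asserts the caller has already set it. A standalone sketch of that calling convention, using invented names (tree_t, cache_op, checkpoint_one_tree) rather than the real WiredTiger types:

/*
 * Hedged sketch, not WiredTiger source: it mimics the ownership change
 * above, where the caller installs the checkpoint reference before the
 * cache operation and clears it afterwards, while the cache operation
 * merely asserts the reference is present.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

typedef struct {
    const char *ckpt;    /* stands in for btree->ckpt */
} tree_t;

static int
cache_op(tree_t *tree)
{
    /* New contract: the checkpoint reference must already be set. */
    assert(tree->ckpt != NULL);
    printf("flushing with checkpoint list %s\n", tree->ckpt);
    return (0);
}

static int
checkpoint_one_tree(tree_t *tree, const char *ckptbase)
{
    int ret;

    tree->ckpt = ckptbase;    /* Caller owns the reference... */
    ret = cache_op(tree);
    tree->ckpt = NULL;        /* ...and clears it on every path. */
    return (ret);
}

int
main(void)
{
    tree_t tree = { NULL };

    return (checkpoint_one_tree(&tree, "WiredTigerCheckpoint"));
}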
diff --git a/src/include/extern.h b/src/include/extern.h
index 6f5d3ff66d5..48c52d4a109 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -168,7 +168,7 @@ extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
 extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref);
 extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref);
 extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
-extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op);
+extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op);
 extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, bool empty_page_ok);
@@ -754,7 +754,7 @@ extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
 extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_txn_global_destroy(WT_SESSION_IMPL *session);
 extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len);
-extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]);
diff --git a/src/include/lsm.h b/src/include/lsm.h
index eacb4d52d3e..444073087df 100644
--- a/src/include/lsm.h
+++ b/src/include/lsm.h
@@ -242,13 +242,18 @@ struct __wt_lsm_tree {
     int64_t lsm_lookup_no_bloom;
     int64_t lsm_merge_throttle;
 
-#define WT_LSM_TREE_ACTIVE           0x01  /* Workers are active */
-#define WT_LSM_TREE_AGGRESSIVE_TIMER 0x02  /* Timer for merge aggression */
-#define WT_LSM_TREE_COMPACTING       0x04  /* Tree being compacted */
-#define WT_LSM_TREE_MERGES           0x08  /* Tree should run merges */
-#define WT_LSM_TREE_NEED_SWITCH      0x10  /* New chunk needs creating */
-#define WT_LSM_TREE_OPEN             0x20  /* The tree is open */
-#define WT_LSM_TREE_THROTTLE         0x40  /* Throttle updates */
+    /*
+     * The tree is open for business. This used to be a flag, but it is
+     * susceptible to races.
+     */
+    bool active;
+
+#define WT_LSM_TREE_AGGRESSIVE_TIMER 0x01  /* Timer for merge aggression */
+#define WT_LSM_TREE_COMPACTING       0x02  /* Tree being compacted */
+#define WT_LSM_TREE_MERGES           0x04  /* Tree should run merges */
+#define WT_LSM_TREE_NEED_SWITCH      0x08  /* New chunk needs creating */
+#define WT_LSM_TREE_OPEN             0x10  /* The tree is open */
+#define WT_LSM_TREE_THROTTLE         0x20  /* Throttle updates */
     uint32_t flags;
 };
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index 24707abdb5a..943a5894ab3 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -390,7 +390,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
         F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST);
         dhandle_locked = true;
         TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
-            if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+            if (!lsm_tree->active)
                 continue;
             WT_ERR(__wt_epoch(session, &now));
             pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 0 :
@@ -650,7 +650,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
      * is checked.
      */
     (void)__wt_atomic_add32(&lsm_tree->queue_ref, 1);
-    if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
+    if (!lsm_tree->active) {
         (void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1);
         return (0);
     }
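The lsm.h and lsm_manager.c hunks above replace the WT_LSM_TREE_ACTIVE bit in the shared flags word with a dedicated bool, because flag updates are unlocked read-modify-write operations. A small standalone illustration of the lost-update hazard that comment refers to, with invented names and values (not WiredTiger code):

/*
 * Hedged sketch, not WiredTiger source: F_SET/F_CLR-style updates are
 * read-modify-write on one shared word, so a thread clearing ACTIVE can
 * race with a thread setting NEED_SWITCH and one of the two updates is
 * silently lost. A dedicated field for the frequently toggled state has
 * no such interaction with the other bits.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TREE_ACTIVE       0x01u  /* old scheme: one bit among many */
#define TREE_NEED_SWITCH  0x08u

struct tree {
    uint32_t flags;    /* shared word, updated without a lock */
    bool active;       /* new scheme: dedicated field */
};

int
main(void)
{
    struct tree t = { TREE_ACTIVE, true };

    /*
     * Thread A: t.flags &= ~TREE_ACTIVE;
     * Thread B: t.flags |= TREE_NEED_SWITCH;
     * Both may read the original value 0x01, so whichever store lands
     * last discards the other update. Clearing t.active instead cannot
     * clobber unrelated flag bits.
     */
    t.active = false;
    t.flags |= TREE_NEED_SWITCH;
    printf("flags=0x%x active=%d\n", (unsigned)t.flags, t.active);
    return (0);
}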
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index 973043f334f..6d907284546 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -463,7 +463,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
 #define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND
     for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
         if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
-            if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+            if (!lsm_tree->active)
                 WT_ERR(EINTR);
 
             WT_STAT_FAST_CONN_INCRV(session,
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 9d311a6edf3..7fb3cfc0e95 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -91,8 +91,13 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool wait)
     WT_DECL_RET;
     int i;
 
-    /* Stop any active merges. */
-    F_CLR(lsm_tree, WT_LSM_TREE_ACTIVE);
+    /*
+     * Stop any new work units being added. The barrier is necessary
+     * because we rely on the state change being visible before checking
+     * the tree queue state.
+     */
+    lsm_tree->active = false;
+    WT_READ_BARRIER();
 
     /*
      * Wait for all LSM operations to drain. If WiredTiger is shutting
@@ -124,7 +129,7 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool wait)
     }
 
     return (0);
-err:	F_SET(lsm_tree, WT_LSM_TREE_ACTIVE);
+err:	lsm_tree->active = true;
     return (ret);
 }
@@ -388,10 +393,8 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
             if (__lsm_tree_close(
                 session, lsm_tree, false) != 0 ||
                 lsm_tree->refcnt != 1) {
-                (void)__wt_atomic_sub32(
-                    &lsm_tree->refcnt, 1);
-                F_SET(lsm_tree, WT_LSM_TREE_ACTIVE);
-                lsm_tree->excl_session = NULL;
+                __wt_lsm_tree_release(
+                    session, lsm_tree);
                 return (EBUSY);
             }
         } else {
@@ -404,8 +407,8 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
             if (lsm_tree->excl_session != NULL) {
                 WT_ASSERT(session,
                     lsm_tree->refcnt > 0);
-                (void)__wt_atomic_sub32(
-                    &lsm_tree->refcnt, 1);
+                __wt_lsm_tree_release(
+                    session, lsm_tree);
                 return (EBUSY);
             }
         }
@@ -505,7 +508,9 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
 
     /* Now the tree is setup, make it visible to others. */
     TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
-    F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN);
+    if (!exclusive)
+        lsm_tree->active = true;
+    F_SET(lsm_tree, WT_LSM_TREE_OPEN);
 
     *treep = lsm_tree;
 
@@ -546,7 +551,7 @@ __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
     WT_ASSERT(session, lsm_tree->refcnt > 0);
     if (lsm_tree->excl_session == session) {
         /* We cleared the active flag when getting exclusive access. */
-        F_SET(lsm_tree, WT_LSM_TREE_ACTIVE);
+        lsm_tree->active = true;
         lsm_tree->excl_session = NULL;
     }
     (void)__wt_atomic_sub32(&lsm_tree->refcnt, 1);
@@ -1221,7 +1226,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
     }
 
     /* Wait for the work unit queues to drain. */
-    while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
+    while (lsm_tree->active) {
         /*
          * The flush flag is cleared when the chunk has been flushed.
          * Continue to push forced flushes until the chunk is on disk.
@@ -1303,7 +1308,6 @@ err:	__wt_lsm_tree_release(session, lsm_tree);
     return (ret);
-
 }
 
 /*
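__lsm_tree_close above now publishes active = false and issues a barrier before looking at the tree's queue state, pairing with __wt_lsm_manager_push_entry, which bumps queue_ref and then re-checks active. A simplified standalone sketch of that ordering, using C11 atomics in place of WT_READ_BARRIER; the names and structure are illustrative only, not the WiredTiger implementation:

/*
 * Hedged sketch: producers publish a queue reference and then re-check
 * "active"; the closer clears "active" first and only then looks at the
 * queue count, so a work unit cannot be queued after close has decided
 * the tree is idle.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool active;
static atomic_uint queue_ref;

/* Producer side (cf. __wt_lsm_manager_push_entry). */
static int
push_entry(void)
{
    atomic_fetch_add(&queue_ref, 1);
    if (!atomic_load(&active)) {            /* tree is shutting down */
        atomic_fetch_sub(&queue_ref, 1);
        return (0);                         /* drop the work unit */
    }
    /* ... queue the work unit ... */
    return (1);
}

/* Closer side (cf. __lsm_tree_close). */
static void
tree_close(void)
{
    atomic_store(&active, false);   /* publish the state change first */
    while (atomic_load(&queue_ref) != 0)
        ;                           /* then wait for the queue to drain */
}

int
main(void)
{
    atomic_store(&active, true);
    if (push_entry())
        atomic_fetch_sub(&queue_ref, 1);    /* a worker consumed it */
    tree_close();
    printf("drained, queue_ref=%u\n", atomic_load(&queue_ref));
    return (0);
}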
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index 88054f86d65..87771e2cb6c 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -29,7 +29,7 @@ __lsm_copy_chunks(WT_SESSION_IMPL *session,
     cookie->nchunks = 0;
 
     WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
-    if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+    if (!lsm_tree->active)
         return (__wt_lsm_tree_readunlock(session, lsm_tree));
 
     /* Take a copy of the current state of the LSM tree. */
@@ -79,7 +79,7 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
     WT_ASSERT(session, lsm_tree->queue_ref > 0);
 
     WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
-    if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE) || lsm_tree->nchunks == 0)
+    if (!lsm_tree->active || lsm_tree->nchunks == 0)
         return (__wt_lsm_tree_readunlock(session, lsm_tree));
 
     /* Search for a chunk to evict and/or a chunk to flush. */
@@ -322,7 +322,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
          */
         saved_isolation = session->txn.isolation;
         session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
-        ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
+        ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES);
         session->txn.isolation = saved_isolation;
         WT_TRET(__wt_session_release_btree(session));
     }
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index e60a7107786..52be76bb7a5 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -126,7 +126,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
                 dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
         else if (file_func == __wt_checkpoint)
             ;
-        else if (file_func == __wt_checkpoint_list)
+        else if (file_func == __wt_checkpoint_get_handles)
             ;
         else if (file_func == __wt_checkpoint_sync)
             ;
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index 242d9ac5cc4..ddf4d3dfa33 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -577,7 +577,7 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint)
      * files, since changes to the underlying file are visible to the in
      * memory pages.
      */
-    WT_ERR(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+    WT_ERR(__wt_cache_op(session, WT_SYNC_DISCARD));
 
     /*
      * We lock checkpoint handles that we are overwriting, so the handle
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index de8f0d84951..a4a24b41a89 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -8,6 +8,10 @@
 
 #include "wt_internal.h"
 
+static int __checkpoint_lock_tree(
+    WT_SESSION_IMPL *, bool, bool, const char *[]);
+static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]);
+
 /*
  * __wt_checkpoint_name_ok --
  *	Complain if the checkpoint name isn't acceptable.
@@ -224,11 +228,11 @@ __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
 }
 
 /*
- * __wt_checkpoint_list --
+ * __wt_checkpoint_get_handles --
  *	Get a list of handles to flush.
  */
 int
-__wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
+__wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
 {
     WT_DECL_RET;
     const char *name;
@@ -254,6 +258,13 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
     if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) != 0)
         return (ret == EBUSY ? 0 : ret);
+    WT_SAVE_DHANDLE(session,
+        ret = __checkpoint_lock_tree(session, true, true, cfg));
+    if (ret != 0) {
+        WT_TRET(__wt_session_release_btree(session));
+        return (ret);
+    }
+
     session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle;
     return (0);
 }
@@ -267,7 +278,7 @@ __checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[])
 {
     WT_UNUSED(cfg);
 
-    return (__wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES));
+    return (__wt_cache_op(session, WT_SYNC_WRITE_LEAVES));
 }
 
 /*
@@ -371,15 +382,20 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
     /* Configure logging only if doing a full checkpoint. */
     logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED);
 
+    /* Keep track of handles acquired for locking. */
+    WT_ERR(__wt_meta_track_on(session));
+    tracking = true;
+
     /*
      * Get a list of handles we want to flush; this may pull closed objects
      * into the session cache, but we're going to do that eventually anyway.
      */
+    WT_ASSERT(session, session->ckpt_handle_next == 0);
     WT_WITH_SCHEMA_LOCK(session, ret,
         WT_WITH_TABLE_LOCK(session, ret,
             WT_WITH_HANDLE_LIST_LOCK(session,
                 ret = __checkpoint_apply_all(
-                    session, cfg, __wt_checkpoint_list, NULL))));
+                    session, cfg, __wt_checkpoint_get_handles, NULL))));
     WT_ERR(ret);
 
     /*
@@ -410,10 +426,6 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
      */
     WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync));
 
-    /* Start the checkpoint for real. */
-    WT_ERR(__wt_meta_track_on(session));
-    tracking = true;
-
     /* Tell logging that we are about to start a database checkpoint. */
     if (full && logging)
         WT_ERR(__wt_txn_checkpoint_log(
@@ -426,6 +438,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
     WT_ERR(__wt_epoch(session, &start));
 
     /*
+     * Start the checkpoint for real.
+     *
      * Bump the global checkpoint generation, used to figure out whether
      * checkpoint has visited a tree. There is no need for this to be
      * atomic: it is only written while holding the checkpoint lock.
@@ -489,7 +503,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
         WT_ERR(__wt_txn_checkpoint_log(
             session, full, WT_TXN_LOG_CKPT_START, NULL));
 
-    WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint));
+    WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper));
 
     /*
      * Clear the dhandle so the visibility check doesn't get confused about
@@ -752,14 +766,13 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
 }
 
 /*
- * __checkpoint_worker --
- *	Checkpoint a tree.
+ * __checkpoint_lock_tree --
+ *	Acquire the locks required to checkpoint a tree.
  */
 static int
-__checkpoint_worker(WT_SESSION_IMPL *session,
-    const char *cfg[], bool is_checkpoint, bool need_tracking)
+__checkpoint_lock_tree(WT_SESSION_IMPL *session,
+    bool is_checkpoint, bool need_tracking, const char *cfg[])
 {
-    WT_BM *bm;
     WT_BTREE *btree;
     WT_CKPT *ckpt, *ckptbase;
     WT_CONFIG dropconf;
@@ -767,19 +780,15 @@ __checkpoint_worker(WT_SESSION_IMPL *session,
     WT_CONNECTION_IMPL *conn;
     WT_DATA_HANDLE *dhandle;
     WT_DECL_RET;
-    WT_LSN ckptlsn;
-    int deleted, was_modified;
-    bool fake_ckpt, force, hot_backup_locked;
-    const char *name;
     char *name_alloc;
+    const char *name;
+    bool hot_backup_locked;
 
     btree = S2BT(session);
-    bm = btree->bm;
     conn = S2C(session);
     ckpt = ckptbase = NULL;
     dhandle = session->dhandle;
-    was_modified = btree->modified;
-    fake_ckpt = hot_backup_locked = false;
+    hot_backup_locked = false;
     name_alloc = NULL;
 
     /*
@@ -798,15 +807,6 @@ __checkpoint_worker(WT_SESSION_IMPL *session,
     WT_ASSERT(session, !need_tracking ||
         WT_IS_METADATA(session, dhandle) || WT_META_TRACKING(session));
 
-    /*
-     * Set the checkpoint LSN to the maximum LSN so that if logging is
-     * disabled, recovery will never roll old changes forward over the
-     * non-logged changes in this checkpoint. If logging is enabled, a
-     * real checkpoint LSN will be assigned later for this checkpoint and
-     * overwrite this.
-     */
-    WT_MAX_LSN(&ckptlsn);
-
     /* Get the list of checkpoints for this file. */
     WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase));
 
@@ -857,74 +857,15 @@ __checkpoint_worker(WT_SESSION_IMPL *session,
     /* Drop checkpoints with the same name as the one we're taking. */
     __drop(ckptbase, name, strlen(name));
 
-    /*
-     * Check for clean objects not requiring a checkpoint.
-     *
-     * If we're closing a handle, and the object is clean, we can skip the
-     * checkpoint, whatever checkpoints we have are sufficient. (We might
-     * not have any checkpoints if the object was never modified, and that's
-     * OK: the object creation code doesn't mark the tree modified so we can
-     * skip newly created trees here.)
-     *
-     * If the application repeatedly checkpoints an object (imagine hourly
-     * checkpoints using the same explicit or internal name), there's no
-     * reason to repeat the checkpoint for clean objects. The test is if
-     * the only checkpoint we're deleting is the last one in the list and
-     * it has the same name as the checkpoint we're about to take, skip the
-     * work. (We can't skip checkpoints that delete more than the last
-     * checkpoint because deleting those checkpoints might free up space in
-     * the file.) This means an application toggling between two (or more)
-     * checkpoint names will repeatedly take empty checkpoints, but that's
-     * not likely enough to make detection worthwhile.
-     *
-     * Checkpoint read-only objects otherwise: the application must be able
-     * to open the checkpoint in a cursor after taking any checkpoint, which
-     * means it must exist.
-     */
-    force = false;
-    F_CLR(btree, WT_BTREE_SKIP_CKPT);
-    if (!btree->modified && cfg != NULL) {
-        ret = __wt_config_gets(session, cfg, "force", &cval);
-        if (ret != 0 && ret != WT_NOTFOUND)
-            WT_ERR(ret);
-        if (ret == 0 && cval.val != 0)
-            force = true;
-    }
-    if (!btree->modified && !force) {
-        if (!is_checkpoint)
-            goto nockpt;
-
-        deleted = 0;
-        WT_CKPT_FOREACH(ckptbase, ckpt)
-            if (F_ISSET(ckpt, WT_CKPT_DELETE))
-                ++deleted;
-        /*
-         * Complicated test: if the last checkpoint in the object has
-         * the same name as the checkpoint we're taking (correcting for
-         * internal checkpoint names with their generational suffix
-         * numbers), we can skip the checkpoint, there's nothing to do.
-         * The exception is if we're deleting two or more checkpoints:
-         * then we may save space.
-         */
-        if (ckpt > ckptbase &&
-            (strcmp(name, (ckpt - 1)->name) == 0 ||
-            (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
-            WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) &&
-            deleted < 2) {
-nockpt:     F_SET(btree, WT_BTREE_SKIP_CKPT);
-            WT_PUBLISH(btree->checkpoint_gen,
-                S2C(session)->txn_global.checkpoint_gen);
-            WT_STAT_FAST_DATA_SET(session,
-                btree_checkpoint_generation,
-                btree->checkpoint_gen);
-            goto done;
-        }
-    }
-
     /* Add a new checkpoint entry at the end of the list. */
     WT_CKPT_FOREACH(ckptbase, ckpt)
         ;
     WT_ERR(__wt_strdup(session, name, &ckpt->name));
+    /*
+     * We are now done with the local use of the name. Free the local
+     * allocation, if needed.
+     */
+    __wt_free(session, name_alloc);
     F_SET(ckpt, WT_CKPT_ADD);
 
     /*
@@ -1005,32 +946,119 @@ nockpt:	F_SET(btree, WT_BTREE_SKIP_CKPT);
      * copy instead of forcing checkpoints on clean objects to associate
      * names with checkpoints.
      */
-    if (is_checkpoint)
-        switch (F_MASK(btree, WT_BTREE_SPECIAL_FLAGS)) {
-        case 0:
-            break;
-        case WT_BTREE_BULK:
-            /*
-             * The only checkpoints a bulk-loaded file should have
-             * are fake ones we created without the underlying block
-             * manager. I'm leaving this code here because it's a
-             * cheap test and a nasty race.
-             */
-            WT_CKPT_FOREACH(ckptbase, ckpt)
-                if (!F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_FAKE))
-                    WT_ERR_MSG(session, ret,
                        "block-manager checkpoint found "
                        "for a bulk-loaded file");
-            fake_ckpt = true;
-            goto fake;
-        case WT_BTREE_REBALANCE:
-        case WT_BTREE_SALVAGE:
-        case WT_BTREE_UPGRADE:
-        case WT_BTREE_VERIFY:
-            WT_ERR_MSG(session, EINVAL,
                "checkpoints are blocked during rebalance, "
                "salvage, upgrade or verify operations");
+    WT_ASSERT(session,
+        !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS));
+
+    hot_backup_locked = false;
+    WT_ERR(__wt_readunlock(session, conn->hot_backup_lock));
+
+    WT_ASSERT(session, btree->ckpt == NULL);
+    btree->ckpt = ckptbase;
+
+    return (0);
+
+err:	if (hot_backup_locked)
+        WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
+
+    __wt_meta_ckptlist_free(session, ckptbase);
+    __wt_free(session, name_alloc);
+
+    return (ret);
+}
+
+/*
+ * __checkpoint_tree --
+ *	Checkpoint a single tree.
+ *	Assumes all necessary locks have been acquired by the caller.
+ */
+static int
+__checkpoint_tree(
+    WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[])
+{
+    WT_BM *bm;
+    WT_BTREE *btree;
+    WT_CKPT *ckpt, *ckptbase;
+    WT_CONFIG_ITEM cval;
+    WT_CONNECTION_IMPL *conn;
+    WT_DATA_HANDLE *dhandle;
+    WT_DECL_RET;
+    WT_LSN ckptlsn;
+    const char *name;
+    int deleted, was_modified;
+    bool fake_ckpt, force;
+
+    btree = S2BT(session);
+    bm = btree->bm;
+    ckptbase = btree->ckpt;
+    conn = S2C(session);
+    dhandle = session->dhandle;
+    fake_ckpt = false;
+    was_modified = btree->modified;
+
+    /*
+     * Check for clean objects not requiring a checkpoint.
+     *
+     * If we're closing a handle, and the object is clean, we can skip the
+     * checkpoint, whatever checkpoints we have are sufficient. (We might
+     * not have any checkpoints if the object was never modified, and that's
+     * OK: the object creation code doesn't mark the tree modified so we can
+     * skip newly created trees here.)
+     *
+     * If the application repeatedly checkpoints an object (imagine hourly
+     * checkpoints using the same explicit or internal name), there's no
+     * reason to repeat the checkpoint for clean objects. The test is if
+     * the only checkpoint we're deleting is the last one in the list and
+     * it has the same name as the checkpoint we're about to take, skip the
+     * work. (We can't skip checkpoints that delete more than the last
+     * checkpoint because deleting those checkpoints might free up space in
+     * the file.) This means an application toggling between two (or more)
+     * checkpoint names will repeatedly take empty checkpoints, but that's
+     * not likely enough to make detection worthwhile.
+     *
+     * Checkpoint read-only objects otherwise: the application must be able
+     * to open the checkpoint in a cursor after taking any checkpoint, which
+     * means it must exist.
+     */
+    force = false;
+    F_CLR(btree, WT_BTREE_SKIP_CKPT);
+    if (!btree->modified && cfg != NULL) {
+        ret = __wt_config_gets(session, cfg, "force", &cval);
+        if (ret != 0 && ret != WT_NOTFOUND)
+            WT_ERR(ret);
+        if (ret == 0 && cval.val != 0)
+            force = true;
+    }
+    if (!btree->modified && !force) {
+        if (!is_checkpoint)
+            goto nockpt;
+
+        deleted = 0;
+        WT_CKPT_FOREACH(ckptbase, ckpt)
+            if (F_ISSET(ckpt, WT_CKPT_DELETE))
+                ++deleted;
+        /*
+         * Complicated test: if the tree is clean and last two
+         * checkpoints have the same name (correcting for internal
+         * checkpoint names with their generational suffix numbers), we
+         * can skip the checkpoint, there's nothing to do. The
+         * exception is if we're deleting two or more checkpoints: then
+         * we may save space.
+         */
+        name = (ckpt - 1)->name;
+        if (ckpt > ckptbase + 1 && deleted < 2 &&
+            (strcmp(name, (ckpt - 2)->name) == 0 ||
+            (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
+            WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) {
+nockpt:     F_SET(btree, WT_BTREE_SKIP_CKPT);
+            WT_PUBLISH(btree->checkpoint_gen,
+                S2C(session)->txn_global.checkpoint_gen);
+            WT_STAT_FAST_DATA_SET(session,
+                btree_checkpoint_generation,
+                btree->checkpoint_gen);
+            ret = 0;
+            goto err;
         }
+    }
 
     /*
      * If an object has never been used (in other words, if it could become
@@ -1077,6 +1105,15 @@ nockpt:	F_SET(btree, WT_BTREE_SKIP_CKPT);
     btree->modified = 0;
     WT_FULL_BARRIER();
 
+    /*
+     * Set the checkpoint LSN to the maximum LSN so that if logging is
+     * disabled, recovery will never roll old changes forward over the
+     * non-logged changes in this checkpoint. If logging is enabled, a
+     * real checkpoint LSN will be assigned for this checkpoint and
+     * overwrite this.
+     */
+    WT_MAX_LSN(&ckptlsn);
+
     /* Tell logging that a file checkpoint is starting. */
     if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
         WT_ERR(__wt_txn_checkpoint_log(
@@ -1084,9 +1121,9 @@ nockpt:	F_SET(btree, WT_BTREE_SKIP_CKPT);
 
     /* Flush the file from the cache, creating the checkpoint. */
     if (is_checkpoint)
-        WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CHECKPOINT));
+        WT_ERR(__wt_cache_op(session, WT_SYNC_CHECKPOINT));
     else
-        WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CLOSE));
+        WT_ERR(__wt_cache_op(session, WT_SYNC_CLOSE));
 
     /*
      * All blocks being written have been written; set the object's write
@@ -1144,7 +1181,6 @@ fake:	/*
         WT_ERR(__wt_txn_checkpoint_log(
             session, false, WT_TXN_LOG_CKPT_STOP, NULL));
 
-done:
 err:	/*
      * If the checkpoint didn't complete successfully, make sure the
      * tree is marked dirty.
@@ -1152,30 +1188,42 @@ err:	/*
     if (ret != 0 && !btree->modified && was_modified)
         btree->modified = 1;
 
-    if (hot_backup_locked)
-        WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
-
-    __wt_meta_ckptlist_free(session, ckptbase);
-    __wt_free(session, name_alloc);
+    btree->ckpt = NULL;
 
     return (ret);
 }
 
 /*
+ * __checkpoint_tree_helper --
+ *	Checkpoint a tree (suitable for use in *_apply functions).
+ */
+static int
+__checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[])
+{
+    return (__checkpoint_tree(session, true, cfg));
+}
+
+/*
  * __wt_checkpoint --
  *	Checkpoint a file.
  */
 int
 __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
 {
+    WT_DECL_RET;
+
     /* Should not be called with a checkpoint handle. */
     WT_ASSERT(session, session->dhandle->checkpoint == NULL);
 
-    /* Should be holding the schema lock. */
+    /* We must hold the metadata lock if checkpointing the metadata. */
     WT_ASSERT(session, !WT_IS_METADATA(session, session->dhandle) ||
         F_ISSET(session, WT_SESSION_LOCKED_METADATA));
 
-    return (__checkpoint_worker(session, cfg, true, true));
+    WT_SAVE_DHANDLE(session,
+        ret = __checkpoint_lock_tree(session, true, true, cfg));
+    WT_RET(ret);
+    return (__checkpoint_tree(session, true, cfg));
 }
 
 /*
@@ -1225,7 +1273,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
     if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
         F_SET(session->dhandle, WT_DHANDLE_DEAD);
     if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
-        return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+        return (__wt_cache_op(session, WT_SYNC_DISCARD));
 
     /*
      * If closing an unmodified file, check that no update is required
@@ -1234,7 +1282,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
     if (!btree->modified && !bulk) {
         __wt_txn_update_oldest(session, true);
         return (__wt_txn_visible_all(session, btree->rec_max_txn) ?
-            __wt_cache_op(session, NULL, WT_SYNC_DISCARD) : EBUSY);
+            __wt_cache_op(session, WT_SYNC_DISCARD) : EBUSY);
     }
 
     /*
@@ -1248,10 +1296,14 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
     if (need_tracking)
         WT_RET(__wt_meta_track_on(session));
 
-    WT_TRET(__checkpoint_worker(session, NULL, false, need_tracking));
+    WT_SAVE_DHANDLE(session,
+        ret = __checkpoint_lock_tree(session, false, need_tracking, NULL));
+    WT_ASSERT(session, ret == 0);
+    if (ret == 0)
+        ret = __checkpoint_tree(session, false, NULL);
 
     if (need_tracking)
-        WT_RET(__wt_meta_track_off(session, true, ret != 0));
+        WT_TRET(__wt_meta_track_off(session, true, ret != 0));
 
     return (ret);
 }
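Taken together, the txn_ckpt.c changes split the old __checkpoint_worker into a locking phase (__checkpoint_lock_tree, run while the handle list is gathered) and a write phase (__checkpoint_tree, run later with the locks and ckptbase already in place). A standalone sketch of that two-phase shape, with invented tree_t/lock_tree/checkpoint_tree names rather than the real WiredTiger entry points:

/*
 * Hedged sketch, not WiredTiger source: lock every tree up front, then
 * write each checkpoint with the per-tree state already prepared, the
 * ordering the refactored __txn_checkpoint relies on.
 */
#include <errno.h>
#include <stdio.h>

#define NTREES 3

typedef struct {
    int id;
    int locked;    /* stands in for ckptbase/hot-backup lock state */
} tree_t;

/* Phase 1: resolve and lock every tree up front (cf. get_handles). */
static int
lock_tree(tree_t *t)
{
    t->locked = 1;
    printf("locked tree %d\n", t->id);
    return (0);
}

/* Phase 2: write the checkpoint; the tree is already locked. */
static int
checkpoint_tree(tree_t *t)
{
    if (!t->locked)
        return (EINVAL);    /* would be an assertion failure upstream */
    printf("checkpointed tree %d\n", t->id);
    t->locked = 0;
    return (0);
}

int
main(void)
{
    tree_t trees[NTREES];
    int i, ret;

    for (i = 0; i < NTREES; i++) {
        trees[i].id = i;
        trees[i].locked = 0;
        if ((ret = lock_tree(&trees[i])) != 0)
            return (ret);
    }
    for (i = 0; i < NTREES; i++)
        if ((ret = checkpoint_tree(&trees[i])) != 0)
            return (ret);
    return (0);
}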