summary refs log tree commit diff
diff options
context:
space:
mode:
author Michael Cahill <michael.cahill@mongodb.com> 2016-03-22 13:15:02 +1100
committer Michael Cahill <michael.cahill@mongodb.com> 2016-03-22 13:15:02 +1100
commit a060ab97aed904e3fa49a0351c4100c18063097b (patch)
tree 528a59d5515b3e0aefa0334db089b0cd58b9f5b2
parent bad1f3a25baf4fe80358ce2304388fba402fc479 (diff)
parent 2371e490429ba3780e75e2910a2e686ae77ad040 (diff)
download mongo-a060ab97aed904e3fa49a0351c4100c18063097b.tar.gz
Merge branch 'develop' into wt-2501-lsm-drop
-rw-r--r-- src/btree/bt_sync.c 37
-rw-r--r-- src/btree/bt_vrfy.c 2
-rw-r--r-- src/cursor/cur_backup.c 5
-rw-r--r-- src/include/extern.h 4
-rw-r--r-- src/include/lsm.h 19
-rw-r--r-- src/lsm/lsm_manager.c 4
-rw-r--r-- src/lsm/lsm_merge.c 2
-rw-r--r-- src/lsm/lsm_tree.c 30
-rw-r--r-- src/lsm/lsm_work_unit.c 6
-rw-r--r-- src/schema/schema_worker.c 2
-rw-r--r-- src/session/session_dhandle.c 2
-rw-r--r-- src/txn/txn_ckpt.c 316
-rw-r--r-- test/suite/test_checkpoint01.py 2
13 files changed, 237 insertions, 194 deletions
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 8b54087794f..57056eb5c99 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -269,24 +269,18 @@ err: /* On error, clear any left-over tree walk. */
* Cache operations.
*/
int
-__wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op)
+__wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op)
{
- WT_DECL_RET;
- WT_BTREE *btree;
-
- btree = S2BT(session);
-
switch (op) {
case WT_SYNC_CHECKPOINT:
case WT_SYNC_CLOSE:
/*
- * Set the checkpoint reference for reconciliation; it's ugly,
- * but drilling a function parameter path from our callers to
- * the reconciliation of the tree's root page is going to be
- * worse.
+ * Make sure the checkpoint reference is set for
+ * reconciliation; it's ugly, but drilling a function parameter
+ * path from our callers to the reconciliation of the tree's
+ * root page is going to be worse.
*/
- WT_ASSERT(session, btree->ckpt == NULL);
- btree->ckpt = ckptbase;
+ WT_ASSERT(session, S2BT(session)->ckpt != NULL);
break;
case WT_SYNC_DISCARD:
case WT_SYNC_WRITE_LEAVES:
@@ -296,23 +290,10 @@ __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op)
switch (op) {
case WT_SYNC_CHECKPOINT:
case WT_SYNC_WRITE_LEAVES:
- WT_ERR(__sync_file(session, op));
- break;
+ return (__sync_file(session, op));
case WT_SYNC_CLOSE:
case WT_SYNC_DISCARD:
- WT_ERR(__wt_evict_file(session, op));
- break;
+ return (__wt_evict_file(session, op));
+ WT_ILLEGAL_VALUE(session);
}
-
-err: switch (op) {
- case WT_SYNC_CHECKPOINT:
- case WT_SYNC_CLOSE:
- btree->ckpt = NULL;
- break;
- case WT_SYNC_DISCARD:
- case WT_SYNC_WRITE_LEAVES:
- break;
- }
-
- return (ret);
}
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index ae2c20be1b6..952298f2456 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -226,7 +226,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
WT_WITH_PAGE_INDEX(session,
ret = __verify_tree(session, &btree->root, vs));
- WT_TRET(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+ WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD));
}
/* Unload the checkpoint. */
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index b097a8c08aa..2fb0c464a76 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -140,8 +140,9 @@ __wt_curbackup_open(WT_SESSION_IMPL *session,
* Start the backup and fill in the cursor's list. Acquire the schema
* lock, we need a consistent view when creating a copy.
*/
- WT_WITH_SCHEMA_LOCK(session, ret,
- ret = __backup_start(session, cb, cfg));
+ WT_WITH_CHECKPOINT_LOCK(session, ret,
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ ret = __backup_start(session, cb, cfg)));
WT_ERR(ret);
/* __wt_cursor_init is last so we don't have to clean up on error. */
diff --git a/src/include/extern.h b/src/include/extern.h
index 6f5d3ff66d5..48c52d4a109 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -168,7 +168,7 @@ extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
-extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op);
+extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op);
extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, bool empty_page_ok);
@@ -754,7 +754,7 @@ extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_txn_global_destroy(WT_SESSION_IMPL *session);
extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len);
-extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]);
diff --git a/src/include/lsm.h b/src/include/lsm.h
index eacb4d52d3e..444073087df 100644
--- a/src/include/lsm.h
+++ b/src/include/lsm.h
@@ -242,13 +242,18 @@ struct __wt_lsm_tree {
int64_t lsm_lookup_no_bloom;
int64_t lsm_merge_throttle;
-#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */
-#define WT_LSM_TREE_AGGRESSIVE_TIMER 0x02 /* Timer for merge aggression */
-#define WT_LSM_TREE_COMPACTING 0x04 /* Tree being compacted */
-#define WT_LSM_TREE_MERGES 0x08 /* Tree should run merges */
-#define WT_LSM_TREE_NEED_SWITCH 0x10 /* New chunk needs creating */
-#define WT_LSM_TREE_OPEN 0x20 /* The tree is open */
-#define WT_LSM_TREE_THROTTLE 0x40 /* Throttle updates */
+ /*
+ * The tree is open for business. This used to be a flag, but it is
+ * susceptible to races.
+ */
+ bool active;
+
+#define WT_LSM_TREE_AGGRESSIVE_TIMER 0x01 /* Timer for merge aggression */
+#define WT_LSM_TREE_COMPACTING 0x02 /* Tree being compacted */
+#define WT_LSM_TREE_MERGES 0x04 /* Tree should run merges */
+#define WT_LSM_TREE_NEED_SWITCH 0x08 /* New chunk needs creating */
+#define WT_LSM_TREE_OPEN 0x10 /* The tree is open */
+#define WT_LSM_TREE_THROTTLE 0x20 /* Throttle updates */
uint32_t flags;
};
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index 24707abdb5a..943a5894ab3 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -390,7 +390,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session)
F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST);
dhandle_locked = true;
TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
- if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ if (!lsm_tree->active)
continue;
WT_ERR(__wt_epoch(session, &now));
pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 0 :
@@ -650,7 +650,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
* is checked.
*/
(void)__wt_atomic_add32(&lsm_tree->queue_ref, 1);
- if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
+ if (!lsm_tree->active) {
(void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1);
return (0);
}
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index 973043f334f..6d907284546 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -463,7 +463,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
#define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND
for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
- if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ if (!lsm_tree->active)
WT_ERR(EINTR);
WT_STAT_FAST_CONN_INCRV(session,
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 9d311a6edf3..7fb3cfc0e95 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -91,8 +91,13 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool wait)
WT_DECL_RET;
int i;
- /* Stop any active merges. */
- F_CLR(lsm_tree, WT_LSM_TREE_ACTIVE);
+ /*
+ * Stop any new work units being added. The barrier is necessary
+ * because we rely on the state change being visible before checking
+ * the tree queue state.
+ */
+ lsm_tree->active = false;
+ WT_READ_BARRIER();
/*
* Wait for all LSM operations to drain. If WiredTiger is shutting
@@ -124,7 +129,7 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool wait)
}
return (0);
-err: F_SET(lsm_tree, WT_LSM_TREE_ACTIVE);
+err: lsm_tree->active = true;
return (ret);
}
@@ -388,10 +393,8 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
if (__lsm_tree_close(
session, lsm_tree, false) != 0 ||
lsm_tree->refcnt != 1) {
- (void)__wt_atomic_sub32(
- &lsm_tree->refcnt, 1);
- F_SET(lsm_tree, WT_LSM_TREE_ACTIVE);
- lsm_tree->excl_session = NULL;
+ __wt_lsm_tree_release(
+ session, lsm_tree);
return (EBUSY);
}
} else {
@@ -404,8 +407,8 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
if (lsm_tree->excl_session != NULL) {
WT_ASSERT(session,
lsm_tree->refcnt > 0);
- (void)__wt_atomic_sub32(
- &lsm_tree->refcnt, 1);
+ __wt_lsm_tree_release(
+ session, lsm_tree);
return (EBUSY);
}
}
@@ -505,7 +508,9 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
/* Now the tree is setup, make it visible to others. */
TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
- F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN);
+ if (!exclusive)
+ lsm_tree->active = true;
+ F_SET(lsm_tree, WT_LSM_TREE_OPEN);
*treep = lsm_tree;
@@ -546,7 +551,7 @@ __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_ASSERT(session, lsm_tree->refcnt > 0);
if (lsm_tree->excl_session == session) {
/* We cleared the active flag when getting exclusive access. */
- F_SET(lsm_tree, WT_LSM_TREE_ACTIVE);
+ lsm_tree->active = true;
lsm_tree->excl_session = NULL;
}
(void)__wt_atomic_sub32(&lsm_tree->refcnt, 1);
@@ -1221,7 +1226,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
}
/* Wait for the work unit queues to drain. */
- while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
+ while (lsm_tree->active) {
/*
* The flush flag is cleared when the chunk has been flushed.
* Continue to push forced flushes until the chunk is on disk.
@@ -1303,7 +1308,6 @@ err:
__wt_lsm_tree_release(session, lsm_tree);
return (ret);
-
}
/*
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index 88054f86d65..87771e2cb6c 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -29,7 +29,7 @@ __lsm_copy_chunks(WT_SESSION_IMPL *session,
cookie->nchunks = 0;
WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
- if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ if (!lsm_tree->active)
return (__wt_lsm_tree_readunlock(session, lsm_tree));
/* Take a copy of the current state of the LSM tree. */
@@ -79,7 +79,7 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
WT_ASSERT(session, lsm_tree->queue_ref > 0);
WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
- if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE) || lsm_tree->nchunks == 0)
+ if (!lsm_tree->active || lsm_tree->nchunks == 0)
return (__wt_lsm_tree_readunlock(session, lsm_tree));
/* Search for a chunk to evict and/or a chunk to flush. */
@@ -322,7 +322,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
*/
saved_isolation = session->txn.isolation;
session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
- ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
+ ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES);
session->txn.isolation = saved_isolation;
WT_TRET(__wt_session_release_btree(session));
}
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index e60a7107786..52be76bb7a5 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -126,7 +126,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
else if (file_func == __wt_checkpoint)
;
- else if (file_func == __wt_checkpoint_list)
+ else if (file_func == __wt_checkpoint_get_handles)
;
else if (file_func == __wt_checkpoint_sync)
;
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index 242d9ac5cc4..ddf4d3dfa33 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -577,7 +577,7 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint)
* files, since changes to the underlying file are visible to the in
* memory pages.
*/
- WT_ERR(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+ WT_ERR(__wt_cache_op(session, WT_SYNC_DISCARD));
/*
* We lock checkpoint handles that we are overwriting, so the handle
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index de8f0d84951..a4a24b41a89 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -8,6 +8,10 @@
#include "wt_internal.h"
+static int __checkpoint_lock_tree(
+ WT_SESSION_IMPL *, bool, bool, const char *[]);
+static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]);
+
/*
* __wt_checkpoint_name_ok --
* Complain if the checkpoint name isn't acceptable.
@@ -224,11 +228,11 @@ __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
}
/*
- * __wt_checkpoint_list --
+ * __wt_checkpoint_get_handles --
* Get a list of handles to flush.
*/
int
-__wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
+__wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_DECL_RET;
const char *name;
@@ -254,6 +258,13 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) != 0)
return (ret == EBUSY ? 0 : ret);
+ WT_SAVE_DHANDLE(session,
+ ret = __checkpoint_lock_tree(session, true, true, cfg));
+ if (ret != 0) {
+ WT_TRET(__wt_session_release_btree(session));
+ return (ret);
+ }
+
session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle;
return (0);
}
@@ -267,7 +278,7 @@ __checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_UNUSED(cfg);
- return (__wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES));
+ return (__wt_cache_op(session, WT_SYNC_WRITE_LEAVES));
}
/*
@@ -371,15 +382,20 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
/* Configure logging only if doing a full checkpoint. */
logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED);
+ /* Keep track of handles acquired for locking. */
+ WT_ERR(__wt_meta_track_on(session));
+ tracking = true;
+
/*
* Get a list of handles we want to flush; this may pull closed objects
* into the session cache, but we're going to do that eventually anyway.
*/
+ WT_ASSERT(session, session->ckpt_handle_next == 0);
WT_WITH_SCHEMA_LOCK(session, ret,
WT_WITH_TABLE_LOCK(session, ret,
WT_WITH_HANDLE_LIST_LOCK(session,
ret = __checkpoint_apply_all(
- session, cfg, __wt_checkpoint_list, NULL))));
+ session, cfg, __wt_checkpoint_get_handles, NULL))));
WT_ERR(ret);
/*
@@ -410,10 +426,6 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
*/
WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync));
- /* Start the checkpoint for real. */
- WT_ERR(__wt_meta_track_on(session));
- tracking = true;
-
/* Tell logging that we are about to start a database checkpoint. */
if (full && logging)
WT_ERR(__wt_txn_checkpoint_log(
@@ -426,6 +438,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_epoch(session, &start));
/*
+ * Start the checkpoint for real.
+ *
* Bump the global checkpoint generation, used to figure out whether
* checkpoint has visited a tree. There is no need for this to be
* atomic: it is only written while holding the checkpoint lock.
@@ -489,7 +503,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_txn_checkpoint_log(
session, full, WT_TXN_LOG_CKPT_START, NULL));
- WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint));
+ WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper));
/*
* Clear the dhandle so the visibility check doesn't get confused about
@@ -752,14 +766,13 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
}
/*
- * __checkpoint_worker --
- * Checkpoint a tree.
+ * __checkpoint_lock_tree --
+ * Acquire the locks required to checkpoint a tree.
*/
static int
-__checkpoint_worker(WT_SESSION_IMPL *session,
- const char *cfg[], bool is_checkpoint, bool need_tracking)
+__checkpoint_lock_tree(WT_SESSION_IMPL *session,
+ bool is_checkpoint, bool need_tracking, const char *cfg[])
{
- WT_BM *bm;
WT_BTREE *btree;
WT_CKPT *ckpt, *ckptbase;
WT_CONFIG dropconf;
@@ -767,19 +780,15 @@ __checkpoint_worker(WT_SESSION_IMPL *session,
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- WT_LSN ckptlsn;
- int deleted, was_modified;
- bool fake_ckpt, force, hot_backup_locked;
- const char *name;
char *name_alloc;
+ const char *name;
+ bool hot_backup_locked;
btree = S2BT(session);
- bm = btree->bm;
conn = S2C(session);
ckpt = ckptbase = NULL;
dhandle = session->dhandle;
- was_modified = btree->modified;
- fake_ckpt = hot_backup_locked = false;
+ hot_backup_locked = false;
name_alloc = NULL;
/*
@@ -798,15 +807,6 @@ __checkpoint_worker(WT_SESSION_IMPL *session,
WT_ASSERT(session, !need_tracking ||
WT_IS_METADATA(session, dhandle) || WT_META_TRACKING(session));
- /*
- * Set the checkpoint LSN to the maximum LSN so that if logging is
- * disabled, recovery will never roll old changes forward over the
- * non-logged changes in this checkpoint. If logging is enabled, a
- * real checkpoint LSN will be assigned later for this checkpoint and
- * overwrite this.
- */
- WT_MAX_LSN(&ckptlsn);
-
/* Get the list of checkpoints for this file. */
WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase));
@@ -857,74 +857,15 @@ __checkpoint_worker(WT_SESSION_IMPL *session,
/* Drop checkpoints with the same name as the one we're taking. */
__drop(ckptbase, name, strlen(name));
- /*
- * Check for clean objects not requiring a checkpoint.
- *
- * If we're closing a handle, and the object is clean, we can skip the
- * checkpoint, whatever checkpoints we have are sufficient. (We might
- * not have any checkpoints if the object was never modified, and that's
- * OK: the object creation code doesn't mark the tree modified so we can
- * skip newly created trees here.)
- *
- * If the application repeatedly checkpoints an object (imagine hourly
- * checkpoints using the same explicit or internal name), there's no
- * reason to repeat the checkpoint for clean objects. The test is if
- * the only checkpoint we're deleting is the last one in the list and
- * it has the same name as the checkpoint we're about to take, skip the
- * work. (We can't skip checkpoints that delete more than the last
- * checkpoint because deleting those checkpoints might free up space in
- * the file.) This means an application toggling between two (or more)
- * checkpoint names will repeatedly take empty checkpoints, but that's
- * not likely enough to make detection worthwhile.
- *
- * Checkpoint read-only objects otherwise: the application must be able
- * to open the checkpoint in a cursor after taking any checkpoint, which
- * means it must exist.
- */
- force = false;
- F_CLR(btree, WT_BTREE_SKIP_CKPT);
- if (!btree->modified && cfg != NULL) {
- ret = __wt_config_gets(session, cfg, "force", &cval);
- if (ret != 0 && ret != WT_NOTFOUND)
- WT_ERR(ret);
- if (ret == 0 && cval.val != 0)
- force = true;
- }
- if (!btree->modified && !force) {
- if (!is_checkpoint)
- goto nockpt;
-
- deleted = 0;
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if (F_ISSET(ckpt, WT_CKPT_DELETE))
- ++deleted;
- /*
- * Complicated test: if the last checkpoint in the object has
- * the same name as the checkpoint we're taking (correcting for
- * internal checkpoint names with their generational suffix
- * numbers), we can skip the checkpoint, there's nothing to do.
- * The exception is if we're deleting two or more checkpoints:
- * then we may save space.
- */
- if (ckpt > ckptbase &&
- (strcmp(name, (ckpt - 1)->name) == 0 ||
- (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
- WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) &&
- deleted < 2) {
-nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT);
- WT_PUBLISH(btree->checkpoint_gen,
- S2C(session)->txn_global.checkpoint_gen);
- WT_STAT_FAST_DATA_SET(session,
- btree_checkpoint_generation,
- btree->checkpoint_gen);
- goto done;
- }
- }
-
/* Add a new checkpoint entry at the end of the list. */
WT_CKPT_FOREACH(ckptbase, ckpt)
;
WT_ERR(__wt_strdup(session, name, &ckpt->name));
+ /*
+ * We are now done with the local use of the name. Free the local
+ * allocation, if needed.
+ */
+ __wt_free(session, name_alloc);
F_SET(ckpt, WT_CKPT_ADD);
/*
@@ -1005,32 +946,119 @@ nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT);
* copy instead of forcing checkpoints on clean objects to associate
* names with checkpoints.
*/
- if (is_checkpoint)
- switch (F_MASK(btree, WT_BTREE_SPECIAL_FLAGS)) {
- case 0:
- break;
- case WT_BTREE_BULK:
- /*
- * The only checkpoints a bulk-loaded file should have
- * are fake ones we created without the underlying block
- * manager. I'm leaving this code here because it's a
- * cheap test and a nasty race.
- */
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if (!F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_FAKE))
- WT_ERR_MSG(session, ret,
- "block-manager checkpoint found "
- "for a bulk-loaded file");
- fake_ckpt = true;
- goto fake;
- case WT_BTREE_REBALANCE:
- case WT_BTREE_SALVAGE:
- case WT_BTREE_UPGRADE:
- case WT_BTREE_VERIFY:
- WT_ERR_MSG(session, EINVAL,
- "checkpoints are blocked during rebalance, "
- "salvage, upgrade or verify operations");
+ WT_ASSERT(session,
+ !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS));
+
+ hot_backup_locked = false;
+ WT_ERR(__wt_readunlock(session, conn->hot_backup_lock));
+
+ WT_ASSERT(session, btree->ckpt == NULL);
+ btree->ckpt = ckptbase;
+
+ return (0);
+
+err: if (hot_backup_locked)
+ WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
+
+ __wt_meta_ckptlist_free(session, ckptbase);
+ __wt_free(session, name_alloc);
+
+ return (ret);
+}
+
+/*
+ * __checkpoint_tree --
+ * Checkpoint a single tree.
+ * Assumes all necessary locks have been acquired by the caller.
+ */
+static int
+__checkpoint_tree(
+ WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CKPT *ckpt, *ckptbase;
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_LSN ckptlsn;
+ const char *name;
+ int deleted, was_modified;
+ bool fake_ckpt, force;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ ckptbase = btree->ckpt;
+ conn = S2C(session);
+ dhandle = session->dhandle;
+ fake_ckpt = false;
+ was_modified = btree->modified;
+
+ /*
+ * Check for clean objects not requiring a checkpoint.
+ *
+ * If we're closing a handle, and the object is clean, we can skip the
+ * checkpoint, whatever checkpoints we have are sufficient. (We might
+ * not have any checkpoints if the object was never modified, and that's
+ * OK: the object creation code doesn't mark the tree modified so we can
+ * skip newly created trees here.)
+ *
+ * If the application repeatedly checkpoints an object (imagine hourly
+ * checkpoints using the same explicit or internal name), there's no
+ * reason to repeat the checkpoint for clean objects. The test is if
+ * the only checkpoint we're deleting is the last one in the list and
+ * it has the same name as the checkpoint we're about to take, skip the
+ * work. (We can't skip checkpoints that delete more than the last
+ * checkpoint because deleting those checkpoints might free up space in
+ * the file.) This means an application toggling between two (or more)
+ * checkpoint names will repeatedly take empty checkpoints, but that's
+ * not likely enough to make detection worthwhile.
+ *
+ * Checkpoint read-only objects otherwise: the application must be able
+ * to open the checkpoint in a cursor after taking any checkpoint, which
+ * means it must exist.
+ */
+ force = false;
+ F_CLR(btree, WT_BTREE_SKIP_CKPT);
+ if (!btree->modified && cfg != NULL) {
+ ret = __wt_config_gets(session, cfg, "force", &cval);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ WT_ERR(ret);
+ if (ret == 0 && cval.val != 0)
+ force = true;
+ }
+ if (!btree->modified && !force) {
+ if (!is_checkpoint)
+ goto nockpt;
+
+ deleted = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_DELETE))
+ ++deleted;
+ /*
+ * Complicated test: if the tree is clean and last two
+ * checkpoints have the same name (correcting for internal
+ * checkpoint names with their generational suffix numbers), we
+ * can skip the checkpoint, there's nothing to do. The
+ * exception is if we're deleting two or more checkpoints: then
+ * we may save space.
+ */
+ name = (ckpt - 1)->name;
+ if (ckpt > ckptbase + 1 && deleted < 2 &&
+ (strcmp(name, (ckpt - 2)->name) == 0 ||
+ (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
+ WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) {
+nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT);
+ WT_PUBLISH(btree->checkpoint_gen,
+ S2C(session)->txn_global.checkpoint_gen);
+ WT_STAT_FAST_DATA_SET(session,
+ btree_checkpoint_generation,
+ btree->checkpoint_gen);
+ ret = 0;
+ goto err;
}
+ }
/*
* If an object has never been used (in other words, if it could become
@@ -1077,6 +1105,15 @@ nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT);
btree->modified = 0;
WT_FULL_BARRIER();
+ /*
+ * Set the checkpoint LSN to the maximum LSN so that if logging is
+ * disabled, recovery will never roll old changes forward over the
+ * non-logged changes in this checkpoint. If logging is enabled, a
+ * real checkpoint LSN will be assigned for this checkpoint and
+ * overwrite this.
+ */
+ WT_MAX_LSN(&ckptlsn);
+
/* Tell logging that a file checkpoint is starting. */
if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
WT_ERR(__wt_txn_checkpoint_log(
@@ -1084,9 +1121,9 @@ nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT);
/* Flush the file from the cache, creating the checkpoint. */
if (is_checkpoint)
- WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CHECKPOINT));
+ WT_ERR(__wt_cache_op(session, WT_SYNC_CHECKPOINT));
else
- WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CLOSE));
+ WT_ERR(__wt_cache_op(session, WT_SYNC_CLOSE));
/*
* All blocks being written have been written; set the object's write
@@ -1144,7 +1181,6 @@ fake: /*
WT_ERR(__wt_txn_checkpoint_log(
session, false, WT_TXN_LOG_CKPT_STOP, NULL));
-done:
err: /*
* If the checkpoint didn't complete successfully, make sure the
* tree is marked dirty.
@@ -1152,30 +1188,42 @@ err: /*
if (ret != 0 && !btree->modified && was_modified)
btree->modified = 1;
- if (hot_backup_locked)
- WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
-
__wt_meta_ckptlist_free(session, ckptbase);
- __wt_free(session, name_alloc);
+ btree->ckpt = NULL;
return (ret);
}
/*
+ * __checkpoint_tree_helper --
+ * Checkpoint a tree (suitable for use in *_apply functions).
+ */
+static int
+__checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ return (__checkpoint_tree(session, true, cfg));
+}
+
+/*
* __wt_checkpoint --
* Checkpoint a file.
*/
int
__wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
+ WT_DECL_RET;
+
/* Should not be called with a checkpoint handle. */
WT_ASSERT(session, session->dhandle->checkpoint == NULL);
- /* Should be holding the schema lock. */
+ /* We must hold the metadata lock if checkpointing the metadata. */
WT_ASSERT(session, !WT_IS_METADATA(session, session->dhandle) ||
F_ISSET(session, WT_SESSION_LOCKED_METADATA));
- return (__checkpoint_worker(session, cfg, true, true));
+ WT_SAVE_DHANDLE(session,
+ ret = __checkpoint_lock_tree(session, true, true, cfg));
+ WT_RET(ret);
+ return (__checkpoint_tree(session, true, cfg));
}
/*
@@ -1225,7 +1273,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
F_SET(session->dhandle, WT_DHANDLE_DEAD);
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
- return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+ return (__wt_cache_op(session, WT_SYNC_DISCARD));
/*
* If closing an unmodified file, check that no update is required
@@ -1234,7 +1282,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
if (!btree->modified && !bulk) {
__wt_txn_update_oldest(session, true);
return (__wt_txn_visible_all(session, btree->rec_max_txn) ?
- __wt_cache_op(session, NULL, WT_SYNC_DISCARD) : EBUSY);
+ __wt_cache_op(session, WT_SYNC_DISCARD) : EBUSY);
}
/*
@@ -1248,10 +1296,14 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
if (need_tracking)
WT_RET(__wt_meta_track_on(session));
- WT_TRET(__checkpoint_worker(session, NULL, false, need_tracking));
+ WT_SAVE_DHANDLE(session,
+ ret = __checkpoint_lock_tree(session, false, need_tracking, NULL));
+ WT_ASSERT(session, ret == 0);
+ if (ret == 0)
+ ret = __checkpoint_tree(session, false, NULL);
if (need_tracking)
- WT_RET(__wt_meta_track_off(session, true, ret != 0));
+ WT_TRET(__wt_meta_track_off(session, true, ret != 0));
return (ret);
}
diff --git a/test/suite/test_checkpoint01.py b/test/suite/test_checkpoint01.py
index 9955944f73d..6e1ad7814ed 100644
--- a/test/suite/test_checkpoint01.py
+++ b/test/suite/test_checkpoint01.py
@@ -185,7 +185,7 @@ class test_checkpoint_cursor(wttest.WiredTigerTestCase):
# Check dropping all checkpoints fails.
msg = '/checkpoints cannot be dropped/'
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.checkpoint("name=checkpoint-2"), msg)
+ lambda: self.session.checkpoint("force,name=checkpoint-2"), msg)
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
lambda: self.session.checkpoint("drop=(checkpoint-2)"), msg)
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,