summaryrefslogtreecommitdiff
path: root/src/txn
diff options
context:
space:
mode:
Diffstat (limited to 'src/txn')
-rw-r--r--src/txn/txn.c119
-rw-r--r--src/txn/txn_ckpt.c485
-rw-r--r--src/txn/txn_log.c20
-rw-r--r--src/txn/txn_nsnap.c12
-rw-r--r--src/txn/txn_recover.c4
5 files changed, 387 insertions, 253 deletions
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 26a0ed679e2..6eebf5ecf9f 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -126,7 +126,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
n = 0;
/* We're going to scan the table: wait for the lock. */
- __wt_readlock_spin(session, txn_global->scan_rwlock);
+ __wt_readlock_spin(session, &txn_global->scan_rwlock);
current_id = pinned_id = txn_global->current;
prev_oldest_id = txn_global->oldest_id;
@@ -180,7 +180,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
txn_state->pinned_id = pinned_id;
-done: __wt_readunlock(session, txn_global->scan_rwlock);
+done: __wt_readunlock(session, &txn_global->scan_rwlock);
__txn_sort_snapshot(session, n, current_id);
}
@@ -293,13 +293,13 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
/* First do a read-only scan. */
if (wait)
- __wt_readlock_spin(session, txn_global->scan_rwlock);
+ __wt_readlock_spin(session, &txn_global->scan_rwlock);
else if ((ret =
- __wt_try_readlock(session, txn_global->scan_rwlock)) != 0)
+ __wt_try_readlock(session, &txn_global->scan_rwlock)) != 0)
return (ret == EBUSY ? 0 : ret);
__txn_oldest_scan(session,
&oldest_id, &last_running, &metadata_pinned, &oldest_session);
- __wt_readunlock(session, txn_global->scan_rwlock);
+ __wt_readunlock(session, &txn_global->scan_rwlock);
/*
* If the state hasn't changed (or hasn't moved far enough for
@@ -314,9 +314,9 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
/* It looks like an update is necessary, wait for exclusive access. */
if (wait)
- __wt_writelock(session, txn_global->scan_rwlock);
+ __wt_writelock(session, &txn_global->scan_rwlock);
else if ((ret =
- __wt_try_writelock(session, txn_global->scan_rwlock)) != 0)
+ __wt_try_writelock(session, &txn_global->scan_rwlock)) != 0)
return (ret == EBUSY ? 0 : ret);
/*
@@ -375,7 +375,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
#endif
}
-done: __wt_writeunlock(session, txn_global->scan_rwlock);
+done: __wt_writeunlock(session, &txn_global->scan_rwlock);
return (ret);
}
@@ -713,7 +713,7 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
snapshot_pinned = txn_global->nsnap_oldest_id;
WT_STAT_SET(session, stats, txn_pinned_range,
- txn_global->current - txn_global->oldest_id);
+ txn_global->current - txn_global->oldest_id);
WT_STAT_SET(session, stats, txn_pinned_snapshot_range,
snapshot_pinned == WT_TXN_NONE ?
@@ -768,10 +768,8 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_spin_init(session,
&txn_global->id_lock, "transaction id lock"));
- WT_RET(__wt_rwlock_alloc(session,
- &txn_global->scan_rwlock, "transaction scan lock"));
- WT_RET(__wt_rwlock_alloc(session,
- &txn_global->nsnap_rwlock, "named snapshot lock"));
+ __wt_rwlock_init(session, &txn_global->scan_rwlock);
+ __wt_rwlock_init(session, &txn_global->nsnap_rwlock);
txn_global->nsnap_oldest_id = WT_TXN_NONE;
TAILQ_INIT(&txn_global->nsnaph);
@@ -805,3 +803,98 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session)
__wt_rwlock_destroy(session, &txn_global->nsnap_rwlock);
__wt_free(session, txn_global->states);
}
+
+#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
+/*
+ * __wt_verbose_dump_txn --
+ * Output diagnostic information about the global transaction state.
+ */
+int
+__wt_verbose_dump_txn(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN *txn;
+ WT_TXN_STATE *s;
+ const char *iso_tag;
+ uint64_t id;
+ uint32_t i, session_cnt;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
+ WT_RET(__wt_msg(session, "transaction state dump"));
+
+ WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current));
+ WT_RET(__wt_msg(session,
+ "last running ID: %" PRIu64, txn_global->last_running));
+ WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id));
+ WT_RET(__wt_msg(session,
+ "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id));
+
+ WT_RET(__wt_msg(session, "checkpoint running? %s",
+ txn_global->checkpoint_running ? "yes" : "no"));
+ WT_RET(__wt_msg(session,
+ "checkpoint generation: %" PRIu64, txn_global->checkpoint_gen));
+ WT_RET(__wt_msg(session,
+ "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_pinned));
+ WT_RET(__wt_msg(session,
+ "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_txnid));
+
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt));
+
+ WT_RET(__wt_msg(session, "Transaction state of active sessions:"));
+
+ /*
+ * Walk each session transaction state and dump information. Accessing
+ * the content of session handles is not thread safe, so some
+ * information may change while traversing if other threads are active
+ * at the same time, which is OK since this is diagnostic code.
+ */
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ /* Skip sessions with no active transaction */
+ if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
+ continue;
+
+ txn = &conn->sessions[i].txn;
+ iso_tag = "INVALID";
+ switch (txn->isolation) {
+ case WT_ISO_READ_COMMITTED:
+ iso_tag = "WT_ISO_READ_COMMITTED";
+ break;
+ case WT_ISO_READ_UNCOMMITTED:
+ iso_tag = "WT_ISO_READ_UNCOMMITTED";
+ break;
+ case WT_ISO_SNAPSHOT:
+ iso_tag = "WT_ISO_SNAPSHOT";
+ break;
+ }
+
+ WT_RET(__wt_msg(session,
+ "ID: %6" PRIu64
+ ", mod count: %u"
+ ", pinned ID: %" PRIu64
+ ", snap min: %" PRIu64
+ ", snap max: %" PRIu64
+ ", metadata pinned ID: %" PRIu64
+ ", flags: 0x%08" PRIx32
+ ", name: %s"
+ ", isolation: %s",
+ id,
+ txn->mod_count,
+ s->pinned_id,
+ txn->snap_min,
+ txn->snap_max,
+ s->metadata_pinned,
+ txn->flags,
+ conn->sessions[i].name == NULL ?
+ "EMPTY" : conn->sessions[i].name,
+ iso_tag));
+ }
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
+
+ return (0);
+}
+#endif
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 399d9187d82..f4ccf5eacd0 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -8,9 +8,9 @@
#include "wt_internal.h"
-static int __checkpoint_lock_tree(
- WT_SESSION_IMPL *, bool, bool, const char *[]);
-static int __checkpoint_mark_deletes(WT_SESSION_IMPL *, const char *[]);
+static int __checkpoint_lock_dirty_tree(
+ WT_SESSION_IMPL *, bool, bool, bool, const char *[]);
+static int __checkpoint_mark_skip(WT_SESSION_IMPL *, WT_CKPT *, bool);
static int __checkpoint_presync(WT_SESSION_IMPL *, const char *[]);
static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]);
@@ -90,6 +90,33 @@ err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
}
/*
+ * __checkpoint_update_generation --
+ * Update the checkpoint generation of the current tree.
+ *
+ * This indicates that the tree will not be visited again by the current
+ * checkpoint.
+ */
+static void
+__checkpoint_update_generation(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /*
+ * Updates to the metadata are made by the checkpoint transaction, so
+ * the metadata tree's checkpoint generation should never be updated.
+ */
+ if (WT_IS_METADATA(session->dhandle))
+ return;
+
+ WT_PUBLISH(btree->checkpoint_gen,
+ S2C(session)->txn_global.checkpoint_gen);
+ WT_STAT_DATA_SET(session,
+ btree_checkpoint_generation, btree->checkpoint_gen);
+}
+
+/*
* __checkpoint_apply_all --
* Apply an operation to all files involved in a checkpoint.
*/
@@ -239,22 +266,82 @@ int
__wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_BTREE *btree;
+ WT_CONFIG_ITEM cval;
WT_DECL_RET;
const char *name;
+ bool force;
+
+ btree = S2BT(session);
+
+ /* Find out if we have to force a checkpoint. */
+ WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
+ force = cval.val != 0;
+ if (!force) {
+ WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval));
+ force = cval.len != 0;
+ }
/* Should not be called with anything other than a file object. */
WT_ASSERT(session, session->dhandle->checkpoint == NULL);
WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:"));
/* Skip files that are never involved in a checkpoint. */
- if (F_ISSET(S2BT(session), WT_BTREE_NO_CHECKPOINT))
+ if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ return (0);
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * We may have raced between starting the checkpoint transaction and
+ * some operation completing on the handle that updated the metadata
+ * (e.g., closing a bulk load cursor). All such operations either have
+ * exclusive access to the handle or hold the schema lock. We are now
+ * holding the schema lock and have an open btree handle, so if we
+ * can't update the metadata, then there has been some state change
+ * invisible to the checkpoint transaction.
+ */
+ if (!WT_IS_METADATA(session->dhandle)) {
+ WT_CURSOR *meta_cursor;
+ bool metadata_race;
+
+ WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR));
+ WT_RET(__wt_metadata_cursor(session, &meta_cursor));
+ meta_cursor->set_key(meta_cursor, session->dhandle->name);
+ ret = __wt_curfile_insert_check(meta_cursor);
+ if (ret == WT_ROLLBACK) {
+ metadata_race = true;
+ ret = 0;
+ } else
+ metadata_race = false;
+ WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor));
+ WT_RET(ret);
+ WT_ASSERT(session, !metadata_race);
+ }
+#endif
+
+ /*
+ * Decide whether the tree needs to be included in the checkpoint and
+ * if so, acquire the necessary locks.
+ */
+ WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(
+ session, true, force, true, cfg));
+ WT_RET(ret);
+ if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) {
+ WT_ASSERT(session, btree->ckpt == NULL);
+ __checkpoint_update_generation(session);
return (0);
+ }
- /* Make sure there is space for the next entry. */
+ /*
+ * Make sure there is space for the new entry: do this before getting
+ * the handle to avoid cleanup if we can't allocate the memory.
+ */
WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
session->ckpt_handle_next + 1, &session->ckpt_handle));
- /* Not strictly necessary, but cleaner to clear the current handle. */
+ /*
+ * The current tree will be included: get it again because the handle
+ * we have is only valid for the duration of this function.
+ */
name = session->dhandle->name;
session->dhandle = NULL;
@@ -266,49 +353,13 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
* with eviction and we don't want to unfairly penalize (or promote)
* eviction in trees due to checkpoints.
*/
- btree = S2BT(session);
btree->evict_walk_saved = btree->evict_walk_period;
- WT_SAVE_DHANDLE(session,
- ret = __checkpoint_lock_tree(session, true, true, cfg));
- if (ret != 0) {
- WT_TRET(__wt_session_release_btree(session));
- return (ret);
- }
-
- /*
- * Flag that the handle is part of a checkpoint for the purposes
- * of transaction visibility checks.
- */
- WT_PUBLISH(btree->include_checkpoint_txn, true);
-
session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle;
return (0);
}
/*
- * __checkpoint_update_generation --
- * Update the checkpoint generation of the current tree.
- *
- * This indicates that the tree will not be visited again by the current
- * checkpoint.
- */
-static void
-__checkpoint_update_generation(WT_SESSION_IMPL *session)
-{
- WT_BTREE *btree;
-
- btree = S2BT(session);
- if (!WT_IS_METADATA(session->dhandle))
- WT_PUBLISH(btree->include_checkpoint_txn, false);
-
- WT_PUBLISH(btree->checkpoint_gen,
- S2C(session)->txn_global.checkpoint_gen);
- WT_STAT_DATA_SET(session,
- btree_checkpoint_generation, btree->checkpoint_gen);
-}
-
-/*
* __checkpoint_reduce_dirty_cache --
* Release clean trees from the list cached for checkpoints.
*/
@@ -371,7 +422,6 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session)
__wt_sleep(0, stepdown_us / 10);
__wt_epoch(session, &stop);
current_us = WT_TIMEDIFF_US(stop, last);
- total_ms = WT_TIMEDIFF_MS(stop, start);
bytes_written_total =
cache->bytes_written - bytes_written_start;
@@ -434,36 +484,6 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session)
}
/*
- * __checkpoint_release_clean_trees --
- * Release clean trees from the list cached for checkpoints.
- */
-static int
-__checkpoint_release_clean_trees(WT_SESSION_IMPL *session)
-{
- WT_BTREE *btree;
- WT_DATA_HANDLE *dhandle;
- WT_DECL_RET;
- u_int i;
-
- for (i = 0; i < session->ckpt_handle_next; i++) {
- dhandle = session->ckpt_handle[i];
- btree = dhandle->handle;
- if (!F_ISSET(btree, WT_BTREE_SKIP_CKPT))
- continue;
- __wt_meta_ckptlist_free(session, btree->ckpt);
- btree->ckpt = NULL;
- WT_WITH_DHANDLE(session, dhandle,
- __checkpoint_update_generation(session));
- session->ckpt_handle[i] = NULL;
- WT_WITH_DHANDLE(session, dhandle,
- ret = __wt_session_release_btree(session));
- WT_RET(ret);
- }
-
- return (0);
-}
-
-/*
* __checkpoint_stats --
* Update checkpoint timer stats.
*/
@@ -525,6 +545,112 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session,
}
/*
+ * __checkpoint_fail_reset --
+ * After a failure, mark the tree dirty and free its checkpoint list.
+ */
+static void
+__checkpoint_fail_reset(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+ btree->modified = true;
+ __wt_meta_ckptlist_free(session, &btree->ckpt);
+}
+
+/*
+ * __checkpoint_prepare --
+ * Start the transaction for a checkpoint and gather handles.
+ */
+static int
+__checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+ const char *txn_cfg[] = { WT_CONFIG_BASE(session,
+ WT_SESSION_begin_transaction), "isolation=snapshot", NULL };
+
+ conn = S2C(session);
+ txn = &session->txn;
+ txn_global = &conn->txn_global;
+ txn_state = WT_SESSION_TXN_STATE(session);
+
+ /*
+ * Start a snapshot transaction for the checkpoint.
+ *
+ * Note: we don't go through the public API calls because they have
+ * side effects on cursors, which applications can hold open across
+ * calls to checkpoint.
+ */
+ WT_RET(__wt_txn_begin(session, txn_cfg));
+
+ WT_DIAGNOSTIC_YIELD;
+
+ /* Ensure a transaction ID is allocated prior to sharing it globally */
+ WT_RET(__wt_txn_id_check(session));
+
+ /*
+ * Mark the connection as clean. If data is modified after the
+ * checkpoint transaction ID is generated, the connection will be reset
+ * to dirty when reconciliation marks the btree dirty on encountering
+ * the dirty page.
+ */
+ conn->modified = false;
+
+ /*
+ * Save the checkpoint session ID.
+ *
+ * We never do checkpoints in the default session (with id zero).
+ */
+ WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0);
+ txn_global->checkpoint_id = session->id;
+
+ /*
+ * Remove the checkpoint transaction from the global table.
+ *
+ * This allows ordinary visibility checks to move forward because
+ * checkpoints often take a long time and only write to the metadata.
+ */
+ __wt_writelock(session, &txn_global->scan_rwlock);
+ txn_global->checkpoint_txnid = txn->id;
+ txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min);
+
+ /*
+ * Sanity check that the oldest ID hasn't moved on before we have
+ * cleared our entry.
+ */
+ WT_ASSERT(session,
+ WT_TXNID_LE(txn_global->oldest_id, txn_state->id) &&
+ WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id));
+
+ /*
+ * Clear our entry from the global transaction session table. Any
+ * operation that needs to know about the ID for this checkpoint will
+ * consider the checkpoint ID in the global structure. Most operations
+ * can safely ignore the checkpoint ID (see the visible all check for
+ * details).
+ */
+ txn_state->id = txn_state->pinned_id =
+ txn_state->metadata_pinned = WT_TXN_NONE;
+ __wt_writeunlock(session, &txn_global->scan_rwlock);
+
+ /*
+ * Get a list of handles we want to flush; for named checkpoints this
+ * may pull closed objects into the session cache.
+ *
+ * The checkpoint transaction has already been started at this point;
+ * trees that do not need a checkpoint are skipped as handles are gathered.
+ */
+ WT_ASSERT(session, session->ckpt_handle_next == 0);
+ WT_WITH_TABLE_READ_LOCK(session, ret = __checkpoint_apply_all(
+ session, cfg, __wt_checkpoint_get_handles, NULL));
+ return (ret);
+}
+
+/*
* __txn_checkpoint --
* Checkpoint a database or a list of objects in the database.
*/
@@ -539,19 +665,15 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
WT_TXN_ISOLATION saved_isolation;
- WT_TXN_STATE *txn_state;
void *saved_meta_next;
u_int i;
uint64_t fsync_duration_usecs;
- bool full, idle, logging, tracking;
- const char *txn_cfg[] = { WT_CONFIG_BASE(session,
- WT_SESSION_begin_transaction), "isolation=snapshot", NULL };
+ bool failed, full, idle, logging, tracking;
conn = S2C(session);
cache = conn->cache;
txn = &session->txn;
txn_global = &conn->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
saved_isolation = session->isolation;
full = idle = logging = tracking = false;
@@ -620,87 +742,24 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
tracking = true;
/*
- * Get a list of handles we want to flush; for named checkpoints this
- * may pull closed objects into the session cache.
- *
* We want to skip checkpointing clean handles whenever possible. That
* is, when the checkpoint is not named or forced. However, we need to
* take care about ordering with respect to the checkpoint transaction.
*
- * If we skip clean handles before starting the transaction, the
+ * We can't skip clean handles before starting the transaction or the
* checkpoint can miss updates in trees that become dirty as the
* checkpoint is starting. If we wait until the transaction has
* started before locking a handle, there could be a metadata-changing
* operation in between (e.g., salvage) that will cause a write
* conflict when the checkpoint goes to write the metadata.
*
- * First, gather all handles, then start the checkpoint transaction,
- * then release any clean handles.
+ * Hold the schema lock while starting the transaction and gathering
+ * handles so the set we get is complete and correct.
*/
- WT_ASSERT(session, session->ckpt_handle_next == 0);
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __checkpoint_apply_all(
- session, cfg, __wt_checkpoint_get_handles, NULL))));
+ WT_WITH_SCHEMA_LOCK(session, ret = __checkpoint_prepare(session, cfg));
WT_ERR(ret);
- /*
- * Start a snapshot transaction for the checkpoint.
- *
- * Note: we don't go through the public API calls because they have
- * side effects on cursors, which applications can hold open across
- * calls to checkpoint.
- */
- WT_ERR(__wt_txn_begin(session, txn_cfg));
-
- /* Ensure a transaction ID is allocated prior to sharing it globally */
- WT_ERR(__wt_txn_id_check(session));
-
- /*
- * Mark the connection as clean. If some data gets modified after
- * generating checkpoint transaction id, connection will be reset to
- * dirty when reconciliation marks the btree dirty on encountering the
- * dirty page.
- */
- conn->modified = false;
-
- /*
- * Save the checkpoint session ID.
- *
- * We never do checkpoints in the default session (with id zero).
- */
- WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0);
- txn_global->checkpoint_id = session->id;
-
- /*
- * Remove the checkpoint transaction from the global table.
- *
- * This allows ordinary visibility checks to move forward because
- * checkpoints often take a long time and only write to the metadata.
- */
- __wt_writelock(session, txn_global->scan_rwlock);
- txn_global->checkpoint_txnid = txn->id;
- txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min);
-
- /*
- * Sanity check that the oldest ID hasn't moved on before we have
- * cleared our entry.
- */
- WT_ASSERT(session,
- WT_TXNID_LE(txn_global->oldest_id, txn_state->id) &&
- WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id));
-
- /*
- * Clear our entry from the global transaction session table. Any
- * operation that needs to know about the ID for this checkpoint will
- * consider the checkpoint ID in the global structure. Most operations
- * can safely ignore the checkpoint ID (see the visible all check for
- * details).
- */
- txn_state->id = txn_state->pinned_id =
- txn_state->metadata_pinned = WT_TXN_NONE;
- __wt_writeunlock(session, txn_global->scan_rwlock);
+ WT_ASSERT(session, txn->isolation == WT_ISO_SNAPSHOT);
/*
* Unblock updates -- we can figure out that any updates to clean pages
@@ -709,16 +768,6 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
cache->eviction_scrub_limit = 0.0;
WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
- /*
- * Mark old checkpoints that are being deleted and figure out which
- * trees we can skip in this checkpoint.
- *
- * Release clean trees. Any updates made after this point will not
- * visible to the checkpoint transaction.
- */
- WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_mark_deletes));
- WT_ERR(__checkpoint_release_clean_trees(session));
-
/* Tell logging that we have started a database checkpoint. */
if (full && logging)
WT_ERR(__wt_txn_checkpoint_log(
@@ -825,12 +874,13 @@ err: /*
* overwritten the checkpoint, so what ends up on disk is not
* consistent.
*/
- if (ret != 0 && !conn->modified)
+ failed = ret != 0;
+ if (failed)
conn->modified = true;
session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED;
if (tracking)
- WT_TRET(__wt_meta_track_off(session, false, ret != 0));
+ WT_TRET(__wt_meta_track_off(session, false, failed));
cache->eviction_scrub_limit = 0.0;
WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
@@ -863,6 +913,13 @@ err: /*
for (i = 0; i < session->ckpt_handle_next; ++i) {
if (session->ckpt_handle[i] == NULL)
continue;
+ /*
+ * If the operation failed, mark all trees dirty so they are
+ * included if a future checkpoint can succeed.
+ */
+ if (failed)
+ WT_WITH_DHANDLE(session, session->ckpt_handle[i],
+ __checkpoint_fail_reset(session));
WT_WITH_DHANDLE(session, session->ckpt_handle[i],
WT_TRET(__wt_session_release_btree(session)));
}
@@ -1047,12 +1104,13 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
}
/*
- * __checkpoint_lock_tree --
- * Acquire the locks required to checkpoint a tree.
+ * __checkpoint_lock_dirty_tree --
+ * Decide whether the tree needs to be included in the checkpoint and if
+ * so, acquire the necessary locks.
*/
static int
-__checkpoint_lock_tree(WT_SESSION_IMPL *session,
- bool is_checkpoint, bool need_tracking, const char *cfg[])
+__checkpoint_lock_dirty_tree(WT_SESSION_IMPL *session,
+ bool is_checkpoint, bool force, bool need_tracking, const char *cfg[])
{
WT_BTREE *btree;
WT_CKPT *ckpt, *ckptbase;
@@ -1159,7 +1217,7 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session,
* Hold the lock until we're done (blocking hot backups from starting),
* we don't want to race with a future hot backup.
*/
- __wt_readlock(session, conn->hot_backup_lock);
+ __wt_readlock(session, &conn->hot_backup_lock);
hot_backup_locked = true;
if (conn->hot_backup)
WT_CKPT_FOREACH(ckptbase, ckpt) {
@@ -1177,6 +1235,14 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session,
}
/*
+ * Mark old checkpoints that are being deleted and figure out which
+ * trees we can skip in this checkpoint.
+ */
+ WT_ERR(__checkpoint_mark_skip(session, ckptbase, force));
+ if (F_ISSET(btree, WT_BTREE_SKIP_CKPT))
+ goto err;
+
+ /*
* Lock the checkpoints that will be deleted.
*
* Checkpoints are only locked when tracking is enabled, which covers
@@ -1209,64 +1275,47 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session,
}
/*
- * There are special files: those being bulk-loaded, salvaged, upgraded
- * or verified during the checkpoint. We have to do something for those
- * objects because a checkpoint is an external name the application can
- * reference and the name must exist no matter what's happening during
- * the checkpoint. For bulk-loaded files, we could block until the load
- * completes, checkpoint the partial load, or magic up an empty-file
- * checkpoint. The first is too slow, the second is insane, so do the
- * third.
- * Salvage, upgrade and verify don't currently require any work, all
- * three hold the schema lock, blocking checkpoints. If we ever want to
- * fix that (and I bet we eventually will, at least for verify), we can
- * copy the last checkpoint the file has. That works if we guarantee
- * salvage, upgrade and verify act on objects with previous checkpoints
- * (true if handles are closed/re-opened between object creation and a
- * subsequent salvage, upgrade or verify operation). Presumably,
- * salvage and upgrade will discard all previous checkpoints when they
- * complete, which is fine with us. This change will require reference
- * counting checkpoints, and once that's done, we should use checkpoint
- * copy instead of forcing checkpoints on clean objects to associate
- * names with checkpoints.
+ * There are special trees: those being bulk-loaded, salvaged, upgraded
+ * or verified during the checkpoint. They should never be part of a
+ * checkpoint: we will fail to lock them because the operations have
+ * exclusive access to the handles. Named checkpoints will fail in that
+ * case, ordinary checkpoints will skip files that cannot be opened
+ * normally.
*/
WT_ASSERT(session,
!is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS));
- __wt_readunlock(session, conn->hot_backup_lock);
+ __wt_readunlock(session, &conn->hot_backup_lock);
- WT_ASSERT(session, btree->ckpt == NULL);
+ WT_ASSERT(session, btree->ckpt == NULL &&
+ !F_ISSET(btree, WT_BTREE_SKIP_CKPT));
btree->ckpt = ckptbase;
return (0);
err: if (hot_backup_locked)
- __wt_readunlock(session, conn->hot_backup_lock);
+ __wt_readunlock(session, &conn->hot_backup_lock);
- __wt_meta_ckptlist_free(session, ckptbase);
+ __wt_meta_ckptlist_free(session, &ckptbase);
__wt_free(session, name_alloc);
return (ret);
}
/*
- * __checkpoint_mark_deletes --
- * Figure out what old checkpoints will be deleted, and whether the
- * checkpoint can be skipped entirely.
+ * __checkpoint_mark_skip --
+ * Figure out whether the checkpoint can be skipped for a tree.
*/
static int
-__checkpoint_mark_deletes(
- WT_SESSION_IMPL *session, const char *cfg[])
+__checkpoint_mark_skip(
+ WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force)
{
WT_BTREE *btree;
- WT_CKPT *ckpt, *ckptbase;
- WT_CONFIG_ITEM cval;
+ WT_CKPT *ckpt;
const char *name;
int deleted;
- bool force;
btree = S2BT(session);
- ckptbase = btree->ckpt;
/*
* Check for clean objects not requiring a checkpoint.
@@ -1292,12 +1341,7 @@ __checkpoint_mark_deletes(
* to open the checkpoint in a cursor after taking any checkpoint, which
* means it must exist.
*/
- force = false;
F_CLR(btree, WT_BTREE_SKIP_CKPT);
- if (!btree->modified && cfg != NULL) {
- WT_RET(__wt_config_gets(session, cfg, "force", &cval));
- force = cval.val != 0;
- }
if (!btree->modified && !force) {
deleted = 0;
WT_CKPT_FOREACH(ckptbase, ckpt)
@@ -1341,7 +1385,6 @@ __checkpoint_tree(
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
WT_LSN ckptlsn;
- int was_modified;
bool fake_ckpt;
WT_UNUSED(cfg);
@@ -1352,7 +1395,6 @@ __checkpoint_tree(
conn = S2C(session);
dhandle = session->dhandle;
fake_ckpt = false;
- was_modified = btree->modified;
/*
* Set the checkpoint LSN to the maximum LSN so that if logging is
@@ -1377,7 +1419,7 @@ __checkpoint_tree(
* delete a physical checkpoint, and that will end in tears.
*/
if (is_checkpoint)
- if (btree->bulk_load_ok) {
+ if (btree->original) {
fake_ckpt = true;
goto fake;
}
@@ -1483,14 +1525,12 @@ err: /*
* If the checkpoint didn't complete successfully, make sure the
* tree is marked dirty.
*/
- if (ret != 0 && !btree->modified && was_modified) {
+ if (ret != 0) {
btree->modified = true;
- if (!S2C(session)->modified)
- S2C(session)->modified = true;
+ S2C(session)->modified = true;
}
- __wt_meta_ckptlist_free(session, ckptbase);
- btree->ckpt = NULL;
+ __wt_meta_ckptlist_free(session, &btree->ckpt);
return (ret);
}
@@ -1509,7 +1549,8 @@ __checkpoint_presync(WT_SESSION_IMPL *session, const char *cfg[])
WT_UNUSED(cfg);
btree = S2BT(session);
- WT_ASSERT(session, !btree->include_checkpoint_txn);
+ WT_ASSERT(session, btree->checkpoint_gen ==
+ S2C(session)->txn_global.checkpoint_gen);
btree->evict_walk_period = btree->evict_walk_saved;
return (0);
}
@@ -1558,7 +1599,9 @@ __checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[])
int
__wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
+ WT_CONFIG_ITEM cval;
WT_DECL_RET;
+ bool force;
/* Should not be called with a checkpoint handle. */
WT_ASSERT(session, session->dhandle->checkpoint == NULL);
@@ -1567,12 +1610,13 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) ||
F_ISSET(session, WT_SESSION_LOCKED_METADATA));
- WT_SAVE_DHANDLE(session,
- ret = __checkpoint_lock_tree(session, true, true, cfg));
- WT_RET(ret);
- WT_SAVE_DHANDLE(session,
- ret = __checkpoint_mark_deletes(session, cfg));
+ WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
+ force = cval.val != 0;
+ WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(
+ session, true, force, true, cfg));
WT_RET(ret);
+ if (F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT))
+ return (0);
return (__checkpoint_tree(session, true, cfg));
}
@@ -1647,15 +1691,10 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
if (need_tracking)
WT_RET(__wt_meta_track_on(session));
- WT_SAVE_DHANDLE(session,
- ret = __checkpoint_lock_tree(session, false, need_tracking, NULL));
+ WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(
+ session, false, false, need_tracking, NULL));
WT_ASSERT(session, ret == 0);
- if (ret == 0) {
- WT_SAVE_DHANDLE(session,
- ret = __checkpoint_mark_deletes(session, NULL));
- WT_ASSERT(session, ret == 0);
- }
- if (ret == 0)
+ if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT))
ret = __checkpoint_tree(session, false, NULL);
if (need_tracking)
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
index 5f4704b40c4..2931dc1ce82 100644
--- a/src/txn/txn_log.c
+++ b/src/txn/txn_log.c
@@ -269,7 +269,7 @@ __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session,
WT_ITEM ckpt_snapshot_unused;
uint32_t ckpt_file, ckpt_offset;
u_int ckpt_nsnapshot_unused;
- const char *fmt = WT_UNCHECKED_STRING(IIIU);
+ const char *fmt = WT_UNCHECKED_STRING(IIIu);
if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
&ckpt_file, &ckpt_offset,
@@ -297,7 +297,7 @@ __wt_txn_checkpoint_log(
uint8_t *end, *p;
size_t recsize;
uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
- const char *fmt = WT_UNCHECKED_STRING(IIIIU);
+ const char *fmt = WT_UNCHECKED_STRING(IIIIu);
txn = &session->txn;
ckpt_lsn = &txn->ckpt_lsn;
@@ -368,14 +368,16 @@ __wt_txn_checkpoint_log(
/*
* If this full checkpoint completed successfully and there is
- * no hot backup in progress, tell the logging subsystem the
- * checkpoint LSN so that it can archive. Do not update the
- * logging checkpoint LSN if this is during a clean connection
- * close, only during a full checkpoint. A clean close may not
- * update any metadata LSN and we do not want to archive in
- * that case.
+ * no hot backup in progress and this is not recovery, tell
+ * the logging subsystem the checkpoint LSN so that it can
+ * archive. Do not update the logging checkpoint LSN if this
+ * is during a clean connection close, only during a full
+ * checkpoint. A clean close may not update any metadata LSN
+ * and we do not want to archive in that case.
*/
- if (!S2C(session)->hot_backup && txn->full_ckpt)
+ if (!S2C(session)->hot_backup &&
+ !F_ISSET(S2C(session), WT_CONN_RECOVERING) &&
+ txn->full_ckpt)
__wt_log_ckpt(session, ckpt_lsn);
/* FALLTHROUGH */
diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c
index 65ec1a6662f..659570dbcd9 100644
--- a/src/txn/txn_nsnap.c
+++ b/src/txn/txn_nsnap.c
@@ -211,9 +211,9 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[])
if (TAILQ_EMPTY(&txn_global->nsnaph)) {
WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE &&
!__wt_txn_visible_all(session, nsnap_new->pinned_id));
- __wt_readlock(session, txn_global->scan_rwlock);
+ __wt_readlock(session, &txn_global->scan_rwlock);
txn_global->nsnap_oldest_id = nsnap_new->pinned_id;
- __wt_readunlock(session, txn_global->scan_rwlock);
+ __wt_readunlock(session, &txn_global->scan_rwlock);
}
TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q);
WT_STAT_CONN_INCR(session, txn_snapshots_created);
@@ -297,16 +297,16 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval)
if (session->ncursors > 0)
WT_RET(__wt_session_copy_values(session));
- __wt_readlock(session, txn_global->nsnap_rwlock);
+ __wt_readlock(session, &txn_global->nsnap_rwlock);
TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q)
if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) {
/*
* Acquire the scan lock so the oldest ID can't move
* forward without seeing our pinned ID.
*/
- __wt_readlock(session, txn_global->scan_rwlock);
+ __wt_readlock(session, &txn_global->scan_rwlock);
txn_state->pinned_id = nsnap->pinned_id;
- __wt_readunlock(session, txn_global->scan_rwlock);
+ __wt_readunlock(session, &txn_global->scan_rwlock);
WT_ASSERT(session, !__wt_txn_visible_all(
session, txn_state->pinned_id) &&
@@ -327,7 +327,7 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval)
F_SET(txn, WT_TXN_HAS_SNAPSHOT);
break;
}
- __wt_readunlock(session, txn_global->nsnap_rwlock);
+ __wt_readunlock(session, &txn_global->nsnap_rwlock);
if (nsnap == NULL)
WT_RET_MSG(session, EINVAL,
diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c
index a6390dcbd06..30932195b1e 100644
--- a/src/txn/txn_recover.c
+++ b/src/txn/txn_recover.c
@@ -93,7 +93,7 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
"%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 \
"/%" PRIu32, \
cursor == NULL ? "Skipping" : "Applying", \
- optype, fileid, lsnp->l.file, lsnp->l.offset); \
+ optype, fileid, (lsnp)->l.file, (lsnp)->l.offset); \
if (cursor == NULL) \
break
@@ -501,7 +501,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
* Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
*/
r.metadata_only = false;
- __wt_verbose(session, WT_VERB_RECOVERY,
+ __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS,
"Main recovery loop: starting at %" PRIu32 "/%" PRIu32,
r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset);
WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));